31883 files changed, 1069803 insertions, 600727 deletions
diff --git a/.gitignore b/.gitignore
index 7587ef56b92d..8f5422cba6e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,6 +30,7 @@
 *.lz4
 *.lzma
 *.lzo
+*.mod
 *.mod.c
 *.o
 *.o.*
diff --git a/.mailmap b/.mailmap
index 07a777f9d687..acba1a6163f1 100644
--- a/.mailmap
+++ b/.mailmap
@@ -81,6 +81,7 @@ Greg Kroah-Hartman <greg@echidna.(none)>
 Greg Kroah-Hartman <gregkh@suse.de>
 Greg Kroah-Hartman <greg@kroah.com>
 Gregory CLEMENT <gregory.clement@bootlin.com> <gregory.clement@free-electrons.com>
+Hanjun Guo <guohanjun@huawei.com> <hanjun.guo@linaro.org>
 Henk Vergonet <Henk.Vergonet@gmail.com>
 Henrik Kretzschmar <henne@nachtwindheim.de>
 Henrik Rydberg <rydberg@bitmath.org>
@@ -97,6 +98,7 @@ Jason Gunthorpe <jgg@ziepe.ca> <jgunthorpe@obsidianresearch.com>
 Javi Merino <javi.merino@kernel.org> <javi.merino@arm.com>
 <javier@osg.samsung.com> <javier.martinez@collabora.co.uk>
 Jean Tourrilhes <jt@hpl.hp.com>
+<jean-philippe@linaro.org> <jean-philippe.brucker@arm.com>
 Jeff Garzik <jgarzik@pretzel.yyz.us>
 Jeff Layton <jlayton@kernel.org> <jlayton@redhat.com>
 Jeff Layton <jlayton@kernel.org> <jlayton@poochiereds.net>
@@ -115,6 +117,7 @@ John Stultz <johnstul@us.ibm.com>
 Juha Yrjola <at solidboot.com>
 Juha Yrjola <juha.yrjola@nokia.com>
 Juha Yrjola <juha.yrjola@solidboot.com>
+Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com>
 Kay Sievers <kay.sievers@vrfy.org>
 Kenneth W Chen <kenneth.w.chen@intel.com>
 Konstantin Khlebnikov <koct9i@gmail.com> <k.khlebnikov@samsung.com>
@@ -131,6 +134,7 @@ Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
 Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
 Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
 Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
+Marc Zyngier <maz@kernel.org> <marc.zyngier@arm.com>
 Marcin Nowakowski <marcin.nowakowski@mips.com> <marcin.nowakowski@imgtec.com>
 Mark Brown <broonie@sirena.org.uk>
 Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
@@ -238,6 +242,7 @@ Vlad Dogaru <ddvlad@gmail.com> <vlad.dogaru@intel.com>
 Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@virtuozzo.com>
 Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@parallels.com>
 Takashi YOSHII <takashi.yoshii.zj@renesas.com>
+Will Deacon <will@kernel.org> <will.deacon@arm.com>
 Yakir Yang <kuankuan.y@gmail.com> <ykk@rock-chips.com>
 Yusuke Goda <goda.yusuke@renesas.com>
 Gustavo Padovan <gustavo@las.ic.unicamp.br>
diff --git a/CREDITS b/CREDITS
index 8e0342620a06..a7387605fbdc 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1770,7 +1770,6 @@ S: USA
 
 N: Dave Jones
 E: davej@codemonkey.org.uk
-W: http://www.codemonkey.org.uk
 D: Assorted VIA x86 support.
 D: 2.5 AGPGART overhaul.
 D: CPUFREQ maintenance.
@@ -1800,7 +1799,7 @@ S: 2300 Copenhagen S.
 S: Denmark
 
 N: Jozsef Kadlecsik
-E: kadlec@blackhole.kfki.hu
+E: kadlec@netfilter.org
 P: 1024D/470DB964 4CB3 1A05 713E 9BF7 FAC5  5809 DD8C B7B1 470D B964
 D: netfilter: TCP window tracking code
 D: netfilter: raw table
@@ -3120,7 +3119,7 @@ S: France
 N: Rik van Riel
 E: riel@redhat.com
 W: http://www.surriel.com/
-D: Linux-MM site, Documentation/sysctl/*, swap/mm readaround
+D: Linux-MM site, Documentation/admin-guide/sysctl/*, swap/mm readaround
 D: kswapd fixes, random kernel hacker, rmap VM,
 D: nl.linux.org administrator, minor scheduler additions
 S: Red Hat Boston
@@ -3364,6 +3363,14 @@ S: Braunschweiger Strasse 79
 S: 31134 Hildesheim
 S: Germany
 
+N: Martin Schwidefsky
+D: Martin was the most significant contributor to the initial s390
+D: port of the Linux Kernel and later the maintainer of the s390
+D: architecture backend for almost two decades.
+D: He passed away in 2019, and will be greatly missed.
+S: Germany
+W: https://lwn.net/Articles/789028/
+
 N: Marcel Selhorst
 E: tpmdd@selhorst.net
 D: TPM driver
diff --git a/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra b/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra
index 16020b31ae64..5d41ebadf15e 100644
--- a/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra
+++ b/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra
@@ -5,7 +5,7 @@ Description:	It is possible to switch the cpi setting of the mouse with the
 		press of a button.
 		When read, this file returns the raw number of the actual cpi
 		setting reported by the mouse. This number has to be further
-		processed to receive the real dpi value.
+		processed to receive the real dpi value:
 
 		VALUE DPI
 		1     400
diff --git a/Documentation/ABI/obsolete/sysfs-gpio b/Documentation/ABI/obsolete/sysfs-gpio
index 40d41ea1a3f5..e0d4e5e2dd90 100644
--- a/Documentation/ABI/obsolete/sysfs-gpio
+++ b/Documentation/ABI/obsolete/sysfs-gpio
@@ -11,7 +11,7 @@ Description:
   Kernel code may export it for complete or partial access.
 
   GPIOs are identified as they are inside the kernel, using integers in
-  the range 0..INT_MAX.  See Documentation/gpio for more information.
+  the range 0..INT_MAX.  See Documentation/admin-guide/gpio for more information.
 
     /sys/class/gpio
 	/export ... asks the kernel to export a GPIO to userspace
diff --git a/Documentation/ABI/removed/sysfs-class-rfkill b/Documentation/ABI/removed/sysfs-class-rfkill
index 3ce6231f20b2..9c08c7f98ffb 100644
--- a/Documentation/ABI/removed/sysfs-class-rfkill
+++ b/Documentation/ABI/removed/sysfs-class-rfkill
@@ -1,6 +1,6 @@
 rfkill - radio frequency (RF) connector kill switch support
 
-For details to this subsystem look at Documentation/rfkill.txt.
+For details to this subsystem look at Documentation/driver-api/rfkill.rst.
 
 What:		/sys/class/rfkill/rfkill[0-9]+/claim
 Date:		09-Jul-2007
diff --git a/Documentation/ABI/stable/sysfs-class-infiniband b/Documentation/ABI/stable/sysfs-class-infiniband
index 17211ceb9bf4..aed21b8916a2 100644
--- a/Documentation/ABI/stable/sysfs-class-infiniband
+++ b/Documentation/ABI/stable/sysfs-class-infiniband
@@ -423,23 +423,6 @@ Description:
 		(e.g. driver restart on the VM which owns the VF).
 
 
-sysfs interface for NetEffect RNIC Low-Level iWARP driver (nes)
----------------------------------------------------------------
-
-What:		/sys/class/infiniband/nesX/hw_rev
-What:		/sys/class/infiniband/nesX/hca_type
-What:		/sys/class/infiniband/nesX/board_id
-Date:		Feb, 2008
-KernelVersion:	v2.6.25
-Contact:	linux-rdma@vger.kernel.org
-Description:
-		hw_rev:		(RO) Hardware revision number
-
-		hca_type:	(RO) Host Channel Adapter type (NEX020)
-
-		board_id:	(RO) Manufacturing board id
-
-
 sysfs interface for Chelsio T4/T5 RDMA driver (cxgb4)
 -----------------------------------------------------
 
diff --git a/Documentation/ABI/stable/sysfs-class-rfkill b/Documentation/ABI/stable/sysfs-class-rfkill
index 80151a409d67..5b154f922643 100644
--- a/Documentation/ABI/stable/sysfs-class-rfkill
+++ b/Documentation/ABI/stable/sysfs-class-rfkill
@@ -1,6 +1,6 @@
 rfkill - radio frequency (RF) connector kill switch support
 
-For details to this subsystem look at Documentation/rfkill.txt.
+For details to this subsystem look at Documentation/driver-api/rfkill.rst.
 
 For the deprecated /sys/class/rfkill/*/claim knobs of this interface look in
 Documentation/ABI/removed/sysfs-class-rfkill.
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index f7ce68fbd4b9..df8413cf1468 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -61,7 +61,7 @@ Date:		October 2002
 Contact:	Linux Memory Management list <linux-mm@kvack.org>
 Description:
 		The node's hit/miss statistics, in units of pages.
-		See Documentation/numastat.txt
+		See Documentation/admin-guide/numastat.rst
 
 What:		/sys/devices/system/node/nodeX/distance
 Date:		October 2002
diff --git a/Documentation/ABI/stable/sysfs-driver-mlxreg-io b/Documentation/ABI/stable/sysfs-driver-mlxreg-io
index 156319fc5b80..8ca498447aeb 100644
--- a/Documentation/ABI/stable/sysfs-driver-mlxreg-io
+++ b/Documentation/ABI/stable/sysfs-driver-mlxreg-io
@@ -1,5 +1,4 @@
-What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
-							asic_health
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/asic_health
 
 Date:		June 2018
 KernelVersion:	4.19
@@ -9,9 +8,8 @@ Description:	This file shows ASIC health status. The possible values are:
 
 		The files are read only.
 
-What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
-							cpld1_version
-							cpld2_version
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld1_version
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld2_version
 Date:		June 2018
 KernelVersion:	4.19
 Contact:	Vadim Pasternak <vadimpmellanox.com>
@@ -20,8 +18,7 @@ Description:	These files show with which CPLD versions have been burned
 
 		The files are read only.
 
-What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
-							fan_dir
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/fan_dir
 
 Date:		December 2018
 KernelVersion:	5.0
@@ -32,8 +29,7 @@ Description:	This file shows the system fans direction:
 
 		The files are read only.
 
-What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
-							jtag_enable
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/jtag_enable
 
 Date:		November 2018
 KernelVersion:	5.0
@@ -43,8 +39,7 @@ Description:	These files show with which CPLD versions have been burned
 
 		The files are read only.
 
-What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
-							jtag_enable
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/jtag_enable
 
 Date:		November 2018
 KernelVersion:	5.0
@@ -87,16 +82,15 @@ Description:	These files allow asserting system power cycling, switching
 
 		The files are write only.
 
-What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
-							reset_aux_pwr_or_ref
-							reset_asic_thermal
-							reset_hotswap_or_halt
-							reset_hotswap_or_wd
-							reset_fw_reset
-							reset_long_pb
-							reset_main_pwr_fail
-							reset_short_pb
-							reset_sw_reset
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_aux_pwr_or_ref
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_asic_thermal
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_hotswap_or_halt
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_hotswap_or_wd
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_fw_reset
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_long_pb
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_main_pwr_fail
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_short_pb
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sw_reset
 Date:		June 2018
 KernelVersion:	4.19
 Contact:	Vadim Pasternak <vadimpmellanox.com>
@@ -110,11 +104,10 @@ Description:	These files show the system reset cause, as following: power
 
 		The files are read only.
 
-What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
-						reset_comex_pwr_fail
-						reset_from_comex
-						reset_system
-						reset_voltmon_upgrade_fail
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_pwr_fail
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_from_comex
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_system
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_voltmon_upgrade_fail
 
 Date:		November 2018
 KernelVersion:	5.0
@@ -127,3 +120,23 @@ Description:	These files show the system reset cause, as following: ComEx
 		the last reset cause.
 
 		The files are read only.
+
+Date:		June 2019
+KernelVersion:	5.3
+Contact:	Vadim Pasternak <vadimpmellanox.com>
+Description:	These files show the system reset cause, as following:
+		COMEX thermal shutdown; wathchdog power off or reset was derived
+		by one of the next components: COMEX, switch board or by Small Form
+		Factor mezzanine, reset requested from ASIC, reset cuased by BIOS
+		reload. Value 1 in file means this is reset cause, 0 - otherwise.
+		Only one of the above causes could be 1 at the same time, representing
+		only last reset cause.
+
+		The files are read only.
+
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_thermal
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_wd
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_from_asic
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_reload_bios
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sff_wd
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_swb_wd
diff --git a/Documentation/ABI/testing/debugfs-cec-error-inj b/Documentation/ABI/testing/debugfs-cec-error-inj
index 122b65c5fe62..4c3596c6d25b 100644
--- a/Documentation/ABI/testing/debugfs-cec-error-inj
+++ b/Documentation/ABI/testing/debugfs-cec-error-inj
@@ -1,6 +1,6 @@
 What:		/sys/kernel/debug/cec/*/error-inj
 Date:		March 2018
-Contact:	Hans Verkuil <hans.verkuil@cisco.com>
+Contact:	Hans Verkuil <hverkuil-cisco@xs4all.nl>
 Description:
 
 The CEC Framework allows for CEC error injection commands through
diff --git a/Documentation/ABI/testing/debugfs-cros-ec b/Documentation/ABI/testing/debugfs-cros-ec
new file mode 100644
index 000000000000..1fe0add99a2a
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-cros-ec
@@ -0,0 +1,56 @@
+What:		/sys/kernel/debug/<cros-ec-device>/console_log
+Date:		September 2017
+KernelVersion:	4.13
+Description:
+		If the EC supports the CONSOLE_READ command type, this file
+		can be used to grab the EC logs. The kernel polls for the log
+		and keeps its own buffer but userspace should grab this and
+		write it out to some logs.
+
+What:		/sys/kernel/debug/<cros-ec-device>/panicinfo
+Date:		September 2017
+KernelVersion:	4.13
+Description:
+		This file dumps the EC panic information from the previous
+		reboot. This file will only exist if the PANIC_INFO command
+		type is supported by the EC.
+
+What:		/sys/kernel/debug/<cros-ec-device>/pdinfo
+Date:		June 2018
+KernelVersion:	4.17
+Description:
+		This file provides the port role, muxes and power debug
+		information for all the USB PD/type-C ports available. If
+		the are no ports available, this file will be just an empty
+		file.
+
+What:		/sys/kernel/debug/<cros-ec-device>/uptime
+Date:		June 2019
+KernelVersion:	5.3
+Description:
+		A u32 providing the time since EC booted in ms. This is
+		is used for synchronizing the AP host time with the EC
+		log. An error is returned if the command is not supported
+		by the EC or there is a communication problem.
+
+What:		/sys/kernel/debug/<cros-ec-device>/last_resume_result
+Date:		June 2019
+KernelVersion:	5.3
+Description:
+		Some ECs have a feature where they will track transitions to
+		the (Intel) processor's SLP_S0 line, in order to detect cases
+		where a system failed to go into S0ix. When the system resumes,
+		an EC with this feature will return a summary of SLP_S0
+		transitions that occurred. The last_resume_result file returns
+		the most recent response from the AP's resume message to the EC.
+
+		The bottom 31 bits contain a count of the number of SLP_S0
+		transitions that occurred since the suspend message was
+		received. Bit 31 is set if the EC attempted to wake the
+		system due to a timeout when watching for SLP_S0 transitions.
+		Callers can use this to detect a wake from the EC due to
+		S0ix timeouts. The result will be zero if no suspend
+		transitions have been attempted, or the EC does not support
+		this feature.
+
+		Output will be in the format: "0x%08x\n".
diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index 2f5b80be07a3..f0ac14b70ecb 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -3,7 +3,10 @@ Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        oded.gabbay@gmail.com
 Description:    Sets the device address to be used for read or write through
-                PCI bar. The acceptable value is a string that starts with "0x"
+                PCI bar, or the device VA of a host mapped memory to be read or
+                written directly from the host. The latter option is allowed
+                only when the IOMMU is disabled.
+                The acceptable value is a string that starts with "0x"
 
 What:           /sys/kernel/debug/habanalabs/hl<n>/command_buffers
 Date:           Jan 2019
@@ -33,10 +36,12 @@ Contact:        oded.gabbay@gmail.com
 Description:    Allows the root user to read or write directly through the
                 device's PCI bar. Writing to this file generates a write
                 transaction while reading from the file generates a read
-                transcation. This custom interface is needed (instead of using
+                transaction. This custom interface is needed (instead of using
                 the generic Linux user-space PCI mapping) because the DDR bar
                 is very small compared to the DDR memory and only the driver can
-                move the bar before and after the transaction
+                move the bar before and after the transaction.
+                If the IOMMU is disabled, it also allows the root user to read
+                or write from the host a device VA of a host mapped memory
 
 What:           /sys/kernel/debug/habanalabs/hl<n>/device
 Date:           Jan 2019
@@ -46,6 +51,13 @@ Description:    Enables the root user to set the device to specific state.
                 Valid values are "disable", "enable", "suspend", "resume".
                 User can read this property to see the valid values
 
+What:           /sys/kernel/debug/habanalabs/hl<n>/engines
+Date:           Jul 2019
+KernelVersion:  5.3
+Contact:        oded.gabbay@gmail.com
+Description:    Displays the status registers values of the device engines and
+                their derived idle status
+
 What:           /sys/kernel/debug/habanalabs/hl<n>/i2c_addr
 Date:           Jan 2019
 KernelVersion:  5.1
diff --git a/Documentation/ABI/testing/debugfs-wilco-ec b/Documentation/ABI/testing/debugfs-wilco-ec
index 73a5a66ddca6..9d8d9d2def5b 100644
--- a/Documentation/ABI/testing/debugfs-wilco-ec
+++ b/Documentation/ABI/testing/debugfs-wilco-ec
@@ -23,11 +23,9 @@ Description:
 
 		For writing, bytes 0-1 indicate the message type, one of enum
 		wilco_ec_msg_type. Byte 2+ consist of the data passed in the
-		request, starting at MBOX[0]
-
-		At least three bytes are required for writing, two for the type
-		and at least a single byte of data. Only the first
-		EC_MAILBOX_DATA_SIZE bytes of MBOX will be used.
+		request, starting at MBOX[0]. At least three bytes are required
+		for writing, two for the type and at least a single byte of
+		data.
 
 		Example:
 		// Request EC info type 3 (EC firmware build date)
@@ -40,7 +38,7 @@ Description:
 		$ cat /sys/kernel/debug/wilco_ec/raw
 		00 00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00  ..12/21/18.8...
 
-		Note that the first 32 bytes of the received MBOX[] will be
-		printed, even if some of the data is junk. It is up to you to
-		know how many of the first bytes of data are the actual
-		response.
+		Note that the first 16 bytes of the received MBOX[] will be
+		printed, even if some of the data is junk, and skipping bytes
+		17 to 32. It is up to you to know how many of the first bytes of
+		data are the actual response.
diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy
index 74c6702de74e..fc376a323908 100644
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@@ -24,11 +24,11 @@ Description:
 				[euid=] [fowner=] [fsname=]]
 			lsm:	[[subj_user=] [subj_role=] [subj_type=]
 				 [obj_user=] [obj_role=] [obj_type=]]
-			option:	[[appraise_type=]] [permit_directio]
-
+			option:	[[appraise_type=]] [template=] [permit_directio]
 		base: 	func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
 				[FIRMWARE_CHECK]
 				[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
+				[KEXEC_CMDLINE]
 			mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
 			       [[^]MAY_EXEC]
 			fsmagic:= hex value
@@ -38,6 +38,8 @@ Description:
 			fowner:= decimal value
 		lsm:  	are LSM specific
 		option:	appraise_type:= [imasig]
+			template:= name of a defined IMA template type
+			(eg, ima-ng). Only valid when action is "measure".
 			pcr:= decimal value
 
 		default policy:
diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats
index abac31d216de..2c44b4f1b060 100644
--- a/Documentation/ABI/testing/procfs-diskstats
+++ b/Documentation/ABI/testing/procfs-diskstats
@@ -29,4 +29,4 @@ Description:
 		17 - sectors discarded
 		18 - time spent discarding
 
-		For more details refer to Documentation/iostats.txt
+		For more details refer to Documentation/admin-guide/iostats.rst
diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup
index 0a54ed0d63c9..274df44d8b1b 100644
--- a/Documentation/ABI/testing/procfs-smaps_rollup
+++ b/Documentation/ABI/testing/procfs-smaps_rollup
@@ -3,18 +3,28 @@ Date:		August 2017
 Contact:	Daniel Colascione <dancol@google.com>
 Description:
 		This file provides pre-summed memory information for a
-		process.  The format is identical to /proc/pid/smaps,
+		process.  The format is almost identical to /proc/pid/smaps,
 		except instead of an entry for each VMA in a process,
 		smaps_rollup has a single entry (tagged "[rollup]")
 		for which each field is the sum of the corresponding
 		fields from all the maps in /proc/pid/smaps.
-		For more details, see the procfs man page.
+		Additionally, the fields Pss_Anon, Pss_File and Pss_Shmem
+		are not present in /proc/pid/smaps.  These fields represent
+		the sum of the Pss field of each type (anon, file, shmem).
+		For more details, see Documentation/filesystems/proc.txt
+		and the procfs man page.
 
 		Typical output looks like this:
 
 		00100000-ff709000 ---p 00000000 00:00 0		 [rollup]
+		Size:               1192 kB
+		KernelPageSize:        4 kB
+		MMUPageSize:           4 kB
 		Rss:		     884 kB
 		Pss:		     385 kB
+		Pss_Anon:	     301 kB
+		Pss_File:	      80 kB
+		Pss_Shmem:	       4 kB
 		Shared_Clean:	     696 kB
 		Shared_Dirty:	       0 kB
 		Private_Clean:	     120 kB
diff --git a/Documentation/ABI/testing/pstore b/Documentation/ABI/testing/pstore
index 5fca9f5e10a3..d45209abdb1b 100644
--- a/Documentation/ABI/testing/pstore
+++ b/Documentation/ABI/testing/pstore
@@ -1,6 +1,6 @@
-Where:		/sys/fs/pstore/... (or /dev/pstore/...)
+What:		/sys/fs/pstore/... (or /dev/pstore/...)
 Date:		March 2011
-Kernel Version: 2.6.39
+KernelVersion: 2.6.39
 Contact:	tony.luck@intel.com
 Description:	Generic interface to platform dependent persistent storage.
 
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index dfad7427817c..f8c7c7126bb1 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -15,7 +15,7 @@ Description:
 		 9 - I/Os currently in progress
 		10 - time spent doing I/Os (ms)
 		11 - weighted time spent doing I/Os (ms)
-		For more details refer Documentation/iostats.txt
+		For more details refer Documentation/admin-guide/iostats.rst
 
 
 What:		/sys/block/<disk>/<part>/stat
diff --git a/Documentation/ABI/testing/sysfs-block-device b/Documentation/ABI/testing/sysfs-block-device
index 82ef6eab042d..17f2bc7dd261 100644
--- a/Documentation/ABI/testing/sysfs-block-device
+++ b/Documentation/ABI/testing/sysfs-block-device
@@ -45,7 +45,7 @@ Description:
 		- Values below -2 are rejected with -EINVAL
 
 		For more information, see
-		Documentation/laptops/disk-shock-protection.txt
+		Documentation/admin-guide/laptops/disk-shock-protection.rst
 
 
 What:		/sys/block/*/device/ncq_prio_enable
diff --git a/Documentation/ABI/testing/sysfs-bus-css b/Documentation/ABI/testing/sysfs-bus-css
index 2979c40c10e9..966f8504bd7b 100644
--- a/Documentation/ABI/testing/sysfs-bus-css
+++ b/Documentation/ABI/testing/sysfs-bus-css
@@ -33,3 +33,26 @@ Description:	Contains the PIM/PAM/POM values, as reported by the
 		in sync with the values current in the channel subsystem).
 		Note: This is an I/O-subchannel specific attribute.
 Users:		s390-tools, HAL
+
+What:		/sys/bus/css/devices/.../driver_override
+Date:		June 2019
+Contact:	Cornelia Huck <cohuck@redhat.com>
+		linux-s390@vger.kernel.org
+Description:	This file allows the driver for a device to be specified. When
+		specified, only a driver with a name matching the value written
+		to driver_override will have an opportunity to bind to the
+		device. The override is specified by writing a string to the
+		driver_override file (echo vfio-ccw > driver_override) and
+		may be cleared with an empty string (echo > driver_override).
+		This returns the device to standard matching rules binding.
+		Writing to driver_override does not automatically unbind the
+		device from its current driver or make any attempt to
+		automatically load the specified driver.  If no driver with a
+		matching name is currently loaded in the kernel, the device
+		will not bind to any driver.  This also allows devices to
+		opt-out of driver binding using a driver_override name such as
+		"none".  Only a single driver may be specified in the override,
+		there is no support for parsing delimiters.
+		Note that unlike the mechanism of the same name for pci, this
+		file does not allow to override basic matching rules. I.e.,
+		the driver must still match the subchannel type of the device.
diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-format b/Documentation/ABI/testing/sysfs-bus-event_source-devices-format
index 77f47ff5ee02..5bb793ec926c 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-format
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-format
@@ -1,6 +1,6 @@
-Where:		/sys/bus/event_source/devices/<dev>/format
+What:		/sys/bus/event_source/devices/<dev>/format
 Date:		January 2012
-Kernel Version: 3.3
+KernelVersion: 3.3
 Contact:	Jiri Olsa <jolsa@redhat.com>
 Description:
 		Attribute group to describe the magic bits that go into
diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-hm6352 b/Documentation/ABI/testing/sysfs-bus-i2c-devices-hm6352
index feb2e4a87075..4a251b7f11e4 100644
--- a/Documentation/ABI/testing/sysfs-bus-i2c-devices-hm6352
+++ b/Documentation/ABI/testing/sysfs-bus-i2c-devices-hm6352
@@ -1,20 +1,20 @@
-Where:		/sys/bus/i2c/devices/.../heading0_input
+What:		/sys/bus/i2c/devices/.../heading0_input
 Date:		April 2010
-Kernel Version: 2.6.36?
+KernelVersion: 2.6.36?
 Contact:	alan.cox@intel.com
 Description:	Reports the current heading from the compass as a floating
 		point value in degrees.
 
-Where:		/sys/bus/i2c/devices/.../power_state
+What:		/sys/bus/i2c/devices/.../power_state
 Date:		April 2010
-Kernel Version: 2.6.36?
+KernelVersion: 2.6.36?
 Contact:	alan.cox@intel.com
 Description:	Sets the power state of the device. 0 sets the device into
 		sleep mode, 1 wakes it up.
 
-Where:		/sys/bus/i2c/devices/.../calibration
+What:		/sys/bus/i2c/devices/.../calibration
 Date:		April 2010
-Kernel Version: 2.6.36?
+KernelVersion: 2.6.36?
 Contact:	alan.cox@intel.com
 Description:	Sets the calibration on or off (1 = on, 0 = off). See the
 		chip data sheet.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 6aef7dbbde44..680451695422 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -61,8 +61,11 @@ What:		/sys/bus/iio/devices/triggerX/sampling_frequency_available
 KernelVersion:	2.6.35
 Contact:	linux-iio@vger.kernel.org
 Description:
-		When the internal sampling clock can only take a small
-		discrete set of values, this file lists those available.
+		When the internal sampling clock can only take a specific set of
+		frequencies, we can specify the available values with:
+		- a small discrete set of values like "0 2 4 6 8"
+		- a range with minimum, step and maximum frequencies like
+		  "[min step max]"
 
 What:		/sys/bus/iio/devices/iio:deviceX/oversampling_ratio
 KernelVersion:	2.6.38
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-cros-ec b/Documentation/ABI/testing/sysfs-bus-iio-cros-ec
index 0e95c2ca105c..6158f831c761 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio-cros-ec
+++ b/Documentation/ABI/testing/sysfs-bus-iio-cros-ec
@@ -18,11 +18,11 @@ Description:
 		values are 'base' and 'lid'.
 
 What:		/sys/bus/iio/devices/iio:deviceX/id
-Date:		Septembre 2017
+Date:		September 2017
 KernelVersion:	4.14
 Contact:	linux-iio@vger.kernel.org
 Description:
-		This attribute is exposed by the CrOS EC legacy accelerometer
-		driver and represents the sensor ID as exposed by the EC. This
-		ID is used by the Android sensor service hardware abstraction
-		layer (sensor HAL) through the Android container on ChromeOS.
+		This attribute is exposed by the CrOS EC sensors driver and
+		represents the sensor ID as exposed by the EC. This ID is used
+		by the Android sensor service hardware abstraction layer (sensor
+		HAL) through the Android container on ChromeOS.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-distance-srf08 b/Documentation/ABI/testing/sysfs-bus-iio-distance-srf08
index 0a1ca1487fa9..a133fd8d081a 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio-distance-srf08
+++ b/Documentation/ABI/testing/sysfs-bus-iio-distance-srf08
@@ -1,4 +1,4 @@
-What		/sys/bus/iio/devices/iio:deviceX/sensor_sensitivity
+What:		/sys/bus/iio/devices/iio:deviceX/sensor_sensitivity
 Date:		January 2017
 KernelVersion:	4.11
 Contact:	linux-iio@vger.kernel.org
@@ -6,7 +6,7 @@ Description:
 		Show or set the gain boost of the amp, from 0-31 range.
 		default 31
 
-What		/sys/bus/iio/devices/iio:deviceX/sensor_max_range
+What:		/sys/bus/iio/devices/iio:deviceX/sensor_max_range
 Date:		January 2017
 KernelVersion:	4.11
 Contact:	linux-iio@vger.kernel.org
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-frequency-adf4371 b/Documentation/ABI/testing/sysfs-bus-iio-frequency-adf4371
new file mode 100644
index 000000000000..302de64cb424
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-frequency-adf4371
@@ -0,0 +1,44 @@
+What:		/sys/bus/iio/devices/iio:deviceX/out_altvoltageY_frequency
+KernelVersion:
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Stores the PLL frequency in Hz for channel Y.
+		Reading returns the actual frequency in Hz.
+		The ADF4371 has an integrated VCO with fundamendal output
+		frequency ranging from 4000000000 Hz 8000000000 Hz.
+
+		out_altvoltage0_frequency:
+			A divide by 1, 2, 4, 8, 16, 32 or circuit generates
+			frequencies from 62500000 Hz to 8000000000 Hz.
+		out_altvoltage1_frequency:
+			This channel duplicates the channel 0 frequency
+		out_altvoltage2_frequency:
+			A frequency doubler generates frequencies from
+			8000000000 Hz to 16000000000 Hz.
+		out_altvoltage3_frequency:
+			A frequency quadrupler generates frequencies from
+			16000000000 Hz to 32000000000 Hz.
+
+		Note: writes to one of the channels will affect the frequency of
+		all the other channels, since it involves changing the VCO
+		fundamental output frequency.
+
+What:		/sys/bus/iio/devices/iio:deviceX/out_altvoltageY_name
+KernelVersion:
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Reading returns the datasheet name for channel Y:
+
+		out_altvoltage0_name: RF8x
+		out_altvoltage1_name: RFAUX8x
+		out_altvoltage2_name: RF16x
+		out_altvoltage3_name: RF32x
+
+What:		/sys/bus/iio/devices/iio:deviceX/out_altvoltageY_powerdown
+KernelVersion:
+Contact:	linux-iio@vger.kernel.org
+Description:
+		This attribute allows the user to power down the PLL and it's
+		RFOut buffers.
+		Writing 1 causes the specified channel to power down.
+		Clearing returns to normal operation.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-proximity-as3935 b/Documentation/ABI/testing/sysfs-bus-iio-proximity-as3935
index 9a17ab5036a4..c59d95346341 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio-proximity-as3935
+++ b/Documentation/ABI/testing/sysfs-bus-iio-proximity-as3935
@@ -1,4 +1,4 @@
-What		/sys/bus/iio/devices/iio:deviceX/in_proximity_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_proximity_input
 Date:		March 2014
 KernelVersion:	3.15
 Contact:	Matt Ranostay <matt.ranostay@konsulko.com>
@@ -6,7 +6,7 @@ Description:
 		Get the current distance in meters of storm (1km steps)
 		1000-40000 = distance in meters
 
-What		/sys/bus/iio/devices/iio:deviceX/sensor_sensitivity
+What:		/sys/bus/iio/devices/iio:deviceX/sensor_sensitivity
 Date:		March 2014
 KernelVersion:	3.15
 Contact:	Matt Ranostay <matt.ranostay@konsulko.com>
diff --git a/Documentation/ABI/testing/sysfs-bus-mdio b/Documentation/ABI/testing/sysfs-bus-mdio
deleted file mode 100644
index 491baaf4285f..000000000000
--- a/Documentation/ABI/testing/sysfs-bus-mdio
+++ /dev/null
@@ -1,29 +0,0 @@
-What:		/sys/bus/mdio_bus/devices/.../phy_id
-Date:		November 2012
-KernelVersion:	3.8
-Contact:	netdev@vger.kernel.org
-Description:
-		This attribute contains the 32-bit PHY Identifier as reported
-		by the device during bus enumeration, encoded in hexadecimal.
-		This ID is used to match the device with the appropriate
-		driver.
-
-What:		/sys/bus/mdio_bus/devices/.../phy_interface
-Date:		February 2014
-KernelVersion:	3.15
-Contact:	netdev@vger.kernel.org
-Description:
-		This attribute contains the PHY interface as configured by the
-		Ethernet driver during bus enumeration, encoded in string.
-		This interface mode is used to configure the Ethernet MAC with the
-		appropriate mode for its data lines to the PHY hardware.
-
-What:		/sys/bus/mdio_bus/devices/.../phy_has_fixups
-Date:		February 2014
-KernelVersion:	3.15
-Contact:	netdev@vger.kernel.org
-Description:
-		This attribute contains the boolean value whether a given PHY
-		device has had any "fixup" workaround running on it, encoded as
-		a boolean. This information is provided to help troubleshooting
-		PHY configurations.
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
index 4b0318c99507..3c9a8c4a25eb 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
@@ -9,9 +9,9 @@ errors may be "seen" / reported by the link partner and not the
 problematic endpoint itself (which may report all counters as 0 as it never
 saw any problems).
 
-Where:		/sys/bus/pci/devices/<dev>/aer_dev_correctable
+What:		/sys/bus/pci/devices/<dev>/aer_dev_correctable
 Date:		July 2018
-Kernel Version: 4.19.0
+KernelVersion: 4.19.0
 Contact:	linux-pci@vger.kernel.org, rajatja@google.com
 Description:	List of correctable errors seen and reported by this
 		PCI device using ERR_COR. Note that since multiple errors may
@@ -31,9 +31,9 @@ Header Log Overflow 0
 TOTAL_ERR_COR 2
 -------------------------------------------------------------------------
 
-Where:		/sys/bus/pci/devices/<dev>/aer_dev_fatal
+What:		/sys/bus/pci/devices/<dev>/aer_dev_fatal
 Date:		July 2018
-Kernel Version: 4.19.0
+KernelVersion: 4.19.0
 Contact:	linux-pci@vger.kernel.org, rajatja@google.com
 Description:	List of uncorrectable fatal errors seen and reported by this
 		PCI device using ERR_FATAL. Note that since multiple errors may
@@ -62,9 +62,9 @@ TLP Prefix Blocked Error 0
 TOTAL_ERR_FATAL 0
 -------------------------------------------------------------------------
 
-Where:		/sys/bus/pci/devices/<dev>/aer_dev_nonfatal
+What:		/sys/bus/pci/devices/<dev>/aer_dev_nonfatal
 Date:		July 2018
-Kernel Version: 4.19.0
+KernelVersion: 4.19.0
 Contact:	linux-pci@vger.kernel.org, rajatja@google.com
 Description:	List of uncorrectable nonfatal errors seen and reported by this
 		PCI device using ERR_NONFATAL. Note that since multiple errors
@@ -103,20 +103,20 @@ collectors) that are AER capable. These indicate the number of error messages as
 device, so these counters include them and are thus cumulative of all the error
 messages on the PCI hierarchy originating at that root port.
 
-Where:		/sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_cor
+What:		/sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_cor
 Date:		July 2018
-Kernel Version: 4.19.0
+KernelVersion: 4.19.0
 Contact:	linux-pci@vger.kernel.org, rajatja@google.com
 Description:	Total number of ERR_COR messages reported to rootport.
 
-Where:	    /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_fatal
+What:	    /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_fatal
 Date:		July 2018
-Kernel Version: 4.19.0
+KernelVersion: 4.19.0
 Contact:	linux-pci@vger.kernel.org, rajatja@google.com
 Description:	Total number of ERR_FATAL messages reported to rootport.
 
-Where:	    /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_nonfatal
+What:	    /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_nonfatal
 Date:		July 2018
-Kernel Version: 4.19.0
+KernelVersion: 4.19.0
 Contact:	linux-pci@vger.kernel.org, rajatja@google.com
 Description:	Total number of ERR_NONFATAL messages reported to rootport.
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
index 53d99edd1d75..92a94e1068c2 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
@@ -1,68 +1,68 @@
-Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/model
+What:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/model
 Date:		March 2009
-Kernel Version: 2.6.30
+KernelVersion: 2.6.30
 Contact:	iss_storagedev@hp.com
 Description:	Displays the SCSI INQUIRY page 0 model for logical drive
 		Y of controller X.
 
-Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/rev
+What:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/rev
 Date:		March 2009
-Kernel Version: 2.6.30
+KernelVersion: 2.6.30
 Contact:	iss_storagedev@hp.com
 Description:	Displays the SCSI INQUIRY page 0 revision for logical
 		drive Y of controller X.
 
-Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/unique_id
+What:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/unique_id
 Date:		March 2009
-Kernel Version: 2.6.30
+KernelVersion: 2.6.30
 Contact:	iss_storagedev@hp.com
 Description:	Displays the SCSI INQUIRY page 83 serial number for logical
 		drive Y of controller X.
 
-Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/vendor
+What:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/vendor
 Date:		March 2009
-Kernel Version: 2.6.30
+KernelVersion: 2.6.30
 Contact:	iss_storagedev@hp.com
 Description:	Displays the SCSI INQUIRY page 0 vendor for logical drive
 		Y of controller X.
 
-Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/block:cciss!cXdY
+What:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/block:cciss!cXdY
 Date:		March 2009
-Kernel Version: 2.6.30
+KernelVersion: 2.6.30
 Contact:	iss_storagedev@hp.com
 Description:	A symbolic link to /sys/block/cciss!cXdY
 
-Where:		/sys/bus/pci/devices/<dev>/ccissX/rescan
+What:		/sys/bus/pci/devices/<dev>/ccissX/rescan
 Date:		August 2009
-Kernel Version:	2.6.31
+KernelVersion:	2.6.31
 Contact:	iss_storagedev@hp.com
 Description:	Kicks of a rescan of the controller to discover logical
 		drive topology changes.
 
-Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/lunid
+What:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/lunid
 Date:		August 2009
-Kernel Version: 2.6.31
+KernelVersion: 2.6.31
 Contact:	iss_storagedev@hp.com
 Description:	Displays the 8-byte LUN ID used to address logical
 		drive Y of controller X.
 
-Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/raid_level
+What:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/raid_level
 Date:		August 2009
-Kernel Version: 2.6.31
+KernelVersion: 2.6.31
 Contact:	iss_storagedev@hp.com
 Description:	Displays the RAID level of logical drive Y of
 		controller X.
 
-Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/usage_count
+What:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/usage_count
 Date:		August 2009
-Kernel Version: 2.6.31
+KernelVersion: 2.6.31
 Contact:	iss_storagedev@hp.com
 Description:	Displays the usage count (number of opens) of logical drive Y
 		of controller X.
 
-Where:		/sys/bus/pci/devices/<dev>/ccissX/resettable
+What:		/sys/bus/pci/devices/<dev>/ccissX/resettable
 Date:		February 2011
-Kernel Version:	2.6.38
+KernelVersion:	2.6.38
 Contact:	iss_storagedev@hp.com
 Description:	Value of 1 indicates the controller can honor the reset_devices
 		kernel parameter.  Value of 0 indicates reset_devices cannot be
@@ -71,9 +71,9 @@ Description:	Value of 1 indicates the controller can honor the reset_devices
 		a dump device, as kdump requires resetting the device in order
 		to work reliably.
 
-Where:		/sys/bus/pci/devices/<dev>/ccissX/transport_mode
+What:		/sys/bus/pci/devices/<dev>/ccissX/transport_mode
 Date:		July 2011
-Kernel Version:	3.0
+KernelVersion:	3.0
 Contact:	iss_storagedev@hp.com
 Description:	Value of "simple" indicates that the controller has been placed
 		in "simple mode". Value of "performant" indicates that the
diff --git a/Documentation/ABI/testing/sysfs-bus-siox b/Documentation/ABI/testing/sysfs-bus-siox
index fed7c3765a4e..c2a403f20b90 100644
--- a/Documentation/ABI/testing/sysfs-bus-siox
+++ b/Documentation/ABI/testing/sysfs-bus-siox
@@ -1,6 +1,6 @@
 What:		/sys/bus/siox/devices/siox-X/active
 KernelVersion:	4.16
-Contact:	Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact:	Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
 Description:
 		On reading represents the current state of the bus. If it
 		contains a "0" the bus is stopped and connected devices are
@@ -12,7 +12,7 @@ Description:
 
 What:		/sys/bus/siox/devices/siox-X/device_add
 KernelVersion:	4.16
-Contact:	Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact:	Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
 Description:
 		Write-only file. Write
 
@@ -27,13 +27,13 @@ Description:
 
 What:		/sys/bus/siox/devices/siox-X/device_remove
 KernelVersion:	4.16
-Contact:	Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact:	Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
 Description:
 		Write-only file. A single write removes the last device in the siox chain.
 
 What:		/sys/bus/siox/devices/siox-X/poll_interval_ns
 KernelVersion:	4.16
-Contact:	Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact:	Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
 Description:
 		Defines the interval between two poll cycles in nano seconds.
 		Note this is rounded to jiffies on writing. On reading the current value
@@ -41,33 +41,33 @@ Description:
 
 What:		/sys/bus/siox/devices/siox-X-Y/connected
 KernelVersion:	4.16
-Contact:	Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact:	Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
 Description:
 		Read-only value. "0" means the Yth device on siox bus X isn't "connected" i.e.
 		communication with it is not ensured. "1" signals a working connection.
 
 What:		/sys/bus/siox/devices/siox-X-Y/inbytes
 KernelVersion:	4.16
-Contact:	Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact:	Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
 Description:
 		Read-only value reporting the inbytes value provided to siox-X/device_add
 
 What:		/sys/bus/siox/devices/siox-X-Y/status_errors
 KernelVersion:	4.16
-Contact:	Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact:	Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
 Description:
 		Counts the number of time intervals when the read status byte doesn't yield the
 		expected value.
 
 What:		/sys/bus/siox/devices/siox-X-Y/type
 KernelVersion:	4.16
-Contact:	Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact:	Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
 Description:
 		Read-only value reporting the type value provided to siox-X/device_add.
 
 What:		/sys/bus/siox/devices/siox-X-Y/watchdog
 KernelVersion:	4.16
-Contact:	Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact:	Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
 Description:
 		Read-only value reporting if the watchdog of the siox device is
 		active. "0" means the watchdog is not active and the device is expected to
@@ -75,13 +75,13 @@ Description:
 
 What:		/sys/bus/siox/devices/siox-X-Y/watchdog_errors
 KernelVersion:	4.16
-Contact:	Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact:	Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
 Description:
 		Read-only value reporting the number to time intervals when the
 		watchdog was active.
 
 What:		/sys/bus/siox/devices/siox-X-Y/outbytes
 KernelVersion:	4.16
-Contact:	Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact:	Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
 Description:
 		Read-only value reporting the outbytes value provided to siox-X/device_add.
diff --git a/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg b/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg
index 70d00dfa443d..9ade80f81f96 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg
+++ b/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg
@@ -1,14 +1,14 @@
-Where:		/sys/bus/usb/.../powered
+What:		/sys/bus/usb/.../powered
 Date:		August 2008
-Kernel Version:	2.6.26
+KernelVersion:	2.6.26
 Contact:	Harrison Metzger <harrisonmetz@gmail.com>
 Description:	Controls whether the device's display will powered.
 		A value of 0 is off and a non-zero value is on.
 
-Where:		/sys/bus/usb/.../mode_msb
-Where:		/sys/bus/usb/.../mode_lsb
+What:		/sys/bus/usb/.../mode_msb
+What:		/sys/bus/usb/.../mode_lsb
 Date:		August 2008
-Kernel Version:	2.6.26
+KernelVersion:	2.6.26
 Contact:	Harrison Metzger <harrisonmetz@gmail.com>
 Description:	Controls the devices display mode.
 		For a 6 character display the values are
@@ -16,24 +16,24 @@ Description:	Controls the devices display mode.
 		for an 8 character display the values are
 			MSB 0x08; LSB 0xFF.
 
-Where:		/sys/bus/usb/.../textmode
+What:		/sys/bus/usb/.../textmode
 Date:		August 2008
-Kernel Version:	2.6.26
+KernelVersion:	2.6.26
 Contact:	Harrison Metzger <harrisonmetz@gmail.com>
 Description:	Controls the way the device interprets its text buffer.
 		raw:	each character controls its segment manually
 		hex:	each character is between 0-15
 		ascii:	each character is between '0'-'9' and 'A'-'F'.
 
-Where:		/sys/bus/usb/.../text
+What:		/sys/bus/usb/.../text
 Date:		August 2008
-Kernel Version:	2.6.26
+KernelVersion:	2.6.26
 Contact:	Harrison Metzger <harrisonmetz@gmail.com>
 Description:	The text (or data) for the device to display
 
-Where:		/sys/bus/usb/.../decimals
+What:		/sys/bus/usb/.../decimals
 Date:		August 2008
-Kernel Version:	2.6.26
+KernelVersion:	2.6.26
 Contact:	Harrison Metzger <harrisonmetz@gmail.com>
 Description:	Controls the decimal places on the device.
 		To set the nth decimal place, give this field
diff --git a/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533 b/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533
index 77cf7ac949af..c0e0a9ae7b3d 100644
--- a/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533
+++ b/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533
@@ -4,7 +4,7 @@ KernelVersion:	3.5
 Contact:	Johan Hovold <jhovold@gmail.com>
 Description:
 		Get the ALS output channel used as input in
-		ALS-current-control mode (0, 1), where
+		ALS-current-control mode (0, 1), where:
 
 		0 - out_current0 (backlight 0)
 		1 - out_current1 (backlight 1)
@@ -28,7 +28,7 @@ Date:		April 2012
 KernelVersion:	3.5
 Contact:	Johan Hovold <jhovold@gmail.com>
 Description:
-		Set the brightness-mapping mode (0, 1), where
+		Set the brightness-mapping mode (0, 1), where:
 
 		0 - exponential mode
 		1 - linear mode
@@ -38,7 +38,7 @@ Date:		April 2012
 KernelVersion:	3.5
 Contact:	Johan Hovold <jhovold@gmail.com>
 Description:
-		Set the PWM-input control mask (5 bits), where
+		Set the PWM-input control mask (5 bits), where:
 
 		bit 5 - PWM-input enabled in Zone 4
 		bit 4 - PWM-input enabled in Zone 3
diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl
index bbbabffc682a..7970e3713e70 100644
--- a/Documentation/ABI/testing/sysfs-class-cxl
+++ b/Documentation/ABI/testing/sysfs-class-cxl
@@ -1,6 +1,6 @@
-Note: Attributes that are shared between devices are stored in the directory
-pointed to by the symlink device/.
-Example: The real path of the attribute /sys/class/cxl/afu0.0s/irqs_max is
+Please note that attributes that are shared between devices are stored in
+the directory pointed to by the symlink device/.
+For example, the real path of the attribute /sys/class/cxl/afu0.0s/irqs_max is
 /sys/class/cxl/afu0.0s/device/irqs_max, i.e. /sys/class/cxl/afu0.0/irqs_max.
 
 
diff --git a/Documentation/ABI/testing/sysfs-class-devfreq b/Documentation/ABI/testing/sysfs-class-devfreq
index ee39acacf6f8..01196e19afca 100644
--- a/Documentation/ABI/testing/sysfs-class-devfreq
+++ b/Documentation/ABI/testing/sysfs-class-devfreq
@@ -47,7 +47,7 @@ Description:
 What:		/sys/class/devfreq/.../trans_stat
 Date:		October 2012
 Contact:	MyungJoo Ham <myungjoo.ham@samsung.com>
-Descrtiption:
+Description:
 		This ABI shows the statistics of devfreq behavior on a
 		specific device. It shows the time spent in each state and
 		the number of transitions between states.
diff --git a/Documentation/ABI/testing/sysfs-class-led-driver-lm3533 b/Documentation/ABI/testing/sysfs-class-led-driver-lm3533
index 620ebb3b9baa..e4c89b261546 100644
--- a/Documentation/ABI/testing/sysfs-class-led-driver-lm3533
+++ b/Documentation/ABI/testing/sysfs-class-led-driver-lm3533
@@ -4,7 +4,7 @@ KernelVersion:	3.5
 Contact:	Johan Hovold <jhovold@gmail.com>
 Description:
 		Set the ALS output channel to use as input in
-		ALS-current-control mode (1, 2), where
+		ALS-current-control mode (1, 2), where:
 
 		1 - out_current1
 		2 - out_current2
@@ -22,7 +22,7 @@ Date:		April 2012
 KernelVersion:	3.5
 Contact:	Johan Hovold <jhovold@gmail.com>
 Description:
-		Set the pattern generator fall and rise times (0..7), where
+		Set the pattern generator fall and rise times (0..7), where:
 
 		0 - 2048 us
 		1 - 262 ms
@@ -45,7 +45,7 @@ Date:		April 2012
 KernelVersion:	3.5
 Contact:	Johan Hovold <jhovold@gmail.com>
 Description:
-		Set the brightness-mapping mode (0, 1), where
+		Set the brightness-mapping mode (0, 1), where:
 
 		0 - exponential mode
 		1 - linear mode
@@ -55,7 +55,7 @@ Date:		April 2012
 KernelVersion:	3.5
 Contact:	Johan Hovold <jhovold@gmail.com>
 Description:
-		Set the PWM-input control mask (5 bits), where
+		Set the PWM-input control mask (5 bits), where:
 
 		bit 5 - PWM-input enabled in Zone 4
 		bit 4 - PWM-input enabled in Zone 3
diff --git a/Documentation/ABI/testing/sysfs-class-leds-gt683r b/Documentation/ABI/testing/sysfs-class-leds-gt683r
index e4fae6026e79..6adab27f646e 100644
--- a/Documentation/ABI/testing/sysfs-class-leds-gt683r
+++ b/Documentation/ABI/testing/sysfs-class-leds-gt683r
@@ -5,7 +5,7 @@ Contact:	Janne Kanniainen <janne.kanniainen@gmail.com>
 Description:
 		Set the mode of LEDs. You should notice that changing the mode
 		of one LED will update the mode of its two sibling devices as
-		well.
+		well. Possible values are:
 
 		0 - normal
 		1 - audio
@@ -13,4 +13,4 @@ Description:
 
 		Normal: LEDs are fully on when enabled
 		Audio:  LEDs brightness depends on sound level
-		Breathing: LEDs brightness varies at human breathing rate
-\ No newline at end of file
+		Breathing: LEDs brightness varies at human breathing rate
diff --git a/Documentation/ABI/testing/sysfs-class-net-phydev b/Documentation/ABI/testing/sysfs-class-net-phydev
index 6ebabfb27912..206cbf538b59 100644
--- a/Documentation/ABI/testing/sysfs-class-net-phydev
+++ b/Documentation/ABI/testing/sysfs-class-net-phydev
@@ -11,26 +11,41 @@ Date:		February 2014
 KernelVersion:	3.15
 Contact:	netdev@vger.kernel.org
 Description:
-		Boolean value indicating whether the PHY device has
-		any fixups registered against it (phy_register_fixup)
+		This attribute contains the boolean value whether a given PHY
+		device has had any "fixup" workaround running on it, encoded as
+		a boolean. This information is provided to help troubleshooting
+		PHY configurations.
 
 What:		/sys/class/mdio_bus/<bus>/<device>/phy_id
 Date:		November 2012
 KernelVersion:	3.8
 Contact:	netdev@vger.kernel.org
 Description:
-		32-bit hexadecimal value corresponding to the PHY device's OUI,
-		model and revision number.
+		This attribute contains the 32-bit PHY Identifier as reported
+		by the device during bus enumeration, encoded in hexadecimal.
+		This ID is used to match the device with the appropriate
+		driver.
 
 What:		/sys/class/mdio_bus/<bus>/<device>/phy_interface
 Date:		February 2014
 KernelVersion:	3.15
 Contact:	netdev@vger.kernel.org
 Description:
-		String value indicating the PHY interface, possible
-		values are:.
+		This attribute contains the PHY interface as configured by the
+		Ethernet driver during bus enumeration, encoded in string.
+		This interface mode is used to configure the Ethernet MAC with the
+		appropriate mode for its data lines to the PHY hardware.
+		Possible values are:
 		<empty> (not available), mii, gmii, sgmii, tbi, rev-mii,
 		rmii, rgmii, rgmii-id, rgmii-rxid, rgmii-txid, rtbi, smii
 		xgmii, moca, qsgmii, trgmii, 1000base-x, 2500base-x, rxaui,
 		xaui, 10gbase-kr, unknown
 
+What:		/sys/class/mdio_bus/<bus>/<device>/phy_standalone
+Date:		May 2019
+KernelVersion:	5.3
+Contact:	netdev@vger.kernel.org
+Description:
+		Boolean value indicating whether the PHY device is used in
+		standalone mode, without a net_device associated, by PHYLINK.
+		Attribute created only when this is the case.
diff --git a/Documentation/ABI/testing/sysfs-class-net-qmi b/Documentation/ABI/testing/sysfs-class-net-qmi
index 7122d6264c49..c310db4ccbc2 100644
--- a/Documentation/ABI/testing/sysfs-class-net-qmi
+++ b/Documentation/ABI/testing/sysfs-class-net-qmi
@@ -29,7 +29,7 @@ Contact:	Bjørn Mork <bjorn@mork.no>
 Description:
 		Unsigned integer.
 
-		Write a number ranging from 1 to 127 to add a qmap mux
+		Write a number ranging from 1 to 254 to add a qmap mux
 		based network device, supported by recent Qualcomm based
 		modems.
 
@@ -46,5 +46,5 @@ Contact:	Bjørn Mork <bjorn@mork.no>
 Description:
 		Unsigned integer.
 
-		Write a number ranging from 1 to 127 to delete a previously
+		Write a number ranging from 1 to 254 to delete a previously
 		created qmap mux based network device.
diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power
index b77e30b9014e..27edc06e2495 100644
--- a/Documentation/ABI/testing/sysfs-class-power
+++ b/Documentation/ABI/testing/sysfs-class-power
@@ -376,10 +376,42 @@ Description:
 		supply. Normally this is configured based on the type of
 		connection made (e.g. A configured SDP should output a maximum
 		of 500mA so the input current limit is set to the same value).
+		Use preferably input_power_limit, and for problems that can be
+		solved using power limit use input_current_limit.
 
 		Access: Read, Write
 		Valid values: Represented in microamps
 
+What:		/sys/class/power_supply/<supply_name>/input_voltage_limit
+Date:		May 2019
+Contact:	linux-pm@vger.kernel.org
+Description:
+		This entry configures the incoming VBUS voltage limit currently
+		set in the supply. Normally this is configured based on
+		system-level knowledge or user input (e.g. This is part of the
+		Pixel C's thermal management strategy to effectively limit the
+		input power to 5V when the screen is on to meet Google's skin
+		temperature targets). Note that this feature should not be
+		used for safety critical things.
+		Use preferably input_power_limit, and for problems that can be
+		solved using power limit use input_voltage_limit.
+
+		Access: Read, Write
+		Valid values: Represented in microvolts
+
+What:		/sys/class/power_supply/<supply_name>/input_power_limit
+Date:		May 2019
+Contact:	linux-pm@vger.kernel.org
+Description:
+		This entry configures the incoming power limit currently set
+		in the supply. Normally this is configured based on
+		system-level knowledge or user input. Use preferably this
+		feature to limit the incoming power and use current/voltage
+		limit only for problems that can be solved using power limit.
+
+		Access: Read, Write
+		Valid values: Represented in microwatts
+
 What:		/sys/class/power_supply/<supply_name>/online,
 Date:		May 2007
 Contact:	linux-pm@vger.kernel.org
diff --git a/Documentation/ABI/testing/sysfs-class-power-wilco b/Documentation/ABI/testing/sysfs-class-power-wilco
new file mode 100644
index 000000000000..da1d6ffe5e3c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-power-wilco
@@ -0,0 +1,30 @@
+What:		/sys/class/power_supply/wilco-charger/charge_type
+Date:		April 2019
+KernelVersion:	5.2
+Description:
+		What charging algorithm to use:
+
+		Standard: Fully charges battery at a standard rate.
+		Adaptive: Battery settings adaptively optimized based on
+			typical battery usage pattern.
+		Fast: Battery charges over a shorter period.
+		Trickle: Extends battery lifespan, intended for users who
+			primarily use their Chromebook while connected to AC.
+		Custom: A low and high threshold percentage is specified.
+			Charging begins when level drops below
+			charge_control_start_threshold, and ceases when
+			level is above charge_control_end_threshold.
+
+What:		/sys/class/power_supply/wilco-charger/charge_control_start_threshold
+Date:		April 2019
+KernelVersion:	5.2
+Description:
+		Used when charge_type="Custom", as described above. Measured in
+		percentages. The valid range is [50, 95].
+
+What:		/sys/class/power_supply/wilco-charger/charge_control_end_threshold
+Date:		April 2019
+KernelVersion:	5.2
+Description:
+		Used when charge_type="Custom", as described above. Measured in
+		percentages. The valid range is [55, 100].
diff --git a/Documentation/ABI/testing/sysfs-class-powercap b/Documentation/ABI/testing/sysfs-class-powercap
index db3b3ff70d84..ca491ec4e693 100644
--- a/Documentation/ABI/testing/sysfs-class-powercap
+++ b/Documentation/ABI/testing/sysfs-class-powercap
@@ -5,7 +5,7 @@ Contact:	linux-pm@vger.kernel.org
 Description:
 		The powercap/ class sub directory belongs to the power cap
 		subsystem. Refer to
-		Documentation/power/powercap/powercap.txt for details.
+		Documentation/power/powercap/powercap.rst for details.
 
 What:		/sys/class/powercap/<control type>
 Date:		September 2013
@@ -147,6 +147,6 @@ What:		/sys/class/powercap/.../<power zone>/enabled
 Date:		September 2013
 KernelVersion:	3.13
 Contact:	linux-pm@vger.kernel.org
-Description
+Description:
 		This allows to enable/disable power capping at power zone level.
 		This applies to current power zone and its children.
diff --git a/Documentation/ABI/testing/sysfs-class-switchtec b/Documentation/ABI/testing/sysfs-class-switchtec
index 48cb4c15e430..76c7a661a595 100644
--- a/Documentation/ABI/testing/sysfs-class-switchtec
+++ b/Documentation/ABI/testing/sysfs-class-switchtec
@@ -1,6 +1,6 @@
 switchtec - Microsemi Switchtec PCI Switch Management Endpoint
 
-For details on this subsystem look at Documentation/switchtec.txt.
+For details on this subsystem look at Documentation/driver-api/switchtec.rst.
 
 What: 		/sys/class/switchtec
 Date:		05-Jan-2017
diff --git a/Documentation/ABI/testing/sysfs-class-uwb_rc b/Documentation/ABI/testing/sysfs-class-uwb_rc
index 85f4875d16ac..a0578751c1e3 100644
--- a/Documentation/ABI/testing/sysfs-class-uwb_rc
+++ b/Documentation/ABI/testing/sysfs-class-uwb_rc
@@ -125,12 +125,6 @@ Description:
                 The EUI-48 of this device in colon separated hex
                 octets.
 
-What:           /sys/class/uwb_rc/uwbN/<EUI-48>/BPST
-Date:           July 2008
-KernelVersion:  2.6.27
-Contact:        linux-usb@vger.kernel.org
-Description:
-
 What:           /sys/class/uwb_rc/uwbN/<EUI-48>/IEs
 Date:           July 2008
 KernelVersion:  2.6.27
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 1528239f69b2..5f7d7b14fa44 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -34,7 +34,7 @@ Description:	CPU topology files that describe kernel limits related to
 		present: cpus that have been identified as being present in
 		the system.
 
-		See Documentation/cputopology.txt for more information.
+		See Documentation/admin-guide/cputopology.rst for more information.
 
 
 What:		/sys/devices/system/cpu/probe
@@ -103,7 +103,7 @@ Description:	CPU topology files that describe a logical CPU's relationship
 		thread_siblings_list: human-readable list of cpu#'s hardware
 		threads within the same core as cpu#
 
-		See Documentation/cputopology.txt for more information.
+		See Documentation/admin-guide/cputopology.rst for more information.
 
 
 What:		/sys/devices/system/cpu/cpuidle/current_driver
@@ -137,7 +137,8 @@ Description:	Discover cpuidle policy and mechanism
 		current_governor: (RW) displays current idle policy. Users can
 		switch the governor at runtime by writing to this file.
 
-		See files in Documentation/cpuidle/ for more information.
+		See Documentation/admin-guide/pm/cpuidle.rst and
+		Documentation/driver-api/pm/cpuidle.rst for more information.
 
 
 What:		/sys/devices/system/cpu/cpuX/cpuidle/stateN/name
@@ -538,3 +539,26 @@ Description:	Intel Energy and Performance Bias Hint (EPB)
 
 		This attribute is present for all online CPUs supporting the
 		Intel EPB feature.
+
+What:		/sys/devices/system/cpu/umwait_control
+		/sys/devices/system/cpu/umwait_control/enable_c02
+		/sys/devices/system/cpu/umwait_control/max_time
+Date:		May 2019
+Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:	Umwait control
+
+		enable_c02: Read/write interface to control umwait C0.2 state
+			Read returns C0.2 state status:
+				0: C0.2 is disabled
+				1: C0.2 is enabled
+
+			Write 'y' or '1'  or 'on' to enable C0.2 state.
+			Write 'n' or '0'  or 'off' to disable C0.2 state.
+
+			The interface is case insensitive.
+
+		max_time: Read/write interface to control umwait maximum time
+			  in TSC-quanta that the CPU can reside in either C0.1
+			  or C0.2 state. The time is an unsigned 32-bit number.
+			  Note that a value of zero means there is no limit.
+			  Low order two bits must be zero.
diff --git a/Documentation/ABI/testing/sysfs-driver-altera-cvp b/Documentation/ABI/testing/sysfs-driver-altera-cvp
index 8cde64a71edb..fbd8078fd7ad 100644
--- a/Documentation/ABI/testing/sysfs-driver-altera-cvp
+++ b/Documentation/ABI/testing/sysfs-driver-altera-cvp
@@ -1,6 +1,6 @@
 What:		/sys/bus/pci/drivers/altera-cvp/chkcfg
 Date:		May 2017
-Kernel Version:	4.13
+KernelVersion:	4.13
 Contact:	Anatolij Gustschin <agust@denx.de>
 Description:
 		Contains either 1 or 0 and controls if configuration
diff --git a/Documentation/ABI/testing/sysfs-driver-habanalabs b/Documentation/ABI/testing/sysfs-driver-habanalabs
index 78b2bcf316a3..f433fc6db3c6 100644
--- a/Documentation/ABI/testing/sysfs-driver-habanalabs
+++ b/Documentation/ABI/testing/sysfs-driver-habanalabs
@@ -62,18 +62,20 @@ What:           /sys/class/habanalabs/hl<n>/ic_clk
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        oded.gabbay@gmail.com
-Description:    Allows the user to set the maximum clock frequency of the
-                Interconnect fabric. Writes to this parameter affect the device
-                only when the power management profile is set to "manual" mode.
-                The device IC clock might be set to lower value then the
+Description:    Allows the user to set the maximum clock frequency, in Hz, of
+                the Interconnect fabric. Writes to this parameter affect the
+                device only when the power management profile is set to "manual"
+                mode. The device IC clock might be set to lower value than the
                 maximum. The user should read the ic_clk_curr to see the actual
-                frequency value of the IC
+                frequency value of the IC. This property is valid only for the
+                Goya ASIC family
 
 What:           /sys/class/habanalabs/hl<n>/ic_clk_curr
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        oded.gabbay@gmail.com
-Description:    Displays the current clock frequency of the Interconnect fabric
+Description:    Displays the current clock frequency, in Hz, of the Interconnect
+                fabric. This property is valid only for the Goya ASIC family
 
 What:           /sys/class/habanalabs/hl<n>/infineon_ver
 Date:           Jan 2019
@@ -92,18 +94,20 @@ What:           /sys/class/habanalabs/hl<n>/mme_clk
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        oded.gabbay@gmail.com
-Description:    Allows the user to set the maximum clock frequency of the
-                MME compute engine. Writes to this parameter affect the device
-                only when the power management profile is set to "manual" mode.
-                The device MME clock might be set to lower value then the
+Description:    Allows the user to set the maximum clock frequency, in Hz, of
+                the MME compute engine. Writes to this parameter affect the
+                device only when the power management profile is set to "manual"
+                mode. The device MME clock might be set to lower value than the
                 maximum. The user should read the mme_clk_curr to see the actual
-                frequency value of the MME
+                frequency value of the MME. This property is valid only for the
+                Goya ASIC family
 
 What:           /sys/class/habanalabs/hl<n>/mme_clk_curr
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        oded.gabbay@gmail.com
-Description:    Displays the current clock frequency of the MME compute engine
+Description:    Displays the current clock frequency, in Hz, of the MME compute
+                engine. This property is valid only for the Goya ASIC family
 
 What:           /sys/class/habanalabs/hl<n>/pci_addr
 Date:           Jan 2019
@@ -163,18 +167,20 @@ What:           /sys/class/habanalabs/hl<n>/tpc_clk
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        oded.gabbay@gmail.com
-Description:    Allows the user to set the maximum clock frequency of the
-                TPC compute engines. Writes to this parameter affect the device
-                only when the power management profile is set to "manual" mode.
-                The device TPC clock might be set to lower value then the
+Description:    Allows the user to set the maximum clock frequency, in Hz, of
+                the TPC compute engines. Writes to this parameter affect the
+                device only when the power management profile is set to "manual"
+                mode. The device TPC clock might be set to lower value than the
                 maximum. The user should read the tpc_clk_curr to see the actual
-                frequency value of the TPC
+                frequency value of the TPC. This property is valid only for
+                Goya ASIC family
 
 What:           /sys/class/habanalabs/hl<n>/tpc_clk_curr
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        oded.gabbay@gmail.com
-Description:    Displays the current clock frequency of the TPC compute engines
+Description:    Displays the current clock frequency, in Hz, of the TPC compute
+                engines. This property is valid only for the Goya ASIC family
 
 What:           /sys/class/habanalabs/hl<n>/uboot_ver
 Date:           Jan 2019
diff --git a/Documentation/ABI/testing/sysfs-driver-hid b/Documentation/ABI/testing/sysfs-driver-hid
index 48942cacb0bf..a59533410871 100644
--- a/Documentation/ABI/testing/sysfs-driver-hid
+++ b/Documentation/ABI/testing/sysfs-driver-hid
@@ -1,6 +1,6 @@
-What:		For USB devices	: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/report_descriptor
-		For BT devices	: /sys/class/bluetooth/hci<addr>/<hid-bus>:<vendor-id>:<product-id>.<num>/report_descriptor
-		Symlink		: /sys/class/hidraw/hidraw<num>/device/report_descriptor
+What:		/sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/report_descriptor
+What:		/sys/class/bluetooth/hci<addr>/<hid-bus>:<vendor-id>:<product-id>.<num>/report_descriptor
+What:		/sys/class/hidraw/hidraw<num>/device/report_descriptor
 Date:		Jan 2011
 KernelVersion:	2.0.39
 Contact:	Alan Ott <alan@signal11.us>
@@ -9,9 +9,9 @@ Description:	When read, this file returns the device's raw binary HID
 		This file cannot be written.
 Users:		HIDAPI library (http://www.signal11.us/oss/hidapi)
 
-What:		For USB devices	: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/country
-		For BT devices	: /sys/class/bluetooth/hci<addr>/<hid-bus>:<vendor-id>:<product-id>.<num>/country
-		Symlink		: /sys/class/hidraw/hidraw<num>/device/country
+What:		/sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/country
+What:		/sys/class/bluetooth/hci<addr>/<hid-bus>:<vendor-id>:<product-id>.<num>/country
+What:		/sys/class/hidraw/hidraw<num>/device/country
 Date:		February 2015
 KernelVersion:	3.19
 Contact:	Olivier Gay <ogay@logitech.com>
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone b/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone
index 3ca3971109bf..8f7982c70d72 100644
--- a/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone
+++ b/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone
@@ -5,7 +5,7 @@ Description:	It is possible to switch the dpi setting of the mouse with the
 		press of a button.
 		When read, this file returns the raw number of the actual dpi
 		setting reported by the mouse. This number has to be further
-		processed to receive the real dpi value.
+		processed to receive the real dpi value:
 
 		VALUE DPI
 		1     800
diff --git a/Documentation/ABI/testing/sysfs-driver-ppi b/Documentation/ABI/testing/sysfs-driver-ppi
index 9921ef285899..1a56fc507689 100644
--- a/Documentation/ABI/testing/sysfs-driver-ppi
+++ b/Documentation/ABI/testing/sysfs-driver-ppi
@@ -1,6 +1,6 @@
 What:		/sys/class/tpm/tpmX/ppi/
 Date:		August 2012
-Kernel Version:	3.6
+KernelVersion:	3.6
 Contact:	xiaoyan.zhang@intel.com
 Description:
 		This folder includes the attributes related with PPI (Physical
diff --git a/Documentation/ABI/testing/sysfs-driver-st b/Documentation/ABI/testing/sysfs-driver-st
index ba5d77008a85..88cab66fd77f 100644
--- a/Documentation/ABI/testing/sysfs-driver-st
+++ b/Documentation/ABI/testing/sysfs-driver-st
@@ -1,6 +1,6 @@
 What:		/sys/bus/scsi/drivers/st/debug_flag
 Date:		October 2015
-Kernel Version:	?.?
+KernelVersion:	?.?
 Contact:	shane.seymour@hpe.com
 Description:
 		This file allows you to turn debug output from the st driver
diff --git a/Documentation/ABI/testing/sysfs-driver-wacom b/Documentation/ABI/testing/sysfs-driver-wacom
index 2aa5503ee200..afc48fc163b5 100644
--- a/Documentation/ABI/testing/sysfs-driver-wacom
+++ b/Documentation/ABI/testing/sysfs-driver-wacom
@@ -1,6 +1,6 @@
 What:		/sys/bus/hid/devices/<bus>:<vid>:<pid>.<n>/speed
 Date:		April 2010
-Kernel Version:	2.6.35
+KernelVersion:	2.6.35
 Contact:	linux-bluetooth@vger.kernel.org
 Description:
 		The /sys/bus/hid/devices/<bus>:<vid>:<pid>.<n>/speed file
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index 91822ce25831..dca326e0ee3e 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -243,3 +243,11 @@ Description:
 		 - Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list
 		 - [h] means add/del hot file extension
 		 - [c] means add/del cold file extension
+
+What:		/sys/fs/f2fs/<disk>/unusable
+Date		April 2019
+Contact:	"Daniel Rosenberg" <drosen@google.com>
+Description:
+		If checkpoint=disable, it displays the number of blocks that are unusable.
+                If checkpoint=enable it displays the enumber of blocks that would be unusable
+                if checkpoint=disable were to be set.
diff --git a/Documentation/ABI/testing/sysfs-kernel-fscaps b/Documentation/ABI/testing/sysfs-kernel-fscaps
index 50a3033b5e15..bcff34665192 100644
--- a/Documentation/ABI/testing/sysfs-kernel-fscaps
+++ b/Documentation/ABI/testing/sysfs-kernel-fscaps
@@ -2,7 +2,7 @@ What:		/sys/kernel/fscaps
 Date:		February 2011
 KernelVersion:	2.6.38
 Contact:	Ludwig Nussel <ludwig.nussel@suse.de>
-Description
+Description:
 		Shows whether file system capabilities are honored
 		when executing a binary
 
diff --git a/Documentation/ABI/testing/sysfs-kernel-iommu_groups b/Documentation/ABI/testing/sysfs-kernel-iommu_groups
index 35c64e00b35c..017f5bc3920c 100644
--- a/Documentation/ABI/testing/sysfs-kernel-iommu_groups
+++ b/Documentation/ABI/testing/sysfs-kernel-iommu_groups
@@ -24,3 +24,12 @@ Description:    /sys/kernel/iommu_groups/reserved_regions list IOVA
 		region is described on a single line: the 1st field is
 		the base IOVA, the second is the end IOVA and the third
 		field describes the type of the region.
+
+What:		/sys/kernel/iommu_groups/reserved_regions
+Date: 		June 2019
+KernelVersion:  v5.3
+Contact: 	Eric Auger <eric.auger@redhat.com>
+Description:    In case an RMRR is used only by graphics or USB devices
+		it is now exposed as "direct-relaxable" instead of "direct".
+		In device assignment use case, for instance, those RMRR
+		are considered to be relaxable and safe.
diff --git a/Documentation/ABI/testing/sysfs-kernel-uids b/Documentation/ABI/testing/sysfs-kernel-uids
index 28f14695a852..4182b7061816 100644
--- a/Documentation/ABI/testing/sysfs-kernel-uids
+++ b/Documentation/ABI/testing/sysfs-kernel-uids
@@ -11,4 +11,4 @@ Description:
 		example would be, if User A has shares = 1024 and user
 		B has shares = 2048, User B will get twice the CPU
 		bandwidth user A will. For more details refer
-		Documentation/scheduler/sched-design-CFS.txt
+		Documentation/scheduler/sched-design-CFS.rst
diff --git a/Documentation/ABI/testing/sysfs-kernel-vmcoreinfo b/Documentation/ABI/testing/sysfs-kernel-vmcoreinfo
index 7bd81168e063..1f1087a5f075 100644
--- a/Documentation/ABI/testing/sysfs-kernel-vmcoreinfo
+++ b/Documentation/ABI/testing/sysfs-kernel-vmcoreinfo
@@ -4,7 +4,7 @@ KernelVersion:	2.6.24
 Contact:	Ken'ichi Ohmichi <oomichi@mxs.nes.nec.co.jp>
 		Kexec Mailing List <kexec@lists.infradead.org>
 		Vivek Goyal <vgoyal@redhat.com>
-Description
+Description:
 		Shows physical address and size of vmcoreinfo ELF note.
 		First value contains physical address of note in hex and
 		second value contains the size of note in hex. This ELF
diff --git a/Documentation/ABI/testing/sysfs-platform-asus-laptop b/Documentation/ABI/testing/sysfs-platform-asus-laptop
index cd9d667c3da2..8b0e8205a6a2 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-laptop
+++ b/Documentation/ABI/testing/sysfs-platform-asus-laptop
@@ -31,7 +31,7 @@ Description:
 		To control the LED display, use the following :
 		    echo 0x0T000DDD > /sys/devices/platform/asus_laptop/
 		where T control the 3 letters display, and DDD the 3 digits display.
-		The DDD table can be found in Documentation/laptops/asus-laptop.txt
+		The DDD table can be found in Documentation/admin-guide/laptops/asus-laptop.rst
 
 What:		/sys/devices/platform/asus_laptop/bluetooth
 Date:		January 2007
diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi
index 019e1e29370e..9e99f2909612 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi
@@ -36,3 +36,13 @@ KernelVersion:	3.5
 Contact:	"AceLan Kao" <acelan.kao@canonical.com>
 Description:
 		Resume on lid open. 1 means on, 0 means off.
+
+What:		/sys/devices/platform/<platform>/fan_boost_mode
+Date:		Sep 2019
+KernelVersion:	5.3
+Contact:	"Yurii Pavlovskyi" <yurii.pavlovskyi@gmail.com>
+Description:
+		Fan boost mode:
+			* 0 - normal,
+			* 1 - overboost,
+			* 2 - silent
diff --git a/Documentation/ABI/testing/sysfs-platform-i2c-demux-pinctrl b/Documentation/ABI/testing/sysfs-platform-i2c-demux-pinctrl
index 3c3514815cd5..c394b808be19 100644
--- a/Documentation/ABI/testing/sysfs-platform-i2c-demux-pinctrl
+++ b/Documentation/ABI/testing/sysfs-platform-i2c-demux-pinctrl
@@ -1,7 +1,7 @@
 What:		/sys/devices/platform/<i2c-demux-name>/available_masters
 Date:		January 2016
 KernelVersion:	4.6
-Contact:	Wolfram Sang <wsa@the-dreams.de>
+Contact:	Wolfram Sang <wsa+renesas@sang-engineering.com>
 Description:
 		Reading the file will give you a list of masters which can be
 		selected for a demultiplexed bus. The format is
@@ -12,7 +12,7 @@ Description:
 What:		/sys/devices/platform/<i2c-demux-name>/current_master
 Date:		January 2016
 KernelVersion:	4.6
-Contact:	Wolfram Sang <wsa@the-dreams.de>
+Contact:	Wolfram Sang <wsa+renesas@sang-engineering.com>
 Description:
 		This file selects/shows the active I2C master for a demultiplexed
 		bus. It uses the <index> value from the file 'available_masters'.
diff --git a/Documentation/ABI/testing/sysfs-platform-wilco-ec b/Documentation/ABI/testing/sysfs-platform-wilco-ec
new file mode 100644
index 000000000000..8827a734f933
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-platform-wilco-ec
@@ -0,0 +1,40 @@
+What:		/sys/bus/platform/devices/GOOG000C\:00/boot_on_ac
+Date:		April 2019
+KernelVersion:	5.3
+Description:
+		Boot on AC is a policy which makes the device boot from S5
+		when AC power is connected. This is useful for users who
+		want to run their device headless or with a dock.
+
+		Input should be parseable by kstrtou8() to 0 or 1.
+
+What:          /sys/bus/platform/devices/GOOG000C\:00/build_date
+Date:          May 2019
+KernelVersion: 5.3
+Description:
+               Display Wilco Embedded Controller firmware build date.
+               Output will a MM/DD/YY string.
+
+What:          /sys/bus/platform/devices/GOOG000C\:00/build_revision
+Date:          May 2019
+KernelVersion: 5.3
+Description:
+               Display Wilco Embedded Controller build revision.
+               Output will a version string be similar to the example below:
+               d2592cae0
+
+What:          /sys/bus/platform/devices/GOOG000C\:00/model_number
+Date:          May 2019
+KernelVersion: 5.3
+Description:
+               Display Wilco Embedded Controller model number.
+               Output will a version string be similar to the example below:
+               08B6
+
+What:          /sys/bus/platform/devices/GOOG000C\:00/version
+Date:          May 2019
+KernelVersion: 5.3
+Description:
+               Display Wilco Embedded Controller firmware version.
+               The format of the string is x.y.z. Where x is major, y is minor
+               and z is the build number. For example: 95.00.06
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index 18b7dc929234..3c5130355011 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -300,4 +300,4 @@ Description:
 		attempt.
 
 		Using this sysfs file will override any values that were
-		set using the kernel command line for disk offset.
-\ No newline at end of file
+		set using the kernel command line for disk offset.
diff --git a/Documentation/logo.txt b/Documentation/COPYING-logo
index 296f0f7f67eb..296f0f7f67eb 100644
--- a/Documentation/logo.txt
+++ b/Documentation/COPYING-logo
diff --git a/Documentation/DMA-API-HOWTO.txt b/Documentation/DMA-API-HOWTO.txt
index cb712a02f59f..358d495456d1 100644
--- a/Documentation/DMA-API-HOWTO.txt
+++ b/Documentation/DMA-API-HOWTO.txt
@@ -212,7 +212,7 @@ The standard 64-bit addressing device would do something like this::
 
 If the device only supports 32-bit addressing for descriptors in the
 coherent allocations, but supports full 64-bits for streaming mappings
-it would look like this:
+it would look like this::
 
 	if (dma_set_mask(dev, DMA_BIT_MASK(64))) {
 		dev_warn(dev, "mydev: No suitable DMA available\n");
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 0076150fdccb..e47c63bd4887 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -198,7 +198,7 @@ call to set the mask to the value returned.
 ::
 
 	size_t
-	dma_direct_max_mapping_size(struct device *dev);
+	dma_max_mapping_size(struct device *dev);
 
 Returns the maximum size of a mapping for the device. The size parameter
 of the mapping functions like dma_map_single(), dma_map_page() and
diff --git a/Documentation/EDID/HOWTO.txt b/Documentation/EDID/HOWTO.txt
deleted file mode 100644
index 539871c3b785..000000000000
--- a/Documentation/EDID/HOWTO.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-In the good old days when graphics parameters were configured explicitly
-in a file called xorg.conf, even broken hardware could be managed.
-
-Today, with the advent of Kernel Mode Setting, a graphics board is
-either correctly working because all components follow the standards -
-or the computer is unusable, because the screen remains dark after
-booting or it displays the wrong area. Cases when this happens are:
-- The graphics board does not recognize the monitor.
-- The graphics board is unable to detect any EDID data.
-- The graphics board incorrectly forwards EDID data to the driver.
-- The monitor sends no or bogus EDID data.
-- A KVM sends its own EDID data instead of querying the connected monitor.
-Adding the kernel parameter "nomodeset" helps in most cases, but causes
-restrictions later on.
-
-As a remedy for such situations, the kernel configuration item
-CONFIG_DRM_LOAD_EDID_FIRMWARE was introduced. It allows to provide an
-individually prepared or corrected EDID data set in the /lib/firmware
-directory from where it is loaded via the firmware interface. The code
-(see drivers/gpu/drm/drm_edid_load.c) contains built-in data sets for
-commonly used screen resolutions (800x600, 1024x768, 1280x1024, 1600x1200,
-1680x1050, 1920x1080) as binary blobs, but the kernel source tree does
-not contain code to create these data. In order to elucidate the origin
-of the built-in binary EDID blobs and to facilitate the creation of
-individual data for a specific misbehaving monitor, commented sources
-and a Makefile environment are given here.
-
-To create binary EDID and C source code files from the existing data
-material, simply type "make".
-
-If you want to create your own EDID file, copy the file 1024x768.S,
-replace the settings with your own data and add a new target to the
-Makefile. Please note that the EDID data structure expects the timing
-values in a different way as compared to the standard X11 format.
-
-X11:
-HTimings:  hdisp hsyncstart hsyncend htotal
-VTimings:  vdisp vsyncstart vsyncend vtotal
-
-EDID:
-#define XPIX hdisp
-#define XBLANK htotal-hdisp
-#define XOFFSET hsyncstart-hdisp
-#define XPULSE hsyncend-hsyncstart
-
-#define YPIX vdisp
-#define YBLANK vtotal-vdisp
-#define YOFFSET vsyncstart-vdisp
-#define YPULSE vsyncend-vsyncstart
diff --git a/Documentation/Kconfig b/Documentation/Kconfig
new file mode 100644
index 000000000000..66046fa1c341
--- /dev/null
+++ b/Documentation/Kconfig
@@ -0,0 +1,13 @@
+config WARN_MISSING_DOCUMENTS
+
+	bool "Warn if there's a missing documentation file"
+	depends on COMPILE_TEST
+	help
+	   It is not uncommon that a document gets renamed.
+	   This option makes the Kernel to check for missing dependencies,
+	   warning when something is missing. Works only if the Kernel
+	   is built from a git tree.
+
+	   If unsure, select 'N'.
+
+
diff --git a/Documentation/Makefile b/Documentation/Makefile
index e889e7cb8511..e145e4db508b 100644
--- a/Documentation/Makefile
+++ b/Documentation/Makefile
@@ -4,6 +4,11 @@
 
 subdir-y := devicetree/bindings/
 
+# Check for broken documentation file references
+ifeq ($(CONFIG_WARN_MISSING_DOCUMENTS),y)
+$(shell $(srctree)/scripts/documentation-file-ref-check --warn)
+endif
+
 # You can set these variables from the command line.
 SPHINXBUILD   = sphinx-build
 SPHINXOPTS    =
@@ -23,11 +28,13 @@ ifeq ($(HAVE_SPHINX),0)
 .DEFAULT:
 	$(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.)
 	@echo
-	@./scripts/sphinx-pre-install
+	@$(srctree)/scripts/sphinx-pre-install
 	@echo "  SKIP    Sphinx $@ target."
 
 else # HAVE_SPHINX
 
+export SPHINXOPTS = $(shell perl -e 'open IN,"sphinx-build --version 2>&1 |"; while (<IN>) { if (m/([\d\.]+)/) { print "-jauto" if ($$1 >= "1.7") } ;} close IN')
+
 # User-friendly check for pdflatex and latexmk
 HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
 HAVE_LATEXMK := $(shell if which latexmk >/dev/null 2>&1; then echo 1; else echo 0; fi)
@@ -70,12 +77,14 @@ quiet_cmd_sphinx = SPHINX  $@ --> file://$(abspath $(BUILDDIR)/$3/$4)
 	$(abspath $(BUILDDIR)/$3/$4)
 
 htmldocs:
+	@$(srctree)/scripts/sphinx-pre-install --version-check
 	@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var)))
 
 linkcheckdocs:
 	@$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var)))
 
 latexdocs:
+	@$(srctree)/scripts/sphinx-pre-install --version-check
 	@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var)))
 
 ifeq ($(HAVE_PDFLATEX),0)
@@ -87,14 +96,17 @@ pdfdocs:
 else # HAVE_PDFLATEX
 
 pdfdocs: latexdocs
+	@$(srctree)/scripts/sphinx-pre-install --version-check
 	$(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX="$(PDFLATEX)" LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
 
 endif # HAVE_PDFLATEX
 
 epubdocs:
+	@$(srctree)/scripts/sphinx-pre-install --version-check
 	@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var)))
 
 xmldocs:
+	@$(srctree)/scripts/sphinx-pre-install --version-check
 	@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var)))
 
 endif # HAVE_SPHINX
diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
deleted file mode 100644
index 618e13d5e276..000000000000
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ /dev/null
@@ -1,270 +0,0 @@
-		The MSI Driver Guide HOWTO
-	Tom L Nguyen tom.l.nguyen@intel.com
-			10/03/2003
-	Revised Feb 12, 2004 by Martine Silbermann
-		email: Martine.Silbermann@hp.com
-	Revised Jun 25, 2004 by Tom L Nguyen
-	Revised Jul  9, 2008 by Matthew Wilcox <willy@linux.intel.com>
-		Copyright 2003, 2008 Intel Corporation
-
-1. About this guide
-
-This guide describes the basics of Message Signaled Interrupts (MSIs),
-the advantages of using MSI over traditional interrupt mechanisms, how
-to change your driver to use MSI or MSI-X and some basic diagnostics to
-try if a device doesn't support MSIs.
-
-
-2. What are MSIs?
-
-A Message Signaled Interrupt is a write from the device to a special
-address which causes an interrupt to be received by the CPU.
-
-The MSI capability was first specified in PCI 2.2 and was later enhanced
-in PCI 3.0 to allow each interrupt to be masked individually.  The MSI-X
-capability was also introduced with PCI 3.0.  It supports more interrupts
-per device than MSI and allows interrupts to be independently configured.
-
-Devices may support both MSI and MSI-X, but only one can be enabled at
-a time.
-
-
-3. Why use MSIs?
-
-There are three reasons why using MSIs can give an advantage over
-traditional pin-based interrupts.
-
-Pin-based PCI interrupts are often shared amongst several devices.
-To support this, the kernel must call each interrupt handler associated
-with an interrupt, which leads to reduced performance for the system as
-a whole.  MSIs are never shared, so this problem cannot arise.
-
-When a device writes data to memory, then raises a pin-based interrupt,
-it is possible that the interrupt may arrive before all the data has
-arrived in memory (this becomes more likely with devices behind PCI-PCI
-bridges).  In order to ensure that all the data has arrived in memory,
-the interrupt handler must read a register on the device which raised
-the interrupt.  PCI transaction ordering rules require that all the data
-arrive in memory before the value may be returned from the register.
-Using MSIs avoids this problem as the interrupt-generating write cannot
-pass the data writes, so by the time the interrupt is raised, the driver
-knows that all the data has arrived in memory.
-
-PCI devices can only support a single pin-based interrupt per function.
-Often drivers have to query the device to find out what event has
-occurred, slowing down interrupt handling for the common case.  With
-MSIs, a device can support more interrupts, allowing each interrupt
-to be specialised to a different purpose.  One possible design gives
-infrequent conditions (such as errors) their own interrupt which allows
-the driver to handle the normal interrupt handling path more efficiently.
-Other possible designs include giving one interrupt to each packet queue
-in a network card or each port in a storage controller.
-
-
-4. How to use MSIs
-
-PCI devices are initialised to use pin-based interrupts.  The device
-driver has to set up the device to use MSI or MSI-X.  Not all machines
-support MSIs correctly, and for those machines, the APIs described below
-will simply fail and the device will continue to use pin-based interrupts.
-
-4.1 Include kernel support for MSIs
-
-To support MSI or MSI-X, the kernel must be built with the CONFIG_PCI_MSI
-option enabled.  This option is only available on some architectures,
-and it may depend on some other options also being set.  For example,
-on x86, you must also enable X86_UP_APIC or SMP in order to see the
-CONFIG_PCI_MSI option.
-
-4.2 Using MSI
-
-Most of the hard work is done for the driver in the PCI layer.  The driver
-simply has to request that the PCI layer set up the MSI capability for this
-device.
-
-To automatically use MSI or MSI-X interrupt vectors, use the following
-function:
-
-  int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
-		unsigned int max_vecs, unsigned int flags);
-
-which allocates up to max_vecs interrupt vectors for a PCI device.  It
-returns the number of vectors allocated or a negative error.  If the device
-has a requirements for a minimum number of vectors the driver can pass a
-min_vecs argument set to this limit, and the PCI core will return -ENOSPC
-if it can't meet the minimum number of vectors.
-
-The flags argument is used to specify which type of interrupt can be used
-by the device and the driver (PCI_IRQ_LEGACY, PCI_IRQ_MSI, PCI_IRQ_MSIX).
-A convenient short-hand (PCI_IRQ_ALL_TYPES) is also available to ask for
-any possible kind of interrupt.  If the PCI_IRQ_AFFINITY flag is set,
-pci_alloc_irq_vectors() will spread the interrupts around the available CPUs.
-
-To get the Linux IRQ numbers passed to request_irq() and free_irq() and the
-vectors, use the following function:
-
-  int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
-
-Any allocated resources should be freed before removing the device using
-the following function:
-
-  void pci_free_irq_vectors(struct pci_dev *dev);
-
-If a device supports both MSI-X and MSI capabilities, this API will use the
-MSI-X facilities in preference to the MSI facilities.  MSI-X supports any
-number of interrupts between 1 and 2048.  In contrast, MSI is restricted to
-a maximum of 32 interrupts (and must be a power of two).  In addition, the
-MSI interrupt vectors must be allocated consecutively, so the system might
-not be able to allocate as many vectors for MSI as it could for MSI-X.  On
-some platforms, MSI interrupts must all be targeted at the same set of CPUs
-whereas MSI-X interrupts can all be targeted at different CPUs.
-
-If a device supports neither MSI-X or MSI it will fall back to a single
-legacy IRQ vector.
-
-The typical usage of MSI or MSI-X interrupts is to allocate as many vectors
-as possible, likely up to the limit supported by the device.  If nvec is
-larger than the number supported by the device it will automatically be
-capped to the supported limit, so there is no need to query the number of
-vectors supported beforehand:
-
-	nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_ALL_TYPES)
-	if (nvec < 0)
-		goto out_err;
-
-If a driver is unable or unwilling to deal with a variable number of MSI
-interrupts it can request a particular number of interrupts by passing that
-number to pci_alloc_irq_vectors() function as both 'min_vecs' and
-'max_vecs' parameters:
-
-	ret = pci_alloc_irq_vectors(pdev, nvec, nvec, PCI_IRQ_ALL_TYPES);
-	if (ret < 0)
-		goto out_err;
-
-The most notorious example of the request type described above is enabling
-the single MSI mode for a device.  It could be done by passing two 1s as
-'min_vecs' and 'max_vecs':
-
-	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
-	if (ret < 0)
-		goto out_err;
-
-Some devices might not support using legacy line interrupts, in which case
-the driver can specify that only MSI or MSI-X is acceptable:
-
-	nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_MSI | PCI_IRQ_MSIX);
-	if (nvec < 0)
-		goto out_err;
-
-4.3 Legacy APIs
-
-The following old APIs to enable and disable MSI or MSI-X interrupts should
-not be used in new code:
-
-  pci_enable_msi()		/* deprecated */
-  pci_disable_msi()		/* deprecated */
-  pci_enable_msix_range()	/* deprecated */
-  pci_enable_msix_exact()	/* deprecated */
-  pci_disable_msix()		/* deprecated */
-
-Additionally there are APIs to provide the number of supported MSI or MSI-X
-vectors: pci_msi_vec_count() and pci_msix_vec_count().  In general these
-should be avoided in favor of letting pci_alloc_irq_vectors() cap the
-number of vectors.  If you have a legitimate special use case for the count
-of vectors we might have to revisit that decision and add a
-pci_nr_irq_vectors() helper that handles MSI and MSI-X transparently.
-
-4.4 Considerations when using MSIs
-
-4.4.1 Spinlocks
-
-Most device drivers have a per-device spinlock which is taken in the
-interrupt handler.  With pin-based interrupts or a single MSI, it is not
-necessary to disable interrupts (Linux guarantees the same interrupt will
-not be re-entered).  If a device uses multiple interrupts, the driver
-must disable interrupts while the lock is held.  If the device sends
-a different interrupt, the driver will deadlock trying to recursively
-acquire the spinlock.  Such deadlocks can be avoided by using
-spin_lock_irqsave() or spin_lock_irq() which disable local interrupts
-and acquire the lock (see Documentation/kernel-hacking/locking.rst).
-
-4.5 How to tell whether MSI/MSI-X is enabled on a device
-
-Using 'lspci -v' (as root) may show some devices with "MSI", "Message
-Signalled Interrupts" or "MSI-X" capabilities.  Each of these capabilities
-has an 'Enable' flag which is followed with either "+" (enabled)
-or "-" (disabled).
-
-
-5. MSI quirks
-
-Several PCI chipsets or devices are known not to support MSIs.
-The PCI stack provides three ways to disable MSIs:
-
-1. globally
-2. on all devices behind a specific bridge
-3. on a single device
-
-5.1. Disabling MSIs globally
-
-Some host chipsets simply don't support MSIs properly.  If we're
-lucky, the manufacturer knows this and has indicated it in the ACPI
-FADT table.  In this case, Linux automatically disables MSIs.
-Some boards don't include this information in the table and so we have
-to detect them ourselves.  The complete list of these is found near the
-quirk_disable_all_msi() function in drivers/pci/quirks.c.
-
-If you have a board which has problems with MSIs, you can pass pci=nomsi
-on the kernel command line to disable MSIs on all devices.  It would be
-in your best interests to report the problem to linux-pci@vger.kernel.org
-including a full 'lspci -v' so we can add the quirks to the kernel.
-
-5.2. Disabling MSIs below a bridge
-
-Some PCI bridges are not able to route MSIs between busses properly.
-In this case, MSIs must be disabled on all devices behind the bridge.
-
-Some bridges allow you to enable MSIs by changing some bits in their
-PCI configuration space (especially the Hypertransport chipsets such
-as the nVidia nForce and Serverworks HT2000).  As with host chipsets,
-Linux mostly knows about them and automatically enables MSIs if it can.
-If you have a bridge unknown to Linux, you can enable
-MSIs in configuration space using whatever method you know works, then
-enable MSIs on that bridge by doing:
-
-       echo 1 > /sys/bus/pci/devices/$bridge/msi_bus
-
-where $bridge is the PCI address of the bridge you've enabled (eg
-0000:00:0e.0).
-
-To disable MSIs, echo 0 instead of 1.  Changing this value should be
-done with caution as it could break interrupt handling for all devices
-below this bridge.
-
-Again, please notify linux-pci@vger.kernel.org of any bridges that need
-special handling.
-
-5.3. Disabling MSIs on a single device
-
-Some devices are known to have faulty MSI implementations.  Usually this
-is handled in the individual device driver, but occasionally it's necessary
-to handle this with a quirk.  Some drivers have an option to disable use
-of MSI.  While this is a convenient workaround for the driver author,
-it is not good practice, and should not be emulated.
-
-5.4. Finding why MSIs are disabled on a device
-
-From the above three sections, you can see that there are many reasons
-why MSIs may not be enabled for a given device.  Your first step should
-be to examine your dmesg carefully to determine whether MSIs are enabled
-for your machine.  You should also check your .config to be sure you
-have enabled CONFIG_PCI_MSI.
-
-Then, 'lspci -t' gives the list of bridges above a device.  Reading
-/sys/bus/pci/devices/*/msi_bus will tell you whether MSIs are enabled (1)
-or disabled (0).  If 0 is found in any of the msi_bus files belonging
-to bridges between the PCI root and the device, MSIs are disabled.
-
-It is also worth checking the device driver to see whether it supports MSIs.
-For example, it may contain calls to pci_irq_alloc_vectors() with the
-PCI_IRQ_MSI or PCI_IRQ_MSIX flags.
diff --git a/Documentation/PCI/PCIEBUS-HOWTO.txt b/Documentation/PCI/PCIEBUS-HOWTO.txt
deleted file mode 100644
index 15f0bb3b5045..000000000000
--- a/Documentation/PCI/PCIEBUS-HOWTO.txt
+++ /dev/null
@@ -1,198 +0,0 @@
-		The PCI Express Port Bus Driver Guide HOWTO
-	Tom L Nguyen tom.l.nguyen@intel.com
-			11/03/2004
-
-1. About this guide
-
-This guide describes the basics of the PCI Express Port Bus driver
-and provides information on how to enable the service drivers to
-register/unregister with the PCI Express Port Bus Driver.
-
-2. Copyright 2004 Intel Corporation
-
-3. What is the PCI Express Port Bus Driver
-
-A PCI Express Port is a logical PCI-PCI Bridge structure. There
-are two types of PCI Express Port: the Root Port and the Switch
-Port. The Root Port originates a PCI Express link from a PCI Express
-Root Complex and the Switch Port connects PCI Express links to
-internal logical PCI buses. The Switch Port, which has its secondary
-bus representing the switch's internal routing logic, is called the
-switch's Upstream Port. The switch's Downstream Port is bridging from
-switch's internal routing bus to a bus representing the downstream
-PCI Express link from the PCI Express Switch.
-
-A PCI Express Port can provide up to four distinct functions,
-referred to in this document as services, depending on its port type.
-PCI Express Port's services include native hotplug support (HP),
-power management event support (PME), advanced error reporting
-support (AER), and virtual channel support (VC). These services may
-be handled by a single complex driver or be individually distributed
-and handled by corresponding service drivers.
-
-4. Why use the PCI Express Port Bus Driver?
-
-In existing Linux kernels, the Linux Device Driver Model allows a
-physical device to be handled by only a single driver. The PCI
-Express Port is a PCI-PCI Bridge device with multiple distinct
-services. To maintain a clean and simple solution each service
-may have its own software service driver. In this case several
-service drivers will compete for a single PCI-PCI Bridge device.
-For example, if the PCI Express Root Port native hotplug service
-driver is loaded first, it claims a PCI-PCI Bridge Root Port. The
-kernel therefore does not load other service drivers for that Root
-Port. In other words, it is impossible to have multiple service
-drivers load and run on a PCI-PCI Bridge device simultaneously
-using the current driver model.
-
-To enable multiple service drivers running simultaneously requires
-having a PCI Express Port Bus driver, which manages all populated
-PCI Express Ports and distributes all provided service requests
-to the corresponding service drivers as required. Some key
-advantages of using the PCI Express Port Bus driver are listed below:
-
-	- Allow multiple service drivers to run simultaneously on
-	  a PCI-PCI Bridge Port device.
-
-	- Allow service drivers implemented in an independent
-	  staged approach.
-
-	- Allow one service driver to run on multiple PCI-PCI Bridge
-	  Port devices.
-
-	- Manage and distribute resources of a PCI-PCI Bridge Port
-	  device to requested service drivers.
-
-5. Configuring the PCI Express Port Bus Driver vs. Service Drivers
-
-5.1 Including the PCI Express Port Bus Driver Support into the Kernel
-
-Including the PCI Express Port Bus driver depends on whether the PCI
-Express support is included in the kernel config. The kernel will
-automatically include the PCI Express Port Bus driver as a kernel
-driver when the PCI Express support is enabled in the kernel.
-
-5.2 Enabling Service Driver Support
-
-PCI device drivers are implemented based on Linux Device Driver Model.
-All service drivers are PCI device drivers. As discussed above, it is
-impossible to load any service driver once the kernel has loaded the
-PCI Express Port Bus Driver. To meet the PCI Express Port Bus Driver
-Model requires some minimal changes on existing service drivers that
-imposes no impact on the functionality of existing service drivers.
-
-A service driver is required to use the two APIs shown below to
-register its service with the PCI Express Port Bus driver (see
-section 5.2.1 & 5.2.2). It is important that a service driver
-initializes the pcie_port_service_driver data structure, included in
-header file /include/linux/pcieport_if.h, before calling these APIs.
-Failure to do so will result an identity mismatch, which prevents
-the PCI Express Port Bus driver from loading a service driver.
-
-5.2.1 pcie_port_service_register
-
-int pcie_port_service_register(struct pcie_port_service_driver *new)
-
-This API replaces the Linux Driver Model's pci_register_driver API. A
-service driver should always calls pcie_port_service_register at
-module init. Note that after service driver being loaded, calls
-such as pci_enable_device(dev) and pci_set_master(dev) are no longer
-necessary since these calls are executed by the PCI Port Bus driver.
-
-5.2.2 pcie_port_service_unregister
-
-void pcie_port_service_unregister(struct pcie_port_service_driver *new)
-
-pcie_port_service_unregister replaces the Linux Driver Model's
-pci_unregister_driver. It's always called by service driver when a
-module exits.
-
-5.2.3 Sample Code
-
-Below is sample service driver code to initialize the port service
-driver data structure.
-
-static struct pcie_port_service_id service_id[] = { {
-	.vendor = PCI_ANY_ID,
-	.device = PCI_ANY_ID,
-	.port_type = PCIE_RC_PORT,
-	.service_type = PCIE_PORT_SERVICE_AER,
-	}, { /* end: all zeroes */ }
-};
-
-static struct pcie_port_service_driver root_aerdrv = {
-	.name		= (char *)device_name,
-	.id_table	= &service_id[0],
-
-	.probe		= aerdrv_load,
-	.remove		= aerdrv_unload,
-
-	.suspend	= aerdrv_suspend,
-	.resume		= aerdrv_resume,
-};
-
-Below is a sample code for registering/unregistering a service
-driver.
-
-static int __init aerdrv_service_init(void)
-{
-	int retval = 0;
-
-	retval = pcie_port_service_register(&root_aerdrv);
-	if (!retval) {
-		/*
-		 * FIX ME
-		 */
-	}
-	return retval;
-}
-
-static void __exit aerdrv_service_exit(void)
-{
-	pcie_port_service_unregister(&root_aerdrv);
-}
-
-module_init(aerdrv_service_init);
-module_exit(aerdrv_service_exit);
-
-6. Possible Resource Conflicts
-
-Since all service drivers of a PCI-PCI Bridge Port device are
-allowed to run simultaneously, below lists a few of possible resource
-conflicts with proposed solutions.
-
-6.1 MSI and MSI-X Vector Resource
-
-Once MSI or MSI-X interrupts are enabled on a device, it stays in this
-mode until they are disabled again.  Since service drivers of the same
-PCI-PCI Bridge port share the same physical device, if an individual
-service driver enables or disables MSI/MSI-X mode it may result
-unpredictable behavior.
-
-To avoid this situation all service drivers are not permitted to
-switch interrupt mode on its device. The PCI Express Port Bus driver
-is responsible for determining the interrupt mode and this should be
-transparent to service drivers. Service drivers need to know only
-the vector IRQ assigned to the field irq of struct pcie_device, which
-is passed in when the PCI Express Port Bus driver probes each service
-driver. Service drivers should use (struct pcie_device*)dev->irq to
-call request_irq/free_irq. In addition, the interrupt mode is stored
-in the field interrupt_mode of struct pcie_device.
-
-6.3 PCI Memory/IO Mapped Regions
-
-Service drivers for PCI Express Power Management (PME), Advanced
-Error Reporting (AER), Hot-Plug (HP) and Virtual Channel (VC) access
-PCI configuration space on the PCI Express port. In all cases the
-registers accessed are independent of each other. This patch assumes
-that all service drivers will be well behaved and not overwrite
-other service driver's configuration settings.
-
-6.4 PCI Config Registers
-
-Each service driver runs its PCI config operations on its own
-capability structure except the PCI Express capability structure, in
-which Root Control register and Device Control register are shared
-between PME and AER. This patch assumes that all service drivers
-will be well behaved and not overwrite other service driver's
-configuration settings.
diff --git a/Documentation/PCI/acpi-info.rst b/Documentation/PCI/acpi-info.rst
new file mode 100644
index 000000000000..060217081c79
--- /dev/null
+++ b/Documentation/PCI/acpi-info.rst
@@ -0,0 +1,192 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================================
+ACPI considerations for PCI host bridges
+========================================
+
+The general rule is that the ACPI namespace should describe everything the
+OS might use unless there's another way for the OS to find it [1, 2].
+
+For example, there's no standard hardware mechanism for enumerating PCI
+host bridges, so the ACPI namespace must describe each host bridge, the
+method for accessing PCI config space below it, the address space windows
+the host bridge forwards to PCI (using _CRS), and the routing of legacy
+INTx interrupts (using _PRT).
+
+PCI devices, which are below the host bridge, generally do not need to be
+described via ACPI.  The OS can discover them via the standard PCI
+enumeration mechanism, using config accesses to discover and identify
+devices and read and size their BARs.  However, ACPI may describe PCI
+devices if it provides power management or hotplug functionality for them
+or if the device has INTx interrupts connected by platform interrupt
+controllers and a _PRT is needed to describe those connections.
+
+ACPI resource description is done via _CRS objects of devices in the ACPI
+namespace [2].   The _CRS is like a generalized PCI BAR: the OS can read
+_CRS and figure out what resource is being consumed even if it doesn't have
+a driver for the device [3].  That's important because it means an old OS
+can work correctly even on a system with new devices unknown to the OS.
+The new devices might not do anything, but the OS can at least make sure no
+resources conflict with them.
+
+Static tables like MCFG, HPET, ECDT, etc., are *not* mechanisms for
+reserving address space.  The static tables are for things the OS needs to
+know early in boot, before it can parse the ACPI namespace.  If a new table
+is defined, an old OS needs to operate correctly even though it ignores the
+table.  _CRS allows that because it is generic and understood by the old
+OS; a static table does not.
+
+If the OS is expected to manage a non-discoverable device described via
+ACPI, that device will have a specific _HID/_CID that tells the OS what
+driver to bind to it, and the _CRS tells the OS and the driver where the
+device's registers are.
+
+PCI host bridges are PNP0A03 or PNP0A08 devices.  Their _CRS should
+describe all the address space they consume.  This includes all the windows
+they forward down to the PCI bus, as well as registers of the host bridge
+itself that are not forwarded to PCI.  The host bridge registers include
+things like secondary/subordinate bus registers that determine the bus
+range below the bridge, window registers that describe the apertures, etc.
+These are all device-specific, non-architected things, so the only way a
+PNP0A03/PNP0A08 driver can manage them is via _PRS/_CRS/_SRS, which contain
+the device-specific details.  The host bridge registers also include ECAM
+space, since it is consumed by the host bridge.
+
+ACPI defines a Consumer/Producer bit to distinguish the bridge registers
+("Consumer") from the bridge apertures ("Producer") [4, 5], but early
+BIOSes didn't use that bit correctly.  The result is that the current ACPI
+spec defines Consumer/Producer only for the Extended Address Space
+descriptors; the bit should be ignored in the older QWord/DWord/Word
+Address Space descriptors.  Consequently, OSes have to assume all
+QWord/DWord/Word descriptors are windows.
+
+Prior to the addition of Extended Address Space descriptors, the failure of
+Consumer/Producer meant there was no way to describe bridge registers in
+the PNP0A03/PNP0A08 device itself.  The workaround was to describe the
+bridge registers (including ECAM space) in PNP0C02 catch-all devices [6].
+With the exception of ECAM, the bridge register space is device-specific
+anyway, so the generic PNP0A03/PNP0A08 driver (pci_root.c) has no need to
+know about it.  
+
+New architectures should be able to use "Consumer" Extended Address Space
+descriptors in the PNP0A03 device for bridge registers, including ECAM,
+although a strict interpretation of [6] might prohibit this.  Old x86 and
+ia64 kernels assume all address space descriptors, including "Consumer"
+Extended Address Space ones, are windows, so it would not be safe to
+describe bridge registers this way on those architectures.
+
+PNP0C02 "motherboard" devices are basically a catch-all.  There's no
+programming model for them other than "don't use these resources for
+anything else."  So a PNP0C02 _CRS should claim any address space that is
+(1) not claimed by _CRS under any other device object in the ACPI namespace
+and (2) should not be assigned by the OS to something else.
+
+The PCIe spec requires the Enhanced Configuration Access Method (ECAM)
+unless there's a standard firmware interface for config access, e.g., the
+ia64 SAL interface [7].  A host bridge consumes ECAM memory address space
+and converts memory accesses into PCI configuration accesses.  The spec
+defines the ECAM address space layout and functionality; only the base of
+the address space is device-specific.  An ACPI OS learns the base address
+from either the static MCFG table or a _CBA method in the PNP0A03 device.
+
+The MCFG table must describe the ECAM space of non-hot pluggable host
+bridges [8].  Since MCFG is a static table and can't be updated by hotplug,
+a _CBA method in the PNP0A03 device describes the ECAM space of a
+hot-pluggable host bridge [9].  Note that for both MCFG and _CBA, the base
+address always corresponds to bus 0, even if the bus range below the bridge
+(which is reported via _CRS) doesn't start at 0.
+
+
+[1] ACPI 6.2, sec 6.1:
+    For any device that is on a non-enumerable type of bus (for example, an
+    ISA bus), OSPM enumerates the devices' identifier(s) and the ACPI
+    system firmware must supply an _HID object ... for each device to
+    enable OSPM to do that.
+
+[2] ACPI 6.2, sec 3.7:
+    The OS enumerates motherboard devices simply by reading through the
+    ACPI Namespace looking for devices with hardware IDs.
+
+    Each device enumerated by ACPI includes ACPI-defined objects in the
+    ACPI Namespace that report the hardware resources the device could
+    occupy [_PRS], an object that reports the resources that are currently
+    used by the device [_CRS], and objects for configuring those resources
+    [_SRS].  The information is used by the Plug and Play OS (OSPM) to
+    configure the devices.
+
+[3] ACPI 6.2, sec 6.2:
+    OSPM uses device configuration objects to configure hardware resources
+    for devices enumerated via ACPI.  Device configuration objects provide
+    information about current and possible resource requirements, the
+    relationship between shared resources, and methods for configuring
+    hardware resources.
+
+    When OSPM enumerates a device, it calls _PRS to determine the resource
+    requirements of the device.  It may also call _CRS to find the current
+    resource settings for the device.  Using this information, the Plug and
+    Play system determines what resources the device should consume and
+    sets those resources by calling the device’s _SRS control method.
+
+    In ACPI, devices can consume resources (for example, legacy keyboards),
+    provide resources (for example, a proprietary PCI bridge), or do both.
+    Unless otherwise specified, resources for a device are assumed to be
+    taken from the nearest matching resource above the device in the device
+    hierarchy.
+
+[4] ACPI 6.2, sec 6.4.3.5.1, 2, 3, 4:
+    QWord/DWord/Word Address Space Descriptor (.1, .2, .3)
+      General Flags: Bit [0] Ignored
+
+    Extended Address Space Descriptor (.4)
+      General Flags: Bit [0] Consumer/Producer:
+
+        * 1 – This device consumes this resource
+        * 0 – This device produces and consumes this resource
+
+[5] ACPI 6.2, sec 19.6.43:
+    ResourceUsage specifies whether the Memory range is consumed by
+    this device (ResourceConsumer) or passed on to child devices
+    (ResourceProducer).  If nothing is specified, then
+    ResourceConsumer is assumed.
+
+[6] PCI Firmware 3.2, sec 4.1.2:
+    If the operating system does not natively comprehend reserving the
+    MMCFG region, the MMCFG region must be reserved by firmware.  The
+    address range reported in the MCFG table or by _CBA method (see Section
+    4.1.3) must be reserved by declaring a motherboard resource.  For most
+    systems, the motherboard resource would appear at the root of the ACPI
+    namespace (under \_SB) in a node with a _HID of EISAID (PNP0C02), and
+    the resources in this case should not be claimed in the root PCI bus’s
+    _CRS.  The resources can optionally be returned in Int15 E820 or
+    EFIGetMemoryMap as reserved memory but must always be reported through
+    ACPI as a motherboard resource.
+
+[7] PCI Express 4.0, sec 7.2.2:
+    For systems that are PC-compatible, or that do not implement a
+    processor-architecture-specific firmware interface standard that allows
+    access to the Configuration Space, the ECAM is required as defined in
+    this section.
+
+[8] PCI Firmware 3.2, sec 4.1.2:
+    The MCFG table is an ACPI table that is used to communicate the base
+    addresses corresponding to the non-hot removable PCI Segment Groups
+    range within a PCI Segment Group available to the operating system at
+    boot. This is required for the PC-compatible systems.
+
+    The MCFG table is only used to communicate the base addresses
+    corresponding to the PCI Segment Groups available to the system at
+    boot.
+
+[9] PCI Firmware 3.2, sec 4.1.3:
+    The _CBA (Memory mapped Configuration Base Address) control method is
+    an optional ACPI object that returns the 64-bit memory mapped
+    configuration base address for the hot plug capable host bridge. The
+    base address returned by _CBA is processor-relative address. The _CBA
+    control method evaluates to an Integer.
+
+    This control method appears under a host bridge object. When the _CBA
+    method appears under an active host bridge object, the operating system
+    evaluates this structure to identify the memory mapped configuration
+    base address corresponding to the PCI Segment Group for the bus number
+    range specified in _CRS method. An ACPI name space object that contains
+    the _CBA method must also contain a corresponding _SEG method.
diff --git a/Documentation/PCI/acpi-info.txt b/Documentation/PCI/acpi-info.txt
deleted file mode 100644
index 3ffa3b03970e..000000000000
--- a/Documentation/PCI/acpi-info.txt
+++ /dev/null
@@ -1,187 +0,0 @@
-		ACPI considerations for PCI host bridges
-
-The general rule is that the ACPI namespace should describe everything the
-OS might use unless there's another way for the OS to find it [1, 2].
-
-For example, there's no standard hardware mechanism for enumerating PCI
-host bridges, so the ACPI namespace must describe each host bridge, the
-method for accessing PCI config space below it, the address space windows
-the host bridge forwards to PCI (using _CRS), and the routing of legacy
-INTx interrupts (using _PRT).
-
-PCI devices, which are below the host bridge, generally do not need to be
-described via ACPI.  The OS can discover them via the standard PCI
-enumeration mechanism, using config accesses to discover and identify
-devices and read and size their BARs.  However, ACPI may describe PCI
-devices if it provides power management or hotplug functionality for them
-or if the device has INTx interrupts connected by platform interrupt
-controllers and a _PRT is needed to describe those connections.
-
-ACPI resource description is done via _CRS objects of devices in the ACPI
-namespace [2].   The _CRS is like a generalized PCI BAR: the OS can read
-_CRS and figure out what resource is being consumed even if it doesn't have
-a driver for the device [3].  That's important because it means an old OS
-can work correctly even on a system with new devices unknown to the OS.
-The new devices might not do anything, but the OS can at least make sure no
-resources conflict with them.
-
-Static tables like MCFG, HPET, ECDT, etc., are *not* mechanisms for
-reserving address space.  The static tables are for things the OS needs to
-know early in boot, before it can parse the ACPI namespace.  If a new table
-is defined, an old OS needs to operate correctly even though it ignores the
-table.  _CRS allows that because it is generic and understood by the old
-OS; a static table does not.
-
-If the OS is expected to manage a non-discoverable device described via
-ACPI, that device will have a specific _HID/_CID that tells the OS what
-driver to bind to it, and the _CRS tells the OS and the driver where the
-device's registers are.
-
-PCI host bridges are PNP0A03 or PNP0A08 devices.  Their _CRS should
-describe all the address space they consume.  This includes all the windows
-they forward down to the PCI bus, as well as registers of the host bridge
-itself that are not forwarded to PCI.  The host bridge registers include
-things like secondary/subordinate bus registers that determine the bus
-range below the bridge, window registers that describe the apertures, etc.
-These are all device-specific, non-architected things, so the only way a
-PNP0A03/PNP0A08 driver can manage them is via _PRS/_CRS/_SRS, which contain
-the device-specific details.  The host bridge registers also include ECAM
-space, since it is consumed by the host bridge.
-
-ACPI defines a Consumer/Producer bit to distinguish the bridge registers
-("Consumer") from the bridge apertures ("Producer") [4, 5], but early
-BIOSes didn't use that bit correctly.  The result is that the current ACPI
-spec defines Consumer/Producer only for the Extended Address Space
-descriptors; the bit should be ignored in the older QWord/DWord/Word
-Address Space descriptors.  Consequently, OSes have to assume all
-QWord/DWord/Word descriptors are windows.
-
-Prior to the addition of Extended Address Space descriptors, the failure of
-Consumer/Producer meant there was no way to describe bridge registers in
-the PNP0A03/PNP0A08 device itself.  The workaround was to describe the
-bridge registers (including ECAM space) in PNP0C02 catch-all devices [6].
-With the exception of ECAM, the bridge register space is device-specific
-anyway, so the generic PNP0A03/PNP0A08 driver (pci_root.c) has no need to
-know about it.  
-
-New architectures should be able to use "Consumer" Extended Address Space
-descriptors in the PNP0A03 device for bridge registers, including ECAM,
-although a strict interpretation of [6] might prohibit this.  Old x86 and
-ia64 kernels assume all address space descriptors, including "Consumer"
-Extended Address Space ones, are windows, so it would not be safe to
-describe bridge registers this way on those architectures.
-
-PNP0C02 "motherboard" devices are basically a catch-all.  There's no
-programming model for them other than "don't use these resources for
-anything else."  So a PNP0C02 _CRS should claim any address space that is
-(1) not claimed by _CRS under any other device object in the ACPI namespace
-and (2) should not be assigned by the OS to something else.
-
-The PCIe spec requires the Enhanced Configuration Access Method (ECAM)
-unless there's a standard firmware interface for config access, e.g., the
-ia64 SAL interface [7].  A host bridge consumes ECAM memory address space
-and converts memory accesses into PCI configuration accesses.  The spec
-defines the ECAM address space layout and functionality; only the base of
-the address space is device-specific.  An ACPI OS learns the base address
-from either the static MCFG table or a _CBA method in the PNP0A03 device.
-
-The MCFG table must describe the ECAM space of non-hot pluggable host
-bridges [8].  Since MCFG is a static table and can't be updated by hotplug,
-a _CBA method in the PNP0A03 device describes the ECAM space of a
-hot-pluggable host bridge [9].  Note that for both MCFG and _CBA, the base
-address always corresponds to bus 0, even if the bus range below the bridge
-(which is reported via _CRS) doesn't start at 0.
-
-
-[1] ACPI 6.2, sec 6.1:
-    For any device that is on a non-enumerable type of bus (for example, an
-    ISA bus), OSPM enumerates the devices' identifier(s) and the ACPI
-    system firmware must supply an _HID object ... for each device to
-    enable OSPM to do that.
-
-[2] ACPI 6.2, sec 3.7:
-    The OS enumerates motherboard devices simply by reading through the
-    ACPI Namespace looking for devices with hardware IDs.
-
-    Each device enumerated by ACPI includes ACPI-defined objects in the
-    ACPI Namespace that report the hardware resources the device could
-    occupy [_PRS], an object that reports the resources that are currently
-    used by the device [_CRS], and objects for configuring those resources
-    [_SRS].  The information is used by the Plug and Play OS (OSPM) to
-    configure the devices.
-
-[3] ACPI 6.2, sec 6.2:
-    OSPM uses device configuration objects to configure hardware resources
-    for devices enumerated via ACPI.  Device configuration objects provide
-    information about current and possible resource requirements, the
-    relationship between shared resources, and methods for configuring
-    hardware resources.
-
-    When OSPM enumerates a device, it calls _PRS to determine the resource
-    requirements of the device.  It may also call _CRS to find the current
-    resource settings for the device.  Using this information, the Plug and
-    Play system determines what resources the device should consume and
-    sets those resources by calling the device’s _SRS control method.
-
-    In ACPI, devices can consume resources (for example, legacy keyboards),
-    provide resources (for example, a proprietary PCI bridge), or do both.
-    Unless otherwise specified, resources for a device are assumed to be
-    taken from the nearest matching resource above the device in the device
-    hierarchy.
-
-[4] ACPI 6.2, sec 6.4.3.5.1, 2, 3, 4:
-    QWord/DWord/Word Address Space Descriptor (.1, .2, .3)
-    General Flags: Bit [0] Ignored
-
-    Extended Address Space Descriptor (.4)
-    General Flags: Bit [0] Consumer/Producer:
-	1–This device consumes this resource
-	0–This device produces and consumes this resource
-
-[5] ACPI 6.2, sec 19.6.43:
-    ResourceUsage specifies whether the Memory range is consumed by
-    this device (ResourceConsumer) or passed on to child devices
-    (ResourceProducer).  If nothing is specified, then
-    ResourceConsumer is assumed.
-
-[6] PCI Firmware 3.2, sec 4.1.2:
-    If the operating system does not natively comprehend reserving the
-    MMCFG region, the MMCFG region must be reserved by firmware.  The
-    address range reported in the MCFG table or by _CBA method (see Section
-    4.1.3) must be reserved by declaring a motherboard resource.  For most
-    systems, the motherboard resource would appear at the root of the ACPI
-    namespace (under \_SB) in a node with a _HID of EISAID (PNP0C02), and
-    the resources in this case should not be claimed in the root PCI bus’s
-    _CRS.  The resources can optionally be returned in Int15 E820 or
-    EFIGetMemoryMap as reserved memory but must always be reported through
-    ACPI as a motherboard resource.
-
-[7] PCI Express 4.0, sec 7.2.2:
-    For systems that are PC-compatible, or that do not implement a
-    processor-architecture-specific firmware interface standard that allows
-    access to the Configuration Space, the ECAM is required as defined in
-    this section.
-
-[8] PCI Firmware 3.2, sec 4.1.2:
-    The MCFG table is an ACPI table that is used to communicate the base
-    addresses corresponding to the non-hot removable PCI Segment Groups
-    range within a PCI Segment Group available to the operating system at
-    boot. This is required for the PC-compatible systems.
-
-    The MCFG table is only used to communicate the base addresses
-    corresponding to the PCI Segment Groups available to the system at
-    boot.
-
-[9] PCI Firmware 3.2, sec 4.1.3:
-    The _CBA (Memory mapped Configuration Base Address) control method is
-    an optional ACPI object that returns the 64-bit memory mapped
-    configuration base address for the hot plug capable host bridge. The
-    base address returned by _CBA is processor-relative address. The _CBA
-    control method evaluates to an Integer.
-
-    This control method appears under a host bridge object. When the _CBA
-    method appears under an active host bridge object, the operating system
-    evaluates this structure to identify the memory mapped configuration
-    base address corresponding to the PCI Segment Group for the bus number
-    range specified in _CRS method. An ACPI name space object that contains
-    the _CBA method must also contain a corresponding _SEG method.
diff --git a/Documentation/PCI/endpoint/index.rst b/Documentation/PCI/endpoint/index.rst
new file mode 100644
index 000000000000..d114ea74b444
--- /dev/null
+++ b/Documentation/PCI/endpoint/index.rst
@@ -0,0 +1,13 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+PCI Endpoint Framework
+======================
+
+.. toctree::
+   :maxdepth: 2
+
+   pci-endpoint
+   pci-endpoint-cfs
+   pci-test-function
+   pci-test-howto
diff --git a/Documentation/PCI/endpoint/pci-endpoint-cfs.rst b/Documentation/PCI/endpoint/pci-endpoint-cfs.rst
new file mode 100644
index 000000000000..b6d39cdec56e
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-endpoint-cfs.rst
@@ -0,0 +1,118 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================================
+Configuring PCI Endpoint Using CONFIGFS
+=======================================
+
+:Author: Kishon Vijay Abraham I <kishon@ti.com>
+
+The PCI Endpoint Core exposes configfs entry (pci_ep) to configure the
+PCI endpoint function and to bind the endpoint function
+with the endpoint controller. (For introducing other mechanisms to
+configure the PCI Endpoint Function refer to [1]).
+
+Mounting configfs
+=================
+
+The PCI Endpoint Core layer creates pci_ep directory in the mounted configfs
+directory. configfs can be mounted using the following command::
+
+	mount -t configfs none /sys/kernel/config
+
+Directory Structure
+===================
+
+The pci_ep configfs has two directories at its root: controllers and
+functions. Every EPC device present in the system will have an entry in
+the *controllers* directory and and every EPF driver present in the system
+will have an entry in the *functions* directory.
+::
+
+	/sys/kernel/config/pci_ep/
+		.. controllers/
+		.. functions/
+
+Creating EPF Device
+===================
+
+Every registered EPF driver will be listed in controllers directory. The
+entries corresponding to EPF driver will be created by the EPF core.
+::
+
+	/sys/kernel/config/pci_ep/functions/
+		.. <EPF Driver1>/
+			... <EPF Device 11>/
+			... <EPF Device 21>/
+		.. <EPF Driver2>/
+			... <EPF Device 12>/
+			... <EPF Device 22>/
+
+In order to create a <EPF device> of the type probed by <EPF Driver>, the
+user has to create a directory inside <EPF DriverN>.
+
+Every <EPF device> directory consists of the following entries that can be
+used to configure the standard configuration header of the endpoint function.
+(These entries are created by the framework when any new <EPF Device> is
+created)
+::
+
+		.. <EPF Driver1>/
+			... <EPF Device 11>/
+				... vendorid
+				... deviceid
+				... revid
+				... progif_code
+				... subclass_code
+				... baseclass_code
+				... cache_line_size
+				... subsys_vendor_id
+				... subsys_id
+				... interrupt_pin
+
+EPC Device
+==========
+
+Every registered EPC device will be listed in controllers directory. The
+entries corresponding to EPC device will be created by the EPC core.
+::
+
+	/sys/kernel/config/pci_ep/controllers/
+		.. <EPC Device1>/
+			... <Symlink EPF Device11>/
+			... <Symlink EPF Device12>/
+			... start
+		.. <EPC Device2>/
+			... <Symlink EPF Device21>/
+			... <Symlink EPF Device22>/
+			... start
+
+The <EPC Device> directory will have a list of symbolic links to
+<EPF Device>. These symbolic links should be created by the user to
+represent the functions present in the endpoint device.
+
+The <EPC Device> directory will also have a *start* field. Once
+"1" is written to this field, the endpoint device will be ready to
+establish the link with the host. This is usually done after
+all the EPF devices are created and linked with the EPC device.
+::
+
+			 | controllers/
+				| <Directory: EPC name>/
+					| <Symbolic Link: Function>
+					| start
+			 | functions/
+				| <Directory: EPF driver>/
+					| <Directory: EPF device>/
+						| vendorid
+						| deviceid
+						| revid
+						| progif_code
+						| subclass_code
+						| baseclass_code
+						| cache_line_size
+						| subsys_vendor_id
+						| subsys_id
+						| interrupt_pin
+						| function
+
+[1] :doc:`pci-endpoint`
diff --git a/Documentation/PCI/endpoint/pci-endpoint-cfs.txt b/Documentation/PCI/endpoint/pci-endpoint-cfs.txt
deleted file mode 100644
index d740f29960a4..000000000000
--- a/Documentation/PCI/endpoint/pci-endpoint-cfs.txt
+++ /dev/null
@@ -1,105 +0,0 @@
-                   CONFIGURING PCI ENDPOINT USING CONFIGFS
-                    Kishon Vijay Abraham I <kishon@ti.com>
-
-The PCI Endpoint Core exposes configfs entry (pci_ep) to configure the
-PCI endpoint function and to bind the endpoint function
-with the endpoint controller. (For introducing other mechanisms to
-configure the PCI Endpoint Function refer to [1]).
-
-*) Mounting configfs
-
-The PCI Endpoint Core layer creates pci_ep directory in the mounted configfs
-directory. configfs can be mounted using the following command.
-
-	mount -t configfs none /sys/kernel/config
-
-*) Directory Structure
-
-The pci_ep configfs has two directories at its root: controllers and
-functions. Every EPC device present in the system will have an entry in
-the *controllers* directory and and every EPF driver present in the system
-will have an entry in the *functions* directory.
-
-/sys/kernel/config/pci_ep/
-	.. controllers/
-	.. functions/
-
-*) Creating EPF Device
-
-Every registered EPF driver will be listed in controllers directory. The
-entries corresponding to EPF driver will be created by the EPF core.
-
-/sys/kernel/config/pci_ep/functions/
-	.. <EPF Driver1>/
-		... <EPF Device 11>/
-		... <EPF Device 21>/
-	.. <EPF Driver2>/
-		... <EPF Device 12>/
-		... <EPF Device 22>/
-
-In order to create a <EPF device> of the type probed by <EPF Driver>, the
-user has to create a directory inside <EPF DriverN>.
-
-Every <EPF device> directory consists of the following entries that can be
-used to configure the standard configuration header of the endpoint function.
-(These entries are created by the framework when any new <EPF Device> is
-created)
-
-	.. <EPF Driver1>/
-		... <EPF Device 11>/
-			... vendorid
-			... deviceid
-			... revid
-			... progif_code
-			... subclass_code
-			... baseclass_code
-			... cache_line_size
-			... subsys_vendor_id
-			... subsys_id
-			... interrupt_pin
-
-*) EPC Device
-
-Every registered EPC device will be listed in controllers directory. The
-entries corresponding to EPC device will be created by the EPC core.
-
-/sys/kernel/config/pci_ep/controllers/
-	.. <EPC Device1>/
-		... <Symlink EPF Device11>/
-		... <Symlink EPF Device12>/
-		... start
-	.. <EPC Device2>/
-		... <Symlink EPF Device21>/
-		... <Symlink EPF Device22>/
-		... start
-
-The <EPC Device> directory will have a list of symbolic links to
-<EPF Device>. These symbolic links should be created by the user to
-represent the functions present in the endpoint device.
-
-The <EPC Device> directory will also have a *start* field. Once
-"1" is written to this field, the endpoint device will be ready to
-establish the link with the host. This is usually done after
-all the EPF devices are created and linked with the EPC device.
-
-
-			 | controllers/
-				| <Directory: EPC name>/
-					| <Symbolic Link: Function>
-					| start
-			 | functions/
-				| <Directory: EPF driver>/
-					| <Directory: EPF device>/
-						| vendorid
-						| deviceid
-						| revid
-						| progif_code
-						| subclass_code
-						| baseclass_code
-						| cache_line_size
-						| subsys_vendor_id
-						| subsys_id
-						| interrupt_pin
-						| function
-
-[1] -> Documentation/PCI/endpoint/pci-endpoint.txt
diff --git a/Documentation/PCI/endpoint/pci-endpoint.rst b/Documentation/PCI/endpoint/pci-endpoint.rst
new file mode 100644
index 000000000000..0e2311b5617b
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-endpoint.rst
@@ -0,0 +1,231 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Author: Kishon Vijay Abraham I <kishon@ti.com>
+
+This document is a guide to use the PCI Endpoint Framework in order to create
+endpoint controller driver, endpoint function driver, and using configfs
+interface to bind the function driver to the controller driver.
+
+Introduction
+============
+
+Linux has a comprehensive PCI subsystem to support PCI controllers that
+operates in Root Complex mode. The subsystem has capability to scan PCI bus,
+assign memory resources and IRQ resources, load PCI driver (based on
+vendor ID, device ID), support other services like hot-plug, power management,
+advanced error reporting and virtual channels.
+
+However the PCI controller IP integrated in some SoCs is capable of operating
+either in Root Complex mode or Endpoint mode. PCI Endpoint Framework will
+add endpoint mode support in Linux. This will help to run Linux in an
+EP system which can have a wide variety of use cases from testing or
+validation, co-processor accelerator, etc.
+
+PCI Endpoint Core
+=================
+
+The PCI Endpoint Core layer comprises 3 components: the Endpoint Controller
+library, the Endpoint Function library, and the configfs layer to bind the
+endpoint function with the endpoint controller.
+
+PCI Endpoint Controller(EPC) Library
+------------------------------------
+
+The EPC library provides APIs to be used by the controller that can operate
+in endpoint mode. It also provides APIs to be used by function driver/library
+in order to implement a particular endpoint function.
+
+APIs for the PCI controller Driver
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This section lists the APIs that the PCI Endpoint core provides to be used
+by the PCI controller driver.
+
+* devm_pci_epc_create()/pci_epc_create()
+
+   The PCI controller driver should implement the following ops:
+
+	 * write_header: ops to populate configuration space header
+	 * set_bar: ops to configure the BAR
+	 * clear_bar: ops to reset the BAR
+	 * alloc_addr_space: ops to allocate in PCI controller address space
+	 * free_addr_space: ops to free the allocated address space
+	 * raise_irq: ops to raise a legacy, MSI or MSI-X interrupt
+	 * start: ops to start the PCI link
+	 * stop: ops to stop the PCI link
+
+   The PCI controller driver can then create a new EPC device by invoking
+   devm_pci_epc_create()/pci_epc_create().
+
+* devm_pci_epc_destroy()/pci_epc_destroy()
+
+   The PCI controller driver can destroy the EPC device created by either
+   devm_pci_epc_create() or pci_epc_create() using devm_pci_epc_destroy() or
+   pci_epc_destroy().
+
+* pci_epc_linkup()
+
+   In order to notify all the function devices that the EPC device to which
+   they are linked has established a link with the host, the PCI controller
+   driver should invoke pci_epc_linkup().
+
+* pci_epc_mem_init()
+
+   Initialize the pci_epc_mem structure used for allocating EPC addr space.
+
+* pci_epc_mem_exit()
+
+   Cleanup the pci_epc_mem structure allocated during pci_epc_mem_init().
+
+
+APIs for the PCI Endpoint Function Driver
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This section lists the APIs that the PCI Endpoint core provides to be used
+by the PCI endpoint function driver.
+
+* pci_epc_write_header()
+
+   The PCI endpoint function driver should use pci_epc_write_header() to
+   write the standard configuration header to the endpoint controller.
+
+* pci_epc_set_bar()
+
+   The PCI endpoint function driver should use pci_epc_set_bar() to configure
+   the Base Address Register in order for the host to assign PCI addr space.
+   Register space of the function driver is usually configured
+   using this API.
+
+* pci_epc_clear_bar()
+
+   The PCI endpoint function driver should use pci_epc_clear_bar() to reset
+   the BAR.
+
+* pci_epc_raise_irq()
+
+   The PCI endpoint function driver should use pci_epc_raise_irq() to raise
+   Legacy Interrupt, MSI or MSI-X Interrupt.
+
+* pci_epc_mem_alloc_addr()
+
+   The PCI endpoint function driver should use pci_epc_mem_alloc_addr(), to
+   allocate memory address from EPC addr space which is required to access
+   RC's buffer
+
+* pci_epc_mem_free_addr()
+
+   The PCI endpoint function driver should use pci_epc_mem_free_addr() to
+   free the memory space allocated using pci_epc_mem_alloc_addr().
+
+Other APIs
+~~~~~~~~~~
+
+There are other APIs provided by the EPC library. These are used for binding
+the EPF device with EPC device. pci-ep-cfs.c can be used as reference for
+using these APIs.
+
+* pci_epc_get()
+
+   Get a reference to the PCI endpoint controller based on the device name of
+   the controller.
+
+* pci_epc_put()
+
+   Release the reference to the PCI endpoint controller obtained using
+   pci_epc_get()
+
+* pci_epc_add_epf()
+
+   Add a PCI endpoint function to a PCI endpoint controller. A PCIe device
+   can have up to 8 functions according to the specification.
+
+* pci_epc_remove_epf()
+
+   Remove the PCI endpoint function from PCI endpoint controller.
+
+* pci_epc_start()
+
+   The PCI endpoint function driver should invoke pci_epc_start() once it
+   has configured the endpoint function and wants to start the PCI link.
+
+* pci_epc_stop()
+
+   The PCI endpoint function driver should invoke pci_epc_stop() to stop
+   the PCI LINK.
+
+
+PCI Endpoint Function(EPF) Library
+----------------------------------
+
+The EPF library provides APIs to be used by the function driver and the EPC
+library to provide endpoint mode functionality.
+
+APIs for the PCI Endpoint Function Driver
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This section lists the APIs that the PCI Endpoint core provides to be used
+by the PCI endpoint function driver.
+
+* pci_epf_register_driver()
+
+   The PCI Endpoint Function driver should implement the following ops:
+	 * bind: ops to perform when a EPC device has been bound to EPF device
+	 * unbind: ops to perform when a binding has been lost between a EPC
+	   device and EPF device
+	 * linkup: ops to perform when the EPC device has established a
+	   connection with a host system
+
+  The PCI Function driver can then register the PCI EPF driver by using
+  pci_epf_register_driver().
+
+* pci_epf_unregister_driver()
+
+  The PCI Function driver can unregister the PCI EPF driver by using
+  pci_epf_unregister_driver().
+
+* pci_epf_alloc_space()
+
+  The PCI Function driver can allocate space for a particular BAR using
+  pci_epf_alloc_space().
+
+* pci_epf_free_space()
+
+  The PCI Function driver can free the allocated space
+  (using pci_epf_alloc_space) by invoking pci_epf_free_space().
+
+APIs for the PCI Endpoint Controller Library
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This section lists the APIs that the PCI Endpoint core provides to be used
+by the PCI endpoint controller library.
+
+* pci_epf_linkup()
+
+   The PCI endpoint controller library invokes pci_epf_linkup() when the
+   EPC device has established the connection to the host.
+
+Other APIs
+~~~~~~~~~~
+
+There are other APIs provided by the EPF library. These are used to notify
+the function driver when the EPF device is bound to the EPC device.
+pci-ep-cfs.c can be used as reference for using these APIs.
+
+* pci_epf_create()
+
+   Create a new PCI EPF device by passing the name of the PCI EPF device.
+   This name will be used to bind the the EPF device to a EPF driver.
+
+* pci_epf_destroy()
+
+   Destroy the created PCI EPF device.
+
+* pci_epf_bind()
+
+   pci_epf_bind() should be invoked when the EPF device has been bound to
+   a EPC device.
+
+* pci_epf_unbind()
+
+   pci_epf_unbind() should be invoked when the binding between EPC device
+   and EPF device is lost.
diff --git a/Documentation/PCI/endpoint/pci-endpoint.txt b/Documentation/PCI/endpoint/pci-endpoint.txt
deleted file mode 100644
index e86a96b66a6a..000000000000
--- a/Documentation/PCI/endpoint/pci-endpoint.txt
+++ /dev/null
@@ -1,215 +0,0 @@
-			    PCI ENDPOINT FRAMEWORK
-		    Kishon Vijay Abraham I <kishon@ti.com>
-
-This document is a guide to use the PCI Endpoint Framework in order to create
-endpoint controller driver, endpoint function driver, and using configfs
-interface to bind the function driver to the controller driver.
-
-1. Introduction
-
-Linux has a comprehensive PCI subsystem to support PCI controllers that
-operates in Root Complex mode. The subsystem has capability to scan PCI bus,
-assign memory resources and IRQ resources, load PCI driver (based on
-vendor ID, device ID), support other services like hot-plug, power management,
-advanced error reporting and virtual channels.
-
-However the PCI controller IP integrated in some SoCs is capable of operating
-either in Root Complex mode or Endpoint mode. PCI Endpoint Framework will
-add endpoint mode support in Linux. This will help to run Linux in an
-EP system which can have a wide variety of use cases from testing or
-validation, co-processor accelerator, etc.
-
-2. PCI Endpoint Core
-
-The PCI Endpoint Core layer comprises 3 components: the Endpoint Controller
-library, the Endpoint Function library, and the configfs layer to bind the
-endpoint function with the endpoint controller.
-
-2.1 PCI Endpoint Controller(EPC) Library
-
-The EPC library provides APIs to be used by the controller that can operate
-in endpoint mode. It also provides APIs to be used by function driver/library
-in order to implement a particular endpoint function.
-
-2.1.1 APIs for the PCI controller Driver
-
-This section lists the APIs that the PCI Endpoint core provides to be used
-by the PCI controller driver.
-
-*) devm_pci_epc_create()/pci_epc_create()
-
-   The PCI controller driver should implement the following ops:
-	 * write_header: ops to populate configuration space header
-	 * set_bar: ops to configure the BAR
-	 * clear_bar: ops to reset the BAR
-	 * alloc_addr_space: ops to allocate in PCI controller address space
-	 * free_addr_space: ops to free the allocated address space
-	 * raise_irq: ops to raise a legacy, MSI or MSI-X interrupt
-	 * start: ops to start the PCI link
-	 * stop: ops to stop the PCI link
-
-   The PCI controller driver can then create a new EPC device by invoking
-   devm_pci_epc_create()/pci_epc_create().
-
-*) devm_pci_epc_destroy()/pci_epc_destroy()
-
-   The PCI controller driver can destroy the EPC device created by either
-   devm_pci_epc_create() or pci_epc_create() using devm_pci_epc_destroy() or
-   pci_epc_destroy().
-
-*) pci_epc_linkup()
-
-   In order to notify all the function devices that the EPC device to which
-   they are linked has established a link with the host, the PCI controller
-   driver should invoke pci_epc_linkup().
-
-*) pci_epc_mem_init()
-
-   Initialize the pci_epc_mem structure used for allocating EPC addr space.
-
-*) pci_epc_mem_exit()
-
-   Cleanup the pci_epc_mem structure allocated during pci_epc_mem_init().
-
-2.1.2 APIs for the PCI Endpoint Function Driver
-
-This section lists the APIs that the PCI Endpoint core provides to be used
-by the PCI endpoint function driver.
-
-*) pci_epc_write_header()
-
-   The PCI endpoint function driver should use pci_epc_write_header() to
-   write the standard configuration header to the endpoint controller.
-
-*) pci_epc_set_bar()
-
-   The PCI endpoint function driver should use pci_epc_set_bar() to configure
-   the Base Address Register in order for the host to assign PCI addr space.
-   Register space of the function driver is usually configured
-   using this API.
-
-*) pci_epc_clear_bar()
-
-   The PCI endpoint function driver should use pci_epc_clear_bar() to reset
-   the BAR.
-
-*) pci_epc_raise_irq()
-
-   The PCI endpoint function driver should use pci_epc_raise_irq() to raise
-   Legacy Interrupt, MSI or MSI-X Interrupt.
-
-*) pci_epc_mem_alloc_addr()
-
-   The PCI endpoint function driver should use pci_epc_mem_alloc_addr(), to
-   allocate memory address from EPC addr space which is required to access
-   RC's buffer
-
-*) pci_epc_mem_free_addr()
-
-   The PCI endpoint function driver should use pci_epc_mem_free_addr() to
-   free the memory space allocated using pci_epc_mem_alloc_addr().
-
-2.1.3 Other APIs
-
-There are other APIs provided by the EPC library. These are used for binding
-the EPF device with EPC device. pci-ep-cfs.c can be used as reference for
-using these APIs.
-
-*) pci_epc_get()
-
-   Get a reference to the PCI endpoint controller based on the device name of
-   the controller.
-
-*) pci_epc_put()
-
-   Release the reference to the PCI endpoint controller obtained using
-   pci_epc_get()
-
-*) pci_epc_add_epf()
-
-   Add a PCI endpoint function to a PCI endpoint controller. A PCIe device
-   can have up to 8 functions according to the specification.
-
-*) pci_epc_remove_epf()
-
-   Remove the PCI endpoint function from PCI endpoint controller.
-
-*) pci_epc_start()
-
-   The PCI endpoint function driver should invoke pci_epc_start() once it
-   has configured the endpoint function and wants to start the PCI link.
-
-*) pci_epc_stop()
-
-   The PCI endpoint function driver should invoke pci_epc_stop() to stop
-   the PCI LINK.
-
-2.2 PCI Endpoint Function(EPF) Library
-
-The EPF library provides APIs to be used by the function driver and the EPC
-library to provide endpoint mode functionality.
-
-2.2.1 APIs for the PCI Endpoint Function Driver
-
-This section lists the APIs that the PCI Endpoint core provides to be used
-by the PCI endpoint function driver.
-
-*) pci_epf_register_driver()
-
-   The PCI Endpoint Function driver should implement the following ops:
-	 * bind: ops to perform when a EPC device has been bound to EPF device
-	 * unbind: ops to perform when a binding has been lost between a EPC
-	   device and EPF device
-	 * linkup: ops to perform when the EPC device has established a
-	   connection with a host system
-
-  The PCI Function driver can then register the PCI EPF driver by using
-  pci_epf_register_driver().
-
-*) pci_epf_unregister_driver()
-
-  The PCI Function driver can unregister the PCI EPF driver by using
-  pci_epf_unregister_driver().
-
-*) pci_epf_alloc_space()
-
-  The PCI Function driver can allocate space for a particular BAR using
-  pci_epf_alloc_space().
-
-*) pci_epf_free_space()
-
-  The PCI Function driver can free the allocated space
-  (using pci_epf_alloc_space) by invoking pci_epf_free_space().
-
-2.2.2 APIs for the PCI Endpoint Controller Library
-This section lists the APIs that the PCI Endpoint core provides to be used
-by the PCI endpoint controller library.
-
-*) pci_epf_linkup()
-
-   The PCI endpoint controller library invokes pci_epf_linkup() when the
-   EPC device has established the connection to the host.
-
-2.2.2 Other APIs
-There are other APIs provided by the EPF library. These are used to notify
-the function driver when the EPF device is bound to the EPC device.
-pci-ep-cfs.c can be used as reference for using these APIs.
-
-*) pci_epf_create()
-
-   Create a new PCI EPF device by passing the name of the PCI EPF device.
-   This name will be used to bind the the EPF device to a EPF driver.
-
-*) pci_epf_destroy()
-
-   Destroy the created PCI EPF device.
-
-*) pci_epf_bind()
-
-   pci_epf_bind() should be invoked when the EPF device has been bound to
-   a EPC device.
-
-*) pci_epf_unbind()
-
-   pci_epf_unbind() should be invoked when the binding between EPC device
-   and EPF device is lost.
diff --git a/Documentation/PCI/endpoint/pci-test-function.rst b/Documentation/PCI/endpoint/pci-test-function.rst
new file mode 100644
index 000000000000..3c8521d7aa31
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-test-function.rst
@@ -0,0 +1,103 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+PCI Test Function
+=================
+
+:Author: Kishon Vijay Abraham I <kishon@ti.com>
+
+Traditionally PCI RC has always been validated by using standard
+PCI cards like ethernet PCI cards or USB PCI cards or SATA PCI cards.
+However with the addition of EP-core in linux kernel, it is possible
+to configure a PCI controller that can operate in EP mode to work as
+a test device.
+
+The PCI endpoint test device is a virtual device (defined in software)
+used to test the endpoint functionality and serve as a sample driver
+for other PCI endpoint devices (to use the EP framework).
+
+The PCI endpoint test device has the following registers:
+
+	1) PCI_ENDPOINT_TEST_MAGIC
+	2) PCI_ENDPOINT_TEST_COMMAND
+	3) PCI_ENDPOINT_TEST_STATUS
+	4) PCI_ENDPOINT_TEST_SRC_ADDR
+	5) PCI_ENDPOINT_TEST_DST_ADDR
+	6) PCI_ENDPOINT_TEST_SIZE
+	7) PCI_ENDPOINT_TEST_CHECKSUM
+	8) PCI_ENDPOINT_TEST_IRQ_TYPE
+	9) PCI_ENDPOINT_TEST_IRQ_NUMBER
+
+* PCI_ENDPOINT_TEST_MAGIC
+
+This register will be used to test BAR0. A known pattern will be written
+and read back from MAGIC register to verify BAR0.
+
+* PCI_ENDPOINT_TEST_COMMAND
+
+This register will be used by the host driver to indicate the function
+that the endpoint device must perform.
+
+========	================================================================
+Bitfield	Description
+========	================================================================
+Bit 0		raise legacy IRQ
+Bit 1		raise MSI IRQ
+Bit 2		raise MSI-X IRQ
+Bit 3		read command (read data from RC buffer)
+Bit 4		write command (write data to RC buffer)
+Bit 5		copy command (copy data from one RC buffer to another RC buffer)
+========	================================================================
+
+* PCI_ENDPOINT_TEST_STATUS
+
+This register reflects the status of the PCI endpoint device.
+
+========	==============================
+Bitfield	Description
+========	==============================
+Bit 0		read success
+Bit 1		read fail
+Bit 2		write success
+Bit 3		write fail
+Bit 4		copy success
+Bit 5		copy fail
+Bit 6		IRQ raised
+Bit 7		source address is invalid
+Bit 8		destination address is invalid
+========	==============================
+
+* PCI_ENDPOINT_TEST_SRC_ADDR
+
+This register contains the source address (RC buffer address) for the
+COPY/READ command.
+
+* PCI_ENDPOINT_TEST_DST_ADDR
+
+This register contains the destination address (RC buffer address) for
+the COPY/WRITE command.
+
+* PCI_ENDPOINT_TEST_IRQ_TYPE
+
+This register contains the interrupt type (Legacy/MSI) triggered
+for the READ/WRITE/COPY and raise IRQ (Legacy/MSI) commands.
+
+Possible types:
+
+======	==
+Legacy	0
+MSI	1
+MSI-X	2
+======	==
+
+* PCI_ENDPOINT_TEST_IRQ_NUMBER
+
+This register contains the triggered ID interrupt.
+
+Admissible values:
+
+======	===========
+Legacy	0
+MSI	[1 .. 32]
+MSI-X	[1 .. 2048]
+======	===========
diff --git a/Documentation/PCI/endpoint/pci-test-function.txt b/Documentation/PCI/endpoint/pci-test-function.txt
deleted file mode 100644
index 5916f1f592bb..000000000000
--- a/Documentation/PCI/endpoint/pci-test-function.txt
+++ /dev/null
@@ -1,87 +0,0 @@
-				PCI TEST
-		    Kishon Vijay Abraham I <kishon@ti.com>
-
-Traditionally PCI RC has always been validated by using standard
-PCI cards like ethernet PCI cards or USB PCI cards or SATA PCI cards.
-However with the addition of EP-core in linux kernel, it is possible
-to configure a PCI controller that can operate in EP mode to work as
-a test device.
-
-The PCI endpoint test device is a virtual device (defined in software)
-used to test the endpoint functionality and serve as a sample driver
-for other PCI endpoint devices (to use the EP framework).
-
-The PCI endpoint test device has the following registers:
-
-	1) PCI_ENDPOINT_TEST_MAGIC
-	2) PCI_ENDPOINT_TEST_COMMAND
-	3) PCI_ENDPOINT_TEST_STATUS
-	4) PCI_ENDPOINT_TEST_SRC_ADDR
-	5) PCI_ENDPOINT_TEST_DST_ADDR
-	6) PCI_ENDPOINT_TEST_SIZE
-	7) PCI_ENDPOINT_TEST_CHECKSUM
-	8) PCI_ENDPOINT_TEST_IRQ_TYPE
-	9) PCI_ENDPOINT_TEST_IRQ_NUMBER
-
-*) PCI_ENDPOINT_TEST_MAGIC
-
-This register will be used to test BAR0. A known pattern will be written
-and read back from MAGIC register to verify BAR0.
-
-*) PCI_ENDPOINT_TEST_COMMAND:
-
-This register will be used by the host driver to indicate the function
-that the endpoint device must perform.
-
-Bitfield Description:
-  Bit 0		: raise legacy IRQ
-  Bit 1		: raise MSI IRQ
-  Bit 2		: raise MSI-X IRQ
-  Bit 3		: read command (read data from RC buffer)
-  Bit 4		: write command (write data to RC buffer)
-  Bit 5		: copy command (copy data from one RC buffer to another
-		  RC buffer)
-
-*) PCI_ENDPOINT_TEST_STATUS
-
-This register reflects the status of the PCI endpoint device.
-
-Bitfield Description:
-  Bit 0		: read success
-  Bit 1		: read fail
-  Bit 2		: write success
-  Bit 3		: write fail
-  Bit 4		: copy success
-  Bit 5		: copy fail
-  Bit 6		: IRQ raised
-  Bit 7		: source address is invalid
-  Bit 8		: destination address is invalid
-
-*) PCI_ENDPOINT_TEST_SRC_ADDR
-
-This register contains the source address (RC buffer address) for the
-COPY/READ command.
-
-*) PCI_ENDPOINT_TEST_DST_ADDR
-
-This register contains the destination address (RC buffer address) for
-the COPY/WRITE command.
-
-*) PCI_ENDPOINT_TEST_IRQ_TYPE
-
-This register contains the interrupt type (Legacy/MSI) triggered
-for the READ/WRITE/COPY and raise IRQ (Legacy/MSI) commands.
-
-Possible types:
- - Legacy	: 0
- - MSI		: 1
- - MSI-X	: 2
-
-*) PCI_ENDPOINT_TEST_IRQ_NUMBER
-
-This register contains the triggered ID interrupt.
-
-Admissible values:
- - Legacy	: 0
- - MSI		: [1 .. 32]
- - MSI-X	: [1 .. 2048]
diff --git a/Documentation/PCI/endpoint/pci-test-howto.rst b/Documentation/PCI/endpoint/pci-test-howto.rst
new file mode 100644
index 000000000000..909f770a07d6
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-test-howto.rst
@@ -0,0 +1,235 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================
+PCI Test User Guide
+===================
+
+:Author: Kishon Vijay Abraham I <kishon@ti.com>
+
+This document is a guide to help users use pci-epf-test function driver
+and pci_endpoint_test host driver for testing PCI. The list of steps to
+be followed in the host side and EP side is given below.
+
+Endpoint Device
+===============
+
+Endpoint Controller Devices
+---------------------------
+
+To find the list of endpoint controller devices in the system::
+
+	# ls /sys/class/pci_epc/
+	  51000000.pcie_ep
+
+If PCI_ENDPOINT_CONFIGFS is enabled::
+
+	# ls /sys/kernel/config/pci_ep/controllers
+	  51000000.pcie_ep
+
+
+Endpoint Function Drivers
+-------------------------
+
+To find the list of endpoint function drivers in the system::
+
+	# ls /sys/bus/pci-epf/drivers
+	  pci_epf_test
+
+If PCI_ENDPOINT_CONFIGFS is enabled::
+
+	# ls /sys/kernel/config/pci_ep/functions
+	  pci_epf_test
+
+
+Creating pci-epf-test Device
+----------------------------
+
+PCI endpoint function device can be created using the configfs. To create
+pci-epf-test device, the following commands can be used::
+
+	# mount -t configfs none /sys/kernel/config
+	# cd /sys/kernel/config/pci_ep/
+	# mkdir functions/pci_epf_test/func1
+
+The "mkdir func1" above creates the pci-epf-test function device that will
+be probed by pci_epf_test driver.
+
+The PCI endpoint framework populates the directory with the following
+configurable fields::
+
+	# ls functions/pci_epf_test/func1
+	  baseclass_code	interrupt_pin	progif_code	subsys_id
+	  cache_line_size	msi_interrupts	revid		subsys_vendorid
+	  deviceid          	msix_interrupts	subclass_code	vendorid
+
+The PCI endpoint function driver populates these entries with default values
+when the device is bound to the driver. The pci-epf-test driver populates
+vendorid with 0xffff and interrupt_pin with 0x0001::
+
+	# cat functions/pci_epf_test/func1/vendorid
+	  0xffff
+	# cat functions/pci_epf_test/func1/interrupt_pin
+	  0x0001
+
+
+Configuring pci-epf-test Device
+-------------------------------
+
+The user can configure the pci-epf-test device using configfs entry. In order
+to change the vendorid and the number of MSI interrupts used by the function
+device, the following commands can be used::
+
+	# echo 0x104c > functions/pci_epf_test/func1/vendorid
+	# echo 0xb500 > functions/pci_epf_test/func1/deviceid
+	# echo 16 > functions/pci_epf_test/func1/msi_interrupts
+	# echo 8 > functions/pci_epf_test/func1/msix_interrupts
+
+
+Binding pci-epf-test Device to EP Controller
+--------------------------------------------
+
+In order for the endpoint function device to be useful, it has to be bound to
+a PCI endpoint controller driver. Use the configfs to bind the function
+device to one of the controller driver present in the system::
+
+	# ln -s functions/pci_epf_test/func1 controllers/51000000.pcie_ep/
+
+Once the above step is completed, the PCI endpoint is ready to establish a link
+with the host.
+
+
+Start the Link
+--------------
+
+In order for the endpoint device to establish a link with the host, the _start_
+field should be populated with '1'::
+
+	# echo 1 > controllers/51000000.pcie_ep/start
+
+
+RootComplex Device
+==================
+
+lspci Output
+------------
+
+Note that the devices listed here correspond to the value populated in 1.4
+above::
+
+	00:00.0 PCI bridge: Texas Instruments Device 8888 (rev 01)
+	01:00.0 Unassigned class [ff00]: Texas Instruments Device b500
+
+
+Using Endpoint Test function Device
+-----------------------------------
+
+pcitest.sh added in tools/pci/ can be used to run all the default PCI endpoint
+tests. To compile this tool the following commands should be used::
+
+	# cd <kernel-dir>
+	# make -C tools/pci
+
+or if you desire to compile and install in your system::
+
+	# cd <kernel-dir>
+	# make -C tools/pci install
+
+The tool and script will be located in <rootfs>/usr/bin/
+
+
+pcitest.sh Output
+~~~~~~~~~~~~~~~~~
+::
+
+	# pcitest.sh
+	BAR tests
+
+	BAR0:           OKAY
+	BAR1:           OKAY
+	BAR2:           OKAY
+	BAR3:           OKAY
+	BAR4:           NOT OKAY
+	BAR5:           NOT OKAY
+
+	Interrupt tests
+
+	SET IRQ TYPE TO LEGACY:         OKAY
+	LEGACY IRQ:     NOT OKAY
+	SET IRQ TYPE TO MSI:            OKAY
+	MSI1:           OKAY
+	MSI2:           OKAY
+	MSI3:           OKAY
+	MSI4:           OKAY
+	MSI5:           OKAY
+	MSI6:           OKAY
+	MSI7:           OKAY
+	MSI8:           OKAY
+	MSI9:           OKAY
+	MSI10:          OKAY
+	MSI11:          OKAY
+	MSI12:          OKAY
+	MSI13:          OKAY
+	MSI14:          OKAY
+	MSI15:          OKAY
+	MSI16:          OKAY
+	MSI17:          NOT OKAY
+	MSI18:          NOT OKAY
+	MSI19:          NOT OKAY
+	MSI20:          NOT OKAY
+	MSI21:          NOT OKAY
+	MSI22:          NOT OKAY
+	MSI23:          NOT OKAY
+	MSI24:          NOT OKAY
+	MSI25:          NOT OKAY
+	MSI26:          NOT OKAY
+	MSI27:          NOT OKAY
+	MSI28:          NOT OKAY
+	MSI29:          NOT OKAY
+	MSI30:          NOT OKAY
+	MSI31:          NOT OKAY
+	MSI32:          NOT OKAY
+	SET IRQ TYPE TO MSI-X:          OKAY
+	MSI-X1:         OKAY
+	MSI-X2:         OKAY
+	MSI-X3:         OKAY
+	MSI-X4:         OKAY
+	MSI-X5:         OKAY
+	MSI-X6:         OKAY
+	MSI-X7:         OKAY
+	MSI-X8:         OKAY
+	MSI-X9:         NOT OKAY
+	MSI-X10:        NOT OKAY
+	MSI-X11:        NOT OKAY
+	MSI-X12:        NOT OKAY
+	MSI-X13:        NOT OKAY
+	MSI-X14:        NOT OKAY
+	MSI-X15:        NOT OKAY
+	MSI-X16:        NOT OKAY
+	[...]
+	MSI-X2047:      NOT OKAY
+	MSI-X2048:      NOT OKAY
+
+	Read Tests
+
+	SET IRQ TYPE TO MSI:            OKAY
+	READ (      1 bytes):           OKAY
+	READ (   1024 bytes):           OKAY
+	READ (   1025 bytes):           OKAY
+	READ (1024000 bytes):           OKAY
+	READ (1024001 bytes):           OKAY
+
+	Write Tests
+
+	WRITE (      1 bytes):          OKAY
+	WRITE (   1024 bytes):          OKAY
+	WRITE (   1025 bytes):          OKAY
+	WRITE (1024000 bytes):          OKAY
+	WRITE (1024001 bytes):          OKAY
+
+	Copy Tests
+
+	COPY (      1 bytes):           OKAY
+	COPY (   1024 bytes):           OKAY
+	COPY (   1025 bytes):           OKAY
+	COPY (1024000 bytes):           OKAY
+	COPY (1024001 bytes):           OKAY
diff --git a/Documentation/PCI/endpoint/pci-test-howto.txt b/Documentation/PCI/endpoint/pci-test-howto.txt
deleted file mode 100644
index 040479f437a5..000000000000
--- a/Documentation/PCI/endpoint/pci-test-howto.txt
+++ /dev/null
@@ -1,206 +0,0 @@
-			    PCI TEST USERGUIDE
-		    Kishon Vijay Abraham I <kishon@ti.com>
-
-This document is a guide to help users use pci-epf-test function driver
-and pci_endpoint_test host driver for testing PCI. The list of steps to
-be followed in the host side and EP side is given below.
-
-1. Endpoint Device
-
-1.1 Endpoint Controller Devices
-
-To find the list of endpoint controller devices in the system:
-
-	# ls /sys/class/pci_epc/
-	  51000000.pcie_ep
-
-If PCI_ENDPOINT_CONFIGFS is enabled
-	# ls /sys/kernel/config/pci_ep/controllers
-	  51000000.pcie_ep
-
-1.2 Endpoint Function Drivers
-
-To find the list of endpoint function drivers in the system:
-
-	# ls /sys/bus/pci-epf/drivers
-	  pci_epf_test
-
-If PCI_ENDPOINT_CONFIGFS is enabled
-	# ls /sys/kernel/config/pci_ep/functions
-	  pci_epf_test
-
-1.3 Creating pci-epf-test Device
-
-PCI endpoint function device can be created using the configfs. To create
-pci-epf-test device, the following commands can be used
-
-	# mount -t configfs none /sys/kernel/config
-	# cd /sys/kernel/config/pci_ep/
-	# mkdir functions/pci_epf_test/func1
-
-The "mkdir func1" above creates the pci-epf-test function device that will
-be probed by pci_epf_test driver.
-
-The PCI endpoint framework populates the directory with the following
-configurable fields.
-
-	# ls functions/pci_epf_test/func1
-	  baseclass_code	interrupt_pin	progif_code	subsys_id
-	  cache_line_size	msi_interrupts	revid		subsys_vendorid
-	  deviceid          	msix_interrupts	subclass_code	vendorid
-
-The PCI endpoint function driver populates these entries with default values
-when the device is bound to the driver. The pci-epf-test driver populates
-vendorid with 0xffff and interrupt_pin with 0x0001
-
-	# cat functions/pci_epf_test/func1/vendorid
-	  0xffff
-	# cat functions/pci_epf_test/func1/interrupt_pin
-	  0x0001
-
-1.4 Configuring pci-epf-test Device
-
-The user can configure the pci-epf-test device using configfs entry. In order
-to change the vendorid and the number of MSI interrupts used by the function
-device, the following commands can be used.
-
-	# echo 0x104c > functions/pci_epf_test/func1/vendorid
-	# echo 0xb500 > functions/pci_epf_test/func1/deviceid
-	# echo 16 > functions/pci_epf_test/func1/msi_interrupts
-	# echo 8 > functions/pci_epf_test/func1/msix_interrupts
-
-1.5 Binding pci-epf-test Device to EP Controller
-
-In order for the endpoint function device to be useful, it has to be bound to
-a PCI endpoint controller driver. Use the configfs to bind the function
-device to one of the controller driver present in the system.
-
-	# ln -s functions/pci_epf_test/func1 controllers/51000000.pcie_ep/
-
-Once the above step is completed, the PCI endpoint is ready to establish a link
-with the host.
-
-1.6 Start the Link
-
-In order for the endpoint device to establish a link with the host, the _start_
-field should be populated with '1'.
-
-	# echo 1 > controllers/51000000.pcie_ep/start
-
-2. RootComplex Device
-
-2.1 lspci Output
-
-Note that the devices listed here correspond to the value populated in 1.4 above
-
-	00:00.0 PCI bridge: Texas Instruments Device 8888 (rev 01)
-	01:00.0 Unassigned class [ff00]: Texas Instruments Device b500
-
-2.2 Using Endpoint Test function Device
-
-pcitest.sh added in tools/pci/ can be used to run all the default PCI endpoint
-tests. To compile this tool the following commands should be used:
-
-	# cd <kernel-dir>
-	# make -C tools/pci
-
-or if you desire to compile and install in your system:
-
-	# cd <kernel-dir>
-	# make -C tools/pci install
-
-The tool and script will be located in <rootfs>/usr/bin/
-
-2.2.1 pcitest.sh Output
-	# pcitest.sh
-	BAR tests
-
-	BAR0:           OKAY
-	BAR1:           OKAY
-	BAR2:           OKAY
-	BAR3:           OKAY
-	BAR4:           NOT OKAY
-	BAR5:           NOT OKAY
-
-	Interrupt tests
-
-	SET IRQ TYPE TO LEGACY:         OKAY
-	LEGACY IRQ:     NOT OKAY
-	SET IRQ TYPE TO MSI:            OKAY
-	MSI1:           OKAY
-	MSI2:           OKAY
-	MSI3:           OKAY
-	MSI4:           OKAY
-	MSI5:           OKAY
-	MSI6:           OKAY
-	MSI7:           OKAY
-	MSI8:           OKAY
-	MSI9:           OKAY
-	MSI10:          OKAY
-	MSI11:          OKAY
-	MSI12:          OKAY
-	MSI13:          OKAY
-	MSI14:          OKAY
-	MSI15:          OKAY
-	MSI16:          OKAY
-	MSI17:          NOT OKAY
-	MSI18:          NOT OKAY
-	MSI19:          NOT OKAY
-	MSI20:          NOT OKAY
-	MSI21:          NOT OKAY
-	MSI22:          NOT OKAY
-	MSI23:          NOT OKAY
-	MSI24:          NOT OKAY
-	MSI25:          NOT OKAY
-	MSI26:          NOT OKAY
-	MSI27:          NOT OKAY
-	MSI28:          NOT OKAY
-	MSI29:          NOT OKAY
-	MSI30:          NOT OKAY
-	MSI31:          NOT OKAY
-	MSI32:          NOT OKAY
-	SET IRQ TYPE TO MSI-X:          OKAY
-	MSI-X1:         OKAY
-	MSI-X2:         OKAY
-	MSI-X3:         OKAY
-	MSI-X4:         OKAY
-	MSI-X5:         OKAY
-	MSI-X6:         OKAY
-	MSI-X7:         OKAY
-	MSI-X8:         OKAY
-	MSI-X9:         NOT OKAY
-	MSI-X10:        NOT OKAY
-	MSI-X11:        NOT OKAY
-	MSI-X12:        NOT OKAY
-	MSI-X13:        NOT OKAY
-	MSI-X14:        NOT OKAY
-	MSI-X15:        NOT OKAY
-	MSI-X16:        NOT OKAY
-	[...]
-	MSI-X2047:      NOT OKAY
-	MSI-X2048:      NOT OKAY
-
-	Read Tests
-
-	SET IRQ TYPE TO MSI:            OKAY
-	READ (      1 bytes):           OKAY
-	READ (   1024 bytes):           OKAY
-	READ (   1025 bytes):           OKAY
-	READ (1024000 bytes):           OKAY
-	READ (1024001 bytes):           OKAY
-
-	Write Tests
-
-	WRITE (      1 bytes):          OKAY
-	WRITE (   1024 bytes):          OKAY
-	WRITE (   1025 bytes):          OKAY
-	WRITE (1024000 bytes):          OKAY
-	WRITE (1024001 bytes):          OKAY
-
-	Copy Tests
-
-	COPY (      1 bytes):           OKAY
-	COPY (   1024 bytes):           OKAY
-	COPY (   1025 bytes):           OKAY
-	COPY (1024000 bytes):           OKAY
-	COPY (1024001 bytes):           OKAY
diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst
new file mode 100644
index 000000000000..f4c6121868c3
--- /dev/null
+++ b/Documentation/PCI/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================
+Linux PCI Bus Subsystem
+=======================
+
+.. toctree::
+   :maxdepth: 2
+   :numbered:
+
+   pci
+   picebus-howto
+   pci-iov-howto
+   msi-howto
+   acpi-info
+   pci-error-recovery
+   pcieaer-howto
+   endpoint/index
diff --git a/Documentation/PCI/msi-howto.rst b/Documentation/PCI/msi-howto.rst
new file mode 100644
index 000000000000..994cbb660ade
--- /dev/null
+++ b/Documentation/PCI/msi-howto.rst
@@ -0,0 +1,287 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+==========================
+The MSI Driver Guide HOWTO
+==========================
+
+:Authors: Tom L Nguyen; Martine Silbermann; Matthew Wilcox
+
+:Copyright: 2003, 2008 Intel Corporation
+
+About this guide
+================
+
+This guide describes the basics of Message Signaled Interrupts (MSIs),
+the advantages of using MSI over traditional interrupt mechanisms, how
+to change your driver to use MSI or MSI-X and some basic diagnostics to
+try if a device doesn't support MSIs.
+
+
+What are MSIs?
+==============
+
+A Message Signaled Interrupt is a write from the device to a special
+address which causes an interrupt to be received by the CPU.
+
+The MSI capability was first specified in PCI 2.2 and was later enhanced
+in PCI 3.0 to allow each interrupt to be masked individually.  The MSI-X
+capability was also introduced with PCI 3.0.  It supports more interrupts
+per device than MSI and allows interrupts to be independently configured.
+
+Devices may support both MSI and MSI-X, but only one can be enabled at
+a time.
+
+
+Why use MSIs?
+=============
+
+There are three reasons why using MSIs can give an advantage over
+traditional pin-based interrupts.
+
+Pin-based PCI interrupts are often shared amongst several devices.
+To support this, the kernel must call each interrupt handler associated
+with an interrupt, which leads to reduced performance for the system as
+a whole.  MSIs are never shared, so this problem cannot arise.
+
+When a device writes data to memory, then raises a pin-based interrupt,
+it is possible that the interrupt may arrive before all the data has
+arrived in memory (this becomes more likely with devices behind PCI-PCI
+bridges).  In order to ensure that all the data has arrived in memory,
+the interrupt handler must read a register on the device which raised
+the interrupt.  PCI transaction ordering rules require that all the data
+arrive in memory before the value may be returned from the register.
+Using MSIs avoids this problem as the interrupt-generating write cannot
+pass the data writes, so by the time the interrupt is raised, the driver
+knows that all the data has arrived in memory.
+
+PCI devices can only support a single pin-based interrupt per function.
+Often drivers have to query the device to find out what event has
+occurred, slowing down interrupt handling for the common case.  With
+MSIs, a device can support more interrupts, allowing each interrupt
+to be specialised to a different purpose.  One possible design gives
+infrequent conditions (such as errors) their own interrupt which allows
+the driver to handle the normal interrupt handling path more efficiently.
+Other possible designs include giving one interrupt to each packet queue
+in a network card or each port in a storage controller.
+
+
+How to use MSIs
+===============
+
+PCI devices are initialised to use pin-based interrupts.  The device
+driver has to set up the device to use MSI or MSI-X.  Not all machines
+support MSIs correctly, and for those machines, the APIs described below
+will simply fail and the device will continue to use pin-based interrupts.
+
+Include kernel support for MSIs
+-------------------------------
+
+To support MSI or MSI-X, the kernel must be built with the CONFIG_PCI_MSI
+option enabled.  This option is only available on some architectures,
+and it may depend on some other options also being set.  For example,
+on x86, you must also enable X86_UP_APIC or SMP in order to see the
+CONFIG_PCI_MSI option.
+
+Using MSI
+---------
+
+Most of the hard work is done for the driver in the PCI layer.  The driver
+simply has to request that the PCI layer set up the MSI capability for this
+device.
+
+To automatically use MSI or MSI-X interrupt vectors, use the following
+function::
+
+  int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
+		unsigned int max_vecs, unsigned int flags);
+
+which allocates up to max_vecs interrupt vectors for a PCI device.  It
+returns the number of vectors allocated or a negative error.  If the device
+has a requirements for a minimum number of vectors the driver can pass a
+min_vecs argument set to this limit, and the PCI core will return -ENOSPC
+if it can't meet the minimum number of vectors.
+
+The flags argument is used to specify which type of interrupt can be used
+by the device and the driver (PCI_IRQ_LEGACY, PCI_IRQ_MSI, PCI_IRQ_MSIX).
+A convenient short-hand (PCI_IRQ_ALL_TYPES) is also available to ask for
+any possible kind of interrupt.  If the PCI_IRQ_AFFINITY flag is set,
+pci_alloc_irq_vectors() will spread the interrupts around the available CPUs.
+
+To get the Linux IRQ numbers passed to request_irq() and free_irq() and the
+vectors, use the following function::
+
+  int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
+
+Any allocated resources should be freed before removing the device using
+the following function::
+
+  void pci_free_irq_vectors(struct pci_dev *dev);
+
+If a device supports both MSI-X and MSI capabilities, this API will use the
+MSI-X facilities in preference to the MSI facilities.  MSI-X supports any
+number of interrupts between 1 and 2048.  In contrast, MSI is restricted to
+a maximum of 32 interrupts (and must be a power of two).  In addition, the
+MSI interrupt vectors must be allocated consecutively, so the system might
+not be able to allocate as many vectors for MSI as it could for MSI-X.  On
+some platforms, MSI interrupts must all be targeted at the same set of CPUs
+whereas MSI-X interrupts can all be targeted at different CPUs.
+
+If a device supports neither MSI-X or MSI it will fall back to a single
+legacy IRQ vector.
+
+The typical usage of MSI or MSI-X interrupts is to allocate as many vectors
+as possible, likely up to the limit supported by the device.  If nvec is
+larger than the number supported by the device it will automatically be
+capped to the supported limit, so there is no need to query the number of
+vectors supported beforehand::
+
+	nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_ALL_TYPES)
+	if (nvec < 0)
+		goto out_err;
+
+If a driver is unable or unwilling to deal with a variable number of MSI
+interrupts it can request a particular number of interrupts by passing that
+number to pci_alloc_irq_vectors() function as both 'min_vecs' and
+'max_vecs' parameters::
+
+	ret = pci_alloc_irq_vectors(pdev, nvec, nvec, PCI_IRQ_ALL_TYPES);
+	if (ret < 0)
+		goto out_err;
+
+The most notorious example of the request type described above is enabling
+the single MSI mode for a device.  It could be done by passing two 1s as
+'min_vecs' and 'max_vecs'::
+
+	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
+	if (ret < 0)
+		goto out_err;
+
+Some devices might not support using legacy line interrupts, in which case
+the driver can specify that only MSI or MSI-X is acceptable::
+
+	nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_MSI | PCI_IRQ_MSIX);
+	if (nvec < 0)
+		goto out_err;
+
+Legacy APIs
+-----------
+
+The following old APIs to enable and disable MSI or MSI-X interrupts should
+not be used in new code::
+
+  pci_enable_msi()		/* deprecated */
+  pci_disable_msi()		/* deprecated */
+  pci_enable_msix_range()	/* deprecated */
+  pci_enable_msix_exact()	/* deprecated */
+  pci_disable_msix()		/* deprecated */
+
+Additionally there are APIs to provide the number of supported MSI or MSI-X
+vectors: pci_msi_vec_count() and pci_msix_vec_count().  In general these
+should be avoided in favor of letting pci_alloc_irq_vectors() cap the
+number of vectors.  If you have a legitimate special use case for the count
+of vectors we might have to revisit that decision and add a
+pci_nr_irq_vectors() helper that handles MSI and MSI-X transparently.
+
+Considerations when using MSIs
+------------------------------
+
+Spinlocks
+~~~~~~~~~
+
+Most device drivers have a per-device spinlock which is taken in the
+interrupt handler.  With pin-based interrupts or a single MSI, it is not
+necessary to disable interrupts (Linux guarantees the same interrupt will
+not be re-entered).  If a device uses multiple interrupts, the driver
+must disable interrupts while the lock is held.  If the device sends
+a different interrupt, the driver will deadlock trying to recursively
+acquire the spinlock.  Such deadlocks can be avoided by using
+spin_lock_irqsave() or spin_lock_irq() which disable local interrupts
+and acquire the lock (see Documentation/kernel-hacking/locking.rst).
+
+How to tell whether MSI/MSI-X is enabled on a device
+----------------------------------------------------
+
+Using 'lspci -v' (as root) may show some devices with "MSI", "Message
+Signalled Interrupts" or "MSI-X" capabilities.  Each of these capabilities
+has an 'Enable' flag which is followed with either "+" (enabled)
+or "-" (disabled).
+
+
+MSI quirks
+==========
+
+Several PCI chipsets or devices are known not to support MSIs.
+The PCI stack provides three ways to disable MSIs:
+
+1. globally
+2. on all devices behind a specific bridge
+3. on a single device
+
+Disabling MSIs globally
+-----------------------
+
+Some host chipsets simply don't support MSIs properly.  If we're
+lucky, the manufacturer knows this and has indicated it in the ACPI
+FADT table.  In this case, Linux automatically disables MSIs.
+Some boards don't include this information in the table and so we have
+to detect them ourselves.  The complete list of these is found near the
+quirk_disable_all_msi() function in drivers/pci/quirks.c.
+
+If you have a board which has problems with MSIs, you can pass pci=nomsi
+on the kernel command line to disable MSIs on all devices.  It would be
+in your best interests to report the problem to linux-pci@vger.kernel.org
+including a full 'lspci -v' so we can add the quirks to the kernel.
+
+Disabling MSIs below a bridge
+-----------------------------
+
+Some PCI bridges are not able to route MSIs between busses properly.
+In this case, MSIs must be disabled on all devices behind the bridge.
+
+Some bridges allow you to enable MSIs by changing some bits in their
+PCI configuration space (especially the Hypertransport chipsets such
+as the nVidia nForce and Serverworks HT2000).  As with host chipsets,
+Linux mostly knows about them and automatically enables MSIs if it can.
+If you have a bridge unknown to Linux, you can enable
+MSIs in configuration space using whatever method you know works, then
+enable MSIs on that bridge by doing::
+
+       echo 1 > /sys/bus/pci/devices/$bridge/msi_bus
+
+where $bridge is the PCI address of the bridge you've enabled (eg
+0000:00:0e.0).
+
+To disable MSIs, echo 0 instead of 1.  Changing this value should be
+done with caution as it could break interrupt handling for all devices
+below this bridge.
+
+Again, please notify linux-pci@vger.kernel.org of any bridges that need
+special handling.
+
+Disabling MSIs on a single device
+---------------------------------
+
+Some devices are known to have faulty MSI implementations.  Usually this
+is handled in the individual device driver, but occasionally it's necessary
+to handle this with a quirk.  Some drivers have an option to disable use
+of MSI.  While this is a convenient workaround for the driver author,
+it is not good practice, and should not be emulated.
+
+Finding why MSIs are disabled on a device
+-----------------------------------------
+
+From the above three sections, you can see that there are many reasons
+why MSIs may not be enabled for a given device.  Your first step should
+be to examine your dmesg carefully to determine whether MSIs are enabled
+for your machine.  You should also check your .config to be sure you
+have enabled CONFIG_PCI_MSI.
+
+Then, 'lspci -t' gives the list of bridges above a device. Reading
+`/sys/bus/pci/devices/*/msi_bus` will tell you whether MSIs are enabled (1)
+or disabled (0).  If 0 is found in any of the msi_bus files belonging
+to bridges between the PCI root and the device, MSIs are disabled.
+
+It is also worth checking the device driver to see whether it supports MSIs.
+For example, it may contain calls to pci_irq_alloc_vectors() with the
+PCI_IRQ_MSI or PCI_IRQ_MSIX flags.
diff --git a/Documentation/PCI/pci-error-recovery.rst b/Documentation/PCI/pci-error-recovery.rst
new file mode 100644
index 000000000000..e5d450df06b4
--- /dev/null
+++ b/Documentation/PCI/pci-error-recovery.rst
@@ -0,0 +1,427 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==================
+PCI Error Recovery
+==================
+
+
+:Authors: - Linas Vepstas <linasvepstas@gmail.com>
+          - Richard Lary <rlary@us.ibm.com>
+          - Mike Mason <mmlnx@us.ibm.com>
+
+
+Many PCI bus controllers are able to detect a variety of hardware
+PCI errors on the bus, such as parity errors on the data and address
+buses, as well as SERR and PERR errors.  Some of the more advanced
+chipsets are able to deal with these errors; these include PCI-E chipsets,
+and the PCI-host bridges found on IBM Power4, Power5 and Power6-based
+pSeries boxes. A typical action taken is to disconnect the affected device,
+halting all I/O to it.  The goal of a disconnection is to avoid system
+corruption; for example, to halt system memory corruption due to DMA's
+to "wild" addresses. Typically, a reconnection mechanism is also
+offered, so that the affected PCI device(s) are reset and put back
+into working condition. The reset phase requires coordination
+between the affected device drivers and the PCI controller chip.
+This document describes a generic API for notifying device drivers
+of a bus disconnection, and then performing error recovery.
+This API is currently implemented in the 2.6.16 and later kernels.
+
+Reporting and recovery is performed in several steps. First, when
+a PCI hardware error has resulted in a bus disconnect, that event
+is reported as soon as possible to all affected device drivers,
+including multiple instances of a device driver on multi-function
+cards. This allows device drivers to avoid deadlocking in spinloops,
+waiting for some i/o-space register to change, when it never will.
+It also gives the drivers a chance to defer incoming I/O as
+needed.
+
+Next, recovery is performed in several stages. Most of the complexity
+is forced by the need to handle multi-function devices, that is,
+devices that have multiple device drivers associated with them.
+In the first stage, each driver is allowed to indicate what type
+of reset it desires, the choices being a simple re-enabling of I/O
+or requesting a slot reset.
+
+If any driver requests a slot reset, that is what will be done.
+
+After a reset and/or a re-enabling of I/O, all drivers are
+again notified, so that they may then perform any device setup/config
+that may be required.  After these have all completed, a final
+"resume normal operations" event is sent out.
+
+The biggest reason for choosing a kernel-based implementation rather
+than a user-space implementation was the need to deal with bus
+disconnects of PCI devices attached to storage media, and, in particular,
+disconnects from devices holding the root file system.  If the root
+file system is disconnected, a user-space mechanism would have to go
+through a large number of contortions to complete recovery. Almost all
+of the current Linux file systems are not tolerant of disconnection
+from/reconnection to their underlying block device. By contrast,
+bus errors are easy to manage in the device driver. Indeed, most
+device drivers already handle very similar recovery procedures;
+for example, the SCSI-generic layer already provides significant
+mechanisms for dealing with SCSI bus errors and SCSI bus resets.
+
+
+Detailed Design
+===============
+
+Design and implementation details below, based on a chain of
+public email discussions with Ben Herrenschmidt, circa 5 April 2005.
+
+The error recovery API support is exposed to the driver in the form of
+a structure of function pointers pointed to by a new field in struct
+pci_driver. A driver that fails to provide the structure is "non-aware",
+and the actual recovery steps taken are platform dependent.  The
+arch/powerpc implementation will simulate a PCI hotplug remove/add.
+
+This structure has the form::
+
+	struct pci_error_handlers
+	{
+		int (*error_detected)(struct pci_dev *dev, enum pci_channel_state);
+		int (*mmio_enabled)(struct pci_dev *dev);
+		int (*slot_reset)(struct pci_dev *dev);
+		void (*resume)(struct pci_dev *dev);
+	};
+
+The possible channel states are::
+
+	enum pci_channel_state {
+		pci_channel_io_normal,  /* I/O channel is in normal state */
+		pci_channel_io_frozen,  /* I/O to channel is blocked */
+		pci_channel_io_perm_failure, /* PCI card is dead */
+	};
+
+Possible return values are::
+
+	enum pci_ers_result {
+		PCI_ERS_RESULT_NONE,        /* no result/none/not supported in device driver */
+		PCI_ERS_RESULT_CAN_RECOVER, /* Device driver can recover without slot reset */
+		PCI_ERS_RESULT_NEED_RESET,  /* Device driver wants slot to be reset. */
+		PCI_ERS_RESULT_DISCONNECT,  /* Device has completely failed, is unrecoverable */
+		PCI_ERS_RESULT_RECOVERED,   /* Device driver is fully recovered and operational */
+	};
+
+A driver does not have to implement all of these callbacks; however,
+if it implements any, it must implement error_detected(). If a callback
+is not implemented, the corresponding feature is considered unsupported.
+For example, if mmio_enabled() and resume() aren't there, then it
+is assumed that the driver is not doing any direct recovery and requires
+a slot reset.  Typically a driver will want to know about
+a slot_reset().
+
+The actual steps taken by a platform to recover from a PCI error
+event will be platform-dependent, but will follow the general
+sequence described below.
+
+STEP 0: Error Event
+-------------------
+A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
+is isolated, in that all I/O is blocked: all reads return 0xffffffff,
+all writes are ignored.
+
+
+STEP 1: Notification
+--------------------
+Platform calls the error_detected() callback on every instance of
+every driver affected by the error.
+
+At this point, the device might not be accessible anymore, depending on
+the platform (the slot will be isolated on powerpc). The driver may
+already have "noticed" the error because of a failing I/O, but this
+is the proper "synchronization point", that is, it gives the driver
+a chance to cleanup, waiting for pending stuff (timers, whatever, etc...)
+to complete; it can take semaphores, schedule, etc... everything but
+touch the device. Within this function and after it returns, the driver
+shouldn't do any new IOs. Called in task context. This is sort of a
+"quiesce" point. See note about interrupts at the end of this doc.
+
+All drivers participating in this system must implement this call.
+The driver must return one of the following result codes:
+
+  - PCI_ERS_RESULT_CAN_RECOVER
+      Driver returns this if it thinks it might be able to recover
+      the HW by just banging IOs or if it wants to be given
+      a chance to extract some diagnostic information (see
+      mmio_enable, below).
+  - PCI_ERS_RESULT_NEED_RESET
+      Driver returns this if it can't recover without a
+      slot reset.
+  - PCI_ERS_RESULT_DISCONNECT
+      Driver returns this if it doesn't want to recover at all.
+
+The next step taken will depend on the result codes returned by the
+drivers.
+
+If all drivers on the segment/slot return PCI_ERS_RESULT_CAN_RECOVER,
+then the platform should re-enable IOs on the slot (or do nothing in
+particular, if the platform doesn't isolate slots), and recovery
+proceeds to STEP 2 (MMIO Enable).
+
+If any driver requested a slot reset (by returning PCI_ERS_RESULT_NEED_RESET),
+then recovery proceeds to STEP 4 (Slot Reset).
+
+If the platform is unable to recover the slot, the next step
+is STEP 6 (Permanent Failure).
+
+.. note::
+
+   The current powerpc implementation assumes that a device driver will
+   *not* schedule or semaphore in this routine; the current powerpc
+   implementation uses one kernel thread to notify all devices;
+   thus, if one device sleeps/schedules, all devices are affected.
+   Doing better requires complex multi-threaded logic in the error
+   recovery implementation (e.g. waiting for all notification threads
+   to "join" before proceeding with recovery.)  This seems excessively
+   complex and not worth implementing.
+
+   The current powerpc implementation doesn't much care if the device
+   attempts I/O at this point, or not.  I/O's will fail, returning
+   a value of 0xff on read, and writes will be dropped. If more than
+   EEH_MAX_FAILS I/O's are attempted to a frozen adapter, EEH
+   assumes that the device driver has gone into an infinite loop
+   and prints an error to syslog.  A reboot is then required to
+   get the device working again.
+
+STEP 2: MMIO Enabled
+--------------------
+The platform re-enables MMIO to the device (but typically not the
+DMA), and then calls the mmio_enabled() callback on all affected
+device drivers.
+
+This is the "early recovery" call. IOs are allowed again, but DMA is
+not, with some restrictions. This is NOT a callback for the driver to
+start operations again, only to peek/poke at the device, extract diagnostic
+information, if any, and eventually do things like trigger a device local
+reset or some such, but not restart operations. This callback is made if
+all drivers on a segment agree that they can try to recover and if no automatic
+link reset was performed by the HW. If the platform can't just re-enable IOs
+without a slot reset or a link reset, it will not call this callback, and
+instead will have gone directly to STEP 3 (Link Reset) or STEP 4 (Slot Reset)
+
+.. note::
+
+   The following is proposed; no platform implements this yet:
+   Proposal: All I/O's should be done _synchronously_ from within
+   this callback, errors triggered by them will be returned via
+   the normal pci_check_whatever() API, no new error_detected()
+   callback will be issued due to an error happening here. However,
+   such an error might cause IOs to be re-blocked for the whole
+   segment, and thus invalidate the recovery that other devices
+   on the same segment might have done, forcing the whole segment
+   into one of the next states, that is, link reset or slot reset.
+
+The driver should return one of the following result codes:
+  - PCI_ERS_RESULT_RECOVERED
+      Driver returns this if it thinks the device is fully
+      functional and thinks it is ready to start
+      normal driver operations again. There is no
+      guarantee that the driver will actually be
+      allowed to proceed, as another driver on the
+      same segment might have failed and thus triggered a
+      slot reset on platforms that support it.
+
+  - PCI_ERS_RESULT_NEED_RESET
+      Driver returns this if it thinks the device is not
+      recoverable in its current state and it needs a slot
+      reset to proceed.
+
+  - PCI_ERS_RESULT_DISCONNECT
+      Same as above. Total failure, no recovery even after
+      reset driver dead. (To be defined more precisely)
+
+The next step taken depends on the results returned by the drivers.
+If all drivers returned PCI_ERS_RESULT_RECOVERED, then the platform
+proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations).
+
+If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
+proceeds to STEP 4 (Slot Reset)
+
+STEP 3: Link Reset
+------------------
+The platform resets the link.  This is a PCI-Express specific step
+and is done whenever a fatal error has been detected that can be
+"solved" by resetting the link.
+
+STEP 4: Slot Reset
+------------------
+
+In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
+the platform will perform a slot reset on the requesting PCI device(s).
+The actual steps taken by a platform to perform a slot reset
+will be platform-dependent. Upon completion of slot reset, the
+platform will call the device slot_reset() callback.
+
+Powerpc platforms implement two levels of slot reset:
+soft reset(default) and fundamental(optional) reset.
+
+Powerpc soft reset consists of asserting the adapter #RST line and then
+restoring the PCI BAR's and PCI configuration header to a state
+that is equivalent to what it would be after a fresh system
+power-on followed by power-on BIOS/system firmware initialization.
+Soft reset is also known as hot-reset.
+
+Powerpc fundamental reset is supported by PCI Express cards only
+and results in device's state machines, hardware logic, port states and
+configuration registers to initialize to their default conditions.
+
+For most PCI devices, a soft reset will be sufficient for recovery.
+Optional fundamental reset is provided to support a limited number
+of PCI Express devices for which a soft reset is not sufficient
+for recovery.
+
+If the platform supports PCI hotplug, then the reset might be
+performed by toggling the slot electrical power off/on.
+
+It is important for the platform to restore the PCI config space
+to the "fresh poweron" state, rather than the "last state". After
+a slot reset, the device driver will almost always use its standard
+device initialization routines, and an unusual config space setup
+may result in hung devices, kernel panics, or silent data corruption.
+
+This call gives drivers the chance to re-initialize the hardware
+(re-download firmware, etc.).  At this point, the driver may assume
+that the card is in a fresh state and is fully functional. The slot
+is unfrozen and the driver has full access to PCI config space,
+memory mapped I/O space and DMA. Interrupts (Legacy, MSI, or MSI-X)
+will also be available.
+
+Drivers should not restart normal I/O processing operations
+at this point.  If all device drivers report success on this
+callback, the platform will call resume() to complete the sequence,
+and let the driver restart normal I/O processing.
+
+A driver can still return a critical failure for this function if
+it can't get the device operational after reset.  If the platform
+previously tried a soft reset, it might now try a hard reset (power
+cycle) and then call slot_reset() again.  It the device still can't
+be recovered, there is nothing more that can be done;  the platform
+will typically report a "permanent failure" in such a case.  The
+device will be considered "dead" in this case.
+
+Drivers for multi-function cards will need to coordinate among
+themselves as to which driver instance will perform any "one-shot"
+or global device initialization. For example, the Symbios sym53cxx2
+driver performs device init only from PCI function 0::
+
+	+       if (PCI_FUNC(pdev->devfn) == 0)
+	+               sym_reset_scsi_bus(np, 0);
+
+Result codes:
+	- PCI_ERS_RESULT_DISCONNECT
+	  Same as above.
+
+Drivers for PCI Express cards that require a fundamental reset must
+set the needs_freset bit in the pci_dev structure in their probe function.
+For example, the QLogic qla2xxx driver sets the needs_freset bit for certain
+PCI card types::
+
+	+	/* Set EEH reset type to fundamental if required by hba  */
+	+	if (IS_QLA24XX(ha) || IS_QLA25XX(ha) || IS_QLA81XX(ha))
+	+		pdev->needs_freset = 1;
+	+
+
+Platform proceeds either to STEP 5 (Resume Operations) or STEP 6 (Permanent
+Failure).
+
+.. note::
+
+   The current powerpc implementation does not try a power-cycle
+   reset if the driver returned PCI_ERS_RESULT_DISCONNECT.
+   However, it probably should.
+
+
+STEP 5: Resume Operations
+-------------------------
+The platform will call the resume() callback on all affected device
+drivers if all drivers on the segment have returned
+PCI_ERS_RESULT_RECOVERED from one of the 3 previous callbacks.
+The goal of this callback is to tell the driver to restart activity,
+that everything is back and running. This callback does not return
+a result code.
+
+At this point, if a new error happens, the platform will restart
+a new error recovery sequence.
+
+STEP 6: Permanent Failure
+-------------------------
+A "permanent failure" has occurred, and the platform cannot recover
+the device.  The platform will call error_detected() with a
+pci_channel_state value of pci_channel_io_perm_failure.
+
+The device driver should, at this point, assume the worst. It should
+cancel all pending I/O, refuse all new I/O, returning -EIO to
+higher layers. The device driver should then clean up all of its
+memory and remove itself from kernel operations, much as it would
+during system shutdown.
+
+The platform will typically notify the system operator of the
+permanent failure in some way.  If the device is hotplug-capable,
+the operator will probably want to remove and replace the device.
+Note, however, not all failures are truly "permanent". Some are
+caused by over-heating, some by a poorly seated card. Many
+PCI error events are caused by software bugs, e.g. DMA's to
+wild addresses or bogus split transactions due to programming
+errors. See the discussion in powerpc/eeh-pci-error-recovery.txt
+for additional detail on real-life experience of the causes of
+software errors.
+
+
+Conclusion; General Remarks
+---------------------------
+The way the callbacks are called is platform policy. A platform with
+no slot reset capability may want to just "ignore" drivers that can't
+recover (disconnect them) and try to let other cards on the same segment
+recover. Keep in mind that in most real life cases, though, there will
+be only one driver per segment.
+
+Now, a note about interrupts. If you get an interrupt and your
+device is dead or has been isolated, there is a problem :)
+The current policy is to turn this into a platform policy.
+That is, the recovery API only requires that:
+
+ - There is no guarantee that interrupt delivery can proceed from any
+   device on the segment starting from the error detection and until the
+   slot_reset callback is called, at which point interrupts are expected
+   to be fully operational.
+
+ - There is no guarantee that interrupt delivery is stopped, that is,
+   a driver that gets an interrupt after detecting an error, or that detects
+   an error within the interrupt handler such that it prevents proper
+   ack'ing of the interrupt (and thus removal of the source) should just
+   return IRQ_NOTHANDLED. It's up to the platform to deal with that
+   condition, typically by masking the IRQ source during the duration of
+   the error handling. It is expected that the platform "knows" which
+   interrupts are routed to error-management capable slots and can deal
+   with temporarily disabling that IRQ number during error processing (this
+   isn't terribly complex). That means some IRQ latency for other devices
+   sharing the interrupt, but there is simply no other way. High end
+   platforms aren't supposed to share interrupts between many devices
+   anyway :)
+
+.. note::
+
+   Implementation details for the powerpc platform are discussed in
+   the file Documentation/powerpc/eeh-pci-error-recovery.rst
+
+   As of this writing, there is a growing list of device drivers with
+   patches implementing error recovery. Not all of these patches are in
+   mainline yet. These may be used as "examples":
+
+   - drivers/scsi/ipr
+   - drivers/scsi/sym53c8xx_2
+   - drivers/scsi/qla2xxx
+   - drivers/scsi/lpfc
+   - drivers/next/bnx2.c
+   - drivers/next/e100.c
+   - drivers/net/e1000
+   - drivers/net/e1000e
+   - drivers/net/ixgb
+   - drivers/net/ixgbe
+   - drivers/net/cxgb3
+   - drivers/net/s2io.c
+   - drivers/net/qlge
+
+The End
+-------
diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt
deleted file mode 100644
index 0b6bb3ef449e..000000000000
--- a/Documentation/PCI/pci-error-recovery.txt
+++ /dev/null
@@ -1,413 +0,0 @@
-
-                       PCI Error Recovery
-                       ------------------
-                        February 2, 2006
-
-                 Current document maintainer:
-             Linas Vepstas <linasvepstas@gmail.com>
-          updated by Richard Lary <rlary@us.ibm.com>
-       and Mike Mason <mmlnx@us.ibm.com> on 27-Jul-2009
-
-
-Many PCI bus controllers are able to detect a variety of hardware
-PCI errors on the bus, such as parity errors on the data and address
-buses, as well as SERR and PERR errors.  Some of the more advanced
-chipsets are able to deal with these errors; these include PCI-E chipsets,
-and the PCI-host bridges found on IBM Power4, Power5 and Power6-based
-pSeries boxes. A typical action taken is to disconnect the affected device,
-halting all I/O to it.  The goal of a disconnection is to avoid system
-corruption; for example, to halt system memory corruption due to DMA's
-to "wild" addresses. Typically, a reconnection mechanism is also
-offered, so that the affected PCI device(s) are reset and put back
-into working condition. The reset phase requires coordination
-between the affected device drivers and the PCI controller chip.
-This document describes a generic API for notifying device drivers
-of a bus disconnection, and then performing error recovery.
-This API is currently implemented in the 2.6.16 and later kernels.
-
-Reporting and recovery is performed in several steps. First, when
-a PCI hardware error has resulted in a bus disconnect, that event
-is reported as soon as possible to all affected device drivers,
-including multiple instances of a device driver on multi-function
-cards. This allows device drivers to avoid deadlocking in spinloops,
-waiting for some i/o-space register to change, when it never will.
-It also gives the drivers a chance to defer incoming I/O as
-needed.
-
-Next, recovery is performed in several stages. Most of the complexity
-is forced by the need to handle multi-function devices, that is,
-devices that have multiple device drivers associated with them.
-In the first stage, each driver is allowed to indicate what type
-of reset it desires, the choices being a simple re-enabling of I/O
-or requesting a slot reset.
-
-If any driver requests a slot reset, that is what will be done.
-
-After a reset and/or a re-enabling of I/O, all drivers are
-again notified, so that they may then perform any device setup/config
-that may be required.  After these have all completed, a final
-"resume normal operations" event is sent out.
-
-The biggest reason for choosing a kernel-based implementation rather
-than a user-space implementation was the need to deal with bus
-disconnects of PCI devices attached to storage media, and, in particular,
-disconnects from devices holding the root file system.  If the root
-file system is disconnected, a user-space mechanism would have to go
-through a large number of contortions to complete recovery. Almost all
-of the current Linux file systems are not tolerant of disconnection
-from/reconnection to their underlying block device. By contrast,
-bus errors are easy to manage in the device driver. Indeed, most
-device drivers already handle very similar recovery procedures;
-for example, the SCSI-generic layer already provides significant
-mechanisms for dealing with SCSI bus errors and SCSI bus resets.
-
-
-Detailed Design
----------------
-Design and implementation details below, based on a chain of
-public email discussions with Ben Herrenschmidt, circa 5 April 2005.
-
-The error recovery API support is exposed to the driver in the form of
-a structure of function pointers pointed to by a new field in struct
-pci_driver. A driver that fails to provide the structure is "non-aware",
-and the actual recovery steps taken are platform dependent.  The
-arch/powerpc implementation will simulate a PCI hotplug remove/add.
-
-This structure has the form:
-struct pci_error_handlers
-{
-	int (*error_detected)(struct pci_dev *dev, enum pci_channel_state);
-	int (*mmio_enabled)(struct pci_dev *dev);
-	int (*slot_reset)(struct pci_dev *dev);
-	void (*resume)(struct pci_dev *dev);
-};
-
-The possible channel states are:
-enum pci_channel_state {
-	pci_channel_io_normal,  /* I/O channel is in normal state */
-	pci_channel_io_frozen,  /* I/O to channel is blocked */
-	pci_channel_io_perm_failure, /* PCI card is dead */
-};
-
-Possible return values are:
-enum pci_ers_result {
-	PCI_ERS_RESULT_NONE,        /* no result/none/not supported in device driver */
-	PCI_ERS_RESULT_CAN_RECOVER, /* Device driver can recover without slot reset */
-	PCI_ERS_RESULT_NEED_RESET,  /* Device driver wants slot to be reset. */
-	PCI_ERS_RESULT_DISCONNECT,  /* Device has completely failed, is unrecoverable */
-	PCI_ERS_RESULT_RECOVERED,   /* Device driver is fully recovered and operational */
-};
-
-A driver does not have to implement all of these callbacks; however,
-if it implements any, it must implement error_detected(). If a callback
-is not implemented, the corresponding feature is considered unsupported.
-For example, if mmio_enabled() and resume() aren't there, then it
-is assumed that the driver is not doing any direct recovery and requires
-a slot reset.  Typically a driver will want to know about
-a slot_reset().
-
-The actual steps taken by a platform to recover from a PCI error
-event will be platform-dependent, but will follow the general
-sequence described below.
-
-STEP 0: Error Event
--------------------
-A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
-is isolated, in that all I/O is blocked: all reads return 0xffffffff,
-all writes are ignored.
-
-
-STEP 1: Notification
---------------------
-Platform calls the error_detected() callback on every instance of
-every driver affected by the error.
-
-At this point, the device might not be accessible anymore, depending on
-the platform (the slot will be isolated on powerpc). The driver may
-already have "noticed" the error because of a failing I/O, but this
-is the proper "synchronization point", that is, it gives the driver
-a chance to cleanup, waiting for pending stuff (timers, whatever, etc...)
-to complete; it can take semaphores, schedule, etc... everything but
-touch the device. Within this function and after it returns, the driver
-shouldn't do any new IOs. Called in task context. This is sort of a
-"quiesce" point. See note about interrupts at the end of this doc.
-
-All drivers participating in this system must implement this call.
-The driver must return one of the following result codes:
-		- PCI_ERS_RESULT_CAN_RECOVER:
-		  Driver returns this if it thinks it might be able to recover
-		  the HW by just banging IOs or if it wants to be given
-		  a chance to extract some diagnostic information (see
-		  mmio_enable, below).
-		- PCI_ERS_RESULT_NEED_RESET:
-		  Driver returns this if it can't recover without a
-		  slot reset.
-		- PCI_ERS_RESULT_DISCONNECT:
-		  Driver returns this if it doesn't want to recover at all.
-
-The next step taken will depend on the result codes returned by the
-drivers.
-
-If all drivers on the segment/slot return PCI_ERS_RESULT_CAN_RECOVER,
-then the platform should re-enable IOs on the slot (or do nothing in
-particular, if the platform doesn't isolate slots), and recovery
-proceeds to STEP 2 (MMIO Enable).
-
-If any driver requested a slot reset (by returning PCI_ERS_RESULT_NEED_RESET),
-then recovery proceeds to STEP 4 (Slot Reset).
-
-If the platform is unable to recover the slot, the next step
-is STEP 6 (Permanent Failure).
-
->>> The current powerpc implementation assumes that a device driver will
->>> *not* schedule or semaphore in this routine; the current powerpc
->>> implementation uses one kernel thread to notify all devices;
->>> thus, if one device sleeps/schedules, all devices are affected.
->>> Doing better requires complex multi-threaded logic in the error
->>> recovery implementation (e.g. waiting for all notification threads
->>> to "join" before proceeding with recovery.)  This seems excessively
->>> complex and not worth implementing.
-
->>> The current powerpc implementation doesn't much care if the device
->>> attempts I/O at this point, or not.  I/O's will fail, returning
->>> a value of 0xff on read, and writes will be dropped. If more than
->>> EEH_MAX_FAILS I/O's are attempted to a frozen adapter, EEH
->>> assumes that the device driver has gone into an infinite loop
->>> and prints an error to syslog.  A reboot is then required to
->>> get the device working again.
-
-STEP 2: MMIO Enabled
--------------------
-The platform re-enables MMIO to the device (but typically not the
-DMA), and then calls the mmio_enabled() callback on all affected
-device drivers.
-
-This is the "early recovery" call. IOs are allowed again, but DMA is
-not, with some restrictions. This is NOT a callback for the driver to
-start operations again, only to peek/poke at the device, extract diagnostic
-information, if any, and eventually do things like trigger a device local
-reset or some such, but not restart operations. This callback is made if
-all drivers on a segment agree that they can try to recover and if no automatic
-link reset was performed by the HW. If the platform can't just re-enable IOs
-without a slot reset or a link reset, it will not call this callback, and
-instead will have gone directly to STEP 3 (Link Reset) or STEP 4 (Slot Reset)
-
->>> The following is proposed; no platform implements this yet:
->>> Proposal: All I/O's should be done _synchronously_ from within
->>> this callback, errors triggered by them will be returned via
->>> the normal pci_check_whatever() API, no new error_detected()
->>> callback will be issued due to an error happening here. However,
->>> such an error might cause IOs to be re-blocked for the whole
->>> segment, and thus invalidate the recovery that other devices
->>> on the same segment might have done, forcing the whole segment
->>> into one of the next states, that is, link reset or slot reset.
-
-The driver should return one of the following result codes:
-		- PCI_ERS_RESULT_RECOVERED
-		  Driver returns this if it thinks the device is fully
-		  functional and thinks it is ready to start
-		  normal driver operations again. There is no
-		  guarantee that the driver will actually be
-		  allowed to proceed, as another driver on the
-		  same segment might have failed and thus triggered a
-		  slot reset on platforms that support it.
-
-		- PCI_ERS_RESULT_NEED_RESET
-		  Driver returns this if it thinks the device is not
-		  recoverable in its current state and it needs a slot
-		  reset to proceed.
-
-		- PCI_ERS_RESULT_DISCONNECT
-		  Same as above. Total failure, no recovery even after
-		  reset driver dead. (To be defined more precisely)
-
-The next step taken depends on the results returned by the drivers.
-If all drivers returned PCI_ERS_RESULT_RECOVERED, then the platform
-proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations).
-
-If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
-proceeds to STEP 4 (Slot Reset)
-
-STEP 3: Link Reset
-------------------
-The platform resets the link.  This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
-------------------
-
-In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
-the platform will perform a slot reset on the requesting PCI device(s).
-The actual steps taken by a platform to perform a slot reset
-will be platform-dependent. Upon completion of slot reset, the
-platform will call the device slot_reset() callback.
-
-Powerpc platforms implement two levels of slot reset:
-soft reset(default) and fundamental(optional) reset.
-
-Powerpc soft reset consists of asserting the adapter #RST line and then
-restoring the PCI BAR's and PCI configuration header to a state
-that is equivalent to what it would be after a fresh system
-power-on followed by power-on BIOS/system firmware initialization.
-Soft reset is also known as hot-reset.
-
-Powerpc fundamental reset is supported by PCI Express cards only
-and results in device's state machines, hardware logic, port states and
-configuration registers to initialize to their default conditions.
-
-For most PCI devices, a soft reset will be sufficient for recovery.
-Optional fundamental reset is provided to support a limited number
-of PCI Express devices for which a soft reset is not sufficient
-for recovery.
-
-If the platform supports PCI hotplug, then the reset might be
-performed by toggling the slot electrical power off/on.
-
-It is important for the platform to restore the PCI config space
-to the "fresh poweron" state, rather than the "last state". After
-a slot reset, the device driver will almost always use its standard
-device initialization routines, and an unusual config space setup
-may result in hung devices, kernel panics, or silent data corruption.
-
-This call gives drivers the chance to re-initialize the hardware
-(re-download firmware, etc.).  At this point, the driver may assume
-that the card is in a fresh state and is fully functional. The slot
-is unfrozen and the driver has full access to PCI config space,
-memory mapped I/O space and DMA. Interrupts (Legacy, MSI, or MSI-X)
-will also be available.
-
-Drivers should not restart normal I/O processing operations
-at this point.  If all device drivers report success on this
-callback, the platform will call resume() to complete the sequence,
-and let the driver restart normal I/O processing.
-
-A driver can still return a critical failure for this function if
-it can't get the device operational after reset.  If the platform
-previously tried a soft reset, it might now try a hard reset (power
-cycle) and then call slot_reset() again.  It the device still can't
-be recovered, there is nothing more that can be done;  the platform
-will typically report a "permanent failure" in such a case.  The
-device will be considered "dead" in this case.
-
-Drivers for multi-function cards will need to coordinate among
-themselves as to which driver instance will perform any "one-shot"
-or global device initialization. For example, the Symbios sym53cxx2
-driver performs device init only from PCI function 0:
-
-+       if (PCI_FUNC(pdev->devfn) == 0)
-+               sym_reset_scsi_bus(np, 0);
-
-	Result codes:
-		- PCI_ERS_RESULT_DISCONNECT
-		Same as above.
-
-Drivers for PCI Express cards that require a fundamental reset must
-set the needs_freset bit in the pci_dev structure in their probe function.
-For example, the QLogic qla2xxx driver sets the needs_freset bit for certain
-PCI card types:
-
-+	/* Set EEH reset type to fundamental if required by hba  */
-+	if (IS_QLA24XX(ha) || IS_QLA25XX(ha) || IS_QLA81XX(ha))
-+		pdev->needs_freset = 1;
-+
-
-Platform proceeds either to STEP 5 (Resume Operations) or STEP 6 (Permanent
-Failure).
-
->>> The current powerpc implementation does not try a power-cycle
->>> reset if the driver returned PCI_ERS_RESULT_DISCONNECT.
->>> However, it probably should.
-
-
-STEP 5: Resume Operations
--------------------------
-The platform will call the resume() callback on all affected device
-drivers if all drivers on the segment have returned
-PCI_ERS_RESULT_RECOVERED from one of the 3 previous callbacks.
-The goal of this callback is to tell the driver to restart activity,
-that everything is back and running. This callback does not return
-a result code.
-
-At this point, if a new error happens, the platform will restart
-a new error recovery sequence.
-
-STEP 6: Permanent Failure
--------------------------
-A "permanent failure" has occurred, and the platform cannot recover
-the device.  The platform will call error_detected() with a
-pci_channel_state value of pci_channel_io_perm_failure.
-
-The device driver should, at this point, assume the worst. It should
-cancel all pending I/O, refuse all new I/O, returning -EIO to
-higher layers. The device driver should then clean up all of its
-memory and remove itself from kernel operations, much as it would
-during system shutdown.
-
-The platform will typically notify the system operator of the
-permanent failure in some way.  If the device is hotplug-capable,
-the operator will probably want to remove and replace the device.
-Note, however, not all failures are truly "permanent". Some are
-caused by over-heating, some by a poorly seated card. Many
-PCI error events are caused by software bugs, e.g. DMA's to
-wild addresses or bogus split transactions due to programming
-errors. See the discussion in powerpc/eeh-pci-error-recovery.txt
-for additional detail on real-life experience of the causes of
-software errors.
-
-
-Conclusion; General Remarks
----------------------------
-The way the callbacks are called is platform policy. A platform with
-no slot reset capability may want to just "ignore" drivers that can't
-recover (disconnect them) and try to let other cards on the same segment
-recover. Keep in mind that in most real life cases, though, there will
-be only one driver per segment.
-
-Now, a note about interrupts. If you get an interrupt and your
-device is dead or has been isolated, there is a problem :)
-The current policy is to turn this into a platform policy.
-That is, the recovery API only requires that:
-
- - There is no guarantee that interrupt delivery can proceed from any
-device on the segment starting from the error detection and until the
-slot_reset callback is called, at which point interrupts are expected
-to be fully operational.
-
- - There is no guarantee that interrupt delivery is stopped, that is,
-a driver that gets an interrupt after detecting an error, or that detects
-an error within the interrupt handler such that it prevents proper
-ack'ing of the interrupt (and thus removal of the source) should just
-return IRQ_NOTHANDLED. It's up to the platform to deal with that
-condition, typically by masking the IRQ source during the duration of
-the error handling. It is expected that the platform "knows" which
-interrupts are routed to error-management capable slots and can deal
-with temporarily disabling that IRQ number during error processing (this
-isn't terribly complex). That means some IRQ latency for other devices
-sharing the interrupt, but there is simply no other way. High end
-platforms aren't supposed to share interrupts between many devices
-anyway :)
-
->>> Implementation details for the powerpc platform are discussed in
->>> the file Documentation/powerpc/eeh-pci-error-recovery.txt
-
->>> As of this writing, there is a growing list of device drivers with
->>> patches implementing error recovery. Not all of these patches are in
->>> mainline yet. These may be used as "examples":
->>>
->>> drivers/scsi/ipr
->>> drivers/scsi/sym53c8xx_2
->>> drivers/scsi/qla2xxx
->>> drivers/scsi/lpfc
->>> drivers/next/bnx2.c
->>> drivers/next/e100.c
->>> drivers/net/e1000
->>> drivers/net/e1000e
->>> drivers/net/ixgb
->>> drivers/net/ixgbe
->>> drivers/net/cxgb3
->>> drivers/net/s2io.c
->>> drivers/net/qlge
-
-The End
--------
diff --git a/Documentation/PCI/pci-iov-howto.rst b/Documentation/PCI/pci-iov-howto.rst
new file mode 100644
index 000000000000..b9fd003206f1
--- /dev/null
+++ b/Documentation/PCI/pci-iov-howto.rst
@@ -0,0 +1,172 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+====================================
+PCI Express I/O Virtualization Howto
+====================================
+
+:Copyright: |copy| 2009 Intel Corporation
+:Authors: - Yu Zhao <yu.zhao@intel.com>
+          - Donald Dutile <ddutile@redhat.com>
+
+Overview
+========
+
+What is SR-IOV
+--------------
+
+Single Root I/O Virtualization (SR-IOV) is a PCI Express Extended
+capability which makes one physical device appear as multiple virtual
+devices. The physical device is referred to as Physical Function (PF)
+while the virtual devices are referred to as Virtual Functions (VF).
+Allocation of the VF can be dynamically controlled by the PF via
+registers encapsulated in the capability. By default, this feature is
+not enabled and the PF behaves as traditional PCIe device. Once it's
+turned on, each VF's PCI configuration space can be accessed by its own
+Bus, Device and Function Number (Routing ID). And each VF also has PCI
+Memory Space, which is used to map its register set. VF device driver
+operates on the register set so it can be functional and appear as a
+real existing PCI device.
+
+User Guide
+==========
+
+How can I enable SR-IOV capability
+----------------------------------
+
+Multiple methods are available for SR-IOV enablement.
+In the first method, the device driver (PF driver) will control the
+enabling and disabling of the capability via API provided by SR-IOV core.
+If the hardware has SR-IOV capability, loading its PF driver would
+enable it and all VFs associated with the PF.  Some PF drivers require
+a module parameter to be set to determine the number of VFs to enable.
+In the second method, a write to the sysfs file sriov_numvfs will
+enable and disable the VFs associated with a PCIe PF.  This method
+enables per-PF, VF enable/disable values versus the first method,
+which applies to all PFs of the same device.  Additionally, the
+PCI SRIOV core support ensures that enable/disable operations are
+valid to reduce duplication in multiple drivers for the same
+checks, e.g., check numvfs == 0 if enabling VFs, ensure
+numvfs <= totalvfs.
+The second method is the recommended method for new/future VF devices.
+
+How can I use the Virtual Functions
+-----------------------------------
+
+The VF is treated as hot-plugged PCI devices in the kernel, so they
+should be able to work in the same way as real PCI devices. The VF
+requires device driver that is same as a normal PCI device's.
+
+Developer Guide
+===============
+
+SR-IOV API
+----------
+
+To enable SR-IOV capability:
+
+(a) For the first method, in the driver::
+
+	int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
+
+'nr_virtfn' is number of VFs to be enabled.
+
+(b) For the second method, from sysfs::
+
+	echo 'nr_virtfn' > \
+        /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_numvfs
+
+To disable SR-IOV capability:
+
+(a) For the first method, in the driver::
+
+	void pci_disable_sriov(struct pci_dev *dev);
+
+(b) For the second method, from sysfs::
+
+	echo  0 > \
+        /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_numvfs
+
+To enable auto probing VFs by a compatible driver on the host, run
+command below before enabling SR-IOV capabilities. This is the
+default behavior.
+::
+
+	echo 1 > \
+        /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe
+
+To disable auto probing VFs by a compatible driver on the host, run
+command below before enabling SR-IOV capabilities. Updating this
+entry will not affect VFs which are already probed.
+::
+
+	echo  0 > \
+        /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe
+
+Usage example
+-------------
+
+Following piece of code illustrates the usage of the SR-IOV API.
+::
+
+	static int dev_probe(struct pci_dev *dev, const struct pci_device_id *id)
+	{
+		pci_enable_sriov(dev, NR_VIRTFN);
+
+		...
+
+		return 0;
+	}
+
+	static void dev_remove(struct pci_dev *dev)
+	{
+		pci_disable_sriov(dev);
+
+		...
+	}
+
+	static int dev_suspend(struct pci_dev *dev, pm_message_t state)
+	{
+		...
+
+		return 0;
+	}
+
+	static int dev_resume(struct pci_dev *dev)
+	{
+		...
+
+		return 0;
+	}
+
+	static void dev_shutdown(struct pci_dev *dev)
+	{
+		...
+	}
+
+	static int dev_sriov_configure(struct pci_dev *dev, int numvfs)
+	{
+		if (numvfs > 0) {
+			...
+			pci_enable_sriov(dev, numvfs);
+			...
+			return numvfs;
+		}
+		if (numvfs == 0) {
+			....
+			pci_disable_sriov(dev);
+			...
+			return 0;
+		}
+	}
+
+	static struct pci_driver dev_driver = {
+		.name =		"SR-IOV Physical Function driver",
+		.id_table =	dev_id_table,
+		.probe =	dev_probe,
+		.remove =	dev_remove,
+		.suspend =	dev_suspend,
+		.resume =	dev_resume,
+		.shutdown =	dev_shutdown,
+		.sriov_configure = dev_sriov_configure,
+	};
diff --git a/Documentation/PCI/pci-iov-howto.txt b/Documentation/PCI/pci-iov-howto.txt
deleted file mode 100644
index d2a84151e99c..000000000000
--- a/Documentation/PCI/pci-iov-howto.txt
+++ /dev/null
@@ -1,147 +0,0 @@
-		PCI Express I/O Virtualization Howto
-		Copyright (C) 2009 Intel Corporation
-		    Yu Zhao <yu.zhao@intel.com>
-
-		Update: November 2012
-			-- sysfs-based SRIOV enable-/disable-ment
-		Donald Dutile <ddutile@redhat.com>
-
-1. Overview
-
-1.1 What is SR-IOV
-
-Single Root I/O Virtualization (SR-IOV) is a PCI Express Extended
-capability which makes one physical device appear as multiple virtual
-devices. The physical device is referred to as Physical Function (PF)
-while the virtual devices are referred to as Virtual Functions (VF).
-Allocation of the VF can be dynamically controlled by the PF via
-registers encapsulated in the capability. By default, this feature is
-not enabled and the PF behaves as traditional PCIe device. Once it's
-turned on, each VF's PCI configuration space can be accessed by its own
-Bus, Device and Function Number (Routing ID). And each VF also has PCI
-Memory Space, which is used to map its register set. VF device driver
-operates on the register set so it can be functional and appear as a
-real existing PCI device.
-
-2. User Guide
-
-2.1 How can I enable SR-IOV capability
-
-Multiple methods are available for SR-IOV enablement.
-In the first method, the device driver (PF driver) will control the
-enabling and disabling of the capability via API provided by SR-IOV core.
-If the hardware has SR-IOV capability, loading its PF driver would
-enable it and all VFs associated with the PF.  Some PF drivers require
-a module parameter to be set to determine the number of VFs to enable.
-In the second method, a write to the sysfs file sriov_numvfs will
-enable and disable the VFs associated with a PCIe PF.  This method
-enables per-PF, VF enable/disable values versus the first method,
-which applies to all PFs of the same device.  Additionally, the
-PCI SRIOV core support ensures that enable/disable operations are
-valid to reduce duplication in multiple drivers for the same
-checks, e.g., check numvfs == 0 if enabling VFs, ensure
-numvfs <= totalvfs.
-The second method is the recommended method for new/future VF devices.
-
-2.2 How can I use the Virtual Functions
-
-The VF is treated as hot-plugged PCI devices in the kernel, so they
-should be able to work in the same way as real PCI devices. The VF
-requires device driver that is same as a normal PCI device's.
-
-3. Developer Guide
-
-3.1 SR-IOV API
-
-To enable SR-IOV capability:
-(a) For the first method, in the driver:
-	int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
-	'nr_virtfn' is number of VFs to be enabled.
-(b) For the second method, from sysfs:
-	echo 'nr_virtfn' > \
-        /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_numvfs
-
-To disable SR-IOV capability:
-(a) For the first method, in the driver:
-	void pci_disable_sriov(struct pci_dev *dev);
-(b) For the second method, from sysfs:
-	echo  0 > \
-        /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_numvfs
-
-To enable auto probing VFs by a compatible driver on the host, run
-command below before enabling SR-IOV capabilities. This is the
-default behavior.
-	echo 1 > \
-        /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe
-
-To disable auto probing VFs by a compatible driver on the host, run
-command below before enabling SR-IOV capabilities. Updating this
-entry will not affect VFs which are already probed.
-	echo  0 > \
-        /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe
-
-3.2 Usage example
-
-Following piece of code illustrates the usage of the SR-IOV API.
-
-static int dev_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	pci_enable_sriov(dev, NR_VIRTFN);
-
-	...
-
-	return 0;
-}
-
-static void dev_remove(struct pci_dev *dev)
-{
-	pci_disable_sriov(dev);
-
-	...
-}
-
-static int dev_suspend(struct pci_dev *dev, pm_message_t state)
-{
-	...
-
-	return 0;
-}
-
-static int dev_resume(struct pci_dev *dev)
-{
-	...
-
-	return 0;
-}
-
-static void dev_shutdown(struct pci_dev *dev)
-{
-	...
-}
-
-static int dev_sriov_configure(struct pci_dev *dev, int numvfs)
-{
-	if (numvfs > 0) {
-		...
-		pci_enable_sriov(dev, numvfs);
-		...
-		return numvfs;
-	}
-	if (numvfs == 0) {
-		....
-		pci_disable_sriov(dev);
-		...
-		return 0;
-	}
-}
-
-static struct pci_driver dev_driver = {
-	.name =		"SR-IOV Physical Function driver",
-	.id_table =	dev_id_table,
-	.probe =	dev_probe,
-	.remove =	dev_remove,
-	.suspend =	dev_suspend,
-	.resume =	dev_resume,
-	.shutdown =	dev_shutdown,
-	.sriov_configure = dev_sriov_configure,
-};
diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst
new file mode 100644
index 000000000000..6864f9a70f5f
--- /dev/null
+++ b/Documentation/PCI/pci.rst
@@ -0,0 +1,578 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+How To Write Linux PCI Drivers
+==============================
+
+:Authors: - Martin Mares <mj@ucw.cz>
+          - Grant Grundler <grundler@parisc-linux.org>
+
+The world of PCI is vast and full of (mostly unpleasant) surprises.
+Since each CPU architecture implements different chip-sets and PCI devices
+have different requirements (erm, "features"), the result is the PCI support
+in the Linux kernel is not as trivial as one would wish. This short paper
+tries to introduce all potential driver authors to Linux APIs for
+PCI device drivers.
+
+A more complete resource is the third edition of "Linux Device Drivers"
+by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman.
+LDD3 is available for free (under Creative Commons License) from:
+http://lwn.net/Kernel/LDD3/.
+
+However, keep in mind that all documents are subject to "bit rot".
+Refer to the source code if things are not working as described here.
+
+Please send questions/comments/patches about Linux PCI API to the
+"Linux PCI" <linux-pci@atrey.karlin.mff.cuni.cz> mailing list.
+
+
+Structure of PCI drivers
+========================
+PCI drivers "discover" PCI devices in a system via pci_register_driver().
+Actually, it's the other way around. When the PCI generic code discovers
+a new device, the driver with a matching "description" will be notified.
+Details on this below.
+
+pci_register_driver() leaves most of the probing for devices to
+the PCI layer and supports online insertion/removal of devices [thus
+supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver].
+pci_register_driver() call requires passing in a table of function
+pointers and thus dictates the high level structure of a driver.
+
+Once the driver knows about a PCI device and takes ownership, the
+driver generally needs to perform the following initialization:
+
+  - Enable the device
+  - Request MMIO/IOP resources
+  - Set the DMA mask size (for both coherent and streaming DMA)
+  - Allocate and initialize shared control data (pci_allocate_coherent())
+  - Access device configuration space (if needed)
+  - Register IRQ handler (request_irq())
+  - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
+  - Enable DMA/processing engines
+
+When done using the device, and perhaps the module needs to be unloaded,
+the driver needs to take the follow steps:
+
+  - Disable the device from generating IRQs
+  - Release the IRQ (free_irq())
+  - Stop all DMA activity
+  - Release DMA buffers (both streaming and coherent)
+  - Unregister from other subsystems (e.g. scsi or netdev)
+  - Release MMIO/IOP resources
+  - Disable the device
+
+Most of these topics are covered in the following sections.
+For the rest look at LDD3 or <linux/pci.h> .
+
+If the PCI subsystem is not configured (CONFIG_PCI is not set), most of
+the PCI functions described below are defined as inline functions either
+completely empty or just returning an appropriate error codes to avoid
+lots of ifdefs in the drivers.
+
+
+pci_register_driver() call
+==========================
+
+PCI device drivers call ``pci_register_driver()`` during their
+initialization with a pointer to a structure describing the driver
+(``struct pci_driver``):
+
+.. kernel-doc:: include/linux/pci.h
+   :functions: pci_driver
+
+The ID table is an array of ``struct pci_device_id`` entries ending with an
+all-zero entry.  Definitions with static const are generally preferred.
+
+.. kernel-doc:: include/linux/mod_devicetable.h
+   :functions: pci_device_id
+
+Most drivers only need ``PCI_DEVICE()`` or ``PCI_DEVICE_CLASS()`` to set up
+a pci_device_id table.
+
+New PCI IDs may be added to a device driver pci_ids table at runtime
+as shown below::
+
+  echo "vendor device subvendor subdevice class class_mask driver_data" > \
+  /sys/bus/pci/drivers/{driver}/new_id
+
+All fields are passed in as hexadecimal values (no leading 0x).
+The vendor and device fields are mandatory, the others are optional. Users
+need pass only as many optional fields as necessary:
+
+  - subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
+  - class and classmask fields default to 0
+  - driver_data defaults to 0UL.
+
+Note that driver_data must match the value used by any of the pci_device_id
+entries defined in the driver. This makes the driver_data field mandatory
+if all the pci_device_id entries have a non-zero driver_data value.
+
+Once added, the driver probe routine will be invoked for any unclaimed
+PCI devices listed in its (newly updated) pci_ids list.
+
+When the driver exits, it just calls pci_unregister_driver() and the PCI layer
+automatically calls the remove hook for all devices handled by the driver.
+
+
+"Attributes" for driver functions/data
+--------------------------------------
+
+Please mark the initialization and cleanup functions where appropriate
+(the corresponding macros are defined in <linux/init.h>):
+
+	======		=================================================
+	__init		Initialization code. Thrown away after the driver
+			initializes.
+	__exit		Exit code. Ignored for non-modular drivers.
+	======		=================================================
+
+Tips on when/where to use the above attributes:
+	- The module_init()/module_exit() functions (and all
+	  initialization functions called _only_ from these)
+	  should be marked __init/__exit.
+
+	- Do not mark the struct pci_driver.
+
+	- Do NOT mark a function if you are not sure which mark to use.
+	  Better to not mark the function than mark the function wrong.
+
+
+How to find PCI devices manually
+================================
+
+PCI drivers should have a really good reason for not using the
+pci_register_driver() interface to search for PCI devices.
+The main reason PCI devices are controlled by multiple drivers
+is because one PCI device implements several different HW services.
+E.g. combined serial/parallel port/floppy controller.
+
+A manual search may be performed using the following constructs:
+
+Searching by vendor and device ID::
+
+	struct pci_dev *dev = NULL;
+	while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev))
+		configure_device(dev);
+
+Searching by class ID (iterate in a similar way)::
+
+	pci_get_class(CLASS_ID, dev)
+
+Searching by both vendor/device and subsystem vendor/device ID::
+
+	pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev).
+
+You can use the constant PCI_ANY_ID as a wildcard replacement for
+VENDOR_ID or DEVICE_ID.  This allows searching for any device from a
+specific vendor, for example.
+
+These functions are hotplug-safe. They increment the reference count on
+the pci_dev that they return. You must eventually (possibly at module unload)
+decrement the reference count on these devices by calling pci_dev_put().
+
+
+Device Initialization Steps
+===========================
+
+As noted in the introduction, most PCI drivers need the following steps
+for device initialization:
+
+  - Enable the device
+  - Request MMIO/IOP resources
+  - Set the DMA mask size (for both coherent and streaming DMA)
+  - Allocate and initialize shared control data (pci_allocate_coherent())
+  - Access device configuration space (if needed)
+  - Register IRQ handler (request_irq())
+  - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
+  - Enable DMA/processing engines.
+
+The driver can access PCI config space registers at any time.
+(Well, almost. When running BIST, config space can go away...but
+that will just result in a PCI Bus Master Abort and config reads
+will return garbage).
+
+
+Enable the PCI device
+---------------------
+Before touching any device registers, the driver needs to enable
+the PCI device by calling pci_enable_device(). This will:
+
+  - wake up the device if it was in suspended state,
+  - allocate I/O and memory regions of the device (if BIOS did not),
+  - allocate an IRQ (if BIOS did not).
+
+.. note::
+   pci_enable_device() can fail! Check the return value.
+
+.. warning::
+   OS BUG: we don't check resource allocations before enabling those
+   resources. The sequence would make more sense if we called
+   pci_request_resources() before calling pci_enable_device().
+   Currently, the device drivers can't detect the bug when when two
+   devices have been allocated the same range. This is not a common
+   problem and unlikely to get fixed soon.
+
+   This has been discussed before but not changed as of 2.6.19:
+   http://lkml.org/lkml/2006/3/2/194
+
+
+pci_set_master() will enable DMA by setting the bus master bit
+in the PCI_COMMAND register. It also fixes the latency timer value if
+it's set to something bogus by the BIOS.  pci_clear_master() will
+disable DMA by clearing the bus master bit.
+
+If the PCI device can use the PCI Memory-Write-Invalidate transaction,
+call pci_set_mwi().  This enables the PCI_COMMAND bit for Mem-Wr-Inval
+and also ensures that the cache line size register is set correctly.
+Check the return value of pci_set_mwi() as not all architectures
+or chip-sets may support Memory-Write-Invalidate.  Alternatively,
+if Mem-Wr-Inval would be nice to have but is not required, call
+pci_try_set_mwi() to have the system do its best effort at enabling
+Mem-Wr-Inval.
+
+
+Request MMIO/IOP resources
+--------------------------
+Memory (MMIO), and I/O port addresses should NOT be read directly
+from the PCI device config space. Use the values in the pci_dev structure
+as the PCI "bus address" might have been remapped to a "host physical"
+address by the arch/chip-set specific kernel support.
+
+See Documentation/io-mapping.txt for how to access device registers
+or device memory.
+
+The device driver needs to call pci_request_region() to verify
+no other device is already using the same address resource.
+Conversely, drivers should call pci_release_region() AFTER
+calling pci_disable_device().
+The idea is to prevent two devices colliding on the same address range.
+
+.. tip::
+   See OS BUG comment above. Currently (2.6.19), The driver can only
+   determine MMIO and IO Port resource availability _after_ calling
+   pci_enable_device().
+
+Generic flavors of pci_request_region() are request_mem_region()
+(for MMIO ranges) and request_region() (for IO Port ranges).
+Use these for address resources that are not described by "normal" PCI
+BARs.
+
+Also see pci_request_selected_regions() below.
+
+
+Set the DMA mask size
+---------------------
+.. note::
+   If anything below doesn't make sense, please refer to
+   Documentation/DMA-API.txt. This section is just a reminder that
+   drivers need to indicate DMA capabilities of the device and is not
+   an authoritative source for DMA interfaces.
+
+While all drivers should explicitly indicate the DMA capability
+(e.g. 32 or 64 bit) of the PCI bus master, devices with more than
+32-bit bus master capability for streaming data need the driver
+to "register" this capability by calling pci_set_dma_mask() with
+appropriate parameters.  In general this allows more efficient DMA
+on systems where System RAM exists above 4G _physical_ address.
+
+Drivers for all PCI-X and PCIe compliant devices must call
+pci_set_dma_mask() as they are 64-bit DMA devices.
+
+Similarly, drivers must also "register" this capability if the device
+can directly address "consistent memory" in System RAM above 4G physical
+address by calling pci_set_consistent_dma_mask().
+Again, this includes drivers for all PCI-X and PCIe compliant devices.
+Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
+64-bit DMA capable for payload ("streaming") data but not control
+("consistent") data.
+
+
+Setup shared control data
+-------------------------
+Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
+memory.  See Documentation/DMA-API.txt for a full description of
+the DMA APIs. This section is just a reminder that it needs to be done
+before enabling DMA on the device.
+
+
+Initialize device registers
+---------------------------
+Some drivers will need specific "capability" fields programmed
+or other "vendor specific" register initialized or reset.
+E.g. clearing pending interrupts.
+
+
+Register IRQ handler
+--------------------
+While calling request_irq() is the last step described here,
+this is often just another intermediate step to initialize a device.
+This step can often be deferred until the device is opened for use.
+
+All interrupt handlers for IRQ lines should be registered with IRQF_SHARED
+and use the devid to map IRQs to devices (remember that all PCI IRQ lines
+can be shared).
+
+request_irq() will associate an interrupt handler and device handle
+with an interrupt number. Historically interrupt numbers represent
+IRQ lines which run from the PCI device to the Interrupt controller.
+With MSI and MSI-X (more below) the interrupt number is a CPU "vector".
+
+request_irq() also enables the interrupt. Make sure the device is
+quiesced and does not have any interrupts pending before registering
+the interrupt handler.
+
+MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts"
+which deliver interrupts to the CPU via a DMA write to a Local APIC.
+The fundamental difference between MSI and MSI-X is how multiple
+"vectors" get allocated. MSI requires contiguous blocks of vectors
+while MSI-X can allocate several individual ones.
+
+MSI capability can be enabled by calling pci_alloc_irq_vectors() with the
+PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This
+causes the PCI support to program CPU vector data into the PCI device
+capability registers. Many architectures, chip-sets, or BIOSes do NOT
+support MSI or MSI-X and a call to pci_alloc_irq_vectors with just
+the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always
+specify PCI_IRQ_LEGACY as well.
+
+Drivers that have different interrupt handlers for MSI/MSI-X and
+legacy INTx should chose the right one based on the msi_enabled
+and msix_enabled flags in the pci_dev structure after calling
+pci_alloc_irq_vectors.
+
+There are (at least) two really good reasons for using MSI:
+
+1) MSI is an exclusive interrupt vector by definition.
+   This means the interrupt handler doesn't have to verify
+   its device caused the interrupt.
+
+2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed
+   to be visible to the host CPU(s) when the MSI is delivered. This
+   is important for both data coherency and avoiding stale control data.
+   This guarantee allows the driver to omit MMIO reads to flush
+   the DMA stream.
+
+See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples
+of MSI/MSI-X usage.
+
+
+PCI device shutdown
+===================
+
+When a PCI device driver is being unloaded, most of the following
+steps need to be performed:
+
+  - Disable the device from generating IRQs
+  - Release the IRQ (free_irq())
+  - Stop all DMA activity
+  - Release DMA buffers (both streaming and consistent)
+  - Unregister from other subsystems (e.g. scsi or netdev)
+  - Disable device from responding to MMIO/IO Port addresses
+  - Release MMIO/IO Port resource(s)
+
+
+Stop IRQs on the device
+-----------------------
+How to do this is chip/device specific. If it's not done, it opens
+the possibility of a "screaming interrupt" if (and only if)
+the IRQ is shared with another device.
+
+When the shared IRQ handler is "unhooked", the remaining devices
+using the same IRQ line will still need the IRQ enabled. Thus if the
+"unhooked" device asserts IRQ line, the system will respond assuming
+it was one of the remaining devices asserted the IRQ line. Since none
+of the other devices will handle the IRQ, the system will "hang" until
+it decides the IRQ isn't going to get handled and masks the IRQ (100,000
+iterations later). Once the shared IRQ is masked, the remaining devices
+will stop functioning properly. Not a nice situation.
+
+This is another reason to use MSI or MSI-X if it's available.
+MSI and MSI-X are defined to be exclusive interrupts and thus
+are not susceptible to the "screaming interrupt" problem.
+
+
+Release the IRQ
+---------------
+Once the device is quiesced (no more IRQs), one can call free_irq().
+This function will return control once any pending IRQs are handled,
+"unhook" the drivers IRQ handler from that IRQ, and finally release
+the IRQ if no one else is using it.
+
+
+Stop all DMA activity
+---------------------
+It's extremely important to stop all DMA operations BEFORE attempting
+to deallocate DMA control data. Failure to do so can result in memory
+corruption, hangs, and on some chip-sets a hard crash.
+
+Stopping DMA after stopping the IRQs can avoid races where the
+IRQ handler might restart DMA engines.
+
+While this step sounds obvious and trivial, several "mature" drivers
+didn't get this step right in the past.
+
+
+Release DMA buffers
+-------------------
+Once DMA is stopped, clean up streaming DMA first.
+I.e. unmap data buffers and return buffers to "upstream"
+owners if there is one.
+
+Then clean up "consistent" buffers which contain the control data.
+
+See Documentation/DMA-API.txt for details on unmapping interfaces.
+
+
+Unregister from other subsystems
+--------------------------------
+Most low level PCI device drivers support some other subsystem
+like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
+driver isn't losing resources from that other subsystem.
+If this happens, typically the symptom is an Oops (panic) when
+the subsystem attempts to call into a driver that has been unloaded.
+
+
+Disable Device from responding to MMIO/IO Port addresses
+--------------------------------------------------------
+io_unmap() MMIO or IO Port resources and then call pci_disable_device().
+This is the symmetric opposite of pci_enable_device().
+Do not access device registers after calling pci_disable_device().
+
+
+Release MMIO/IO Port Resource(s)
+--------------------------------
+Call pci_release_region() to mark the MMIO or IO Port range as available.
+Failure to do so usually results in the inability to reload the driver.
+
+
+How to access PCI config space
+==============================
+
+You can use `pci_(read|write)_config_(byte|word|dword)` to access the config
+space of a device represented by `struct pci_dev *`. All these functions return
+0 when successful or an error code (`PCIBIOS_...`) which can be translated to a
+text string by pcibios_strerror. Most drivers expect that accesses to valid PCI
+devices don't fail.
+
+If you don't have a struct pci_dev available, you can call
+`pci_bus_(read|write)_config_(byte|word|dword)` to access a given device
+and function on that bus.
+
+If you access fields in the standard portion of the config header, please
+use symbolic names of locations and bits declared in <linux/pci.h>.
+
+If you need to access Extended PCI Capability registers, just call
+pci_find_capability() for the particular capability and it will find the
+corresponding register block for you.
+
+
+Other interesting functions
+===========================
+
+=============================	================================================
+pci_get_domain_bus_and_slot()	Find pci_dev corresponding to given domain,
+				bus and slot and number. If the device is
+				found, its reference count is increased.
+pci_set_power_state()		Set PCI Power Management state (0=D0 ... 3=D3)
+pci_find_capability()		Find specified capability in device's capability
+				list.
+pci_resource_start()		Returns bus start address for a given PCI region
+pci_resource_end()		Returns bus end address for a given PCI region
+pci_resource_len()		Returns the byte length of a PCI region
+pci_set_drvdata()		Set private driver data pointer for a pci_dev
+pci_get_drvdata()		Return private driver data pointer for a pci_dev
+pci_set_mwi()			Enable Memory-Write-Invalidate transactions.
+pci_clear_mwi()			Disable Memory-Write-Invalidate transactions.
+=============================	================================================
+
+
+Miscellaneous hints
+===================
+
+When displaying PCI device names to the user (for example when a driver wants
+to tell the user what card has it found), please use pci_name(pci_dev).
+
+Always refer to the PCI devices by a pointer to the pci_dev structure.
+All PCI layer functions use this identification and it's the only
+reasonable one. Don't use bus/slot/function numbers except for very
+special purposes -- on systems with multiple primary buses their semantics
+can be pretty complex.
+
+Don't try to turn on Fast Back to Back writes in your driver.  All devices
+on the bus need to be capable of doing it, so this is something which needs
+to be handled by platform and generic code, not individual drivers.
+
+
+Vendor and device identifications
+=================================
+
+Do not add new device or vendor IDs to include/linux/pci_ids.h unless they
+are shared across multiple drivers.  You can add private definitions in
+your driver if they're helpful, or just use plain hex constants.
+
+The device IDs are arbitrary hex numbers (vendor controlled) and normally used
+only in a single location, the pci_device_id table.
+
+Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/.
+There are mirrors of the pci.ids file at http://pciids.sourceforge.net/
+and https://github.com/pciutils/pciids.
+
+
+Obsolete functions
+==================
+
+There are several functions which you might come across when trying to
+port an old driver to the new PCI interface.  They are no longer present
+in the kernel as they aren't compatible with hotplug or PCI domains or
+having sane locking.
+
+=================	===========================================
+pci_find_device()	Superseded by pci_get_device()
+pci_find_subsys()	Superseded by pci_get_subsys()
+pci_find_slot()		Superseded by pci_get_domain_bus_and_slot()
+pci_get_slot()		Superseded by pci_get_domain_bus_and_slot()
+=================	===========================================
+
+The alternative is the traditional PCI device driver that walks PCI
+device lists. This is still possible but discouraged.
+
+
+MMIO Space and "Write Posting"
+==============================
+
+Converting a driver from using I/O Port space to using MMIO space
+often requires some additional changes. Specifically, "write posting"
+needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2)
+already do this. I/O Port space guarantees write transactions reach the PCI
+device before the CPU can continue. Writes to MMIO space allow the CPU
+to continue before the transaction reaches the PCI device. HW weenies
+call this "Write Posting" because the write completion is "posted" to
+the CPU before the transaction has reached its destination.
+
+Thus, timing sensitive code should add readl() where the CPU is
+expected to wait before doing other work.  The classic "bit banging"
+sequence works fine for I/O Port space::
+
+       for (i = 8; --i; val >>= 1) {
+               outb(val & 1, ioport_reg);      /* write bit */
+               udelay(10);
+       }
+
+The same sequence for MMIO space should be::
+
+       for (i = 8; --i; val >>= 1) {
+               writeb(val & 1, mmio_reg);      /* write bit */
+               readb(safe_mmio_reg);           /* flush posted write */
+               udelay(10);
+       }
+
+It is important that "safe_mmio_reg" not have any side effects that
+interferes with the correct operation of the device.
+
+Another case to watch out for is when resetting a PCI device. Use PCI
+Configuration space reads to flush the writel(). This will gracefully
+handle the PCI master abort on all platforms if the PCI device is
+expected to not respond to a readl().  Most x86 platforms will allow
+MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage
+(e.g. ~0). But many RISC platforms will crash (a.k.a."Hard Fail").
diff --git a/Documentation/PCI/pci.txt b/Documentation/PCI/pci.txt
deleted file mode 100644
index badb26ac33dc..000000000000
--- a/Documentation/PCI/pci.txt
+++ /dev/null
@@ -1,636 +0,0 @@
-
-			How To Write Linux PCI Drivers
-
-		by Martin Mares <mj@ucw.cz> on 07-Feb-2000
-	updated by Grant Grundler <grundler@parisc-linux.org> on 23-Dec-2006
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The world of PCI is vast and full of (mostly unpleasant) surprises.
-Since each CPU architecture implements different chip-sets and PCI devices
-have different requirements (erm, "features"), the result is the PCI support
-in the Linux kernel is not as trivial as one would wish. This short paper
-tries to introduce all potential driver authors to Linux APIs for
-PCI device drivers.
-
-A more complete resource is the third edition of "Linux Device Drivers"
-by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman.
-LDD3 is available for free (under Creative Commons License) from:
-
-	http://lwn.net/Kernel/LDD3/
-
-However, keep in mind that all documents are subject to "bit rot".
-Refer to the source code if things are not working as described here.
-
-Please send questions/comments/patches about Linux PCI API to the
-"Linux PCI" <linux-pci@atrey.karlin.mff.cuni.cz> mailing list.
-
-
-
-0. Structure of PCI drivers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-PCI drivers "discover" PCI devices in a system via pci_register_driver().
-Actually, it's the other way around. When the PCI generic code discovers
-a new device, the driver with a matching "description" will be notified.
-Details on this below.
-
-pci_register_driver() leaves most of the probing for devices to
-the PCI layer and supports online insertion/removal of devices [thus
-supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver].
-pci_register_driver() call requires passing in a table of function
-pointers and thus dictates the high level structure of a driver.
-
-Once the driver knows about a PCI device and takes ownership, the
-driver generally needs to perform the following initialization:
-
-	Enable the device
-	Request MMIO/IOP resources
-	Set the DMA mask size (for both coherent and streaming DMA)
-	Allocate and initialize shared control data (pci_allocate_coherent())
-	Access device configuration space (if needed)
-	Register IRQ handler (request_irq())
-	Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
-	Enable DMA/processing engines
-
-When done using the device, and perhaps the module needs to be unloaded,
-the driver needs to take the follow steps:
-	Disable the device from generating IRQs
-	Release the IRQ (free_irq())
-	Stop all DMA activity
-	Release DMA buffers (both streaming and coherent)
-	Unregister from other subsystems (e.g. scsi or netdev)
-	Release MMIO/IOP resources
-	Disable the device
-
-Most of these topics are covered in the following sections.
-For the rest look at LDD3 or <linux/pci.h> .
-
-If the PCI subsystem is not configured (CONFIG_PCI is not set), most of
-the PCI functions described below are defined as inline functions either
-completely empty or just returning an appropriate error codes to avoid
-lots of ifdefs in the drivers.
-
-
-
-1. pci_register_driver() call
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-PCI device drivers call pci_register_driver() during their
-initialization with a pointer to a structure describing the driver
-(struct pci_driver):
-
-	field name	Description
-	----------	------------------------------------------------------
-	id_table	Pointer to table of device ID's the driver is
-			interested in.  Most drivers should export this
-			table using MODULE_DEVICE_TABLE(pci,...).
-
-	probe		This probing function gets called (during execution
-			of pci_register_driver() for already existing
-			devices or later if a new device gets inserted) for
-			all PCI devices which match the ID table and are not
-			"owned" by the other drivers yet. This function gets
-			passed a "struct pci_dev *" for each device whose
-			entry in the ID table matches the device. The probe
-			function returns zero when the driver chooses to
-			take "ownership" of the device or an error code
-			(negative number) otherwise.
-			The probe function always gets called from process
-			context, so it can sleep.
-
-	remove		The remove() function gets called whenever a device
-			being handled by this driver is removed (either during
-			deregistration of the driver or when it's manually
-			pulled out of a hot-pluggable slot).
-			The remove function always gets called from process
-			context, so it can sleep.
-
-	suspend		Put device into low power state.
-	suspend_late	Put device into low power state.
-
-	resume_early	Wake device from low power state.
-	resume		Wake device from low power state.
-
-		(Please see Documentation/power/pci.txt for descriptions
-		of PCI Power Management and the related functions.)
-
-	shutdown	Hook into reboot_notifier_list (kernel/sys.c).
-			Intended to stop any idling DMA operations.
-			Useful for enabling wake-on-lan (NIC) or changing
-			the power state of a device before reboot.
-			e.g. drivers/net/e100.c.
-
-	err_handler	See Documentation/PCI/pci-error-recovery.txt
-
-
-The ID table is an array of struct pci_device_id entries ending with an
-all-zero entry.  Definitions with static const are generally preferred.
-
-Each entry consists of:
-
-	vendor,device	Vendor and device ID to match (or PCI_ANY_ID)
-
-	subvendor,	Subsystem vendor and device ID to match (or PCI_ANY_ID)
-	subdevice,
-
-	class		Device class, subclass, and "interface" to match.
-			See Appendix D of the PCI Local Bus Spec or
-			include/linux/pci_ids.h for a full list of classes.
-			Most drivers do not need to specify class/class_mask
-			as vendor/device is normally sufficient.
-
-	class_mask	limit which sub-fields of the class field are compared.
-			See drivers/scsi/sym53c8xx_2/ for example of usage.
-
-	driver_data	Data private to the driver.
-			Most drivers don't need to use driver_data field.
-			Best practice is to use driver_data as an index
-			into a static list of equivalent device types,
-			instead of using it as a pointer.
-
-
-Most drivers only need PCI_DEVICE() or PCI_DEVICE_CLASS() to set up
-a pci_device_id table.
-
-New PCI IDs may be added to a device driver pci_ids table at runtime
-as shown below:
-
-echo "vendor device subvendor subdevice class class_mask driver_data" > \
-/sys/bus/pci/drivers/{driver}/new_id
-
-All fields are passed in as hexadecimal values (no leading 0x).
-The vendor and device fields are mandatory, the others are optional. Users
-need pass only as many optional fields as necessary:
-	o subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
-	o class and classmask fields default to 0
-	o driver_data defaults to 0UL.
-
-Note that driver_data must match the value used by any of the pci_device_id
-entries defined in the driver. This makes the driver_data field mandatory
-if all the pci_device_id entries have a non-zero driver_data value.
-
-Once added, the driver probe routine will be invoked for any unclaimed
-PCI devices listed in its (newly updated) pci_ids list.
-
-When the driver exits, it just calls pci_unregister_driver() and the PCI layer
-automatically calls the remove hook for all devices handled by the driver.
-
-
-1.1 "Attributes" for driver functions/data
-
-Please mark the initialization and cleanup functions where appropriate
-(the corresponding macros are defined in <linux/init.h>):
-
-	__init		Initialization code. Thrown away after the driver
-			initializes.
-	__exit		Exit code. Ignored for non-modular drivers.
-
-Tips on when/where to use the above attributes:
-	o The module_init()/module_exit() functions (and all
-	  initialization functions called _only_ from these)
-	  should be marked __init/__exit.
-
-	o Do not mark the struct pci_driver.
-
-	o Do NOT mark a function if you are not sure which mark to use.
-	  Better to not mark the function than mark the function wrong.
-
-
-
-2. How to find PCI devices manually
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-PCI drivers should have a really good reason for not using the
-pci_register_driver() interface to search for PCI devices.
-The main reason PCI devices are controlled by multiple drivers
-is because one PCI device implements several different HW services.
-E.g. combined serial/parallel port/floppy controller.
-
-A manual search may be performed using the following constructs:
-
-Searching by vendor and device ID:
-
-	struct pci_dev *dev = NULL;
-	while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev))
-		configure_device(dev);
-
-Searching by class ID (iterate in a similar way):
-
-	pci_get_class(CLASS_ID, dev)
-
-Searching by both vendor/device and subsystem vendor/device ID:
-
-	pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev).
-
-You can use the constant PCI_ANY_ID as a wildcard replacement for
-VENDOR_ID or DEVICE_ID.  This allows searching for any device from a
-specific vendor, for example.
-
-These functions are hotplug-safe. They increment the reference count on
-the pci_dev that they return. You must eventually (possibly at module unload)
-decrement the reference count on these devices by calling pci_dev_put().
-
-
-
-3. Device Initialization Steps
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-As noted in the introduction, most PCI drivers need the following steps
-for device initialization:
-
-	Enable the device
-	Request MMIO/IOP resources
-	Set the DMA mask size (for both coherent and streaming DMA)
-	Allocate and initialize shared control data (pci_allocate_coherent())
-	Access device configuration space (if needed)
-	Register IRQ handler (request_irq())
-	Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
-	Enable DMA/processing engines.
-
-The driver can access PCI config space registers at any time.
-(Well, almost. When running BIST, config space can go away...but
-that will just result in a PCI Bus Master Abort and config reads
-will return garbage).
-
-
-3.1 Enable the PCI device
-~~~~~~~~~~~~~~~~~~~~~~~~~
-Before touching any device registers, the driver needs to enable
-the PCI device by calling pci_enable_device(). This will:
-	o wake up the device if it was in suspended state,
-	o allocate I/O and memory regions of the device (if BIOS did not),
-	o allocate an IRQ (if BIOS did not).
-
-NOTE: pci_enable_device() can fail! Check the return value.
-
-[ OS BUG: we don't check resource allocations before enabling those
-  resources. The sequence would make more sense if we called
-  pci_request_resources() before calling pci_enable_device().
-  Currently, the device drivers can't detect the bug when when two
-  devices have been allocated the same range. This is not a common
-  problem and unlikely to get fixed soon.
-
-  This has been discussed before but not changed as of 2.6.19:
-	http://lkml.org/lkml/2006/3/2/194
-]
-
-pci_set_master() will enable DMA by setting the bus master bit
-in the PCI_COMMAND register. It also fixes the latency timer value if
-it's set to something bogus by the BIOS.  pci_clear_master() will
-disable DMA by clearing the bus master bit.
-
-If the PCI device can use the PCI Memory-Write-Invalidate transaction,
-call pci_set_mwi().  This enables the PCI_COMMAND bit for Mem-Wr-Inval
-and also ensures that the cache line size register is set correctly.
-Check the return value of pci_set_mwi() as not all architectures
-or chip-sets may support Memory-Write-Invalidate.  Alternatively,
-if Mem-Wr-Inval would be nice to have but is not required, call
-pci_try_set_mwi() to have the system do its best effort at enabling
-Mem-Wr-Inval.
-
-
-3.2 Request MMIO/IOP resources
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Memory (MMIO), and I/O port addresses should NOT be read directly
-from the PCI device config space. Use the values in the pci_dev structure
-as the PCI "bus address" might have been remapped to a "host physical"
-address by the arch/chip-set specific kernel support.
-
-See Documentation/io-mapping.txt for how to access device registers
-or device memory.
-
-The device driver needs to call pci_request_region() to verify
-no other device is already using the same address resource.
-Conversely, drivers should call pci_release_region() AFTER
-calling pci_disable_device().
-The idea is to prevent two devices colliding on the same address range.
-
-[ See OS BUG comment above. Currently (2.6.19), The driver can only
-  determine MMIO and IO Port resource availability _after_ calling
-  pci_enable_device(). ]
-
-Generic flavors of pci_request_region() are request_mem_region()
-(for MMIO ranges) and request_region() (for IO Port ranges).
-Use these for address resources that are not described by "normal" PCI
-BARs.
-
-Also see pci_request_selected_regions() below.
-
-
-3.3 Set the DMA mask size
-~~~~~~~~~~~~~~~~~~~~~~~~~
-[ If anything below doesn't make sense, please refer to
-  Documentation/DMA-API.txt. This section is just a reminder that
-  drivers need to indicate DMA capabilities of the device and is not
-  an authoritative source for DMA interfaces. ]
-
-While all drivers should explicitly indicate the DMA capability
-(e.g. 32 or 64 bit) of the PCI bus master, devices with more than
-32-bit bus master capability for streaming data need the driver
-to "register" this capability by calling pci_set_dma_mask() with
-appropriate parameters.  In general this allows more efficient DMA
-on systems where System RAM exists above 4G _physical_ address.
-
-Drivers for all PCI-X and PCIe compliant devices must call
-pci_set_dma_mask() as they are 64-bit DMA devices.
-
-Similarly, drivers must also "register" this capability if the device
-can directly address "consistent memory" in System RAM above 4G physical
-address by calling pci_set_consistent_dma_mask().
-Again, this includes drivers for all PCI-X and PCIe compliant devices.
-Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
-64-bit DMA capable for payload ("streaming") data but not control
-("consistent") data.
-
-
-3.4 Setup shared control data
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
-memory.  See Documentation/DMA-API.txt for a full description of
-the DMA APIs. This section is just a reminder that it needs to be done
-before enabling DMA on the device.
-
-
-3.5 Initialize device registers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Some drivers will need specific "capability" fields programmed
-or other "vendor specific" register initialized or reset.
-E.g. clearing pending interrupts.
-
-
-3.6 Register IRQ handler
-~~~~~~~~~~~~~~~~~~~~~~~~
-While calling request_irq() is the last step described here,
-this is often just another intermediate step to initialize a device.
-This step can often be deferred until the device is opened for use.
-
-All interrupt handlers for IRQ lines should be registered with IRQF_SHARED
-and use the devid to map IRQs to devices (remember that all PCI IRQ lines
-can be shared).
-
-request_irq() will associate an interrupt handler and device handle
-with an interrupt number. Historically interrupt numbers represent
-IRQ lines which run from the PCI device to the Interrupt controller.
-With MSI and MSI-X (more below) the interrupt number is a CPU "vector".
-
-request_irq() also enables the interrupt. Make sure the device is
-quiesced and does not have any interrupts pending before registering
-the interrupt handler.
-
-MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts"
-which deliver interrupts to the CPU via a DMA write to a Local APIC.
-The fundamental difference between MSI and MSI-X is how multiple
-"vectors" get allocated. MSI requires contiguous blocks of vectors
-while MSI-X can allocate several individual ones.
-
-MSI capability can be enabled by calling pci_alloc_irq_vectors() with the
-PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This
-causes the PCI support to program CPU vector data into the PCI device
-capability registers. Many architectures, chip-sets, or BIOSes do NOT
-support MSI or MSI-X and a call to pci_alloc_irq_vectors with just
-the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always
-specify PCI_IRQ_LEGACY as well.
-
-Drivers that have different interrupt handlers for MSI/MSI-X and
-legacy INTx should chose the right one based on the msi_enabled
-and msix_enabled flags in the pci_dev structure after calling
-pci_alloc_irq_vectors.
-
-There are (at least) two really good reasons for using MSI:
-1) MSI is an exclusive interrupt vector by definition.
-   This means the interrupt handler doesn't have to verify
-   its device caused the interrupt.
-
-2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed
-   to be visible to the host CPU(s) when the MSI is delivered. This
-   is important for both data coherency and avoiding stale control data.
-   This guarantee allows the driver to omit MMIO reads to flush
-   the DMA stream.
-
-See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples
-of MSI/MSI-X usage.
-
-
-
-4. PCI device shutdown
-~~~~~~~~~~~~~~~~~~~~~~~
-
-When a PCI device driver is being unloaded, most of the following
-steps need to be performed:
-
-	Disable the device from generating IRQs
-	Release the IRQ (free_irq())
-	Stop all DMA activity
-	Release DMA buffers (both streaming and consistent)
-	Unregister from other subsystems (e.g. scsi or netdev)
-	Disable device from responding to MMIO/IO Port addresses
-	Release MMIO/IO Port resource(s)
-
-
-4.1 Stop IRQs on the device
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-How to do this is chip/device specific. If it's not done, it opens
-the possibility of a "screaming interrupt" if (and only if)
-the IRQ is shared with another device.
-
-When the shared IRQ handler is "unhooked", the remaining devices
-using the same IRQ line will still need the IRQ enabled. Thus if the
-"unhooked" device asserts IRQ line, the system will respond assuming
-it was one of the remaining devices asserted the IRQ line. Since none
-of the other devices will handle the IRQ, the system will "hang" until
-it decides the IRQ isn't going to get handled and masks the IRQ (100,000
-iterations later). Once the shared IRQ is masked, the remaining devices
-will stop functioning properly. Not a nice situation.
-
-This is another reason to use MSI or MSI-X if it's available.
-MSI and MSI-X are defined to be exclusive interrupts and thus
-are not susceptible to the "screaming interrupt" problem.
-
-
-4.2 Release the IRQ
-~~~~~~~~~~~~~~~~~~~
-Once the device is quiesced (no more IRQs), one can call free_irq().
-This function will return control once any pending IRQs are handled,
-"unhook" the drivers IRQ handler from that IRQ, and finally release
-the IRQ if no one else is using it.
-
-
-4.3 Stop all DMA activity
-~~~~~~~~~~~~~~~~~~~~~~~~~
-It's extremely important to stop all DMA operations BEFORE attempting
-to deallocate DMA control data. Failure to do so can result in memory
-corruption, hangs, and on some chip-sets a hard crash.
-
-Stopping DMA after stopping the IRQs can avoid races where the
-IRQ handler might restart DMA engines.
-
-While this step sounds obvious and trivial, several "mature" drivers
-didn't get this step right in the past.
-
-
-4.4 Release DMA buffers
-~~~~~~~~~~~~~~~~~~~~~~~
-Once DMA is stopped, clean up streaming DMA first.
-I.e. unmap data buffers and return buffers to "upstream"
-owners if there is one.
-
-Then clean up "consistent" buffers which contain the control data.
-
-See Documentation/DMA-API.txt for details on unmapping interfaces.
-
-
-4.5 Unregister from other subsystems
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Most low level PCI device drivers support some other subsystem
-like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
-driver isn't losing resources from that other subsystem.
-If this happens, typically the symptom is an Oops (panic) when
-the subsystem attempts to call into a driver that has been unloaded.
-
-
-4.6 Disable Device from responding to MMIO/IO Port addresses
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-io_unmap() MMIO or IO Port resources and then call pci_disable_device().
-This is the symmetric opposite of pci_enable_device().
-Do not access device registers after calling pci_disable_device().
-
-
-4.7 Release MMIO/IO Port Resource(s)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Call pci_release_region() to mark the MMIO or IO Port range as available.
-Failure to do so usually results in the inability to reload the driver.
-
-
-
-5. How to access PCI config space
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-You can use pci_(read|write)_config_(byte|word|dword) to access the config
-space of a device represented by struct pci_dev *. All these functions return 0
-when successful or an error code (PCIBIOS_...) which can be translated to a text
-string by pcibios_strerror. Most drivers expect that accesses to valid PCI
-devices don't fail.
-
-If you don't have a struct pci_dev available, you can call
-pci_bus_(read|write)_config_(byte|word|dword) to access a given device
-and function on that bus.
-
-If you access fields in the standard portion of the config header, please
-use symbolic names of locations and bits declared in <linux/pci.h>.
-
-If you need to access Extended PCI Capability registers, just call
-pci_find_capability() for the particular capability and it will find the
-corresponding register block for you.
-
-
-
-6. Other interesting functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-pci_get_domain_bus_and_slot()	Find pci_dev corresponding to given domain,
-				bus and slot and number. If the device is
-				found, its reference count is increased.
-pci_set_power_state()		Set PCI Power Management state (0=D0 ... 3=D3)
-pci_find_capability()		Find specified capability in device's capability
-				list.
-pci_resource_start()		Returns bus start address for a given PCI region
-pci_resource_end()		Returns bus end address for a given PCI region
-pci_resource_len()		Returns the byte length of a PCI region
-pci_set_drvdata()		Set private driver data pointer for a pci_dev
-pci_get_drvdata()		Return private driver data pointer for a pci_dev
-pci_set_mwi()			Enable Memory-Write-Invalidate transactions.
-pci_clear_mwi()			Disable Memory-Write-Invalidate transactions.
-
-
-
-7. Miscellaneous hints
-~~~~~~~~~~~~~~~~~~~~~~
-
-When displaying PCI device names to the user (for example when a driver wants
-to tell the user what card has it found), please use pci_name(pci_dev).
-
-Always refer to the PCI devices by a pointer to the pci_dev structure.
-All PCI layer functions use this identification and it's the only
-reasonable one. Don't use bus/slot/function numbers except for very
-special purposes -- on systems with multiple primary buses their semantics
-can be pretty complex.
-
-Don't try to turn on Fast Back to Back writes in your driver.  All devices
-on the bus need to be capable of doing it, so this is something which needs
-to be handled by platform and generic code, not individual drivers.
-
-
-
-8. Vendor and device identifications
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Do not add new device or vendor IDs to include/linux/pci_ids.h unless they
-are shared across multiple drivers.  You can add private definitions in
-your driver if they're helpful, or just use plain hex constants.
-
-The device IDs are arbitrary hex numbers (vendor controlled) and normally used
-only in a single location, the pci_device_id table.
-
-Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/.
-There are mirrors of the pci.ids file at http://pciids.sourceforge.net/
-and https://github.com/pciutils/pciids.
-
-
-
-9. Obsolete functions
-~~~~~~~~~~~~~~~~~~~~~
-
-There are several functions which you might come across when trying to
-port an old driver to the new PCI interface.  They are no longer present
-in the kernel as they aren't compatible with hotplug or PCI domains or
-having sane locking.
-
-pci_find_device()	Superseded by pci_get_device()
-pci_find_subsys()	Superseded by pci_get_subsys()
-pci_find_slot()		Superseded by pci_get_domain_bus_and_slot()
-pci_get_slot()		Superseded by pci_get_domain_bus_and_slot()
-
-
-The alternative is the traditional PCI device driver that walks PCI
-device lists. This is still possible but discouraged.
-
-
-
-10. MMIO Space and "Write Posting"
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Converting a driver from using I/O Port space to using MMIO space
-often requires some additional changes. Specifically, "write posting"
-needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2)
-already do this. I/O Port space guarantees write transactions reach the PCI
-device before the CPU can continue. Writes to MMIO space allow the CPU
-to continue before the transaction reaches the PCI device. HW weenies
-call this "Write Posting" because the write completion is "posted" to
-the CPU before the transaction has reached its destination.
-
-Thus, timing sensitive code should add readl() where the CPU is
-expected to wait before doing other work.  The classic "bit banging"
-sequence works fine for I/O Port space:
-
-       for (i = 8; --i; val >>= 1) {
-               outb(val & 1, ioport_reg);      /* write bit */
-               udelay(10);
-       }
-
-The same sequence for MMIO space should be:
-
-       for (i = 8; --i; val >>= 1) {
-               writeb(val & 1, mmio_reg);      /* write bit */
-               readb(safe_mmio_reg);           /* flush posted write */
-               udelay(10);
-       }
-
-It is important that "safe_mmio_reg" not have any side effects that
-interferes with the correct operation of the device.
-
-Another case to watch out for is when resetting a PCI device. Use PCI
-Configuration space reads to flush the writel(). This will gracefully
-handle the PCI master abort on all platforms if the PCI device is
-expected to not respond to a readl().  Most x86 platforms will allow
-MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage
-(e.g. ~0). But many RISC platforms will crash (a.k.a."Hard Fail").
-
diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst
new file mode 100644
index 000000000000..18bdefaafd1a
--- /dev/null
+++ b/Documentation/PCI/pcieaer-howto.rst
@@ -0,0 +1,311 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+===========================================================
+The PCI Express Advanced Error Reporting Driver Guide HOWTO
+===========================================================
+
+:Authors: - T. Long Nguyen <tom.l.nguyen@intel.com>
+          - Yanmin Zhang <yanmin.zhang@intel.com>
+
+:Copyright: |copy| 2006 Intel Corporation
+
+Overview
+===========
+
+About this guide
+----------------
+
+This guide describes the basics of the PCI Express Advanced Error
+Reporting (AER) driver and provides information on how to use it, as
+well as how to enable the drivers of endpoint devices to conform with
+PCI Express AER driver.
+
+
+What is the PCI Express AER Driver?
+-----------------------------------
+
+PCI Express error signaling can occur on the PCI Express link itself
+or on behalf of transactions initiated on the link. PCI Express
+defines two error reporting paradigms: the baseline capability and
+the Advanced Error Reporting capability. The baseline capability is
+required of all PCI Express components providing a minimum defined
+set of error reporting requirements. Advanced Error Reporting
+capability is implemented with a PCI Express advanced error reporting
+extended capability structure providing more robust error reporting.
+
+The PCI Express AER driver provides the infrastructure to support PCI
+Express Advanced Error Reporting capability. The PCI Express AER
+driver provides three basic functions:
+
+  - Gathers the comprehensive error information if errors occurred.
+  - Reports error to the users.
+  - Performs error recovery actions.
+
+AER driver only attaches root ports which support PCI-Express AER
+capability.
+
+
+User Guide
+==========
+
+Include the PCI Express AER Root Driver into the Linux Kernel
+-------------------------------------------------------------
+
+The PCI Express AER Root driver is a Root Port service driver attached
+to the PCI Express Port Bus driver. If a user wants to use it, the driver
+has to be compiled. Option CONFIG_PCIEAER supports this capability. It
+depends on CONFIG_PCIEPORTBUS, so pls. set CONFIG_PCIEPORTBUS=y and
+CONFIG_PCIEAER = y.
+
+Load PCI Express AER Root Driver
+--------------------------------
+
+Some systems have AER support in firmware. Enabling Linux AER support at
+the same time the firmware handles AER may result in unpredictable
+behavior. Therefore, Linux does not handle AER events unless the firmware
+grants AER control to the OS via the ACPI _OSC method. See the PCI FW 3.0
+Specification for details regarding _OSC usage.
+
+AER error output
+----------------
+
+When a PCIe AER error is captured, an error message will be output to
+console. If it's a correctable error, it is output as a warning.
+Otherwise, it is printed as an error. So users could choose different
+log level to filter out correctable error messages.
+
+Below shows an example::
+
+  0000:50:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Transaction Layer, id=0500(Requester ID)
+  0000:50:00.0:   device [8086:0329] error status/mask=00100000/00000000
+  0000:50:00.0:    [20] Unsupported Request    (First)
+  0000:50:00.0:   TLP Header: 04000001 00200a03 05010000 00050100
+
+In the example, 'Requester ID' means the ID of the device who sends
+the error message to root port. Pls. refer to pci express specs for
+other fields.
+
+AER Statistics / Counters
+-------------------------
+
+When PCIe AER errors are captured, the counters / statistics are also exposed
+in the form of sysfs attributes which are documented at
+Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
+
+Developer Guide
+===============
+
+To enable AER aware support requires a software driver to configure
+the AER capability structure within its device and to provide callbacks.
+
+To support AER better, developers need understand how AER does work
+firstly.
+
+PCI Express errors are classified into two types: correctable errors
+and uncorrectable errors. This classification is based on the impacts
+of those errors, which may result in degraded performance or function
+failure.
+
+Correctable errors pose no impacts on the functionality of the
+interface. The PCI Express protocol can recover without any software
+intervention or any loss of data. These errors are detected and
+corrected by hardware. Unlike correctable errors, uncorrectable
+errors impact functionality of the interface. Uncorrectable errors
+can cause a particular transaction or a particular PCI Express link
+to be unreliable. Depending on those error conditions, uncorrectable
+errors are further classified into non-fatal errors and fatal errors.
+Non-fatal errors cause the particular transaction to be unreliable,
+but the PCI Express link itself is fully functional. Fatal errors, on
+the other hand, cause the link to be unreliable.
+
+When AER is enabled, a PCI Express device will automatically send an
+error message to the PCIe root port above it when the device captures
+an error. The Root Port, upon receiving an error reporting message,
+internally processes and logs the error message in its PCI Express
+capability structure. Error information being logged includes storing
+the error reporting agent's requestor ID into the Error Source
+Identification Registers and setting the error bits of the Root Error
+Status Register accordingly. If AER error reporting is enabled in Root
+Error Command Register, the Root Port generates an interrupt if an
+error is detected.
+
+Note that the errors as described above are related to the PCI Express
+hierarchy and links. These errors do not include any device specific
+errors because device specific errors will still get sent directly to
+the device driver.
+
+Configure the AER capability structure
+--------------------------------------
+
+AER aware drivers of PCI Express component need change the device
+control registers to enable AER. They also could change AER registers,
+including mask and severity registers. Helper function
+pci_enable_pcie_error_reporting could be used to enable AER. See
+section 3.3.
+
+Provide callbacks
+-----------------
+
+callback reset_link to reset pci express link
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This callback is used to reset the pci express physical link when a
+fatal error happens. The root port aer service driver provides a
+default reset_link function, but different upstream ports might
+have different specifications to reset pci express link, so all
+upstream ports should provide their own reset_link functions.
+
+In struct pcie_port_service_driver, a new pointer, reset_link, is
+added.
+::
+
+	pci_ers_result_t (*reset_link) (struct pci_dev *dev);
+
+Section 3.2.2.2 provides more detailed info on when to call
+reset_link.
+
+PCI error-recovery callbacks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The PCI Express AER Root driver uses error callbacks to coordinate
+with downstream device drivers associated with a hierarchy in question
+when performing error recovery actions.
+
+Data struct pci_driver has a pointer, err_handler, to point to
+pci_error_handlers who consists of a couple of callback function
+pointers. AER driver follows the rules defined in
+pci-error-recovery.txt except pci express specific parts (e.g.
+reset_link). Pls. refer to pci-error-recovery.txt for detailed
+definitions of the callbacks.
+
+Below sections specify when to call the error callback functions.
+
+Correctable errors
+~~~~~~~~~~~~~~~~~~
+
+Correctable errors pose no impacts on the functionality of
+the interface. The PCI Express protocol can recover without any
+software intervention or any loss of data. These errors do not
+require any recovery actions. The AER driver clears the device's
+correctable error status register accordingly and logs these errors.
+
+Non-correctable (non-fatal and fatal) errors
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If an error message indicates a non-fatal error, performing link reset
+at upstream is not required. The AER driver calls error_detected(dev,
+pci_channel_io_normal) to all drivers associated within a hierarchy in
+question. for example::
+
+  EndPoint<==>DownstreamPort B<==>UpstreamPort A<==>RootPort
+
+If Upstream port A captures an AER error, the hierarchy consists of
+Downstream port B and EndPoint.
+
+A driver may return PCI_ERS_RESULT_CAN_RECOVER,
+PCI_ERS_RESULT_DISCONNECT, or PCI_ERS_RESULT_NEED_RESET, depending on
+whether it can recover or the AER driver calls mmio_enabled as next.
+
+If an error message indicates a fatal error, kernel will broadcast
+error_detected(dev, pci_channel_io_frozen) to all drivers within
+a hierarchy in question. Then, performing link reset at upstream is
+necessary. As different kinds of devices might use different approaches
+to reset link, AER port service driver is required to provide the
+function to reset link. Firstly, kernel looks for if the upstream
+component has an aer driver. If it has, kernel uses the reset_link
+callback of the aer driver. If the upstream component has no aer driver
+and the port is downstream port, we will perform a hot reset as the
+default by setting the Secondary Bus Reset bit of the Bridge Control
+register associated with the downstream port. As for upstream ports,
+they should provide their own aer service drivers with reset_link
+function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and
+reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes
+to mmio_enabled.
+
+helper functions
+----------------
+::
+
+  int pci_enable_pcie_error_reporting(struct pci_dev *dev);
+
+pci_enable_pcie_error_reporting enables the device to send error
+messages to root port when an error is detected. Note that devices
+don't enable the error reporting by default, so device drivers need
+call this function to enable it.
+
+::
+
+  int pci_disable_pcie_error_reporting(struct pci_dev *dev);
+
+pci_disable_pcie_error_reporting disables the device to send error
+messages to root port when an error is detected.
+
+::
+
+  int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);`
+
+pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable
+error status register.
+
+Frequent Asked Questions
+------------------------
+
+Q:
+  What happens if a PCI Express device driver does not provide an
+  error recovery handler (pci_driver->err_handler is equal to NULL)?
+
+A:
+  The devices attached with the driver won't be recovered. If the
+  error is fatal, kernel will print out warning messages. Please refer
+  to section 3 for more information.
+
+Q:
+  What happens if an upstream port service driver does not provide
+  callback reset_link?
+
+A:
+  Fatal error recovery will fail if the errors are reported by the
+  upstream ports who are attached by the service driver.
+
+Q:
+  How does this infrastructure deal with driver that is not PCI
+  Express aware?
+
+A:
+  This infrastructure calls the error callback functions of the
+  driver when an error happens. But if the driver is not aware of
+  PCI Express, the device might not report its own errors to root
+  port.
+
+Q:
+  What modifications will that driver need to make it compatible
+  with the PCI Express AER Root driver?
+
+A:
+  It could call the helper functions to enable AER in devices and
+  cleanup uncorrectable status register. Pls. refer to section 3.3.
+
+
+Software error injection
+========================
+
+Debugging PCIe AER error recovery code is quite difficult because it
+is hard to trigger real hardware errors. Software based error
+injection can be used to fake various kinds of PCIe errors.
+
+First you should enable PCIe AER software error injection in kernel
+configuration, that is, following item should be in your .config.
+
+CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m
+
+After reboot with new kernel or insert the module, a device file named
+/dev/aer_inject should be created.
+
+Then, you need a user space tool named aer-inject, which can be gotten
+from:
+
+    https://git.kernel.org/cgit/linux/kernel/git/gong.chen/aer-inject.git/
+
+More information about aer-inject can be found in the document comes
+with its source code.
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt
deleted file mode 100644
index 48ce7903e3c6..000000000000
--- a/Documentation/PCI/pcieaer-howto.txt
+++ /dev/null
@@ -1,267 +0,0 @@
-   The PCI Express Advanced Error Reporting Driver Guide HOWTO
-		T. Long Nguyen	<tom.l.nguyen@intel.com>
-		Yanmin Zhang	<yanmin.zhang@intel.com>
-				07/29/2006
-
-
-1. Overview
-
-1.1 About this guide
-
-This guide describes the basics of the PCI Express Advanced Error
-Reporting (AER) driver and provides information on how to use it, as
-well as how to enable the drivers of endpoint devices to conform with
-PCI Express AER driver.
-
-1.2 Copyright (C) Intel Corporation 2006.
-
-1.3 What is the PCI Express AER Driver?
-
-PCI Express error signaling can occur on the PCI Express link itself
-or on behalf of transactions initiated on the link. PCI Express
-defines two error reporting paradigms: the baseline capability and
-the Advanced Error Reporting capability. The baseline capability is
-required of all PCI Express components providing a minimum defined
-set of error reporting requirements. Advanced Error Reporting
-capability is implemented with a PCI Express advanced error reporting
-extended capability structure providing more robust error reporting.
-
-The PCI Express AER driver provides the infrastructure to support PCI
-Express Advanced Error Reporting capability. The PCI Express AER
-driver provides three basic functions:
-
--	Gathers the comprehensive error information if errors occurred.
--	Reports error to the users.
--	Performs error recovery actions.
-
-AER driver only attaches root ports which support PCI-Express AER
-capability.
-
-
-2. User Guide
-
-2.1 Include the PCI Express AER Root Driver into the Linux Kernel
-
-The PCI Express AER Root driver is a Root Port service driver attached
-to the PCI Express Port Bus driver. If a user wants to use it, the driver
-has to be compiled. Option CONFIG_PCIEAER supports this capability. It
-depends on CONFIG_PCIEPORTBUS, so pls. set CONFIG_PCIEPORTBUS=y and
-CONFIG_PCIEAER = y.
-
-2.2 Load PCI Express AER Root Driver
-
-Some systems have AER support in firmware. Enabling Linux AER support at
-the same time the firmware handles AER may result in unpredictable
-behavior. Therefore, Linux does not handle AER events unless the firmware
-grants AER control to the OS via the ACPI _OSC method. See the PCI FW 3.0
-Specification for details regarding _OSC usage.
-
-2.3 AER error output
-
-When a PCIe AER error is captured, an error message will be output to
-console. If it's a correctable error, it is output as a warning.
-Otherwise, it is printed as an error. So users could choose different
-log level to filter out correctable error messages.
-
-Below shows an example:
-0000:50:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Transaction Layer, id=0500(Requester ID)
-0000:50:00.0:   device [8086:0329] error status/mask=00100000/00000000
-0000:50:00.0:    [20] Unsupported Request    (First)
-0000:50:00.0:   TLP Header: 04000001 00200a03 05010000 00050100
-
-In the example, 'Requester ID' means the ID of the device who sends
-the error message to root port. Pls. refer to pci express specs for
-other fields.
-
-2.4 AER Statistics / Counters
-
-When PCIe AER errors are captured, the counters / statistics are also exposed
-in the form of sysfs attributes which are documented at
-Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
-
-3. Developer Guide
-
-To enable AER aware support requires a software driver to configure
-the AER capability structure within its device and to provide callbacks.
-
-To support AER better, developers need understand how AER does work
-firstly.
-
-PCI Express errors are classified into two types: correctable errors
-and uncorrectable errors. This classification is based on the impacts
-of those errors, which may result in degraded performance or function
-failure.
-
-Correctable errors pose no impacts on the functionality of the
-interface. The PCI Express protocol can recover without any software
-intervention or any loss of data. These errors are detected and
-corrected by hardware. Unlike correctable errors, uncorrectable
-errors impact functionality of the interface. Uncorrectable errors
-can cause a particular transaction or a particular PCI Express link
-to be unreliable. Depending on those error conditions, uncorrectable
-errors are further classified into non-fatal errors and fatal errors.
-Non-fatal errors cause the particular transaction to be unreliable,
-but the PCI Express link itself is fully functional. Fatal errors, on
-the other hand, cause the link to be unreliable.
-
-When AER is enabled, a PCI Express device will automatically send an
-error message to the PCIe root port above it when the device captures
-an error. The Root Port, upon receiving an error reporting message,
-internally processes and logs the error message in its PCI Express
-capability structure. Error information being logged includes storing
-the error reporting agent's requestor ID into the Error Source
-Identification Registers and setting the error bits of the Root Error
-Status Register accordingly. If AER error reporting is enabled in Root
-Error Command Register, the Root Port generates an interrupt if an
-error is detected.
-
-Note that the errors as described above are related to the PCI Express
-hierarchy and links. These errors do not include any device specific
-errors because device specific errors will still get sent directly to
-the device driver.
-
-3.1 Configure the AER capability structure
-
-AER aware drivers of PCI Express component need change the device
-control registers to enable AER. They also could change AER registers,
-including mask and severity registers. Helper function
-pci_enable_pcie_error_reporting could be used to enable AER. See
-section 3.3.
-
-3.2. Provide callbacks
-
-3.2.1 callback reset_link to reset pci express link
-
-This callback is used to reset the pci express physical link when a
-fatal error happens. The root port aer service driver provides a
-default reset_link function, but different upstream ports might
-have different specifications to reset pci express link, so all
-upstream ports should provide their own reset_link functions.
-
-In struct pcie_port_service_driver, a new pointer, reset_link, is
-added.
-
-pci_ers_result_t (*reset_link) (struct pci_dev *dev);
-
-Section 3.2.2.2 provides more detailed info on when to call
-reset_link.
-
-3.2.2 PCI error-recovery callbacks
-
-The PCI Express AER Root driver uses error callbacks to coordinate
-with downstream device drivers associated with a hierarchy in question
-when performing error recovery actions.
-
-Data struct pci_driver has a pointer, err_handler, to point to
-pci_error_handlers who consists of a couple of callback function
-pointers. AER driver follows the rules defined in
-pci-error-recovery.txt except pci express specific parts (e.g.
-reset_link). Pls. refer to pci-error-recovery.txt for detailed
-definitions of the callbacks.
-
-Below sections specify when to call the error callback functions.
-
-3.2.2.1 Correctable errors
-
-Correctable errors pose no impacts on the functionality of
-the interface. The PCI Express protocol can recover without any
-software intervention or any loss of data. These errors do not
-require any recovery actions. The AER driver clears the device's
-correctable error status register accordingly and logs these errors.
-
-3.2.2.2 Non-correctable (non-fatal and fatal) errors
-
-If an error message indicates a non-fatal error, performing link reset
-at upstream is not required. The AER driver calls error_detected(dev,
-pci_channel_io_normal) to all drivers associated within a hierarchy in
-question. for example,
-EndPoint<==>DownstreamPort B<==>UpstreamPort A<==>RootPort.
-If Upstream port A captures an AER error, the hierarchy consists of
-Downstream port B and EndPoint.
-
-A driver may return PCI_ERS_RESULT_CAN_RECOVER,
-PCI_ERS_RESULT_DISCONNECT, or PCI_ERS_RESULT_NEED_RESET, depending on
-whether it can recover or the AER driver calls mmio_enabled as next.
-
-If an error message indicates a fatal error, kernel will broadcast
-error_detected(dev, pci_channel_io_frozen) to all drivers within
-a hierarchy in question. Then, performing link reset at upstream is
-necessary. As different kinds of devices might use different approaches
-to reset link, AER port service driver is required to provide the
-function to reset link. Firstly, kernel looks for if the upstream
-component has an aer driver. If it has, kernel uses the reset_link
-callback of the aer driver. If the upstream component has no aer driver
-and the port is downstream port, we will perform a hot reset as the
-default by setting the Secondary Bus Reset bit of the Bridge Control
-register associated with the downstream port. As for upstream ports,
-they should provide their own aer service drivers with reset_link
-function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and
-reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes
-to mmio_enabled.
-
-3.3 helper functions
-
-3.3.1 int pci_enable_pcie_error_reporting(struct pci_dev *dev);
-pci_enable_pcie_error_reporting enables the device to send error
-messages to root port when an error is detected. Note that devices
-don't enable the error reporting by default, so device drivers need
-call this function to enable it.
-
-3.3.2 int pci_disable_pcie_error_reporting(struct pci_dev *dev);
-pci_disable_pcie_error_reporting disables the device to send error
-messages to root port when an error is detected.
-
-3.3.3 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
-pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable
-error status register.
-
-3.4 Frequent Asked Questions
-
-Q: What happens if a PCI Express device driver does not provide an
-error recovery handler (pci_driver->err_handler is equal to NULL)?
-
-A: The devices attached with the driver won't be recovered. If the
-error is fatal, kernel will print out warning messages. Please refer
-to section 3 for more information.
-
-Q: What happens if an upstream port service driver does not provide
-callback reset_link?
-
-A: Fatal error recovery will fail if the errors are reported by the
-upstream ports who are attached by the service driver.
-
-Q: How does this infrastructure deal with driver that is not PCI
-Express aware?
-
-A: This infrastructure calls the error callback functions of the
-driver when an error happens. But if the driver is not aware of
-PCI Express, the device might not report its own errors to root
-port.
-
-Q: What modifications will that driver need to make it compatible
-with the PCI Express AER Root driver?
-
-A: It could call the helper functions to enable AER in devices and
-cleanup uncorrectable status register. Pls. refer to section 3.3.
-
-
-4. Software error injection
-
-Debugging PCIe AER error recovery code is quite difficult because it
-is hard to trigger real hardware errors. Software based error
-injection can be used to fake various kinds of PCIe errors.
-
-First you should enable PCIe AER software error injection in kernel
-configuration, that is, following item should be in your .config.
-
-CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m
-
-After reboot with new kernel or insert the module, a device file named
-/dev/aer_inject should be created.
-
-Then, you need a user space tool named aer-inject, which can be gotten
-from:
-    https://git.kernel.org/cgit/linux/kernel/git/gong.chen/aer-inject.git/
-
-More information about aer-inject can be found in the document comes
-with its source code.
diff --git a/Documentation/PCI/picebus-howto.rst b/Documentation/PCI/picebus-howto.rst
new file mode 100644
index 000000000000..f882ff62c51f
--- /dev/null
+++ b/Documentation/PCI/picebus-howto.rst
@@ -0,0 +1,220 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+===========================================
+The PCI Express Port Bus Driver Guide HOWTO
+===========================================
+
+:Author: Tom L Nguyen tom.l.nguyen@intel.com 11/03/2004
+:Copyright: |copy| 2004 Intel Corporation
+
+About this guide
+================
+
+This guide describes the basics of the PCI Express Port Bus driver
+and provides information on how to enable the service drivers to
+register/unregister with the PCI Express Port Bus Driver.
+
+
+What is the PCI Express Port Bus Driver
+=======================================
+
+A PCI Express Port is a logical PCI-PCI Bridge structure. There
+are two types of PCI Express Port: the Root Port and the Switch
+Port. The Root Port originates a PCI Express link from a PCI Express
+Root Complex and the Switch Port connects PCI Express links to
+internal logical PCI buses. The Switch Port, which has its secondary
+bus representing the switch's internal routing logic, is called the
+switch's Upstream Port. The switch's Downstream Port is bridging from
+switch's internal routing bus to a bus representing the downstream
+PCI Express link from the PCI Express Switch.
+
+A PCI Express Port can provide up to four distinct functions,
+referred to in this document as services, depending on its port type.
+PCI Express Port's services include native hotplug support (HP),
+power management event support (PME), advanced error reporting
+support (AER), and virtual channel support (VC). These services may
+be handled by a single complex driver or be individually distributed
+and handled by corresponding service drivers.
+
+Why use the PCI Express Port Bus Driver?
+========================================
+
+In existing Linux kernels, the Linux Device Driver Model allows a
+physical device to be handled by only a single driver. The PCI
+Express Port is a PCI-PCI Bridge device with multiple distinct
+services. To maintain a clean and simple solution each service
+may have its own software service driver. In this case several
+service drivers will compete for a single PCI-PCI Bridge device.
+For example, if the PCI Express Root Port native hotplug service
+driver is loaded first, it claims a PCI-PCI Bridge Root Port. The
+kernel therefore does not load other service drivers for that Root
+Port. In other words, it is impossible to have multiple service
+drivers load and run on a PCI-PCI Bridge device simultaneously
+using the current driver model.
+
+To enable multiple service drivers running simultaneously requires
+having a PCI Express Port Bus driver, which manages all populated
+PCI Express Ports and distributes all provided service requests
+to the corresponding service drivers as required. Some key
+advantages of using the PCI Express Port Bus driver are listed below:
+
+  - Allow multiple service drivers to run simultaneously on
+    a PCI-PCI Bridge Port device.
+
+  - Allow service drivers implemented in an independent
+    staged approach.
+
+  - Allow one service driver to run on multiple PCI-PCI Bridge
+    Port devices.
+
+  - Manage and distribute resources of a PCI-PCI Bridge Port
+    device to requested service drivers.
+
+Configuring the PCI Express Port Bus Driver vs. Service Drivers
+===============================================================
+
+Including the PCI Express Port Bus Driver Support into the Kernel
+-----------------------------------------------------------------
+
+Including the PCI Express Port Bus driver depends on whether the PCI
+Express support is included in the kernel config. The kernel will
+automatically include the PCI Express Port Bus driver as a kernel
+driver when the PCI Express support is enabled in the kernel.
+
+Enabling Service Driver Support
+-------------------------------
+
+PCI device drivers are implemented based on Linux Device Driver Model.
+All service drivers are PCI device drivers. As discussed above, it is
+impossible to load any service driver once the kernel has loaded the
+PCI Express Port Bus Driver. To meet the PCI Express Port Bus Driver
+Model requires some minimal changes on existing service drivers that
+imposes no impact on the functionality of existing service drivers.
+
+A service driver is required to use the two APIs shown below to
+register its service with the PCI Express Port Bus driver (see
+section 5.2.1 & 5.2.2). It is important that a service driver
+initializes the pcie_port_service_driver data structure, included in
+header file /include/linux/pcieport_if.h, before calling these APIs.
+Failure to do so will result an identity mismatch, which prevents
+the PCI Express Port Bus driver from loading a service driver.
+
+pcie_port_service_register
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+::
+
+  int pcie_port_service_register(struct pcie_port_service_driver *new)
+
+This API replaces the Linux Driver Model's pci_register_driver API. A
+service driver should always calls pcie_port_service_register at
+module init. Note that after service driver being loaded, calls
+such as pci_enable_device(dev) and pci_set_master(dev) are no longer
+necessary since these calls are executed by the PCI Port Bus driver.
+
+pcie_port_service_unregister
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+::
+
+  void pcie_port_service_unregister(struct pcie_port_service_driver *new)
+
+pcie_port_service_unregister replaces the Linux Driver Model's
+pci_unregister_driver. It's always called by service driver when a
+module exits.
+
+Sample Code
+~~~~~~~~~~~
+
+Below is sample service driver code to initialize the port service
+driver data structure.
+::
+
+  static struct pcie_port_service_id service_id[] = { {
+    .vendor = PCI_ANY_ID,
+    .device = PCI_ANY_ID,
+    .port_type = PCIE_RC_PORT,
+    .service_type = PCIE_PORT_SERVICE_AER,
+    }, { /* end: all zeroes */ }
+  };
+
+  static struct pcie_port_service_driver root_aerdrv = {
+    .name		= (char *)device_name,
+    .id_table	= &service_id[0],
+
+    .probe		= aerdrv_load,
+    .remove		= aerdrv_unload,
+
+    .suspend	= aerdrv_suspend,
+    .resume		= aerdrv_resume,
+  };
+
+Below is a sample code for registering/unregistering a service
+driver.
+::
+
+  static int __init aerdrv_service_init(void)
+  {
+    int retval = 0;
+
+    retval = pcie_port_service_register(&root_aerdrv);
+    if (!retval) {
+      /*
+      * FIX ME
+      */
+    }
+    return retval;
+  }
+
+  static void __exit aerdrv_service_exit(void)
+  {
+    pcie_port_service_unregister(&root_aerdrv);
+  }
+
+  module_init(aerdrv_service_init);
+  module_exit(aerdrv_service_exit);
+
+Possible Resource Conflicts
+===========================
+
+Since all service drivers of a PCI-PCI Bridge Port device are
+allowed to run simultaneously, below lists a few of possible resource
+conflicts with proposed solutions.
+
+MSI and MSI-X Vector Resource
+-----------------------------
+
+Once MSI or MSI-X interrupts are enabled on a device, it stays in this
+mode until they are disabled again.  Since service drivers of the same
+PCI-PCI Bridge port share the same physical device, if an individual
+service driver enables or disables MSI/MSI-X mode it may result
+unpredictable behavior.
+
+To avoid this situation all service drivers are not permitted to
+switch interrupt mode on its device. The PCI Express Port Bus driver
+is responsible for determining the interrupt mode and this should be
+transparent to service drivers. Service drivers need to know only
+the vector IRQ assigned to the field irq of struct pcie_device, which
+is passed in when the PCI Express Port Bus driver probes each service
+driver. Service drivers should use (struct pcie_device*)dev->irq to
+call request_irq/free_irq. In addition, the interrupt mode is stored
+in the field interrupt_mode of struct pcie_device.
+
+PCI Memory/IO Mapped Regions
+----------------------------
+
+Service drivers for PCI Express Power Management (PME), Advanced
+Error Reporting (AER), Hot-Plug (HP) and Virtual Channel (VC) access
+PCI configuration space on the PCI Express port. In all cases the
+registers accessed are independent of each other. This patch assumes
+that all service drivers will be well behaved and not overwrite
+other service driver's configuration settings.
+
+PCI Config Registers
+--------------------
+
+Each service driver runs its PCI config operations on its own
+capability structure except the PCI Express capability structure, in
+which Root Control register and Device Control register are shared
+between PME and AER. This patch assumes that all service drivers
+will be well behaved and not overwrite other service driver's
+configuration settings.
diff --git a/Documentation/RCU/UP.rst b/Documentation/RCU/UP.rst
new file mode 100644
index 000000000000..e26dda27430c
--- /dev/null
+++ b/Documentation/RCU/UP.rst
@@ -0,0 +1,143 @@
+.. _up_doc:
+
+RCU on Uniprocessor Systems
+===========================
+
+A common misconception is that, on UP systems, the call_rcu() primitive
+may immediately invoke its function.  The basis of this misconception
+is that since there is only one CPU, it should not be necessary to
+wait for anything else to get done, since there are no other CPUs for
+anything else to be happening on.  Although this approach will *sort of*
+work a surprising amount of the time, it is a very bad idea in general.
+This document presents three examples that demonstrate exactly how bad
+an idea this is.
+
+Example 1: softirq Suicide
+--------------------------
+
+Suppose that an RCU-based algorithm scans a linked list containing
+elements A, B, and C in process context, and can delete elements from
+this same list in softirq context.  Suppose that the process-context scan
+is referencing element B when it is interrupted by softirq processing,
+which deletes element B, and then invokes call_rcu() to free element B
+after a grace period.
+
+Now, if call_rcu() were to directly invoke its arguments, then upon return
+from softirq, the list scan would find itself referencing a newly freed
+element B.  This situation can greatly decrease the life expectancy of
+your kernel.
+
+This same problem can occur if call_rcu() is invoked from a hardware
+interrupt handler.
+
+Example 2: Function-Call Fatality
+---------------------------------
+
+Of course, one could avert the suicide described in the preceding example
+by having call_rcu() directly invoke its arguments only if it was called
+from process context.  However, this can fail in a similar manner.
+
+Suppose that an RCU-based algorithm again scans a linked list containing
+elements A, B, and C in process contexts, but that it invokes a function
+on each element as it is scanned.  Suppose further that this function
+deletes element B from the list, then passes it to call_rcu() for deferred
+freeing.  This may be a bit unconventional, but it is perfectly legal
+RCU usage, since call_rcu() must wait for a grace period to elapse.
+Therefore, in this case, allowing call_rcu() to immediately invoke
+its arguments would cause it to fail to make the fundamental guarantee
+underlying RCU, namely that call_rcu() defers invoking its arguments until
+all RCU read-side critical sections currently executing have completed.
+
+Quick Quiz #1:
+	Why is it *not* legal to invoke synchronize_rcu() in this case?
+
+:ref:`Answers to Quick Quiz <answer_quick_quiz_up>`
+
+Example 3: Death by Deadlock
+----------------------------
+
+Suppose that call_rcu() is invoked while holding a lock, and that the
+callback function must acquire this same lock.  In this case, if
+call_rcu() were to directly invoke the callback, the result would
+be self-deadlock.
+
+In some cases, it would possible to restructure to code so that
+the call_rcu() is delayed until after the lock is released.  However,
+there are cases where this can be quite ugly:
+
+1.	If a number of items need to be passed to call_rcu() within
+	the same critical section, then the code would need to create
+	a list of them, then traverse the list once the lock was
+	released.
+
+2.	In some cases, the lock will be held across some kernel API,
+	so that delaying the call_rcu() until the lock is released
+	requires that the data item be passed up via a common API.
+	It is far better to guarantee that callbacks are invoked
+	with no locks held than to have to modify such APIs to allow
+	arbitrary data items to be passed back up through them.
+
+If call_rcu() directly invokes the callback, painful locking restrictions
+or API changes would be required.
+
+Quick Quiz #2:
+	What locking restriction must RCU callbacks respect?
+
+:ref:`Answers to Quick Quiz <answer_quick_quiz_up>`
+
+Summary
+-------
+
+Permitting call_rcu() to immediately invoke its arguments breaks RCU,
+even on a UP system.  So do not do it!  Even on a UP system, the RCU
+infrastructure *must* respect grace periods, and *must* invoke callbacks
+from a known environment in which no locks are held.
+
+Note that it *is* safe for synchronize_rcu() to return immediately on
+UP systems, including PREEMPT SMP builds running on UP systems.
+
+Quick Quiz #3:
+	Why can't synchronize_rcu() return immediately on UP systems running
+	preemptable RCU?
+
+.. _answer_quick_quiz_up:
+
+Answer to Quick Quiz #1:
+	Why is it *not* legal to invoke synchronize_rcu() in this case?
+
+	Because the calling function is scanning an RCU-protected linked
+	list, and is therefore within an RCU read-side critical section.
+	Therefore, the called function has been invoked within an RCU
+	read-side critical section, and is not permitted to block.
+
+Answer to Quick Quiz #2:
+	What locking restriction must RCU callbacks respect?
+
+	Any lock that is acquired within an RCU callback must be acquired
+	elsewhere using an _bh variant of the spinlock primitive.
+	For example, if "mylock" is acquired by an RCU callback, then
+	a process-context acquisition of this lock must use something
+	like spin_lock_bh() to acquire the lock.  Please note that
+	it is also OK to use _irq variants of spinlocks, for example,
+	spin_lock_irqsave().
+
+	If the process-context code were to simply use spin_lock(),
+	then, since RCU callbacks can be invoked from softirq context,
+	the callback might be called from a softirq that interrupted
+	the process-context critical section.  This would result in
+	self-deadlock.
+
+	This restriction might seem gratuitous, since very few RCU
+	callbacks acquire locks directly.  However, a great many RCU
+	callbacks do acquire locks *indirectly*, for example, via
+	the kfree() primitive.
+
+Answer to Quick Quiz #3:
+	Why can't synchronize_rcu() return immediately on UP systems
+	running preemptable RCU?
+
+	Because some other task might have been preempted in the middle
+	of an RCU read-side critical section.  If synchronize_rcu()
+	simply immediately returned, it would prematurely signal the
+	end of the grace period, which would come as a nasty shock to
+	that other thread when it started running again.
diff --git a/Documentation/RCU/UP.txt b/Documentation/RCU/UP.txt
deleted file mode 100644
index 53bde717017b..000000000000
--- a/Documentation/RCU/UP.txt
+++ /dev/null
@@ -1,133 +0,0 @@
-RCU on Uniprocessor Systems
-
-
-A common misconception is that, on UP systems, the call_rcu() primitive
-may immediately invoke its function.  The basis of this misconception
-is that since there is only one CPU, it should not be necessary to
-wait for anything else to get done, since there are no other CPUs for
-anything else to be happening on.  Although this approach will -sort- -of-
-work a surprising amount of the time, it is a very bad idea in general.
-This document presents three examples that demonstrate exactly how bad
-an idea this is.
-
-
-Example 1: softirq Suicide
-
-Suppose that an RCU-based algorithm scans a linked list containing
-elements A, B, and C in process context, and can delete elements from
-this same list in softirq context.  Suppose that the process-context scan
-is referencing element B when it is interrupted by softirq processing,
-which deletes element B, and then invokes call_rcu() to free element B
-after a grace period.
-
-Now, if call_rcu() were to directly invoke its arguments, then upon return
-from softirq, the list scan would find itself referencing a newly freed
-element B.  This situation can greatly decrease the life expectancy of
-your kernel.
-
-This same problem can occur if call_rcu() is invoked from a hardware
-interrupt handler.
-
-
-Example 2: Function-Call Fatality
-
-Of course, one could avert the suicide described in the preceding example
-by having call_rcu() directly invoke its arguments only if it was called
-from process context.  However, this can fail in a similar manner.
-
-Suppose that an RCU-based algorithm again scans a linked list containing
-elements A, B, and C in process contexts, but that it invokes a function
-on each element as it is scanned.  Suppose further that this function
-deletes element B from the list, then passes it to call_rcu() for deferred
-freeing.  This may be a bit unconventional, but it is perfectly legal
-RCU usage, since call_rcu() must wait for a grace period to elapse.
-Therefore, in this case, allowing call_rcu() to immediately invoke
-its arguments would cause it to fail to make the fundamental guarantee
-underlying RCU, namely that call_rcu() defers invoking its arguments until
-all RCU read-side critical sections currently executing have completed.
-
-Quick Quiz #1: why is it -not- legal to invoke synchronize_rcu() in
-	this case?
-
-
-Example 3: Death by Deadlock
-
-Suppose that call_rcu() is invoked while holding a lock, and that the
-callback function must acquire this same lock.  In this case, if
-call_rcu() were to directly invoke the callback, the result would
-be self-deadlock.
-
-In some cases, it would possible to restructure to code so that
-the call_rcu() is delayed until after the lock is released.  However,
-there are cases where this can be quite ugly:
-
-1.	If a number of items need to be passed to call_rcu() within
-	the same critical section, then the code would need to create
-	a list of them, then traverse the list once the lock was
-	released.
-
-2.	In some cases, the lock will be held across some kernel API,
-	so that delaying the call_rcu() until the lock is released
-	requires that the data item be passed up via a common API.
-	It is far better to guarantee that callbacks are invoked
-	with no locks held than to have to modify such APIs to allow
-	arbitrary data items to be passed back up through them.
-
-If call_rcu() directly invokes the callback, painful locking restrictions
-or API changes would be required.
-
-Quick Quiz #2: What locking restriction must RCU callbacks respect?
-
-
-Summary
-
-Permitting call_rcu() to immediately invoke its arguments breaks RCU,
-even on a UP system.  So do not do it!  Even on a UP system, the RCU
-infrastructure -must- respect grace periods, and -must- invoke callbacks
-from a known environment in which no locks are held.
-
-Note that it -is- safe for synchronize_rcu() to return immediately on
-UP systems, including !PREEMPT SMP builds running on UP systems.
-
-Quick Quiz #3: Why can't synchronize_rcu() return immediately on
-	UP systems running preemptable RCU?
-
-
-Answer to Quick Quiz #1:
-	Why is it -not- legal to invoke synchronize_rcu() in this case?
-
-	Because the calling function is scanning an RCU-protected linked
-	list, and is therefore within an RCU read-side critical section.
-	Therefore, the called function has been invoked within an RCU
-	read-side critical section, and is not permitted to block.
-
-Answer to Quick Quiz #2:
-	What locking restriction must RCU callbacks respect?
-
-	Any lock that is acquired within an RCU callback must be
-	acquired elsewhere using an _irq variant of the spinlock
-	primitive.  For example, if "mylock" is acquired by an
-	RCU callback, then a process-context acquisition of this
-	lock must use something like spin_lock_irqsave() to
-	acquire the lock.
-
-	If the process-context code were to simply use spin_lock(),
-	then, since RCU callbacks can be invoked from softirq context,
-	the callback might be called from a softirq that interrupted
-	the process-context critical section.  This would result in
-	self-deadlock.
-
-	This restriction might seem gratuitous, since very few RCU
-	callbacks acquire locks directly.  However, a great many RCU
-	callbacks do acquire locks -indirectly-, for example, via
-	the kfree() primitive.
-
-Answer to Quick Quiz #3:
-	Why can't synchronize_rcu() return immediately on UP systems
-	running preemptable RCU?
-
-	Because some other task might have been preempted in the middle
-	of an RCU read-side critical section.  If synchronize_rcu()
-	simply immediately returned, it would prematurely signal the
-	end of the grace period, which would come as a nasty shock to
-	that other thread when it started running again.
diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
new file mode 100644
index 000000000000..340a9725676c
--- /dev/null
+++ b/Documentation/RCU/index.rst
@@ -0,0 +1,19 @@
+.. _rcu_concepts:
+
+============
+RCU concepts
+============
+
+.. toctree::
+   :maxdepth: 1
+
+   rcu
+   listRCU
+   UP
+
+.. only:: subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/RCU/listRCU.rst b/Documentation/RCU/listRCU.rst
new file mode 100644
index 000000000000..7956ff33042b
--- /dev/null
+++ b/Documentation/RCU/listRCU.rst
@@ -0,0 +1,321 @@
+.. _list_rcu_doc:
+
+Using RCU to Protect Read-Mostly Linked Lists
+=============================================
+
+One of the best applications of RCU is to protect read-mostly linked lists
+("struct list_head" in list.h).  One big advantage of this approach
+is that all of the required memory barriers are included for you in
+the list macros.  This document describes several applications of RCU,
+with the best fits first.
+
+Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates
+----------------------------------------------------------------------
+
+The best applications are cases where, if reader-writer locking were
+used, the read-side lock would be dropped before taking any action
+based on the results of the search.  The most celebrated example is
+the routing table.  Because the routing table is tracking the state of
+equipment outside of the computer, it will at times contain stale data.
+Therefore, once the route has been computed, there is no need to hold
+the routing table static during transmission of the packet.  After all,
+you can hold the routing table static all you want, but that won't keep
+the external Internet from changing, and it is the state of the external
+Internet that really matters.  In addition, routing entries are typically
+added or deleted, rather than being modified in place.
+
+A straightforward example of this use of RCU may be found in the
+system-call auditing support.  For example, a reader-writer locked
+implementation of audit_filter_task() might be as follows::
+
+	static enum audit_state audit_filter_task(struct task_struct *tsk)
+	{
+		struct audit_entry *e;
+		enum audit_state   state;
+
+		read_lock(&auditsc_lock);
+		/* Note: audit_netlink_sem held by caller. */
+		list_for_each_entry(e, &audit_tsklist, list) {
+			if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+				read_unlock(&auditsc_lock);
+				return state;
+			}
+		}
+		read_unlock(&auditsc_lock);
+		return AUDIT_BUILD_CONTEXT;
+	}
+
+Here the list is searched under the lock, but the lock is dropped before
+the corresponding value is returned.  By the time that this value is acted
+on, the list may well have been modified.  This makes sense, since if
+you are turning auditing off, it is OK to audit a few extra system calls.
+
+This means that RCU can be easily applied to the read side, as follows::
+
+	static enum audit_state audit_filter_task(struct task_struct *tsk)
+	{
+		struct audit_entry *e;
+		enum audit_state   state;
+
+		rcu_read_lock();
+		/* Note: audit_netlink_sem held by caller. */
+		list_for_each_entry_rcu(e, &audit_tsklist, list) {
+			if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+				rcu_read_unlock();
+				return state;
+			}
+		}
+		rcu_read_unlock();
+		return AUDIT_BUILD_CONTEXT;
+	}
+
+The read_lock() and read_unlock() calls have become rcu_read_lock()
+and rcu_read_unlock(), respectively, and the list_for_each_entry() has
+become list_for_each_entry_rcu().  The _rcu() list-traversal primitives
+insert the read-side memory barriers that are required on DEC Alpha CPUs.
+
+The changes to the update side are also straightforward.  A reader-writer
+lock might be used as follows for deletion and insertion::
+
+	static inline int audit_del_rule(struct audit_rule *rule,
+					 struct list_head *list)
+	{
+		struct audit_entry  *e;
+
+		write_lock(&auditsc_lock);
+		list_for_each_entry(e, list, list) {
+			if (!audit_compare_rule(rule, &e->rule)) {
+				list_del(&e->list);
+				write_unlock(&auditsc_lock);
+				return 0;
+			}
+		}
+		write_unlock(&auditsc_lock);
+		return -EFAULT;		/* No matching rule */
+	}
+
+	static inline int audit_add_rule(struct audit_entry *entry,
+					 struct list_head *list)
+	{
+		write_lock(&auditsc_lock);
+		if (entry->rule.flags & AUDIT_PREPEND) {
+			entry->rule.flags &= ~AUDIT_PREPEND;
+			list_add(&entry->list, list);
+		} else {
+			list_add_tail(&entry->list, list);
+		}
+		write_unlock(&auditsc_lock);
+		return 0;
+	}
+
+Following are the RCU equivalents for these two functions::
+
+	static inline int audit_del_rule(struct audit_rule *rule,
+					 struct list_head *list)
+	{
+		struct audit_entry  *e;
+
+		/* Do not use the _rcu iterator here, since this is the only
+		 * deletion routine. */
+		list_for_each_entry(e, list, list) {
+			if (!audit_compare_rule(rule, &e->rule)) {
+				list_del_rcu(&e->list);
+				call_rcu(&e->rcu, audit_free_rule);
+				return 0;
+			}
+		}
+		return -EFAULT;		/* No matching rule */
+	}
+
+	static inline int audit_add_rule(struct audit_entry *entry,
+					 struct list_head *list)
+	{
+		if (entry->rule.flags & AUDIT_PREPEND) {
+			entry->rule.flags &= ~AUDIT_PREPEND;
+			list_add_rcu(&entry->list, list);
+		} else {
+			list_add_tail_rcu(&entry->list, list);
+		}
+		return 0;
+	}
+
+Normally, the write_lock() and write_unlock() would be replaced by
+a spin_lock() and a spin_unlock(), but in this case, all callers hold
+audit_netlink_sem, so no additional locking is required.  The auditsc_lock
+can therefore be eliminated, since use of RCU eliminates the need for
+writers to exclude readers.  Normally, the write_lock() calls would
+be converted into spin_lock() calls.
+
+The list_del(), list_add(), and list_add_tail() primitives have been
+replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu().
+The _rcu() list-manipulation primitives add memory barriers that are
+needed on weakly ordered CPUs (most of them!).  The list_del_rcu()
+primitive omits the pointer poisoning debug-assist code that would
+otherwise cause concurrent readers to fail spectacularly.
+
+So, when readers can tolerate stale data and when entries are either added
+or deleted, without in-place modification, it is very easy to use RCU!
+
+Example 2: Handling In-Place Updates
+------------------------------------
+
+The system-call auditing code does not update auditing rules in place.
+However, if it did, reader-writer-locked code to do so might look as
+follows (presumably, the field_count is only permitted to decrease,
+otherwise, the added fields would need to be filled in)::
+
+	static inline int audit_upd_rule(struct audit_rule *rule,
+					 struct list_head *list,
+					 __u32 newaction,
+					 __u32 newfield_count)
+	{
+		struct audit_entry  *e;
+		struct audit_newentry *ne;
+
+		write_lock(&auditsc_lock);
+		/* Note: audit_netlink_sem held by caller. */
+		list_for_each_entry(e, list, list) {
+			if (!audit_compare_rule(rule, &e->rule)) {
+				e->rule.action = newaction;
+				e->rule.file_count = newfield_count;
+				write_unlock(&auditsc_lock);
+				return 0;
+			}
+		}
+		write_unlock(&auditsc_lock);
+		return -EFAULT;		/* No matching rule */
+	}
+
+The RCU version creates a copy, updates the copy, then replaces the old
+entry with the newly updated entry.  This sequence of actions, allowing
+concurrent reads while doing a copy to perform an update, is what gives
+RCU ("read-copy update") its name.  The RCU code is as follows::
+
+	static inline int audit_upd_rule(struct audit_rule *rule,
+					 struct list_head *list,
+					 __u32 newaction,
+					 __u32 newfield_count)
+	{
+		struct audit_entry  *e;
+		struct audit_newentry *ne;
+
+		list_for_each_entry(e, list, list) {
+			if (!audit_compare_rule(rule, &e->rule)) {
+				ne = kmalloc(sizeof(*entry), GFP_ATOMIC);
+				if (ne == NULL)
+					return -ENOMEM;
+				audit_copy_rule(&ne->rule, &e->rule);
+				ne->rule.action = newaction;
+				ne->rule.file_count = newfield_count;
+				list_replace_rcu(&e->list, &ne->list);
+				call_rcu(&e->rcu, audit_free_rule);
+				return 0;
+			}
+		}
+		return -EFAULT;		/* No matching rule */
+	}
+
+Again, this assumes that the caller holds audit_netlink_sem.  Normally,
+the reader-writer lock would become a spinlock in this sort of code.
+
+Example 3: Eliminating Stale Data
+---------------------------------
+
+The auditing examples above tolerate stale data, as do most algorithms
+that are tracking external state.  Because there is a delay from the
+time the external state changes before Linux becomes aware of the change,
+additional RCU-induced staleness is normally not a problem.
+
+However, there are many examples where stale data cannot be tolerated.
+One example in the Linux kernel is the System V IPC (see the ipc_lock()
+function in ipc/util.c).  This code checks a "deleted" flag under a
+per-entry spinlock, and, if the "deleted" flag is set, pretends that the
+entry does not exist.  For this to be helpful, the search function must
+return holding the per-entry spinlock, as ipc_lock() does in fact do.
+
+Quick Quiz:
+	Why does the search function need to return holding the per-entry lock for
+	this deleted-flag technique to be helpful?
+
+:ref:`Answer to Quick Quiz <answer_quick_quiz_list>`
+
+If the system-call audit module were to ever need to reject stale data,
+one way to accomplish this would be to add a "deleted" flag and a "lock"
+spinlock to the audit_entry structure, and modify audit_filter_task()
+as follows::
+
+	static enum audit_state audit_filter_task(struct task_struct *tsk)
+	{
+		struct audit_entry *e;
+		enum audit_state   state;
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(e, &audit_tsklist, list) {
+			if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+				spin_lock(&e->lock);
+				if (e->deleted) {
+					spin_unlock(&e->lock);
+					rcu_read_unlock();
+					return AUDIT_BUILD_CONTEXT;
+				}
+				rcu_read_unlock();
+				return state;
+			}
+		}
+		rcu_read_unlock();
+		return AUDIT_BUILD_CONTEXT;
+	}
+
+Note that this example assumes that entries are only added and deleted.
+Additional mechanism is required to deal correctly with the
+update-in-place performed by audit_upd_rule().  For one thing,
+audit_upd_rule() would need additional memory barriers to ensure
+that the list_add_rcu() was really executed before the list_del_rcu().
+
+The audit_del_rule() function would need to set the "deleted"
+flag under the spinlock as follows::
+
+	static inline int audit_del_rule(struct audit_rule *rule,
+					 struct list_head *list)
+	{
+		struct audit_entry  *e;
+
+		/* Do not need to use the _rcu iterator here, since this
+		 * is the only deletion routine. */
+		list_for_each_entry(e, list, list) {
+			if (!audit_compare_rule(rule, &e->rule)) {
+				spin_lock(&e->lock);
+				list_del_rcu(&e->list);
+				e->deleted = 1;
+				spin_unlock(&e->lock);
+				call_rcu(&e->rcu, audit_free_rule);
+				return 0;
+			}
+		}
+		return -EFAULT;		/* No matching rule */
+	}
+
+Summary
+-------
+
+Read-mostly list-based data structures that can tolerate stale data are
+the most amenable to use of RCU.  The simplest case is where entries are
+either added or deleted from the data structure (or atomically modified
+in place), but non-atomic in-place modifications can be handled by making
+a copy, updating the copy, then replacing the original with the copy.
+If stale data cannot be tolerated, then a "deleted" flag may be used
+in conjunction with a per-entry spinlock in order to allow the search
+function to reject newly deleted data.
+
+.. _answer_quick_quiz_list:
+
+Answer to Quick Quiz:
+	Why does the search function need to return holding the per-entry
+	lock for this deleted-flag technique to be helpful?
+
+	If the search function drops the per-entry lock before returning,
+	then the caller will be processing stale data in any case.  If it
+	is really OK to be processing stale data, then you don't need a
+	"deleted" flag.  If processing stale data really is a problem,
+	then you need to hold the per-entry lock across all of the code
+	that uses the value that was returned.
diff --git a/Documentation/RCU/listRCU.txt b/Documentation/RCU/listRCU.txt
deleted file mode 100644
index adb5a3782846..000000000000
--- a/Documentation/RCU/listRCU.txt
+++ /dev/null
@@ -1,315 +0,0 @@
-Using RCU to Protect Read-Mostly Linked Lists
-
-
-One of the best applications of RCU is to protect read-mostly linked lists
-("struct list_head" in list.h).  One big advantage of this approach
-is that all of the required memory barriers are included for you in
-the list macros.  This document describes several applications of RCU,
-with the best fits first.
-
-
-Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates
-
-The best applications are cases where, if reader-writer locking were
-used, the read-side lock would be dropped before taking any action
-based on the results of the search.  The most celebrated example is
-the routing table.  Because the routing table is tracking the state of
-equipment outside of the computer, it will at times contain stale data.
-Therefore, once the route has been computed, there is no need to hold
-the routing table static during transmission of the packet.  After all,
-you can hold the routing table static all you want, but that won't keep
-the external Internet from changing, and it is the state of the external
-Internet that really matters.  In addition, routing entries are typically
-added or deleted, rather than being modified in place.
-
-A straightforward example of this use of RCU may be found in the
-system-call auditing support.  For example, a reader-writer locked
-implementation of audit_filter_task() might be as follows:
-
-	static enum audit_state audit_filter_task(struct task_struct *tsk)
-	{
-		struct audit_entry *e;
-		enum audit_state   state;
-
-		read_lock(&auditsc_lock);
-		/* Note: audit_netlink_sem held by caller. */
-		list_for_each_entry(e, &audit_tsklist, list) {
-			if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
-				read_unlock(&auditsc_lock);
-				return state;
-			}
-		}
-		read_unlock(&auditsc_lock);
-		return AUDIT_BUILD_CONTEXT;
-	}
-
-Here the list is searched under the lock, but the lock is dropped before
-the corresponding value is returned.  By the time that this value is acted
-on, the list may well have been modified.  This makes sense, since if
-you are turning auditing off, it is OK to audit a few extra system calls.
-
-This means that RCU can be easily applied to the read side, as follows:
-
-	static enum audit_state audit_filter_task(struct task_struct *tsk)
-	{
-		struct audit_entry *e;
-		enum audit_state   state;
-
-		rcu_read_lock();
-		/* Note: audit_netlink_sem held by caller. */
-		list_for_each_entry_rcu(e, &audit_tsklist, list) {
-			if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
-				rcu_read_unlock();
-				return state;
-			}
-		}
-		rcu_read_unlock();
-		return AUDIT_BUILD_CONTEXT;
-	}
-
-The read_lock() and read_unlock() calls have become rcu_read_lock()
-and rcu_read_unlock(), respectively, and the list_for_each_entry() has
-become list_for_each_entry_rcu().  The _rcu() list-traversal primitives
-insert the read-side memory barriers that are required on DEC Alpha CPUs.
-
-The changes to the update side are also straightforward.  A reader-writer
-lock might be used as follows for deletion and insertion:
-
-	static inline int audit_del_rule(struct audit_rule *rule,
-					 struct list_head *list)
-	{
-		struct audit_entry  *e;
-
-		write_lock(&auditsc_lock);
-		list_for_each_entry(e, list, list) {
-			if (!audit_compare_rule(rule, &e->rule)) {
-				list_del(&e->list);
-				write_unlock(&auditsc_lock);
-				return 0;
-			}
-		}
-		write_unlock(&auditsc_lock);
-		return -EFAULT;		/* No matching rule */
-	}
-
-	static inline int audit_add_rule(struct audit_entry *entry,
-					 struct list_head *list)
-	{
-		write_lock(&auditsc_lock);
-		if (entry->rule.flags & AUDIT_PREPEND) {
-			entry->rule.flags &= ~AUDIT_PREPEND;
-			list_add(&entry->list, list);
-		} else {
-			list_add_tail(&entry->list, list);
-		}
-		write_unlock(&auditsc_lock);
-		return 0;
-	}
-
-Following are the RCU equivalents for these two functions:
-
-	static inline int audit_del_rule(struct audit_rule *rule,
-					 struct list_head *list)
-	{
-		struct audit_entry  *e;
-
-		/* Do not use the _rcu iterator here, since this is the only
-		 * deletion routine. */
-		list_for_each_entry(e, list, list) {
-			if (!audit_compare_rule(rule, &e->rule)) {
-				list_del_rcu(&e->list);
-				call_rcu(&e->rcu, audit_free_rule);
-				return 0;
-			}
-		}
-		return -EFAULT;		/* No matching rule */
-	}
-
-	static inline int audit_add_rule(struct audit_entry *entry,
-					 struct list_head *list)
-	{
-		if (entry->rule.flags & AUDIT_PREPEND) {
-			entry->rule.flags &= ~AUDIT_PREPEND;
-			list_add_rcu(&entry->list, list);
-		} else {
-			list_add_tail_rcu(&entry->list, list);
-		}
-		return 0;
-	}
-
-Normally, the write_lock() and write_unlock() would be replaced by
-a spin_lock() and a spin_unlock(), but in this case, all callers hold
-audit_netlink_sem, so no additional locking is required.  The auditsc_lock
-can therefore be eliminated, since use of RCU eliminates the need for
-writers to exclude readers.  Normally, the write_lock() calls would
-be converted into spin_lock() calls.
-
-The list_del(), list_add(), and list_add_tail() primitives have been
-replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu().
-The _rcu() list-manipulation primitives add memory barriers that are
-needed on weakly ordered CPUs (most of them!).  The list_del_rcu()
-primitive omits the pointer poisoning debug-assist code that would
-otherwise cause concurrent readers to fail spectacularly.
-
-So, when readers can tolerate stale data and when entries are either added
-or deleted, without in-place modification, it is very easy to use RCU!
-
-
-Example 2: Handling In-Place Updates
-
-The system-call auditing code does not update auditing rules in place.
-However, if it did, reader-writer-locked code to do so might look as
-follows (presumably, the field_count is only permitted to decrease,
-otherwise, the added fields would need to be filled in):
-
-	static inline int audit_upd_rule(struct audit_rule *rule,
-					 struct list_head *list,
-					 __u32 newaction,
-					 __u32 newfield_count)
-	{
-		struct audit_entry  *e;
-		struct audit_newentry *ne;
-
-		write_lock(&auditsc_lock);
-		/* Note: audit_netlink_sem held by caller. */
-		list_for_each_entry(e, list, list) {
-			if (!audit_compare_rule(rule, &e->rule)) {
-				e->rule.action = newaction;
-				e->rule.file_count = newfield_count;
-				write_unlock(&auditsc_lock);
-				return 0;
-			}
-		}
-		write_unlock(&auditsc_lock);
-		return -EFAULT;		/* No matching rule */
-	}
-
-The RCU version creates a copy, updates the copy, then replaces the old
-entry with the newly updated entry.  This sequence of actions, allowing
-concurrent reads while doing a copy to perform an update, is what gives
-RCU ("read-copy update") its name.  The RCU code is as follows:
-
-	static inline int audit_upd_rule(struct audit_rule *rule,
-					 struct list_head *list,
-					 __u32 newaction,
-					 __u32 newfield_count)
-	{
-		struct audit_entry  *e;
-		struct audit_newentry *ne;
-
-		list_for_each_entry(e, list, list) {
-			if (!audit_compare_rule(rule, &e->rule)) {
-				ne = kmalloc(sizeof(*entry), GFP_ATOMIC);
-				if (ne == NULL)
-					return -ENOMEM;
-				audit_copy_rule(&ne->rule, &e->rule);
-				ne->rule.action = newaction;
-				ne->rule.file_count = newfield_count;
-				list_replace_rcu(&e->list, &ne->list);
-				call_rcu(&e->rcu, audit_free_rule);
-				return 0;
-			}
-		}
-		return -EFAULT;		/* No matching rule */
-	}
-
-Again, this assumes that the caller holds audit_netlink_sem.  Normally,
-the reader-writer lock would become a spinlock in this sort of code.
-
-
-Example 3: Eliminating Stale Data
-
-The auditing examples above tolerate stale data, as do most algorithms
-that are tracking external state.  Because there is a delay from the
-time the external state changes before Linux becomes aware of the change,
-additional RCU-induced staleness is normally not a problem.
-
-However, there are many examples where stale data cannot be tolerated.
-One example in the Linux kernel is the System V IPC (see the ipc_lock()
-function in ipc/util.c).  This code checks a "deleted" flag under a
-per-entry spinlock, and, if the "deleted" flag is set, pretends that the
-entry does not exist.  For this to be helpful, the search function must
-return holding the per-entry spinlock, as ipc_lock() does in fact do.
-
-Quick Quiz:  Why does the search function need to return holding the
-	per-entry lock for this deleted-flag technique to be helpful?
-
-If the system-call audit module were to ever need to reject stale data,
-one way to accomplish this would be to add a "deleted" flag and a "lock"
-spinlock to the audit_entry structure, and modify audit_filter_task()
-as follows:
-
-	static enum audit_state audit_filter_task(struct task_struct *tsk)
-	{
-		struct audit_entry *e;
-		enum audit_state   state;
-
-		rcu_read_lock();
-		list_for_each_entry_rcu(e, &audit_tsklist, list) {
-			if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
-				spin_lock(&e->lock);
-				if (e->deleted) {
-					spin_unlock(&e->lock);
-					rcu_read_unlock();
-					return AUDIT_BUILD_CONTEXT;
-				}
-				rcu_read_unlock();
-				return state;
-			}
-		}
-		rcu_read_unlock();
-		return AUDIT_BUILD_CONTEXT;
-	}
-
-Note that this example assumes that entries are only added and deleted.
-Additional mechanism is required to deal correctly with the
-update-in-place performed by audit_upd_rule().  For one thing,
-audit_upd_rule() would need additional memory barriers to ensure
-that the list_add_rcu() was really executed before the list_del_rcu().
-
-The audit_del_rule() function would need to set the "deleted"
-flag under the spinlock as follows:
-
-	static inline int audit_del_rule(struct audit_rule *rule,
-					 struct list_head *list)
-	{
-		struct audit_entry  *e;
-
-		/* Do not need to use the _rcu iterator here, since this
-		 * is the only deletion routine. */
-		list_for_each_entry(e, list, list) {
-			if (!audit_compare_rule(rule, &e->rule)) {
-				spin_lock(&e->lock);
-				list_del_rcu(&e->list);
-				e->deleted = 1;
-				spin_unlock(&e->lock);
-				call_rcu(&e->rcu, audit_free_rule);
-				return 0;
-			}
-		}
-		return -EFAULT;		/* No matching rule */
-	}
-
-
-Summary
-
-Read-mostly list-based data structures that can tolerate stale data are
-the most amenable to use of RCU.  The simplest case is where entries are
-either added or deleted from the data structure (or atomically modified
-in place), but non-atomic in-place modifications can be handled by making
-a copy, updating the copy, then replacing the original with the copy.
-If stale data cannot be tolerated, then a "deleted" flag may be used
-in conjunction with a per-entry spinlock in order to allow the search
-function to reject newly deleted data.
-
-
-Answer to Quick Quiz
-	Why does the search function need to return holding the per-entry
-	lock for this deleted-flag technique to be helpful?
-
-	If the search function drops the per-entry lock before returning,
-	then the caller will be processing stale data in any case.  If it
-	is really OK to be processing stale data, then you don't need a
-	"deleted" flag.  If processing stale data really is a problem,
-	then you need to hold the per-entry lock across all of the code
-	that uses the value that was returned.
diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst
new file mode 100644
index 000000000000..8dfb437dacc3
--- /dev/null
+++ b/Documentation/RCU/rcu.rst
@@ -0,0 +1,92 @@
+.. _rcu_doc:
+
+RCU Concepts
+============
+
+The basic idea behind RCU (read-copy update) is to split destructive
+operations into two parts, one that prevents anyone from seeing the data
+item being destroyed, and one that actually carries out the destruction.
+A "grace period" must elapse between the two parts, and this grace period
+must be long enough that any readers accessing the item being deleted have
+since dropped their references.  For example, an RCU-protected deletion
+from a linked list would first remove the item from the list, wait for
+a grace period to elapse, then free the element.  See the
+Documentation/RCU/listRCU.rst file for more information on using RCU with
+linked lists.
+
+Frequently Asked Questions
+--------------------------
+
+- Why would anyone want to use RCU?
+
+  The advantage of RCU's two-part approach is that RCU readers need
+  not acquire any locks, perform any atomic instructions, write to
+  shared memory, or (on CPUs other than Alpha) execute any memory
+  barriers.  The fact that these operations are quite expensive
+  on modern CPUs is what gives RCU its performance advantages
+  in read-mostly situations.  The fact that RCU readers need not
+  acquire locks can also greatly simplify deadlock-avoidance code.
+
+- How can the updater tell when a grace period has completed
+  if the RCU readers give no indication when they are done?
+
+  Just as with spinlocks, RCU readers are not permitted to
+  block, switch to user-mode execution, or enter the idle loop.
+  Therefore, as soon as a CPU is seen passing through any of these
+  three states, we know that that CPU has exited any previous RCU
+  read-side critical sections.  So, if we remove an item from a
+  linked list, and then wait until all CPUs have switched context,
+  executed in user mode, or executed in the idle loop, we can
+  safely free up that item.
+
+  Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the
+  same effect, but require that the readers manipulate CPU-local
+  counters.  These counters allow limited types of blocking within
+  RCU read-side critical sections.  SRCU also uses CPU-local
+  counters, and permits general blocking within RCU read-side
+  critical sections.  These variants of RCU detect grace periods
+  by sampling these counters.
+
+- If I am running on a uniprocessor kernel, which can only do one
+  thing at a time, why should I wait for a grace period?
+
+  See the Documentation/RCU/UP.rst file for more information.
+
+- How can I see where RCU is currently used in the Linux kernel?
+
+  Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu",
+  "rcu_read_lock_bh", "rcu_read_unlock_bh", "srcu_read_lock",
+  "srcu_read_unlock", "synchronize_rcu", "synchronize_net",
+  "synchronize_srcu", and the other RCU primitives.  Or grab one
+  of the cscope databases from:
+
+  (http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html).
+
+- What guidelines should I follow when writing code that uses RCU?
+
+  See the checklist.txt file in this directory.
+
+- Why the name "RCU"?
+
+  "RCU" stands for "read-copy update".  The file Documentation/RCU/listRCU.rst
+  has more information on where this name came from, search for
+  "read-copy update" to find it.
+
+- I hear that RCU is patented?  What is with that?
+
+  Yes, it is.  There are several known patents related to RCU,
+  search for the string "Patent" in RTFP.txt to find them.
+  Of these, one was allowed to lapse by the assignee, and the
+  others have been contributed to the Linux kernel under GPL.
+  There are now also LGPL implementations of user-level RCU
+  available (http://liburcu.org/).
+
+- I hear that RCU needs work in order to support realtime kernels?
+
+  Realtime-friendly RCU can be enabled via the CONFIG_PREEMPT_RCU
+  kernel configuration parameter.
+
+- Where can I find more information on RCU?
+
+  See the RTFP.txt file in this directory.
+  Or point your browser at (http://www.rdrop.com/users/paulmck/RCU/).
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
deleted file mode 100644
index c818cf65c5a9..000000000000
--- a/Documentation/RCU/rcu.txt
+++ /dev/null
@@ -1,89 +0,0 @@
-RCU Concepts
-
-
-The basic idea behind RCU (read-copy update) is to split destructive
-operations into two parts, one that prevents anyone from seeing the data
-item being destroyed, and one that actually carries out the destruction.
-A "grace period" must elapse between the two parts, and this grace period
-must be long enough that any readers accessing the item being deleted have
-since dropped their references.  For example, an RCU-protected deletion
-from a linked list would first remove the item from the list, wait for
-a grace period to elapse, then free the element.  See the listRCU.txt
-file for more information on using RCU with linked lists.
-
-
-Frequently Asked Questions
-
-o	Why would anyone want to use RCU?
-
-	The advantage of RCU's two-part approach is that RCU readers need
-	not acquire any locks, perform any atomic instructions, write to
-	shared memory, or (on CPUs other than Alpha) execute any memory
-	barriers.  The fact that these operations are quite expensive
-	on modern CPUs is what gives RCU its performance advantages
-	in read-mostly situations.  The fact that RCU readers need not
-	acquire locks can also greatly simplify deadlock-avoidance code.
-
-o	How can the updater tell when a grace period has completed
-	if the RCU readers give no indication when they are done?
-
-	Just as with spinlocks, RCU readers are not permitted to
-	block, switch to user-mode execution, or enter the idle loop.
-	Therefore, as soon as a CPU is seen passing through any of these
-	three states, we know that that CPU has exited any previous RCU
-	read-side critical sections.  So, if we remove an item from a
-	linked list, and then wait until all CPUs have switched context,
-	executed in user mode, or executed in the idle loop, we can
-	safely free up that item.
-
-	Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the
-	same effect, but require that the readers manipulate CPU-local
-	counters.  These counters allow limited types of blocking within
-	RCU read-side critical sections.  SRCU also uses CPU-local
-	counters, and permits general blocking within RCU read-side
-	critical sections.  These variants of RCU detect grace periods
-	by sampling these counters.
-
-o	If I am running on a uniprocessor kernel, which can only do one
-	thing at a time, why should I wait for a grace period?
-
-	See the UP.txt file in this directory.
-
-o	How can I see where RCU is currently used in the Linux kernel?
-
-	Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu",
-	"rcu_read_lock_bh", "rcu_read_unlock_bh", "srcu_read_lock",
-	"srcu_read_unlock", "synchronize_rcu", "synchronize_net",
-	"synchronize_srcu", and the other RCU primitives.  Or grab one
-	of the cscope databases from:
-
-	http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html
-
-o	What guidelines should I follow when writing code that uses RCU?
-
-	See the checklist.txt file in this directory.
-
-o	Why the name "RCU"?
-
-	"RCU" stands for "read-copy update".  The file listRCU.txt has
-	more information on where this name came from, search for
-	"read-copy update" to find it.
-
-o	I hear that RCU is patented?  What is with that?
-
-	Yes, it is.  There are several known patents related to RCU,
-	search for the string "Patent" in RTFP.txt to find them.
-	Of these, one was allowed to lapse by the assignee, and the
-	others have been contributed to the Linux kernel under GPL.
-	There are now also LGPL implementations of user-level RCU
-	available (http://liburcu.org/).
-
-o	I hear that RCU needs work in order to support realtime kernels?
-
-	Realtime-friendly RCU can be enabled via the CONFIG_PREEMPT_RCU
-	kernel configuration parameter.
-
-o	Where can I find more information on RCU?
-
-	See the RTFP.txt file in this directory.
-	Or point your browser at http://www.rdrop.com/users/paulmck/RCU/.
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
index 8151f0195f76..23f115dc87cf 100644
--- a/Documentation/RCU/rculist_nulls.txt
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -1,7 +1,7 @@
 Using hlist_nulls to protect read-mostly linked lists and
 objects using SLAB_TYPESAFE_BY_RCU allocations.
 
-Please read the basics in Documentation/RCU/listRCU.txt
+Please read the basics in Documentation/RCU/listRCU.rst
 
 Using special makers (called 'nulls') is a convenient way
 to solve following problem :
diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt
index 613033ff2b9b..5e6429d66c24 100644
--- a/Documentation/RCU/rcuref.txt
+++ b/Documentation/RCU/rcuref.txt
@@ -12,6 +12,7 @@ please read on.
 Reference counting on elements of lists which are protected by traditional
 reader/writer spinlocks or semaphores are straightforward:
 
+CODE LISTING A:
 1.				2.
 add()				search_and_reference()
 {				{
@@ -28,7 +29,8 @@ add()				search_and_reference()
 release_referenced()			delete()
 {					{
     ...					    write_lock(&list_lock);
-    atomic_dec(&el->rc, relfunc)	    ...
+    if(atomic_dec_and_test(&el->rc))	    ...
+	kfree(el);
     ...					    remove_element
 }					    write_unlock(&list_lock);
  					    ...
@@ -44,6 +46,7 @@ search_and_reference() could potentially hold reference to an element which
 has already been deleted from the list/array.  Use atomic_inc_not_zero()
 in this scenario as follows:
 
+CODE LISTING B:
 1.					2.
 add()					search_and_reference()
 {					{
@@ -79,6 +82,7 @@ search_and_reference() code path.  In such cases, the
 atomic_dec_and_test() may be moved from delete() to el_free()
 as follows:
 
+CODE LISTING C:
 1.					2.
 add()					search_and_reference()
 {					{
@@ -114,6 +118,17 @@ element can therefore safely be freed.  This in turn guarantees that if
 any reader finds the element, that reader may safely acquire a reference
 without checking the value of the reference counter.
 
+A clear advantage of the RCU-based pattern in listing C over the one
+in listing B is that any call to search_and_reference() that locates
+a given object will succeed in obtaining a reference to that object,
+even given a concurrent invocation of delete() for that same object.
+Similarly, a clear advantage of both listings B and C over listing A is
+that a call to delete() is not delayed even if there are an arbitrarily
+large number of calls to search_and_reference() searching for the same
+object that delete() was invoked on.  Instead, all that is delayed is
+the eventual invocation of kfree(), which is usually not a problem on
+modern computer systems, even the small ones.
+
 In cases where delete() can sleep, synchronize_rcu() can be called from
 delete(), so that el_free() can be subsumed into delete as follows:
 
@@ -130,3 +145,7 @@ delete()
     	kfree(el);
     ...
 }
+
+As additional examples in the kernel, the pattern in listing C is used by
+reference counting of struct pid, while the pattern in listing B is used by
+struct posix_acl.
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 1ab70c37921f..13e88fc00f01 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -153,7 +153,7 @@ rcupdate.rcu_task_stall_timeout
 	This boot/sysfs parameter controls the RCU-tasks stall warning
 	interval.  A value of zero or less suppresses RCU-tasks stall
 	warnings.  A positive value sets the stall-warning interval
-	in jiffies.  An RCU-tasks stall warning starts with the line:
+	in seconds.  An RCU-tasks stall warning starts with the line:
 
 		INFO: rcu_tasks detected stalls on tasks:
 
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 981651a8b65d..7e1a8721637a 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -212,7 +212,7 @@ synchronize_rcu()
 
 rcu_assign_pointer()
 
-	typeof(p) rcu_assign_pointer(p, typeof(p) v);
+	void rcu_assign_pointer(p, typeof(p) v);
 
 	Yes, rcu_assign_pointer() -is- implemented as a macro, though it
 	would be cool to be able to declare a function in this manner.
@@ -220,9 +220,9 @@ rcu_assign_pointer()
 
 	The updater uses this function to assign a new value to an
 	RCU-protected pointer, in order to safely communicate the change
-	in value from the updater to the reader.  This function returns
-	the new value, and also executes any memory-barrier instructions
-	required for a given CPU architecture.
+	in value from the updater to the reader.  This macro does not
+	evaluate to an rvalue, but it does execute any memory-barrier
+	instructions required for a given CPU architecture.
 
 	Perhaps just as important, it serves to document (1) which
 	pointers are protected by RCU and (2) the point at which a
diff --git a/Documentation/accounting/cgroupstats.rst b/Documentation/accounting/cgroupstats.rst
new file mode 100644
index 000000000000..b9afc48f4ea2
--- /dev/null
+++ b/Documentation/accounting/cgroupstats.rst
@@ -0,0 +1,31 @@
+==================
+Control Groupstats
+==================
+
+Control Groupstats is inspired by the discussion at
+http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as
+suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263.
+
+Per cgroup statistics infrastructure re-uses code from the taskstats
+interface. A new set of cgroup operations are registered with commands
+and attributes specific to cgroups. It should be very easy to
+extend per cgroup statistics, by adding members to the cgroupstats
+structure.
+
+The current model for cgroupstats is a pull, a push model (to post
+statistics on interesting events), should be very easy to add. Currently
+user space requests for statistics by passing the cgroup path.
+Statistics about the state of all the tasks in the cgroup is returned to
+user space.
+
+NOTE: We currently rely on delay accounting for extracting information
+about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this
+information will not be available.
+
+To extract cgroup statistics a utility very similar to getdelays.c
+has been developed, the sample output of the utility is shown below::
+
+  ~/balbir/cgroupstats # ./getdelays  -C "/sys/fs/cgroup/a"
+  sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0
+  ~/balbir/cgroupstats # ./getdelays  -C "/sys/fs/cgroup"
+  sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2
diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt
deleted file mode 100644
index d16a9849e60e..000000000000
--- a/Documentation/accounting/cgroupstats.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-Control Groupstats is inspired by the discussion at
-http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as
-suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263.
-
-Per cgroup statistics infrastructure re-uses code from the taskstats
-interface. A new set of cgroup operations are registered with commands
-and attributes specific to cgroups. It should be very easy to
-extend per cgroup statistics, by adding members to the cgroupstats
-structure.
-
-The current model for cgroupstats is a pull, a push model (to post
-statistics on interesting events), should be very easy to add. Currently
-user space requests for statistics by passing the cgroup path.
-Statistics about the state of all the tasks in the cgroup is returned to
-user space.
-
-NOTE: We currently rely on delay accounting for extracting information
-about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this
-information will not be available.
-
-To extract cgroup statistics a utility very similar to getdelays.c
-has been developed, the sample output of the utility is shown below
-
-~/balbir/cgroupstats # ./getdelays  -C "/sys/fs/cgroup/a"
-sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0
-~/balbir/cgroupstats # ./getdelays  -C "/sys/fs/cgroup"
-sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2
diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
new file mode 100644
index 000000000000..7cc7f5852da0
--- /dev/null
+++ b/Documentation/accounting/delay-accounting.rst
@@ -0,0 +1,126 @@
+================
+Delay accounting
+================
+
+Tasks encounter delays in execution when they wait
+for some kernel resource to become available e.g. a
+runnable task may wait for a free CPU to run on.
+
+The per-task delay accounting functionality measures
+the delays experienced by a task while
+
+a) waiting for a CPU (while being runnable)
+b) completion of synchronous block I/O initiated by the task
+c) swapping in pages
+d) memory reclaim
+
+and makes these statistics available to userspace through
+the taskstats interface.
+
+Such delays provide feedback for setting a task's cpu priority,
+io priority and rss limit values appropriately. Long delays for
+important tasks could be a trigger for raising its corresponding priority.
+
+The functionality, through its use of the taskstats interface, also provides
+delay statistics aggregated for all tasks (or threads) belonging to a
+thread group (corresponding to a traditional Unix process). This is a commonly
+needed aggregation that is more efficiently done by the kernel.
+
+Userspace utilities, particularly resource management applications, can also
+aggregate delay statistics into arbitrary groups. To enable this, delay
+statistics of a task are available both during its lifetime as well as on its
+exit, ensuring continuous and complete monitoring can be done.
+
+
+Interface
+---------
+
+Delay accounting uses the taskstats interface which is described
+in detail in a separate document in this directory. Taskstats returns a
+generic data structure to userspace corresponding to per-pid and per-tgid
+statistics. The delay accounting functionality populates specific fields of
+this structure. See
+
+     include/linux/taskstats.h
+
+for a description of the fields pertaining to delay accounting.
+It will generally be in the form of counters returning the cumulative
+delay seen for cpu, sync block I/O, swapin, memory reclaim etc.
+
+Taking the difference of two successive readings of a given
+counter (say cpu_delay_total) for a task will give the delay
+experienced by the task waiting for the corresponding resource
+in that interval.
+
+When a task exits, records containing the per-task statistics
+are sent to userspace without requiring a command. If it is the last exiting
+task of a thread group, the per-tgid statistics are also sent. More details
+are given in the taskstats interface description.
+
+The getdelays.c userspace utility in tools/accounting directory allows simple
+commands to be run and the corresponding delay statistics to be displayed. It
+also serves as an example of using the taskstats interface.
+
+Usage
+-----
+
+Compile the kernel with::
+
+	CONFIG_TASK_DELAY_ACCT=y
+	CONFIG_TASKSTATS=y
+
+Delay accounting is enabled by default at boot up.
+To disable, add::
+
+   nodelayacct
+
+to the kernel boot options. The rest of the instructions
+below assume this has not been done.
+
+After the system has booted up, use a utility
+similar to  getdelays.c to access the delays
+seen by a given task or a task group (tgid).
+The utility also allows a given command to be
+executed and the corresponding delays to be
+seen.
+
+General format of the getdelays command::
+
+	getdelays [-t tgid] [-p pid] [-c cmd...]
+
+
+Get delays, since system boot, for pid 10::
+
+	# ./getdelays -p 10
+	(output similar to next case)
+
+Get sum of delays, since system boot, for all pids with tgid 5::
+
+	# ./getdelays -t 5
+
+
+	CPU	count	real total	virtual total	delay total
+		7876	92005750	100000000	24001500
+	IO	count	delay total
+		0	0
+	SWAP	count	delay total
+		0	0
+	RECLAIM	count	delay total
+		0	0
+
+Get delays seen in executing a given simple command::
+
+  # ./getdelays -c ls /
+
+  bin   data1  data3  data5  dev  home  media  opt   root  srv        sys  usr
+  boot  data2  data4  data6  etc  lib   mnt    proc  sbin  subdomain  tmp  var
+
+
+  CPU	count	real total	virtual total	delay total
+	6	4000250		4000000		0
+  IO	count	delay total
+	0	0
+  SWAP	count	delay total
+	0	0
+  RECLAIM	count	delay total
+	0	0
diff --git a/Documentation/accounting/delay-accounting.txt b/Documentation/accounting/delay-accounting.txt
deleted file mode 100644
index 042ea59b5853..000000000000
--- a/Documentation/accounting/delay-accounting.txt
+++ /dev/null
@@ -1,117 +0,0 @@
-Delay accounting
-----------------
-
-Tasks encounter delays in execution when they wait
-for some kernel resource to become available e.g. a
-runnable task may wait for a free CPU to run on.
-
-The per-task delay accounting functionality measures
-the delays experienced by a task while
-
-a) waiting for a CPU (while being runnable)
-b) completion of synchronous block I/O initiated by the task
-c) swapping in pages
-d) memory reclaim
-
-and makes these statistics available to userspace through
-the taskstats interface.
-
-Such delays provide feedback for setting a task's cpu priority,
-io priority and rss limit values appropriately. Long delays for
-important tasks could be a trigger for raising its corresponding priority.
-
-The functionality, through its use of the taskstats interface, also provides
-delay statistics aggregated for all tasks (or threads) belonging to a
-thread group (corresponding to a traditional Unix process). This is a commonly
-needed aggregation that is more efficiently done by the kernel.
-
-Userspace utilities, particularly resource management applications, can also
-aggregate delay statistics into arbitrary groups. To enable this, delay
-statistics of a task are available both during its lifetime as well as on its
-exit, ensuring continuous and complete monitoring can be done.
-
-
-Interface
----------
-
-Delay accounting uses the taskstats interface which is described
-in detail in a separate document in this directory. Taskstats returns a
-generic data structure to userspace corresponding to per-pid and per-tgid
-statistics. The delay accounting functionality populates specific fields of
-this structure. See
-     include/linux/taskstats.h
-for a description of the fields pertaining to delay accounting.
-It will generally be in the form of counters returning the cumulative
-delay seen for cpu, sync block I/O, swapin, memory reclaim etc.
-
-Taking the difference of two successive readings of a given
-counter (say cpu_delay_total) for a task will give the delay
-experienced by the task waiting for the corresponding resource
-in that interval.
-
-When a task exits, records containing the per-task statistics
-are sent to userspace without requiring a command. If it is the last exiting
-task of a thread group, the per-tgid statistics are also sent. More details
-are given in the taskstats interface description.
-
-The getdelays.c userspace utility in tools/accounting directory allows simple
-commands to be run and the corresponding delay statistics to be displayed. It
-also serves as an example of using the taskstats interface.
-
-Usage
------
-
-Compile the kernel with
-	CONFIG_TASK_DELAY_ACCT=y
-	CONFIG_TASKSTATS=y
-
-Delay accounting is enabled by default at boot up.
-To disable, add
-   nodelayacct
-to the kernel boot options. The rest of the instructions
-below assume this has not been done.
-
-After the system has booted up, use a utility
-similar to  getdelays.c to access the delays
-seen by a given task or a task group (tgid).
-The utility also allows a given command to be
-executed and the corresponding delays to be
-seen.
-
-General format of the getdelays command
-
-getdelays [-t tgid] [-p pid] [-c cmd...]
-
-
-Get delays, since system boot, for pid 10
-# ./getdelays -p 10
-(output similar to next case)
-
-Get sum of delays, since system boot, for all pids with tgid 5
-# ./getdelays -t 5
-
-
-CPU	count	real total	virtual total	delay total
-	7876	92005750	100000000	24001500
-IO	count	delay total
-	0	0
-SWAP	count	delay total
-	0	0
-RECLAIM	count	delay total
-	0	0
-
-Get delays seen in executing a given simple command
-# ./getdelays -c ls /
-
-bin   data1  data3  data5  dev  home  media  opt   root  srv        sys  usr
-boot  data2  data4  data6  etc  lib   mnt    proc  sbin  subdomain  tmp  var
-
-
-CPU	count	real total	virtual total	delay total
-	6	4000250		4000000		0
-IO	count	delay total
-	0	0
-SWAP	count	delay total
-	0	0
-RECLAIM	count	delay total
-	0	0
diff --git a/Documentation/accounting/index.rst b/Documentation/accounting/index.rst
new file mode 100644
index 000000000000..9369d8bf32be
--- /dev/null
+++ b/Documentation/accounting/index.rst
@@ -0,0 +1,14 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+Accounting
+==========
+
+.. toctree::
+   :maxdepth: 1
+
+   cgroupstats
+   delay-accounting
+   psi
+   taskstats
+   taskstats-struct
diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst
new file mode 100644
index 000000000000..621111ce5740
--- /dev/null
+++ b/Documentation/accounting/psi.rst
@@ -0,0 +1,182 @@
+================================
+PSI - Pressure Stall Information
+================================
+
+:Date: April, 2018
+:Author: Johannes Weiner <hannes@cmpxchg.org>
+
+When CPU, memory or IO devices are contended, workloads experience
+latency spikes, throughput losses, and run the risk of OOM kills.
+
+Without an accurate measure of such contention, users are forced to
+either play it safe and under-utilize their hardware resources, or
+roll the dice and frequently suffer the disruptions resulting from
+excessive overcommit.
+
+The psi feature identifies and quantifies the disruptions caused by
+such resource crunches and the time impact it has on complex workloads
+or even entire systems.
+
+Having an accurate measure of productivity losses caused by resource
+scarcity aids users in sizing workloads to hardware--or provisioning
+hardware according to workload demand.
+
+As psi aggregates this information in realtime, systems can be managed
+dynamically using techniques such as load shedding, migrating jobs to
+other systems or data centers, or strategically pausing or killing low
+priority or restartable batch jobs.
+
+This allows maximizing hardware utilization without sacrificing
+workload health or risking major disruptions such as OOM kills.
+
+Pressure interface
+==================
+
+Pressure information for each resource is exported through the
+respective file in /proc/pressure/ -- cpu, memory, and io.
+
+The format for CPU is as such::
+
+	some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+and for memory and IO::
+
+	some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+	full avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+The "some" line indicates the share of time in which at least some
+tasks are stalled on a given resource.
+
+The "full" line indicates the share of time in which all non-idle
+tasks are stalled on a given resource simultaneously. In this state
+actual CPU cycles are going to waste, and a workload that spends
+extended time in this state is considered to be thrashing. This has
+severe impact on performance, and it's useful to distinguish this
+situation from a state where some tasks are stalled but the CPU is
+still doing productive work. As such, time spent in this subset of the
+stall state is tracked separately and exported in the "full" averages.
+
+The ratios (in %) are tracked as recent trends over ten, sixty, and
+three hundred second windows, which gives insight into short term events
+as well as medium and long term trends. The total absolute stall time
+(in us) is tracked and exported as well, to allow detection of latency
+spikes which wouldn't necessarily make a dent in the time averages,
+or to average trends over custom time frames.
+
+Monitoring for pressure thresholds
+==================================
+
+Users can register triggers and use poll() to be woken up when resource
+pressure exceeds certain thresholds.
+
+A trigger describes the maximum cumulative stall time over a specific
+time window, e.g. 100ms of total stall time within any 500ms window to
+generate a wakeup event.
+
+To register a trigger user has to open psi interface file under
+/proc/pressure/ representing the resource to be monitored and write the
+desired threshold and time window. The open file descriptor should be
+used to wait for trigger events using select(), poll() or epoll().
+The following format is used::
+
+	<some|full> <stall amount in us> <time window in us>
+
+For example writing "some 150000 1000000" into /proc/pressure/memory
+would add 150ms threshold for partial memory stall measured within
+1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
+would add 50ms threshold for full io stall measured within 1sec time window.
+
+Triggers can be set on more than one psi metric and more than one trigger
+for the same psi metric can be specified. However for each trigger a separate
+file descriptor is required to be able to poll it separately from others,
+therefore for each trigger a separate open() syscall should be made even
+when opening the same psi interface file.
+
+Monitors activate only when system enters stall state for the monitored
+psi metric and deactivates upon exit from the stall state. While system is
+in the stall state psi signal growth is monitored at a rate of 10 times per
+tracking window.
+
+The kernel accepts window sizes ranging from 500ms to 10s, therefore min
+monitoring update interval is 50ms and max is 1s. Min limit is set to
+prevent overly frequent polling. Max limit is chosen as a high enough number
+after which monitors are most likely not needed and psi averages can be used
+instead.
+
+When activated, psi monitor stays active for at least the duration of one
+tracking window to avoid repeated activations/deactivations when system is
+bouncing in and out of the stall state.
+
+Notifications to the userspace are rate-limited to one per tracking window.
+
+The trigger will de-register when the file descriptor used to define the
+trigger  is closed.
+
+Userspace monitor usage example
+===============================
+
+::
+
+  #include <errno.h>
+  #include <fcntl.h>
+  #include <stdio.h>
+  #include <poll.h>
+  #include <string.h>
+  #include <unistd.h>
+
+  /*
+   * Monitor memory partial stall with 1s tracking window size
+   * and 150ms threshold.
+   */
+  int main() {
+	const char trig[] = "some 150000 1000000";
+	struct pollfd fds;
+	int n;
+
+	fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
+	if (fds.fd < 0) {
+		printf("/proc/pressure/memory open error: %s\n",
+			strerror(errno));
+		return 1;
+	}
+	fds.events = POLLPRI;
+
+	if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
+		printf("/proc/pressure/memory write error: %s\n",
+			strerror(errno));
+		return 1;
+	}
+
+	printf("waiting for events...\n");
+	while (1) {
+		n = poll(&fds, 1, -1);
+		if (n < 0) {
+			printf("poll error: %s\n", strerror(errno));
+			return 1;
+		}
+		if (fds.revents & POLLERR) {
+			printf("got POLLERR, event source is gone\n");
+			return 0;
+		}
+		if (fds.revents & POLLPRI) {
+			printf("event triggered!\n");
+		} else {
+			printf("unknown event received: 0x%x\n", fds.revents);
+			return 1;
+		}
+	}
+
+	return 0;
+  }
+
+Cgroup2 interface
+=================
+
+In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem
+mounted, pressure stall information is also tracked for tasks grouped
+into cgroups. Each subdirectory in the cgroupfs mountpoint contains
+cpu.pressure, memory.pressure, and io.pressure files; the format is
+the same as the /proc/pressure/ files.
+
+Per-cgroup psi monitors can be specified and used the same way as
+system-wide ones.
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt
deleted file mode 100644
index 5cbe5659e3b7..000000000000
--- a/Documentation/accounting/psi.txt
+++ /dev/null
@@ -1,180 +0,0 @@
-================================
-PSI - Pressure Stall Information
-================================
-
-:Date: April, 2018
-:Author: Johannes Weiner <hannes@cmpxchg.org>
-
-When CPU, memory or IO devices are contended, workloads experience
-latency spikes, throughput losses, and run the risk of OOM kills.
-
-Without an accurate measure of such contention, users are forced to
-either play it safe and under-utilize their hardware resources, or
-roll the dice and frequently suffer the disruptions resulting from
-excessive overcommit.
-
-The psi feature identifies and quantifies the disruptions caused by
-such resource crunches and the time impact it has on complex workloads
-or even entire systems.
-
-Having an accurate measure of productivity losses caused by resource
-scarcity aids users in sizing workloads to hardware--or provisioning
-hardware according to workload demand.
-
-As psi aggregates this information in realtime, systems can be managed
-dynamically using techniques such as load shedding, migrating jobs to
-other systems or data centers, or strategically pausing or killing low
-priority or restartable batch jobs.
-
-This allows maximizing hardware utilization without sacrificing
-workload health or risking major disruptions such as OOM kills.
-
-Pressure interface
-==================
-
-Pressure information for each resource is exported through the
-respective file in /proc/pressure/ -- cpu, memory, and io.
-
-The format for CPU is as such:
-
-some avg10=0.00 avg60=0.00 avg300=0.00 total=0
-
-and for memory and IO:
-
-some avg10=0.00 avg60=0.00 avg300=0.00 total=0
-full avg10=0.00 avg60=0.00 avg300=0.00 total=0
-
-The "some" line indicates the share of time in which at least some
-tasks are stalled on a given resource.
-
-The "full" line indicates the share of time in which all non-idle
-tasks are stalled on a given resource simultaneously. In this state
-actual CPU cycles are going to waste, and a workload that spends
-extended time in this state is considered to be thrashing. This has
-severe impact on performance, and it's useful to distinguish this
-situation from a state where some tasks are stalled but the CPU is
-still doing productive work. As such, time spent in this subset of the
-stall state is tracked separately and exported in the "full" averages.
-
-The ratios (in %) are tracked as recent trends over ten, sixty, and
-three hundred second windows, which gives insight into short term events
-as well as medium and long term trends. The total absolute stall time
-(in us) is tracked and exported as well, to allow detection of latency
-spikes which wouldn't necessarily make a dent in the time averages,
-or to average trends over custom time frames.
-
-Monitoring for pressure thresholds
-==================================
-
-Users can register triggers and use poll() to be woken up when resource
-pressure exceeds certain thresholds.
-
-A trigger describes the maximum cumulative stall time over a specific
-time window, e.g. 100ms of total stall time within any 500ms window to
-generate a wakeup event.
-
-To register a trigger user has to open psi interface file under
-/proc/pressure/ representing the resource to be monitored and write the
-desired threshold and time window. The open file descriptor should be
-used to wait for trigger events using select(), poll() or epoll().
-The following format is used:
-
-<some|full> <stall amount in us> <time window in us>
-
-For example writing "some 150000 1000000" into /proc/pressure/memory
-would add 150ms threshold for partial memory stall measured within
-1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
-would add 50ms threshold for full io stall measured within 1sec time window.
-
-Triggers can be set on more than one psi metric and more than one trigger
-for the same psi metric can be specified. However for each trigger a separate
-file descriptor is required to be able to poll it separately from others,
-therefore for each trigger a separate open() syscall should be made even
-when opening the same psi interface file.
-
-Monitors activate only when system enters stall state for the monitored
-psi metric and deactivates upon exit from the stall state. While system is
-in the stall state psi signal growth is monitored at a rate of 10 times per
-tracking window.
-
-The kernel accepts window sizes ranging from 500ms to 10s, therefore min
-monitoring update interval is 50ms and max is 1s. Min limit is set to
-prevent overly frequent polling. Max limit is chosen as a high enough number
-after which monitors are most likely not needed and psi averages can be used
-instead.
-
-When activated, psi monitor stays active for at least the duration of one
-tracking window to avoid repeated activations/deactivations when system is
-bouncing in and out of the stall state.
-
-Notifications to the userspace are rate-limited to one per tracking window.
-
-The trigger will de-register when the file descriptor used to define the
-trigger  is closed.
-
-Userspace monitor usage example
-===============================
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <poll.h>
-#include <string.h>
-#include <unistd.h>
-
-/*
- * Monitor memory partial stall with 1s tracking window size
- * and 150ms threshold.
- */
-int main() {
-	const char trig[] = "some 150000 1000000";
-	struct pollfd fds;
-	int n;
-
-	fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
-	if (fds.fd < 0) {
-		printf("/proc/pressure/memory open error: %s\n",
-			strerror(errno));
-		return 1;
-	}
-	fds.events = POLLPRI;
-
-	if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
-		printf("/proc/pressure/memory write error: %s\n",
-			strerror(errno));
-		return 1;
-	}
-
-	printf("waiting for events...\n");
-	while (1) {
-		n = poll(&fds, 1, -1);
-		if (n < 0) {
-			printf("poll error: %s\n", strerror(errno));
-			return 1;
-		}
-		if (fds.revents & POLLERR) {
-			printf("got POLLERR, event source is gone\n");
-			return 0;
-		}
-		if (fds.revents & POLLPRI) {
-			printf("event triggered!\n");
-		} else {
-			printf("unknown event received: 0x%x\n", fds.revents);
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
-Cgroup2 interface
-=================
-
-In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem
-mounted, pressure stall information is also tracked for tasks grouped
-into cgroups. Each subdirectory in the cgroupfs mountpoint contains
-cpu.pressure, memory.pressure, and io.pressure files; the format is
-the same as the /proc/pressure/ files.
-
-Per-cgroup psi monitors can be specified and used the same way as
-system-wide ones.
diff --git a/Documentation/accounting/taskstats-struct.rst b/Documentation/accounting/taskstats-struct.rst
new file mode 100644
index 000000000000..ca90fd489c9a
--- /dev/null
+++ b/Documentation/accounting/taskstats-struct.rst
@@ -0,0 +1,199 @@
+====================
+The struct taskstats
+====================
+
+This document contains an explanation of the struct taskstats fields.
+
+There are three different groups of fields in the struct taskstats:
+
+1) Common and basic accounting fields
+    If CONFIG_TASKSTATS is set, the taskstats interface is enabled and
+    the common fields and basic accounting fields are collected for
+    delivery at do_exit() of a task.
+2) Delay accounting fields
+    These fields are placed between::
+
+	/* Delay accounting fields start */
+
+    and::
+
+	/* Delay accounting fields end */
+
+    Their values are collected if CONFIG_TASK_DELAY_ACCT is set.
+3) Extended accounting fields
+    These fields are placed between::
+
+	/* Extended accounting fields start */
+
+    and::
+
+	/* Extended accounting fields end */
+
+    Their values are collected if CONFIG_TASK_XACCT is set.
+
+4) Per-task and per-thread context switch count statistics
+
+5) Time accounting for SMT machines
+
+6) Extended delay accounting fields for memory reclaim
+
+Future extension should add fields to the end of the taskstats struct, and
+should not change the relative position of each field within the struct.
+
+::
+
+  struct taskstats {
+
+1) Common and basic accounting fields::
+
+	/* The version number of this struct. This field is always set to
+	 * TAKSTATS_VERSION, which is defined in <linux/taskstats.h>.
+	 * Each time the struct is changed, the value should be incremented.
+	 */
+	__u16	version;
+
+	/* The exit code of a task. */
+	__u32	ac_exitcode;		/* Exit status */
+
+	/* The accounting flags of a task as defined in <linux/acct.h>
+	 * Defined values are AFORK, ASU, ACOMPAT, ACORE, and AXSIG.
+	 */
+	__u8	ac_flag;		/* Record flags */
+
+	/* The value of task_nice() of a task. */
+	__u8	ac_nice;		/* task_nice */
+
+	/* The name of the command that started this task. */
+	char	ac_comm[TS_COMM_LEN];	/* Command name */
+
+	/* The scheduling discipline as set in task->policy field. */
+	__u8	ac_sched;		/* Scheduling discipline */
+
+	__u8	ac_pad[3];
+	__u32	ac_uid;			/* User ID */
+	__u32	ac_gid;			/* Group ID */
+	__u32	ac_pid;			/* Process ID */
+	__u32	ac_ppid;		/* Parent process ID */
+
+	/* The time when a task begins, in [secs] since 1970. */
+	__u32	ac_btime;		/* Begin time [sec since 1970] */
+
+	/* The elapsed time of a task, in [usec]. */
+	__u64	ac_etime;		/* Elapsed time [usec] */
+
+	/* The user CPU time of a task, in [usec]. */
+	__u64	ac_utime;		/* User CPU time [usec] */
+
+	/* The system CPU time of a task, in [usec]. */
+	__u64	ac_stime;		/* System CPU time [usec] */
+
+	/* The minor page fault count of a task, as set in task->min_flt. */
+	__u64	ac_minflt;		/* Minor Page Fault Count */
+
+	/* The major page fault count of a task, as set in task->maj_flt. */
+	__u64	ac_majflt;		/* Major Page Fault Count */
+
+
+2) Delay accounting fields::
+
+	/* Delay accounting fields start
+	 *
+	 * All values, until the comment "Delay accounting fields end" are
+	 * available only if delay accounting is enabled, even though the last
+	 * few fields are not delays
+	 *
+	 * xxx_count is the number of delay values recorded
+	 * xxx_delay_total is the corresponding cumulative delay in nanoseconds
+	 *
+	 * xxx_delay_total wraps around to zero on overflow
+	 * xxx_count incremented regardless of overflow
+	 */
+
+	/* Delay waiting for cpu, while runnable
+	 * count, delay_total NOT updated atomically
+	 */
+	__u64	cpu_count;
+	__u64	cpu_delay_total;
+
+	/* Following four fields atomically updated using task->delays->lock */
+
+	/* Delay waiting for synchronous block I/O to complete
+	 * does not account for delays in I/O submission
+	 */
+	__u64	blkio_count;
+	__u64	blkio_delay_total;
+
+	/* Delay waiting for page fault I/O (swap in only) */
+	__u64	swapin_count;
+	__u64	swapin_delay_total;
+
+	/* cpu "wall-clock" running time
+	 * On some architectures, value will adjust for cpu time stolen
+	 * from the kernel in involuntary waits due to virtualization.
+	 * Value is cumulative, in nanoseconds, without a corresponding count
+	 * and wraps around to zero silently on overflow
+	 */
+	__u64	cpu_run_real_total;
+
+	/* cpu "virtual" running time
+	 * Uses time intervals seen by the kernel i.e. no adjustment
+	 * for kernel's involuntary waits due to virtualization.
+	 * Value is cumulative, in nanoseconds, without a corresponding count
+	 * and wraps around to zero silently on overflow
+	 */
+	__u64	cpu_run_virtual_total;
+	/* Delay accounting fields end */
+	/* version 1 ends here */
+
+
+3) Extended accounting fields::
+
+	/* Extended accounting fields start */
+
+	/* Accumulated RSS usage in duration of a task, in MBytes-usecs.
+	 * The current rss usage is added to this counter every time
+	 * a tick is charged to a task's system time. So, at the end we
+	 * will have memory usage multiplied by system time. Thus an
+	 * average usage per system time unit can be calculated.
+	 */
+	__u64	coremem;		/* accumulated RSS usage in MB-usec */
+
+	/* Accumulated virtual memory usage in duration of a task.
+	 * Same as acct_rss_mem1 above except that we keep track of VM usage.
+	 */
+	__u64	virtmem;		/* accumulated VM usage in MB-usec */
+
+	/* High watermark of RSS usage in duration of a task, in KBytes. */
+	__u64	hiwater_rss;		/* High-watermark of RSS usage */
+
+	/* High watermark of VM  usage in duration of a task, in KBytes. */
+	__u64	hiwater_vm;		/* High-water virtual memory usage */
+
+	/* The following four fields are I/O statistics of a task. */
+	__u64	read_char;		/* bytes read */
+	__u64	write_char;		/* bytes written */
+	__u64	read_syscalls;		/* read syscalls */
+	__u64	write_syscalls;		/* write syscalls */
+
+	/* Extended accounting fields end */
+
+4) Per-task and per-thread statistics::
+
+	__u64	nvcsw;			/* Context voluntary switch counter */
+	__u64	nivcsw;			/* Context involuntary switch counter */
+
+5) Time accounting for SMT machines::
+
+	__u64	ac_utimescaled;		/* utime scaled on frequency etc */
+	__u64	ac_stimescaled;		/* stime scaled on frequency etc */
+	__u64	cpu_scaled_run_real_total; /* scaled cpu_run_real_total */
+
+6) Extended delay accounting fields for memory reclaim::
+
+	/* Delay waiting for memory reclaim */
+	__u64	freepages_count;
+	__u64	freepages_delay_total;
+
+::
+
+  }
diff --git a/Documentation/accounting/taskstats-struct.txt b/Documentation/accounting/taskstats-struct.txt
deleted file mode 100644
index e7512c061c15..000000000000
--- a/Documentation/accounting/taskstats-struct.txt
+++ /dev/null
@@ -1,180 +0,0 @@
-The struct taskstats
---------------------
-
-This document contains an explanation of the struct taskstats fields.
-
-There are three different groups of fields in the struct taskstats:
-
-1) Common and basic accounting fields
-    If CONFIG_TASKSTATS is set, the taskstats interface is enabled and
-    the common fields and basic accounting fields are collected for
-    delivery at do_exit() of a task.
-2) Delay accounting fields
-    These fields are placed between
-    /* Delay accounting fields start */
-    and
-    /* Delay accounting fields end */
-    Their values are collected if CONFIG_TASK_DELAY_ACCT is set.
-3) Extended accounting fields
-    These fields are placed between
-    /* Extended accounting fields start */
-    and
-    /* Extended accounting fields end */
-    Their values are collected if CONFIG_TASK_XACCT is set.
-
-4) Per-task and per-thread context switch count statistics
-
-5) Time accounting for SMT machines
-
-6) Extended delay accounting fields for memory reclaim
-
-Future extension should add fields to the end of the taskstats struct, and
-should not change the relative position of each field within the struct.
-
-
-struct taskstats {
-
-1) Common and basic accounting fields:
-	/* The version number of this struct. This field is always set to
-	 * TAKSTATS_VERSION, which is defined in <linux/taskstats.h>.
-	 * Each time the struct is changed, the value should be incremented.
-	 */
-	__u16	version;
-
-  	/* The exit code of a task. */
-	__u32	ac_exitcode;		/* Exit status */
-
-  	/* The accounting flags of a task as defined in <linux/acct.h>
-	 * Defined values are AFORK, ASU, ACOMPAT, ACORE, and AXSIG.
-	 */
-	__u8	ac_flag;		/* Record flags */
-
-  	/* The value of task_nice() of a task. */
-	__u8	ac_nice;		/* task_nice */
-
-  	/* The name of the command that started this task. */
-	char	ac_comm[TS_COMM_LEN];	/* Command name */
-
-  	/* The scheduling discipline as set in task->policy field. */
-	__u8	ac_sched;		/* Scheduling discipline */
-
-	__u8	ac_pad[3];
-	__u32	ac_uid;			/* User ID */
-	__u32	ac_gid;			/* Group ID */
-	__u32	ac_pid;			/* Process ID */
-	__u32	ac_ppid;		/* Parent process ID */
-
-  	/* The time when a task begins, in [secs] since 1970. */
-	__u32	ac_btime;		/* Begin time [sec since 1970] */
-
-  	/* The elapsed time of a task, in [usec]. */
-	__u64	ac_etime;		/* Elapsed time [usec] */
-
-  	/* The user CPU time of a task, in [usec]. */
-	__u64	ac_utime;		/* User CPU time [usec] */
-
-  	/* The system CPU time of a task, in [usec]. */
-	__u64	ac_stime;		/* System CPU time [usec] */
-
-  	/* The minor page fault count of a task, as set in task->min_flt. */
-	__u64	ac_minflt;		/* Minor Page Fault Count */
-
-	/* The major page fault count of a task, as set in task->maj_flt. */
-	__u64	ac_majflt;		/* Major Page Fault Count */
-
-
-2) Delay accounting fields:
-	/* Delay accounting fields start
-	 *
-	 * All values, until the comment "Delay accounting fields end" are
-	 * available only if delay accounting is enabled, even though the last
-	 * few fields are not delays
-	 *
-	 * xxx_count is the number of delay values recorded
-	 * xxx_delay_total is the corresponding cumulative delay in nanoseconds
-	 *
-	 * xxx_delay_total wraps around to zero on overflow
-	 * xxx_count incremented regardless of overflow
-	 */
-
-	/* Delay waiting for cpu, while runnable
-	 * count, delay_total NOT updated atomically
-	 */
-	__u64	cpu_count;
-	__u64	cpu_delay_total;
-
-	/* Following four fields atomically updated using task->delays->lock */
-
-	/* Delay waiting for synchronous block I/O to complete
-	 * does not account for delays in I/O submission
-	 */
-	__u64	blkio_count;
-	__u64	blkio_delay_total;
-
-	/* Delay waiting for page fault I/O (swap in only) */
-	__u64	swapin_count;
-	__u64	swapin_delay_total;
-
-	/* cpu "wall-clock" running time
-	 * On some architectures, value will adjust for cpu time stolen
-	 * from the kernel in involuntary waits due to virtualization.
-	 * Value is cumulative, in nanoseconds, without a corresponding count
-	 * and wraps around to zero silently on overflow
-	 */
-	__u64	cpu_run_real_total;
-
-	/* cpu "virtual" running time
-	 * Uses time intervals seen by the kernel i.e. no adjustment
-	 * for kernel's involuntary waits due to virtualization.
-	 * Value is cumulative, in nanoseconds, without a corresponding count
-	 * and wraps around to zero silently on overflow
-	 */
-	__u64	cpu_run_virtual_total;
-	/* Delay accounting fields end */
-	/* version 1 ends here */
-
-
-3) Extended accounting fields
-	/* Extended accounting fields start */
-
-	/* Accumulated RSS usage in duration of a task, in MBytes-usecs.
-	 * The current rss usage is added to this counter every time
-	 * a tick is charged to a task's system time. So, at the end we
-	 * will have memory usage multiplied by system time. Thus an
-	 * average usage per system time unit can be calculated.
-	 */
-	__u64	coremem;		/* accumulated RSS usage in MB-usec */
-
-  	/* Accumulated virtual memory usage in duration of a task.
-	 * Same as acct_rss_mem1 above except that we keep track of VM usage.
-	 */
-	__u64	virtmem;		/* accumulated VM usage in MB-usec */
-
-  	/* High watermark of RSS usage in duration of a task, in KBytes. */
-	__u64	hiwater_rss;		/* High-watermark of RSS usage */
-
-  	/* High watermark of VM  usage in duration of a task, in KBytes. */
-	__u64	hiwater_vm;		/* High-water virtual memory usage */
-
-	/* The following four fields are I/O statistics of a task. */
-	__u64	read_char;		/* bytes read */
-	__u64	write_char;		/* bytes written */
-	__u64	read_syscalls;		/* read syscalls */
-	__u64	write_syscalls;		/* write syscalls */
-
-	/* Extended accounting fields end */
-
-4) Per-task and per-thread statistics
-	__u64	nvcsw;			/* Context voluntary switch counter */
-	__u64	nivcsw;			/* Context involuntary switch counter */
-
-5) Time accounting for SMT machines
-	__u64	ac_utimescaled;		/* utime scaled on frequency etc */
-	__u64	ac_stimescaled;		/* stime scaled on frequency etc */
-	__u64	cpu_scaled_run_real_total; /* scaled cpu_run_real_total */
-
-6) Extended delay accounting fields for memory reclaim
-	/* Delay waiting for memory reclaim */
-	__u64	freepages_count;
-	__u64	freepages_delay_total;
-}
diff --git a/Documentation/accounting/taskstats.rst b/Documentation/accounting/taskstats.rst
new file mode 100644
index 000000000000..2a28b7f55c10
--- /dev/null
+++ b/Documentation/accounting/taskstats.rst
@@ -0,0 +1,180 @@
+=============================
+Per-task statistics interface
+=============================
+
+
+Taskstats is a netlink-based interface for sending per-task and
+per-process statistics from the kernel to userspace.
+
+Taskstats was designed for the following benefits:
+
+- efficiently provide statistics during lifetime of a task and on its exit
+- unified interface for multiple accounting subsystems
+- extensibility for use by future accounting patches
+
+Terminology
+-----------
+
+"pid", "tid" and "task" are used interchangeably and refer to the standard
+Linux task defined by struct task_struct.  per-pid stats are the same as
+per-task stats.
+
+"tgid", "process" and "thread group" are used interchangeably and refer to the
+tasks that share an mm_struct i.e. the traditional Unix process. Despite the
+use of tgid, there is no special treatment for the task that is thread group
+leader - a process is deemed alive as long as it has any task belonging to it.
+
+Usage
+-----
+
+To get statistics during a task's lifetime, userspace opens a unicast netlink
+socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid.
+The response contains statistics for a task (if pid is specified) or the sum of
+statistics for all tasks of the process (if tgid is specified).
+
+To obtain statistics for tasks which are exiting, the userspace listener
+sends a register command and specifies a cpumask. Whenever a task exits on
+one of the cpus in the cpumask, its per-pid statistics are sent to the
+registered listener. Using cpumasks allows the data received by one listener
+to be limited and assists in flow control over the netlink interface and is
+explained in more detail below.
+
+If the exiting task is the last thread exiting its thread group,
+an additional record containing the per-tgid stats is also sent to userspace.
+The latter contains the sum of per-pid stats for all threads in the thread
+group, both past and present.
+
+getdelays.c is a simple utility demonstrating usage of the taskstats interface
+for reporting delay accounting statistics. Users can register cpumasks,
+send commands and process responses, listen for per-tid/tgid exit data,
+write the data received to a file and do basic flow control by increasing
+receive buffer sizes.
+
+Interface
+---------
+
+The user-kernel interface is encapsulated in include/linux/taskstats.h
+
+To avoid this documentation becoming obsolete as the interface evolves, only
+an outline of the current version is given. taskstats.h always overrides the
+description here.
+
+struct taskstats is the common accounting structure for both per-pid and
+per-tgid data. It is versioned and can be extended by each accounting subsystem
+that is added to the kernel. The fields and their semantics are defined in the
+taskstats.h file.
+
+The data exchanged between user and kernel space is a netlink message belonging
+to the NETLINK_GENERIC family and using the netlink attributes interface.
+The messages are in the format::
+
+    +----------+- - -+-------------+-------------------+
+    | nlmsghdr | Pad |  genlmsghdr | taskstats payload |
+    +----------+- - -+-------------+-------------------+
+
+
+The taskstats payload is one of the following three kinds:
+
+1. Commands: Sent from user to kernel. Commands to get data on
+a pid/tgid consist of one attribute, of type TASKSTATS_CMD_ATTR_PID/TGID,
+containing a u32 pid or tgid in the attribute payload. The pid/tgid denotes
+the task/process for which userspace wants statistics.
+
+Commands to register/deregister interest in exit data from a set of cpus
+consist of one attribute, of type
+TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK and contain a cpumask in the
+attribute payload. The cpumask is specified as an ascii string of
+comma-separated cpu ranges e.g. to listen to exit data from cpus 1,2,3,5,7,8
+the cpumask would be "1-3,5,7-8". If userspace forgets to deregister interest
+in cpus before closing the listening socket, the kernel cleans up its interest
+set over time. However, for the sake of efficiency, an explicit deregistration
+is advisable.
+
+2. Response for a command: sent from the kernel in response to a userspace
+command. The payload is a series of three attributes of type:
+
+a) TASKSTATS_TYPE_AGGR_PID/TGID : attribute containing no payload but indicates
+a pid/tgid will be followed by some stats.
+
+b) TASKSTATS_TYPE_PID/TGID: attribute whose payload is the pid/tgid whose stats
+are being returned.
+
+c) TASKSTATS_TYPE_STATS: attribute with a struct taskstats as payload. The
+same structure is used for both per-pid and per-tgid stats.
+
+3. New message sent by kernel whenever a task exits. The payload consists of a
+   series of attributes of the following type:
+
+a) TASKSTATS_TYPE_AGGR_PID: indicates next two attributes will be pid+stats
+b) TASKSTATS_TYPE_PID: contains exiting task's pid
+c) TASKSTATS_TYPE_STATS: contains the exiting task's per-pid stats
+d) TASKSTATS_TYPE_AGGR_TGID: indicates next two attributes will be tgid+stats
+e) TASKSTATS_TYPE_TGID: contains tgid of process to which task belongs
+f) TASKSTATS_TYPE_STATS: contains the per-tgid stats for exiting task's process
+
+
+per-tgid stats
+--------------
+
+Taskstats provides per-process stats, in addition to per-task stats, since
+resource management is often done at a process granularity and aggregating task
+stats in userspace alone is inefficient and potentially inaccurate (due to lack
+of atomicity).
+
+However, maintaining per-process, in addition to per-task stats, within the
+kernel has space and time overheads. To address this, the taskstats code
+accumulates each exiting task's statistics into a process-wide data structure.
+When the last task of a process exits, the process level data accumulated also
+gets sent to userspace (along with the per-task data).
+
+When a user queries to get per-tgid data, the sum of all other live threads in
+the group is added up and added to the accumulated total for previously exited
+threads of the same thread group.
+
+Extending taskstats
+-------------------
+
+There are two ways to extend the taskstats interface to export more
+per-task/process stats as patches to collect them get added to the kernel
+in future:
+
+1. Adding more fields to the end of the existing struct taskstats. Backward
+   compatibility is ensured by the version number within the
+   structure. Userspace will use only the fields of the struct that correspond
+   to the version its using.
+
+2. Defining separate statistic structs and using the netlink attributes
+   interface to return them. Since userspace processes each netlink attribute
+   independently, it can always ignore attributes whose type it does not
+   understand (because it is using an older version of the interface).
+
+
+Choosing between 1. and 2. is a matter of trading off flexibility and
+overhead. If only a few fields need to be added, then 1. is the preferable
+path since the kernel and userspace don't need to incur the overhead of
+processing new netlink attributes. But if the new fields expand the existing
+struct too much, requiring disparate userspace accounting utilities to
+unnecessarily receive large structures whose fields are of no interest, then
+extending the attributes structure would be worthwhile.
+
+Flow control for taskstats
+--------------------------
+
+When the rate of task exits becomes large, a listener may not be able to keep
+up with the kernel's rate of sending per-tid/tgid exit data leading to data
+loss. This possibility gets compounded when the taskstats structure gets
+extended and the number of cpus grows large.
+
+To avoid losing statistics, userspace should do one or more of the following:
+
+- increase the receive buffer sizes for the netlink sockets opened by
+  listeners to receive exit data.
+
+- create more listeners and reduce the number of cpus being listened to by
+  each listener. In the extreme case, there could be one listener for each cpu.
+  Users may also consider setting the cpu affinity of the listener to the subset
+  of cpus to which it listens, especially if they are listening to just one cpu.
+
+Despite these measures, if the userspace receives ENOBUFS error messages
+indicated overflow of receive buffers, it should take measures to handle the
+loss of data.
diff --git a/Documentation/accounting/taskstats.txt b/Documentation/accounting/taskstats.txt
deleted file mode 100644
index ff06b738bb88..000000000000
--- a/Documentation/accounting/taskstats.txt
+++ /dev/null
@@ -1,181 +0,0 @@
-Per-task statistics interface
------------------------------
-
-
-Taskstats is a netlink-based interface for sending per-task and
-per-process statistics from the kernel to userspace.
-
-Taskstats was designed for the following benefits:
-
-- efficiently provide statistics during lifetime of a task and on its exit
-- unified interface for multiple accounting subsystems
-- extensibility for use by future accounting patches
-
-Terminology
------------
-
-"pid", "tid" and "task" are used interchangeably and refer to the standard
-Linux task defined by struct task_struct.  per-pid stats are the same as
-per-task stats.
-
-"tgid", "process" and "thread group" are used interchangeably and refer to the
-tasks that share an mm_struct i.e. the traditional Unix process. Despite the
-use of tgid, there is no special treatment for the task that is thread group
-leader - a process is deemed alive as long as it has any task belonging to it.
-
-Usage
------
-
-To get statistics during a task's lifetime, userspace opens a unicast netlink
-socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid.
-The response contains statistics for a task (if pid is specified) or the sum of
-statistics for all tasks of the process (if tgid is specified).
-
-To obtain statistics for tasks which are exiting, the userspace listener
-sends a register command and specifies a cpumask. Whenever a task exits on
-one of the cpus in the cpumask, its per-pid statistics are sent to the
-registered listener. Using cpumasks allows the data received by one listener
-to be limited and assists in flow control over the netlink interface and is
-explained in more detail below.
-
-If the exiting task is the last thread exiting its thread group,
-an additional record containing the per-tgid stats is also sent to userspace.
-The latter contains the sum of per-pid stats for all threads in the thread
-group, both past and present.
-
-getdelays.c is a simple utility demonstrating usage of the taskstats interface
-for reporting delay accounting statistics. Users can register cpumasks,
-send commands and process responses, listen for per-tid/tgid exit data,
-write the data received to a file and do basic flow control by increasing
-receive buffer sizes.
-
-Interface
----------
-
-The user-kernel interface is encapsulated in include/linux/taskstats.h
-
-To avoid this documentation becoming obsolete as the interface evolves, only
-an outline of the current version is given. taskstats.h always overrides the
-description here.
-
-struct taskstats is the common accounting structure for both per-pid and
-per-tgid data. It is versioned and can be extended by each accounting subsystem
-that is added to the kernel. The fields and their semantics are defined in the
-taskstats.h file.
-
-The data exchanged between user and kernel space is a netlink message belonging
-to the NETLINK_GENERIC family and using the netlink attributes interface.
-The messages are in the format
-
-    +----------+- - -+-------------+-------------------+
-    | nlmsghdr | Pad |  genlmsghdr | taskstats payload |
-    +----------+- - -+-------------+-------------------+
-
-
-The taskstats payload is one of the following three kinds:
-
-1. Commands: Sent from user to kernel. Commands to get data on
-a pid/tgid consist of one attribute, of type TASKSTATS_CMD_ATTR_PID/TGID,
-containing a u32 pid or tgid in the attribute payload. The pid/tgid denotes
-the task/process for which userspace wants statistics.
-
-Commands to register/deregister interest in exit data from a set of cpus
-consist of one attribute, of type
-TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK and contain a cpumask in the
-attribute payload. The cpumask is specified as an ascii string of
-comma-separated cpu ranges e.g. to listen to exit data from cpus 1,2,3,5,7,8
-the cpumask would be "1-3,5,7-8". If userspace forgets to deregister interest
-in cpus before closing the listening socket, the kernel cleans up its interest
-set over time. However, for the sake of efficiency, an explicit deregistration
-is advisable.
-
-2. Response for a command: sent from the kernel in response to a userspace
-command. The payload is a series of three attributes of type:
-
-a) TASKSTATS_TYPE_AGGR_PID/TGID : attribute containing no payload but indicates
-a pid/tgid will be followed by some stats.
-
-b) TASKSTATS_TYPE_PID/TGID: attribute whose payload is the pid/tgid whose stats
-are being returned.
-
-c) TASKSTATS_TYPE_STATS: attribute with a struct taskstats as payload. The
-same structure is used for both per-pid and per-tgid stats.
-
-3. New message sent by kernel whenever a task exits. The payload consists of a
-   series of attributes of the following type:
-
-a) TASKSTATS_TYPE_AGGR_PID: indicates next two attributes will be pid+stats
-b) TASKSTATS_TYPE_PID: contains exiting task's pid
-c) TASKSTATS_TYPE_STATS: contains the exiting task's per-pid stats
-d) TASKSTATS_TYPE_AGGR_TGID: indicates next two attributes will be tgid+stats
-e) TASKSTATS_TYPE_TGID: contains tgid of process to which task belongs
-f) TASKSTATS_TYPE_STATS: contains the per-tgid stats for exiting task's process
-
-
-per-tgid stats
---------------
-
-Taskstats provides per-process stats, in addition to per-task stats, since
-resource management is often done at a process granularity and aggregating task
-stats in userspace alone is inefficient and potentially inaccurate (due to lack
-of atomicity).
-
-However, maintaining per-process, in addition to per-task stats, within the
-kernel has space and time overheads. To address this, the taskstats code
-accumulates each exiting task's statistics into a process-wide data structure.
-When the last task of a process exits, the process level data accumulated also
-gets sent to userspace (along with the per-task data).
-
-When a user queries to get per-tgid data, the sum of all other live threads in
-the group is added up and added to the accumulated total for previously exited
-threads of the same thread group.
-
-Extending taskstats
--------------------
-
-There are two ways to extend the taskstats interface to export more
-per-task/process stats as patches to collect them get added to the kernel
-in future:
-
-1. Adding more fields to the end of the existing struct taskstats. Backward
-   compatibility is ensured by the version number within the
-   structure. Userspace will use only the fields of the struct that correspond
-   to the version its using.
-
-2. Defining separate statistic structs and using the netlink attributes
-   interface to return them. Since userspace processes each netlink attribute
-   independently, it can always ignore attributes whose type it does not
-   understand (because it is using an older version of the interface).
-
-
-Choosing between 1. and 2. is a matter of trading off flexibility and
-overhead. If only a few fields need to be added, then 1. is the preferable
-path since the kernel and userspace don't need to incur the overhead of
-processing new netlink attributes. But if the new fields expand the existing
-struct too much, requiring disparate userspace accounting utilities to
-unnecessarily receive large structures whose fields are of no interest, then
-extending the attributes structure would be worthwhile.
-
-Flow control for taskstats
---------------------------
-
-When the rate of task exits becomes large, a listener may not be able to keep
-up with the kernel's rate of sending per-tid/tgid exit data leading to data
-loss. This possibility gets compounded when the taskstats structure gets
-extended and the number of cpus grows large.
-
-To avoid losing statistics, userspace should do one or more of the following:
-
-- increase the receive buffer sizes for the netlink sockets opened by
-listeners to receive exit data.
-
-- create more listeners and reduce the number of cpus being listened to by
-each listener. In the extreme case, there could be one listener for each cpu.
-Users may also consider setting the cpu affinity of the listener to the subset
-of cpus to which it listens, especially if they are listening to just one cpu.
-
-Despite these measures, if the userspace receives ENOBUFS error messages
-indicated overflow of receive buffers, it should take measures to handle the
-loss of data.
-
-----
diff --git a/Documentation/acpi/dsd/leds.txt b/Documentation/acpi/dsd/leds.txt
index 81a63af42ed2..cc58b1a574c5 100644
--- a/Documentation/acpi/dsd/leds.txt
+++ b/Documentation/acpi/dsd/leds.txt
@@ -96,4 +96,4 @@ where
     <URL:http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf>,
     referenced 2019-02-21.
 
-[7] Documentation/acpi/dsd/data-node-reference.txt
+[7] Documentation/firmware-guide/acpi/dsd/data-node-references.rst
diff --git a/Documentation/admin-guide/LSM/LoadPin.rst b/Documentation/admin-guide/LSM/LoadPin.rst
index 32070762d24c..716ad9b23c9a 100644
--- a/Documentation/admin-guide/LSM/LoadPin.rst
+++ b/Documentation/admin-guide/LSM/LoadPin.rst
@@ -19,3 +19,13 @@ block device backing the filesystem is not read-only, a sysctl is
 created to toggle pinning: ``/proc/sys/kernel/loadpin/enabled``. (Having
 a mutable filesystem means pinning is mutable too, but having the
 sysctl allows for easy testing on systems with a mutable filesystem.)
+
+It's also possible to exclude specific file types from LoadPin using kernel
+command line option "``loadpin.exclude``". By default, all files are
+included, but they can be excluded using kernel command line option such
+as "``loadpin.exclude=kernel-module,kexec-image``". This allows to use
+different mechanisms such as ``CONFIG_MODULE_SIG`` and
+``CONFIG_KEXEC_VERIFY_SIG`` to verify kernel module and kernel image while
+still use LoadPin to protect the integrity of other files kernel loads. The
+full list of valid file types can be found in ``kernel_read_file_str``
+defined in ``include/linux/fs.h``.
diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst
index a582c780c3bd..cc6151fc0845 100644
--- a/Documentation/admin-guide/README.rst
+++ b/Documentation/admin-guide/README.rst
@@ -227,7 +227,7 @@ Configuring the kernel
      "make tinyconfig"  Configure the tiniest possible kernel.
 
    You can find more information on using the Linux kernel config tools
-   in Documentation/kbuild/kconfig.txt.
+   in Documentation/kbuild/kconfig.rst.
 
  - NOTES on ``make config``:
 
diff --git a/Documentation/admin-guide/aoe/aoe.rst b/Documentation/admin-guide/aoe/aoe.rst
new file mode 100644
index 000000000000..a05e751363a0
--- /dev/null
+++ b/Documentation/admin-guide/aoe/aoe.rst
@@ -0,0 +1,150 @@
+Introduction
+============
+
+ATA over Ethernet is a network protocol that provides simple access to
+block storage on the LAN.
+
+  http://support.coraid.com/documents/AoEr11.txt
+
+The EtherDrive (R) HOWTO for 2.6 and 3.x kernels is found at ...
+
+  http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO.html
+
+It has many tips and hints!  Please see, especially, recommended
+tunings for virtual memory:
+
+  http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO-5.html#ss5.19
+
+The aoetools are userland programs that are designed to work with this
+driver.  The aoetools are on sourceforge.
+
+  http://aoetools.sourceforge.net/
+
+The scripts in this Documentation/admin-guide/aoe directory are intended to
+document the use of the driver and are not necessary if you install
+the aoetools.
+
+
+Creating Device Nodes
+=====================
+
+  Users of udev should find the block device nodes created
+  automatically, but to create all the necessary device nodes, use the
+  udev configuration rules provided in udev.txt (in this directory).
+
+  There is a udev-install.sh script that shows how to install these
+  rules on your system.
+
+  There is also an autoload script that shows how to edit
+  /etc/modprobe.d/aoe.conf to ensure that the aoe module is loaded when
+  necessary.  Preloading the aoe module is preferable to autoloading,
+  however, because AoE discovery takes a few seconds.  It can be
+  confusing when an AoE device is not present the first time the a
+  command is run but appears a second later.
+
+Using Device Nodes
+==================
+
+  "cat /dev/etherd/err" blocks, waiting for error diagnostic output,
+  like any retransmitted packets.
+
+  "echo eth2 eth4 > /dev/etherd/interfaces" tells the aoe driver to
+  limit ATA over Ethernet traffic to eth2 and eth4.  AoE traffic from
+  untrusted networks should be ignored as a matter of security.  See
+  also the aoe_iflist driver option described below.
+
+  "echo > /dev/etherd/discover" tells the driver to find out what AoE
+  devices are available.
+
+  In the future these character devices may disappear and be replaced
+  by sysfs counterparts.  Using the commands in aoetools insulates
+  users from these implementation details.
+
+  The block devices are named like this::
+
+	e{shelf}.{slot}
+	e{shelf}.{slot}p{part}
+
+  ... so that "e0.2" is the third blade from the left (slot 2) in the
+  first shelf (shelf address zero).  That's the whole disk.  The first
+  partition on that disk would be "e0.2p1".
+
+Using sysfs
+===========
+
+  Each aoe block device in /sys/block has the extra attributes of
+  state, mac, and netif.  The state attribute is "up" when the device
+  is ready for I/O and "down" if detected but unusable.  The
+  "down,closewait" state shows that the device is still open and
+  cannot come up again until it has been closed.
+
+  The mac attribute is the ethernet address of the remote AoE device.
+  The netif attribute is the network interface on the localhost
+  through which we are communicating with the remote AoE device.
+
+  There is a script in this directory that formats this information in
+  a convenient way.  Users with aoetools should use the aoe-stat
+  command::
+
+    root@makki root# sh Documentation/admin-guide/aoe/status.sh
+       e10.0            eth3              up
+       e10.1            eth3              up
+       e10.2            eth3              up
+       e10.3            eth3              up
+       e10.4            eth3              up
+       e10.5            eth3              up
+       e10.6            eth3              up
+       e10.7            eth3              up
+       e10.8            eth3              up
+       e10.9            eth3              up
+        e4.0            eth1              up
+        e4.1            eth1              up
+        e4.2            eth1              up
+        e4.3            eth1              up
+        e4.4            eth1              up
+        e4.5            eth1              up
+        e4.6            eth1              up
+        e4.7            eth1              up
+        e4.8            eth1              up
+        e4.9            eth1              up
+
+  Use /sys/module/aoe/parameters/aoe_iflist (or better, the driver
+  option discussed below) instead of /dev/etherd/interfaces to limit
+  AoE traffic to the network interfaces in the given
+  whitespace-separated list.  Unlike the old character device, the
+  sysfs entry can be read from as well as written to.
+
+  It's helpful to trigger discovery after setting the list of allowed
+  interfaces.  The aoetools package provides an aoe-discover script
+  for this purpose.  You can also directly use the
+  /dev/etherd/discover special file described above.
+
+Driver Options
+==============
+
+  There is a boot option for the built-in aoe driver and a
+  corresponding module parameter, aoe_iflist.  Without this option,
+  all network interfaces may be used for ATA over Ethernet.  Here is a
+  usage example for the module parameter::
+
+    modprobe aoe_iflist="eth1 eth3"
+
+  The aoe_deadsecs module parameter determines the maximum number of
+  seconds that the driver will wait for an AoE device to provide a
+  response to an AoE command.  After aoe_deadsecs seconds have
+  elapsed, the AoE device will be marked as "down".  A value of zero
+  is supported for testing purposes and makes the aoe driver keep
+  trying AoE commands forever.
+
+  The aoe_maxout module parameter has a default of 128.  This is the
+  maximum number of unresponded packets that will be sent to an AoE
+  target at one time.
+
+  The aoe_dyndevs module parameter defaults to 1, meaning that the
+  driver will assign a block device minor number to a discovered AoE
+  target based on the order of its discovery.  With dynamic minor
+  device numbers in use, a greater range of AoE shelf and slot
+  addresses can be supported.  Users with udev will never have to
+  think about minor numbers.  Using aoe_dyndevs=0 allows device nodes
+  to be pre-created using a static minor-number scheme with the
+  aoe-mkshelf script in the aoetools.
diff --git a/Documentation/aoe/autoload.sh b/Documentation/admin-guide/aoe/autoload.sh
index 815dff4691c9..815dff4691c9 100644
--- a/Documentation/aoe/autoload.sh
+++ b/Documentation/admin-guide/aoe/autoload.sh
diff --git a/Documentation/admin-guide/aoe/examples.rst b/Documentation/admin-guide/aoe/examples.rst
new file mode 100644
index 000000000000..91f3198e52c1
--- /dev/null
+++ b/Documentation/admin-guide/aoe/examples.rst
@@ -0,0 +1,23 @@
+Example of udev rules
+---------------------
+
+ .. include:: udev.txt
+    :literal:
+
+Example of udev install rules script
+------------------------------------
+
+ .. literalinclude:: udev-install.sh
+    :language: shell
+
+Example script to get status
+----------------------------
+
+ .. literalinclude:: status.sh
+    :language: shell
+
+Example of AoE autoload script
+------------------------------
+
+ .. literalinclude:: autoload.sh
+    :language: shell
diff --git a/Documentation/admin-guide/aoe/index.rst b/Documentation/admin-guide/aoe/index.rst
new file mode 100644
index 000000000000..d71c5df15922
--- /dev/null
+++ b/Documentation/admin-guide/aoe/index.rst
@@ -0,0 +1,17 @@
+=======================
+ATA over Ethernet (AoE)
+=======================
+
+.. toctree::
+    :maxdepth: 1
+
+    aoe
+    todo
+    examples
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/aoe/status.sh b/Documentation/admin-guide/aoe/status.sh
index eeec7baae57a..eeec7baae57a 100644
--- a/Documentation/aoe/status.sh
+++ b/Documentation/admin-guide/aoe/status.sh
diff --git a/Documentation/admin-guide/aoe/todo.rst b/Documentation/admin-guide/aoe/todo.rst
new file mode 100644
index 000000000000..dea8db5a33e1
--- /dev/null
+++ b/Documentation/admin-guide/aoe/todo.rst
@@ -0,0 +1,17 @@
+TODO
+====
+
+There is a potential for deadlock when allocating a struct sk_buff for
+data that needs to be written out to aoe storage.  If the data is
+being written from a dirty page in order to free that page, and if
+there are no other pages available, then deadlock may occur when a
+free page is needed for the sk_buff allocation.  This situation has
+not been observed, but it would be nice to eliminate any potential for
+deadlock under memory pressure.
+
+Because ATA over Ethernet is not fragmented by the kernel's IP code,
+the destructor member of the struct sk_buff is available to the aoe
+driver.  By using a mempool for allocating all but the first few
+sk_buffs, and by registering a destructor, we should be able to
+efficiently allocate sk_buffs without introducing any potential for
+deadlock.
diff --git a/Documentation/aoe/udev-install.sh b/Documentation/admin-guide/aoe/udev-install.sh
index 15e86f58c036..15e86f58c036 100644
--- a/Documentation/aoe/udev-install.sh
+++ b/Documentation/admin-guide/aoe/udev-install.sh
diff --git a/Documentation/aoe/udev.txt b/Documentation/admin-guide/aoe/udev.txt
index 1f06daf03f5b..5fb756466bc7 100644
--- a/Documentation/aoe/udev.txt
+++ b/Documentation/admin-guide/aoe/udev.txt
@@ -11,7 +11,7 @@
 #   udev_rules="/etc/udev/rules.d/"
 #   bash# ls /etc/udev/rules.d/
 #   10-wacom.rules  50-udev.rules
-#   bash# cp /path/to/linux-2.6.xx/Documentation/aoe/udev.txt \
+#   bash# cp /path/to/linux/Documentation/admin-guide/aoe/udev.txt \
 #           /etc/udev/rules.d/60-aoe.rules
 #  
 
diff --git a/Documentation/filesystems/binderfs.rst b/Documentation/admin-guide/binderfs.rst
index c009671f8434..c009671f8434 100644
--- a/Documentation/filesystems/binderfs.rst
+++ b/Documentation/admin-guide/binderfs.rst
diff --git a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg
index f87cfa0dc2fb..f87cfa0dc2fb 100644
--- a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
+++ b/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg
diff --git a/Documentation/blockdev/drbd/DRBD-data-packets.svg b/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg
index 48a1e2165fec..48a1e2165fec 100644
--- a/Documentation/blockdev/drbd/DRBD-data-packets.svg
+++ b/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg
diff --git a/Documentation/blockdev/drbd/conn-states-8.dot b/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot
index 025e8cf5e64a..025e8cf5e64a 100644
--- a/Documentation/blockdev/drbd/conn-states-8.dot
+++ b/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot
diff --git a/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst b/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst
new file mode 100644
index 000000000000..66036b901644
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst
@@ -0,0 +1,42 @@
+================================
+kernel data structure for DRBD-9
+================================
+
+This describes the in kernel data structure for DRBD-9. Starting with
+Linux v3.14 we are reorganizing DRBD to use this data structure.
+
+Basic Data Structure
+====================
+
+A node has a number of DRBD resources.  Each such resource has a number of
+devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD
+device is represented by a block device locally.
+
+The DRBD objects are interconnected to form a matrix as depicted below; a
+drbd_peer_device object sits at each intersection between a drbd_device and a
+drbd_connection::
+
+  /--------------+---------------+.....+---------------\
+  |   resource   |    device     |     |    device     |
+  +--------------+---------------+.....+---------------+
+  |  connection  |  peer_device  |     |  peer_device  |
+  +--------------+---------------+.....+---------------+
+  :              :               :     :               :
+  :              :               :     :               :
+  +--------------+---------------+.....+---------------+
+  |  connection  |  peer_device  |     |  peer_device  |
+  \--------------+---------------+.....+---------------/
+
+In this table, horizontally, devices can be accessed from resources by their
+volume number.  Likewise, peer_devices can be accessed from connections by
+their volume number.  Objects in the vertical direction are connected by double
+linked lists.  There are back pointers from peer_devices to their connections a
+devices, and from connections and devices to their resource.
+
+All resources are in the drbd_resources double-linked list.  In addition, all
+devices can be accessed by their minor device number via the drbd_devices idr.
+
+The drbd_resource, drbd_connection, and drbd_device objects are reference
+counted.  The peer_device objects only serve to establish the links between
+devices and connections; their lifetime is determined by the lifetime of the
+device and connection which they reference.
diff --git a/Documentation/blockdev/drbd/disk-states-8.dot b/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot
index d06cfb46fb98..d06cfb46fb98 100644
--- a/Documentation/blockdev/drbd/disk-states-8.dot
+++ b/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot
diff --git a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot
index 6d9cf0a7b11d..6d9cf0a7b11d 100644
--- a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
+++ b/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot
diff --git a/Documentation/admin-guide/blockdev/drbd/figures.rst b/Documentation/admin-guide/blockdev/drbd/figures.rst
new file mode 100644
index 000000000000..bd9a4901fe46
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/drbd/figures.rst
@@ -0,0 +1,30 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. The here included files are intended to help understand the implementation
+
+Data flows that Relate some functions, and write packets
+========================================================
+
+.. kernel-figure:: DRBD-8.3-data-packets.svg
+    :alt:   DRBD-8.3-data-packets.svg
+    :align: center
+
+.. kernel-figure:: DRBD-data-packets.svg
+    :alt:   DRBD-data-packets.svg
+    :align: center
+
+
+Sub graphs of DRBD's state transitions
+======================================
+
+.. kernel-figure:: conn-states-8.dot
+    :alt:   conn-states-8.dot
+    :align: center
+
+.. kernel-figure:: disk-states-8.dot
+    :alt:   disk-states-8.dot
+    :align: center
+
+.. kernel-figure:: node-states-8.dot
+    :alt:   node-states-8.dot
+    :align: center
diff --git a/Documentation/admin-guide/blockdev/drbd/index.rst b/Documentation/admin-guide/blockdev/drbd/index.rst
new file mode 100644
index 000000000000..68ecd5c113e9
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/drbd/index.rst
@@ -0,0 +1,19 @@
+==========================================
+Distributed Replicated Block Device - DRBD
+==========================================
+
+Description
+===========
+
+  DRBD is a shared-nothing, synchronously replicated block device. It
+  is designed to serve as a building block for high availability
+  clusters and in this context, is a "drop-in" replacement for shared
+  storage. Simplistically, you could see it as a network RAID 1.
+
+  Please visit http://www.drbd.org to find out more.
+
+.. toctree::
+   :maxdepth: 1
+
+   data-structure-v9
+   figures
diff --git a/Documentation/blockdev/drbd/node-states-8.dot b/Documentation/admin-guide/blockdev/drbd/node-states-8.dot
index 4a2b00c23547..bfa54e1f8016 100644
--- a/Documentation/blockdev/drbd/node-states-8.dot
+++ b/Documentation/admin-guide/blockdev/drbd/node-states-8.dot
@@ -11,4 +11,3 @@ digraph peer_states {
 	Unknown   -> Primary           [ label = "connected" ]
 	Unknown   -> Secondary         [ label = "connected" ]
 }
-
diff --git a/Documentation/admin-guide/blockdev/floppy.rst b/Documentation/admin-guide/blockdev/floppy.rst
new file mode 100644
index 000000000000..4a8f31cf4139
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/floppy.rst
@@ -0,0 +1,255 @@
+=============
+Floppy Driver
+=============
+
+FAQ list:
+=========
+
+A FAQ list may be found in the fdutils package (see below), and also
+at <http://fdutils.linux.lu/faq.html>.
+
+
+LILO configuration options (Thinkpad users, read this)
+======================================================
+
+The floppy driver is configured using the 'floppy=' option in
+lilo. This option can be typed at the boot prompt, or entered in the
+lilo configuration file.
+
+Example: If your kernel is called linux-2.6.9, type the following line
+at the lilo boot prompt (if you have a thinkpad)::
+
+ linux-2.6.9 floppy=thinkpad
+
+You may also enter the following line in /etc/lilo.conf, in the description
+of linux-2.6.9::
+
+ append = "floppy=thinkpad"
+
+Several floppy related options may be given, example::
+
+ linux-2.6.9 floppy=daring floppy=two_fdc
+ append = "floppy=daring floppy=two_fdc"
+
+If you give options both in the lilo config file and on the boot
+prompt, the option strings of both places are concatenated, the boot
+prompt options coming last. That's why there are also options to
+restore the default behavior.
+
+
+Module configuration options
+============================
+
+If you use the floppy driver as a module, use the following syntax::
+
+	modprobe floppy floppy="<options>"
+
+Example::
+
+	modprobe floppy floppy="omnibook messages"
+
+If you need certain options enabled every time you load the floppy driver,
+you can put::
+
+	options floppy floppy="omnibook messages"
+
+in a configuration file in /etc/modprobe.d/.
+
+
+The floppy driver related options are:
+
+ floppy=asus_pci
+	Sets the bit mask to allow only units 0 and 1. (default)
+
+ floppy=daring
+	Tells the floppy driver that you have a well behaved floppy controller.
+	This allows more efficient and smoother operation, but may fail on
+	certain controllers. This may speed up certain operations.
+
+ floppy=0,daring
+	Tells the floppy driver that your floppy controller should be used
+	with caution.
+
+ floppy=one_fdc
+	Tells the floppy driver that you have only one floppy controller.
+	(default)
+
+ floppy=two_fdc / floppy=<address>,two_fdc
+	Tells the floppy driver that you have two floppy controllers.
+	The second floppy controller is assumed to be at <address>.
+	This option is not needed if the second controller is at address
+	0x370, and if you use the 'cmos' option.
+
+ floppy=thinkpad
+	Tells the floppy driver that you have a Thinkpad. Thinkpads use an
+	inverted convention for the disk change line.
+
+ floppy=0,thinkpad
+	Tells the floppy driver that you don't have a Thinkpad.
+
+ floppy=omnibook / floppy=nodma
+	Tells the floppy driver not to use Dma for data transfers.
+	This is needed on HP Omnibooks, which don't have a workable
+	DMA channel for the floppy driver. This option is also useful
+	if you frequently get "Unable to allocate DMA memory" messages.
+	Indeed, dma memory needs to be continuous in physical memory,
+	and is thus harder to find, whereas non-dma buffers may be
+	allocated in virtual memory. However, I advise against this if
+	you have an FDC without a FIFO (8272A or 82072). 82072A and
+	later are OK. You also need at least a 486 to use nodma.
+	If you use nodma mode, I suggest you also set the FIFO
+	threshold to 10 or lower, in order to limit the number of data
+	transfer interrupts.
+
+	If you have a FIFO-able FDC, the floppy driver automatically
+	falls back on non DMA mode if no DMA-able memory can be found.
+	If you want to avoid this, explicitly ask for 'yesdma'.
+
+ floppy=yesdma
+	Tells the floppy driver that a workable DMA channel is available.
+	(default)
+
+ floppy=nofifo
+	Disables the FIFO entirely. This is needed if you get "Bus
+	master arbitration error" messages from your Ethernet card (or
+	from other devices) while accessing the floppy.
+
+ floppy=usefifo
+	Enables the FIFO. (default)
+
+ floppy=<threshold>,fifo_depth
+	Sets the FIFO threshold. This is mostly relevant in DMA
+	mode. If this is higher, the floppy driver tolerates more
+	interrupt latency, but it triggers more interrupts (i.e. it
+	imposes more load on the rest of the system). If this is
+	lower, the interrupt latency should be lower too (faster
+	processor). The benefit of a lower threshold is less
+	interrupts.
+
+	To tune the fifo threshold, switch on over/underrun messages
+	using 'floppycontrol --messages'. Then access a floppy
+	disk. If you get a huge amount of "Over/Underrun - retrying"
+	messages, then the fifo threshold is too low. Try with a
+	higher value, until you only get an occasional Over/Underrun.
+	It is a good idea to compile the floppy driver as a module
+	when doing this tuning. Indeed, it allows to try different
+	fifo values without rebooting the machine for each test. Note
+	that you need to do 'floppycontrol --messages' every time you
+	re-insert the module.
+
+	Usually, tuning the fifo threshold should not be needed, as
+	the default (0xa) is reasonable.
+
+ floppy=<drive>,<type>,cmos
+	Sets the CMOS type of <drive> to <type>. This is mandatory if
+	you have more than two floppy drives (only two can be
+	described in the physical CMOS), or if your BIOS uses
+	non-standard CMOS types. The CMOS types are:
+
+	       ==  ==================================
+		0  Use the value of the physical CMOS
+		1  5 1/4 DD
+		2  5 1/4 HD
+		3  3 1/2 DD
+		4  3 1/2 HD
+		5  3 1/2 ED
+		6  3 1/2 ED
+	       16  unknown or not installed
+	       ==  ==================================
+
+	(Note: there are two valid types for ED drives. This is because 5 was
+	initially chosen to represent floppy *tapes*, and 6 for ED drives.
+	AMI ignored this, and used 5 for ED drives. That's why the floppy
+	driver handles both.)
+
+ floppy=unexpected_interrupts
+	Print a warning message when an unexpected interrupt is received.
+	(default)
+
+ floppy=no_unexpected_interrupts / floppy=L40SX
+	Don't print a message when an unexpected interrupt is received. This
+	is needed on IBM L40SX laptops in certain video modes. (There seems
+	to be an interaction between video and floppy. The unexpected
+	interrupts affect only performance, and can be safely ignored.)
+
+ floppy=broken_dcl
+	Don't use the disk change line, but assume that the disk was
+	changed whenever the device node is reopened. Needed on some
+	boxes where the disk change line is broken or unsupported.
+	This should be regarded as a stopgap measure, indeed it makes
+	floppy operation less efficient due to unneeded cache
+	flushings, and slightly more unreliable. Please verify your
+	cable, connection and jumper settings if you have any DCL
+	problems. However, some older drives, and also some laptops
+	are known not to have a DCL.
+
+ floppy=debug
+	Print debugging messages.
+
+ floppy=messages
+	Print informational messages for some operations (disk change
+	notifications, warnings about over and underruns, and about
+	autodetection).
+
+ floppy=silent_dcl_clear
+	Uses a less noisy way to clear the disk change line (which
+	doesn't involve seeks). Implied by 'daring' option.
+
+ floppy=<nr>,irq
+	Sets the floppy IRQ to <nr> instead of 6.
+
+ floppy=<nr>,dma
+	Sets the floppy DMA channel to <nr> instead of 2.
+
+ floppy=slow
+	Use PS/2 stepping rate::
+
+	   PS/2 floppies have much slower step rates than regular floppies.
+	   It's been recommended that take about 1/4 of the default speed
+	   in some more extreme cases.
+
+
+Supporting utilities and additional documentation:
+==================================================
+
+Additional parameters of the floppy driver can be configured at
+runtime. Utilities which do this can be found in the fdutils package.
+This package also contains a new version of mtools which allows to
+access high capacity disks (up to 1992K on a high density 3 1/2 disk!).
+It also contains additional documentation about the floppy driver.
+
+The latest version can be found at fdutils homepage:
+
+ http://fdutils.linux.lu
+
+The fdutils releases can be found at:
+
+ http://fdutils.linux.lu/download.html
+
+ http://www.tux.org/pub/knaff/fdutils/
+
+ ftp://metalab.unc.edu/pub/Linux/utils/disk-management/
+
+Reporting problems about the floppy driver
+==========================================
+
+If you have a question or a bug report about the floppy driver, mail
+me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use
+comp.os.linux.hardware. As the volume in these groups is rather high,
+be sure to include the word "floppy" (or "FLOPPY") in the subject
+line.  If the reported problem happens when mounting floppy disks, be
+sure to mention also the type of the filesystem in the subject line.
+
+Be sure to read the FAQ before mailing/posting any bug reports!
+
+Alain
+
+Changelog
+=========
+
+10-30-2004 :
+		Cleanup, updating, add reference to module configuration.
+		James Nelson <james4765@gmail.com>
+
+6-3-2000 :
+		Original Document
diff --git a/Documentation/admin-guide/blockdev/index.rst b/Documentation/admin-guide/blockdev/index.rst
new file mode 100644
index 000000000000..b903cf152091
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/index.rst
@@ -0,0 +1,16 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================
+The Linux RapidIO Subsystem
+===========================
+
+.. toctree::
+   :maxdepth: 1
+
+   floppy
+   nbd
+   paride
+   ramdisk
+   zram
+
+   drbd/index
diff --git a/Documentation/admin-guide/blockdev/nbd.rst b/Documentation/admin-guide/blockdev/nbd.rst
new file mode 100644
index 000000000000..d78dfe559dcf
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/nbd.rst
@@ -0,0 +1,31 @@
+==================================
+Network Block Device (TCP version)
+==================================
+
+1) Overview
+-----------
+
+What is it: With this compiled in the kernel (or as a module), Linux
+can use a remote server as one of its block devices. So every time
+the client computer wants to read, e.g., /dev/nb0, it sends a
+request over TCP to the server, which will reply with the data read.
+This can be used for stations with low disk space (or even diskless)
+to borrow disk space from another computer.
+Unlike NFS, it is possible to put any filesystem on it, etc.
+
+For more information, or to download the nbd-client and nbd-server
+tools, go to http://nbd.sf.net/.
+
+The nbd kernel module need only be installed on the client
+system, as the nbd-server is completely in userspace. In fact,
+the nbd-server has been successfully ported to other operating
+systems, including Windows.
+
+A) NBD parameters
+-----------------
+
+max_part
+	Number of partitions per device (default: 0).
+
+nbds_max
+	Number of block devices that should be initialized (default: 16).
diff --git a/Documentation/admin-guide/blockdev/paride.rst b/Documentation/admin-guide/blockdev/paride.rst
new file mode 100644
index 000000000000..87b4278bf314
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/paride.rst
@@ -0,0 +1,439 @@
+===================================
+Linux and parallel port IDE devices
+===================================
+
+PARIDE v1.03   (c) 1997-8  Grant Guenther <grant@torque.net>
+
+1. Introduction
+===============
+
+Owing to the simplicity and near universality of the parallel port interface
+to personal computers, many external devices such as portable hard-disk,
+CD-ROM, LS-120 and tape drives use the parallel port to connect to their
+host computer.  While some devices (notably scanners) use ad-hoc methods
+to pass commands and data through the parallel port interface, most
+external devices are actually identical to an internal model, but with
+a parallel-port adapter chip added in.  Some of the original parallel port
+adapters were little more than mechanisms for multiplexing a SCSI bus.
+(The Iomega PPA-3 adapter used in the ZIP drives is an example of this
+approach).  Most current designs, however, take a different approach.
+The adapter chip reproduces a small ISA or IDE bus in the external device
+and the communication protocol provides operations for reading and writing
+device registers, as well as data block transfer functions.  Sometimes,
+the device being addressed via the parallel cable is a standard SCSI
+controller like an NCR 5380.  The "ditto" family of external tape
+drives use the ISA replicator to interface a floppy disk controller,
+which is then connected to a floppy-tape mechanism.  The vast majority
+of external parallel port devices, however, are now based on standard
+IDE type devices, which require no intermediate controller.  If one
+were to open up a parallel port CD-ROM drive, for instance, one would
+find a standard ATAPI CD-ROM drive, a power supply, and a single adapter
+that interconnected a standard PC parallel port cable and a standard
+IDE cable.  It is usually possible to exchange the CD-ROM device with
+any other device using the IDE interface.
+
+The document describes the support in Linux for parallel port IDE
+devices.  It does not cover parallel port SCSI devices, "ditto" tape
+drives or scanners.  Many different devices are supported by the
+parallel port IDE subsystem, including:
+
+	- MicroSolutions backpack CD-ROM
+	- MicroSolutions backpack PD/CD
+	- MicroSolutions backpack hard-drives
+	- MicroSolutions backpack 8000t tape drive
+	- SyQuest EZ-135, EZ-230 & SparQ drives
+	- Avatar Shark
+	- Imation Superdisk LS-120
+	- Maxell Superdisk LS-120
+	- FreeCom Power CD
+	- Hewlett-Packard 5GB and 8GB tape drives
+	- Hewlett-Packard 7100 and 7200 CD-RW drives
+
+as well as most of the clone and no-name products on the market.
+
+To support such a wide range of devices, PARIDE, the parallel port IDE
+subsystem, is actually structured in three parts.   There is a base
+paride module which provides a registry and some common methods for
+accessing the parallel ports.  The second component is a set of
+high-level drivers for each of the different types of supported devices:
+
+	===	=============
+	pd	IDE disk
+	pcd	ATAPI CD-ROM
+	pf	ATAPI disk
+	pt	ATAPI tape
+	pg	ATAPI generic
+	===	=============
+
+(Currently, the pg driver is only used with CD-R drives).
+
+The high-level drivers function according to the relevant standards.
+The third component of PARIDE is a set of low-level protocol drivers
+for each of the parallel port IDE adapter chips.  Thanks to the interest
+and encouragement of Linux users from many parts of the world,
+support is available for almost all known adapter protocols:
+
+	====    ====================================== ====
+        aten    ATEN EH-100                            (HK)
+        bpck    Microsolutions backpack                (US)
+        comm    DataStor (old-type) "commuter" adapter (TW)
+        dstr    DataStor EP-2000                       (TW)
+        epat    Shuttle EPAT                           (UK)
+        epia    Shuttle EPIA                           (UK)
+	fit2    FIT TD-2000			       (US)
+	fit3    FIT TD-3000			       (US)
+	friq    Freecom IQ cable                       (DE)
+        frpw    Freecom Power                          (DE)
+        kbic    KingByte KBIC-951A and KBIC-971A       (TW)
+	ktti    KT Technology PHd adapter              (SG)
+        on20    OnSpec 90c20                           (US)
+        on26    OnSpec 90c26                           (US)
+	====    ====================================== ====
+
+
+2. Using the PARIDE subsystem
+=============================
+
+While configuring the Linux kernel, you may choose either to build
+the PARIDE drivers into your kernel, or to build them as modules.
+
+In either case, you will need to select "Parallel port IDE device support"
+as well as at least one of the high-level drivers and at least one
+of the parallel port communication protocols.  If you do not know
+what kind of parallel port adapter is used in your drive, you could
+begin by checking the file names and any text files on your DOS
+installation floppy.  Alternatively, you can look at the markings on
+the adapter chip itself.  That's usually sufficient to identify the
+correct device.
+
+You can actually select all the protocol modules, and allow the PARIDE
+subsystem to try them all for you.
+
+For the "brand-name" products listed above, here are the protocol
+and high-level drivers that you would use:
+
+	================	============	======	========
+	Manufacturer		Model		Driver	Protocol
+	================	============	======	========
+	MicroSolutions		CD-ROM		pcd	bpck
+	MicroSolutions		PD drive	pf	bpck
+	MicroSolutions		hard-drive	pd	bpck
+	MicroSolutions          8000t tape      pt      bpck
+	SyQuest			EZ, SparQ	pd	epat
+	Imation			Superdisk	pf	epat
+	Maxell                  Superdisk       pf      friq
+	Avatar			Shark		pd	epat
+	FreeCom			CD-ROM		pcd	frpw
+	Hewlett-Packard		5GB Tape	pt	epat
+	Hewlett-Packard		7200e (CD)	pcd	epat
+	Hewlett-Packard		7200e (CD-R)	pg	epat
+	================	============	======	========
+
+2.1  Configuring built-in drivers
+---------------------------------
+
+We recommend that you get to know how the drivers work and how to
+configure them as loadable modules, before attempting to compile a
+kernel with the drivers built-in.
+
+If you built all of your PARIDE support directly into your kernel,
+and you have just a single parallel port IDE device, your kernel should
+locate it automatically for you.  If you have more than one device,
+you may need to give some command line options to your bootloader
+(eg: LILO), how to do that is beyond the scope of this document.
+
+The high-level drivers accept a number of command line parameters, all
+of which are documented in the source files in linux/drivers/block/paride.
+By default, each driver will automatically try all parallel ports it
+can find, and all protocol types that have been installed, until it finds
+a parallel port IDE adapter.  Once it finds one, the probe stops.  So,
+if you have more than one device, you will need to tell the drivers
+how to identify them.  This requires specifying the port address, the
+protocol identification number and, for some devices, the drive's
+chain ID.  While your system is booting, a number of messages are
+displayed on the console.  Like all such messages, they can be
+reviewed with the 'dmesg' command.  Among those messages will be
+some lines like::
+
+	paride: bpck registered as protocol 0
+	paride: epat registered as protocol 1
+
+The numbers will always be the same until you build a new kernel with
+different protocol selections.  You should note these numbers as you
+will need them to identify the devices.
+
+If you happen to be using a MicroSolutions backpack device, you will
+also need to know the unit ID number for each drive.  This is usually
+the last two digits of the drive's serial number (but read MicroSolutions'
+documentation about this).
+
+As an example, let's assume that you have a MicroSolutions PD/CD drive
+with unit ID number 36 connected to the parallel port at 0x378, a SyQuest
+EZ-135 connected to the chained port on the PD/CD drive and also an
+Imation Superdisk connected to port 0x278.  You could give the following
+options on your boot command::
+
+	pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36
+
+In the last option, pf.drive1 configures device /dev/pf1, the 0x378
+is the parallel port base address, the 0 is the protocol registration
+number and 36 is the chain ID.
+
+Please note:  while PARIDE will work both with and without the
+PARPORT parallel port sharing system that is included by the
+"Parallel port support" option, PARPORT must be included and enabled
+if you want to use chains of devices on the same parallel port.
+
+2.2  Loading and configuring PARIDE as modules
+----------------------------------------------
+
+It is much faster and simpler to get to understand the PARIDE drivers
+if you use them as loadable kernel modules.
+
+Note 1:
+	using these drivers with the "kerneld" automatic module loading
+	system is not recommended for beginners, and is not documented here.
+
+Note 2:
+	if you build PARPORT support as a loadable module, PARIDE must
+	also be built as loadable modules, and PARPORT must be loaded before
+	the PARIDE modules.
+
+To use PARIDE, you must begin by::
+
+	insmod paride
+
+this loads a base module which provides a registry for the protocols,
+among other tasks.
+
+Then, load as many of the protocol modules as you think you might need.
+As you load each module, it will register the protocols that it supports,
+and print a log message to your kernel log file and your console. For
+example::
+
+	# insmod epat
+	paride: epat registered as protocol 0
+	# insmod kbic
+	paride: k951 registered as protocol 1
+        paride: k971 registered as protocol 2
+
+Finally, you can load high-level drivers for each kind of device that
+you have connected.  By default, each driver will autoprobe for a single
+device, but you can support up to four similar devices by giving their
+individual co-ordinates when you load the driver.
+
+For example, if you had two no-name CD-ROM drives both using the
+KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc
+you could give the following command::
+
+	# insmod pcd drive0=0x378,1 drive1=0x3bc,1
+
+For most adapters, giving a port address and protocol number is sufficient,
+but check the source files in linux/drivers/block/paride for more
+information.  (Hopefully someone will write some man pages one day !).
+
+As another example, here's what happens when PARPORT is installed, and
+a SyQuest EZ-135 is attached to port 0x378::
+
+	# insmod paride
+	paride: version 1.0 installed
+	# insmod epat
+	paride: epat registered as protocol 0
+	# insmod pd
+	pd: pd version 1.0, major 45, cluster 64, nice 0
+	pda: Sharing parport1 at 0x378
+	pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1
+	pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media
+	 pda: pda1
+
+Note that the last line is the output from the generic partition table
+scanner - in this case it reports that it has found a disk with one partition.
+
+2.3  Using a PARIDE device
+--------------------------
+
+Once the drivers have been loaded, you can access PARIDE devices in the
+same way as their traditional counterparts.  You will probably need to
+create the device "special files".  Here is a simple script that you can
+cut to a file and execute::
+
+  #!/bin/bash
+  #
+  # mkd -- a script to create the device special files for the PARIDE subsystem
+  #
+  function mkdev {
+    mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1
+  }
+  #
+  function pd {
+    D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) )
+    mkdev pd$D b 45 $[ $1 * 16 ]
+    for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+    do mkdev pd$D$P b 45 $[ $1 * 16 + $P ]
+    done
+  }
+  #
+  cd /dev
+  #
+  for u in 0 1 2 3 ; do pd $u ; done
+  for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done
+  for u in 0 1 2 3 ; do mkdev pf$u  b 47 $u ; done
+  for u in 0 1 2 3 ; do mkdev pt$u  c 96 $u ; done
+  for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done
+  for u in 0 1 2 3 ; do mkdev pg$u  c 97 $u ; done
+  #
+  # end of mkd
+
+With the device files and drivers in place, you can access PARIDE devices
+like any other Linux device.   For example, to mount a CD-ROM in pcd0, use::
+
+	mount /dev/pcd0 /cdrom
+
+If you have a fresh Avatar Shark cartridge, and the drive is pda, you
+might do something like::
+
+	fdisk /dev/pda		-- make a new partition table with
+				   partition 1 of type 83
+
+	mke2fs /dev/pda1	-- to build the file system
+
+	mkdir /shark		-- make a place to mount the disk
+
+	mount /dev/pda1 /shark
+
+Devices like the Imation superdisk work in the same way, except that
+they do not have a partition table.  For example to make a 120MB
+floppy that you could share with a DOS system::
+
+	mkdosfs /dev/pf0
+	mount /dev/pf0 /mnt
+
+
+2.4  The pf driver
+------------------
+
+The pf driver is intended for use with parallel port ATAPI disk
+devices.  The most common devices in this category are PD drives
+and LS-120 drives.  Traditionally, media for these devices are not
+partitioned.  Consequently, the pf driver does not support partitioned
+media.  This may be changed in a future version of the driver.
+
+2.5  Using the pt driver
+------------------------
+
+The pt driver for parallel port ATAPI tape drives is a minimal driver.
+It does not yet support many of the standard tape ioctl operations.
+For best performance, a block size of 32KB should be used.  You will
+probably want to set the parallel port delay to 0, if you can.
+
+2.6  Using the pg driver
+------------------------
+
+The pg driver can be used in conjunction with the cdrecord program
+to create CD-ROMs.  Please get cdrecord version 1.6.1 or later
+from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ .  To record CD-R media
+your parallel port should ideally be set to EPP mode, and the "port delay"
+should be set to 0.  With those settings it is possible to record at 2x
+speed without any buffer underruns.  If you cannot get the driver to work
+in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only.
+
+
+3. Troubleshooting
+==================
+
+3.1  Use EPP mode if you can
+----------------------------
+
+The most common problems that people report with the PARIDE drivers
+concern the parallel port CMOS settings.  At this time, none of the
+PARIDE protocol modules support ECP mode, or any ECP combination modes.
+If you are able to do so, please set your parallel port into EPP mode
+using your CMOS setup procedure.
+
+3.2  Check the port delay
+-------------------------
+
+Some parallel ports cannot reliably transfer data at full speed.  To
+offset the errors, the PARIDE protocol modules introduce a "port
+delay" between each access to the i/o ports.  Each protocol sets
+a default value for this delay.  In most cases, the user can override
+the default and set it to 0 - resulting in somewhat higher transfer
+rates.  In some rare cases (especially with older 486 systems) the
+default delays are not long enough.  if you experience corrupt data
+transfers, or unexpected failures, you may wish to increase the
+port delay.   The delay can be programmed using the "driveN" parameters
+to each of the high-level drivers.  Please see the notes above, or
+read the comments at the beginning of the driver source files in
+linux/drivers/block/paride.
+
+3.3  Some drives need a printer reset
+-------------------------------------
+
+There appear to be a number of "noname" external drives on the market
+that do not always power up correctly.  We have noticed this with some
+drives based on OnSpec and older Freecom adapters.  In these rare cases,
+the adapter can often be reinitialised by issuing a "printer reset" on
+the parallel port.  As the reset operation is potentially disruptive in
+multiple device environments, the PARIDE drivers will not do it
+automatically.  You can however, force a printer reset by doing::
+
+	insmod lp reset=1
+	rmmod lp
+
+If you have one of these marginal cases, you should probably build
+your paride drivers as modules, and arrange to do the printer reset
+before loading the PARIDE drivers.
+
+3.4  Use the verbose option and dmesg if you need help
+------------------------------------------------------
+
+While a lot of testing has gone into these drivers to make them work
+as smoothly as possible, problems will arise.  If you do have problems,
+please check all the obvious things first:  does the drive work in
+DOS with the manufacturer's drivers ?  If that doesn't yield any useful
+clues, then please make sure that only one drive is hooked to your system,
+and that either (a) PARPORT is enabled or (b) no other device driver
+is using your parallel port (check in /proc/ioports).  Then, load the
+appropriate drivers (you can load several protocol modules if you want)
+as in::
+
+	# insmod paride
+	# insmod epat
+	# insmod bpck
+	# insmod kbic
+	...
+	# insmod pd verbose=1
+
+(using the correct driver for the type of device you have, of course).
+The verbose=1 parameter will cause the drivers to log a trace of their
+activity as they attempt to locate your drive.
+
+Use 'dmesg' to capture a log of all the PARIDE messages (any messages
+beginning with paride:, a protocol module's name or a driver's name) and
+include that with your bug report.  You can submit a bug report in one
+of two ways.  Either send it directly to the author of the PARIDE suite,
+by e-mail to grant@torque.net, or join the linux-parport mailing list
+and post your report there.
+
+3.5  For more information or help
+---------------------------------
+
+You can join the linux-parport mailing list by sending a mail message
+to:
+
+		linux-parport-request@torque.net
+
+with the single word::
+
+		subscribe
+
+in the body of the mail message (not in the subject line).   Please be
+sure that your mail program is correctly set up when you do this,  as
+the list manager is a robot that will subscribe you using the reply
+address in your mail headers.  REMOVE any anti-spam gimmicks you may
+have in your mail headers, when sending mail to the list server.
+
+You might also find some useful information on the linux-parport
+web pages (although they are not always up to date) at
+
+	http://web.archive.org/web/%2E/http://www.torque.net/parport/
diff --git a/Documentation/admin-guide/blockdev/ramdisk.rst b/Documentation/admin-guide/blockdev/ramdisk.rst
new file mode 100644
index 000000000000..b7c2268f8dec
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/ramdisk.rst
@@ -0,0 +1,177 @@
+==========================================
+Using the RAM disk block device with Linux
+==========================================
+
+.. Contents:
+
+	1) Overview
+	2) Kernel Command Line Parameters
+	3) Using "rdev -r"
+	4) An Example of Creating a Compressed RAM Disk
+
+
+1) Overview
+-----------
+
+The RAM disk driver is a way to use main system memory as a block device.  It
+is required for initrd, an initial filesystem used if you need to load modules
+in order to access the root filesystem (see Documentation/admin-guide/initrd.rst).  It can
+also be used for a temporary filesystem for crypto work, since the contents
+are erased on reboot.
+
+The RAM disk dynamically grows as more space is required. It does this by using
+RAM from the buffer cache. The driver marks the buffers it is using as dirty
+so that the VM subsystem does not try to reclaim them later.
+
+The RAM disk supports up to 16 RAM disks by default, and can be reconfigured
+to support an unlimited number of RAM disks (at your own risk).  Just change
+the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu
+and (re)build the kernel.
+
+To use RAM disk support with your system, run './MAKEDEV ram' from the /dev
+directory.  RAM disks are all major number 1, and start with minor number 0
+for /dev/ram0, etc.  If used, modern kernels use /dev/ram0 for an initrd.
+
+The new RAM disk also has the ability to load compressed RAM disk images,
+allowing one to squeeze more programs onto an average installation or
+rescue floppy disk.
+
+
+2) Parameters
+---------------------------------
+
+2a) Kernel Command Line Parameters
+
+	ramdisk_size=N
+		Size of the ramdisk.
+
+This parameter tells the RAM disk driver to set up RAM disks of N k size.  The
+default is 4096 (4 MB).
+
+2b) Module parameters
+
+	rd_nr
+		/dev/ramX devices created.
+
+	max_part
+		Maximum partition number.
+
+	rd_size
+		See ramdisk_size.
+
+3) Using "rdev -r"
+------------------
+
+The usage of the word (two bytes) that "rdev -r" sets in the kernel image is
+as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up
+to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit
+14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a
+prompt/wait sequence is to be given before trying to read the RAM disk. Since
+the RAM disk dynamically grows as data is being written into it, a size field
+is not required. Bits 11 to 13 are not currently used and may as well be zero.
+These numbers are no magical secrets, as seen below::
+
+  ./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK     0x07FF
+  ./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG          0x8000
+  ./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG            0x4000
+
+Consider a typical two floppy disk setup, where you will have the
+kernel on disk one, and have already put a RAM disk image onto disk #2.
+
+Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk
+starts at an offset of 0 kB from the beginning of the floppy.
+The command line equivalent is: "ramdisk_start=0"
+
+You want bit 14 as one, indicating that a RAM disk is to be loaded.
+The command line equivalent is: "load_ramdisk=1"
+
+You want bit 15 as one, indicating that you want a prompt/keypress
+sequence so that you have a chance to switch floppy disks.
+The command line equivalent is: "prompt_ramdisk=1"
+
+Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word.
+So to create disk one of the set, you would do::
+
+	/usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0
+	/usr/src/linux# rdev /dev/fd0 /dev/fd0
+	/usr/src/linux# rdev -r /dev/fd0 49152
+
+If you make a boot disk that has LILO, then for the above, you would use::
+
+	append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1"
+
+Since the default start = 0 and the default prompt = 1, you could use::
+
+	append = "load_ramdisk=1"
+
+
+4) An Example of Creating a Compressed RAM Disk
+-----------------------------------------------
+
+To create a RAM disk image, you will need a spare block device to
+construct it on. This can be the RAM disk device itself, or an
+unused disk partition (such as an unmounted swap partition). For this
+example, we will use the RAM disk device, "/dev/ram0".
+
+Note: This technique should not be done on a machine with less than 8 MB
+of RAM. If using a spare disk partition instead of /dev/ram0, then this
+restriction does not apply.
+
+a) Decide on the RAM disk size that you want. Say 2 MB for this example.
+   Create it by writing to the RAM disk device. (This step is not currently
+   required, but may be in the future.) It is wise to zero out the
+   area (esp. for disks) so that maximal compression is achieved for
+   the unused blocks of the image that you are about to create::
+
+	dd if=/dev/zero of=/dev/ram0 bs=1k count=2048
+
+b) Make a filesystem on it. Say ext2fs for this example::
+
+	mke2fs -vm0 /dev/ram0 2048
+
+c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...)
+   and unmount it again.
+
+d) Compress the contents of the RAM disk. The level of compression
+   will be approximately 50% of the space used by the files. Unused
+   space on the RAM disk will compress to almost nothing::
+
+	dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz
+
+e) Put the kernel onto the floppy::
+
+	dd if=zImage of=/dev/fd0 bs=1k
+
+f) Put the RAM disk image onto the floppy, after the kernel. Use an offset
+   that is slightly larger than the kernel, so that you can put another
+   (possibly larger) kernel onto the same floppy later without overlapping
+   the RAM disk image. An offset of 400 kB for kernels about 350 kB in
+   size would be reasonable. Make sure offset+size of ram_image.gz is
+   not larger than the total space on your floppy (usually 1440 kB)::
+
+	dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400
+
+g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc.
+   For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would
+   have 2^15 + 2^14 + 400 = 49552::
+
+	rdev /dev/fd0 /dev/fd0
+	rdev -r /dev/fd0 49552
+
+That is it. You now have your boot/root compressed RAM disk floppy. Some
+users may wish to combine steps (d) and (f) by using a pipe.
+
+
+						Paul Gortmaker 12/95
+
+Changelog:
+----------
+
+10-22-04 :
+		Updated to reflect changes in command line options, remove
+		obsolete references, general cleanup.
+		James Nelson (james4765@gmail.com)
+
+
+12-95 :
+		Original Document
diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst
new file mode 100644
index 000000000000..6eccf13219ff
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/zram.rst
@@ -0,0 +1,422 @@
+========================================
+zram: Compressed RAM based block devices
+========================================
+
+Introduction
+============
+
+The zram module creates RAM based block devices named /dev/zram<id>
+(<id> = 0, 1, ...). Pages written to these disks are compressed and stored
+in memory itself. These disks allow very fast I/O and compression provides
+good amounts of memory savings. Some of the usecases include /tmp storage,
+use as swap disks, various caches under /var and maybe many more :)
+
+Statistics for individual zram devices are exported through sysfs nodes at
+/sys/block/zram<id>/
+
+Usage
+=====
+
+There are several ways to configure and manage zram device(-s):
+
+a) using zram and zram_control sysfs attributes
+b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org).
+
+In this document we will describe only 'manual' zram configuration steps,
+IOW, zram and zram_control sysfs attributes.
+
+In order to get a better idea about zramctl please consult util-linux
+documentation, zramctl man-page or `zramctl --help`. Please be informed
+that zram maintainers do not develop/maintain util-linux or zramctl, should
+you have any questions please contact util-linux@vger.kernel.org
+
+Following shows a typical sequence of steps for using zram.
+
+WARNING
+=======
+
+For the sake of simplicity we skip error checking parts in most of the
+examples below. However, it is your sole responsibility to handle errors.
+
+zram sysfs attributes always return negative values in case of errors.
+The list of possible return codes:
+
+========  =============================================================
+-EBUSY	  an attempt to modify an attribute that cannot be changed once
+	  the device has been initialised. Please reset device first;
+-ENOMEM	  zram was not able to allocate enough memory to fulfil your
+	  needs;
+-EINVAL	  invalid input has been provided.
+========  =============================================================
+
+If you use 'echo', the returned value that is changed by 'echo' utility,
+and, in general case, something like::
+
+	echo 3 > /sys/block/zram0/max_comp_streams
+	if [ $? -ne 0 ];
+		handle_error
+	fi
+
+should suffice.
+
+1) Load Module
+==============
+
+::
+
+	modprobe zram num_devices=4
+	This creates 4 devices: /dev/zram{0,1,2,3}
+
+num_devices parameter is optional and tells zram how many devices should be
+pre-created. Default: 1.
+
+2) Set max number of compression streams
+========================================
+
+Regardless the value passed to this attribute, ZRAM will always
+allocate multiple compression streams - one per online CPUs - thus
+allowing several concurrent compression operations. The number of
+allocated compression streams goes down when some of the CPUs
+become offline. There is no single-compression-stream mode anymore,
+unless you are running a UP system or has only 1 CPU online.
+
+To find out how many streams are currently available::
+
+	cat /sys/block/zram0/max_comp_streams
+
+3) Select compression algorithm
+===============================
+
+Using comp_algorithm device attribute one can see available and
+currently selected (shown in square brackets) compression algorithms,
+change selected compression algorithm (once the device is initialised
+there is no way to change compression algorithm).
+
+Examples::
+
+	#show supported compression algorithms
+	cat /sys/block/zram0/comp_algorithm
+	lzo [lz4]
+
+	#select lzo compression algorithm
+	echo lzo > /sys/block/zram0/comp_algorithm
+
+For the time being, the `comp_algorithm` content does not necessarily
+show every compression algorithm supported by the kernel. We keep this
+list primarily to simplify device configuration and one can configure
+a new device with a compression algorithm that is not listed in
+`comp_algorithm`. The thing is that, internally, ZRAM uses Crypto API
+and, if some of the algorithms were built as modules, it's impossible
+to list all of them using, for instance, /proc/crypto or any other
+method. This, however, has an advantage of permitting the usage of
+custom crypto compression modules (implementing S/W or H/W compression).
+
+4) Set Disksize
+===============
+
+Set disk size by writing the value to sysfs node 'disksize'.
+The value can be either in bytes or you can use mem suffixes.
+Examples::
+
+	# Initialize /dev/zram0 with 50MB disksize
+	echo $((50*1024*1024)) > /sys/block/zram0/disksize
+
+	# Using mem suffixes
+	echo 256K > /sys/block/zram0/disksize
+	echo 512M > /sys/block/zram0/disksize
+	echo 1G > /sys/block/zram0/disksize
+
+Note:
+There is little point creating a zram of greater than twice the size of memory
+since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the
+size of the disk when not in use so a huge zram is wasteful.
+
+5) Set memory limit: Optional
+=============================
+
+Set memory limit by writing the value to sysfs node 'mem_limit'.
+The value can be either in bytes or you can use mem suffixes.
+In addition, you could change the value in runtime.
+Examples::
+
+	# limit /dev/zram0 with 50MB memory
+	echo $((50*1024*1024)) > /sys/block/zram0/mem_limit
+
+	# Using mem suffixes
+	echo 256K > /sys/block/zram0/mem_limit
+	echo 512M > /sys/block/zram0/mem_limit
+	echo 1G > /sys/block/zram0/mem_limit
+
+	# To disable memory limit
+	echo 0 > /sys/block/zram0/mem_limit
+
+6) Activate
+===========
+
+::
+
+	mkswap /dev/zram0
+	swapon /dev/zram0
+
+	mkfs.ext4 /dev/zram1
+	mount /dev/zram1 /tmp
+
+7) Add/remove zram devices
+==========================
+
+zram provides a control interface, which enables dynamic (on-demand) device
+addition and removal.
+
+In order to add a new /dev/zramX device, perform read operation on hot_add
+attribute. This will return either new device's device id (meaning that you
+can use /dev/zram<id>) or error code.
+
+Example::
+
+	cat /sys/class/zram-control/hot_add
+	1
+
+To remove the existing /dev/zramX device (where X is a device id)
+execute::
+
+	echo X > /sys/class/zram-control/hot_remove
+
+8) Stats
+========
+
+Per-device statistics are exported as various nodes under /sys/block/zram<id>/
+
+A brief description of exported device attributes. For more details please
+read Documentation/ABI/testing/sysfs-block-zram.
+
+======================  ======  ===============================================
+Name            	access            description
+======================  ======  ===============================================
+disksize          	RW	show and set the device's disk size
+initstate         	RO	shows the initialization state of the device
+reset             	WO	trigger device reset
+mem_used_max      	WO	reset the `mem_used_max` counter (see later)
+mem_limit         	WO	specifies the maximum amount of memory ZRAM can
+				use to store the compressed data
+writeback_limit   	WO	specifies the maximum amount of write IO zram
+				can write out to backing device as 4KB unit
+writeback_limit_enable  RW	show and set writeback_limit feature
+max_comp_streams  	RW	the number of possible concurrent compress
+				operations
+comp_algorithm    	RW	show and change the compression algorithm
+compact           	WO	trigger memory compaction
+debug_stat        	RO	this file is used for zram debugging purposes
+backing_dev	  	RW	set up backend storage for zram to write out
+idle		  	WO	mark allocated slot as idle
+======================  ======  ===============================================
+
+
+User space is advised to use the following files to read the device statistics.
+
+File /sys/block/zram<id>/stat
+
+Represents block layer statistics. Read Documentation/block/stat.rst for
+details.
+
+File /sys/block/zram<id>/io_stat
+
+The stat file represents device's I/O statistics not accounted by block
+layer and, thus, not available in zram<id>/stat file. It consists of a
+single line of text and contains the following stats separated by
+whitespace:
+
+ =============    =============================================================
+ failed_reads     The number of failed reads
+ failed_writes    The number of failed writes
+ invalid_io       The number of non-page-size-aligned I/O requests
+ notify_free      Depending on device usage scenario it may account
+
+                  a) the number of pages freed because of swap slot free
+                     notifications
+                  b) the number of pages freed because of
+                     REQ_OP_DISCARD requests sent by bio. The former ones are
+                     sent to a swap block device when a swap slot is freed,
+                     which implies that this disk is being used as a swap disk.
+
+                  The latter ones are sent by filesystem mounted with
+                  discard option, whenever some data blocks are getting
+                  discarded.
+ =============    =============================================================
+
+File /sys/block/zram<id>/mm_stat
+
+The stat file represents device's mm statistics. It consists of a single
+line of text and contains the following stats separated by whitespace:
+
+ ================ =============================================================
+ orig_data_size   uncompressed size of data stored in this disk.
+		  This excludes same-element-filled pages (same_pages) since
+		  no memory is allocated for them.
+                  Unit: bytes
+ compr_data_size  compressed size of data stored in this disk
+ mem_used_total   the amount of memory allocated for this disk. This
+                  includes allocator fragmentation and metadata overhead,
+                  allocated for this disk. So, allocator space efficiency
+                  can be calculated using compr_data_size and this statistic.
+                  Unit: bytes
+ mem_limit        the maximum amount of memory ZRAM can use to store
+                  the compressed data
+ mem_used_max     the maximum amount of memory zram have consumed to
+                  store the data
+ same_pages       the number of same element filled pages written to this disk.
+                  No memory is allocated for such pages.
+ pages_compacted  the number of pages freed during compaction
+ huge_pages	  the number of incompressible pages
+ ================ =============================================================
+
+File /sys/block/zram<id>/bd_stat
+
+The stat file represents device's backing device statistics. It consists of
+a single line of text and contains the following stats separated by whitespace:
+
+ ============== =============================================================
+ bd_count	size of data written in backing device.
+		Unit: 4K bytes
+ bd_reads	the number of reads from backing device
+		Unit: 4K bytes
+ bd_writes	the number of writes to backing device
+		Unit: 4K bytes
+ ============== =============================================================
+
+9) Deactivate
+=============
+
+::
+
+	swapoff /dev/zram0
+	umount /dev/zram1
+
+10) Reset
+=========
+
+	Write any positive value to 'reset' sysfs node::
+
+		echo 1 > /sys/block/zram0/reset
+		echo 1 > /sys/block/zram1/reset
+
+	This frees all the memory allocated for the given device and
+	resets the disksize to zero. You must set the disksize again
+	before reusing the device.
+
+Optional Feature
+================
+
+writeback
+---------
+
+With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page
+to backing storage rather than keeping it in memory.
+To use the feature, admin should set up backing device via::
+
+	echo /dev/sda5 > /sys/block/zramX/backing_dev
+
+before disksize setting. It supports only partition at this moment.
+If admin want to use incompressible page writeback, they could do via::
+
+	echo huge > /sys/block/zramX/write
+
+To use idle page writeback, first, user need to declare zram pages
+as idle::
+
+	echo all > /sys/block/zramX/idle
+
+From now on, any pages on zram are idle pages. The idle mark
+will be removed until someone request access of the block.
+IOW, unless there is access request, those pages are still idle pages.
+
+Admin can request writeback of those idle pages at right timing via::
+
+	echo idle > /sys/block/zramX/writeback
+
+With the command, zram writeback idle pages from memory to the storage.
+
+If there are lots of write IO with flash device, potentially, it has
+flash wearout problem so that admin needs to design write limitation
+to guarantee storage health for entire product life.
+
+To overcome the concern, zram supports "writeback_limit" feature.
+The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
+any writeback. IOW, if admin want to apply writeback budget, he should
+enable writeback_limit_enable via::
+
+	$ echo 1 > /sys/block/zramX/writeback_limit_enable
+
+Once writeback_limit_enable is set, zram doesn't allow any writeback
+until admin set the budget via /sys/block/zramX/writeback_limit.
+
+(If admin doesn't enable writeback_limit_enable, writeback_limit's value
+assigned via /sys/block/zramX/writeback_limit is meaninless.)
+
+If admin want to limit writeback as per-day 400M, he could do it
+like below::
+
+	$ MB_SHIFT=20
+	$ 4K_SHIFT=12
+	$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
+		/sys/block/zram0/writeback_limit.
+	$ echo 1 > /sys/block/zram0/writeback_limit_enable
+
+If admin want to allow further write again once the bugdet is exausted,
+he could do it like below::
+
+	$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
+		/sys/block/zram0/writeback_limit
+
+If admin want to see remaining writeback budget since he set::
+
+	$ cat /sys/block/zramX/writeback_limit
+
+If admin want to disable writeback limit, he could do::
+
+	$ echo 0 > /sys/block/zramX/writeback_limit_enable
+
+The writeback_limit count will reset whenever you reset zram(e.g.,
+system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of
+writeback happened until you reset the zram to allocate extra writeback
+budget in next setting is user's job.
+
+If admin want to measure writeback count in a certain period, he could
+know it via /sys/block/zram0/bd_stat's 3rd column.
+
+memory tracking
+===============
+
+With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the
+zram block. It could be useful to catch cold or incompressible
+pages of the process with*pagemap.
+
+If you enable the feature, you could see block state via
+/sys/kernel/debug/zram/zram0/block_state". The output is as follows::
+
+	  300    75.033841 .wh.
+	  301    63.806904 s...
+	  302    63.806919 ..hi
+
+First column
+	zram's block index.
+Second column
+	access time since the system was booted
+Third column
+	state of the block:
+
+	s:
+		same page
+	w:
+		written page to backing store
+	h:
+		huge page
+	i:
+		idle page
+
+First line of above example says 300th block is accessed at 75.033841sec
+and the block's state is huge so it is written back to the backing
+storage. It's a debugging feature so anyone shouldn't rely on it to work
+properly.
+
+Nitin Gupta
+ngupta@vflare.org
diff --git a/Documentation/btmrvl.txt b/Documentation/admin-guide/btmrvl.rst
index ec57740ead0c..ec57740ead0c 100644
--- a/Documentation/btmrvl.txt
+++ b/Documentation/admin-guide/btmrvl.rst
diff --git a/Documentation/admin-guide/bug-hunting.rst b/Documentation/admin-guide/bug-hunting.rst
index f278b289e260..44b8a4edd348 100644
--- a/Documentation/admin-guide/bug-hunting.rst
+++ b/Documentation/admin-guide/bug-hunting.rst
@@ -90,9 +90,9 @@ the disk is not available then you have three options:
     run a null modem to a second machine and capture the output there
     using your favourite communication program.  Minicom works well.
 
-(3) Use Kdump (see Documentation/kdump/kdump.txt),
+(3) Use Kdump (see Documentation/admin-guide/kdump/kdump.rst),
     extract the kernel ring buffer from old memory with using dmesg
-    gdbmacro in Documentation/kdump/gdbmacros.txt.
+    gdbmacro in Documentation/admin-guide/kdump/gdbmacros.txt.
 
 Finding the bug's location
 --------------------------
diff --git a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst
new file mode 100644
index 000000000000..1d7d962933be
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst
@@ -0,0 +1,302 @@
+===================
+Block IO Controller
+===================
+
+Overview
+========
+cgroup subsys "blkio" implements the block io controller. There seems to be
+a need of various kinds of IO control policies (like proportional BW, max BW)
+both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
+Plan is to use the same cgroup based management interface for blkio controller
+and based on user options switch IO policies in the background.
+
+One IO control policy is throttling policy which can be used to
+specify upper IO rate limits on devices. This policy is implemented in
+generic block layer and can be used on leaf nodes as well as higher
+level logical devices like device mapper.
+
+HOWTO
+=====
+Throttling/Upper Limit policy
+-----------------------------
+- Enable Block IO controller::
+
+	CONFIG_BLK_CGROUP=y
+
+- Enable throttling in block layer::
+
+	CONFIG_BLK_DEV_THROTTLING=y
+
+- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)::
+
+        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
+
+- Specify a bandwidth rate on particular device for root group. The format
+  for policy is "<major>:<minor>  <bytes_per_second>"::
+
+        echo "8:16  1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
+
+  Above will put a limit of 1MB/second on reads happening for root group
+  on device having major/minor number 8:16.
+
+- Run dd to read a file and see if rate is throttled to 1MB/s or not::
+
+        # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
+        1024+0 records in
+        1024+0 records out
+        4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
+
+ Limits for writes can be put using blkio.throttle.write_bps_device file.
+
+Hierarchical Cgroups
+====================
+
+Throttling implements hierarchy support; however,
+throttling's hierarchy support is enabled iff "sane_behavior" is
+enabled from cgroup side, which currently is a development option and
+not publicly available.
+
+If somebody created a hierarchy like as follows::
+
+			root
+			/  \
+		     test1 test2
+			|
+		     test3
+
+Throttling with "sane_behavior" will handle the
+hierarchy correctly. For throttling, all limits apply
+to the whole subtree while all statistics are local to the IOs
+directly generated by tasks in that cgroup.
+
+Throttling without "sane_behavior" enabled from cgroup side will
+practically treat all groups at same level as if it looks like the
+following::
+
+				pivot
+			     /  /   \  \
+			root  test1 test2  test3
+
+Various user visible config options
+===================================
+CONFIG_BLK_CGROUP
+	- Block IO controller.
+
+CONFIG_BFQ_CGROUP_DEBUG
+	- Debug help. Right now some additional stats file show up in cgroup
+	  if this option is enabled.
+
+CONFIG_BLK_DEV_THROTTLING
+	- Enable block device throttling support in block layer.
+
+Details of cgroup files
+=======================
+Proportional weight policy files
+--------------------------------
+- blkio.weight
+	- Specifies per cgroup weight. This is default weight of the group
+	  on all the devices until and unless overridden by per device rule.
+	  (See blkio.weight_device).
+	  Currently allowed range of weights is from 10 to 1000.
+
+- blkio.weight_device
+	- One can specify per cgroup per device rules using this interface.
+	  These rules override the default value of group weight as specified
+	  by blkio.weight.
+
+	  Following is the format::
+
+	    # echo dev_maj:dev_minor weight > blkio.weight_device
+
+	  Configure weight=300 on /dev/sdb (8:16) in this cgroup::
+
+	    # echo 8:16 300 > blkio.weight_device
+	    # cat blkio.weight_device
+	    dev     weight
+	    8:16    300
+
+	  Configure weight=500 on /dev/sda (8:0) in this cgroup::
+
+	    # echo 8:0 500 > blkio.weight_device
+	    # cat blkio.weight_device
+	    dev     weight
+	    8:0     500
+	    8:16    300
+
+	  Remove specific weight for /dev/sda in this cgroup::
+
+	    # echo 8:0 0 > blkio.weight_device
+	    # cat blkio.weight_device
+	    dev     weight
+	    8:16    300
+
+- blkio.leaf_weight[_device]
+	- Equivalents of blkio.weight[_device] for the purpose of
+          deciding how much weight tasks in the given cgroup has while
+          competing with the cgroup's child cgroups. For details,
+          please refer to Documentation/block/cfq-iosched.txt.
+
+- blkio.time
+	- disk time allocated to cgroup per device in milliseconds. First
+	  two fields specify the major and minor number of the device and
+	  third field specifies the disk time allocated to group in
+	  milliseconds.
+
+- blkio.sectors
+	- number of sectors transferred to/from disk by the group. First
+	  two fields specify the major and minor number of the device and
+	  third field specifies the number of sectors transferred by the
+	  group to/from the device.
+
+- blkio.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+- blkio.io_serviced
+	- Number of IOs (bio) issued to the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
+
+- blkio.io_service_time
+	- Total amount of time between request dispatch and request completion
+	  for the IOs done by this cgroup. This is in nanoseconds to make it
+	  meaningful for flash devices too. For devices with queue depth of 1,
+	  this time represents the actual service time. When queue_depth > 1,
+	  that is no longer true as requests may be served out of order. This
+	  may cause the service time for a given IO to include the service time
+	  of multiple IOs when served out of order which may result in total
+	  io_service_time > actual time elapsed. This time is further divided by
+	  the type of operation - read or write, sync or async. First two fields
+	  specify the major and minor number of the device, third field
+	  specifies the operation type and the fourth field specifies the
+	  io_service_time in ns.
+
+- blkio.io_wait_time
+	- Total amount of time the IOs for this cgroup spent waiting in the
+	  scheduler queues for service. This can be greater than the total time
+	  elapsed since it is cumulative io_wait_time for all IOs. It is not a
+	  measure of total time the cgroup spent waiting but rather a measure of
+	  the wait_time for its individual IOs. For devices with queue_depth > 1
+	  this metric does not include the time spent waiting for service once
+	  the IO is dispatched to the device but till it actually gets serviced
+	  (there might be a time lag here due to re-ordering of requests by the
+	  device). This is in nanoseconds to make it meaningful for flash
+	  devices too. This time is further divided by the type of operation -
+	  read or write, sync or async. First two fields specify the major and
+	  minor number of the device, third field specifies the operation type
+	  and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+	- Total number of bios/requests merged into requests belonging to this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.io_queued
+	- Total number of requests queued up at any given instant for this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.avg_queue_size
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+	  The average queue size for this cgroup over the entire time of this
+	  cgroup's existence. Queue size samples are taken each time one of the
+	  queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+	  This is the amount of time the cgroup had to wait since it became busy
+	  (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+	  its queues. This is different from the io_wait_time which is the
+	  cumulative total of the amount of time spent by each IO in that cgroup
+	  waiting in the scheduler queue. This is in nanoseconds. If this is
+	  read when the cgroup is in a waiting (for timeslice) state, the stat
+	  will only report the group_wait_time accumulated till the last time it
+	  got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+	  This is the amount of time a cgroup spends without any pending
+	  requests when not being served, i.e., it does not include any time
+	  spent idling for one of the queues of the cgroup. This is in
+	  nanoseconds. If this is read when the cgroup is in an empty state,
+	  the stat will only report the empty_time accumulated till the last
+	  time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+	  This is the amount of time spent by the IO scheduler idling for a
+	  given cgroup in anticipation of a better request than the existing ones
+	  from other queues/cgroups. This is in nanoseconds. If this is read
+	  when the cgroup is in an idling state, the stat will only report the
+	  idle_time accumulated till the last idle period and will not include
+	  the current delta.
+
+- blkio.dequeue
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This
+	  gives the statistics about how many a times a group was dequeued
+	  from service tree of the device. First two fields specify the major
+	  and minor number of the device and third field specifies the number
+	  of times a group was dequeued from a particular device.
+
+- blkio.*_recursive
+	- Recursive version of various stats. These files show the
+          same information as their non-recursive counterparts but
+          include stats from all the descendant cgroups.
+
+Throttling/Upper limit policy files
+-----------------------------------
+- blkio.throttle.read_bps_device
+	- Specifies upper limit on READ rate from the device. IO rate is
+	  specified in bytes per second. Rules are per device. Following is
+	  the format::
+
+	    echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
+
+- blkio.throttle.write_bps_device
+	- Specifies upper limit on WRITE rate to the device. IO rate is
+	  specified in bytes per second. Rules are per device. Following is
+	  the format::
+
+	    echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
+
+- blkio.throttle.read_iops_device
+	- Specifies upper limit on READ rate from the device. IO rate is
+	  specified in IO per second. Rules are per device. Following is
+	  the format::
+
+	   echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
+
+- blkio.throttle.write_iops_device
+	- Specifies upper limit on WRITE rate to the device. IO rate is
+	  specified in io per second. Rules are per device. Following is
+	  the format::
+
+	    echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
+
+Note: If both BW and IOPS rules are specified for a device, then IO is
+      subjected to both the constraints.
+
+- blkio.throttle.io_serviced
+	- Number of IOs (bio) issued to the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
+
+- blkio.throttle.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+Common files among various policies
+-----------------------------------
+- blkio.reset_stats
+	- Writing an int to this file will result in resetting all the stats
+	  for that cgroup.
diff --git a/Documentation/admin-guide/cgroup-v1/cgroups.rst b/Documentation/admin-guide/cgroup-v1/cgroups.rst
new file mode 100644
index 000000000000..b0688011ed06
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/cgroups.rst
@@ -0,0 +1,695 @@
+==============
+Control Groups
+==============
+
+Written by Paul Menage <menage@google.com> based on
+Documentation/admin-guide/cgroup-v1/cpusets.rst
+
+Original copyright statements from cpusets.txt:
+
+Portions Copyright (C) 2004 BULL SA.
+
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+
+Modified by Paul Jackson <pj@sgi.com>
+
+Modified by Christoph Lameter <cl@linux.com>
+
+.. CONTENTS:
+
+	1. Control Groups
+	1.1 What are cgroups ?
+	1.2 Why are cgroups needed ?
+	1.3 How are cgroups implemented ?
+	1.4 What does notify_on_release do ?
+	1.5 What does clone_children do ?
+	1.6 How do I use cgroups ?
+	2. Usage Examples and Syntax
+	2.1 Basic Usage
+	2.2 Attaching processes
+	2.3 Mounting hierarchies by name
+	3. Kernel API
+	3.1 Overview
+	3.2 Synchronization
+	3.3 Subsystem API
+	4. Extended attributes usage
+	5. Questions
+
+1. Control Groups
+=================
+
+1.1 What are cgroups ?
+----------------------
+
+Control Groups provide a mechanism for aggregating/partitioning sets of
+tasks, and all their future children, into hierarchical groups with
+specialized behaviour.
+
+Definitions:
+
+A *cgroup* associates a set of tasks with a set of parameters for one
+or more subsystems.
+
+A *subsystem* is a module that makes use of the task grouping
+facilities provided by cgroups to treat groups of tasks in
+particular ways. A subsystem is typically a "resource controller" that
+schedules a resource or applies per-cgroup limits, but it may be
+anything that wants to act on a group of processes, e.g. a
+virtualization subsystem.
+
+A *hierarchy* is a set of cgroups arranged in a tree, such that
+every task in the system is in exactly one of the cgroups in the
+hierarchy, and a set of subsystems; each subsystem has system-specific
+state attached to each cgroup in the hierarchy.  Each hierarchy has
+an instance of the cgroup virtual filesystem associated with it.
+
+At any one time there may be multiple active hierarchies of task
+cgroups. Each hierarchy is a partition of all tasks in the system.
+
+User-level code may create and destroy cgroups by name in an
+instance of the cgroup virtual file system, specify and query to
+which cgroup a task is assigned, and list the task PIDs assigned to
+a cgroup. Those creations and assignments only affect the hierarchy
+associated with that instance of the cgroup file system.
+
+On their own, the only use for cgroups is for simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/admin-guide/cgroup-v1/cpusets.rst) allow
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+
+1.2 Why are cgroups needed ?
+----------------------------
+
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource-tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+up in the same group (cgroup) as their parent process.
+
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines::
+
+       CPU :          "Top cpuset"
+                       /       \
+               CPUSet1         CPUSet2
+                  |               |
+               (Professors)    (Students)
+
+               In addition (system tasks) are attached to topcpuset (so
+               that they can run anywhere) with a limit of 20%
+
+       Memory : Professors (50%), Students (30%), system (20%)
+
+       Disk : Professors (50%), Students (30%), system (20%)
+
+       Network : WWW browsing (20%), Network File System (60%), others (20%)
+                               / \
+               Professors (15%)  students (5%)
+
+Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
+into the NFS network class.
+
+At the same time Firefox/Lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies),
+the admin can easily set up a script which receives exec notifications
+and depending on who is launching the browser he can::
+
+    # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
+
+With only a single hierarchy, he now would potentially have to create
+a separate cgroup for every browser launched and associate it with
+appropriate network and other resource class.  This may lead to
+proliferation of such cgroups.
+
+Also let's say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :))  OR give one of the student's simulation
+apps enhanced CPU power.
+
+With ability to write PIDs directly to resource classes, it's just a
+matter of::
+
+       # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
+       (after some time)
+       # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
+
+Without this ability, the administrator would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+
+
+
+1.3 How are cgroups implemented ?
+---------------------------------
+
+Control Groups extends the kernel as follows:
+
+ - Each task in the system has a reference-counted pointer to a
+   css_set.
+
+ - A css_set contains a set of reference-counted pointers to
+   cgroup_subsys_state objects, one for each cgroup subsystem
+   registered in the system. There is no direct link from a task to
+   the cgroup of which it's a member in each hierarchy, but this
+   can be determined by following pointers through the
+   cgroup_subsys_state objects. This is because accessing the
+   subsystem state is something that's expected to happen frequently
+   and in performance-critical code, whereas operations that require a
+   task's actual cgroup assignments (in particular, moving between
+   cgroups) are less common. A linked list runs through the cg_list
+   field of each task_struct using the css_set, anchored at
+   css_set->tasks.
+
+ - A cgroup hierarchy filesystem can be mounted for browsing and
+   manipulation from user space.
+
+ - You can list all the tasks (by PID) attached to any cgroup.
+
+The implementation of cgroups requires a few, simple hooks
+into the rest of the kernel, none in performance-critical paths:
+
+ - in init/main.c, to initialize the root cgroups and initial
+   css_set at system boot.
+
+ - in fork and exit, to attach and detach a task from its css_set.
+
+In addition, a new file system of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel.  When mounting a cgroup hierarchy, you may specify a
+comma-separated list of subsystems to mount as the filesystem mount
+options.  By default, mounting the cgroup filesystem attempts to
+mount a hierarchy containing all registered subsystems.
+
+If an active hierarchy with exactly the same set of subsystems already
+exists, it will be reused for the new mount. If no existing hierarchy
+matches, and any of the requested subsystems are in use in an existing
+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
+is activated, associated with the requested subsystems.
+
+It's not currently possible to bind a new subsystem to an active
+cgroup hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy. This may be possible in future, but is fraught with nasty
+error-recovery issues.
+
+When a cgroup filesystem is unmounted, if there are any
+child cgroups created below the top-level cgroup, that hierarchy
+will remain active even though unmounted; if there are no
+child cgroups then the hierarchy will be deactivated.
+
+No new system calls are added for cgroups - all support for
+querying and modifying cgroups is via this cgroup file system.
+
+Each task under /proc has an added file named 'cgroup' displaying,
+for each active hierarchy, the subsystem names and the cgroup name
+as the path relative to the root of the cgroup file system.
+
+Each cgroup is represented by a directory in the cgroup file system
+containing the following files describing that cgroup:
+
+ - tasks: list of tasks (by PID) attached to that cgroup.  This list
+   is not guaranteed to be sorted.  Writing a thread ID into this file
+   moves the thread into this cgroup.
+ - cgroup.procs: list of thread group IDs in the cgroup.  This list is
+   not guaranteed to be sorted or free of duplicate TGIDs, and userspace
+   should sort/uniquify the list if this property is required.
+   Writing a thread group ID into this file moves all threads in that
+   group into this cgroup.
+ - notify_on_release flag: run the release agent on exit?
+ - release_agent: the path to use for release notifications (this file
+   exists in the top cgroup only)
+
+Other subsystems such as cpusets may add additional files in each
+cgroup dir.
+
+New cgroups are created using the mkdir system call or shell
+command.  The properties of a cgroup, such as its flags, are
+modified by writing to the appropriate file in that cgroups
+directory, as listed above.
+
+The named hierarchical structure of nested cgroups allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cgroup allows organizing the work load
+on a system into related sets of tasks.  A task may be re-attached to
+any other cgroup, if allowed by the permissions on the necessary
+cgroup file system directories.
+
+When a task is moved from one cgroup to another, it gets a new
+css_set pointer - if there's an already existing css_set with the
+desired collection of cgroups then that group is reused, otherwise a new
+css_set is allocated. The appropriate existing css_set is located by
+looking into a hash table.
+
+To allow access from a cgroup to the css_sets (and hence tasks)
+that comprise it, a set of cg_cgroup_link objects form a lattice;
+each cg_cgroup_link is linked into a list of cg_cgroup_links for
+a single cgroup on its cgrp_link_list field, and a list of
+cg_cgroup_links for a single css_set on its cg_link_list.
+
+Thus the set of tasks in a cgroup can be listed by iterating over
+each css_set that references the cgroup, and sub-iterating over
+each css_set's task set.
+
+The use of a Linux virtual file system (vfs) to represent the
+cgroup hierarchy provides for a familiar permission and name space
+for cgroups, with a minimum of additional kernel code.
+
+1.4 What does notify_on_release do ?
+------------------------------------
+
+If the notify_on_release flag is enabled (1) in a cgroup, then
+whenever the last task in the cgroup leaves (exits or attaches to
+some other cgroup) and the last child cgroup of that cgroup
+is removed, then the kernel runs the command specified by the contents
+of the "release_agent" file in that hierarchy's root directory,
+supplying the pathname (relative to the mount point of the cgroup
+file system) of the abandoned cgroup.  This enables automatic
+removal of abandoned cgroups.  The default value of
+notify_on_release in the root cgroup at system boot is disabled
+(0).  The default value of other cgroups at creation is the current
+value of their parents' notify_on_release settings. The default value of
+a cgroup hierarchy's release_agent path is empty.
+
+1.5 What does clone_children do ?
+---------------------------------
+
+This flag only affects the cpuset controller. If the clone_children
+flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
+configuration from the parent during initialization.
+
+1.6 How do I use cgroups ?
+--------------------------
+
+To start a new job that is to be contained within a cgroup, using
+the "cpuset" cgroup subsystem, the steps are something like::
+
+ 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
+ 2) mkdir /sys/fs/cgroup/cpuset
+ 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
+    the /sys/fs/cgroup/cpuset virtual file system.
+ 5) Start a task that will be the "founding father" of the new job.
+ 6) Attach that task to the new cgroup by writing its PID to the
+    /sys/fs/cgroup/cpuset tasks file for that cgroup.
+ 7) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will setup a cgroup
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cgroup::
+
+  mount -t tmpfs cgroup_root /sys/fs/cgroup
+  mkdir /sys/fs/cgroup/cpuset
+  mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
+  cd /sys/fs/cgroup/cpuset
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
+  /bin/echo $$ > tasks
+  sh
+  # The subshell 'sh' is now running in cgroup Charlie
+  # The next line should display '/Charlie'
+  cat /proc/self/cgroup
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using cgroups can be done through the cgroup
+virtual filesystem.
+
+To mount a cgroup hierarchy with all available subsystems, type::
+
+  # mount -t cgroup xxx /sys/fs/cgroup
+
+The "xxx" is not interpreted by the cgroup code, but will appear in
+/proc/mounts so may be any useful identifying string that you like.
+
+Note: Some subsystems do not work without some user input first.  For instance,
+if cpusets are enabled the user will have to populate the cpus and mems files
+for each new cgroup created before that group can be used.
+
+As explained in section `1.2 Why are cgroups needed?` you should create
+different hierarchies of cgroups for each single resource or group of
+resources you want to control. Therefore, you should mount a tmpfs on
+/sys/fs/cgroup and create directories for each cgroup resource or resource
+group::
+
+  # mount -t tmpfs cgroup_root /sys/fs/cgroup
+  # mkdir /sys/fs/cgroup/rg1
+
+To mount a cgroup hierarchy with just the cpuset and memory
+subsystems, type::
+
+  # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
+
+While remounting cgroups is currently supported, it is not recommend
+to use it. Remounting allows changing bound subsystems and
+release_agent. Rebinding is hardly useful as it only works when the
+hierarchy is empty and release_agent itself should be replaced with
+conventional fsnotify. The support for remounting will be removed in
+the future.
+
+To Specify a hierarchy's release_agent::
+
+  # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
+    xxx /sys/fs/cgroup/rg1
+
+Note that specifying 'release_agent' more than once will return failure.
+
+Note that changing the set of subsystems is currently only supported
+when the hierarchy consists of a single (root) cgroup. Supporting
+the ability to arbitrarily bind/unbind subsystems from an existing
+cgroup hierarchy is intended to be implemented in the future.
+
+Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
+tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
+is the cgroup that holds the whole system.
+
+If you want to change the value of release_agent::
+
+  # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
+
+It can also be changed via remount.
+
+If you want to create a new cgroup under /sys/fs/cgroup/rg1::
+
+  # cd /sys/fs/cgroup/rg1
+  # mkdir my_cgroup
+
+Now you want to do something with this cgroup:
+
+  # cd my_cgroup
+
+In this directory you can find several files::
+
+  # ls
+  cgroup.procs notify_on_release tasks
+  (plus whatever files added by the attached subsystems)
+
+Now attach your shell to this cgroup::
+
+  # /bin/echo $$ > tasks
+
+You can also create cgroups inside your cgroup by using mkdir in this
+directory::
+
+  # mkdir my_sub_cs
+
+To remove a cgroup, just use rmdir::
+
+  # rmdir my_sub_cs
+
+This will fail if the cgroup is in use (has cgroups inside, or
+has processes attached, or is held alive by other subsystem-specific
+reference).
+
+2.2 Attaching processes
+-----------------------
+
+::
+
+  # /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another::
+
+  # /bin/echo PID1 > tasks
+  # /bin/echo PID2 > tasks
+	  ...
+  # /bin/echo PIDn > tasks
+
+You can attach the current shell task by echoing 0::
+
+  # echo 0 > tasks
+
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the PID of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
+Note: Since every task is always a member of exactly one cgroup in each
+mounted hierarchy, to remove a task from its current cgroup you must
+move it into a new cgroup (possibly the root cgroup) by writing to the
+new cgroup's tasks file.
+
+Note: Due to some restrictions enforced by some cgroup subsystems, moving
+a process to another cgroup can fail.
+
+2.3 Mounting hierarchies by name
+--------------------------------
+
+Passing the name=<x> option when mounting a cgroups hierarchy
+associates the given name with the hierarchy.  This can be used when
+mounting a pre-existing hierarchy, in order to refer to it by name
+rather than by its set of active subsystems.  Each hierarchy is either
+nameless, or has a unique name.
+
+The name should match [\w.-]+
+
+When passing a name=<x> option for a new hierarchy, you need to
+specify subsystems manually; the legacy behaviour of mounting all
+subsystems when none are explicitly specified is not supported when
+you give a subsystem a name.
+
+The name of the subsystem appears as part of the hierarchy description
+in /proc/mounts and /proc/<pid>/cgroups.
+
+
+3. Kernel API
+=============
+
+3.1 Overview
+------------
+
+Each kernel subsystem that wants to hook into the generic cgroup
+system needs to create a cgroup_subsys object. This contains
+various methods, which are callbacks from the cgroup system, along
+with a subsystem ID which will be assigned by the cgroup system.
+
+Other fields in the cgroup_subsys object include:
+
+- subsys_id: a unique array index for the subsystem, indicating which
+  entry in cgroup->subsys[] this subsystem should be managing.
+
+- name: should be initialized to a unique subsystem name. Should be
+  no longer than MAX_CGROUP_TYPE_NAMELEN.
+
+- early_init: indicate if the subsystem needs early initialization
+  at system boot.
+
+Each cgroup object created by the system has an array of pointers,
+indexed by subsystem ID; this pointer is entirely managed by the
+subsystem; the generic cgroup code will never touch this pointer.
+
+3.2 Synchronization
+-------------------
+
+There is a global mutex, cgroup_mutex, used by the cgroup
+system. This should be taken by anything that wants to modify a
+cgroup. It may also be taken to prevent cgroups from being
+modified, but more specific locks may be more appropriate in that
+situation.
+
+See kernel/cgroup.c for more details.
+
+Subsystems can take/release the cgroup_mutex via the functions
+cgroup_lock()/cgroup_unlock().
+
+Accessing a task's cgroup pointer may be done in the following ways:
+- while holding cgroup_mutex
+- while holding the task's alloc_lock (via task_lock())
+- inside an rcu_read_lock() section via rcu_dereference()
+
+3.3 Subsystem API
+-----------------
+
+Each subsystem should:
+
+- add an entry in linux/cgroup_subsys.h
+- define a cgroup_subsys object called <name>_cgrp_subsys
+
+Each subsystem may export the following methods. The only mandatory
+methods are css_alloc/free. Any others that are null are presumed to
+be successful no-ops.
+
+``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+Called to allocate a subsystem state object for a cgroup. The
+subsystem should allocate its subsystem state object for the passed
+cgroup, returning a pointer to the new object on success or a
+ERR_PTR() value. On success, the subsystem pointer should point to
+a structure of type cgroup_subsys_state (typically embedded in a
+larger subsystem-specific object), which will be initialized by the
+cgroup system. Note that this will be called at initialization to
+create the root subsystem state for this subsystem; this case can be
+identified by the passed cgroup object having a NULL parent (since
+it's the root of the hierarchy) and may be an appropriate place for
+initialization code.
+
+``int css_online(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+Called after @cgrp successfully completed all allocations and made
+visible to cgroup_for_each_child/descendant_*() iterators. The
+subsystem may choose to fail creation by returning -errno. This
+callback can be used to implement reliable state sharing and
+propagation along the hierarchy. See the comment on
+cgroup_for_each_descendant_pre() for details.
+
+``void css_offline(struct cgroup *cgrp);``
+(cgroup_mutex held by caller)
+
+This is the counterpart of css_online() and called iff css_online()
+has succeeded on @cgrp. This signifies the beginning of the end of
+@cgrp. @cgrp is being removed and the subsystem should start dropping
+all references it's holding on @cgrp. When all references are dropped,
+cgroup removal will proceed to the next step - css_free(). After this
+callback, @cgrp should be considered dead to the subsystem.
+
+``void css_free(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+The cgroup system is about to free @cgrp; the subsystem should free
+its subsystem state object. By the time this method is called, @cgrp
+is completely unused; @cgrp->parent is still valid. (Note - can also
+be called for a newly-created cgroup if an error occurs after this
+subsystem's create() method has been called for the new cgroup).
+
+``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called prior to moving one or more tasks into a cgroup; if the
+subsystem returns an error, this will abort the attach operation.
+@tset contains the tasks to be attached and is guaranteed to have at
+least one task in it.
+
+If there are multiple tasks in the taskset, then:
+  - it's guaranteed that all are from the same thread group
+  - @tset contains all tasks from the thread group whether or not
+    they're switching cgroups
+  - the first task is the leader
+
+Each @tset entry also contains the task's old cgroup and tasks which
+aren't switching cgroup can be skipped easily using the
+cgroup_taskset_for_each() iterator. Note that this isn't called on a
+fork. If this method returns 0 (success) then this should remain valid
+while the caller holds cgroup_mutex and it is ensured that either
+attach() or cancel_attach() will be called in future.
+
+``void css_reset(struct cgroup_subsys_state *css)``
+(cgroup_mutex held by caller)
+
+An optional operation which should restore @css's configuration to the
+initial state.  This is currently only used on the unified hierarchy
+when a subsystem is disabled on a cgroup through
+"cgroup.subtree_control" but should remain enabled because other
+subsystems depend on it.  cgroup core makes such a css invisible by
+removing the associated interface files and invokes this callback so
+that the hidden subsystem can return to the initial neutral state.
+This prevents unexpected resource control from a hidden css and
+ensures that the configuration is in the initial state when it is made
+visible again later.
+
+``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called when a task attach operation has failed after can_attach() has succeeded.
+A subsystem whose can_attach() has some side-effects should provide this
+function, so that the subsystem can implement a rollback. If not, not necessary.
+This will be called only about subsystems whose can_attach() operation have
+succeeded. The parameters are identical to can_attach().
+
+``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called after the task has been attached to the cgroup, to allow any
+post-attachment activity that requires memory allocations or blocking.
+The parameters are identical to can_attach().
+
+``void fork(struct task_struct *task)``
+
+Called when a task is forked into a cgroup.
+
+``void exit(struct task_struct *task)``
+
+Called during task exit.
+
+``void free(struct task_struct *task)``
+
+Called when the task_struct is freed.
+
+``void bind(struct cgroup *root)``
+(cgroup_mutex held by caller)
+
+Called when a cgroup subsystem is rebound to a different hierarchy
+and root cgroup. Currently this will only involve movement between
+the default hierarchy (which never has sub-cgroups) and a hierarchy
+that is being created/destroyed (and hence has no sub-cgroups).
+
+4. Extended attribute usage
+===========================
+
+cgroup filesystem supports certain types of extended attributes in its
+directories and files.  The current supported types are:
+
+	- Trusted (XATTR_TRUSTED)
+	- Security (XATTR_SECURITY)
+
+Both require CAP_SYS_ADMIN capability to set.
+
+Like in tmpfs, the extended attributes in cgroup filesystem are stored
+using kernel memory and it's advised to keep the usage at minimum.  This
+is the reason why user defined extended attributes are not supported, since
+any user can do it and there's no limit in the value size.
+
+The current known users for this feature are SELinux to limit cgroup usage
+in containers and systemd for assorted meta data like main PID in a cgroup
+(systemd creates a cgroup per service).
+
+5. Questions
+============
+
+::
+
+  Q: what's up with this '/bin/echo' ?
+  A: bash's builtin 'echo' command does not check calls to write() against
+     errors. If you use it in the cgroup file system, you won't be
+     able to tell whether a command succeeded or failed.
+
+  Q: When I attach processes, only the first of the line gets really attached !
+  A: We can only return one error code per call to write(). So you should also
+     put only ONE PID.
diff --git a/Documentation/admin-guide/cgroup-v1/cpuacct.rst b/Documentation/admin-guide/cgroup-v1/cpuacct.rst
new file mode 100644
index 000000000000..d30ed81d2ad7
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/cpuacct.rst
@@ -0,0 +1,50 @@
+=========================
+CPU Accounting Controller
+=========================
+
+The CPU accounting controller is used to group tasks using cgroups and
+account the CPU usage of these groups of tasks.
+
+The CPU accounting controller supports multi-hierarchy groups. An accounting
+group accumulates the CPU usage of all of its child groups and the tasks
+directly present in its group.
+
+Accounting groups can be created by first mounting the cgroup filesystem::
+
+  # mount -t cgroup -ocpuacct none /sys/fs/cgroup
+
+With the above step, the initial or the parent accounting group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
+by this group which is essentially the CPU time obtained by all the tasks
+in the system.
+
+New accounting groups can be created under the parent group /sys/fs/cgroup::
+
+  # cd /sys/fs/cgroup
+  # mkdir g1
+  # echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it. CPU time consumed by this bash and its children
+can be obtained from g1/cpuacct.usage and the same is accumulated in
+/sys/fs/cgroup/cpuacct.usage also.
+
+cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+
+user: Time spent by tasks of the cgroup in user mode.
+system: Time spent by tasks of the cgroup in kernel mode.
+
+user and system are in USER_HZ unit.
+
+cpuacct controller uses percpu_counter interface to collect user and
+system times. This has two side effects:
+
+- It is theoretically possible to see wrong values for user and system times.
+  This is because percpu_counter_read() on 32bit systems isn't safe
+  against concurrent writes.
+- It is possible to see slightly outdated values for user and system times
+  due to the batch processing nature of percpu_counter.
diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst
new file mode 100644
index 000000000000..86a6ae995d54
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst
@@ -0,0 +1,866 @@
+=======
+CPUSETS
+=======
+
+Copyright (C) 2004 BULL SA.
+
+Written by Simon.Derr@bull.net
+
+- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+- Modified by Paul Jackson <pj@sgi.com>
+- Modified by Christoph Lameter <cl@linux.com>
+- Modified by Paul Menage <menage@google.com>
+- Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+
+.. CONTENTS:
+
+   1. Cpusets
+     1.1 What are cpusets ?
+     1.2 Why are cpusets needed ?
+     1.3 How are cpusets implemented ?
+     1.4 What are exclusive cpusets ?
+     1.5 What is memory_pressure ?
+     1.6 What is memory spread ?
+     1.7 What is sched_load_balance ?
+     1.8 What is sched_relax_domain_level ?
+     1.9 How do I use cpusets ?
+   2. Usage Examples and Syntax
+     2.1 Basic Usage
+     2.2 Adding/removing cpus
+     2.3 Setting flags
+     2.4 Attaching processes
+   3. Questions
+   4. Contact
+
+1. Cpusets
+==========
+
+1.1 What are cpusets ?
+----------------------
+
+Cpusets provide a mechanism for assigning a set of CPUs and Memory
+Nodes to a set of tasks.   In this document "Memory Node" refers to
+an on-line node that contains memory.
+
+Cpusets constrain the CPU and Memory placement of tasks to only
+the resources within a task's current cpuset.  They form a nested
+hierarchy visible in a virtual file system.  These are the essential
+hooks, beyond what is already present, required to manage dynamic
+job placement on large systems.
+
+Cpusets use the generic cgroup subsystem described in
+Documentation/admin-guide/cgroup-v1/cgroups.rst.
+
+Requests by a task, using the sched_setaffinity(2) system call to
+include CPUs in its CPU affinity mask, and using the mbind(2) and
+set_mempolicy(2) system calls to include Memory Nodes in its memory
+policy, are both filtered through that task's cpuset, filtering out any
+CPUs or Memory Nodes not in that cpuset.  The scheduler will not
+schedule a task on a CPU that is not allowed in its cpus_allowed
+vector, and the kernel page allocator will not allocate a page on a
+node that is not allowed in the requesting task's mems_allowed vector.
+
+User level code may create and destroy cpusets by name in the cgroup
+virtual file system, manage the attributes and permissions of these
+cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
+specify and query to which cpuset a task is assigned, and list the
+task pids assigned to a cpuset.
+
+
+1.2 Why are cpusets needed ?
+----------------------------
+
+The management of large computer systems, with many processors (CPUs),
+complex memory cache hierarchies and multiple Memory Nodes having
+non-uniform access times (NUMA) presents additional challenges for
+the efficient scheduling and memory placement of processes.
+
+Frequently more modest sized systems can be operated with adequate
+efficiency just by letting the operating system automatically share
+the available CPU and Memory resources amongst the requesting tasks.
+
+But larger systems, which benefit more from careful processor and
+memory placement to reduce memory access times and contention,
+and which typically represent a larger investment for the customer,
+can benefit from explicitly placing jobs on properly sized subsets of
+the system.
+
+This can be especially valuable on:
+
+    * Web Servers running multiple instances of the same web application,
+    * Servers running different applications (for instance, a web server
+      and a database), or
+    * NUMA systems running large HPC applications with demanding
+      performance characteristics.
+
+These subsets, or "soft partitions" must be able to be dynamically
+adjusted, as the job mix changes, without impacting other concurrently
+executing jobs. The location of the running jobs pages may also be moved
+when the memory locations are changed.
+
+The kernel cpuset patch provides the minimum essential kernel
+mechanisms required to efficiently implement such subsets.  It
+leverages existing CPU and Memory Placement facilities in the Linux
+kernel to avoid any additional impact on the critical scheduler or
+memory allocator code.
+
+
+1.3 How are cpusets implemented ?
+---------------------------------
+
+Cpusets provide a Linux kernel mechanism to constrain which CPUs and
+Memory Nodes are used by a process or set of processes.
+
+The Linux kernel already has a pair of mechanisms to specify on which
+CPUs a task may be scheduled (sched_setaffinity) and on which Memory
+Nodes it may obtain memory (mbind, set_mempolicy).
+
+Cpusets extends these two mechanisms as follows:
+
+ - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
+   kernel.
+ - Each task in the system is attached to a cpuset, via a pointer
+   in the task structure to a reference counted cgroup structure.
+ - Calls to sched_setaffinity are filtered to just those CPUs
+   allowed in that task's cpuset.
+ - Calls to mbind and set_mempolicy are filtered to just
+   those Memory Nodes allowed in that task's cpuset.
+ - The root cpuset contains all the systems CPUs and Memory
+   Nodes.
+ - For any cpuset, one can define child cpusets containing a subset
+   of the parents CPU and Memory Node resources.
+ - The hierarchy of cpusets can be mounted at /dev/cpuset, for
+   browsing and manipulation from user space.
+ - A cpuset may be marked exclusive, which ensures that no other
+   cpuset (except direct ancestors and descendants) may contain
+   any overlapping CPUs or Memory Nodes.
+ - You can list all the tasks (by pid) attached to any cpuset.
+
+The implementation of cpusets requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cpuset at system boot.
+ - in fork and exit, to attach and detach a task from its cpuset.
+ - in sched_setaffinity, to mask the requested CPUs by what's
+   allowed in that task's cpuset.
+ - in sched.c migrate_live_tasks(), to keep migrating tasks within
+   the CPUs allowed by their cpuset, if possible.
+ - in the mbind and set_mempolicy system calls, to mask the requested
+   Memory Nodes by what's allowed in that task's cpuset.
+ - in page_alloc.c, to restrict memory to allowed nodes.
+ - in vmscan.c, to restrict page recovery to the current cpuset.
+
+You should mount the "cgroup" filesystem type in order to enable
+browsing and modifying the cpusets presently known to the kernel.  No
+new system calls are added for cpusets - all support for querying and
+modifying cpusets is via this cpuset file system.
+
+The /proc/<pid>/status file for each task has four added lines,
+displaying the task's cpus_allowed (on which CPUs it may be scheduled)
+and mems_allowed (on which Memory Nodes it may obtain memory),
+in the two formats seen in the following example::
+
+  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
+  Cpus_allowed_list:      0-127
+  Mems_allowed:   ffffffff,ffffffff
+  Mems_allowed_list:      0-63
+
+Each cpuset is represented by a directory in the cgroup file system
+containing (on top of the standard cgroup files) the following
+files describing that cpuset:
+
+ - cpuset.cpus: list of CPUs in that cpuset
+ - cpuset.mems: list of Memory Nodes in that cpuset
+ - cpuset.memory_migrate flag: if set, move pages to cpusets nodes
+ - cpuset.cpu_exclusive flag: is cpu placement exclusive?
+ - cpuset.mem_exclusive flag: is memory placement exclusive?
+ - cpuset.mem_hardwall flag:  is memory allocation hardwalled
+ - cpuset.memory_pressure: measure of how much paging pressure in cpuset
+ - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
+ - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
+ - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
+ - cpuset.sched_relax_domain_level: the searching range when migrating tasks
+
+In addition, only the root cpuset has the following file:
+
+ - cpuset.memory_pressure_enabled flag: compute memory_pressure?
+
+New cpusets are created using the mkdir system call or shell
+command.  The properties of a cpuset, such as its flags, allowed
+CPUs and Memory Nodes, and attached tasks, are modified by writing
+to the appropriate file in that cpusets directory, as listed above.
+
+The named hierarchical structure of nested cpusets allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cpuset allows organizing the work load
+on a system into related sets of tasks such that each set is constrained
+to using the CPUs and Memory Nodes of a particular cpuset.  A task
+may be re-attached to any other cpuset, if allowed by the permissions
+on the necessary cpuset file system directories.
+
+Such management of a system "in the large" integrates smoothly with
+the detailed placement done on individual tasks and memory regions
+using the sched_setaffinity, mbind and set_mempolicy system calls.
+
+The following rules apply to each cpuset:
+
+ - Its CPUs and Memory Nodes must be a subset of its parents.
+ - It can't be marked exclusive unless its parent is.
+ - If its cpu or memory is exclusive, they may not overlap any sibling.
+
+These rules, and the natural hierarchy of cpusets, enable efficient
+enforcement of the exclusive guarantee, without having to scan all
+cpusets every time any of them change to ensure nothing overlaps a
+exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
+to represent the cpuset hierarchy provides for a familiar permission
+and name space for cpusets, with a minimum of additional kernel code.
+
+The cpus and mems files in the root (top_cpuset) cpuset are
+read-only.  The cpus file automatically tracks the value of
+cpu_online_mask using a CPU hotplug notifier, and the mems file
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
+nodes with memory--using the cpuset_track_online_nodes() hook.
+
+
+1.4 What are exclusive cpusets ?
+--------------------------------
+
+If a cpuset is cpu or mem exclusive, no other cpuset, other than
+a direct ancestor or descendant, may share any of the same CPUs or
+Memory Nodes.
+
+A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users.  All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space.  This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset.  To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from
+interrupt handlers, is allowed to be taken outside even a
+mem_exclusive cpuset.
+
+
+1.5 What is memory_pressure ?
+-----------------------------
+The memory_pressure of a cpuset provides a simple per-cpuset metric
+of the rate that the tasks in a cpuset are attempting to free up in
+use memory on the nodes of the cpuset to satisfy additional memory
+requests.
+
+This enables batch managers monitoring jobs running in dedicated
+cpusets to efficiently detect what level of memory pressure that job
+is causing.
+
+This is useful both on tightly managed systems running a wide mix of
+submitted jobs, which may choose to terminate or re-prioritize jobs that
+are trying to use more memory than allowed on the nodes assigned to them,
+and with tightly coupled, long running, massively parallel scientific
+computing jobs that will dramatically fail to meet required performance
+goals if they start to use more memory than allowed to them.
+
+This mechanism provides a very economical way for the batch manager
+to monitor a cpuset for signs of memory pressure.  It's up to the
+batch manager or other user code to decide what to do about it and
+take action.
+
+==>
+    Unless this feature is enabled by writing "1" to the special file
+    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
+    code of __alloc_pages() for this metric reduces to simply noticing
+    that the cpuset_memory_pressure_enabled flag is zero.  So only
+    systems that enable this feature will compute the metric.
+
+Why a per-cpuset, running average:
+
+    Because this meter is per-cpuset, rather than per-task or mm,
+    the system load imposed by a batch scheduler monitoring this
+    metric is sharply reduced on large systems, because a scan of
+    the tasklist can be avoided on each set of queries.
+
+    Because this meter is a running average, instead of an accumulating
+    counter, a batch scheduler can detect memory pressure with a
+    single read, instead of having to read and accumulate results
+    for a period of time.
+
+    Because this meter is per-cpuset rather than per-task or mm,
+    the batch scheduler can obtain the key information, memory
+    pressure in a cpuset, with a single read, rather than having to
+    query and accumulate results over all the (dynamically changing)
+    set of tasks in the cpuset.
+
+A per-cpuset simple digital filter (requires a spinlock and 3 words
+of data per-cpuset) is kept, and updated by any task attached to that
+cpuset, if it enters the synchronous (direct) page reclaim code.
+
+A per-cpuset file provides an integer number representing the recent
+(half-life of 10 seconds) rate of direct page reclaims caused by
+the tasks in the cpuset, in units of reclaims attempted per second,
+times 1000.
+
+
+1.6 What is memory spread ?
+---------------------------
+There are two boolean flag files per cpuset that control where the
+kernel allocates pages for the file system buffers and related in
+kernel data structures.  They are called 'cpuset.memory_spread_page' and
+'cpuset.memory_spread_slab'.
+
+If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
+the kernel will spread the file system buffers (page cache) evenly
+over all the nodes that the faulting task is allowed to use, instead
+of preferring to put those pages on the node where the task is running.
+
+If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
+then the kernel will spread some file system related slab caches,
+such as for inodes and dentries evenly over all the nodes that the
+faulting task is allowed to use, instead of preferring to put those
+pages on the node where the task is running.
+
+The setting of these flags does not affect anonymous data segment or
+stack segment pages of a task.
+
+By default, both kinds of memory spreading are off, and memory
+pages are allocated on the node local to where the task is running,
+except perhaps as modified by the task's NUMA mempolicy or cpuset
+configuration, so long as sufficient free memory pages are available.
+
+When new cpusets are created, they inherit the memory spread settings
+of their parent.
+
+Setting memory spreading causes allocations for the affected page
+or slab caches to ignore the task's NUMA mempolicy and be spread
+instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
+mempolicies will not notice any change in these calls as a result of
+their containing task's memory spread settings.  If memory spreading
+is turned off, then the currently specified NUMA mempolicy once again
+applies to memory page allocations.
+
+Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
+files.  By default they contain "0", meaning that the feature is off
+for that cpuset.  If a "1" is written to that file, then that turns
+the named feature on.
+
+The implementation is simple.
+
+Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
+PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently
+joins that cpuset.  The page allocation calls for the page cache
+is modified to perform an inline check for this PFA_SPREAD_PAGE task
+flag, and if set, a call to a new routine cpuset_mem_spread_node()
+returns the node to prefer for the allocation.
+
+Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
+PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
+pages from the node returned by cpuset_mem_spread_node().
+
+The cpuset_mem_spread_node() routine is also simple.  It uses the
+value of a per-task rotor cpuset_mem_spread_rotor to select the next
+node in the current task's mems_allowed to prefer for the allocation.
+
+This memory placement policy is also known (in other contexts) as
+round-robin or interleave.
+
+This policy can provide substantial improvements for jobs that need
+to place thread local data on the corresponding node, but that need
+to access large file system data sets that need to be spread across
+the several nodes in the jobs cpuset in order to fit.  Without this
+policy, especially for jobs that might have one thread reading in the
+data set, the memory allocation across the nodes in the jobs cpuset
+can become very uneven.
+
+1.7 What is sched_load_balance ?
+--------------------------------
+
+The kernel scheduler (kernel/sched/core.c) automatically load balances
+tasks.  If one CPU is underutilized, kernel code running on that
+CPU will look for tasks on other more overloaded CPUs and move those
+tasks to itself, within the constraints of such placement mechanisms
+as cpusets and sched_setaffinity.
+
+The algorithmic cost of load balancing and its impact on key shared
+kernel data structures such as the task list increases more than
+linearly with the number of CPUs being balanced.  So the scheduler
+has support to partition the systems CPUs into a number of sched
+domains such that it only load balances within each sched domain.
+Each sched domain covers some subset of the CPUs in the system;
+no two sched domains overlap; some CPUs might not be in any sched
+domain and hence won't be load balanced.
+
+Put simply, it costs less to balance between two smaller sched domains
+than one big one, but doing so means that overloads in one of the
+two domains won't be load balanced to the other one.
+
+By default, there is one sched domain covering all CPUs, including those
+marked isolated using the kernel boot time "isolcpus=" argument. However,
+the isolated CPUs will not participate in load balancing, and will not
+have tasks running on them unless explicitly assigned.
+
+This default load balancing across all CPUs is not well suited for
+the following two situations:
+
+ 1) On large systems, load balancing across many CPUs is expensive.
+    If the system is managed using cpusets to place independent jobs
+    on separate sets of CPUs, full load balancing is unnecessary.
+ 2) Systems supporting realtime on some CPUs need to minimize
+    system overhead on those CPUs, including avoiding task load
+    balancing if that is not needed.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
+be contained in a single sched domain, ensuring that load balancing
+can move a task (not otherwised pinned, as by sched_setaffinity)
+from any CPU in that cpuset to any other.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
+scheduler will avoid load balancing across the CPUs in that cpuset,
+--except-- in so far as is necessary because some overlapping cpuset
+has "sched_load_balance" enabled.
+
+So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
+enabled, then the scheduler will have one sched domain covering all
+CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
+cpusets won't matter, as we're already fully load balancing.
+
+Therefore in the above two situations, the top cpuset flag
+"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
+child cpusets have this flag enabled.
+
+When doing this, you don't usually want to leave any unpinned tasks in
+the top cpuset that might use non-trivial amounts of CPU, as such tasks
+may be artificially constrained to some subset of CPUs, depending on
+the particulars of this flag setting in descendant cpusets.  Even if
+such a task could use spare CPU cycles in some other CPUs, the kernel
+scheduler might not consider the possibility of load balancing that
+task to that underused CPU.
+
+Of course, tasks pinned to a particular CPU can be left in a cpuset
+that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
+else anyway.
+
+There is an impedance mismatch here, between cpusets and sched domains.
+Cpusets are hierarchical and nest.  Sched domains are flat; they don't
+overlap and each CPU is in at most one sched domain.
+
+It is necessary for sched domains to be flat because load balancing
+across partially overlapping sets of CPUs would risk unstable dynamics
+that would be beyond our understanding.  So if each of two partially
+overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
+form a single sched domain that is a superset of both.  We won't move
+a task to a CPU outside its cpuset, but the scheduler load balancing
+code might waste some compute cycles considering that possibility.
+
+This mismatch is why there is not a simple one-to-one relation
+between which cpusets have the flag "cpuset.sched_load_balance" enabled,
+and the sched domain configuration.  If a cpuset enables the flag, it
+will get balancing across all its CPUs, but if it disables the flag,
+it will only be assured of no load balancing if no other overlapping
+cpuset enables the flag.
+
+If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
+one of them has this flag enabled, then the other may find its
+tasks only partially load balanced, just on the overlapping CPUs.
+This is just the general case of the top_cpuset example given a few
+paragraphs above.  In the general case, as in the top cpuset case,
+don't leave tasks that might use non-trivial amounts of CPU in
+such partially load balanced cpusets, as they may be artificially
+constrained to some subset of the CPUs allowed to them, for lack of
+load balancing to the other CPUs.
+
+CPUs in "cpuset.isolcpus" were excluded from load balancing by the
+isolcpus= kernel boot option, and will never be load balanced regardless
+of the value of "cpuset.sched_load_balance" in any cpuset.
+
+1.7.1 sched_load_balance implementation details.
+------------------------------------------------
+
+The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
+to most cpuset flags.)  When enabled for a cpuset, the kernel will
+ensure that it can load balance across all the CPUs in that cpuset
+(makes sure that all the CPUs in the cpus_allowed of that cpuset are
+in the same sched domain.)
+
+If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
+then they will be (must be) both in the same sched domain.
+
+If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
+then by the above that means there is a single sched domain covering
+the whole system, regardless of any other cpuset settings.
+
+The kernel commits to user space that it will avoid load balancing
+where it can.  It will pick as fine a granularity partition of sched
+domains as it can while still providing load balancing for any set
+of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
+
+The internal kernel cpuset to scheduler interface passes from the
+cpuset code to the scheduler code a partition of the load balanced
+CPUs in the system. This partition is a set of subsets (represented
+as an array of struct cpumask) of CPUs, pairwise disjoint, that cover
+all the CPUs that must be load balanced.
+
+The cpuset code builds a new such partition and passes it to the
+scheduler sched domain setup code, to have the sched domains rebuilt
+as necessary, whenever:
+
+ - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
+ - or CPUs come or go from a cpuset with this flag enabled,
+ - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
+   and with this flag enabled changes,
+ - or a cpuset with non-empty CPUs and with this flag enabled is removed,
+ - or a cpu is offlined/onlined.
+
+This partition exactly defines what sched domains the scheduler should
+setup - one sched domain for each element (struct cpumask) in the
+partition.
+
+The scheduler remembers the currently active sched domain partitions.
+When the scheduler routine partition_sched_domains() is invoked from
+the cpuset code to update these sched domains, it compares the new
+partition requested with the current, and updates its sched domains,
+removing the old and adding the new, for each change.
+
+
+1.8 What is sched_relax_domain_level ?
+--------------------------------------
+
+In sched domain, the scheduler migrates tasks in 2 ways; periodic load
+balance on tick, and at time of some schedule events.
+
+When a task is woken up, scheduler try to move the task on idle CPU.
+For example, if a task A running on CPU X activates another task B
+on the same CPU X, and if CPU Y is X's sibling and performing idle,
+then scheduler migrate task B to CPU Y so that task B can start on
+CPU Y without waiting task A on CPU X.
+
+And if a CPU run out of tasks in its runqueue, the CPU try to pull
+extra tasks from other busy CPUs to help them before it is going to
+be idle.
+
+Of course it takes some searching cost to find movable tasks and/or
+idle CPUs, the scheduler might not search all CPUs in the domain
+every time.  In fact, in some architectures, the searching ranges on
+events are limited in the same socket or node where the CPU locates,
+while the load balance on tick searches all.
+
+For example, assume CPU Z is relatively far from CPU X.  Even if CPU Z
+is idle while CPU X and the siblings are busy, scheduler can't migrate
+woken task B from X to Z since it is out of its searching range.
+As the result, task B on CPU X need to wait task A or wait load balance
+on the next tick.  For some applications in special situation, waiting
+1 tick may be too long.
+
+The 'cpuset.sched_relax_domain_level' file allows you to request changing
+this searching range as you like.  This file takes int value which
+indicates size of searching range in levels ideally as follows,
+otherwise initial value -1 that indicates the cpuset has no request.
+
+====== ===========================================================
+  -1   no request. use system default or follow request of others.
+   0   no search.
+   1   search siblings (hyperthreads in a core).
+   2   search cores in a package.
+   3   search cpus in a node [= system wide on non-NUMA system]
+   4   search nodes in a chunk of node [on NUMA system]
+   5   search system wide [on NUMA system]
+====== ===========================================================
+
+The system default is architecture dependent.  The system default
+can be changed using the relax_domain_level= boot parameter.
+
+This file is per-cpuset and affect the sched domain where the cpuset
+belongs to.  Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
+is disabled, then 'cpuset.sched_relax_domain_level' have no effect since
+there is no sched domain belonging the cpuset.
+
+If multiple cpusets are overlapping and hence they form a single sched
+domain, the largest value among those is used.  Be careful, if one
+requests 0 and others are -1 then 0 is used.
+
+Note that modifying this file will have both good and bad effects,
+and whether it is acceptable or not depends on your situation.
+Don't modify this file if you are not sure.
+
+If your situation is:
+
+ - The migration costs between each cpu can be assumed considerably
+   small(for you) due to your special application's behavior or
+   special hardware support for CPU cache etc.
+ - The searching cost doesn't have impact(for you) or you can make
+   the searching cost enough small by managing cpuset to compact etc.
+ - The latency is required even it sacrifices cache hit rate etc.
+   then increasing 'sched_relax_domain_level' would benefit you.
+
+
+1.9 How do I use cpusets ?
+--------------------------
+
+In order to minimize the impact of cpusets on critical kernel
+code, such as the scheduler, and due to the fact that the kernel
+does not support one task updating the memory placement of another
+task directly, the impact on a task of changing its cpuset CPU
+or Memory Node placement, or of changing to which cpuset a task
+is attached, is subtle.
+
+If a cpuset has its Memory Nodes modified, then for each task attached
+to that cpuset, the next time that the kernel attempts to allocate
+a page of memory for that task, the kernel will notice the change
+in the task's cpuset, and update its per-task memory placement to
+remain within the new cpusets memory placement.  If the task was using
+mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
+its new cpuset, then the task will continue to use whatever subset
+of MPOL_BIND nodes are still allowed in the new cpuset.  If the task
+was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
+in the new cpuset, then the task will be essentially treated as if it
+was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
+as queried by get_mempolicy(), doesn't change).  If a task is moved
+from one cpuset to another, then the kernel will adjust the task's
+memory placement, as above, the next time that the kernel attempts
+to allocate a page of memory for that task.
+
+If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
+will have its allowed CPU placement changed immediately.  Similarly,
+if a task's pid is written to another cpuset's 'tasks' file, then its
+allowed CPU placement is changed immediately.  If such a task had been
+bound to some subset of its cpuset using the sched_setaffinity() call,
+the task will be allowed to run on any CPU allowed in its new cpuset,
+negating the effect of the prior sched_setaffinity() call.
+
+In summary, the memory placement of a task whose cpuset is changed is
+updated by the kernel, on the next allocation of a page for that task,
+and the processor placement is updated immediately.
+
+Normally, once a page is allocated (given a physical page
+of main memory) then that page stays on whatever node it
+was allocated, so long as it remains allocated, even if the
+cpusets memory placement policy 'cpuset.mems' subsequently changes.
+If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
+tasks are attached to that cpuset, any pages that task had
+allocated to it on nodes in its previous cpuset are migrated
+to the task's new cpuset. The relative placement of the page within
+the cpuset is preserved during these migration operations if possible.
+For example if the page was on the second valid node of the prior cpuset
+then the page will be placed on the second valid node of the new cpuset.
+
+Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
+'cpuset.mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'cpuset.mems',
+will be moved to nodes in the new setting of 'mems.'
+Pages that were not in the task's prior cpuset, or in the cpuset's
+prior 'cpuset.mems' setting, will not be moved.
+
+There is an exception to the above.  If hotplug functionality is used
+to remove all the CPUs that are currently assigned to a cpuset,
+then all the tasks in that cpuset will be moved to the nearest ancestor
+with non-empty cpus.  But the moving of some (or all) tasks might fail if
+cpuset is bound with another cgroup subsystem which has some restrictions
+on task attaching.  In this failing case, those tasks will stay
+in the original cpuset, and the kernel will automatically update
+their cpus_allowed to allow all online CPUs.  When memory hotplug
+functionality for removing Memory Nodes is available, a similar exception
+is expected to apply there as well.  In general, the kernel prefers to
+violate cpuset placement, over starving a task that has had all
+its allowed CPUs or Memory Nodes taken offline.
+
+There is a second exception to the above.  GFP_ATOMIC requests are
+kernel internal allocations that must be satisfied, immediately.
+The kernel may drop some request, in rare cases even panic, if a
+GFP_ATOMIC alloc fails.  If the request cannot be satisfied within
+the current task's cpuset, then we relax the cpuset, and look for
+memory anywhere we can find it.  It's better to violate the cpuset
+than stress the kernel.
+
+To start a new job that is to be contained within a cpuset, the steps are:
+
+ 1) mkdir /sys/fs/cgroup/cpuset
+ 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
+    the /sys/fs/cgroup/cpuset virtual file system.
+ 4) Start a task that will be the "founding father" of the new job.
+ 5) Attach that task to the new cpuset by writing its pid to the
+    /sys/fs/cgroup/cpuset tasks file for that cpuset.
+ 6) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will setup a cpuset
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cpuset::
+
+  mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+  cd /sys/fs/cgroup/cpuset
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
+  /bin/echo $$ > tasks
+  sh
+  # The subshell 'sh' is now running in cpuset Charlie
+  # The next line should display '/Charlie'
+  cat /proc/self/cpuset
+
+There are ways to query or modify cpusets:
+
+ - via the cpuset file system directly, using the various cd, mkdir, echo,
+   cat, rmdir commands from the shell, or their equivalent from C.
+ - via the C library libcpuset.
+ - via the C library libcgroup.
+   (http://sourceforge.net/projects/libcg/)
+ - via the python application cset.
+   (http://code.google.com/p/cpuset/)
+
+The sched_setaffinity calls can also be done at the shell prompt using
+SGI's runon or Robert Love's taskset.  The mbind and set_mempolicy
+calls can be done at the shell prompt using the numactl command
+(part of Andi Kleen's numa package).
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using the cpusets can be done through the cpuset
+virtual filesystem.
+
+To mount it, type:
+# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
+
+Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
+tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
+is the cpuset that holds the whole system.
+
+If you want to create a new cpuset under /sys/fs/cgroup/cpuset::
+
+  # cd /sys/fs/cgroup/cpuset
+  # mkdir my_cpuset
+
+Now you want to do something with this cpuset::
+
+  # cd my_cpuset
+
+In this directory you can find several files::
+
+  # ls
+  cgroup.clone_children  cpuset.memory_pressure
+  cgroup.event_control   cpuset.memory_spread_page
+  cgroup.procs           cpuset.memory_spread_slab
+  cpuset.cpu_exclusive   cpuset.mems
+  cpuset.cpus            cpuset.sched_load_balance
+  cpuset.mem_exclusive   cpuset.sched_relax_domain_level
+  cpuset.mem_hardwall    notify_on_release
+  cpuset.memory_migrate  tasks
+
+Reading them will give you information about the state of this cpuset:
+the CPUs and Memory Nodes it can use, the processes that are using
+it, its properties.  By writing to these files you can manipulate
+the cpuset.
+
+Set some flags::
+
+  # /bin/echo 1 > cpuset.cpu_exclusive
+
+Add some cpus::
+
+  # /bin/echo 0-7 > cpuset.cpus
+
+Add some mems::
+
+  # /bin/echo 0-7 > cpuset.mems
+
+Now attach your shell to this cpuset::
+
+  # /bin/echo $$ > tasks
+
+You can also create cpusets inside your cpuset by using mkdir in this
+directory::
+
+  # mkdir my_sub_cs
+
+To remove a cpuset, just use rmdir::
+
+  # rmdir my_sub_cs
+
+This will fail if the cpuset is in use (has cpusets inside, or has
+processes attached).
+
+Note that for legacy reasons, the "cpuset" filesystem exists as a
+wrapper around the cgroup filesystem.
+
+The command::
+
+  mount -t cpuset X /sys/fs/cgroup/cpuset
+
+is equivalent to::
+
+  mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
+  echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
+
+2.2 Adding/removing cpus
+------------------------
+
+This is the syntax to use when writing in the cpus or mems files
+in cpuset directories::
+
+  # /bin/echo 1-4 > cpuset.cpus		-> set cpus list to cpus 1,2,3,4
+  # /bin/echo 1,2,3,4 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4
+
+To add a CPU to a cpuset, write the new list of CPUs including the
+CPU to be added. To add 6 to the above cpuset::
+
+  # /bin/echo 1-4,6 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4,6
+
+Similarly to remove a CPU from a cpuset, write the new list of CPUs
+without the CPU to be removed.
+
+To remove all the CPUs::
+
+  # /bin/echo "" > cpuset.cpus		-> clear cpus list
+
+2.3 Setting flags
+-----------------
+
+The syntax is very simple::
+
+  # /bin/echo 1 > cpuset.cpu_exclusive 	-> set flag 'cpuset.cpu_exclusive'
+  # /bin/echo 0 > cpuset.cpu_exclusive 	-> unset flag 'cpuset.cpu_exclusive'
+
+2.4 Attaching processes
+-----------------------
+
+::
+
+  # /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another::
+
+  # /bin/echo PID1 > tasks
+  # /bin/echo PID2 > tasks
+	...
+  # /bin/echo PIDn > tasks
+
+
+3. Questions
+============
+
+Q:
+   what's up with this '/bin/echo' ?
+
+A:
+   bash's builtin 'echo' command does not check calls to write() against
+   errors. If you use it in the cpuset file system, you won't be
+   able to tell whether a command succeeded or failed.
+
+Q:
+   When I attach processes, only the first of the line gets really attached !
+
+A:
+   We can only return one error code per call to write(). So you should also
+   put only ONE pid.
+
+4. Contact
+==========
+
+Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/admin-guide/cgroup-v1/devices.rst b/Documentation/admin-guide/cgroup-v1/devices.rst
new file mode 100644
index 000000000000..e1886783961e
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/devices.rst
@@ -0,0 +1,132 @@
+===========================
+Device Whitelist Controller
+===========================
+
+1. Description
+==============
+
+Implement a cgroup to track and enforce open and mknod restrictions
+on device files.  A device cgroup associates a device access
+whitelist with each cgroup.  A whitelist entry has 4 fields.
+'type' is a (all), c (char), or b (block).  'all' means it applies
+to all types and all major and minor numbers.  Major and minor are
+either an integer or * for all.  Access is a composition of r
+(read), w (write), and m (mknod).
+
+The root device cgroup starts with rwm to 'all'.  A child device
+cgroup gets a copy of the parent.  Administrators can then remove
+devices from the whitelist or add new entries.  A child cgroup can
+never receive a device access which is denied by its parent.
+
+2. User Interface
+=================
+
+An entry is added using devices.allow, and removed using
+devices.deny.  For instance::
+
+	echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
+
+allows cgroup 1 to read and mknod the device usually known as
+/dev/null.  Doing::
+
+	echo a > /sys/fs/cgroup/1/devices.deny
+
+will remove the default 'a *:* rwm' entry. Doing::
+
+	echo a > /sys/fs/cgroup/1/devices.allow
+
+will add the 'a *:* rwm' entry to the whitelist.
+
+3. Security
+===========
+
+Any task can move itself between cgroups.  This clearly won't
+suffice, but we can decide the best way to adequately restrict
+movement as people get some experience with this.  We may just want
+to require CAP_SYS_ADMIN, which at least is a separate bit from
+CAP_MKNOD.  We may want to just refuse moving to a cgroup which
+isn't a descendant of the current one.  Or we may want to use
+CAP_MAC_ADMIN, since we really are trying to lock down root.
+
+CAP_SYS_ADMIN is needed to modify the whitelist or move another
+task to a new cgroup.  (Again we'll probably want to change that).
+
+A cgroup may not be granted more permissions than the cgroup's
+parent has.
+
+4. Hierarchy
+============
+
+device cgroups maintain hierarchy by making sure a cgroup never has more
+access permissions than its parent.  Every time an entry is written to
+a cgroup's devices.deny file, all its children will have that entry removed
+from their whitelist and all the locally set whitelist entries will be
+re-evaluated.  In case one of the locally set whitelist entries would provide
+more access than the cgroup's parent, it'll be removed from the whitelist.
+
+Example::
+
+      A
+     / \
+        B
+
+    group        behavior	exceptions
+    A            allow		"b 8:* rwm", "c 116:1 rw"
+    B            deny		"c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
+
+If a device is denied in group A::
+
+	# echo "c 116:* r" > A/devices.deny
+
+it'll propagate down and after revalidating B's entries, the whitelist entry
+"c 116:2 rwm" will be removed::
+
+    group        whitelist entries                        denied devices
+    A            all                                      "b 8:* rwm", "c 116:* rw"
+    B            "c 1:3 rwm", "b 3:* rwm"                 all the rest
+
+In case parent's exceptions change and local exceptions are not allowed
+anymore, they'll be deleted.
+
+Notice that new whitelist entries will not be propagated::
+
+      A
+     / \
+        B
+
+    group        whitelist entries                        denied devices
+    A            "c 1:3 rwm", "c 1:5 r"                   all the rest
+    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
+
+when adding ``c *:3 rwm``::
+
+	# echo "c *:3 rwm" >A/devices.allow
+
+the result::
+
+    group        whitelist entries                        denied devices
+    A            "c *:3 rwm", "c 1:5 r"                   all the rest
+    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
+
+but now it'll be possible to add new entries to B::
+
+	# echo "c 2:3 rwm" >B/devices.allow
+	# echo "c 50:3 r" >B/devices.allow
+
+or even::
+
+	# echo "c *:3 rwm" >B/devices.allow
+
+Allowing or denying all by writing 'a' to devices.allow or devices.deny will
+not be possible once the device cgroups has children.
+
+4.1 Hierarchy (internal implementation)
+---------------------------------------
+
+device cgroups is implemented internally using a behavior (ALLOW, DENY) and a
+list of exceptions.  The internal state is controlled using the same user
+interface to preserve compatibility with the previous whitelist-only
+implementation.  Removal or addition of exceptions that will reduce the access
+to devices will be propagated down the hierarchy.
+For every propagated exception, the effective rules will be re-evaluated based
+on current parent's access rules.
diff --git a/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst
new file mode 100644
index 000000000000..582d3427de3f
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst
@@ -0,0 +1,127 @@
+==============
+Cgroup Freezer
+==============
+
+The cgroup freezer is useful to batch job management system which start
+and stop sets of tasks in order to schedule the resources of a machine
+according to the desires of a system administrator. This sort of program
+is often used on HPC clusters to schedule access to the cluster as a
+whole. The cgroup freezer uses cgroups to describe the set of tasks to
+be started/stopped by the batch job management system. It also provides
+a means to start and stop the tasks composing the job.
+
+The cgroup freezer will also be useful for checkpointing running groups
+of tasks. The freezer allows the checkpoint code to obtain a consistent
+image of the tasks by attempting to force the tasks in a cgroup into a
+quiescent state. Once the tasks are quiescent another task can
+walk /proc or invoke a kernel interface to gather information about the
+quiesced tasks. Checkpointed tasks can be restarted later should a
+recoverable error occur. This also allows the checkpointed tasks to be
+migrated between nodes in a cluster by copying the gathered information
+to another node and restarting the tasks there.
+
+Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
+and resuming tasks in userspace. Both of these signals are observable
+from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
+blocked, or ignored it can be seen by waiting or ptracing parent tasks.
+SIGCONT is especially unsuitable since it can be caught by the task. Any
+programs designed to watch for SIGSTOP and SIGCONT could be broken by
+attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
+demonstrate this problem using nested bash shells::
+
+	$ echo $$
+	16644
+	$ bash
+	$ echo $$
+	16690
+
+	From a second, unrelated bash shell:
+	$ kill -SIGSTOP 16690
+	$ kill -SIGCONT 16690
+
+	<at this point 16690 exits and causes 16644 to exit too>
+
+This happens because bash can observe both signals and choose how it
+responds to them.
+
+Another example of a program which catches and responds to these
+signals is gdb. In fact any program designed to use ptrace is likely to
+have a problem with this method of stopping and resuming tasks.
+
+In contrast, the cgroup freezer uses the kernel freezer code to
+prevent the freeze/unfreeze cycle from becoming visible to the tasks
+being frozen. This allows the bash example above and gdb to run as
+expected.
+
+The cgroup freezer is hierarchical. Freezing a cgroup freezes all
+tasks belonging to the cgroup and all its descendant cgroups. Each
+cgroup has its own state (self-state) and the state inherited from the
+parent (parent-state). Iff both states are THAWED, the cgroup is
+THAWED.
+
+The following cgroupfs files are created by cgroup freezer.
+
+* freezer.state: Read-write.
+
+  When read, returns the effective state of the cgroup - "THAWED",
+  "FREEZING" or "FROZEN". This is the combined self and parent-states.
+  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
+
+  FREEZING cgroup transitions into FROZEN state when all tasks
+  belonging to the cgroup and its descendants become frozen. Note that
+  a cgroup reverts to FREEZING from FROZEN after a new task is added
+  to the cgroup or one of its descendant cgroups until the new task is
+  frozen.
+
+  When written, sets the self-state of the cgroup. Two values are
+  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
+  if not already freezing, enters FREEZING state along with all its
+  descendant cgroups.
+
+  If THAWED is written, the self-state of the cgroup is changed to
+  THAWED.  Note that the effective state may not change to THAWED if
+  the parent-state is still freezing. If a cgroup's effective state
+  becomes THAWED, all its descendants which are freezing because of
+  the cgroup also leave the freezing state.
+
+* freezer.self_freezing: Read only.
+
+  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
+  This value is 1 iff the last write to freezer.state was "FROZEN".
+
+* freezer.parent_freezing: Read only.
+
+  Shows the parent-state.  0 if none of the cgroup's ancestors is
+  frozen; otherwise, 1.
+
+The root cgroup is non-freezable and the above interface files don't
+exist.
+
+* Examples of usage::
+
+   # mkdir /sys/fs/cgroup/freezer
+   # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
+   # mkdir /sys/fs/cgroup/freezer/0
+   # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
+
+to get status of the freezer subsystem::
+
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   THAWED
+
+to freeze all tasks in the container::
+
+   # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   FREEZING
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   FROZEN
+
+to unfreeze all tasks in the container::
+
+   # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   THAWED
+
+This is the basic mechanism which should do the right thing for user space task
+in a simple scenario.
diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst
new file mode 100644
index 000000000000..a3902aa253a9
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst
@@ -0,0 +1,50 @@
+==================
+HugeTLB Controller
+==================
+
+The HugeTLB controller allows to limit the HugeTLB usage per control group and
+enforces the controller limit during page fault. Since HugeTLB doesn't
+support page reclaim, enforcing the limit at page fault time implies that,
+the application will get SIGBUS signal if it tries to access HugeTLB pages
+beyond its limit. This requires the application to know beforehand how much
+HugeTLB pages it would require for its use.
+
+HugeTLB controller can be created by first mounting the cgroup filesystem.
+
+# mount -t cgroup -o hugetlb none /sys/fs/cgroup
+
+With the above step, the initial or the parent HugeTLB group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+
+New groups can be created under the parent group /sys/fs/cgroup::
+
+  # cd /sys/fs/cgroup
+  # mkdir g1
+  # echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it.
+
+Brief summary of control files::
+
+ hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
+ hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb  usage recorded
+ hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
+ hugetlb.<hugepagesize>.failcnt		   # show the number of allocation failure due to HugeTLB limit
+
+For a system supporting three hugepage sizes (64k, 32M and 1G), the control
+files include::
+
+  hugetlb.1GB.limit_in_bytes
+  hugetlb.1GB.max_usage_in_bytes
+  hugetlb.1GB.usage_in_bytes
+  hugetlb.1GB.failcnt
+  hugetlb.64KB.limit_in_bytes
+  hugetlb.64KB.max_usage_in_bytes
+  hugetlb.64KB.usage_in_bytes
+  hugetlb.64KB.failcnt
+  hugetlb.32MB.limit_in_bytes
+  hugetlb.32MB.max_usage_in_bytes
+  hugetlb.32MB.usage_in_bytes
+  hugetlb.32MB.failcnt
diff --git a/Documentation/admin-guide/cgroup-v1/index.rst b/Documentation/admin-guide/cgroup-v1/index.rst
new file mode 100644
index 000000000000..10bf48bae0b0
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/index.rst
@@ -0,0 +1,28 @@
+========================
+Control Groups version 1
+========================
+
+.. toctree::
+    :maxdepth: 1
+
+    cgroups
+
+    blkio-controller
+    cpuacct
+    cpusets
+    devices
+    freezer-subsystem
+    hugetlb
+    memcg_test
+    memory
+    net_cls
+    net_prio
+    pids
+    rdma
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
new file mode 100644
index 000000000000..3f7115e07b5d
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
@@ -0,0 +1,355 @@
+=====================================================
+Memory Resource Controller(Memcg) Implementation Memo
+=====================================================
+
+Last Updated: 2010/2
+
+Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
+
+Because VM is getting complex (one of reasons is memcg...), memcg's behavior
+is complex. This is a document for memcg's internal behavior.
+Please note that implementation details can be changed.
+
+(*) Topics on API should be in Documentation/admin-guide/cgroup-v1/memory.rst)
+
+0. How to record usage ?
+========================
+
+   2 objects are used.
+
+   page_cgroup ....an object per page.
+
+	Allocated at boot or memory hotplug. Freed at memory hot removal.
+
+   swap_cgroup ... an entry per swp_entry.
+
+	Allocated at swapon(). Freed at swapoff().
+
+   The page_cgroup has USED bit and double count against a page_cgroup never
+   occurs. swap_cgroup is used only when a charged page is swapped-out.
+
+1. Charge
+=========
+
+   a page/swp_entry may be charged (usage += PAGE_SIZE) at
+
+	mem_cgroup_try_charge()
+
+2. Uncharge
+===========
+
+  a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
+
+	mem_cgroup_uncharge()
+	  Called when a page's refcount goes down to 0.
+
+	mem_cgroup_uncharge_swap()
+	  Called when swp_entry's refcnt goes down to 0. A charge against swap
+	  disappears.
+
+3. charge-commit-cancel
+=======================
+
+	Memcg pages are charged in two steps:
+
+		- mem_cgroup_try_charge()
+		- mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
+
+	At try_charge(), there are no flags to say "this page is charged".
+	at this point, usage += PAGE_SIZE.
+
+	At commit(), the page is associated with the memcg.
+
+	At cancel(), simply usage -= PAGE_SIZE.
+
+Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
+
+4. Anonymous
+============
+
+	Anonymous page is newly allocated at
+		  - page fault into MAP_ANONYMOUS mapping.
+		  - Copy-On-Write.
+
+	4.1 Swap-in.
+	At swap-in, the page is taken from swap-cache. There are 2 cases.
+
+	(a) If the SwapCache is newly allocated and read, it has no charges.
+	(b) If the SwapCache has been mapped by processes, it has been
+	    charged already.
+
+	4.2 Swap-out.
+	At swap-out, typical state transition is below.
+
+	(a) add to swap cache. (marked as SwapCache)
+	    swp_entry's refcnt += 1.
+	(b) fully unmapped.
+	    swp_entry's refcnt += # of ptes.
+	(c) write back to swap.
+	(d) delete from swap cache. (remove from SwapCache)
+	    swp_entry's refcnt -= 1.
+
+
+	Finally, at task exit,
+	(e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
+
+5. Page Cache
+=============
+
+	Page Cache is charged at
+	- add_to_page_cache_locked().
+
+	The logic is very clear. (About migration, see below)
+
+	Note:
+	  __remove_from_page_cache() is called by remove_from_page_cache()
+	  and __remove_mapping().
+
+6. Shmem(tmpfs) Page Cache
+===========================
+
+	The best way to understand shmem's page state transition is to read
+	mm/shmem.c.
+
+	But brief explanation of the behavior of memcg around shmem will be
+	helpful to understand the logic.
+
+	Shmem's page (just leaf page, not direct/indirect block) can be on
+
+		- radix-tree of shmem's inode.
+		- SwapCache.
+		- Both on radix-tree and SwapCache. This happens at swap-in
+		  and swap-out,
+
+	It's charged when...
+
+	- A new page is added to shmem's radix-tree.
+	- A swp page is read. (move a charge from swap_cgroup to page_cgroup)
+
+7. Page Migration
+=================
+
+	mem_cgroup_migrate()
+
+8. LRU
+======
+        Each memcg has its own private LRU. Now, its handling is under global
+	VM's control (means that it's handled under global pgdat->lru_lock).
+	Almost all routines around memcg's LRU is called by global LRU's
+	list management functions under pgdat->lru_lock.
+
+	A special function is mem_cgroup_isolate_pages(). This scans
+	memcg's private LRU and call __isolate_lru_page() to extract a page
+	from LRU.
+
+	(By __isolate_lru_page(), the page is removed from both of global and
+	private LRU.)
+
+
+9. Typical Tests.
+=================
+
+ Tests for racy cases.
+
+9.1 Small limit to memcg.
+-------------------------
+
+	When you do test to do racy case, it's good test to set memcg's limit
+	to be very small rather than GB. Many races found in the test under
+	xKB or xxMB limits.
+
+	(Memory behavior under GB and Memory behavior under MB shows very
+	different situation.)
+
+9.2 Shmem
+---------
+
+	Historically, memcg's shmem handling was poor and we saw some amount
+	of troubles here. This is because shmem is page-cache but can be
+	SwapCache. Test with shmem/tmpfs is always good test.
+
+9.3 Migration
+-------------
+
+	For NUMA, migration is an another special case. To do easy test, cpuset
+	is useful. Following is a sample script to do migration::
+
+		mount -t cgroup -o cpuset none /opt/cpuset
+
+		mkdir /opt/cpuset/01
+		echo 1 > /opt/cpuset/01/cpuset.cpus
+		echo 0 > /opt/cpuset/01/cpuset.mems
+		echo 1 > /opt/cpuset/01/cpuset.memory_migrate
+		mkdir /opt/cpuset/02
+		echo 1 > /opt/cpuset/02/cpuset.cpus
+		echo 1 > /opt/cpuset/02/cpuset.mems
+		echo 1 > /opt/cpuset/02/cpuset.memory_migrate
+
+	In above set, when you moves a task from 01 to 02, page migration to
+	node 0 to node 1 will occur. Following is a script to migrate all
+	under cpuset.::
+
+		--
+		move_task()
+		{
+		for pid in $1
+		do
+			/bin/echo $pid >$2/tasks 2>/dev/null
+			echo -n $pid
+			echo -n " "
+		done
+		echo END
+		}
+
+		G1_TASK=`cat ${G1}/tasks`
+		G2_TASK=`cat ${G2}/tasks`
+		move_task "${G1_TASK}" ${G2} &
+		--
+
+9.4 Memory hotplug
+------------------
+
+	memory hotplug test is one of good test.
+
+	to offline memory, do following::
+
+		# echo offline > /sys/devices/system/memory/memoryXXX/state
+
+	(XXX is the place of memory)
+
+	This is an easy way to test page migration, too.
+
+9.5 mkdir/rmdir
+---------------
+
+	When using hierarchy, mkdir/rmdir test should be done.
+	Use tests like the following::
+
+		echo 1 >/opt/cgroup/01/memory/use_hierarchy
+		mkdir /opt/cgroup/01/child_a
+		mkdir /opt/cgroup/01/child_b
+
+		set limit to 01.
+		add limit to 01/child_b
+		run jobs under child_a and child_b
+
+	create/delete following groups at random while jobs are running::
+
+		/opt/cgroup/01/child_a/child_aa
+		/opt/cgroup/01/child_b/child_bb
+		/opt/cgroup/01/child_c
+
+	running new jobs in new group is also good.
+
+9.6 Mount with other subsystems
+-------------------------------
+
+	Mounting with other subsystems is a good test because there is a
+	race and lock dependency with other cgroup subsystems.
+
+	example::
+
+		# mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
+
+	and do task move, mkdir, rmdir etc...under this.
+
+9.7 swapoff
+-----------
+
+	Besides management of swap is one of complicated parts of memcg,
+	call path of swap-in at swapoff is not same as usual swap-in path..
+	It's worth to be tested explicitly.
+
+	For example, test like following is good:
+
+	(Shell-A)::
+
+		# mount -t cgroup none /cgroup -o memory
+		# mkdir /cgroup/test
+		# echo 40M > /cgroup/test/memory.limit_in_bytes
+		# echo 0 > /cgroup/test/tasks
+
+	Run malloc(100M) program under this. You'll see 60M of swaps.
+
+	(Shell-B)::
+
+		# move all tasks in /cgroup/test to /cgroup
+		# /sbin/swapoff -a
+		# rmdir /cgroup/test
+		# kill malloc task.
+
+	Of course, tmpfs v.s. swapoff test should be tested, too.
+
+9.8 OOM-Killer
+--------------
+
+	Out-of-memory caused by memcg's limit will kill tasks under
+	the memcg. When hierarchy is used, a task under hierarchy
+	will be killed by the kernel.
+
+	In this case, panic_on_oom shouldn't be invoked and tasks
+	in other groups shouldn't be killed.
+
+	It's not difficult to cause OOM under memcg as following.
+
+	Case A) when you can swapoff::
+
+		#swapoff -a
+		#echo 50M > /memory.limit_in_bytes
+
+	run 51M of malloc
+
+	Case B) when you use mem+swap limitation::
+
+		#echo 50M > memory.limit_in_bytes
+		#echo 50M > memory.memsw.limit_in_bytes
+
+	run 51M of malloc
+
+9.9 Move charges at task migration
+----------------------------------
+
+	Charges associated with a task can be moved along with task migration.
+
+	(Shell-A)::
+
+		#mkdir /cgroup/A
+		#echo $$ >/cgroup/A/tasks
+
+	run some programs which uses some amount of memory in /cgroup/A.
+
+	(Shell-B)::
+
+		#mkdir /cgroup/B
+		#echo 1 >/cgroup/B/memory.move_charge_at_immigrate
+		#echo "pid of the program running in group A" >/cgroup/B/tasks
+
+	You can see charges have been moved by reading ``*.usage_in_bytes`` or
+	memory.stat of both A and B.
+
+	See 8.2 of Documentation/admin-guide/cgroup-v1/memory.rst to see what value should
+	be written to move_charge_at_immigrate.
+
+9.10 Memory thresholds
+----------------------
+
+	Memory controller implements memory thresholds using cgroups notification
+	API. You can use tools/cgroup/cgroup_event_listener.c to test it.
+
+	(Shell-A) Create cgroup and run event listener::
+
+		# mkdir /cgroup/A
+		# ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
+
+	(Shell-B) Add task to cgroup and try to allocate and free memory::
+
+		# echo $$ >/cgroup/A/tasks
+		# a="$(dd if=/dev/zero bs=1M count=10)"
+		# a=
+
+	You will see message from cgroup_event_listener every time you cross
+	the thresholds.
+
+	Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
+
+	It's good idea to test root cgroup as well.
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
new file mode 100644
index 000000000000..41bdc038dad9
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -0,0 +1,1003 @@
+==========================
+Memory Resource Controller
+==========================
+
+NOTE:
+      This document is hopelessly outdated and it asks for a complete
+      rewrite. It still contains a useful information so we are keeping it
+      here but make sure to check the current code if you need a deeper
+      understanding.
+
+NOTE:
+      The Memory Resource Controller has generically been referred to as the
+      memory controller in this document. Do not confuse memory controller
+      used here with the memory controller that is used in hardware.
+
+(For editors) In this document:
+      When we mention a cgroup (cgroupfs's directory) with memory controller,
+      we call it "memory cgroup". When you see git-log and source code, you'll
+      see patch's title and function names tend to use "memcg".
+      In this document, we avoid using it.
+
+Benefits and Purpose of the memory controller
+=============================================
+
+The memory controller isolates the memory behaviour of a group of tasks
+from the rest of the system. The article on LWN [12] mentions some probable
+uses of the memory controller. The memory controller can be used to
+
+a. Isolate an application or a group of applications
+   Memory-hungry applications can be isolated and limited to a smaller
+   amount of memory.
+b. Create a cgroup with a limited amount of memory; this can be used
+   as a good alternative to booting with mem=XXXX.
+c. Virtualization solutions can control the amount of memory they want
+   to assign to a virtual machine instance.
+d. A CD/DVD burner could control the amount of memory used by the
+   rest of the system to ensure that burning does not fail due to lack
+   of available memory.
+e. There are several other use cases; find one or use the controller just
+   for fun (to learn and hack on the VM subsystem).
+
+Current Status: linux-2.6.34-mmotm(development version of 2010/April)
+
+Features:
+
+ - accounting anonymous pages, file caches, swap caches usage and limiting them.
+ - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
+ - optionally, memory+swap usage can be accounted and limited.
+ - hierarchical accounting
+ - soft limit
+ - moving (recharging) account at moving a task is selectable.
+ - usage threshold notifier
+ - memory pressure notifier
+ - oom-killer disable knob and oom-notifier
+ - Root cgroup has no limit controls.
+
+ Kernel memory support is a work in progress, and the current version provides
+ basically functionality. (See Section 2.7)
+
+Brief summary of control files.
+
+==================================== ==========================================
+ tasks				     attach a task(thread) and show list of
+				     threads
+ cgroup.procs			     show list of processes
+ cgroup.event_control		     an interface for event_fd()
+ memory.usage_in_bytes		     show current usage for memory
+				     (See 5.5 for details)
+ memory.memsw.usage_in_bytes	     show current usage for memory+Swap
+				     (See 5.5 for details)
+ memory.limit_in_bytes		     set/show limit of memory usage
+ memory.memsw.limit_in_bytes	     set/show limit of memory+Swap usage
+ memory.failcnt			     show the number of memory usage hits limits
+ memory.memsw.failcnt		     show the number of memory+Swap hits limits
+ memory.max_usage_in_bytes	     show max memory usage recorded
+ memory.memsw.max_usage_in_bytes     show max memory+Swap usage recorded
+ memory.soft_limit_in_bytes	     set/show soft limit of memory usage
+ memory.stat			     show various statistics
+ memory.use_hierarchy		     set/show hierarchical account enabled
+ memory.force_empty		     trigger forced page reclaim
+ memory.pressure_level		     set memory pressure notifications
+ memory.swappiness		     set/show swappiness parameter of vmscan
+				     (See sysctl's vm.swappiness)
+ memory.move_charge_at_immigrate     set/show controls of moving charges
+ memory.oom_control		     set/show oom controls.
+ memory.numa_stat		     show the number of memory usage per numa
+				     node
+
+ memory.kmem.limit_in_bytes          set/show hard limit for kernel memory
+ memory.kmem.usage_in_bytes          show current kernel memory allocation
+ memory.kmem.failcnt                 show the number of kernel memory usage
+				     hits limits
+ memory.kmem.max_usage_in_bytes      show max kernel memory usage recorded
+
+ memory.kmem.tcp.limit_in_bytes      set/show hard limit for tcp buf memory
+ memory.kmem.tcp.usage_in_bytes      show current tcp buf memory allocation
+ memory.kmem.tcp.failcnt             show the number of tcp buf memory usage
+				     hits limits
+ memory.kmem.tcp.max_usage_in_bytes  show max tcp buf memory usage recorded
+==================================== ==========================================
+
+1. History
+==========
+
+The memory controller has a long history. A request for comments for the memory
+controller was posted by Balbir Singh [1]. At the time the RFC was posted
+there were several implementations for memory control. The goal of the
+RFC was to build consensus and agreement for the minimal features required
+for memory control. The first RSS controller was posted by Balbir Singh[2]
+in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
+RSS controller. At OLS, at the resource management BoF, everyone suggested
+that we handle both page cache and RSS together. Another request was raised
+to allow user space handling of OOM. The current memory controller is
+at version 6; it combines both mapped (RSS) and unmapped Page
+Cache Control [11].
+
+2. Memory Control
+=================
+
+Memory is a unique resource in the sense that it is present in a limited
+amount. If a task requires a lot of CPU processing, the task can spread
+its processing over a period of hours, days, months or years, but with
+memory, the same physical memory needs to be reused to accomplish the task.
+
+The memory controller implementation has been divided into phases. These
+are:
+
+1. Memory controller
+2. mlock(2) controller
+3. Kernel user memory accounting and slab control
+4. user mappings length controller
+
+The memory controller is the first controller developed.
+
+2.1. Design
+-----------
+
+The core of the design is a counter called the page_counter. The
+page_counter tracks the current memory usage and limit of the group of
+processes associated with the controller. Each cgroup has a memory controller
+specific data structure (mem_cgroup) associated with it.
+
+2.2. Accounting
+---------------
+
+::
+
+		+--------------------+
+		|  mem_cgroup        |
+		|  (page_counter)    |
+		+--------------------+
+		 /            ^      \
+		/             |       \
+           +---------------+  |        +---------------+
+           | mm_struct     |  |....    | mm_struct     |
+           |               |  |        |               |
+           +---------------+  |        +---------------+
+                              |
+                              + --------------+
+                                              |
+           +---------------+           +------+--------+
+           | page          +---------->  page_cgroup|
+           |               |           |               |
+           +---------------+           +---------------+
+
+             (Figure 1: Hierarchy of Accounting)
+
+
+Figure 1 shows the important aspects of the controller
+
+1. Accounting happens per cgroup
+2. Each mm_struct knows about which cgroup it belongs to
+3. Each page has a pointer to the page_cgroup, which in turn knows the
+   cgroup it belongs to
+
+The accounting is done as follows: mem_cgroup_charge_common() is invoked to
+set up the necessary data structures and check if the cgroup that is being
+charged is over its limit. If it is, then reclaim is invoked on the cgroup.
+More details can be found in the reclaim section of this document.
+If everything goes well, a page meta-data-structure called page_cgroup is
+updated. page_cgroup has its own LRU on cgroup.
+(*) page_cgroup structure is allocated at boot/memory-hotplug time.
+
+2.2.1 Accounting details
+------------------------
+
+All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
+Some pages which are never reclaimable and will not be on the LRU
+are not accounted. We just account pages under usual VM management.
+
+RSS pages are accounted at page_fault unless they've already been accounted
+for earlier. A file page will be accounted for as Page Cache when it's
+inserted into inode (radix-tree). While it's mapped into the page tables of
+processes, duplicate accounting is carefully avoided.
+
+An RSS page is unaccounted when it's fully unmapped. A PageCache page is
+unaccounted when it's removed from radix-tree. Even if RSS pages are fully
+unmapped (by kswapd), they may exist as SwapCache in the system until they
+are really freed. Such SwapCaches are also accounted.
+A swapped-in page is not accounted until it's mapped.
+
+Note: The kernel does swapin-readahead and reads multiple swaps at once.
+This means swapped-in pages may contain pages for other tasks than a task
+causing page fault. So, we avoid accounting at swap-in I/O.
+
+At page migration, accounting information is kept.
+
+Note: we just account pages-on-LRU because our purpose is to control amount
+of used pages; not-on-LRU pages tend to be out-of-control from VM view.
+
+2.3 Shared Page Accounting
+--------------------------
+
+Shared pages are accounted on the basis of the first touch approach. The
+cgroup that first touches a page is accounted for the page. The principle
+behind this approach is that a cgroup that aggressively uses a shared
+page will eventually get charged for it (once it is uncharged from
+the cgroup that brought it in -- this will happen on memory pressure).
+
+But see section 8.2: when moving a task to another cgroup, its pages may
+be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
+
+Exception: If CONFIG_MEMCG_SWAP is not used.
+When you do swapoff and make swapped-out pages of shmem(tmpfs) to
+be backed into memory in force, charges for pages are accounted against the
+caller of swapoff rather than the users of shmem.
+
+2.4 Swap Extension (CONFIG_MEMCG_SWAP)
+--------------------------------------
+
+Swap Extension allows you to record charge for swap. A swapped-in page is
+charged back to original page allocator if possible.
+
+When swap is accounted, following files are added.
+
+ - memory.memsw.usage_in_bytes.
+ - memory.memsw.limit_in_bytes.
+
+memsw means memory+swap. Usage of memory+swap is limited by
+memsw.limit_in_bytes.
+
+Example: Assume a system with 4G of swap. A task which allocates 6G of memory
+(by mistake) under 2G memory limitation will use all swap.
+In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
+By using the memsw limit, you can avoid system OOM which can be caused by swap
+shortage.
+
+**why 'memory+swap' rather than swap**
+
+The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
+to move account from memory to swap...there is no change in usage of
+memory+swap. In other words, when we want to limit the usage of swap without
+affecting global LRU, memory+swap limit is better than just limiting swap from
+an OS point of view.
+
+**What happens when a cgroup hits memory.memsw.limit_in_bytes**
+
+When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
+in this cgroup. Then, swap-out will not be done by cgroup routine and file
+caches are dropped. But as mentioned above, global LRU can do swapout memory
+from it for sanity of the system's memory management state. You can't forbid
+it by cgroup.
+
+2.5 Reclaim
+-----------
+
+Each cgroup maintains a per cgroup LRU which has the same structure as
+global VM. When a cgroup goes over its limit, we first try
+to reclaim memory from the cgroup so as to make space for the new
+pages that the cgroup has touched. If the reclaim is unsuccessful,
+an OOM routine is invoked to select and kill the bulkiest task in the
+cgroup. (See 10. OOM Control below.)
+
+The reclaim algorithm has not been modified for cgroups, except that
+pages that are selected for reclaiming come from the per-cgroup LRU
+list.
+
+NOTE:
+  Reclaim does not work for the root cgroup, since we cannot set any
+  limits on the root cgroup.
+
+Note2:
+  When panic_on_oom is set to "2", the whole system will panic.
+
+When oom event notifier is registered, event will be delivered.
+(See oom_control section)
+
+2.6 Locking
+-----------
+
+   lock_page_cgroup()/unlock_page_cgroup() should not be called under
+   the i_pages lock.
+
+   Other lock order is following:
+
+   PG_locked.
+     mm->page_table_lock
+         pgdat->lru_lock
+	   lock_page_cgroup.
+
+  In many cases, just lock_page_cgroup() is called.
+
+  per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
+  pgdat->lru_lock, it has no lock of its own.
+
+2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
+-----------------------------------------------
+
+With the Kernel memory extension, the Memory Controller is able to limit
+the amount of kernel memory used by the system. Kernel memory is fundamentally
+different than user memory, since it can't be swapped out, which makes it
+possible to DoS the system by consuming too much of this precious resource.
+
+Kernel memory accounting is enabled for all memory cgroups by default. But
+it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel
+at boot time. In this case, kernel memory will not be accounted at all.
+
+Kernel memory limits are not imposed for the root cgroup. Usage for the root
+cgroup may or may not be accounted. The memory used is accumulated into
+memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
+(currently only for tcp).
+
+The main "kmem" counter is fed into the main counter, so kmem charges will
+also be visible from the user counter.
+
+Currently no soft limit is implemented for kernel memory. It is future work
+to trigger slab reclaim when those limits are reached.
+
+2.7.1 Current Kernel Memory resources accounted
+-----------------------------------------------
+
+stack pages:
+  every process consumes some stack pages. By accounting into
+  kernel memory, we prevent new processes from being created when the kernel
+  memory usage is too high.
+
+slab pages:
+  pages allocated by the SLAB or SLUB allocator are tracked. A copy
+  of each kmem_cache is created every time the cache is touched by the first time
+  from inside the memcg. The creation is done lazily, so some objects can still be
+  skipped while the cache is being created. All objects in a slab page should
+  belong to the same memcg. This only fails to hold when a task is migrated to a
+  different memcg during the page allocation by the cache.
+
+sockets memory pressure:
+  some sockets protocols have memory pressure
+  thresholds. The Memory Controller allows them to be controlled individually
+  per cgroup, instead of globally.
+
+tcp memory pressure:
+  sockets memory pressure for the tcp protocol.
+
+2.7.2 Common use cases
+----------------------
+
+Because the "kmem" counter is fed to the main user counter, kernel memory can
+never be limited completely independently of user memory. Say "U" is the user
+limit, and "K" the kernel limit. There are three possible ways limits can be
+set:
+
+U != 0, K = unlimited:
+    This is the standard memcg limitation mechanism already present before kmem
+    accounting. Kernel memory is completely ignored.
+
+U != 0, K < U:
+    Kernel memory is a subset of the user memory. This setup is useful in
+    deployments where the total amount of memory per-cgroup is overcommited.
+    Overcommiting kernel memory limits is definitely not recommended, since the
+    box can still run out of non-reclaimable memory.
+    In this case, the admin could set up K so that the sum of all groups is
+    never greater than the total memory, and freely set U at the cost of his
+    QoS.
+
+WARNING:
+    In the current implementation, memory reclaim will NOT be
+    triggered for a cgroup when it hits K while staying below U, which makes
+    this setup impractical.
+
+U != 0, K >= U:
+    Since kmem charges will also be fed to the user counter and reclaim will be
+    triggered for the cgroup for both kinds of memory. This setup gives the
+    admin a unified view of memory, and it is also useful for people who just
+    want to track kernel memory usage.
+
+3. User Interface
+=================
+
+3.0. Configuration
+------------------
+
+a. Enable CONFIG_CGROUPS
+b. Enable CONFIG_MEMCG
+c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
+d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
+
+3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
+-------------------------------------------------------------------
+
+::
+
+	# mount -t tmpfs none /sys/fs/cgroup
+	# mkdir /sys/fs/cgroup/memory
+	# mount -t cgroup none /sys/fs/cgroup/memory -o memory
+
+3.2. Make the new group and move bash into it::
+
+	# mkdir /sys/fs/cgroup/memory/0
+	# echo $$ > /sys/fs/cgroup/memory/0/tasks
+
+Since now we're in the 0 cgroup, we can alter the memory limit::
+
+	# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+
+NOTE:
+  We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
+  mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
+  Gibibytes.)
+
+NOTE:
+  We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``.
+
+NOTE:
+  We cannot set limits on the root cgroup any more.
+
+::
+
+  # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+  4194304
+
+We can check the usage::
+
+  # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
+  1216512
+
+A successful write to this file does not guarantee a successful setting of
+this limit to the value written into the file. This can be due to a
+number of factors, such as rounding up to page boundaries or the total
+availability of memory on the system. The user is required to re-read
+this file after a write to guarantee the value committed by the kernel::
+
+  # echo 1 > memory.limit_in_bytes
+  # cat memory.limit_in_bytes
+  4096
+
+The memory.failcnt field gives the number of times that the cgroup limit was
+exceeded.
+
+The memory.stat file gives accounting information. Now, the number of
+caches, RSS and Active pages/Inactive pages are shown.
+
+4. Testing
+==========
+
+For testing features and implementation, see memcg_test.txt.
+
+Performance test is also important. To see pure memory controller's overhead,
+testing on tmpfs will give you good numbers of small overheads.
+Example: do kernel make on tmpfs.
+
+Page-fault scalability is also important. At measuring parallel
+page fault test, multi-process test may be better than multi-thread
+test because it has noise of shared objects/status.
+
+But the above two are testing extreme situations.
+Trying usual test under memory controller is always helpful.
+
+4.1 Troubleshooting
+-------------------
+
+Sometimes a user might find that the application under a cgroup is
+terminated by the OOM killer. There are several causes for this:
+
+1. The cgroup limit is too low (just too low to do anything useful)
+2. The user is using anonymous memory and swap is turned off or too low
+
+A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
+some of the pages cached in the cgroup (page cache pages).
+
+To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
+seeing what happens will be helpful.
+
+4.2 Task migration
+------------------
+
+When a task migrates from one cgroup to another, its charge is not
+carried forward by default. The pages allocated from the original cgroup still
+remain charged to it, the charge is dropped when the page is freed or
+reclaimed.
+
+You can move charges of a task along with task migration.
+See 8. "Move charges at task migration"
+
+4.3 Removing a cgroup
+---------------------
+
+A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
+cgroup might have some charge associated with it, even though all
+tasks have migrated away from it. (because we charge against pages, not
+against tasks.)
+
+We move the stats to root (if use_hierarchy==0) or parent (if
+use_hierarchy==1), and no change on the charge except uncharging
+from the child.
+
+Charges recorded in swap information is not updated at removal of cgroup.
+Recorded information is discarded and a cgroup which uses swap (swapcache)
+will be charged as a new owner of it.
+
+About use_hierarchy, see Section 6.
+
+5. Misc. interfaces
+===================
+
+5.1 force_empty
+---------------
+  memory.force_empty interface is provided to make cgroup's memory usage empty.
+  When writing anything to this::
+
+    # echo 0 > memory.force_empty
+
+  the cgroup will be reclaimed and as many pages reclaimed as possible.
+
+  The typical use case for this interface is before calling rmdir().
+  Though rmdir() offlines memcg, but the memcg may still stay there due to
+  charged file caches. Some out-of-use page caches may keep charged until
+  memory pressure happens. If you want to avoid that, force_empty will be useful.
+
+  Also, note that when memory.kmem.limit_in_bytes is set the charges due to
+  kernel pages will still be seen. This is not considered a failure and the
+  write will still return success. In this case, it is expected that
+  memory.kmem.usage_in_bytes == memory.usage_in_bytes.
+
+  About use_hierarchy, see Section 6.
+
+5.2 stat file
+-------------
+
+memory.stat file includes following statistics
+
+per-memory cgroup local status
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+=============== ===============================================================
+cache		# of bytes of page cache memory.
+rss		# of bytes of anonymous and swap cache memory (includes
+		transparent hugepages).
+rss_huge	# of bytes of anonymous transparent hugepages.
+mapped_file	# of bytes of mapped file (includes tmpfs/shmem)
+pgpgin		# of charging events to the memory cgroup. The charging
+		event happens each time a page is accounted as either mapped
+		anon page(RSS) or cache page(Page Cache) to the cgroup.
+pgpgout		# of uncharging events to the memory cgroup. The uncharging
+		event happens each time a page is unaccounted from the cgroup.
+swap		# of bytes of swap usage
+dirty		# of bytes that are waiting to get written back to the disk.
+writeback	# of bytes of file/anon cache that are queued for syncing to
+		disk.
+inactive_anon	# of bytes of anonymous and swap cache memory on inactive
+		LRU list.
+active_anon	# of bytes of anonymous and swap cache memory on active
+		LRU list.
+inactive_file	# of bytes of file-backed memory on inactive LRU list.
+active_file	# of bytes of file-backed memory on active LRU list.
+unevictable	# of bytes of memory that cannot be reclaimed (mlocked etc).
+=============== ===============================================================
+
+status considering hierarchy (see memory.use_hierarchy settings)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+========================= ===================================================
+hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy
+			  under which the memory cgroup is
+hierarchical_memsw_limit  # of bytes of memory+swap limit with regard to
+			  hierarchy under which memory cgroup is.
+
+total_<counter>		  # hierarchical version of <counter>, which in
+			  addition to the cgroup's own value includes the
+			  sum of all hierarchical children's values of
+			  <counter>, i.e. total_cache
+========================= ===================================================
+
+The following additional stats are dependent on CONFIG_DEBUG_VM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+========================= ========================================
+recent_rotated_anon	  VM internal parameter. (see mm/vmscan.c)
+recent_rotated_file	  VM internal parameter. (see mm/vmscan.c)
+recent_scanned_anon	  VM internal parameter. (see mm/vmscan.c)
+recent_scanned_file	  VM internal parameter. (see mm/vmscan.c)
+========================= ========================================
+
+Memo:
+	recent_rotated means recent frequency of LRU rotation.
+	recent_scanned means recent # of scans to LRU.
+	showing for better debug please see the code for meanings.
+
+Note:
+	Only anonymous and swap cache memory is listed as part of 'rss' stat.
+	This should not be confused with the true 'resident set size' or the
+	amount of physical memory used by the cgroup.
+
+	'rss + mapped_file" will give you resident set size of cgroup.
+
+	(Note: file and shmem may be shared among other cgroups. In that case,
+	mapped_file is accounted only when the memory cgroup is owner of page
+	cache.)
+
+5.3 swappiness
+--------------
+
+Overrides /proc/sys/vm/swappiness for the particular group. The tunable
+in the root cgroup corresponds to the global swappiness setting.
+
+Please note that unlike during the global reclaim, limit reclaim
+enforces that 0 swappiness really prevents from any swapping even if
+there is a swap storage available. This might lead to memcg OOM killer
+if there are no file pages to reclaim.
+
+5.4 failcnt
+-----------
+
+A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
+This failcnt(== failure count) shows the number of times that a usage counter
+hit its limit. When a memory cgroup hits a limit, failcnt increases and
+memory under it will be reclaimed.
+
+You can reset failcnt by writing 0 to failcnt file::
+
+	# echo 0 > .../memory.failcnt
+
+5.5 usage_in_bytes
+------------------
+
+For efficiency, as other kernel components, memory cgroup uses some optimization
+to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
+method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz
+value for efficient access. (Of course, when necessary, it's synchronized.)
+If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
+value in memory.stat(see 5.2).
+
+5.6 numa_stat
+-------------
+
+This is similar to numa_maps but operates on a per-memcg basis.  This is
+useful for providing visibility into the numa locality information within
+an memcg since the pages are allowed to be allocated from any physical
+node.  One of the use cases is evaluating application performance by
+combining this information with the application's CPU allocation.
+
+Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
+per-node page counts including "hierarchical_<counter>" which sums up all
+hierarchical children's values in addition to the memcg's own value.
+
+The output format of memory.numa_stat is::
+
+  total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
+
+The "total" count is sum of file + anon + unevictable.
+
+6. Hierarchy support
+====================
+
+The memory controller supports a deep hierarchy and hierarchical accounting.
+The hierarchy is created by creating the appropriate cgroups in the
+cgroup filesystem. Consider for example, the following cgroup filesystem
+hierarchy::
+
+	       root
+	     /  |   \
+            /	|    \
+	   a	b     c
+		      | \
+		      |  \
+		      d   e
+
+In the diagram above, with hierarchical accounting enabled, all memory
+usage of e, is accounted to its ancestors up until the root (i.e, c and root),
+that has memory.use_hierarchy enabled. If one of the ancestors goes over its
+limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
+children of the ancestor.
+
+6.1 Enabling hierarchical accounting and reclaim
+------------------------------------------------
+
+A memory cgroup by default disables the hierarchy feature. Support
+can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup::
+
+	# echo 1 > memory.use_hierarchy
+
+The feature can be disabled by::
+
+	# echo 0 > memory.use_hierarchy
+
+NOTE1:
+       Enabling/disabling will fail if either the cgroup already has other
+       cgroups created below it, or if the parent cgroup has use_hierarchy
+       enabled.
+
+NOTE2:
+       When panic_on_oom is set to "2", the whole system will panic in
+       case of an OOM event in any cgroup.
+
+7. Soft limits
+==============
+
+Soft limits allow for greater sharing of memory. The idea behind soft limits
+is to allow control groups to use as much of the memory as needed, provided
+
+a. There is no memory contention
+b. They do not exceed their hard limit
+
+When the system detects memory contention or low memory, control groups
+are pushed back to their soft limits. If the soft limit of each control
+group is very high, they are pushed back as much as possible to make
+sure that one control group does not starve the others of memory.
+
+Please note that soft limits is a best-effort feature; it comes with
+no guarantees, but it does its best to make sure that when memory is
+heavily contended for, memory is allocated based on the soft limit
+hints/setup. Currently soft limit based reclaim is set up such that
+it gets invoked from balance_pgdat (kswapd).
+
+7.1 Interface
+-------------
+
+Soft limits can be setup by using the following commands (in this example we
+assume a soft limit of 256 MiB)::
+
+	# echo 256M > memory.soft_limit_in_bytes
+
+If we want to change this to 1G, we can at any time use::
+
+	# echo 1G > memory.soft_limit_in_bytes
+
+NOTE1:
+       Soft limits take effect over a long period of time, since they involve
+       reclaiming memory for balancing between memory cgroups
+NOTE2:
+       It is recommended to set the soft limit always below the hard limit,
+       otherwise the hard limit will take precedence.
+
+8. Move charges at task migration
+=================================
+
+Users can move charges associated with a task along with task migration, that
+is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
+This feature is not supported in !CONFIG_MMU environments because of lack of
+page tables.
+
+8.1 Interface
+-------------
+
+This feature is disabled by default. It can be enabled (and disabled again) by
+writing to memory.move_charge_at_immigrate of the destination cgroup.
+
+If you want to enable it::
+
+	# echo (some positive value) > memory.move_charge_at_immigrate
+
+Note:
+      Each bits of move_charge_at_immigrate has its own meaning about what type
+      of charges should be moved. See 8.2 for details.
+Note:
+      Charges are moved only when you move mm->owner, in other words,
+      a leader of a thread group.
+Note:
+      If we cannot find enough space for the task in the destination cgroup, we
+      try to make space by reclaiming memory. Task migration may fail if we
+      cannot make enough space.
+Note:
+      It can take several seconds if you move charges much.
+
+And if you want disable it again::
+
+	# echo 0 > memory.move_charge_at_immigrate
+
+8.2 Type of charges which can be moved
+--------------------------------------
+
+Each bit in move_charge_at_immigrate has its own meaning about what type of
+charges should be moved. But in any case, it must be noted that an account of
+a page or a swap can be moved only when it is charged to the task's current
+(old) memory cgroup.
+
++---+--------------------------------------------------------------------------+
+|bit| what type of charges would be moved ?                                    |
++===+==========================================================================+
+| 0 | A charge of an anonymous page (or swap of it) used by the target task.   |
+|   | You must enable Swap Extension (see 2.4) to enable move of swap charges. |
++---+--------------------------------------------------------------------------+
+| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) |
+|   | and swaps of tmpfs file) mmapped by the target task. Unlike the case of  |
+|   | anonymous pages, file pages (and swaps) in the range mmapped by the task |
+|   | will be moved even if the task hasn't done page fault, i.e. they might   |
+|   | not be the task's "RSS", but other task's "RSS" that maps the same file. |
+|   | And mapcount of the page is ignored (the page can be moved even if       |
+|   | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to    |
+|   | enable move of swap charges.                                             |
++---+--------------------------------------------------------------------------+
+
+8.3 TODO
+--------
+
+- All of moving charge operations are done under cgroup_mutex. It's not good
+  behavior to hold the mutex too long, so we may need some trick.
+
+9. Memory thresholds
+====================
+
+Memory cgroup implements memory thresholds using the cgroups notification
+API (see cgroups.txt). It allows to register multiple memory and memsw
+thresholds and gets notifications when it crosses.
+
+To register a threshold, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
+- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
+  cgroup.event_control.
+
+Application will be notified through eventfd when memory usage crosses
+threshold in any direction.
+
+It's applicable for root and non-root cgroup.
+
+10. OOM Control
+===============
+
+memory.oom_control file is for OOM notification and other controls.
+
+Memory cgroup implements OOM notifier using the cgroup notification
+API (See cgroups.txt). It allows to register multiple OOM notification
+delivery and gets notification when OOM happens.
+
+To register a notifier, an application must:
+
+ - create an eventfd using eventfd(2)
+ - open memory.oom_control file
+ - write string like "<event_fd> <fd of memory.oom_control>" to
+   cgroup.event_control
+
+The application will be notified through eventfd when OOM happens.
+OOM notification doesn't work for the root cgroup.
+
+You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
+
+	#echo 1 > memory.oom_control
+
+If OOM-killer is disabled, tasks under cgroup will hang/sleep
+in memory cgroup's OOM-waitqueue when they request accountable memory.
+
+For running them, you have to relax the memory cgroup's OOM status by
+
+	* enlarge limit or reduce usage.
+
+To reduce usage,
+
+	* kill some tasks.
+	* move some tasks to other group with account migration.
+	* remove some files (on tmpfs?)
+
+Then, stopped tasks will work again.
+
+At reading, current status of OOM is shown.
+
+	- oom_kill_disable 0 or 1
+	  (if 1, oom-killer is disabled)
+	- under_oom	   0 or 1
+	  (if 1, the memory cgroup is under OOM, tasks may be stopped.)
+
+11. Memory Pressure
+===================
+
+The pressure level notifications can be used to monitor the memory
+allocation cost; based on the pressure, applications can implement
+different strategies of managing their memory resources. The pressure
+levels are defined as following:
+
+The "low" level means that the system is reclaiming memory for new
+allocations. Monitoring this reclaiming activity might be useful for
+maintaining cache level. Upon notification, the program (typically
+"Activity Manager") might analyze vmstat and act in advance (i.e.
+prematurely shutdown unimportant services).
+
+The "medium" level means that the system is experiencing medium memory
+pressure, the system might be making swap, paging out active file caches,
+etc. Upon this event applications may decide to further analyze
+vmstat/zoneinfo/memcg or internal memory usage statistics and free any
+resources that can be easily reconstructed or re-read from a disk.
+
+The "critical" level means that the system is actively thrashing, it is
+about to out of memory (OOM) or even the in-kernel OOM killer is on its
+way to trigger. Applications should do whatever they can to help the
+system. It might be too late to consult with vmstat or any other
+statistics, so it's advisable to take an immediate action.
+
+By default, events are propagated upward until the event is handled, i.e. the
+events are not pass-through. For example, you have three cgroups: A->B->C. Now
+you set up an event listener on cgroups A, B and C, and suppose group C
+experiences some pressure. In this situation, only group C will receive the
+notification, i.e. groups A and B will not receive it. This is done to avoid
+excessive "broadcasting" of messages, which disturbs the system and which is
+especially bad if we are low on memory or thrashing. Group B, will receive
+notification only if there are no event listers for group C.
+
+There are three optional modes that specify different propagation behavior:
+
+ - "default": this is the default behavior specified above. This mode is the
+   same as omitting the optional mode parameter, preserved by backwards
+   compatibility.
+
+ - "hierarchy": events always propagate up to the root, similar to the default
+   behavior, except that propagation continues regardless of whether there are
+   event listeners at each level, with the "hierarchy" mode. In the above
+   example, groups A, B, and C will receive notification of memory pressure.
+
+ - "local": events are pass-through, i.e. they only receive notifications when
+   memory pressure is experienced in the memcg for which the notification is
+   registered. In the above example, group C will receive notification if
+   registered for "local" notification and the group experiences memory
+   pressure. However, group B will never receive notification, regardless if
+   there is an event listener for group C or not, if group B is registered for
+   local notification.
+
+The level and event notification mode ("hierarchy" or "local", if necessary) are
+specified by a comma-delimited string, i.e. "low,hierarchy" specifies
+hierarchical, pass-through, notification for all ancestor memcgs. Notification
+that is the default, non pass-through behavior, does not specify a mode.
+"medium,local" specifies pass-through notification for the medium level.
+
+The file memory.pressure_level is only used to setup an eventfd. To
+register a notification, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.pressure_level;
+- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
+  to cgroup.event_control.
+
+Application will be notified through eventfd when memory pressure is at
+the specific level (or higher). Read/write operations to
+memory.pressure_level are no implemented.
+
+Test:
+
+   Here is a small script example that makes a new cgroup, sets up a
+   memory limit, sets up a notification in the cgroup and then makes child
+   cgroup experience a critical pressure::
+
+	# cd /sys/fs/cgroup/memory/
+	# mkdir foo
+	# cd foo
+	# cgroup_event_listener memory.pressure_level low,hierarchy &
+	# echo 8000000 > memory.limit_in_bytes
+	# echo 8000000 > memory.memsw.limit_in_bytes
+	# echo $$ > tasks
+	# dd if=/dev/zero | read x
+
+   (Expect a bunch of notifications, and eventually, the oom-killer will
+   trigger.)
+
+12. TODO
+========
+
+1. Make per-cgroup scanner reclaim not-shared pages first
+2. Teach controller to account for shared-pages
+3. Start reclamation in the background when the limit is
+   not yet hit but the usage is getting closer
+
+Summary
+=======
+
+Overall, the memory controller has been a stable controller and has been
+commented and discussed quite extensively in the community.
+
+References
+==========
+
+1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
+2. Singh, Balbir. Memory Controller (RSS Control),
+   http://lwn.net/Articles/222762/
+3. Emelianov, Pavel. Resource controllers based on process cgroups
+   http://lkml.org/lkml/2007/3/6/198
+4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
+   http://lkml.org/lkml/2007/4/9/78
+5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
+   http://lkml.org/lkml/2007/5/30/244
+6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
+7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
+   subsystem (v3), http://lwn.net/Articles/235534/
+8. Singh, Balbir. RSS controller v2 test results (lmbench),
+   http://lkml.org/lkml/2007/5/17/232
+9. Singh, Balbir. RSS controller v2 AIM9 results
+   http://lkml.org/lkml/2007/5/18/1
+10. Singh, Balbir. Memory controller v6 test results,
+    http://lkml.org/lkml/2007/8/19/36
+11. Singh, Balbir. Memory controller introduction (v6),
+    http://lkml.org/lkml/2007/8/17/69
+12. Corbet, Jonathan, Controlling memory use in cgroups,
+    http://lwn.net/Articles/243795/
diff --git a/Documentation/admin-guide/cgroup-v1/net_cls.rst b/Documentation/admin-guide/cgroup-v1/net_cls.rst
new file mode 100644
index 000000000000..a2cf272af7a0
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/net_cls.rst
@@ -0,0 +1,44 @@
+=========================
+Network classifier cgroup
+=========================
+
+The Network classifier cgroup provides an interface to
+tag network packets with a class identifier (classid).
+
+The Traffic Controller (tc) can be used to assign
+different priorities to packets from different cgroups.
+Also, Netfilter (iptables) can use this tag to perform
+actions on such packets.
+
+Creating a net_cls cgroups instance creates a net_cls.classid file.
+This net_cls.classid value is initialized to 0.
+
+You can write hexadecimal values to net_cls.classid; the format for these
+values is 0xAAAABBBB; AAAA is the major handle number and BBBB
+is the minor handle number.
+Reading net_cls.classid yields a decimal result.
+
+Example::
+
+	mkdir /sys/fs/cgroup/net_cls
+	mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
+	mkdir /sys/fs/cgroup/net_cls/0
+	echo 0x100001 >  /sys/fs/cgroup/net_cls/0/net_cls.classid
+
+- setting a 10:1 handle::
+
+	cat /sys/fs/cgroup/net_cls/0/net_cls.classid
+	1048577
+
+- configuring tc::
+
+	tc qdisc add dev eth0 root handle 10: htb
+	tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
+
+- creating traffic class 10:1::
+
+	tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
+
+configuring iptables, basic example::
+
+	iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
diff --git a/Documentation/admin-guide/cgroup-v1/net_prio.rst b/Documentation/admin-guide/cgroup-v1/net_prio.rst
new file mode 100644
index 000000000000..b40905871c64
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/net_prio.rst
@@ -0,0 +1,57 @@
+=======================
+Network priority cgroup
+=======================
+
+The Network priority cgroup provides an interface to allow an administrator to
+dynamically set the priority of network traffic generated by various
+applications
+
+Nominally, an application would set the priority of its traffic via the
+SO_PRIORITY socket option.  This however, is not always possible because:
+
+1) The application may not have been coded to set this value
+2) The priority of application traffic is often a site-specific administrative
+   decision rather than an application defined one.
+
+This cgroup allows an administrator to assign a process to a group which defines
+the priority of egress traffic on a given interface. Network priority groups can
+be created by first mounting the cgroup filesystem::
+
+	# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
+
+With the above step, the initial group acting as the parent accounting group
+becomes visible at '/sys/fs/cgroup/net_prio'.  This group includes all tasks in
+the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
+
+Each net_prio cgroup contains two files that are subsystem specific
+
+net_prio.prioidx
+  This file is read-only, and is simply informative.  It contains a unique
+  integer value that the kernel uses as an internal representation of this
+  cgroup.
+
+net_prio.ifpriomap
+  This file contains a map of the priorities assigned to traffic originating
+  from processes in this group and egressing the system on various interfaces.
+  It contains a list of tuples in the form <ifname priority>.  Contents of this
+  file can be modified by echoing a string into the file using the same tuple
+  format. For example::
+
+	echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap
+
+This command would force any traffic originating from processes belonging to the
+iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
+said traffic set to the value 5. The parent accounting group also has a
+writeable 'net_prio.ifpriomap' file that can be used to set a system default
+priority.
+
+Priorities are set immediately prior to queueing a frame to the device
+queueing discipline (qdisc) so priorities will be assigned prior to the hardware
+queue selection being made.
+
+One usage for the net_prio cgroup is with mqprio qdisc allowing application
+traffic to be steered to hardware/driver based traffic classes. These mappings
+can then be managed by administrators or other networking protocols such as
+DCBX.
+
+A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/admin-guide/cgroup-v1/pids.rst b/Documentation/admin-guide/cgroup-v1/pids.rst
new file mode 100644
index 000000000000..6acebd9e72c8
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/pids.rst
@@ -0,0 +1,92 @@
+=========================
+Process Number Controller
+=========================
+
+Abstract
+--------
+
+The process number controller is used to allow a cgroup hierarchy to stop any
+new tasks from being fork()'d or clone()'d after a certain limit is reached.
+
+Since it is trivial to hit the task limit without hitting any kmemcg limits in
+place, PIDs are a fundamental resource. As such, PID exhaustion must be
+preventable in the scope of a cgroup hierarchy by allowing resource limiting of
+the number of tasks in a cgroup.
+
+Usage
+-----
+
+In order to use the `pids` controller, set the maximum number of tasks in
+pids.max (this is not available in the root cgroup for obvious reasons). The
+number of processes currently in the cgroup is given by pids.current.
+
+Organisational operations are not blocked by cgroup policies, so it is possible
+to have pids.current > pids.max. This can be done by either setting the limit to
+be smaller than pids.current, or attaching enough processes to the cgroup such
+that pids.current > pids.max. However, it is not possible to violate a cgroup
+policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
+creation of a new process would cause a cgroup policy to be violated.
+
+To set a cgroup to have no limit, set pids.max to "max". This is the default for
+all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
+limit in the hierarchy is followed).
+
+pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
+superset of parent/child/pids.current.
+
+The pids.events file contains event counters:
+
+  - max: Number of times fork failed because limit was hit.
+
+Example
+-------
+
+First, we mount the pids controller::
+
+	# mkdir -p /sys/fs/cgroup/pids
+	# mount -t cgroup -o pids none /sys/fs/cgroup/pids
+
+Then we create a hierarchy, set limits and attach processes to it::
+
+	# mkdir -p /sys/fs/cgroup/pids/parent/child
+	# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
+	# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
+	# cat /sys/fs/cgroup/pids/parent/pids.current
+	2
+	#
+
+It should be noted that attempts to overcome the set limit (2 in this case) will
+fail::
+
+	# cat /sys/fs/cgroup/pids/parent/pids.current
+	2
+	# ( /bin/echo "Here's some processes for you." | cat )
+	sh: fork: Resource temporary unavailable
+	#
+
+Even if we migrate to a child cgroup (which doesn't have a set limit), we will
+not be able to overcome the most stringent limit in the hierarchy (in this case,
+parent's)::
+
+	# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
+	# cat /sys/fs/cgroup/pids/parent/pids.current
+	2
+	# cat /sys/fs/cgroup/pids/parent/child/pids.current
+	2
+	# cat /sys/fs/cgroup/pids/parent/child/pids.max
+	max
+	# ( /bin/echo "Here's some processes for you." | cat )
+	sh: fork: Resource temporary unavailable
+	#
+
+We can set a limit that is smaller than pids.current, which will stop any new
+processes from being forked at all (note that the shell itself counts towards
+pids.current)::
+
+	# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
+	# /bin/echo "We can't even spawn a single process now."
+	sh: fork: Resource temporary unavailable
+	# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
+	# /bin/echo "We can't even spawn a single process now."
+	sh: fork: Resource temporary unavailable
+	#
diff --git a/Documentation/admin-guide/cgroup-v1/rdma.rst b/Documentation/admin-guide/cgroup-v1/rdma.rst
new file mode 100644
index 000000000000..2fcb0a9bf790
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/rdma.rst
@@ -0,0 +1,117 @@
+===============
+RDMA Controller
+===============
+
+.. Contents
+
+   1. Overview
+     1-1. What is RDMA controller?
+     1-2. Why RDMA controller needed?
+     1-3. How is RDMA controller implemented?
+   2. Usage Examples
+
+1. Overview
+===========
+
+1-1. What is RDMA controller?
+-----------------------------
+
+RDMA controller allows user to limit RDMA/IB specific resources that a given
+set of processes can use. These processes are grouped using RDMA controller.
+
+RDMA controller defines two resources which can be limited for processes of a
+cgroup.
+
+1-2. Why RDMA controller needed?
+--------------------------------
+
+Currently user space applications can easily take away all the rdma verb
+specific resources such as AH, CQ, QP, MR etc. Due to which other applications
+in other cgroup or kernel space ULPs may not even get chance to allocate any
+rdma resources. This can lead to service unavailability.
+
+Therefore RDMA controller is needed through which resource consumption
+of processes can be limited. Through this controller different rdma
+resources can be accounted.
+
+1-3. How is RDMA controller implemented?
+----------------------------------------
+
+RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
+resource accounting per cgroup, per device using resource pool structure.
+Each such resource pool is limited up to 64 resources in given resource pool
+by rdma cgroup, which can be extended later if required.
+
+This resource pool object is linked to the cgroup css. Typically there
+are 0 to 4 resource pool instances per cgroup, per device in most use cases.
+But nothing limits to have it more. At present hundreds of RDMA devices per
+single cgroup may not be handled optimally, however there is no
+known use case or requirement for such configuration either.
+
+Since RDMA resources can be allocated from any process and can be freed by any
+of the child processes which shares the address space, rdma resources are
+always owned by the creator cgroup css. This allows process migration from one
+to other cgroup without major complexity of transferring resource ownership;
+because such ownership is not really present due to shared nature of
+rdma resources. Linking resources around css also ensures that cgroups can be
+deleted after processes migrated. This allow progress migration as well with
+active resources, even though that is not a primary use case.
+
+Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
+the caller. Same rdma cgroup should be passed while uncharging the resource.
+This also allows process migrated with active RDMA resource to charge
+to new owner cgroup for new resource. It also allows to uncharge resource of
+a process from previously charged cgroup which is migrated to new cgroup,
+even though that is not a primary use case.
+
+Resource pool object is created in following situations.
+(a) User sets the limit and no previous resource pool exist for the device
+of interest for the cgroup.
+(b) No resource limits were configured, but IB/RDMA stack tries to
+charge the resource. So that it correctly uncharge them when applications are
+running without limits and later on when limits are enforced during uncharging,
+otherwise usage count will drop to negative.
+
+Resource pool is destroyed if all the resource limits are set to max and
+it is the last resource getting deallocated.
+
+User should set all the limit to max value if it intents to remove/unconfigure
+the resource pool for a particular device.
+
+IB stack honors limits enforced by the rdma controller. When application
+query about maximum resource limits of IB device, it returns minimum of
+what is configured by user for a given cgroup and what is supported by
+IB device.
+
+Following resources can be accounted by rdma controller.
+
+  ==========    =============================
+  hca_handle	Maximum number of HCA Handles
+  hca_object 	Maximum number of HCA Objects
+  ==========    =============================
+
+2. Usage Examples
+=================
+
+(a) Configure resource limit::
+
+	echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
+	echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
+
+(b) Query resource limit::
+
+	cat /sys/fs/cgroup/rdma/2/rdma.max
+	#Output:
+	mlx4_0 hca_handle=2 hca_object=2000
+	ocrdma1 hca_handle=3 hca_object=max
+
+(c) Query current usage::
+
+	cat /sys/fs/cgroup/rdma/2/rdma.current
+	#Output:
+	mlx4_0 hca_handle=1 hca_object=20
+	ocrdma1 hca_handle=1 hca_object=23
+
+(d) Delete resource limit::
+
+	echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 88e746074252..3b29005aa981 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -9,7 +9,7 @@ This is the authoritative documentation on the design, interface and
 conventions of cgroup v2.  It describes all userland-visible aspects
 of cgroup including core and specific controller behaviors.  All
 future changes must be reflected in this document.  Documentation for
-v1 is available under Documentation/cgroup-v1/.
+v1 is available under Documentation/admin-guide/cgroup-v1/.
 
 .. CONTENTS
 
@@ -177,6 +177,15 @@ cgroup v2 currently supports the following mount options.
 	ignored on non-init namespace mounts.  Please refer to the
 	Delegation section for details.
 
+  memory_localevents
+
+        Only populate memory.events with data for the current cgroup,
+        and not any subtrees. This is legacy behaviour, the default
+        behaviour without this option is to include subtree counts.
+        This option is system wide and can only be set on mount or
+        modified through remount from the init namespace. The mount
+        option is ignored on non-init namespace mounts.
+
 
 Organizing Processes and Threads
 --------------------------------
@@ -696,6 +705,12 @@ Conventions
   informational files on the root cgroup which end up showing global
   information available elsewhere shouldn't exist.
 
+- The default time unit is microseconds.  If a different unit is ever
+  used, an explicit unit suffix must be present.
+
+- A parts-per quantity should use a percentage decimal with at least
+  two digit fractional part - e.g. 13.40.
+
 - If a controller implements weight based resource distribution, its
   interface file should be named "weight" and have the range [1,
   10000] with 100 as the default.  The values are chosen to allow
@@ -999,7 +1014,7 @@ All time durations are in microseconds.
 	A read-only nested-key file which exists on non-root cgroups.
 
 	Shows pressure stall information for CPU. See
-	Documentation/accounting/psi.txt for details.
+	Documentation/accounting/psi.rst for details.
 
 
 Memory
@@ -1131,6 +1146,11 @@ PAGE_SIZE multiple when read back.
 	otherwise, a value change in this file generates a file
 	modified event.
 
+	Note that all fields in this file are hierarchical and the
+	file modified event can be generated due to an event down the
+	hierarchy. For for the local events at the cgroup level see
+	memory.events.local.
+
 	  low
 		The number of times the cgroup is reclaimed due to
 		high memory pressure even though its usage is under
@@ -1170,6 +1190,11 @@ PAGE_SIZE multiple when read back.
 		The number of processes belonging to this cgroup
 		killed by any kind of OOM killer.
 
+  memory.events.local
+	Similar to memory.events but the fields in the file are local
+	to the cgroup i.e. not hierarchical. The file modified event
+	generated on this file reflects only the local events.
+
   memory.stat
 	A read-only flat-keyed file which exists on non-root cgroups.
 
@@ -1330,7 +1355,7 @@ PAGE_SIZE multiple when read back.
 	A read-only nested-key file which exists on non-root cgroups.
 
 	Shows pressure stall information for memory. See
-	Documentation/accounting/psi.txt for details.
+	Documentation/accounting/psi.rst for details.
 
 
 Usage Guidelines
@@ -1473,7 +1498,7 @@ IO Interface Files
 	A read-only nested-key file which exists on non-root cgroups.
 
 	Shows pressure stall information for IO. See
-	Documentation/accounting/psi.txt for details.
+	Documentation/accounting/psi.rst for details.
 
 
 Writeback
@@ -2099,7 +2124,7 @@ following two functions.
 	a queue (device) has been associated with the bio and
 	before submission.
 
-  wbc_account_io(@wbc, @page, @bytes)
+  wbc_account_cgroup_owner(@wbc, @page, @bytes)
 	Should be called for each data segment being written out.
 	While this function doesn't care exactly when it's called
 	during the writeback session, it's the easiest and most
diff --git a/Documentation/clearing-warn-once.txt b/Documentation/admin-guide/clearing-warn-once.rst
index 211fd926cf00..211fd926cf00 100644
--- a/Documentation/clearing-warn-once.txt
+++ b/Documentation/admin-guide/clearing-warn-once.rst
diff --git a/Documentation/admin-guide/conf.py b/Documentation/admin-guide/conf.py
deleted file mode 100644
index 86f738953799..000000000000
--- a/Documentation/admin-guide/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = 'Linux Kernel User Documentation'
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'linux-user.tex', 'Linux Kernel User Documentation',
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/cpu-load.txt b/Documentation/admin-guide/cpu-load.rst
index 2d01ce43d2a2..2d01ce43d2a2 100644
--- a/Documentation/cpu-load.txt
+++ b/Documentation/admin-guide/cpu-load.rst
diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst
new file mode 100644
index 000000000000..b90dafcc8237
--- /dev/null
+++ b/Documentation/admin-guide/cputopology.rst
@@ -0,0 +1,177 @@
+===========================================
+How CPU topology info is exported via sysfs
+===========================================
+
+Export CPU topology info via sysfs. Items (attributes) are similar
+to /proc/cpuinfo output of some architectures.  They reside in
+/sys/devices/system/cpu/cpuX/topology/:
+
+physical_package_id:
+
+	physical package id of cpuX. Typically corresponds to a physical
+	socket number, but the actual value is architecture and platform
+	dependent.
+
+die_id:
+
+	the CPU die ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).  The actual value is
+	architecture and platform dependent.
+
+core_id:
+
+	the CPU core ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).  The actual value is
+	architecture and platform dependent.
+
+book_id:
+
+	the book ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).	The actual value is
+	architecture and platform dependent.
+
+drawer_id:
+
+	the drawer ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).	The actual value is
+	architecture and platform dependent.
+
+core_cpus:
+
+	internal kernel map of CPUs within the same core.
+	(deprecated name: "thread_siblings")
+
+core_cpus_list:
+
+	human-readable list of CPUs within the same core.
+	(deprecated name: "thread_siblings_list");
+
+package_cpus:
+
+	internal kernel map of the CPUs sharing the same physical_package_id.
+	(deprecated name: "core_siblings")
+
+package_cpus_list:
+
+	human-readable list of CPUs sharing the same physical_package_id.
+	(deprecated name: "core_siblings_list")
+
+die_cpus:
+
+	internal kernel map of CPUs within the same die.
+
+die_cpus_list:
+
+	human-readable list of CPUs within the same die.
+
+book_siblings:
+
+	internal kernel map of cpuX's hardware threads within the same
+	book_id.
+
+book_siblings_list:
+
+	human-readable list of cpuX's hardware threads within the same
+	book_id.
+
+drawer_siblings:
+
+	internal kernel map of cpuX's hardware threads within the same
+	drawer_id.
+
+drawer_siblings_list:
+
+	human-readable list of cpuX's hardware threads within the same
+	drawer_id.
+
+Architecture-neutral, drivers/base/topology.c, exports these attributes.
+However, the book and drawer related sysfs files will only be created if
+CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively.
+
+CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
+where they reflect the cpu and cache hierarchy.
+
+For an architecture to support this feature, it must define some of
+these macros in include/asm-XXX/topology.h::
+
+	#define topology_physical_package_id(cpu)
+	#define topology_die_id(cpu)
+	#define topology_core_id(cpu)
+	#define topology_book_id(cpu)
+	#define topology_drawer_id(cpu)
+	#define topology_sibling_cpumask(cpu)
+	#define topology_core_cpumask(cpu)
+	#define topology_die_cpumask(cpu)
+	#define topology_book_cpumask(cpu)
+	#define topology_drawer_cpumask(cpu)
+
+The type of ``**_id macros`` is int.
+The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter
+correspond with appropriate ``**_siblings`` sysfs attributes (except for
+topology_sibling_cpumask() which corresponds with thread_siblings).
+
+To be consistent on all architectures, include/linux/topology.h
+provides default definitions for any of the above macros that are
+not defined by include/asm-XXX/topology.h:
+
+1) topology_physical_package_id: -1
+2) topology_die_id: -1
+3) topology_core_id: 0
+4) topology_sibling_cpumask: just the given CPU
+5) topology_core_cpumask: just the given CPU
+6) topology_die_cpumask: just the given CPU
+
+For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
+default definitions for topology_book_id() and topology_book_cpumask().
+For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are
+no default definitions for topology_drawer_id() and topology_drawer_cpumask().
+
+Additionally, CPU topology information is provided under
+/sys/devices/system/cpu and includes these files.  The internal
+source for the output is in brackets ("[]").
+
+    =========== ==========================================================
+    kernel_max: the maximum CPU index allowed by the kernel configuration.
+		[NR_CPUS-1]
+
+    offline:	CPUs that are not online because they have been
+		HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit
+		of CPUs allowed by the kernel configuration (kernel_max
+		above). [~cpu_online_mask + cpus >= NR_CPUS]
+
+    online:	CPUs that are online and being scheduled [cpu_online_mask]
+
+    possible:	CPUs that have been allocated resources and can be
+		brought online if they are present. [cpu_possible_mask]
+
+    present:	CPUs that have been identified as being present in the
+		system. [cpu_present_mask]
+    =========== ==========================================================
+
+The format for the above output is compatible with cpulist_parse()
+[see <linux/cpumask.h>].  Some examples follow.
+
+In this example, there are 64 CPUs in the system but cpus 32-63 exceed
+the kernel max which is limited to 0..31 by the NR_CPUS config option
+being 32.  Note also that CPUs 2 and 4-31 are not online but could be
+brought online as they are both present and possible::
+
+     kernel_max: 31
+        offline: 2,4-31,32-63
+         online: 0-1,3
+       possible: 0-31
+        present: 0-31
+
+In this example, the NR_CPUS config option is 128, but the kernel was
+started with possible_cpus=144.  There are 4 CPUs in the system and cpu2
+was manually taken offline (and is the only CPU that can be brought
+online.)::
+
+     kernel_max: 127
+        offline: 2,4-127,128-143
+         online: 0-1,3
+       possible: 0-127
+        present: 0-3
+
+See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter
+as well as more information on the various cpumasks.
diff --git a/Documentation/admin-guide/device-mapper/cache-policies.rst b/Documentation/admin-guide/device-mapper/cache-policies.rst
new file mode 100644
index 000000000000..b17fe352fc41
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/cache-policies.rst
@@ -0,0 +1,131 @@
+=============================
+Guidance for writing policies
+=============================
+
+Try to keep transactionality out of it.  The core is careful to
+avoid asking about anything that is migrating.  This is a pain, but
+makes it easier to write the policies.
+
+Mappings are loaded into the policy at construction time.
+
+Every bio that is mapped by the target is referred to the policy.
+The policy can return a simple HIT or MISS or issue a migration.
+
+Currently there's no way for the policy to issue background work,
+e.g. to start writing back dirty blocks that are going to be evicted
+soon.
+
+Because we map bios, rather than requests it's easy for the policy
+to get fooled by many small bios.  For this reason the core target
+issues periodic ticks to the policy.  It's suggested that the policy
+doesn't update states (eg, hit counts) for a block more than once
+for each tick.  The core ticks by watching bios complete, and so
+trying to see when the io scheduler has let the ios run.
+
+
+Overview of supplied cache replacement policies
+===============================================
+
+multiqueue (mq)
+---------------
+
+This policy is now an alias for smq (see below).
+
+The following tunables are accepted, but have no effect::
+
+	'sequential_threshold <#nr_sequential_ios>'
+	'random_threshold <#nr_random_ios>'
+	'read_promote_adjustment <value>'
+	'write_promote_adjustment <value>'
+	'discard_promote_adjustment <value>'
+
+Stochastic multiqueue (smq)
+---------------------------
+
+This policy is the default.
+
+The stochastic multi-queue (smq) policy addresses some of the problems
+with the multiqueue (mq) policy.
+
+The smq policy (vs mq) offers the promise of less memory utilization,
+improved performance and increased adaptability in the face of changing
+workloads.  smq also does not have any cumbersome tuning knobs.
+
+Users may switch from "mq" to "smq" simply by appropriately reloading a
+DM table that is using the cache target.  Doing so will cause all of the
+mq policy's hints to be dropped.  Also, performance of the cache may
+degrade slightly until smq recalculates the origin device's hotspots
+that should be cached.
+
+Memory usage
+^^^^^^^^^^^^
+
+The mq policy used a lot of memory; 88 bytes per cache block on a 64
+bit machine.
+
+smq uses 28bit indexes to implement its data structures rather than
+pointers.  It avoids storing an explicit hit count for each block.  It
+has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of
+the entries (each hotspot block covers a larger area than a single
+cache block).
+
+All this means smq uses ~25bytes per cache block.  Still a lot of
+memory, but a substantial improvement nontheless.
+
+Level balancing
+^^^^^^^^^^^^^^^
+
+mq placed entries in different levels of the multiqueue structures
+based on their hit count (~ln(hit count)).  This meant the bottom
+levels generally had the most entries, and the top ones had very
+few.  Having unbalanced levels like this reduced the efficacy of the
+multiqueue.
+
+smq does not maintain a hit count, instead it swaps hit entries with
+the least recently used entry from the level above.  The overall
+ordering being a side effect of this stochastic process.  With this
+scheme we can decide how many entries occupy each multiqueue level,
+resulting in better promotion/demotion decisions.
+
+Adaptability:
+The mq policy maintained a hit count for each cache block.  For a
+different block to get promoted to the cache its hit count has to
+exceed the lowest currently in the cache.  This meant it could take a
+long time for the cache to adapt between varying IO patterns.
+
+smq doesn't maintain hit counts, so a lot of this problem just goes
+away.  In addition it tracks performance of the hotspot queue, which
+is used to decide which blocks to promote.  If the hotspot queue is
+performing badly then it starts moving entries more quickly between
+levels.  This lets it adapt to new IO patterns very quickly.
+
+Performance
+^^^^^^^^^^^
+
+Testing smq shows substantially better performance than mq.
+
+cleaner
+-------
+
+The cleaner writes back all dirty blocks in a cache to decommission it.
+
+Examples
+========
+
+The syntax for a table is::
+
+	cache <metadata dev> <cache dev> <origin dev> <block size>
+	<#feature_args> [<feature arg>]*
+	<policy> <#policy_args> [<policy arg>]*
+
+The syntax to send a message using the dmsetup command is::
+
+	dmsetup message <mapped device> 0 sequential_threshold 1024
+	dmsetup message <mapped device> 0 random_threshold 8
+
+Using dmsetup::
+
+	dmsetup create blah --table "0 268435456 cache /dev/sdb /dev/sdc \
+	    /dev/sdd 512 0 mq 4 sequential_threshold 1024 random_threshold 8"
+	creates a 128GB large mapped device named 'blah' with the
+	sequential threshold set to 1024 and the random_threshold set to 8.
diff --git a/Documentation/admin-guide/device-mapper/cache.rst b/Documentation/admin-guide/device-mapper/cache.rst
new file mode 100644
index 000000000000..f15e5254d05b
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/cache.rst
@@ -0,0 +1,337 @@
+=====
+Cache
+=====
+
+Introduction
+============
+
+dm-cache is a device mapper target written by Joe Thornber, Heinz
+Mauelshagen, and Mike Snitzer.
+
+It aims to improve performance of a block device (eg, a spindle) by
+dynamically migrating some of its data to a faster, smaller device
+(eg, an SSD).
+
+This device-mapper solution allows us to insert this caching at
+different levels of the dm stack, for instance above the data device for
+a thin-provisioning pool.  Caching solutions that are integrated more
+closely with the virtual memory system should give better performance.
+
+The target reuses the metadata library used in the thin-provisioning
+library.
+
+The decision as to what data to migrate and when is left to a plug-in
+policy module.  Several of these have been written as we experiment,
+and we hope other people will contribute others for specific io
+scenarios (eg. a vm image server).
+
+Glossary
+========
+
+  Migration
+	       Movement of the primary copy of a logical block from one
+	       device to the other.
+  Promotion
+	       Migration from slow device to fast device.
+  Demotion
+	       Migration from fast device to slow device.
+
+The origin device always contains a copy of the logical block, which
+may be out of date or kept in sync with the copy on the cache device
+(depending on policy).
+
+Design
+======
+
+Sub-devices
+-----------
+
+The target is constructed by passing three devices to it (along with
+other parameters detailed later):
+
+1. An origin device - the big, slow one.
+
+2. A cache device - the small, fast one.
+
+3. A small metadata device - records which blocks are in the cache,
+   which are dirty, and extra hints for use by the policy object.
+   This information could be put on the cache device, but having it
+   separate allows the volume manager to configure it differently,
+   e.g. as a mirror for extra robustness.  This metadata device may only
+   be used by a single cache device.
+
+Fixed block size
+----------------
+
+The origin is divided up into blocks of a fixed size.  This block size
+is configurable when you first create the cache.  Typically we've been
+using block sizes of 256KB - 1024KB.  The block size must be between 64
+sectors (32KB) and 2097152 sectors (1GB) and a multiple of 64 sectors (32KB).
+
+Having a fixed block size simplifies the target a lot.  But it is
+something of a compromise.  For instance, a small part of a block may be
+getting hit a lot, yet the whole block will be promoted to the cache.
+So large block sizes are bad because they waste cache space.  And small
+block sizes are bad because they increase the amount of metadata (both
+in core and on disk).
+
+Cache operating modes
+---------------------
+
+The cache has three operating modes: writeback, writethrough and
+passthrough.
+
+If writeback, the default, is selected then a write to a block that is
+cached will go only to the cache and the block will be marked dirty in
+the metadata.
+
+If writethrough is selected then a write to a cached block will not
+complete until it has hit both the origin and cache devices.  Clean
+blocks should remain clean.
+
+If passthrough is selected, useful when the cache contents are not known
+to be coherent with the origin device, then all reads are served from
+the origin device (all reads miss the cache) and all writes are
+forwarded to the origin device; additionally, write hits cause cache
+block invalidates.  To enable passthrough mode the cache must be clean.
+Passthrough mode allows a cache device to be activated without having to
+worry about coherency.  Coherency that exists is maintained, although
+the cache will gradually cool as writes take place.  If the coherency of
+the cache can later be verified, or established through use of the
+"invalidate_cblocks" message, the cache device can be transitioned to
+writethrough or writeback mode while still warm.  Otherwise, the cache
+contents can be discarded prior to transitioning to the desired
+operating mode.
+
+A simple cleaner policy is provided, which will clean (write back) all
+dirty blocks in a cache.  Useful for decommissioning a cache or when
+shrinking a cache.  Shrinking the cache's fast device requires all cache
+blocks, in the area of the cache being removed, to be clean.  If the
+area being removed from the cache still contains dirty blocks the resize
+will fail.  Care must be taken to never reduce the volume used for the
+cache's fast device until the cache is clean.  This is of particular
+importance if writeback mode is used.  Writethrough and passthrough
+modes already maintain a clean cache.  Future support to partially clean
+the cache, above a specified threshold, will allow for keeping the cache
+warm and in writeback mode during resize.
+
+Migration throttling
+--------------------
+
+Migrating data between the origin and cache device uses bandwidth.
+The user can set a throttle to prevent more than a certain amount of
+migration occurring at any one time.  Currently we're not taking any
+account of normal io traffic going to the devices.  More work needs
+doing here to avoid migrating during those peak io moments.
+
+For the time being, a message "migration_threshold <#sectors>"
+can be used to set the maximum number of sectors being migrated,
+the default being 2048 sectors (1MB).
+
+Updating on-disk metadata
+-------------------------
+
+On-disk metadata is committed every time a FLUSH or FUA bio is written.
+If no such requests are made then commits will occur every second.  This
+means the cache behaves like a physical disk that has a volatile write
+cache.  If power is lost you may lose some recent writes.  The metadata
+should always be consistent in spite of any crash.
+
+The 'dirty' state for a cache block changes far too frequently for us
+to keep updating it on the fly.  So we treat it as a hint.  In normal
+operation it will be written when the dm device is suspended.  If the
+system crashes all cache blocks will be assumed dirty when restarted.
+
+Per-block policy hints
+----------------------
+
+Policy plug-ins can store a chunk of data per cache block.  It's up to
+the policy how big this chunk is, but it should be kept small.  Like the
+dirty flags this data is lost if there's a crash so a safe fallback
+value should always be possible.
+
+Policy hints affect performance, not correctness.
+
+Policy messaging
+----------------
+
+Policies will have different tunables, specific to each one, so we
+need a generic way of getting and setting these.  Device-mapper
+messages are used.  Refer to cache-policies.txt.
+
+Discard bitset resolution
+-------------------------
+
+We can avoid copying data during migration if we know the block has
+been discarded.  A prime example of this is when mkfs discards the
+whole block device.  We store a bitset tracking the discard state of
+blocks.  However, we allow this bitset to have a different block size
+from the cache blocks.  This is because we need to track the discard
+state for all of the origin device (compare with the dirty bitset
+which is just for the smaller cache device).
+
+Target interface
+================
+
+Constructor
+-----------
+
+  ::
+
+   cache <metadata dev> <cache dev> <origin dev> <block size>
+         <#feature args> [<feature arg>]*
+         <policy> <#policy args> [policy args]*
+
+ ================ =======================================================
+ metadata dev     fast device holding the persistent metadata
+ cache dev	  fast device holding cached data blocks
+ origin dev	  slow device holding original data blocks
+ block size       cache unit size in sectors
+
+ #feature args    number of feature arguments passed
+ feature args     writethrough or passthrough (The default is writeback.)
+
+ policy           the replacement policy to use
+ #policy args     an even number of arguments corresponding to
+                  key/value pairs passed to the policy
+ policy args      key/value pairs passed to the policy
+		  E.g. 'sequential_threshold 1024'
+		  See cache-policies.txt for details.
+ ================ =======================================================
+
+Optional feature arguments are:
+
+
+   ==================== ========================================================
+   writethrough		write through caching that prohibits cache block
+			content from being different from origin block content.
+			Without this argument, the default behaviour is to write
+			back cache block contents later for performance reasons,
+			so they may differ from the corresponding origin blocks.
+
+   passthrough		a degraded mode useful for various cache coherency
+			situations (e.g., rolling back snapshots of
+			underlying storage).	 Reads and writes always go to
+			the origin.	If a write goes to a cached origin
+			block, then the cache block is invalidated.
+			To enable passthrough mode the cache must be clean.
+
+   metadata2		use version 2 of the metadata.  This stores the dirty
+			bits in a separate btree, which improves speed of
+			shutting down the cache.
+
+   no_discard_passdown	disable passing down discards from the cache
+			to the origin's data device.
+   ==================== ========================================================
+
+A policy called 'default' is always registered.  This is an alias for
+the policy we currently think is giving best all round performance.
+
+As the default policy could vary between kernels, if you are relying on
+the characteristics of a specific policy, always request it by name.
+
+Status
+------
+
+::
+
+  <metadata block size> <#used metadata blocks>/<#total metadata blocks>
+  <cache block size> <#used cache blocks>/<#total cache blocks>
+  <#read hits> <#read misses> <#write hits> <#write misses>
+  <#demotions> <#promotions> <#dirty> <#features> <features>*
+  <#core args> <core args>* <policy name> <#policy args> <policy args>*
+  <cache metadata mode>
+
+
+========================= =====================================================
+metadata block size	  Fixed block size for each metadata block in
+			  sectors
+#used metadata blocks	  Number of metadata blocks used
+#total metadata blocks	  Total number of metadata blocks
+cache block size	  Configurable block size for the cache device
+			  in sectors
+#used cache blocks	  Number of blocks resident in the cache
+#total cache blocks	  Total number of cache blocks
+#read hits		  Number of times a READ bio has been mapped
+			  to the cache
+#read misses		  Number of times a READ bio has been mapped
+			  to the origin
+#write hits		  Number of times a WRITE bio has been mapped
+			  to the cache
+#write misses		  Number of times a WRITE bio has been
+			  mapped to the origin
+#demotions		  Number of times a block has been removed
+			  from the cache
+#promotions		  Number of times a block has been moved to
+			  the cache
+#dirty			  Number of blocks in the cache that differ
+			  from the origin
+#feature args		  Number of feature args to follow
+feature args		  'writethrough' (optional)
+#core args		  Number of core arguments (must be even)
+core args		  Key/value pairs for tuning the core
+			  e.g. migration_threshold
+policy name		  Name of the policy
+#policy args		  Number of policy arguments to follow (must be even)
+policy args		  Key/value pairs e.g. sequential_threshold
+cache metadata mode       ro if read-only, rw if read-write
+
+			  In serious cases where even a read-only mode is
+			  deemed unsafe no further I/O will be permitted and
+			  the status will just contain the string 'Fail'.
+			  The userspace recovery tools should then be used.
+needs_check		  'needs_check' if set, '-' if not set
+			  A metadata operation has failed, resulting in the
+			  needs_check flag being set in the metadata's
+			  superblock.  The metadata device must be
+			  deactivated and checked/repaired before the
+			  cache can be made fully operational again.
+			  '-' indicates	needs_check is not set.
+========================= =====================================================
+
+Messages
+--------
+
+Policies will have different tunables, specific to each one, so we
+need a generic way of getting and setting these.  Device-mapper
+messages are used.  (A sysfs interface would also be possible.)
+
+The message format is::
+
+   <key> <value>
+
+E.g.::
+
+   dmsetup message my_cache 0 sequential_threshold 1024
+
+
+Invalidation is removing an entry from the cache without writing it
+back.  Cache blocks can be invalidated via the invalidate_cblocks
+message, which takes an arbitrary number of cblock ranges.  Each cblock
+range's end value is "one past the end", meaning 5-10 expresses a range
+of values from 5 to 9.  Each cblock must be expressed as a decimal
+value, in the future a variant message that takes cblock ranges
+expressed in hexadecimal may be needed to better support efficient
+invalidation of larger caches.  The cache must be in passthrough mode
+when invalidate_cblocks is used::
+
+   invalidate_cblocks [<cblock>|<cblock begin>-<cblock end>]*
+
+E.g.::
+
+   dmsetup message my_cache 0 invalidate_cblocks 2345 3456-4567 5678-6789
+
+Examples
+========
+
+The test suite can be found here:
+
+https://github.com/jthornber/device-mapper-test-suite
+
+::
+
+  dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \
+	  /dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0'
+  dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \
+	  /dev/mapper/ssd /dev/mapper/origin 1024 1 writeback \
+	  mq 4 sequential_threshold 1024 random_threshold 8'
diff --git a/Documentation/admin-guide/device-mapper/delay.rst b/Documentation/admin-guide/device-mapper/delay.rst
new file mode 100644
index 000000000000..917ba8c33359
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/delay.rst
@@ -0,0 +1,31 @@
+========
+dm-delay
+========
+
+Device-Mapper's "delay" target delays reads and/or writes
+and maps them to different devices.
+
+Parameters::
+
+    <device> <offset> <delay> [<write_device> <write_offset> <write_delay>
+			       [<flush_device> <flush_offset> <flush_delay>]]
+
+With separate write parameters, the first set is only used for reads.
+Offsets are specified in sectors.
+Delays are specified in milliseconds.
+
+Example scripts
+===============
+
+::
+
+	#!/bin/sh
+	# Create device delaying rw operation for 500ms
+	echo "0 `blockdev --getsz $1` delay $1 0 500" | dmsetup create delayed
+
+::
+
+	#!/bin/sh
+	# Create device delaying only write operation for 500ms and
+	# splitting reads and writes to different devices $1 $2
+	echo "0 `blockdev --getsz $1` delay $1 0 0 $2 0 500" | dmsetup create delayed
diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst
new file mode 100644
index 000000000000..8f4a3f889d43
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst
@@ -0,0 +1,173 @@
+========
+dm-crypt
+========
+
+Device-Mapper's "crypt" target provides transparent encryption of block devices
+using the kernel crypto API.
+
+For a more detailed description of supported parameters see:
+https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt
+
+Parameters::
+
+	      <cipher> <key> <iv_offset> <device path> \
+	      <offset> [<#opt_params> <opt_params>]
+
+<cipher>
+    Encryption cipher, encryption mode and Initial Vector (IV) generator.
+
+    The cipher specifications format is::
+
+       cipher[:keycount]-chainmode-ivmode[:ivopts]
+
+    Examples::
+
+       aes-cbc-essiv:sha256
+       aes-xts-plain64
+       serpent-xts-plain64
+
+    Cipher format also supports direct specification with kernel crypt API
+    format (selected by capi: prefix). The IV specification is the same
+    as for the first format type.
+    This format is mainly used for specification of authenticated modes.
+
+    The crypto API cipher specifications format is::
+
+        capi:cipher_api_spec-ivmode[:ivopts]
+
+    Examples::
+
+        capi:cbc(aes)-essiv:sha256
+        capi:xts(aes)-plain64
+
+    Examples of authenticated modes::
+
+        capi:gcm(aes)-random
+        capi:authenc(hmac(sha256),xts(aes))-random
+        capi:rfc7539(chacha20,poly1305)-random
+
+    The /proc/crypto contains a list of curently loaded crypto modes.
+
+<key>
+    Key used for encryption. It is encoded either as a hexadecimal number
+    or it can be passed as <key_string> prefixed with single colon
+    character (':') for keys residing in kernel keyring service.
+    You can only use key sizes that are valid for the selected cipher
+    in combination with the selected iv mode.
+    Note that for some iv modes the key string can contain additional
+    keys (for example IV seed) so the key contains more parts concatenated
+    into a single string.
+
+<key_string>
+    The kernel keyring key is identified by string in following format:
+    <key_size>:<key_type>:<key_description>.
+
+<key_size>
+    The encryption key size in bytes. The kernel key payload size must match
+    the value passed in <key_size>.
+
+<key_type>
+    Either 'logon' or 'user' kernel key type.
+
+<key_description>
+    The kernel keyring key description crypt target should look for
+    when loading key of <key_type>.
+
+<keycount>
+    Multi-key compatibility mode. You can define <keycount> keys and
+    then sectors are encrypted according to their offsets (sector 0 uses key0;
+    sector 1 uses key1 etc.).  <keycount> must be a power of two.
+
+<iv_offset>
+    The IV offset is a sector count that is added to the sector number
+    before creating the IV.
+
+<device path>
+    This is the device that is going to be used as backend and contains the
+    encrypted data.  You can specify it as a path like /dev/xxx or a device
+    number <major>:<minor>.
+
+<offset>
+    Starting sector within the device where the encrypted data begins.
+
+<#opt_params>
+    Number of optional parameters. If there are no optional parameters,
+    the optional paramaters section can be skipped or #opt_params can be zero.
+    Otherwise #opt_params is the number of following arguments.
+
+    Example of optional parameters section:
+        3 allow_discards same_cpu_crypt submit_from_crypt_cpus
+
+allow_discards
+    Block discard requests (a.k.a. TRIM) are passed through the crypt device.
+    The default is to ignore discard requests.
+
+    WARNING: Assess the specific security risks carefully before enabling this
+    option.  For example, allowing discards on encrypted devices may lead to
+    the leak of information about the ciphertext device (filesystem type,
+    used space etc.) if the discarded blocks can be located easily on the
+    device later.
+
+same_cpu_crypt
+    Perform encryption using the same cpu that IO was submitted on.
+    The default is to use an unbound workqueue so that encryption work
+    is automatically balanced between available CPUs.
+
+submit_from_crypt_cpus
+    Disable offloading writes to a separate thread after encryption.
+    There are some situations where offloading write bios from the
+    encryption threads to a single thread degrades performance
+    significantly.  The default is to offload write bios to the same
+    thread because it benefits CFQ to have writes submitted using the
+    same context.
+
+integrity:<bytes>:<type>
+    The device requires additional <bytes> metadata per-sector stored
+    in per-bio integrity structure. This metadata must by provided
+    by underlying dm-integrity target.
+
+    The <type> can be "none" if metadata is used only for persistent IV.
+
+    For Authenticated Encryption with Additional Data (AEAD)
+    the <type> is "aead". An AEAD mode additionally calculates and verifies
+    integrity for the encrypted device. The additional space is then
+    used for storing authentication tag (and persistent IV if needed).
+
+sector_size:<bytes>
+    Use <bytes> as the encryption unit instead of 512 bytes sectors.
+    This option can be in range 512 - 4096 bytes and must be power of two.
+    Virtual device will announce this size as a minimal IO and logical sector.
+
+iv_large_sectors
+   IV generators will use sector number counted in <sector_size> units
+   instead of default 512 bytes sectors.
+
+   For example, if <sector_size> is 4096 bytes, plain64 IV for the second
+   sector will be 8 (without flag) and 1 if iv_large_sectors is present.
+   The <iv_offset> must be multiple of <sector_size> (in 512 bytes units)
+   if this flag is specified.
+
+Example scripts
+===============
+LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
+encryption with dm-crypt using the 'cryptsetup' utility, see
+https://gitlab.com/cryptsetup/cryptsetup
+
+::
+
+	#!/bin/sh
+	# Create a crypt device using dmsetup
+	dmsetup create crypt1 --table "0 `blockdev --getsz $1` crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 $1 0"
+
+::
+
+	#!/bin/sh
+	# Create a crypt device using dmsetup when encryption key is stored in keyring service
+	dmsetup create crypt2 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 :32:logon:my_prefix:my_key 0 $1 0"
+
+::
+
+	#!/bin/sh
+	# Create a crypt device using cryptsetup and LUKS header with default cipher
+	cryptsetup luksFormat $1
+	cryptsetup luksOpen $1 crypt1
diff --git a/Documentation/device-mapper/dm-dust.txt b/Documentation/admin-guide/device-mapper/dm-dust.txt
index 954d402a1f6a..954d402a1f6a 100644
--- a/Documentation/device-mapper/dm-dust.txt
+++ b/Documentation/admin-guide/device-mapper/dm-dust.txt
diff --git a/Documentation/admin-guide/device-mapper/dm-flakey.rst b/Documentation/admin-guide/device-mapper/dm-flakey.rst
new file mode 100644
index 000000000000..86138735879d
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-flakey.rst
@@ -0,0 +1,74 @@
+=========
+dm-flakey
+=========
+
+This target is the same as the linear target except that it exhibits
+unreliable behaviour periodically.  It's been found useful in simulating
+failing devices for testing purposes.
+
+Starting from the time the table is loaded, the device is available for
+<up interval> seconds, then exhibits unreliable behaviour for <down
+interval> seconds, and then this cycle repeats.
+
+Also, consider using this in combination with the dm-delay target too,
+which can delay reads and writes and/or send them to different
+underlying devices.
+
+Table parameters
+----------------
+
+::
+
+  <dev path> <offset> <up interval> <down interval> \
+    [<num_features> [<feature arguments>]]
+
+Mandatory parameters:
+
+    <dev path>:
+        Full pathname to the underlying block-device, or a
+        "major:minor" device-number.
+    <offset>:
+        Starting sector within the device.
+    <up interval>:
+        Number of seconds device is available.
+    <down interval>:
+        Number of seconds device returns errors.
+
+Optional feature parameters:
+
+  If no feature parameters are present, during the periods of
+  unreliability, all I/O returns errors.
+
+  drop_writes:
+	All write I/O is silently ignored.
+	Read I/O is handled correctly.
+
+  error_writes:
+	All write I/O is failed with an error signalled.
+	Read I/O is handled correctly.
+
+  corrupt_bio_byte <Nth_byte> <direction> <value> <flags>:
+	During <down interval>, replace <Nth_byte> of the data of
+	each matching bio with <value>.
+
+    <Nth_byte>:
+	The offset of the byte to replace.
+	Counting starts at 1, to replace the first byte.
+    <direction>:
+	Either 'r' to corrupt reads or 'w' to corrupt writes.
+	'w' is incompatible with drop_writes.
+    <value>:
+	The value (from 0-255) to write.
+    <flags>:
+	Perform the replacement only if bio->bi_opf has all the
+	selected flags set.
+
+Examples:
+
+Replaces the 32nd byte of READ bios with the value 1::
+
+  corrupt_bio_byte 32 r 1 0
+
+Replaces the 224th byte of REQ_META (=32) bios with the value 0::
+
+  corrupt_bio_byte 224 w 0 32
diff --git a/Documentation/admin-guide/device-mapper/dm-init.rst b/Documentation/admin-guide/device-mapper/dm-init.rst
new file mode 100644
index 000000000000..e5242ff17e9b
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-init.rst
@@ -0,0 +1,125 @@
+================================
+Early creation of mapped devices
+================================
+
+It is possible to configure a device-mapper device to act as the root device for
+your system in two ways.
+
+The first is to build an initial ramdisk which boots to a minimal userspace
+which configures the device, then pivot_root(8) in to it.
+
+The second is to create one or more device-mappers using the module parameter
+"dm-mod.create=" through the kernel boot command line argument.
+
+The format is specified as a string of data separated by commas and optionally
+semi-colons, where:
+
+ - a comma is used to separate fields like name, uuid, flags and table
+   (specifies one device)
+ - a semi-colon is used to separate devices.
+
+So the format will look like this::
+
+ dm-mod.create=<name>,<uuid>,<minor>,<flags>,<table>[,<table>+][;<name>,<uuid>,<minor>,<flags>,<table>[,<table>+]+]
+
+Where::
+
+	<name>		::= The device name.
+	<uuid>		::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | ""
+	<minor>		::= The device minor number | ""
+	<flags>		::= "ro" | "rw"
+	<table>		::= <start_sector> <num_sectors> <target_type> <target_args>
+	<target_type>	::= "verity" | "linear" | ... (see list below)
+
+The dm line should be equivalent to the one used by the dmsetup tool with the
+`--concise` argument.
+
+Target types
+============
+
+Not all target types are available as there are serious risks in allowing
+activation of certain DM targets without first using userspace tools to check
+the validity of associated metadata.
+
+======================= =======================================================
+`cache`			constrained, userspace should verify cache device
+`crypt`			allowed
+`delay`			allowed
+`era`			constrained, userspace should verify metadata device
+`flakey`		constrained, meant for test
+`linear`		allowed
+`log-writes`		constrained, userspace should verify metadata device
+`mirror`		constrained, userspace should verify main/mirror device
+`raid`			constrained, userspace should verify metadata device
+`snapshot`		constrained, userspace should verify src/dst device
+`snapshot-origin`	allowed
+`snapshot-merge`	constrained, userspace should verify src/dst device
+`striped`		allowed
+`switch`		constrained, userspace should verify dev path
+`thin`			constrained, requires dm target message from userspace
+`thin-pool`		constrained, requires dm target message from userspace
+`verity`		allowed
+`writecache`		constrained, userspace should verify cache device
+`zero`			constrained, not meant for rootfs
+======================= =======================================================
+
+If the target is not listed above, it is constrained by default (not tested).
+
+Examples
+========
+An example of booting to a linear array made up of user-mode linux block
+devices::
+
+  dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0
+
+This will boot to a rw dm-linear target of 8192 sectors split across two block
+devices identified by their major:minor numbers.  After boot, udev will rename
+this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned.
+
+An example of multiple device-mappers, with the dm-mod.create="..." contents
+is shown here split on multiple lines for readability::
+
+  dm-linear,,1,rw,
+    0 32768 linear 8:1 0,
+    32768 1024000 linear 8:2 0;
+  dm-verity,,3,ro,
+    0 1638400 verity 1 /dev/sdc1 /dev/sdc2 4096 4096 204800 1 sha256
+    ac87db56303c9c1da433d7209b5a6ef3e4779df141200cbd7c157dcb8dd89c42
+    5ebfe87f7df3235b80a117ebc4078e44f55045487ad4a96581d1adb564615b51
+
+Other examples (per target):
+
+"crypt"::
+
+  dm-crypt,,8,ro,
+    0 1048576 crypt aes-xts-plain64
+    babebabebabebabebabebabebabebabebabebabebabebabebabebabebabebabe 0
+    /dev/sda 0 1 allow_discards
+
+"delay"::
+
+  dm-delay,,4,ro,0 409600 delay /dev/sda1 0 500
+
+"linear"::
+
+  dm-linear,,,rw,
+    0 32768 linear /dev/sda1 0,
+    32768 1024000 linear /dev/sda2 0,
+    1056768 204800 linear /dev/sda3 0,
+    1261568 512000 linear /dev/sda4 0
+
+"snapshot-origin"::
+
+  dm-snap-orig,,4,ro,0 409600 snapshot-origin 8:2
+
+"striped"::
+
+  dm-striped,,4,ro,0 1638400 striped 4 4096
+  /dev/sda1 0 /dev/sda2 0 /dev/sda3 0 /dev/sda4 0
+
+"verity"::
+
+  dm-verity,,4,ro,
+    0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256
+    fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd
+    51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584
diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst
new file mode 100644
index 000000000000..a30aa91b5fbe
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst
@@ -0,0 +1,259 @@
+============
+dm-integrity
+============
+
+The dm-integrity target emulates a block device that has additional
+per-sector tags that can be used for storing integrity information.
+
+A general problem with storing integrity tags with every sector is that
+writing the sector and the integrity tag must be atomic - i.e. in case of
+crash, either both sector and integrity tag or none of them is written.
+
+To guarantee write atomicity, the dm-integrity target uses journal, it
+writes sector data and integrity tags into a journal, commits the journal
+and then copies the data and integrity tags to their respective location.
+
+The dm-integrity target can be used with the dm-crypt target - in this
+situation the dm-crypt target creates the integrity data and passes them
+to the dm-integrity target via bio_integrity_payload attached to the bio.
+In this mode, the dm-crypt and dm-integrity targets provide authenticated
+disk encryption - if the attacker modifies the encrypted device, an I/O
+error is returned instead of random data.
+
+The dm-integrity target can also be used as a standalone target, in this
+mode it calculates and verifies the integrity tag internally. In this
+mode, the dm-integrity target can be used to detect silent data
+corruption on the disk or in the I/O path.
+
+There's an alternate mode of operation where dm-integrity uses bitmap
+instead of a journal. If a bit in the bitmap is 1, the corresponding
+region's data and integrity tags are not synchronized - if the machine
+crashes, the unsynchronized regions will be recalculated. The bitmap mode
+is faster than the journal mode, because we don't have to write the data
+twice, but it is also less reliable, because if data corruption happens
+when the machine crashes, it may not be detected.
+
+When loading the target for the first time, the kernel driver will format
+the device. But it will only format the device if the superblock contains
+zeroes. If the superblock is neither valid nor zeroed, the dm-integrity
+target can't be loaded.
+
+To use the target for the first time:
+
+1. overwrite the superblock with zeroes
+2. load the dm-integrity target with one-sector size, the kernel driver
+   will format the device
+3. unload the dm-integrity target
+4. read the "provided_data_sectors" value from the superblock
+5. load the dm-integrity target with the the target size
+   "provided_data_sectors"
+6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target
+   with the size "provided_data_sectors"
+
+
+Target arguments:
+
+1. the underlying block device
+
+2. the number of reserved sector at the beginning of the device - the
+   dm-integrity won't read of write these sectors
+
+3. the size of the integrity tag (if "-" is used, the size is taken from
+   the internal-hash algorithm)
+
+4. mode:
+
+	D - direct writes (without journal)
+		in this mode, journaling is
+		not used and data sectors and integrity tags are written
+		separately. In case of crash, it is possible that the data
+		and integrity tag doesn't match.
+	J - journaled writes
+		data and integrity tags are written to the
+		journal and atomicity is guaranteed. In case of crash,
+		either both data and tag or none of them are written. The
+		journaled mode degrades write throughput twice because the
+		data have to be written twice.
+	B - bitmap mode - data and metadata are written without any
+		synchronization, the driver maintains a bitmap of dirty
+		regions where data and metadata don't match. This mode can
+		only be used with internal hash.
+	R - recovery mode - in this mode, journal is not replayed,
+		checksums are not checked and writes to the device are not
+		allowed. This mode is useful for data recovery if the
+		device cannot be activated in any of the other standard
+		modes.
+
+5. the number of additional arguments
+
+Additional arguments:
+
+journal_sectors:number
+	The size of journal, this argument is used only if formatting the
+	device. If the device is already formatted, the value from the
+	superblock is used.
+
+interleave_sectors:number
+	The number of interleaved sectors. This values is rounded down to
+	a power of two. If the device is already formatted, the value from
+	the superblock is used.
+
+meta_device:device
+	Don't interleave the data and metadata on on device. Use a
+	separate device for metadata.
+
+buffer_sectors:number
+	The number of sectors in one buffer. The value is rounded down to
+	a power of two.
+
+	The tag area is accessed using buffers, the buffer size is
+	configurable. The large buffer size means that the I/O size will
+	be larger, but there could be less I/Os issued.
+
+journal_watermark:number
+	The journal watermark in percents. When the size of the journal
+	exceeds this watermark, the thread that flushes the journal will
+	be started.
+
+commit_time:number
+	Commit time in milliseconds. When this time passes, the journal is
+	written. The journal is also written immediatelly if the FLUSH
+	request is received.
+
+internal_hash:algorithm(:key)	(the key is optional)
+	Use internal hash or crc.
+	When this argument is used, the dm-integrity target won't accept
+	integrity tags from the upper target, but it will automatically
+	generate and verify the integrity tags.
+
+	You can use a crc algorithm (such as crc32), then integrity target
+	will protect the data against accidental corruption.
+	You can also use a hmac algorithm (for example
+	"hmac(sha256):0123456789abcdef"), in this mode it will provide
+	cryptographic authentication of the data without encryption.
+
+	When this argument is not used, the integrity tags are accepted
+	from an upper layer target, such as dm-crypt. The upper layer
+	target should check the validity of the integrity tags.
+
+recalculate
+	Recalculate the integrity tags automatically. It is only valid
+	when using internal hash.
+
+journal_crypt:algorithm(:key)	(the key is optional)
+	Encrypt the journal using given algorithm to make sure that the
+	attacker can't read the journal. You can use a block cipher here
+	(such as "cbc(aes)") or a stream cipher (for example "chacha20",
+	"salsa20", "ctr(aes)" or "ecb(arc4)").
+
+	The journal contains history of last writes to the block device,
+	an attacker reading the journal could see the last sector nubmers
+	that were written. From the sector numbers, the attacker can infer
+	the size of files that were written. To protect against this
+	situation, you can encrypt the journal.
+
+journal_mac:algorithm(:key)	(the key is optional)
+	Protect sector numbers in the journal from accidental or malicious
+	modification. To protect against accidental modification, use a
+	crc algorithm, to protect against malicious modification, use a
+	hmac algorithm with a key.
+
+	This option is not needed when using internal-hash because in this
+	mode, the integrity of journal entries is checked when replaying
+	the journal. Thus, modified sector number would be detected at
+	this stage.
+
+block_size:number
+	The size of a data block in bytes.  The larger the block size the
+	less overhead there is for per-block integrity metadata.
+	Supported values are 512, 1024, 2048 and 4096 bytes.  If not
+	specified the default block size is 512 bytes.
+
+sectors_per_bit:number
+	In the bitmap mode, this parameter specifies the number of
+	512-byte sectors that corresponds to one bitmap bit.
+
+bitmap_flush_interval:number
+	The bitmap flush interval in milliseconds. The metadata buffers
+	are synchronized when this interval expires.
+
+
+The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
+be changed when reloading the target (load an inactive table and swap the
+tables with suspend and resume). The other arguments should not be changed
+when reloading the target because the layout of disk data depend on them
+and the reloaded target would be non-functional.
+
+
+The layout of the formatted block device:
+
+* reserved sectors
+    (they are not used by this target, they can be used for
+    storing LUKS metadata or for other purpose), the size of the reserved
+    area is specified in the target arguments
+
+* superblock (4kiB)
+	* magic string - identifies that the device was formatted
+	* version
+	* log2(interleave sectors)
+	* integrity tag size
+	* the number of journal sections
+	* provided data sectors - the number of sectors that this target
+	  provides (i.e. the size of the device minus the size of all
+	  metadata and padding). The user of this target should not send
+	  bios that access data beyond the "provided data sectors" limit.
+	* flags
+	    SB_FLAG_HAVE_JOURNAL_MAC
+		- a flag is set if journal_mac is used
+	    SB_FLAG_RECALCULATING
+		- recalculating is in progress
+	    SB_FLAG_DIRTY_BITMAP
+		- journal area contains the bitmap of dirty
+		  blocks
+	* log2(sectors per block)
+	* a position where recalculating finished
+* journal
+	The journal is divided into sections, each section contains:
+
+	* metadata area (4kiB), it contains journal entries
+
+	  - every journal entry contains:
+
+		* logical sector (specifies where the data and tag should
+		  be written)
+		* last 8 bytes of data
+		* integrity tag (the size is specified in the superblock)
+
+	  - every metadata sector ends with
+
+		* mac (8-bytes), all the macs in 8 metadata sectors form a
+		  64-byte value. It is used to store hmac of sector
+		  numbers in the journal section, to protect against a
+		  possibility that the attacker tampers with sector
+		  numbers in the journal.
+		* commit id
+
+	* data area (the size is variable; it depends on how many journal
+	  entries fit into the metadata area)
+
+	    - every sector in the data area contains:
+
+		* data (504 bytes of data, the last 8 bytes are stored in
+		  the journal entry)
+		* commit id
+
+	To test if the whole journal section was written correctly, every
+	512-byte sector of the journal ends with 8-byte commit id. If the
+	commit id matches on all sectors in a journal section, then it is
+	assumed that the section was written correctly. If the commit id
+	doesn't match, the section was written partially and it should not
+	be replayed.
+
+* one or more runs of interleaved tags and data.
+    Each run contains:
+
+	* tag area - it contains integrity tags. There is one tag for each
+	  sector in the data area
+	* data area - it contains data sectors. The number of data sectors
+	  in one run must be a power of two. log2 of this value is stored
+	  in the superblock.
diff --git a/Documentation/admin-guide/device-mapper/dm-io.rst b/Documentation/admin-guide/device-mapper/dm-io.rst
new file mode 100644
index 000000000000..d2492917a1f5
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-io.rst
@@ -0,0 +1,75 @@
+=====
+dm-io
+=====
+
+Dm-io provides synchronous and asynchronous I/O services. There are three
+types of I/O services available, and each type has a sync and an async
+version.
+
+The user must set up an io_region structure to describe the desired location
+of the I/O. Each io_region indicates a block-device along with the starting
+sector and size of the region::
+
+   struct io_region {
+      struct block_device *bdev;
+      sector_t sector;
+      sector_t count;
+   };
+
+Dm-io can read from one io_region or write to one or more io_regions. Writes
+to multiple regions are specified by an array of io_region structures.
+
+The first I/O service type takes a list of memory pages as the data buffer for
+the I/O, along with an offset into the first page::
+
+   struct page_list {
+      struct page_list *next;
+      struct page *page;
+   };
+
+   int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
+                  struct page_list *pl, unsigned int offset,
+                  unsigned long *error_bits);
+   int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
+                   struct page_list *pl, unsigned int offset,
+                   io_notify_fn fn, void *context);
+
+The second I/O service type takes an array of bio vectors as the data buffer
+for the I/O. This service can be handy if the caller has a pre-assembled bio,
+but wants to direct different portions of the bio to different devices::
+
+   int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where,
+                       int rw, struct bio_vec *bvec,
+                       unsigned long *error_bits);
+   int dm_io_async_bvec(unsigned int num_regions, struct io_region *where,
+                        int rw, struct bio_vec *bvec,
+                        io_notify_fn fn, void *context);
+
+The third I/O service type takes a pointer to a vmalloc'd memory buffer as the
+data buffer for the I/O. This service can be handy if the caller needs to do
+I/O to a large region but doesn't want to allocate a large number of individual
+memory pages::
+
+   int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
+                     void *data, unsigned long *error_bits);
+   int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
+                      void *data, io_notify_fn fn, void *context);
+
+Callers of the asynchronous I/O services must include the name of a completion
+callback routine and a pointer to some context data for the I/O::
+
+   typedef void (*io_notify_fn)(unsigned long error, void *context);
+
+The "error" parameter in this callback, as well as the `*error` parameter in
+all of the synchronous versions, is a bitset (instead of a simple error value).
+In the case of an write-I/O to multiple regions, this bitset allows dm-io to
+indicate success or failure on each individual region.
+
+Before using any of the dm-io services, the user should call dm_io_get()
+and specify the number of pages they expect to perform I/O on concurrently.
+Dm-io will attempt to resize its mempool to make sure enough pages are
+always available in order to avoid unnecessary waiting while performing I/O.
+
+When the user is finished using the dm-io services, they should call
+dm_io_put() and specify the same number of pages that were given on the
+dm_io_get() call.
diff --git a/Documentation/admin-guide/device-mapper/dm-log.rst b/Documentation/admin-guide/device-mapper/dm-log.rst
new file mode 100644
index 000000000000..ba4fce39bc27
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-log.rst
@@ -0,0 +1,57 @@
+=====================
+Device-Mapper Logging
+=====================
+The device-mapper logging code is used by some of the device-mapper
+RAID targets to track regions of the disk that are not consistent.
+A region (or portion of the address space) of the disk may be
+inconsistent because a RAID stripe is currently being operated on or
+a machine died while the region was being altered.  In the case of
+mirrors, a region would be considered dirty/inconsistent while you
+are writing to it because the writes need to be replicated for all
+the legs of the mirror and may not reach the legs at the same time.
+Once all writes are complete, the region is considered clean again.
+
+There is a generic logging interface that the device-mapper RAID
+implementations use to perform logging operations (see
+dm_dirty_log_type in include/linux/dm-dirty-log.h).  Various different
+logging implementations are available and provide different
+capabilities.  The list includes:
+
+==============	==============================================================
+Type		Files
+==============	==============================================================
+disk		drivers/md/dm-log.c
+core		drivers/md/dm-log.c
+userspace	drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h
+==============	==============================================================
+
+The "disk" log type
+-------------------
+This log implementation commits the log state to disk.  This way, the
+logging state survives reboots/crashes.
+
+The "core" log type
+-------------------
+This log implementation keeps the log state in memory.  The log state
+will not survive a reboot or crash, but there may be a small boost in
+performance.  This method can also be used if no storage device is
+available for storing log state.
+
+The "userspace" log type
+------------------------
+This log type simply provides a way to export the log API to userspace,
+so log implementations can be done there.  This is done by forwarding most
+logging requests to userspace, where a daemon receives and processes the
+request.
+
+The structure used for communication between kernel and userspace are
+located in include/linux/dm-log-userspace.h.  Due to the frequency,
+diversity, and 2-way communication nature of the exchanges between
+kernel and userspace, 'connector' is used as the interface for
+communication.
+
+There are currently two userspace log implementations that leverage this
+framework - "clustered-disk" and "clustered-core".  These implementations
+provide a cluster-coherent log for shared-storage.  Device-mapper mirroring
+can be used in a shared-storage environment when the cluster log implementations
+are employed.
diff --git a/Documentation/admin-guide/device-mapper/dm-queue-length.rst b/Documentation/admin-guide/device-mapper/dm-queue-length.rst
new file mode 100644
index 000000000000..d8e381c1cb02
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-queue-length.rst
@@ -0,0 +1,48 @@
+===============
+dm-queue-length
+===============
+
+dm-queue-length is a path selector module for device-mapper targets,
+which selects a path with the least number of in-flight I/Os.
+The path selector name is 'queue-length'.
+
+Table parameters for each path: [<repeat_count>]
+
+::
+
+	<repeat_count>: The number of I/Os to dispatch using the selected
+			path before switching to the next path.
+			If not given, internal default is used. To check
+			the default value, see the activated table.
+
+Status for each path: <status> <fail-count> <in-flight>
+
+::
+
+	<status>: 'A' if the path is active, 'F' if the path is failed.
+	<fail-count>: The number of path failures.
+	<in-flight>: The number of in-flight I/Os on the path.
+
+
+Algorithm
+=========
+
+dm-queue-length increments/decrements 'in-flight' when an I/O is
+dispatched/completed respectively.
+dm-queue-length selects a path with the minimum 'in-flight'.
+
+
+Examples
+========
+In case that 2 paths (sda and sdb) are used with repeat_count == 128.
+
+::
+
+  # echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \
+    dmsetup create test
+  #
+  # dmsetup table
+  test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128
+  #
+  # dmsetup status
+  test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0
diff --git a/Documentation/admin-guide/device-mapper/dm-raid.rst b/Documentation/admin-guide/device-mapper/dm-raid.rst
new file mode 100644
index 000000000000..2fe255b130fb
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-raid.rst
@@ -0,0 +1,419 @@
+=======
+dm-raid
+=======
+
+The device-mapper RAID (dm-raid) target provides a bridge from DM to MD.
+It allows the MD RAID drivers to be accessed using a device-mapper
+interface.
+
+
+Mapping Table Interface
+-----------------------
+The target is named "raid" and it accepts the following parameters::
+
+  <raid_type> <#raid_params> <raid_params> \
+    <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>]
+
+<raid_type>:
+
+  ============= ===============================================================
+  raid0		RAID0 striping (no resilience)
+  raid1		RAID1 mirroring
+  raid4		RAID4 with dedicated last parity disk
+  raid5_n 	RAID5 with dedicated last parity disk supporting takeover
+		Same as raid4
+
+		- Transitory layout
+  raid5_la	RAID5 left asymmetric
+
+		- rotating parity 0 with data continuation
+  raid5_ra	RAID5 right asymmetric
+
+		- rotating parity N with data continuation
+  raid5_ls	RAID5 left symmetric
+
+		- rotating parity 0 with data restart
+  raid5_rs 	RAID5 right symmetric
+
+		- rotating parity N with data restart
+  raid6_zr	RAID6 zero restart
+
+		- rotating parity zero (left-to-right) with data restart
+  raid6_nr	RAID6 N restart
+
+		- rotating parity N (right-to-left) with data restart
+  raid6_nc	RAID6 N continue
+
+		- rotating parity N (right-to-left) with data continuation
+  raid6_n_6	RAID6 with dedicate parity disks
+
+		- parity and Q-syndrome on the last 2 disks;
+		  layout for takeover from/to raid4/raid5_n
+  raid6_la_6	Same as "raid_la" plus dedicated last Q-syndrome disk
+
+		- layout for takeover from raid5_la from/to raid6
+  raid6_ra_6	Same as "raid5_ra" dedicated last Q-syndrome disk
+
+		- layout for takeover from raid5_ra from/to raid6
+  raid6_ls_6	Same as "raid5_ls" dedicated last Q-syndrome disk
+
+		- layout for takeover from raid5_ls from/to raid6
+  raid6_rs_6	Same as "raid5_rs" dedicated last Q-syndrome disk
+
+		- layout for takeover from raid5_rs from/to raid6
+  raid10        Various RAID10 inspired algorithms chosen by additional params
+		(see raid10_format and raid10_copies below)
+
+		- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
+		- RAID1E: Integrated Adjacent Stripe Mirroring
+		- RAID1E: Integrated Offset Stripe Mirroring
+		- and other similar RAID10 variants
+  ============= ===============================================================
+
+  Reference: Chapter 4 of
+  http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
+
+<#raid_params>: The number of parameters that follow.
+
+<raid_params> consists of
+
+    Mandatory parameters:
+        <chunk_size>:
+		      Chunk size in sectors.  This parameter is often known as
+		      "stripe size".  It is the only mandatory parameter and
+		      is placed first.
+
+    followed by optional parameters (in any order):
+	[sync|nosync]
+		Force or prevent RAID initialization.
+
+	[rebuild <idx>]
+		Rebuild drive number 'idx' (first drive is 0).
+
+	[daemon_sleep <ms>]
+		Interval between runs of the bitmap daemon that
+		clear bits.  A longer interval means less bitmap I/O but
+		resyncing after a failure is likely to take longer.
+
+	[min_recovery_rate <kB/sec/disk>]
+		Throttle RAID initialization
+	[max_recovery_rate <kB/sec/disk>]
+		Throttle RAID initialization
+	[write_mostly <idx>]
+		Mark drive index 'idx' write-mostly.
+	[max_write_behind <sectors>]
+		See '--write-behind=' (man mdadm)
+	[stripe_cache <sectors>]
+		Stripe cache size (RAID 4/5/6 only)
+	[region_size <sectors>]
+		The region_size multiplied by the number of regions is the
+		logical size of the array.  The bitmap records the device
+		synchronisation state for each region.
+
+        [raid10_copies   <# copies>], [raid10_format   <near|far|offset>]
+		These two options are used to alter the default layout of
+		a RAID10 configuration.  The number of copies is can be
+		specified, but the default is 2.  There are also three
+		variations to how the copies are laid down - the default
+		is "near".  Near copies are what most people think of with
+		respect to mirroring.  If these options are left unspecified,
+		or 'raid10_copies 2' and/or 'raid10_format near' are given,
+		then the layouts for 2, 3 and 4 devices	are:
+
+		========	 ==========	   ==============
+		2 drives         3 drives          4 drives
+		========	 ==========	   ==============
+		A1  A1           A1  A1  A2        A1  A1  A2  A2
+		A2  A2           A2  A3  A3        A3  A3  A4  A4
+		A3  A3           A4  A4  A5        A5  A5  A6  A6
+		A4  A4           A5  A6  A6        A7  A7  A8  A8
+		..  ..           ..  ..  ..        ..  ..  ..  ..
+		========	 ==========	   ==============
+
+		The 2-device layout is equivalent 2-way RAID1.  The 4-device
+		layout is what a traditional RAID10 would look like.  The
+		3-device layout is what might be called a 'RAID1E - Integrated
+		Adjacent Stripe Mirroring'.
+
+		If 'raid10_copies 2' and 'raid10_format far', then the layouts
+		for 2, 3 and 4 devices are:
+
+		========	     ============	  ===================
+		2 drives             3 drives             4 drives
+		========	     ============	  ===================
+		A1  A2               A1   A2   A3         A1   A2   A3   A4
+		A3  A4               A4   A5   A6         A5   A6   A7   A8
+		A5  A6               A7   A8   A9         A9   A10  A11  A12
+		..  ..               ..   ..   ..         ..   ..   ..   ..
+		A2  A1               A3   A1   A2         A2   A1   A4   A3
+		A4  A3               A6   A4   A5         A6   A5   A8   A7
+		A6  A5               A9   A7   A8         A10  A9   A12  A11
+		..  ..               ..   ..   ..         ..   ..   ..   ..
+		========	     ============	  ===================
+
+		If 'raid10_copies 2' and 'raid10_format offset', then the
+		layouts for 2, 3 and 4 devices are:
+
+		========       ==========         ================
+		2 drives       3 drives           4 drives
+		========       ==========         ================
+		A1  A2         A1  A2  A3         A1  A2  A3  A4
+		A2  A1         A3  A1  A2         A2  A1  A4  A3
+		A3  A4         A4  A5  A6         A5  A6  A7  A8
+		A4  A3         A6  A4  A5         A6  A5  A8  A7
+		A5  A6         A7  A8  A9         A9  A10 A11 A12
+		A6  A5         A9  A7  A8         A10 A9  A12 A11
+		..  ..         ..  ..  ..         ..  ..  ..  ..
+		========       ==========         ================
+
+		Here we see layouts closely akin to 'RAID1E - Integrated
+		Offset Stripe Mirroring'.
+
+        [delta_disks <N>]
+		The delta_disks option value (-251 < N < +251) triggers
+		device removal (negative value) or device addition (positive
+		value) to any reshape supporting raid levels 4/5/6 and 10.
+		RAID levels 4/5/6 allow for addition of devices (metadata
+		and data device tuple), raid10_near and raid10_offset only
+		allow for device addition. raid10_far does not support any
+		reshaping at all.
+		A minimum of devices have to be kept to enforce resilience,
+		which is 3 devices for raid4/5 and 4 devices for raid6.
+
+        [data_offset <sectors>]
+		This option value defines the offset into each data device
+		where the data starts. This is used to provide out-of-place
+		reshaping space to avoid writing over data while
+		changing the layout of stripes, hence an interruption/crash
+		may happen at any time without the risk of losing data.
+		E.g. when adding devices to an existing raid set during
+		forward reshaping, the out-of-place space will be allocated
+		at the beginning of each raid device. The kernel raid4/5/6/10
+		MD personalities supporting such device addition will read the data from
+		the existing first stripes (those with smaller number of stripes)
+		starting at data_offset to fill up a new stripe with the larger
+		number of stripes, calculate the redundancy blocks (CRC/Q-syndrome)
+		and write that new stripe to offset 0. Same will be applied to all
+		N-1 other new stripes. This out-of-place scheme is used to change
+		the RAID type (i.e. the allocation algorithm) as well, e.g.
+		changing from raid5_ls to raid5_n.
+
+	[journal_dev <dev>]
+		This option adds a journal device to raid4/5/6 raid sets and
+		uses it to close the 'write hole' caused by the non-atomic updates
+		to the component devices which can cause data loss during recovery.
+		The journal device is used as writethrough thus causing writes to
+		be throttled versus non-journaled raid4/5/6 sets.
+		Takeover/reshape is not possible with a raid4/5/6 journal device;
+		it has to be deconfigured before requesting these.
+
+	[journal_mode <mode>]
+		This option sets the caching mode on journaled raid4/5/6 raid sets
+		(see 'journal_dev <dev>' above) to 'writethrough' or 'writeback'.
+		If 'writeback' is selected the journal device has to be resilient
+		and must not suffer from the 'write hole' problem itself (e.g. use
+		raid1 or raid10) to avoid a single point of failure.
+
+<#raid_devs>: The number of devices composing the array.
+	Each device consists of two entries.  The first is the device
+	containing the metadata (if any); the second is the one containing the
+	data. A Maximum of 64 metadata/data device entries are supported
+	up to target version 1.8.0.
+	1.9.0 supports up to 253 which is enforced by the used MD kernel runtime.
+
+	If a drive has failed or is missing at creation time, a '-' can be
+	given for both the metadata and data drives for a given position.
+
+
+Example Tables
+--------------
+
+::
+
+  # RAID4 - 4 data drives, 1 parity (no metadata devices)
+  # No metadata devices specified to hold superblock/bitmap info
+  # Chunk size of 1MiB
+  # (Lines separated for easy reading)
+
+  0 1960893648 raid \
+          raid4 1 2048 \
+          5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+
+  # RAID4 - 4 data drives, 1 parity (with metadata devices)
+  # Chunk size of 1MiB, force RAID initialization,
+  #       min recovery rate at 20 kiB/sec/disk
+
+  0 1960893648 raid \
+          raid4 4 2048 sync min_recovery_rate 20 \
+          5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82
+
+
+Status Output
+-------------
+'dmsetup table' displays the table used to construct the mapping.
+The optional parameters are always printed in the order listed
+above with "sync" or "nosync" always output ahead of the other
+arguments, regardless of the order used when originally loading the table.
+Arguments that can be repeated are ordered by value.
+
+
+'dmsetup status' yields information on the state and health of the array.
+The output is as follows (normally a single line, but expanded here for
+clarity)::
+
+  1: <s> <l> raid \
+  2:      <raid_type> <#devices> <health_chars> \
+  3:      <sync_ratio> <sync_action> <mismatch_cnt>
+
+Line 1 is the standard output produced by device-mapper.
+
+Line 2 & 3 are produced by the raid target and are best explained by example::
+
+        0 1960893648 raid raid4 5 AAAAA 2/490221568 init 0
+
+Here we can see the RAID type is raid4, there are 5 devices - all of
+which are 'A'live, and the array is 2/490221568 complete with its initial
+recovery.  Here is a fuller description of the individual fields:
+
+	=============== =========================================================
+	<raid_type>     Same as the <raid_type> used to create the array.
+	<health_chars>  One char for each device, indicating:
+
+			- 'A' = alive and in-sync
+			- 'a' = alive but not in-sync
+			- 'D' = dead/failed.
+	<sync_ratio>    The ratio indicating how much of the array has undergone
+			the process described by 'sync_action'.  If the
+			'sync_action' is "check" or "repair", then the process
+			of "resync" or "recover" can be considered complete.
+	<sync_action>   One of the following possible states:
+
+			idle
+				- No synchronization action is being performed.
+			frozen
+				- The current action has been halted.
+			resync
+				- Array is undergoing its initial synchronization
+				  or is resynchronizing after an unclean shutdown
+				  (possibly aided by a bitmap).
+			recover
+				- A device in the array is being rebuilt or
+				  replaced.
+			check
+				- A user-initiated full check of the array is
+				  being performed.  All blocks are read and
+				  checked for consistency.  The number of
+				  discrepancies found are recorded in
+				  <mismatch_cnt>.  No changes are made to the
+				  array by this action.
+			repair
+				- The same as "check", but discrepancies are
+				  corrected.
+			reshape
+				- The array is undergoing a reshape.
+	<mismatch_cnt>  The number of discrepancies found between mirror copies
+			in RAID1/10 or wrong parity values found in RAID4/5/6.
+			This value is valid only after a "check" of the array
+			is performed.  A healthy array has a 'mismatch_cnt' of 0.
+	<data_offset>   The current data offset to the start of the user data on
+			each component device of a raid set (see the respective
+			raid parameter to support out-of-place reshaping).
+	<journal_char>	- 'A' - active write-through journal device.
+			- 'a' - active write-back journal device.
+			- 'D' - dead journal device.
+			- '-' - no journal device.
+	=============== =========================================================
+
+
+Message Interface
+-----------------
+The dm-raid target will accept certain actions through the 'message' interface.
+('man dmsetup' for more information on the message interface.)  These actions
+include:
+
+	========= ================================================
+	"idle"    Halt the current sync action.
+	"frozen"  Freeze the current sync action.
+	"resync"  Initiate/continue a resync.
+	"recover" Initiate/continue a recover process.
+	"check"   Initiate a check (i.e. a "scrub") of the array.
+	"repair"  Initiate a repair of the array.
+	========= ================================================
+
+
+Discard Support
+---------------
+The implementation of discard support among hardware vendors varies.
+When a block is discarded, some storage devices will return zeroes when
+the block is read.  These devices set the 'discard_zeroes_data'
+attribute.  Other devices will return random data.  Confusingly, some
+devices that advertise 'discard_zeroes_data' will not reliably return
+zeroes when discarded blocks are read!  Since RAID 4/5/6 uses blocks
+from a number of devices to calculate parity blocks and (for performance
+reasons) relies on 'discard_zeroes_data' being reliable, it is important
+that the devices be consistent.  Blocks may be discarded in the middle
+of a RAID 4/5/6 stripe and if subsequent read results are not
+consistent, the parity blocks may be calculated differently at any time;
+making the parity blocks useless for redundancy.  It is important to
+understand how your hardware behaves with discards if you are going to
+enable discards with RAID 4/5/6.
+
+Since the behavior of storage devices is unreliable in this respect,
+even when reporting 'discard_zeroes_data', by default RAID 4/5/6
+discard support is disabled -- this ensures data integrity at the
+expense of losing some performance.
+
+Storage devices that properly support 'discard_zeroes_data' are
+increasingly whitelisted in the kernel and can thus be trusted.
+
+For trusted devices, the following dm-raid module parameter can be set
+to safely enable discard support for RAID 4/5/6:
+
+    'devices_handle_discards_safely'
+
+
+Version History
+---------------
+
+::
+
+ 1.0.0	Initial version.  Support for RAID 4/5/6
+ 1.1.0	Added support for RAID 1
+ 1.2.0	Handle creation of arrays that contain failed devices.
+ 1.3.0	Added support for RAID 10
+ 1.3.1	Allow device replacement/rebuild for RAID 10
+ 1.3.2	Fix/improve redundancy checking for RAID10
+ 1.4.0	Non-functional change.  Removes arg from mapping function.
+ 1.4.1	RAID10 fix redundancy validation checks (commit 55ebbb5).
+ 1.4.2	Add RAID10 "far" and "offset" algorithm support.
+ 1.5.0	Add message interface to allow manipulation of the sync_action.
+	New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt.
+ 1.5.1	Add ability to restore transiently failed devices on resume.
+ 1.5.2	'mismatch_cnt' is zero unless [last_]sync_action is "check".
+ 1.6.0	Add discard support (and devices_handle_discard_safely module param).
+ 1.7.0	Add support for MD RAID0 mappings.
+ 1.8.0	Explicitly check for compatible flags in the superblock metadata
+	and reject to start the raid set if any are set by a newer
+	target version, thus avoiding data corruption on a raid set
+	with a reshape in progress.
+ 1.9.0	Add support for RAID level takeover/reshape/region size
+	and set size reduction.
+ 1.9.1	Fix activation of existing RAID 4/10 mapped devices
+ 1.9.2	Don't emit '- -' on the status table line in case the constructor
+	fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and
+	'D' on the status line.  If '- -' is passed into the constructor, emit
+	'- -' on the table line and '-' as the status line health character.
+ 1.10.0	Add support for raid4/5/6 journal device
+ 1.10.1	Fix data corruption on reshape request
+ 1.11.0	Fix table line argument order
+	(wrong raid10_copies/raid10_format sequence)
+ 1.11.1	Add raid4/5/6 journal write-back support via journal_mode option
+ 1.12.1	Fix for MD deadlock between mddev_suspend() and md_write_start() available
+ 1.13.0	Fix dev_health status at end of "recover" (was 'a', now 'A')
+ 1.13.1	Fix deadlock caused by early md_stop_writes().  Also fix size an
+	state races.
+ 1.13.2	Fix raid redundancy validation and avoid keeping raid set frozen
+ 1.14.0	Fix reshape race on small devices.  Fix stripe adding reshape
+	deadlock/potential data corruption.  Update superblock when
+	specific devices are requested via rebuild.  Fix RAID leg
+	rebuild errors.
diff --git a/Documentation/admin-guide/device-mapper/dm-service-time.rst b/Documentation/admin-guide/device-mapper/dm-service-time.rst
new file mode 100644
index 000000000000..facf277fc13c
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-service-time.rst
@@ -0,0 +1,101 @@
+===============
+dm-service-time
+===============
+
+dm-service-time is a path selector module for device-mapper targets,
+which selects a path with the shortest estimated service time for
+the incoming I/O.
+
+The service time for each path is estimated by dividing the total size
+of in-flight I/Os on a path with the performance value of the path.
+The performance value is a relative throughput value among all paths
+in a path-group, and it can be specified as a table argument.
+
+The path selector name is 'service-time'.
+
+Table parameters for each path:
+
+    [<repeat_count> [<relative_throughput>]]
+	<repeat_count>:
+			The number of I/Os to dispatch using the selected
+			path before switching to the next path.
+			If not given, internal default is used.  To check
+			the default value, see the activated table.
+	<relative_throughput>:
+			The relative throughput value of the path
+			among all paths in the path-group.
+			The valid range is 0-100.
+			If not given, minimum value '1' is used.
+			If '0' is given, the path isn't selected while
+			other paths having a positive value are available.
+
+Status for each path:
+
+    <status> <fail-count> <in-flight-size> <relative_throughput>
+	<status>:
+		'A' if the path is active, 'F' if the path is failed.
+	<fail-count>:
+		The number of path failures.
+	<in-flight-size>:
+		The size of in-flight I/Os on the path.
+	<relative_throughput>:
+		The relative throughput value of the path
+		among all paths in the path-group.
+
+
+Algorithm
+=========
+
+dm-service-time adds the I/O size to 'in-flight-size' when the I/O is
+dispatched and subtracts when completed.
+Basically, dm-service-time selects a path having minimum service time
+which is calculated by::
+
+	('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput'
+
+However, some optimizations below are used to reduce the calculation
+as much as possible.
+
+	1. If the paths have the same 'relative_throughput', skip
+	   the division and just compare the 'in-flight-size'.
+
+	2. If the paths have the same 'in-flight-size', skip the division
+	   and just compare the 'relative_throughput'.
+
+	3. If some paths have non-zero 'relative_throughput' and others
+	   have zero 'relative_throughput', ignore those paths with zero
+	   'relative_throughput'.
+
+If such optimizations can't be applied, calculate service time, and
+compare service time.
+If calculated service time is equal, the path having maximum
+'relative_throughput' may be better.  So compare 'relative_throughput'
+then.
+
+
+Examples
+========
+In case that 2 paths (sda and sdb) are used with repeat_count == 128
+and sda has an average throughput 1GB/s and sdb has 4GB/s,
+'relative_throughput' value may be '1' for sda and '4' for sdb::
+
+  # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
+    dmsetup create test
+  #
+  # dmsetup table
+  test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4
+  #
+  # dmsetup status
+  test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4
+
+
+Or '2' for sda and '8' for sdb would be also true::
+
+  # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \
+    dmsetup create test
+  #
+  # dmsetup table
+  test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8
+  #
+  # dmsetup status
+  test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8
diff --git a/Documentation/admin-guide/device-mapper/dm-uevent.rst b/Documentation/admin-guide/device-mapper/dm-uevent.rst
new file mode 100644
index 000000000000..4a8ee8d069c9
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-uevent.rst
@@ -0,0 +1,110 @@
+====================
+device-mapper uevent
+====================
+
+The device-mapper uevent code adds the capability to device-mapper to create
+and send kobject uevents (uevents).  Previously device-mapper events were only
+available through the ioctl interface.  The advantage of the uevents interface
+is the event contains environment attributes providing increased context for
+the event avoiding the need to query the state of the device-mapper device after
+the event is received.
+
+There are two functions currently for device-mapper events.  The first function
+listed creates the event and the second function sends the event(s)::
+
+  void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
+                      const char *path, unsigned nr_valid_paths)
+
+  void dm_send_uevents(struct list_head *events, struct kobject *kobj)
+
+
+The variables added to the uevent environment are:
+
+Variable Name: DM_TARGET
+------------------------
+:Uevent Action(s): KOBJ_CHANGE
+:Type: string
+:Description:
+:Value: Name of device-mapper target that generated the event.
+
+Variable Name: DM_ACTION
+------------------------
+:Uevent Action(s): KOBJ_CHANGE
+:Type: string
+:Description:
+:Value: Device-mapper specific action that caused the uevent action.
+	PATH_FAILED - A path has failed;
+	PATH_REINSTATED - A path has been reinstated.
+
+Variable Name: DM_SEQNUM
+------------------------
+:Uevent Action(s): KOBJ_CHANGE
+:Type: unsigned integer
+:Description: A sequence number for this specific device-mapper device.
+:Value: Valid unsigned integer range.
+
+Variable Name: DM_PATH
+----------------------
+:Uevent Action(s): KOBJ_CHANGE
+:Type: string
+:Description: Major and minor number of the path device pertaining to this
+	      event.
+:Value: Path name in the form of "Major:Minor"
+
+Variable Name: DM_NR_VALID_PATHS
+--------------------------------
+:Uevent Action(s): KOBJ_CHANGE
+:Type: unsigned integer
+:Description:
+:Value: Valid unsigned integer range.
+
+Variable Name: DM_NAME
+----------------------
+:Uevent Action(s): KOBJ_CHANGE
+:Type: string
+:Description: Name of the device-mapper device.
+:Value: Name
+
+Variable Name: DM_UUID
+----------------------
+:Uevent Action(s): KOBJ_CHANGE
+:Type: string
+:Description: UUID of the device-mapper device.
+:Value: UUID. (Empty string if there isn't one.)
+
+An example of the uevents generated as captured by udevmonitor is shown
+below
+
+1.) Path failure::
+
+	UEVENT[1192521009.711215] change@/block/dm-3
+	ACTION=change
+	DEVPATH=/block/dm-3
+	SUBSYSTEM=block
+	DM_TARGET=multipath
+	DM_ACTION=PATH_FAILED
+	DM_SEQNUM=1
+	DM_PATH=8:32
+	DM_NR_VALID_PATHS=0
+	DM_NAME=mpath2
+	DM_UUID=mpath-35333333000002328
+	MINOR=3
+	MAJOR=253
+	SEQNUM=1130
+
+2.) Path reinstate::
+
+	UEVENT[1192521132.989927] change@/block/dm-3
+	ACTION=change
+	DEVPATH=/block/dm-3
+	SUBSYSTEM=block
+	DM_TARGET=multipath
+	DM_ACTION=PATH_REINSTATED
+	DM_SEQNUM=2
+	DM_PATH=8:32
+	DM_NR_VALID_PATHS=1
+	DM_NAME=mpath2
+	DM_UUID=mpath-35333333000002328
+	MINOR=3
+	MAJOR=253
+	SEQNUM=1131
diff --git a/Documentation/admin-guide/device-mapper/dm-zoned.rst b/Documentation/admin-guide/device-mapper/dm-zoned.rst
new file mode 100644
index 000000000000..07f56ebc1730
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-zoned.rst
@@ -0,0 +1,146 @@
+========
+dm-zoned
+========
+
+The dm-zoned device mapper target exposes a zoned block device (ZBC and
+ZAC compliant devices) as a regular block device without any write
+pattern constraints. In effect, it implements a drive-managed zoned
+block device which hides from the user (a file system or an application
+doing raw block device accesses) the sequential write constraints of
+host-managed zoned block devices and can mitigate the potential
+device-side performance degradation due to excessive random writes on
+host-aware zoned block devices.
+
+For a more detailed description of the zoned block device models and
+their constraints see (for SCSI devices):
+
+http://www.t10.org/drafts.htm#ZBC_Family
+
+and (for ATA devices):
+
+http://www.t13.org/Documents/UploadedDocuments/docs2015/di537r05-Zoned_Device_ATA_Command_Set_ZAC.pdf
+
+The dm-zoned implementation is simple and minimizes system overhead (CPU
+and memory usage as well as storage capacity loss). For a 10TB
+host-managed disk with 256 MB zones, dm-zoned memory usage per disk
+instance is at most 4.5 MB and as little as 5 zones will be used
+internally for storing metadata and performaing reclaim operations.
+
+dm-zoned target devices are formatted and checked using the dmzadm
+utility available at:
+
+https://github.com/hgst/dm-zoned-tools
+
+Algorithm
+=========
+
+dm-zoned implements an on-disk buffering scheme to handle non-sequential
+write accesses to the sequential zones of a zoned block device.
+Conventional zones are used for caching as well as for storing internal
+metadata.
+
+The zones of the device are separated into 2 types:
+
+1) Metadata zones: these are conventional zones used to store metadata.
+Metadata zones are not reported as useable capacity to the user.
+
+2) Data zones: all remaining zones, the vast majority of which will be
+sequential zones used exclusively to store user data. The conventional
+zones of the device may be used also for buffering user random writes.
+Data in these zones may be directly mapped to the conventional zone, but
+later moved to a sequential zone so that the conventional zone can be
+reused for buffering incoming random writes.
+
+dm-zoned exposes a logical device with a sector size of 4096 bytes,
+irrespective of the physical sector size of the backend zoned block
+device being used. This allows reducing the amount of metadata needed to
+manage valid blocks (blocks written).
+
+The on-disk metadata format is as follows:
+
+1) The first block of the first conventional zone found contains the
+super block which describes the on disk amount and position of metadata
+blocks.
+
+2) Following the super block, a set of blocks is used to describe the
+mapping of the logical device blocks. The mapping is done per chunk of
+blocks, with the chunk size equal to the zoned block device size. The
+mapping table is indexed by chunk number and each mapping entry
+indicates the zone number of the device storing the chunk of data. Each
+mapping entry may also indicate if the zone number of a conventional
+zone used to buffer random modification to the data zone.
+
+3) A set of blocks used to store bitmaps indicating the validity of
+blocks in the data zones follows the mapping table. A valid block is
+defined as a block that was written and not discarded. For a buffered
+data chunk, a block is always valid only in the data zone mapping the
+chunk or in the buffer zone of the chunk.
+
+For a logical chunk mapped to a conventional zone, all write operations
+are processed by directly writing to the zone. If the mapping zone is a
+sequential zone, the write operation is processed directly only if the
+write offset within the logical chunk is equal to the write pointer
+offset within of the sequential data zone (i.e. the write operation is
+aligned on the zone write pointer). Otherwise, write operations are
+processed indirectly using a buffer zone. In that case, an unused
+conventional zone is allocated and assigned to the chunk being
+accessed. Writing a block to the buffer zone of a chunk will
+automatically invalidate the same block in the sequential zone mapping
+the chunk. If all blocks of the sequential zone become invalid, the zone
+is freed and the chunk buffer zone becomes the primary zone mapping the
+chunk, resulting in native random write performance similar to a regular
+block device.
+
+Read operations are processed according to the block validity
+information provided by the bitmaps. Valid blocks are read either from
+the sequential zone mapping a chunk, or if the chunk is buffered, from
+the buffer zone assigned. If the accessed chunk has no mapping, or the
+accessed blocks are invalid, the read buffer is zeroed and the read
+operation terminated.
+
+After some time, the limited number of convnetional zones available may
+be exhausted (all used to map chunks or buffer sequential zones) and
+unaligned writes to unbuffered chunks become impossible. To avoid this
+situation, a reclaim process regularly scans used conventional zones and
+tries to reclaim the least recently used zones by copying the valid
+blocks of the buffer zone to a free sequential zone. Once the copy
+completes, the chunk mapping is updated to point to the sequential zone
+and the buffer zone freed for reuse.
+
+Metadata Protection
+===================
+
+To protect metadata against corruption in case of sudden power loss or
+system crash, 2 sets of metadata zones are used. One set, the primary
+set, is used as the main metadata region, while the secondary set is
+used as a staging area. Modified metadata is first written to the
+secondary set and validated by updating the super block in the secondary
+set, a generation counter is used to indicate that this set contains the
+newest metadata. Once this operation completes, in place of metadata
+block updates can be done in the primary metadata set. This ensures that
+one of the set is always consistent (all modifications committed or none
+at all). Flush operations are used as a commit point. Upon reception of
+a flush request, metadata modification activity is temporarily blocked
+(for both incoming BIO processing and reclaim process) and all dirty
+metadata blocks are staged and updated. Normal operation is then
+resumed. Flushing metadata thus only temporarily delays write and
+discard requests. Read requests can be processed concurrently while
+metadata flush is being executed.
+
+Usage
+=====
+
+A zoned block device must first be formatted using the dmzadm tool. This
+will analyze the device zone configuration, determine where to place the
+metadata sets on the device and initialize the metadata sets.
+
+Ex::
+
+	dmzadm --format /dev/sdxx
+
+For a formatted device, the target can be created normally with the
+dmsetup utility. The only parameter that dm-zoned requires is the
+underlying zoned block device name. Ex::
+
+	echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | \
+	dmsetup create dmz-`basename ${dev}`
diff --git a/Documentation/admin-guide/device-mapper/era.rst b/Documentation/admin-guide/device-mapper/era.rst
new file mode 100644
index 000000000000..90dd5c670b9f
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/era.rst
@@ -0,0 +1,116 @@
+======
+dm-era
+======
+
+Introduction
+============
+
+dm-era is a target that behaves similar to the linear target.  In
+addition it keeps track of which blocks were written within a user
+defined period of time called an 'era'.  Each era target instance
+maintains the current era as a monotonically increasing 32-bit
+counter.
+
+Use cases include tracking changed blocks for backup software, and
+partially invalidating the contents of a cache to restore cache
+coherency after rolling back a vendor snapshot.
+
+Constructor
+===========
+
+era <metadata dev> <origin dev> <block size>
+
+ ================ ======================================================
+ metadata dev     fast device holding the persistent metadata
+ origin dev	  device holding data blocks that may change
+ block size       block size of origin data device, granularity that is
+		  tracked by the target
+ ================ ======================================================
+
+Messages
+========
+
+None of the dm messages take any arguments.
+
+checkpoint
+----------
+
+Possibly move to a new era.  You shouldn't assume the era has
+incremented.  After sending this message, you should check the
+current era via the status line.
+
+take_metadata_snap
+------------------
+
+Create a clone of the metadata, to allow a userland process to read it.
+
+drop_metadata_snap
+------------------
+
+Drop the metadata snapshot.
+
+Status
+======
+
+<metadata block size> <#used metadata blocks>/<#total metadata blocks>
+<current era> <held metadata root | '-'>
+
+========================= ==============================================
+metadata block size	  Fixed block size for each metadata block in
+			  sectors
+#used metadata blocks	  Number of metadata blocks used
+#total metadata blocks	  Total number of metadata blocks
+current era		  The current era
+held metadata root	  The location, in blocks, of the metadata root
+			  that has been 'held' for userspace read
+			  access. '-' indicates there is no held root
+========================= ==============================================
+
+Detailed use case
+=================
+
+The scenario of invalidating a cache when rolling back a vendor
+snapshot was the primary use case when developing this target:
+
+Taking a vendor snapshot
+------------------------
+
+- Send a checkpoint message to the era target
+- Make a note of the current era in its status line
+- Take vendor snapshot (the era and snapshot should be forever
+  associated now).
+
+Rolling back to an vendor snapshot
+----------------------------------
+
+- Cache enters passthrough mode (see: dm-cache's docs in cache.txt)
+- Rollback vendor storage
+- Take metadata snapshot
+- Ascertain which blocks have been written since the snapshot was taken
+  by checking each block's era
+- Invalidate those blocks in the caching software
+- Cache returns to writeback/writethrough mode
+
+Memory usage
+============
+
+The target uses a bitset to record writes in the current era.  It also
+has a spare bitset ready for switching over to a new era.  Other than
+that it uses a few 4k blocks for updating metadata::
+
+   (4 * nr_blocks) bytes + buffers
+
+Resilience
+==========
+
+Metadata is updated on disk before a write to a previously unwritten
+block is performed.  As such dm-era should not be effected by a hard
+crash such as power failure.
+
+Userland tools
+==============
+
+Userland tools are found in the increasingly poorly named
+thin-provisioning-tools project:
+
+    https://github.com/jthornber/thin-provisioning-tools
diff --git a/Documentation/admin-guide/device-mapper/index.rst b/Documentation/admin-guide/device-mapper/index.rst
new file mode 100644
index 000000000000..c77c58b8f67b
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/index.rst
@@ -0,0 +1,42 @@
+=============
+Device Mapper
+=============
+
+.. toctree::
+    :maxdepth: 1
+
+    cache-policies
+    cache
+    delay
+    dm-crypt
+    dm-flakey
+    dm-init
+    dm-integrity
+    dm-io
+    dm-log
+    dm-queue-length
+    dm-raid
+    dm-service-time
+    dm-uevent
+    dm-zoned
+    era
+    kcopyd
+    linear
+    log-writes
+    persistent-data
+    snapshot
+    statistics
+    striped
+    switch
+    thin-provisioning
+    unstriped
+    verity
+    writecache
+    zero
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/admin-guide/device-mapper/kcopyd.rst b/Documentation/admin-guide/device-mapper/kcopyd.rst
new file mode 100644
index 000000000000..7651d395127f
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/kcopyd.rst
@@ -0,0 +1,47 @@
+======
+kcopyd
+======
+
+Kcopyd provides the ability to copy a range of sectors from one block-device
+to one or more other block-devices, with an asynchronous completion
+notification. It is used by dm-snapshot and dm-mirror.
+
+Users of kcopyd must first create a client and indicate how many memory pages
+to set aside for their copy jobs. This is done with a call to
+kcopyd_client_create()::
+
+   int kcopyd_client_create(unsigned int num_pages,
+                            struct kcopyd_client **result);
+
+To start a copy job, the user must set up io_region structures to describe
+the source and destinations of the copy. Each io_region indicates a
+block-device along with the starting sector and size of the region. The source
+of the copy is given as one io_region structure, and the destinations of the
+copy are given as an array of io_region structures::
+
+   struct io_region {
+      struct block_device *bdev;
+      sector_t sector;
+      sector_t count;
+   };
+
+To start the copy, the user calls kcopyd_copy(), passing in the client
+pointer, pointers to the source and destination io_regions, the name of a
+completion callback routine, and a pointer to some context data for the copy::
+
+   int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
+                   unsigned int num_dests, struct io_region *dests,
+                   unsigned int flags, kcopyd_notify_fn fn, void *context);
+
+   typedef void (*kcopyd_notify_fn)(int read_err, unsigned int write_err,
+				    void *context);
+
+When the copy completes, kcopyd will call the user's completion routine,
+passing back the user's context pointer. It will also indicate if a read or
+write error occurred during the copy.
+
+When a user is done with all their copy jobs, they should call
+kcopyd_client_destroy() to delete the kcopyd client, which will release the
+associated memory pages::
+
+   void kcopyd_client_destroy(struct kcopyd_client *kc);
diff --git a/Documentation/admin-guide/device-mapper/linear.rst b/Documentation/admin-guide/device-mapper/linear.rst
new file mode 100644
index 000000000000..9d17fc6e64a9
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/linear.rst
@@ -0,0 +1,63 @@
+=========
+dm-linear
+=========
+
+Device-Mapper's "linear" target maps a linear range of the Device-Mapper
+device onto a linear range of another device.  This is the basic building
+block of logical volume managers.
+
+Parameters: <dev path> <offset>
+    <dev path>:
+	Full pathname to the underlying block-device, or a
+        "major:minor" device-number.
+    <offset>:
+	Starting sector within the device.
+
+
+Example scripts
+===============
+
+::
+
+  #!/bin/sh
+  # Create an identity mapping for a device
+  echo "0 `blockdev --getsz $1` linear $1 0" | dmsetup create identity
+
+::
+
+  #!/bin/sh
+  # Join 2 devices together
+  size1=`blockdev --getsz $1`
+  size2=`blockdev --getsz $2`
+  echo "0 $size1 linear $1 0
+  $size1 $size2 linear $2 0" | dmsetup create joined
+
+::
+
+  #!/usr/bin/perl -w
+  # Split a device into 4M chunks and then join them together in reverse order.
+
+  my $name = "reverse";
+  my $extent_size = 4 * 1024 * 2;
+  my $dev = $ARGV[0];
+  my $table = "";
+  my $count = 0;
+
+  if (!defined($dev)) {
+          die("Please specify a device.\n");
+  }
+
+  my $dev_size = `blockdev --getsz $dev`;
+  my $extents = int($dev_size / $extent_size) -
+                (($dev_size % $extent_size) ? 1 : 0);
+
+  while ($extents > 0) {
+          my $this_start = $count * $extent_size;
+          $extents--;
+          $count++;
+          my $this_offset = $extents * $extent_size;
+
+          $table .= "$this_start $extent_size linear $dev $this_offset\n";
+  }
+
+  `echo \"$table\" | dmsetup create $name`;
diff --git a/Documentation/admin-guide/device-mapper/log-writes.rst b/Documentation/admin-guide/device-mapper/log-writes.rst
new file mode 100644
index 000000000000..23141f2ffb7c
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/log-writes.rst
@@ -0,0 +1,145 @@
+=============
+dm-log-writes
+=============
+
+This target takes 2 devices, one to pass all IO to normally, and one to log all
+of the write operations to.  This is intended for file system developers wishing
+to verify the integrity of metadata or data as the file system is written to.
+There is a log_write_entry written for every WRITE request and the target is
+able to take arbitrary data from userspace to insert into the log.  The data
+that is in the WRITE requests is copied into the log to make the replay happen
+exactly as it happened originally.
+
+Log Ordering
+============
+
+We log things in order of completion once we are sure the write is no longer in
+cache.  This means that normal WRITE requests are not actually logged until the
+next REQ_PREFLUSH request.  This is to make it easier for userspace to replay
+the log in a way that correlates to what is on disk and not what is in cache,
+to make it easier to detect improper waiting/flushing.
+
+This works by attaching all WRITE requests to a list once the write completes.
+Once we see a REQ_PREFLUSH request we splice this list onto the request and once
+the FLUSH request completes we log all of the WRITEs and then the FLUSH.  Only
+completed WRITEs, at the time the REQ_PREFLUSH is issued, are added in order to
+simulate the worst case scenario with regard to power failures.  Consider the
+following example (W means write, C means complete):
+
+	W1,W2,W3,C3,C2,Wflush,C1,Cflush
+
+The log would show the following:
+
+	W3,W2,flush,W1....
+
+Again this is to simulate what is actually on disk, this allows us to detect
+cases where a power failure at a particular point in time would create an
+inconsistent file system.
+
+Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as
+they complete as those requests will obviously bypass the device cache.
+
+Any REQ_OP_DISCARD requests are treated like WRITE requests.  Otherwise we would
+have all the DISCARD requests, and then the WRITE requests and then the FLUSH
+request.  Consider the following example:
+
+	WRITE block 1, DISCARD block 1, FLUSH
+
+If we logged DISCARD when it completed, the replay would look like this:
+
+	DISCARD 1, WRITE 1, FLUSH
+
+which isn't quite what happened and wouldn't be caught during the log replay.
+
+Target interface
+================
+
+i) Constructor
+
+   log-writes <dev_path> <log_dev_path>
+
+   ============= ==============================================
+   dev_path	 Device that all of the IO will go to normally.
+   log_dev_path  Device where the log entries are written to.
+   ============= ==============================================
+
+ii) Status
+
+    <#logged entries> <highest allocated sector>
+
+    =========================== ========================
+    #logged entries	        Number of logged entries
+    highest allocated sector    Highest allocated sector
+    =========================== ========================
+
+iii) Messages
+
+    mark <description>
+
+	You can use a dmsetup message to set an arbitrary mark in a log.
+	For example say you want to fsck a file system after every
+	write, but first you need to replay up to the mkfs to make sure
+	we're fsck'ing something reasonable, you would do something like
+	this::
+
+	  mkfs.btrfs -f /dev/mapper/log
+	  dmsetup message log 0 mark mkfs
+	  <run test>
+
+	This would allow you to replay the log up to the mkfs mark and
+	then replay from that point on doing the fsck check in the
+	interval that you want.
+
+	Every log has a mark at the end labeled "dm-log-writes-end".
+
+Userspace component
+===================
+
+There is a userspace tool that will replay the log for you in various ways.
+It can be found here: https://github.com/josefbacik/log-writes
+
+Example usage
+=============
+
+Say you want to test fsync on your file system.  You would do something like
+this::
+
+  TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
+  dmsetup create log --table "$TABLE"
+  mkfs.btrfs -f /dev/mapper/log
+  dmsetup message log 0 mark mkfs
+
+  mount /dev/mapper/log /mnt/btrfs-test
+  <some test that does fsync at the end>
+  dmsetup message log 0 mark fsync
+  md5sum /mnt/btrfs-test/foo
+  umount /mnt/btrfs-test
+
+  dmsetup remove log
+  replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync
+  mount /dev/sdb /mnt/btrfs-test
+  md5sum /mnt/btrfs-test/foo
+  <verify md5sum's are correct>
+
+  Another option is to do a complicated file system operation and verify the file
+  system is consistent during the entire operation.  You could do this with:
+
+  TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
+  dmsetup create log --table "$TABLE"
+  mkfs.btrfs -f /dev/mapper/log
+  dmsetup message log 0 mark mkfs
+
+  mount /dev/mapper/log /mnt/btrfs-test
+  <fsstress to dirty the fs>
+  btrfs filesystem balance /mnt/btrfs-test
+  umount /mnt/btrfs-test
+  dmsetup remove log
+
+  replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs
+  btrfsck /dev/sdb
+  replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \
+	--fsck "btrfsck /dev/sdb" --check fua
+
+And that will replay the log until it sees a FUA request, run the fsck command
+and if the fsck passes it will replay to the next FUA, until it is completed or
+the fsck command exists abnormally.
diff --git a/Documentation/admin-guide/device-mapper/persistent-data.rst b/Documentation/admin-guide/device-mapper/persistent-data.rst
new file mode 100644
index 000000000000..2065c3c5a091
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/persistent-data.rst
@@ -0,0 +1,88 @@
+===============
+Persistent data
+===============
+
+Introduction
+============
+
+The more-sophisticated device-mapper targets require complex metadata
+that is managed in kernel.  In late 2010 we were seeing that various
+different targets were rolling their own data structures, for example:
+
+- Mikulas Patocka's multisnap implementation
+- Heinz Mauelshagen's thin provisioning target
+- Another btree-based caching target posted to dm-devel
+- Another multi-snapshot target based on a design of Daniel Phillips
+
+Maintaining these data structures takes a lot of work, so if possible
+we'd like to reduce the number.
+
+The persistent-data library is an attempt to provide a re-usable
+framework for people who want to store metadata in device-mapper
+targets.  It's currently used by the thin-provisioning target and an
+upcoming hierarchical storage target.
+
+Overview
+========
+
+The main documentation is in the header files which can all be found
+under drivers/md/persistent-data.
+
+The block manager
+-----------------
+
+dm-block-manager.[hc]
+
+This provides access to the data on disk in fixed sized-blocks.  There
+is a read/write locking interface to prevent concurrent accesses, and
+keep data that is being used in the cache.
+
+Clients of persistent-data are unlikely to use this directly.
+
+The transaction manager
+-----------------------
+
+dm-transaction-manager.[hc]
+
+This restricts access to blocks and enforces copy-on-write semantics.
+The only way you can get hold of a writable block through the
+transaction manager is by shadowing an existing block (ie. doing
+copy-on-write) or allocating a fresh one.  Shadowing is elided within
+the same transaction so performance is reasonable.  The commit method
+ensures that all data is flushed before it writes the superblock.
+On power failure your metadata will be as it was when last committed.
+
+The Space Maps
+--------------
+
+dm-space-map.h
+dm-space-map-metadata.[hc]
+dm-space-map-disk.[hc]
+
+On-disk data structures that keep track of reference counts of blocks.
+Also acts as the allocator of new blocks.  Currently two
+implementations: a simpler one for managing blocks on a different
+device (eg. thinly-provisioned data blocks); and one for managing
+the metadata space.  The latter is complicated by the need to store
+its own data within the space it's managing.
+
+The data structures
+-------------------
+
+dm-btree.[hc]
+dm-btree-remove.c
+dm-btree-spine.c
+dm-btree-internal.h
+
+Currently there is only one data structure, a hierarchical btree.
+There are plans to add more.  For example, something with an
+array-like interface would see a lot of use.
+
+The btree is 'hierarchical' in that you can define it to be composed
+of nested btrees, and take multiple keys.  For example, the
+thin-provisioning target uses a btree with two levels of nesting.
+The first maps a device id to a mapping tree, and that in turn maps a
+virtual block to a physical block.
+
+Values stored in the btrees can have arbitrary size.  Keys are always
+64bits, although nesting allows you to use multiple keys.
diff --git a/Documentation/admin-guide/device-mapper/snapshot.rst b/Documentation/admin-guide/device-mapper/snapshot.rst
new file mode 100644
index 000000000000..ccdd8b587a74
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/snapshot.rst
@@ -0,0 +1,196 @@
+==============================
+Device-mapper snapshot support
+==============================
+
+Device-mapper allows you, without massive data copying:
+
+-  To create snapshots of any block device i.e. mountable, saved states of
+   the block device which are also writable without interfering with the
+   original content;
+-  To create device "forks", i.e. multiple different versions of the
+   same data stream.
+-  To merge a snapshot of a block device back into the snapshot's origin
+   device.
+
+In the first two cases, dm copies only the chunks of data that get
+changed and uses a separate copy-on-write (COW) block device for
+storage.
+
+For snapshot merge the contents of the COW storage are merged back into
+the origin device.
+
+
+There are three dm targets available:
+snapshot, snapshot-origin, and snapshot-merge.
+
+-  snapshot-origin <origin>
+
+which will normally have one or more snapshots based on it.
+Reads will be mapped directly to the backing device. For each write, the
+original data will be saved in the <COW device> of each snapshot to keep
+its visible content unchanged, at least until the <COW device> fills up.
+
+
+-  snapshot <origin> <COW device> <persistent?> <chunksize>
+   [<# feature args> [<arg>]*]
+
+A snapshot of the <origin> block device is created. Changed chunks of
+<chunksize> sectors will be stored on the <COW device>.  Writes will
+only go to the <COW device>.  Reads will come from the <COW device> or
+from <origin> for unchanged data.  <COW device> will often be
+smaller than the origin and if it fills up the snapshot will become
+useless and be disabled, returning errors.  So it is important to monitor
+the amount of free space and expand the <COW device> before it fills up.
+
+<persistent?> is P (Persistent) or N (Not persistent - will not survive
+after reboot).  O (Overflow) can be added as a persistent store option
+to allow userspace to advertise its support for seeing "Overflow" in the
+snapshot status.  So supported store types are "P", "PO" and "N".
+
+The difference between persistent and transient is with transient
+snapshots less metadata must be saved on disk - they can be kept in
+memory by the kernel.
+
+When loading or unloading the snapshot target, the corresponding
+snapshot-origin or snapshot-merge target must be suspended. A failure to
+suspend the origin target could result in data corruption.
+
+Optional features:
+
+   discard_zeroes_cow - a discard issued to the snapshot device that
+   maps to entire chunks to will zero the corresponding exception(s) in
+   the snapshot's exception store.
+
+   discard_passdown_origin - a discard to the snapshot device is passed
+   down to the snapshot-origin's underlying device.  This doesn't cause
+   copy-out to the snapshot exception store because the snapshot-origin
+   target is bypassed.
+
+   The discard_passdown_origin feature depends on the discard_zeroes_cow
+   feature being enabled.
+
+
+-  snapshot-merge <origin> <COW device> <persistent> <chunksize>
+   [<# feature args> [<arg>]*]
+
+takes the same table arguments as the snapshot target except it only
+works with persistent snapshots.  This target assumes the role of the
+"snapshot-origin" target and must not be loaded if the "snapshot-origin"
+is still present for <origin>.
+
+Creates a merging snapshot that takes control of the changed chunks
+stored in the <COW device> of an existing snapshot, through a handover
+procedure, and merges these chunks back into the <origin>.  Once merging
+has started (in the background) the <origin> may be opened and the merge
+will continue while I/O is flowing to it.  Changes to the <origin> are
+deferred until the merging snapshot's corresponding chunk(s) have been
+merged.  Once merging has started the snapshot device, associated with
+the "snapshot" target, will return -EIO when accessed.
+
+
+How snapshot is used by LVM2
+============================
+When you create the first LVM2 snapshot of a volume, four dm devices are used:
+
+1) a device containing the original mapping table of the source volume;
+2) a device used as the <COW device>;
+3) a "snapshot" device, combining #1 and #2, which is the visible snapshot
+   volume;
+4) the "original" volume (which uses the device number used by the original
+   source volume), whose table is replaced by a "snapshot-origin" mapping
+   from device #1.
+
+A fixed naming scheme is used, so with the following commands::
+
+  lvcreate -L 1G -n base volumeGroup
+  lvcreate -L 100M --snapshot -n snap volumeGroup/base
+
+we'll have this situation (with volumes in above order)::
+
+  # dmsetup table|grep volumeGroup
+
+  volumeGroup-base-real: 0 2097152 linear 8:19 384
+  volumeGroup-snap-cow: 0 204800 linear 8:19 2097536
+  volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16
+  volumeGroup-base: 0 2097152 snapshot-origin 254:11
+
+  # ls -lL /dev/mapper/volumeGroup-*
+  brw-------  1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
+  brw-------  1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
+  brw-------  1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
+  brw-------  1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base
+
+
+How snapshot-merge is used by LVM2
+==================================
+A merging snapshot assumes the role of the "snapshot-origin" while
+merging.  As such the "snapshot-origin" is replaced with
+"snapshot-merge".  The "-real" device is not changed and the "-cow"
+device is renamed to <origin name>-cow to aid LVM2's cleanup of the
+merging snapshot after it completes.  The "snapshot" that hands over its
+COW device to the "snapshot-merge" is deactivated (unless using lvchange
+--refresh); but if it is left active it will simply return I/O errors.
+
+A snapshot will merge into its origin with the following command::
+
+  lvconvert --merge volumeGroup/snap
+
+we'll now have this situation::
+
+  # dmsetup table|grep volumeGroup
+
+  volumeGroup-base-real: 0 2097152 linear 8:19 384
+  volumeGroup-base-cow: 0 204800 linear 8:19 2097536
+  volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
+
+  # ls -lL /dev/mapper/volumeGroup-*
+  brw-------  1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
+  brw-------  1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
+  brw-------  1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
+
+
+How to determine when a merging is complete
+===========================================
+The snapshot-merge and snapshot status lines end with:
+
+  <sectors_allocated>/<total_sectors> <metadata_sectors>
+
+Both <sectors_allocated> and <total_sectors> include both data and metadata.
+During merging, the number of sectors allocated gets smaller and
+smaller.  Merging has finished when the number of sectors holding data
+is zero, in other words <sectors_allocated> == <metadata_sectors>.
+
+Here is a practical example (using a hybrid of lvm and dmsetup commands)::
+
+  # lvs
+    LV      VG          Attr   LSize Origin  Snap%  Move Log Copy%  Convert
+    base    volumeGroup owi-a- 4.00g
+    snap    volumeGroup swi-a- 1.00g base  18.97
+
+  # dmsetup status volumeGroup-snap
+  0 8388608 snapshot 397896/2097152 1560
+                                    ^^^^ metadata sectors
+
+  # lvconvert --merge -b volumeGroup/snap
+    Merging of volume snap started.
+
+  # lvs volumeGroup/snap
+    LV      VG          Attr   LSize Origin  Snap%  Move Log Copy%  Convert
+    base    volumeGroup Owi-a- 4.00g          17.23
+
+  # dmsetup status volumeGroup-base
+  0 8388608 snapshot-merge 281688/2097152 1104
+
+  # dmsetup status volumeGroup-base
+  0 8388608 snapshot-merge 180480/2097152 712
+
+  # dmsetup status volumeGroup-base
+  0 8388608 snapshot-merge 16/2097152 16
+
+Merging has finished.
+
+::
+
+  # lvs
+    LV      VG          Attr   LSize Origin  Snap%  Move Log Copy%  Convert
+    base    volumeGroup owi-a- 4.00g
diff --git a/Documentation/admin-guide/device-mapper/statistics.rst b/Documentation/admin-guide/device-mapper/statistics.rst
new file mode 100644
index 000000000000..41ded0bc5933
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/statistics.rst
@@ -0,0 +1,225 @@
+=============
+DM statistics
+=============
+
+Device Mapper supports the collection of I/O statistics on user-defined
+regions of a DM device.	 If no regions are defined no statistics are
+collected so there isn't any performance impact.  Only bio-based DM
+devices are currently supported.
+
+Each user-defined region specifies a starting sector, length and step.
+Individual statistics will be collected for each step-sized area within
+the range specified.
+
+The I/O statistics counters for each step-sized area of a region are
+in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see:
+Documentation/admin-guide/iostats.rst).  But two extra counters (12 and 13) are
+provided: total time spent reading and writing.  When the histogram
+argument is used, the 14th parameter is reported that represents the
+histogram of latencies.  All these counters may be accessed by sending
+the @stats_print message to the appropriate DM device via dmsetup.
+
+The reported times are in milliseconds and the granularity depends on
+the kernel ticks.  When the option precise_timestamps is used, the
+reported times are in nanoseconds.
+
+Each region has a corresponding unique identifier, which we call a
+region_id, that is assigned when the region is created.	 The region_id
+must be supplied when querying statistics about the region, deleting the
+region, etc.  Unique region_ids enable multiple userspace programs to
+request and process statistics for the same DM device without stepping
+on each other's data.
+
+The creation of DM statistics will allocate memory via kmalloc or
+fallback to using vmalloc space.  At most, 1/4 of the overall system
+memory may be allocated by DM statistics.  The admin can see how much
+memory is used by reading:
+
+	/sys/module/dm_mod/parameters/stats_current_allocated_bytes
+
+Messages
+========
+
+    @stats_create <range> <step> [<number_of_optional_arguments> <optional_arguments>...] [<program_id> [<aux_data>]]
+	Create a new region and return the region_id.
+
+	<range>
+	  "-"
+		whole device
+	  "<start_sector>+<length>"
+		a range of <length> 512-byte sectors
+		starting with <start_sector>.
+
+	<step>
+	  "<area_size>"
+		the range is subdivided into areas each containing
+		<area_size> sectors.
+	  "/<number_of_areas>"
+		the range is subdivided into the specified
+		number of areas.
+
+	<number_of_optional_arguments>
+	  The number of optional arguments
+
+	<optional_arguments>
+	  The following optional arguments are supported:
+
+	  precise_timestamps
+		use precise timer with nanosecond resolution
+		instead of the "jiffies" variable.  When this argument is
+		used, the resulting times are in nanoseconds instead of
+		milliseconds.  Precise timestamps are a little bit slower
+		to obtain than jiffies-based timestamps.
+	  histogram:n1,n2,n3,n4,...
+		collect histogram of latencies.  The
+		numbers n1, n2, etc are times that represent the boundaries
+		of the histogram.  If precise_timestamps is not used, the
+		times are in milliseconds, otherwise they are in
+		nanoseconds.  For each range, the kernel will report the
+		number of requests that completed within this range. For
+		example, if we use "histogram:10,20,30", the kernel will
+		report four numbers a:b:c:d. a is the number of requests
+		that took 0-10 ms to complete, b is the number of requests
+		that took 10-20 ms to complete, c is the number of requests
+		that took 20-30 ms to complete and d is the number of
+		requests that took more than 30 ms to complete.
+
+	<program_id>
+	  An optional parameter.  A name that uniquely identifies
+	  the userspace owner of the range.  This groups ranges together
+	  so that userspace programs can identify the ranges they
+	  created and ignore those created by others.
+	  The kernel returns this string back in the output of
+	  @stats_list message, but it doesn't use it for anything else.
+	  If we omit the number of optional arguments, program id must not
+	  be a number, otherwise it would be interpreted as the number of
+	  optional arguments.
+
+	<aux_data>
+	  An optional parameter.  A word that provides auxiliary data
+	  that is useful to the client program that created the range.
+	  The kernel returns this string back in the output of
+	  @stats_list message, but it doesn't use this value for anything.
+
+    @stats_delete <region_id>
+	Delete the region with the specified id.
+
+	<region_id>
+	  region_id returned from @stats_create
+
+    @stats_clear <region_id>
+	Clear all the counters except the in-flight i/o counters.
+
+	<region_id>
+	  region_id returned from @stats_create
+
+    @stats_list [<program_id>]
+	List all regions registered with @stats_create.
+
+	<program_id>
+	  An optional parameter.
+	  If this parameter is specified, only matching regions
+	  are returned.
+	  If it is not specified, all regions are returned.
+
+	Output format:
+	  <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
+	        precise_timestamps histogram:n1,n2,n3,...
+
+	The strings "precise_timestamps" and "histogram" are printed only
+	if they were specified when creating the region.
+
+    @stats_print <region_id> [<starting_line> <number_of_lines>]
+	Print counters for each step-sized area of a region.
+
+	<region_id>
+	  region_id returned from @stats_create
+
+	<starting_line>
+	  The index of the starting line in the output.
+	  If omitted, all lines are returned.
+
+	<number_of_lines>
+	  The number of lines to include in the output.
+	  If omitted, all lines are returned.
+
+	Output format for each step-sized area of a region:
+
+	  <start_sector>+<length>
+		counters
+
+	  The first 11 counters have the same meaning as
+	  `/sys/block/*/stat or /proc/diskstats`.
+
+	  Please refer to Documentation/admin-guide/iostats.rst for details.
+
+	  1. the number of reads completed
+	  2. the number of reads merged
+	  3. the number of sectors read
+	  4. the number of milliseconds spent reading
+	  5. the number of writes completed
+	  6. the number of writes merged
+	  7. the number of sectors written
+	  8. the number of milliseconds spent writing
+	  9. the number of I/Os currently in progress
+	  10. the number of milliseconds spent doing I/Os
+	  11. the weighted number of milliseconds spent doing I/Os
+
+	  Additional counters:
+
+	  12. the total time spent reading in milliseconds
+	  13. the total time spent writing in milliseconds
+
+    @stats_print_clear <region_id> [<starting_line> <number_of_lines>]
+	Atomically print and then clear all the counters except the
+	in-flight i/o counters.	 Useful when the client consuming the
+	statistics does not want to lose any statistics (those updated
+	between printing and clearing).
+
+	<region_id>
+	  region_id returned from @stats_create
+
+	<starting_line>
+	  The index of the starting line in the output.
+	  If omitted, all lines are printed and then cleared.
+
+	<number_of_lines>
+	  The number of lines to process.
+	  If omitted, all lines are printed and then cleared.
+
+    @stats_set_aux <region_id> <aux_data>
+	Store auxiliary data aux_data for the specified region.
+
+	<region_id>
+	  region_id returned from @stats_create
+
+	<aux_data>
+	  The string that identifies data which is useful to the client
+	  program that created the range.  The kernel returns this
+	  string back in the output of @stats_list message, but it
+	  doesn't use this value for anything.
+
+Examples
+========
+
+Subdivide the DM device 'vol' into 100 pieces and start collecting
+statistics on them::
+
+  dmsetup message vol 0 @stats_create - /100
+
+Set the auxiliary data string to "foo bar baz" (the escape for each
+space must also be escaped, otherwise the shell will consume them)::
+
+  dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz
+
+List the statistics::
+
+  dmsetup message vol 0 @stats_list
+
+Print the statistics::
+
+  dmsetup message vol 0 @stats_print 0
+
+Delete the statistics::
+
+  dmsetup message vol 0 @stats_delete 0
diff --git a/Documentation/admin-guide/device-mapper/striped.rst b/Documentation/admin-guide/device-mapper/striped.rst
new file mode 100644
index 000000000000..e9a8da192ae1
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/striped.rst
@@ -0,0 +1,61 @@
+=========
+dm-stripe
+=========
+
+Device-Mapper's "striped" target is used to create a striped (i.e. RAID-0)
+device across one or more underlying devices. Data is written in "chunks",
+with consecutive chunks rotating among the underlying devices. This can
+potentially provide improved I/O throughput by utilizing several physical
+devices in parallel.
+
+Parameters: <num devs> <chunk size> [<dev path> <offset>]+
+    <num devs>:
+	Number of underlying devices.
+    <chunk size>:
+	Size of each chunk of data. Must be at least as
+        large as the system's PAGE_SIZE.
+    <dev path>:
+	Full pathname to the underlying block-device, or a
+	"major:minor" device-number.
+    <offset>:
+	Starting sector within the device.
+
+One or more underlying devices can be specified. The striped device size must
+be a multiple of the chunk size multiplied by the number of underlying devices.
+
+
+Example scripts
+===============
+
+::
+
+  #!/usr/bin/perl -w
+  # Create a striped device across any number of underlying devices. The device
+  # will be called "stripe_dev" and have a chunk-size of 128k.
+
+  my $chunk_size = 128 * 2;
+  my $dev_name = "stripe_dev";
+  my $num_devs = @ARGV;
+  my @devs = @ARGV;
+  my ($min_dev_size, $stripe_dev_size, $i);
+
+  if (!$num_devs) {
+          die("Specify at least one device\n");
+  }
+
+  $min_dev_size = `blockdev --getsz $devs[0]`;
+  for ($i = 1; $i < $num_devs; $i++) {
+          my $this_size = `blockdev --getsz $devs[$i]`;
+          $min_dev_size = ($min_dev_size < $this_size) ?
+                          $min_dev_size : $this_size;
+  }
+
+  $stripe_dev_size = $min_dev_size * $num_devs;
+  $stripe_dev_size -= $stripe_dev_size % ($chunk_size * $num_devs);
+
+  $table = "0 $stripe_dev_size striped $num_devs $chunk_size";
+  for ($i = 0; $i < $num_devs; $i++) {
+          $table .= " $devs[$i] 0";
+  }
+
+  `echo $table | dmsetup create $dev_name`;
diff --git a/Documentation/admin-guide/device-mapper/switch.rst b/Documentation/admin-guide/device-mapper/switch.rst
new file mode 100644
index 000000000000..7dde06be1a4f
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/switch.rst
@@ -0,0 +1,141 @@
+=========
+dm-switch
+=========
+
+The device-mapper switch target creates a device that supports an
+arbitrary mapping of fixed-size regions of I/O across a fixed set of
+paths.  The path used for any specific region can be switched
+dynamically by sending the target a message.
+
+It maps I/O to underlying block devices efficiently when there is a large
+number of fixed-sized address regions but there is no simple pattern
+that would allow for a compact representation of the mapping such as
+dm-stripe.
+
+Background
+----------
+
+Dell EqualLogic and some other iSCSI storage arrays use a distributed
+frameless architecture.  In this architecture, the storage group
+consists of a number of distinct storage arrays ("members") each having
+independent controllers, disk storage and network adapters.  When a LUN
+is created it is spread across multiple members.  The details of the
+spreading are hidden from initiators connected to this storage system.
+The storage group exposes a single target discovery portal, no matter
+how many members are being used.  When iSCSI sessions are created, each
+session is connected to an eth port on a single member.  Data to a LUN
+can be sent on any iSCSI session, and if the blocks being accessed are
+stored on another member the I/O will be forwarded as required.  This
+forwarding is invisible to the initiator.  The storage layout is also
+dynamic, and the blocks stored on disk may be moved from member to
+member as needed to balance the load.
+
+This architecture simplifies the management and configuration of both
+the storage group and initiators.  In a multipathing configuration, it
+is possible to set up multiple iSCSI sessions to use multiple network
+interfaces on both the host and target to take advantage of the
+increased network bandwidth.  An initiator could use a simple round
+robin algorithm to send I/O across all paths and let the storage array
+members forward it as necessary, but there is a performance advantage to
+sending data directly to the correct member.
+
+A device-mapper table already lets you map different regions of a
+device onto different targets.  However in this architecture the LUN is
+spread with an address region size on the order of 10s of MBs, which
+means the resulting table could have more than a million entries and
+consume far too much memory.
+
+Using this device-mapper switch target we can now build a two-layer
+device hierarchy:
+
+    Upper Tier - Determine which array member the I/O should be sent to.
+    Lower Tier - Load balance amongst paths to a particular member.
+
+The lower tier consists of a single dm multipath device for each member.
+Each of these multipath devices contains the set of paths directly to
+the array member in one priority group, and leverages existing path
+selectors to load balance amongst these paths.  We also build a
+non-preferred priority group containing paths to other array members for
+failover reasons.
+
+The upper tier consists of a single dm-switch device.  This device uses
+a bitmap to look up the location of the I/O and choose the appropriate
+lower tier device to route the I/O.  By using a bitmap we are able to
+use 4 bits for each address range in a 16 member group (which is very
+large for us).  This is a much denser representation than the dm table
+b-tree can achieve.
+
+Construction Parameters
+=======================
+
+    <num_paths> <region_size> <num_optional_args> [<optional_args>...] [<dev_path> <offset>]+
+	<num_paths>
+	    The number of paths across which to distribute the I/O.
+
+	<region_size>
+	    The number of 512-byte sectors in a region. Each region can be redirected
+	    to any of the available paths.
+
+	<num_optional_args>
+	    The number of optional arguments. Currently, no optional arguments
+	    are supported and so this must be zero.
+
+	<dev_path>
+	    The block device that represents a specific path to the device.
+
+	<offset>
+	    The offset of the start of data on the specific <dev_path> (in units
+	    of 512-byte sectors). This number is added to the sector number when
+	    forwarding the request to the specific path. Typically it is zero.
+
+Messages
+========
+
+set_region_mappings <index>:<path_nr> [<index>]:<path_nr> [<index>]:<path_nr>...
+
+Modify the region table by specifying which regions are redirected to
+which paths.
+
+<index>
+    The region number (region size was specified in constructor parameters).
+    If index is omitted, the next region (previous index + 1) is used.
+    Expressed in hexadecimal (WITHOUT any prefix like 0x).
+
+<path_nr>
+    The path number in the range 0 ... (<num_paths> - 1).
+    Expressed in hexadecimal (WITHOUT any prefix like 0x).
+
+R<n>,<m>
+    This parameter allows repetitive patterns to be loaded quickly. <n> and <m>
+    are hexadecimal numbers. The last <n> mappings are repeated in the next <m>
+    slots.
+
+Status
+======
+
+No status line is reported.
+
+Example
+=======
+
+Assume that you have volumes vg1/switch0 vg1/switch1 vg1/switch2 with
+the same size.
+
+Create a switch device with 64kB region size::
+
+    dmsetup create switch --table "0 `blockdev --getsz /dev/vg1/switch0`
+	switch 3 128 0 /dev/vg1/switch0 0 /dev/vg1/switch1 0 /dev/vg1/switch2 0"
+
+Set mappings for the first 7 entries to point to devices switch0, switch1,
+switch2, switch0, switch1, switch2, switch1::
+
+    dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1
+
+Set repetitive mapping. This command::
+
+    dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10
+
+is equivalent to::
+
+    dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \
+	:1 :2 :1 :2 :1 :2 :1 :2 :1 :2
diff --git a/Documentation/admin-guide/device-mapper/thin-provisioning.rst b/Documentation/admin-guide/device-mapper/thin-provisioning.rst
new file mode 100644
index 000000000000..bafebf79da4b
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/thin-provisioning.rst
@@ -0,0 +1,427 @@
+=================
+Thin provisioning
+=================
+
+Introduction
+============
+
+This document describes a collection of device-mapper targets that
+between them implement thin-provisioning and snapshots.
+
+The main highlight of this implementation, compared to the previous
+implementation of snapshots, is that it allows many virtual devices to
+be stored on the same data volume.  This simplifies administration and
+allows the sharing of data between volumes, thus reducing disk usage.
+
+Another significant feature is support for an arbitrary depth of
+recursive snapshots (snapshots of snapshots of snapshots ...).  The
+previous implementation of snapshots did this by chaining together
+lookup tables, and so performance was O(depth).  This new
+implementation uses a single data structure to avoid this degradation
+with depth.  Fragmentation may still be an issue, however, in some
+scenarios.
+
+Metadata is stored on a separate device from data, giving the
+administrator some freedom, for example to:
+
+- Improve metadata resilience by storing metadata on a mirrored volume
+  but data on a non-mirrored one.
+
+- Improve performance by storing the metadata on SSD.
+
+Status
+======
+
+These targets are considered safe for production use.  But different use
+cases will have different performance characteristics, for example due
+to fragmentation of the data volume.
+
+If you find this software is not performing as expected please mail
+dm-devel@redhat.com with details and we'll try our best to improve
+things for you.
+
+Userspace tools for checking and repairing the metadata have been fully
+developed and are available as 'thin_check' and 'thin_repair'.  The name
+of the package that provides these utilities varies by distribution (on
+a Red Hat distribution it is named 'device-mapper-persistent-data').
+
+Cookbook
+========
+
+This section describes some quick recipes for using thin provisioning.
+They use the dmsetup program to control the device-mapper driver
+directly.  End users will be advised to use a higher-level volume
+manager such as LVM2 once support has been added.
+
+Pool device
+-----------
+
+The pool device ties together the metadata volume and the data volume.
+It maps I/O linearly to the data volume and updates the metadata via
+two mechanisms:
+
+- Function calls from the thin targets
+
+- Device-mapper 'messages' from userspace which control the creation of new
+  virtual devices amongst other things.
+
+Setting up a fresh pool device
+------------------------------
+
+Setting up a pool device requires a valid metadata device, and a
+data device.  If you do not have an existing metadata device you can
+make one by zeroing the first 4k to indicate empty metadata.
+
+    dd if=/dev/zero of=$metadata_dev bs=4096 count=1
+
+The amount of metadata you need will vary according to how many blocks
+are shared between thin devices (i.e. through snapshots).  If you have
+less sharing than average you'll need a larger-than-average metadata device.
+
+As a guide, we suggest you calculate the number of bytes to use in the
+metadata device as 48 * $data_dev_size / $data_block_size but round it up
+to 2MB if the answer is smaller.  If you're creating large numbers of
+snapshots which are recording large amounts of change, you may find you
+need to increase this.
+
+The largest size supported is 16GB: If the device is larger,
+a warning will be issued and the excess space will not be used.
+
+Reloading a pool table
+----------------------
+
+You may reload a pool's table, indeed this is how the pool is resized
+if it runs out of space.  (N.B. While specifying a different metadata
+device when reloading is not forbidden at the moment, things will go
+wrong if it does not route I/O to exactly the same on-disk location as
+previously.)
+
+Using an existing pool device
+-----------------------------
+
+::
+
+    dmsetup create pool \
+	--table "0 20971520 thin-pool $metadata_dev $data_dev \
+		 $data_block_size $low_water_mark"
+
+$data_block_size gives the smallest unit of disk space that can be
+allocated at a time expressed in units of 512-byte sectors.
+$data_block_size must be between 128 (64KB) and 2097152 (1GB) and a
+multiple of 128 (64KB).  $data_block_size cannot be changed after the
+thin-pool is created.  People primarily interested in thin provisioning
+may want to use a value such as 1024 (512KB).  People doing lots of
+snapshotting may want a smaller value such as 128 (64KB).  If you are
+not zeroing newly-allocated data, a larger $data_block_size in the
+region of 256000 (128MB) is suggested.
+
+$low_water_mark is expressed in blocks of size $data_block_size.  If
+free space on the data device drops below this level then a dm event
+will be triggered which a userspace daemon should catch allowing it to
+extend the pool device.  Only one such event will be sent.
+
+No special event is triggered if a just resumed device's free space is below
+the low water mark. However, resuming a device always triggers an
+event; a userspace daemon should verify that free space exceeds the low
+water mark when handling this event.
+
+A low water mark for the metadata device is maintained in the kernel and
+will trigger a dm event if free space on the metadata device drops below
+it.
+
+Updating on-disk metadata
+-------------------------
+
+On-disk metadata is committed every time a FLUSH or FUA bio is written.
+If no such requests are made then commits will occur every second.  This
+means the thin-provisioning target behaves like a physical disk that has
+a volatile write cache.  If power is lost you may lose some recent
+writes.  The metadata should always be consistent in spite of any crash.
+
+If data space is exhausted the pool will either error or queue IO
+according to the configuration (see: error_if_no_space).  If metadata
+space is exhausted or a metadata operation fails: the pool will error IO
+until the pool is taken offline and repair is performed to 1) fix any
+potential inconsistencies and 2) clear the flag that imposes repair.
+Once the pool's metadata device is repaired it may be resized, which
+will allow the pool to return to normal operation.  Note that if a pool
+is flagged as needing repair, the pool's data and metadata devices
+cannot be resized until repair is performed.  It should also be noted
+that when the pool's metadata space is exhausted the current metadata
+transaction is aborted.  Given that the pool will cache IO whose
+completion may have already been acknowledged to upper IO layers
+(e.g. filesystem) it is strongly suggested that consistency checks
+(e.g. fsck) be performed on those layers when repair of the pool is
+required.
+
+Thin provisioning
+-----------------
+
+i) Creating a new thinly-provisioned volume.
+
+  To create a new thinly- provisioned volume you must send a message to an
+  active pool device, /dev/mapper/pool in this example::
+
+    dmsetup message /dev/mapper/pool 0 "create_thin 0"
+
+  Here '0' is an identifier for the volume, a 24-bit number.  It's up
+  to the caller to allocate and manage these identifiers.  If the
+  identifier is already in use, the message will fail with -EEXIST.
+
+ii) Using a thinly-provisioned volume.
+
+  Thinly-provisioned volumes are activated using the 'thin' target::
+
+    dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
+
+  The last parameter is the identifier for the thinp device.
+
+Internal snapshots
+------------------
+
+i) Creating an internal snapshot.
+
+  Snapshots are created with another message to the pool.
+
+  N.B.  If the origin device that you wish to snapshot is active, you
+  must suspend it before creating the snapshot to avoid corruption.
+  This is NOT enforced at the moment, so please be careful!
+
+  ::
+
+    dmsetup suspend /dev/mapper/thin
+    dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
+    dmsetup resume /dev/mapper/thin
+
+  Here '1' is the identifier for the volume, a 24-bit number.  '0' is the
+  identifier for the origin device.
+
+ii) Using an internal snapshot.
+
+  Once created, the user doesn't have to worry about any connection
+  between the origin and the snapshot.  Indeed the snapshot is no
+  different from any other thinly-provisioned device and can be
+  snapshotted itself via the same method.  It's perfectly legal to
+  have only one of them active, and there's no ordering requirement on
+  activating or removing them both.  (This differs from conventional
+  device-mapper snapshots.)
+
+  Activate it exactly the same way as any other thinly-provisioned volume::
+
+    dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1"
+
+External snapshots
+------------------
+
+You can use an external **read only** device as an origin for a
+thinly-provisioned volume.  Any read to an unprovisioned area of the
+thin device will be passed through to the origin.  Writes trigger
+the allocation of new blocks as usual.
+
+One use case for this is VM hosts that want to run guests on
+thinly-provisioned volumes but have the base image on another device
+(possibly shared between many VMs).
+
+You must not write to the origin device if you use this technique!
+Of course, you may write to the thin device and take internal snapshots
+of the thin volume.
+
+i) Creating a snapshot of an external device
+
+  This is the same as creating a thin device.
+  You don't mention the origin at this stage.
+
+  ::
+
+    dmsetup message /dev/mapper/pool 0 "create_thin 0"
+
+ii) Using a snapshot of an external device.
+
+  Append an extra parameter to the thin target specifying the origin::
+
+    dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image"
+
+  N.B. All descendants (internal snapshots) of this snapshot require the
+  same extra origin parameter.
+
+Deactivation
+------------
+
+All devices using a pool must be deactivated before the pool itself
+can be.
+
+::
+
+    dmsetup remove thin
+    dmsetup remove snap
+    dmsetup remove pool
+
+Reference
+=========
+
+'thin-pool' target
+------------------
+
+i) Constructor
+
+    ::
+
+      thin-pool <metadata dev> <data dev> <data block size (sectors)> \
+	        <low water mark (blocks)> [<number of feature args> [<arg>]*]
+
+    Optional feature arguments:
+
+      skip_block_zeroing:
+	Skip the zeroing of newly-provisioned blocks.
+
+      ignore_discard:
+	Disable discard support.
+
+      no_discard_passdown:
+	Don't pass discards down to the underlying
+	data device, but just remove the mapping.
+
+      read_only:
+		 Don't allow any changes to be made to the pool
+		 metadata.  This mode is only available after the
+		 thin-pool has been created and first used in full
+		 read/write mode.  It cannot be specified on initial
+		 thin-pool creation.
+
+      error_if_no_space:
+	Error IOs, instead of queueing, if no space.
+
+    Data block size must be between 64KB (128 sectors) and 1GB
+    (2097152 sectors) inclusive.
+
+
+ii) Status
+
+    ::
+
+      <transaction id> <used metadata blocks>/<total metadata blocks>
+      <used data blocks>/<total data blocks> <held metadata root>
+      ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space
+      needs_check|- metadata_low_watermark
+
+    transaction id:
+	A 64-bit number used by userspace to help synchronise with metadata
+	from volume managers.
+
+    used data blocks / total data blocks
+	If the number of free blocks drops below the pool's low water mark a
+	dm event will be sent to userspace.  This event is edge-triggered and
+	it will occur only once after each resume so volume manager writers
+	should register for the event and then check the target's status.
+
+    held metadata root:
+	The location, in blocks, of the metadata root that has been
+	'held' for userspace read access.  '-' indicates there is no
+	held root.
+
+    discard_passdown|no_discard_passdown
+	Whether or not discards are actually being passed down to the
+	underlying device.  When this is enabled when loading the table,
+	it can get disabled if the underlying device doesn't support it.
+
+    ro|rw|out_of_data_space
+	If the pool encounters certain types of device failures it will
+	drop into a read-only metadata mode in which no changes to
+	the pool metadata (like allocating new blocks) are permitted.
+
+	In serious cases where even a read-only mode is deemed unsafe
+	no further I/O will be permitted and the status will just
+	contain the string 'Fail'.  The userspace recovery tools
+	should then be used.
+
+    error_if_no_space|queue_if_no_space
+	If the pool runs out of data or metadata space, the pool will
+	either queue or error the IO destined to the data device.  The
+	default is to queue the IO until more space is added or the
+	'no_space_timeout' expires.  The 'no_space_timeout' dm-thin-pool
+	module parameter can be used to change this timeout -- it
+	defaults to 60 seconds but may be disabled using a value of 0.
+
+    needs_check
+	A metadata operation has failed, resulting in the needs_check
+	flag being set in the metadata's superblock.  The metadata
+	device must be deactivated and checked/repaired before the
+	thin-pool can be made fully operational again.  '-' indicates
+	needs_check is not set.
+
+    metadata_low_watermark:
+	Value of metadata low watermark in blocks.  The kernel sets this
+	value internally but userspace needs to know this value to
+	determine if an event was caused by crossing this threshold.
+
+iii) Messages
+
+    create_thin <dev id>
+	Create a new thinly-provisioned device.
+	<dev id> is an arbitrary unique 24-bit identifier chosen by
+	the caller.
+
+    create_snap <dev id> <origin id>
+	Create a new snapshot of another thinly-provisioned device.
+	<dev id> is an arbitrary unique 24-bit identifier chosen by
+	the caller.
+	<origin id> is the identifier of the thinly-provisioned device
+	of which the new device will be a snapshot.
+
+    delete <dev id>
+	Deletes a thin device.  Irreversible.
+
+    set_transaction_id <current id> <new id>
+	Userland volume managers, such as LVM, need a way to
+	synchronise their external metadata with the internal metadata of the
+	pool target.  The thin-pool target offers to store an
+	arbitrary 64-bit transaction id and return it on the target's
+	status line.  To avoid races you must provide what you think
+	the current transaction id is when you change it with this
+	compare-and-swap message.
+
+    reserve_metadata_snap
+        Reserve a copy of the data mapping btree for use by userland.
+        This allows userland to inspect the mappings as they were when
+        this message was executed.  Use the pool's status command to
+        get the root block associated with the metadata snapshot.
+
+    release_metadata_snap
+        Release a previously reserved copy of the data mapping btree.
+
+'thin' target
+-------------
+
+i) Constructor
+
+    ::
+
+        thin <pool dev> <dev id> [<external origin dev>]
+
+    pool dev:
+	the thin-pool device, e.g. /dev/mapper/my_pool or 253:0
+
+    dev id:
+	the internal device identifier of the device to be
+	activated.
+
+    external origin dev:
+	an optional block device outside the pool to be treated as a
+	read-only snapshot origin: reads to unprovisioned areas of the
+	thin target will be mapped to this device.
+
+The pool doesn't store any size against the thin devices.  If you
+load a thin target that is smaller than you've been using previously,
+then you'll have no access to blocks mapped beyond the end.  If you
+load a target that is bigger than before, then extra blocks will be
+provisioned as and when needed.
+
+ii) Status
+
+    <nr mapped sectors> <highest mapped sector>
+	If the pool has encountered device errors and failed, the status
+	will just contain the string 'Fail'.  The userspace recovery
+	tools should then be used.
+
+    In the case where <nr mapped sectors> is 0, there is no highest
+    mapped sector and the value of <highest mapped sector> is unspecified.
diff --git a/Documentation/admin-guide/device-mapper/unstriped.rst b/Documentation/admin-guide/device-mapper/unstriped.rst
new file mode 100644
index 000000000000..0a8d3eb3f072
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/unstriped.rst
@@ -0,0 +1,135 @@
+================================
+Device-mapper "unstriped" target
+================================
+
+Introduction
+============
+
+The device-mapper "unstriped" target provides a transparent mechanism to
+unstripe a device-mapper "striped" target to access the underlying disks
+without having to touch the true backing block-device.  It can also be
+used to unstripe a hardware RAID-0 to access backing disks.
+
+Parameters:
+<number of stripes> <chunk size> <stripe #> <dev_path> <offset>
+
+<number of stripes>
+        The number of stripes in the RAID 0.
+
+<chunk size>
+	The amount of 512B sectors in the chunk striping.
+
+<dev_path>
+	The block device you wish to unstripe.
+
+<stripe #>
+        The stripe number within the device that corresponds to physical
+        drive you wish to unstripe.  This must be 0 indexed.
+
+
+Why use this module?
+====================
+
+An example of undoing an existing dm-stripe
+-------------------------------------------
+
+This small bash script will setup 4 loop devices and use the existing
+striped target to combine the 4 devices into one.  It then will use
+the unstriped target ontop of the striped device to access the
+individual backing loop devices.  We write data to the newly exposed
+unstriped devices and verify the data written matches the correct
+underlying device on the striped array::
+
+  #!/bin/bash
+
+  MEMBER_SIZE=$((128 * 1024 * 1024))
+  NUM=4
+  SEQ_END=$((${NUM}-1))
+  CHUNK=256
+  BS=4096
+
+  RAID_SIZE=$((${MEMBER_SIZE}*${NUM}/512))
+  DM_PARMS="0 ${RAID_SIZE} striped ${NUM} ${CHUNK}"
+  COUNT=$((${MEMBER_SIZE} / ${BS}))
+
+  for i in $(seq 0 ${SEQ_END}); do
+    dd if=/dev/zero of=member-${i} bs=${MEMBER_SIZE} count=1 oflag=direct
+    losetup /dev/loop${i} member-${i}
+    DM_PARMS+=" /dev/loop${i} 0"
+  done
+
+  echo $DM_PARMS | dmsetup create raid0
+  for i in $(seq 0 ${SEQ_END}); do
+    echo "0 1 unstriped ${NUM} ${CHUNK} ${i} /dev/mapper/raid0 0" | dmsetup create set-${i}
+  done;
+
+  for i in $(seq 0 ${SEQ_END}); do
+    dd if=/dev/urandom of=/dev/mapper/set-${i} bs=${BS} count=${COUNT} oflag=direct
+    diff /dev/mapper/set-${i} member-${i}
+  done;
+
+  for i in $(seq 0 ${SEQ_END}); do
+    dmsetup remove set-${i}
+  done
+
+  dmsetup remove raid0
+
+  for i in $(seq 0 ${SEQ_END}); do
+    losetup -d /dev/loop${i}
+    rm -f member-${i}
+  done
+
+Another example
+---------------
+
+Intel NVMe drives contain two cores on the physical device.
+Each core of the drive has segregated access to its LBA range.
+The current LBA model has a RAID 0 128k chunk on each core, resulting
+in a 256k stripe across the two cores::
+
+   Core 0:       Core 1:
+  __________    __________
+  | LBA 512|    | LBA 768|
+  | LBA 0  |    | LBA 256|
+  ----------    ----------
+
+The purpose of this unstriping is to provide better QoS in noisy
+neighbor environments. When two partitions are created on the
+aggregate drive without this unstriping, reads on one partition
+can affect writes on another partition.  This is because the partitions
+are striped across the two cores.  When we unstripe this hardware RAID 0
+and make partitions on each new exposed device the two partitions are now
+physically separated.
+
+With the dm-unstriped target we're able to segregate an fio script that
+has read and write jobs that are independent of each other.  Compared to
+when we run the test on a combined drive with partitions, we were able
+to get a 92% reduction in read latency using this device mapper target.
+
+
+Example dmsetup usage
+=====================
+
+unstriped ontop of Intel NVMe device that has 2 cores
+-----------------------------------------------------
+
+::
+
+  dmsetup create nvmset0 --table '0 512 unstriped 2 256 0 /dev/nvme0n1 0'
+  dmsetup create nvmset1 --table '0 512 unstriped 2 256 1 /dev/nvme0n1 0'
+
+There will now be two devices that expose Intel NVMe core 0 and 1
+respectively::
+
+  /dev/mapper/nvmset0
+  /dev/mapper/nvmset1
+
+unstriped ontop of striped with 4 drives using 128K chunk size
+--------------------------------------------------------------
+
+::
+
+  dmsetup create raid_disk0 --table '0 512 unstriped 4 256 0 /dev/mapper/striped 0'
+  dmsetup create raid_disk1 --table '0 512 unstriped 4 256 1 /dev/mapper/striped 0'
+  dmsetup create raid_disk2 --table '0 512 unstriped 4 256 2 /dev/mapper/striped 0'
+  dmsetup create raid_disk3 --table '0 512 unstriped 4 256 3 /dev/mapper/striped 0'
diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst
new file mode 100644
index 000000000000..a4d1c1476d72
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/verity.rst
@@ -0,0 +1,229 @@
+=========
+dm-verity
+=========
+
+Device-Mapper's "verity" target provides transparent integrity checking of
+block devices using a cryptographic digest provided by the kernel crypto API.
+This target is read-only.
+
+Construction Parameters
+=======================
+
+::
+
+    <version> <dev> <hash_dev>
+    <data_block_size> <hash_block_size>
+    <num_data_blocks> <hash_start_block>
+    <algorithm> <digest> <salt>
+    [<#opt_params> <opt_params>]
+
+<version>
+    This is the type of the on-disk hash format.
+
+    0 is the original format used in the Chromium OS.
+      The salt is appended when hashing, digests are stored continuously and
+      the rest of the block is padded with zeroes.
+
+    1 is the current format that should be used for new devices.
+      The salt is prepended when hashing and each digest is
+      padded with zeroes to the power of two.
+
+<dev>
+    This is the device containing data, the integrity of which needs to be
+    checked.  It may be specified as a path, like /dev/sdaX, or a device number,
+    <major>:<minor>.
+
+<hash_dev>
+    This is the device that supplies the hash tree data.  It may be
+    specified similarly to the device path and may be the same device.  If the
+    same device is used, the hash_start should be outside the configured
+    dm-verity device.
+
+<data_block_size>
+    The block size on a data device in bytes.
+    Each block corresponds to one digest on the hash device.
+
+<hash_block_size>
+    The size of a hash block in bytes.
+
+<num_data_blocks>
+    The number of data blocks on the data device.  Additional blocks are
+    inaccessible.  You can place hashes to the same partition as data, in this
+    case hashes are placed after <num_data_blocks>.
+
+<hash_start_block>
+    This is the offset, in <hash_block_size>-blocks, from the start of hash_dev
+    to the root block of the hash tree.
+
+<algorithm>
+    The cryptographic hash algorithm used for this device.  This should
+    be the name of the algorithm, like "sha1".
+
+<digest>
+    The hexadecimal encoding of the cryptographic hash of the root hash block
+    and the salt.  This hash should be trusted as there is no other authenticity
+    beyond this point.
+
+<salt>
+    The hexadecimal encoding of the salt value.
+
+<#opt_params>
+    Number of optional parameters. If there are no optional parameters,
+    the optional paramaters section can be skipped or #opt_params can be zero.
+    Otherwise #opt_params is the number of following arguments.
+
+    Example of optional parameters section:
+        1 ignore_corruption
+
+ignore_corruption
+    Log corrupted blocks, but allow read operations to proceed normally.
+
+restart_on_corruption
+    Restart the system when a corrupted block is discovered. This option is
+    not compatible with ignore_corruption and requires user space support to
+    avoid restart loops.
+
+ignore_zero_blocks
+    Do not verify blocks that are expected to contain zeroes and always return
+    zeroes instead. This may be useful if the partition contains unused blocks
+    that are not guaranteed to contain zeroes.
+
+use_fec_from_device <fec_dev>
+    Use forward error correction (FEC) to recover from corruption if hash
+    verification fails. Use encoding data from the specified device. This
+    may be the same device where data and hash blocks reside, in which case
+    fec_start must be outside data and hash areas.
+
+    If the encoding data covers additional metadata, it must be accessible
+    on the hash device after the hash blocks.
+
+    Note: block sizes for data and hash devices must match. Also, if the
+    verity <dev> is encrypted the <fec_dev> should be too.
+
+fec_roots <num>
+    Number of generator roots. This equals to the number of parity bytes in
+    the encoding data. For example, in RS(M, N) encoding, the number of roots
+    is M-N.
+
+fec_blocks <num>
+    The number of encoding data blocks on the FEC device. The block size for
+    the FEC device is <data_block_size>.
+
+fec_start <offset>
+    This is the offset, in <data_block_size> blocks, from the start of the
+    FEC device to the beginning of the encoding data.
+
+check_at_most_once
+    Verify data blocks only the first time they are read from the data device,
+    rather than every time.  This reduces the overhead of dm-verity so that it
+    can be used on systems that are memory and/or CPU constrained.  However, it
+    provides a reduced level of security because only offline tampering of the
+    data device's content will be detected, not online tampering.
+
+    Hash blocks are still verified each time they are read from the hash device,
+    since verification of hash blocks is less performance critical than data
+    blocks, and a hash block will not be verified any more after all the data
+    blocks it covers have been verified anyway.
+
+Theory of operation
+===================
+
+dm-verity is meant to be set up as part of a verified boot path.  This
+may be anything ranging from a boot using tboot or trustedgrub to just
+booting from a known-good device (like a USB drive or CD).
+
+When a dm-verity device is configured, it is expected that the caller
+has been authenticated in some way (cryptographic signatures, etc).
+After instantiation, all hashes will be verified on-demand during
+disk access.  If they cannot be verified up to the root node of the
+tree, the root hash, then the I/O will fail.  This should detect
+tampering with any data on the device and the hash data.
+
+Cryptographic hashes are used to assert the integrity of the device on a
+per-block basis. This allows for a lightweight hash computation on first read
+into the page cache. Block hashes are stored linearly, aligned to the nearest
+block size.
+
+If forward error correction (FEC) support is enabled any recovery of
+corrupted data will be verified using the cryptographic hash of the
+corresponding data. This is why combining error correction with
+integrity checking is essential.
+
+Hash Tree
+---------
+
+Each node in the tree is a cryptographic hash.  If it is a leaf node, the hash
+of some data block on disk is calculated. If it is an intermediary node,
+the hash of a number of child nodes is calculated.
+
+Each entry in the tree is a collection of neighboring nodes that fit in one
+block.  The number is determined based on block_size and the size of the
+selected cryptographic digest algorithm.  The hashes are linearly-ordered in
+this entry and any unaligned trailing space is ignored but included when
+calculating the parent node.
+
+The tree looks something like:
+
+	alg = sha256, num_blocks = 32768, block_size = 4096
+
+::
+
+                                 [   root    ]
+                                /    . . .    \
+                     [entry_0]                 [entry_1]
+                    /  . . .  \                 . . .   \
+         [entry_0_0]   . . .  [entry_0_127]    . . . .  [entry_1_127]
+           / ... \             /   . . .  \             /           \
+     blk_0 ... blk_127  blk_16256   blk_16383      blk_32640 . . . blk_32767
+
+
+On-disk format
+==============
+
+The verity kernel code does not read the verity metadata on-disk header.
+It only reads the hash blocks which directly follow the header.
+It is expected that a user-space tool will verify the integrity of the
+verity header.
+
+Alternatively, the header can be omitted and the dmsetup parameters can
+be passed via the kernel command-line in a rooted chain of trust where
+the command-line is verified.
+
+Directly following the header (and with sector number padded to the next hash
+block boundary) are the hash blocks which are stored a depth at a time
+(starting from the root), sorted in order of increasing index.
+
+The full specification of kernel parameters and on-disk metadata format
+is available at the cryptsetup project's wiki page
+
+  https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity
+
+Status
+======
+V (for Valid) is returned if every check performed so far was valid.
+If any check failed, C (for Corruption) is returned.
+
+Example
+=======
+Set up a device::
+
+  # dmsetup create vroot --readonly --table \
+    "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 "\
+    "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\
+    "1234000000000000000000000000000000000000000000000000000000000000"
+
+A command line tool veritysetup is available to compute or verify
+the hash tree or activate the kernel device. This is available from
+the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/
+(as a libcryptsetup extension).
+
+Create hash on the device::
+
+  # veritysetup format /dev/sda1 /dev/sda2
+  ...
+  Root hash: 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076
+
+Activate the device::
+
+  # veritysetup create vroot /dev/sda1 /dev/sda2 \
+    4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076
diff --git a/Documentation/admin-guide/device-mapper/writecache.rst b/Documentation/admin-guide/device-mapper/writecache.rst
new file mode 100644
index 000000000000..d3d7690f5e8d
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/writecache.rst
@@ -0,0 +1,79 @@
+=================
+Writecache target
+=================
+
+The writecache target caches writes on persistent memory or on SSD. It
+doesn't cache reads because reads are supposed to be cached in page cache
+in normal RAM.
+
+When the device is constructed, the first sector should be zeroed or the
+first sector should contain valid superblock from previous invocation.
+
+Constructor parameters:
+
+1. type of the cache device - "p" or "s"
+
+	- p - persistent memory
+	- s - SSD
+2. the underlying device that will be cached
+3. the cache device
+4. block size (4096 is recommended; the maximum block size is the page
+   size)
+5. the number of optional parameters (the parameters with an argument
+   count as two)
+
+	start_sector n		(default: 0)
+		offset from the start of cache device in 512-byte sectors
+	high_watermark n	(default: 50)
+		start writeback when the number of used blocks reach this
+		watermark
+	low_watermark x		(default: 45)
+		stop writeback when the number of used blocks drops below
+		this watermark
+	writeback_jobs n	(default: unlimited)
+		limit the number of blocks that are in flight during
+		writeback. Setting this value reduces writeback
+		throughput, but it may improve latency of read requests
+	autocommit_blocks n	(default: 64 for pmem, 65536 for ssd)
+		when the application writes this amount of blocks without
+		issuing the FLUSH request, the blocks are automatically
+		commited
+	autocommit_time ms	(default: 1000)
+		autocommit time in milliseconds. The data is automatically
+		commited if this time passes and no FLUSH request is
+		received
+	fua			(by default on)
+		applicable only to persistent memory - use the FUA flag
+		when writing data from persistent memory back to the
+		underlying device
+	nofua
+		applicable only to persistent memory - don't use the FUA
+		flag when writing back data and send the FLUSH request
+		afterwards
+
+		- some underlying devices perform better with fua, some
+		  with nofua. The user should test it
+
+Status:
+1. error indicator - 0 if there was no error, otherwise error number
+2. the number of blocks
+3. the number of free blocks
+4. the number of blocks under writeback
+
+Messages:
+	flush
+		flush the cache device. The message returns successfully
+		if the cache device was flushed without an error
+	flush_on_suspend
+		flush the cache device on next suspend. Use this message
+		when you are going to remove the cache device. The proper
+		sequence for removing the cache device is:
+
+		1. send the "flush_on_suspend" message
+		2. load an inactive table with a linear target that maps
+		   to the underlying device
+		3. suspend the device
+		4. ask for status and verify that there are no errors
+		5. resume the device, so that it will use the linear
+		   target
+		6. the cache device is now inactive and it can be deleted
diff --git a/Documentation/admin-guide/device-mapper/zero.rst b/Documentation/admin-guide/device-mapper/zero.rst
new file mode 100644
index 000000000000..11fb5cf4597c
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/zero.rst
@@ -0,0 +1,37 @@
+=======
+dm-zero
+=======
+
+Device-Mapper's "zero" target provides a block-device that always returns
+zero'd data on reads and silently drops writes. This is similar behavior to
+/dev/zero, but as a block-device instead of a character-device.
+
+Dm-zero has no target-specific parameters.
+
+One very interesting use of dm-zero is for creating "sparse" devices in
+conjunction with dm-snapshot. A sparse device reports a device-size larger
+than the amount of actual storage space available for that device. A user can
+write data anywhere within the sparse device and read it back like a normal
+device. Reads to previously unwritten areas will return a zero'd buffer. When
+enough data has been written to fill up the actual storage space, the sparse
+device is deactivated. This can be very useful for testing device and
+filesystem limitations.
+
+To create a sparse device, start by creating a dm-zero device that's the
+desired size of the sparse device. For this example, we'll assume a 10TB
+sparse device::
+
+  TEN_TERABYTES=`expr 10 \* 1024 \* 1024 \* 1024 \* 2`   # 10 TB in sectors
+  echo "0 $TEN_TERABYTES zero" | dmsetup create zero1
+
+Then create a snapshot of the zero device, using any available block-device as
+the COW device. The size of the COW device will determine the amount of real
+space available to the sparse device. For this example, we'll assume /dev/sdb1
+is an available 10GB partition::
+
+  echo "0 $TEN_TERABYTES snapshot /dev/mapper/zero1 /dev/sdb1 p 128" | \
+     dmsetup create sparse1
+
+This will create a 10TB sparse device called /dev/mapper/sparse1 that has
+10GB of actual storage space available. If more than 10GB of data is written
+to this device, it will start returning I/O errors.
diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt
index 1649117e6087..e56e00655153 100644
--- a/Documentation/admin-guide/devices.txt
+++ b/Documentation/admin-guide/devices.txt
@@ -2693,8 +2693,8 @@
 		 41 = /dev/ttySMX0		Motorola i.MX - port 0
 		 42 = /dev/ttySMX1		Motorola i.MX - port 1
 		 43 = /dev/ttySMX2		Motorola i.MX - port 2
-		 44 = /dev/ttyMM0		Marvell MPSC - port 0
-		 45 = /dev/ttyMM1		Marvell MPSC - port 1
+		 44 = /dev/ttyMM0		Marvell MPSC - port 0 (obsolete unused)
+		 45 = /dev/ttyMM1		Marvell MPSC - port 1 (obsolete unused)
 		 46 = /dev/ttyCPM0		PPC CPM (SCC or SMC) - port 0
 		    ...
 		 47 = /dev/ttyCPM5		PPC CPM (SCC or SMC) - port 5
diff --git a/Documentation/efi-stub.txt b/Documentation/admin-guide/efi-stub.rst
index 833edb0d0bc4..833edb0d0bc4 100644
--- a/Documentation/efi-stub.txt
+++ b/Documentation/admin-guide/efi-stub.rst
diff --git a/Documentation/admin-guide/gpio/index.rst b/Documentation/admin-guide/gpio/index.rst
new file mode 100644
index 000000000000..a244ba4e87d5
--- /dev/null
+++ b/Documentation/admin-guide/gpio/index.rst
@@ -0,0 +1,17 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+gpio
+====
+
+.. toctree::
+    :maxdepth: 1
+
+    sysfs
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/gpio/sysfs.rst b/Documentation/admin-guide/gpio/sysfs.rst
index ec09ffd983e7..ec09ffd983e7 100644
--- a/Documentation/gpio/sysfs.rst
+++ b/Documentation/admin-guide/gpio/sysfs.rst
diff --git a/Documentation/highuid.txt b/Documentation/admin-guide/highuid.rst
index 6ee70465c0ea..6ee70465c0ea 100644
--- a/Documentation/highuid.txt
+++ b/Documentation/admin-guide/highuid.rst
diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
index ffc064c1ec68..49311f3da6f2 100644
--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -9,5 +9,6 @@ are configurable at compile, boot or run time.
 .. toctree::
    :maxdepth: 1
 
+   spectre
    l1tf
    mds
diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst
index 31653a9f0e1b..f83212fae4d5 100644
--- a/Documentation/admin-guide/hw-vuln/l1tf.rst
+++ b/Documentation/admin-guide/hw-vuln/l1tf.rst
@@ -241,7 +241,7 @@ Guest mitigation mechanisms
    For further information about confining guests to a single or to a group
    of cores consult the cpusets documentation:
 
-   https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
+   https://www.kernel.org/doc/Documentation/admin-guide/cgroup-v1/cpusets.rst
 
 .. _interrupt_isolation:
 
diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst
new file mode 100644
index 000000000000..25f3b2532198
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/spectre.rst
@@ -0,0 +1,697 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Spectre Side Channels
+=====================
+
+Spectre is a class of side channel attacks that exploit branch prediction
+and speculative execution on modern CPUs to read memory, possibly
+bypassing access controls. Speculative execution side channel exploits
+do not modify memory but attempt to infer privileged data in the memory.
+
+This document covers Spectre variant 1 and Spectre variant 2.
+
+Affected processors
+-------------------
+
+Speculative execution side channel methods affect a wide range of modern
+high performance processors, since most modern high speed processors
+use branch prediction and speculative execution.
+
+The following CPUs are vulnerable:
+
+    - Intel Core, Atom, Pentium, and Xeon processors
+
+    - AMD Phenom, EPYC, and Zen processors
+
+    - IBM POWER and zSeries processors
+
+    - Higher end ARM processors
+
+    - Apple CPUs
+
+    - Higher end MIPS CPUs
+
+    - Likely most other high performance CPUs. Contact your CPU vendor for details.
+
+Whether a processor is affected or not can be read out from the Spectre
+vulnerability files in sysfs. See :ref:`spectre_sys_info`.
+
+Related CVEs
+------------
+
+The following CVE entries describe Spectre variants:
+
+   =============   =======================  =================
+   CVE-2017-5753   Bounds check bypass      Spectre variant 1
+   CVE-2017-5715   Branch target injection  Spectre variant 2
+   =============   =======================  =================
+
+Problem
+-------
+
+CPUs use speculative operations to improve performance. That may leave
+traces of memory accesses or computations in the processor's caches,
+buffers, and branch predictors. Malicious software may be able to
+influence the speculative execution paths, and then use the side effects
+of the speculative execution in the CPUs' caches and buffers to infer
+privileged data touched during the speculative execution.
+
+Spectre variant 1 attacks take advantage of speculative execution of
+conditional branches, while Spectre variant 2 attacks use speculative
+execution of indirect branches to leak privileged memory.
+See :ref:`[1] <spec_ref1>` :ref:`[5] <spec_ref5>` :ref:`[7] <spec_ref7>`
+:ref:`[10] <spec_ref10>` :ref:`[11] <spec_ref11>`.
+
+Spectre variant 1 (Bounds Check Bypass)
+---------------------------------------
+
+The bounds check bypass attack :ref:`[2] <spec_ref2>` takes advantage
+of speculative execution that bypasses conditional branch instructions
+used for memory access bounds check (e.g. checking if the index of an
+array results in memory access within a valid range). This results in
+memory accesses to invalid memory (with out-of-bound index) that are
+done speculatively before validation checks resolve. Such speculative
+memory accesses can leave side effects, creating side channels which
+leak information to the attacker.
+
+There are some extensions of Spectre variant 1 attacks for reading data
+over the network, see :ref:`[12] <spec_ref12>`. However such attacks
+are difficult, low bandwidth, fragile, and are considered low risk.
+
+Spectre variant 2 (Branch Target Injection)
+-------------------------------------------
+
+The branch target injection attack takes advantage of speculative
+execution of indirect branches :ref:`[3] <spec_ref3>`.  The indirect
+branch predictors inside the processor used to guess the target of
+indirect branches can be influenced by an attacker, causing gadget code
+to be speculatively executed, thus exposing sensitive data touched by
+the victim. The side effects left in the CPU's caches during speculative
+execution can be measured to infer data values.
+
+.. _poison_btb:
+
+In Spectre variant 2 attacks, the attacker can steer speculative indirect
+branches in the victim to gadget code by poisoning the branch target
+buffer of a CPU used for predicting indirect branch addresses. Such
+poisoning could be done by indirect branching into existing code,
+with the address offset of the indirect branch under the attacker's
+control. Since the branch prediction on impacted hardware does not
+fully disambiguate branch address and uses the offset for prediction,
+this could cause privileged code's indirect branch to jump to a gadget
+code with the same offset.
+
+The most useful gadgets take an attacker-controlled input parameter (such
+as a register value) so that the memory read can be controlled. Gadgets
+without input parameters might be possible, but the attacker would have
+very little control over what memory can be read, reducing the risk of
+the attack revealing useful data.
+
+One other variant 2 attack vector is for the attacker to poison the
+return stack buffer (RSB) :ref:`[13] <spec_ref13>` to cause speculative
+subroutine return instruction execution to go to a gadget.  An attacker's
+imbalanced subroutine call instructions might "poison" entries in the
+return stack buffer which are later consumed by a victim's subroutine
+return instructions.  This attack can be mitigated by flushing the return
+stack buffer on context switch, or virtual machine (VM) exit.
+
+On systems with simultaneous multi-threading (SMT), attacks are possible
+from the sibling thread, as level 1 cache and branch target buffer
+(BTB) may be shared between hardware threads in a CPU core.  A malicious
+program running on the sibling thread may influence its peer's BTB to
+steer its indirect branch speculations to gadget code, and measure the
+speculative execution's side effects left in level 1 cache to infer the
+victim's data.
+
+Attack scenarios
+----------------
+
+The following list of attack scenarios have been anticipated, but may
+not cover all possible attack vectors.
+
+1. A user process attacking the kernel
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   The attacker passes a parameter to the kernel via a register or
+   via a known address in memory during a syscall. Such parameter may
+   be used later by the kernel as an index to an array or to derive
+   a pointer for a Spectre variant 1 attack.  The index or pointer
+   is invalid, but bound checks are bypassed in the code branch taken
+   for speculative execution. This could cause privileged memory to be
+   accessed and leaked.
+
+   For kernel code that has been identified where data pointers could
+   potentially be influenced for Spectre attacks, new "nospec" accessor
+   macros are used to prevent speculative loading of data.
+
+   Spectre variant 2 attacker can :ref:`poison <poison_btb>` the branch
+   target buffer (BTB) before issuing syscall to launch an attack.
+   After entering the kernel, the kernel could use the poisoned branch
+   target buffer on indirect jump and jump to gadget code in speculative
+   execution.
+
+   If an attacker tries to control the memory addresses leaked during
+   speculative execution, he would also need to pass a parameter to the
+   gadget, either through a register or a known address in memory. After
+   the gadget has executed, he can measure the side effect.
+
+   The kernel can protect itself against consuming poisoned branch
+   target buffer entries by using return trampolines (also known as
+   "retpoline") :ref:`[3] <spec_ref3>` :ref:`[9] <spec_ref9>` for all
+   indirect branches. Return trampolines trap speculative execution paths
+   to prevent jumping to gadget code during speculative execution.
+   x86 CPUs with Enhanced Indirect Branch Restricted Speculation
+   (Enhanced IBRS) available in hardware should use the feature to
+   mitigate Spectre variant 2 instead of retpoline. Enhanced IBRS is
+   more efficient than retpoline.
+
+   There may be gadget code in firmware which could be exploited with
+   Spectre variant 2 attack by a rogue user process. To mitigate such
+   attacks on x86, Indirect Branch Restricted Speculation (IBRS) feature
+   is turned on before the kernel invokes any firmware code.
+
+2. A user process attacking another user process
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   A malicious user process can try to attack another user process,
+   either via a context switch on the same hardware thread, or from the
+   sibling hyperthread sharing a physical processor core on simultaneous
+   multi-threading (SMT) system.
+
+   Spectre variant 1 attacks generally require passing parameters
+   between the processes, which needs a data passing relationship, such
+   as remote procedure calls (RPC).  Those parameters are used in gadget
+   code to derive invalid data pointers accessing privileged memory in
+   the attacked process.
+
+   Spectre variant 2 attacks can be launched from a rogue process by
+   :ref:`poisoning <poison_btb>` the branch target buffer.  This can
+   influence the indirect branch targets for a victim process that either
+   runs later on the same hardware thread, or running concurrently on
+   a sibling hardware thread sharing the same physical core.
+
+   A user process can protect itself against Spectre variant 2 attacks
+   by using the prctl() syscall to disable indirect branch speculation
+   for itself.  An administrator can also cordon off an unsafe process
+   from polluting the branch target buffer by disabling the process's
+   indirect branch speculation. This comes with a performance cost
+   from not using indirect branch speculation and clearing the branch
+   target buffer.  When SMT is enabled on x86, for a process that has
+   indirect branch speculation disabled, Single Threaded Indirect Branch
+   Predictors (STIBP) :ref:`[4] <spec_ref4>` are turned on to prevent the
+   sibling thread from controlling branch target buffer.  In addition,
+   the Indirect Branch Prediction Barrier (IBPB) is issued to clear the
+   branch target buffer when context switching to and from such process.
+
+   On x86, the return stack buffer is stuffed on context switch.
+   This prevents the branch target buffer from being used for branch
+   prediction when the return stack buffer underflows while switching to
+   a deeper call stack. Any poisoned entries in the return stack buffer
+   left by the previous process will also be cleared.
+
+   User programs should use address space randomization to make attacks
+   more difficult (Set /proc/sys/kernel/randomize_va_space = 1 or 2).
+
+3. A virtualized guest attacking the host
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   The attack mechanism is similar to how user processes attack the
+   kernel.  The kernel is entered via hyper-calls or other virtualization
+   exit paths.
+
+   For Spectre variant 1 attacks, rogue guests can pass parameters
+   (e.g. in registers) via hyper-calls to derive invalid pointers to
+   speculate into privileged memory after entering the kernel.  For places
+   where such kernel code has been identified, nospec accessor macros
+   are used to stop speculative memory access.
+
+   For Spectre variant 2 attacks, rogue guests can :ref:`poison
+   <poison_btb>` the branch target buffer or return stack buffer, causing
+   the kernel to jump to gadget code in the speculative execution paths.
+
+   To mitigate variant 2, the host kernel can use return trampolines
+   for indirect branches to bypass the poisoned branch target buffer,
+   and flushing the return stack buffer on VM exit.  This prevents rogue
+   guests from affecting indirect branching in the host kernel.
+
+   To protect host processes from rogue guests, host processes can have
+   indirect branch speculation disabled via prctl().  The branch target
+   buffer is cleared before context switching to such processes.
+
+4. A virtualized guest attacking other guest
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   A rogue guest may attack another guest to get data accessible by the
+   other guest.
+
+   Spectre variant 1 attacks are possible if parameters can be passed
+   between guests.  This may be done via mechanisms such as shared memory
+   or message passing.  Such parameters could be used to derive data
+   pointers to privileged data in guest.  The privileged data could be
+   accessed by gadget code in the victim's speculation paths.
+
+   Spectre variant 2 attacks can be launched from a rogue guest by
+   :ref:`poisoning <poison_btb>` the branch target buffer or the return
+   stack buffer. Such poisoned entries could be used to influence
+   speculation execution paths in the victim guest.
+
+   Linux kernel mitigates attacks to other guests running in the same
+   CPU hardware thread by flushing the return stack buffer on VM exit,
+   and clearing the branch target buffer before switching to a new guest.
+
+   If SMT is used, Spectre variant 2 attacks from an untrusted guest
+   in the sibling hyperthread can be mitigated by the administrator,
+   by turning off the unsafe guest's indirect branch speculation via
+   prctl().  A guest can also protect itself by turning on microcode
+   based mitigations (such as IBPB or STIBP on x86) within the guest.
+
+.. _spectre_sys_info:
+
+Spectre system information
+--------------------------
+
+The Linux kernel provides a sysfs interface to enumerate the current
+mitigation status of the system for Spectre: whether the system is
+vulnerable, and which mitigations are active.
+
+The sysfs file showing Spectre variant 1 mitigation status is:
+
+   /sys/devices/system/cpu/vulnerabilities/spectre_v1
+
+The possible values in this file are:
+
+  =======================================  =================================
+  'Mitigation: __user pointer sanitation'  Protection in kernel on a case by
+                                           case base with explicit pointer
+                                           sanitation.
+  =======================================  =================================
+
+However, the protections are put in place on a case by case basis,
+and there is no guarantee that all possible attack vectors for Spectre
+variant 1 are covered.
+
+The spectre_v2 kernel file reports if the kernel has been compiled with
+retpoline mitigation or if the CPU has hardware mitigation, and if the
+CPU has support for additional process-specific mitigation.
+
+This file also reports CPU features enabled by microcode to mitigate
+attack between user processes:
+
+1. Indirect Branch Prediction Barrier (IBPB) to add additional
+   isolation between processes of different users.
+2. Single Thread Indirect Branch Predictors (STIBP) to add additional
+   isolation between CPU threads running on the same core.
+
+These CPU features may impact performance when used and can be enabled
+per process on a case-by-case base.
+
+The sysfs file showing Spectre variant 2 mitigation status is:
+
+   /sys/devices/system/cpu/vulnerabilities/spectre_v2
+
+The possible values in this file are:
+
+  - Kernel status:
+
+  ====================================  =================================
+  'Not affected'                        The processor is not vulnerable
+  'Vulnerable'                          Vulnerable, no mitigation
+  'Mitigation: Full generic retpoline'  Software-focused mitigation
+  'Mitigation: Full AMD retpoline'      AMD-specific software mitigation
+  'Mitigation: Enhanced IBRS'           Hardware-focused mitigation
+  ====================================  =================================
+
+  - Firmware status: Show if Indirect Branch Restricted Speculation (IBRS) is
+    used to protect against Spectre variant 2 attacks when calling firmware (x86 only).
+
+  ========== =============================================================
+  'IBRS_FW'  Protection against user program attacks when calling firmware
+  ========== =============================================================
+
+  - Indirect branch prediction barrier (IBPB) status for protection between
+    processes of different users. This feature can be controlled through
+    prctl() per process, or through kernel command line options. This is
+    an x86 only feature. For more details see below.
+
+  ===================   ========================================================
+  'IBPB: disabled'      IBPB unused
+  'IBPB: always-on'     Use IBPB on all tasks
+  'IBPB: conditional'   Use IBPB on SECCOMP or indirect branch restricted tasks
+  ===================   ========================================================
+
+  - Single threaded indirect branch prediction (STIBP) status for protection
+    between different hyper threads. This feature can be controlled through
+    prctl per process, or through kernel command line options. This is x86
+    only feature. For more details see below.
+
+  ====================  ========================================================
+  'STIBP: disabled'     STIBP unused
+  'STIBP: forced'       Use STIBP on all tasks
+  'STIBP: conditional'  Use STIBP on SECCOMP or indirect branch restricted tasks
+  ====================  ========================================================
+
+  - Return stack buffer (RSB) protection status:
+
+  =============   ===========================================
+  'RSB filling'   Protection of RSB on context switch enabled
+  =============   ===========================================
+
+Full mitigation might require a microcode update from the CPU
+vendor. When the necessary microcode is not available, the kernel will
+report vulnerability.
+
+Turning on mitigation for Spectre variant 1 and Spectre variant 2
+-----------------------------------------------------------------
+
+1. Kernel mitigation
+^^^^^^^^^^^^^^^^^^^^
+
+   For the Spectre variant 1, vulnerable kernel code (as determined
+   by code audit or scanning tools) is annotated on a case by case
+   basis to use nospec accessor macros for bounds clipping :ref:`[2]
+   <spec_ref2>` to avoid any usable disclosure gadgets. However, it may
+   not cover all attack vectors for Spectre variant 1.
+
+   For Spectre variant 2 mitigation, the compiler turns indirect calls or
+   jumps in the kernel into equivalent return trampolines (retpolines)
+   :ref:`[3] <spec_ref3>` :ref:`[9] <spec_ref9>` to go to the target
+   addresses.  Speculative execution paths under retpolines are trapped
+   in an infinite loop to prevent any speculative execution jumping to
+   a gadget.
+
+   To turn on retpoline mitigation on a vulnerable CPU, the kernel
+   needs to be compiled with a gcc compiler that supports the
+   -mindirect-branch=thunk-extern -mindirect-branch-register options.
+   If the kernel is compiled with a Clang compiler, the compiler needs
+   to support -mretpoline-external-thunk option.  The kernel config
+   CONFIG_RETPOLINE needs to be turned on, and the CPU needs to run with
+   the latest updated microcode.
+
+   On Intel Skylake-era systems the mitigation covers most, but not all,
+   cases. See :ref:`[3] <spec_ref3>` for more details.
+
+   On CPUs with hardware mitigation for Spectre variant 2 (e.g. Enhanced
+   IBRS on x86), retpoline is automatically disabled at run time.
+
+   The retpoline mitigation is turned on by default on vulnerable
+   CPUs. It can be forced on or off by the administrator
+   via the kernel command line and sysfs control files. See
+   :ref:`spectre_mitigation_control_command_line`.
+
+   On x86, indirect branch restricted speculation is turned on by default
+   before invoking any firmware code to prevent Spectre variant 2 exploits
+   using the firmware.
+
+   Using kernel address space randomization (CONFIG_RANDOMIZE_SLAB=y
+   and CONFIG_SLAB_FREELIST_RANDOM=y in the kernel configuration) makes
+   attacks on the kernel generally more difficult.
+
+2. User program mitigation
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   User programs can mitigate Spectre variant 1 using LFENCE or "bounds
+   clipping". For more details see :ref:`[2] <spec_ref2>`.
+
+   For Spectre variant 2 mitigation, individual user programs
+   can be compiled with return trampolines for indirect branches.
+   This protects them from consuming poisoned entries in the branch
+   target buffer left by malicious software.  Alternatively, the
+   programs can disable their indirect branch speculation via prctl()
+   (See :ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`).
+   On x86, this will turn on STIBP to guard against attacks from the
+   sibling thread when the user program is running, and use IBPB to
+   flush the branch target buffer when switching to/from the program.
+
+   Restricting indirect branch speculation on a user program will
+   also prevent the program from launching a variant 2 attack
+   on x86.  All sand-boxed SECCOMP programs have indirect branch
+   speculation restricted by default.  Administrators can change
+   that behavior via the kernel command line and sysfs control files.
+   See :ref:`spectre_mitigation_control_command_line`.
+
+   Programs that disable their indirect branch speculation will have
+   more overhead and run slower.
+
+   User programs should use address space randomization
+   (/proc/sys/kernel/randomize_va_space = 1 or 2) to make attacks more
+   difficult.
+
+3. VM mitigation
+^^^^^^^^^^^^^^^^
+
+   Within the kernel, Spectre variant 1 attacks from rogue guests are
+   mitigated on a case by case basis in VM exit paths. Vulnerable code
+   uses nospec accessor macros for "bounds clipping", to avoid any
+   usable disclosure gadgets.  However, this may not cover all variant
+   1 attack vectors.
+
+   For Spectre variant 2 attacks from rogue guests to the kernel, the
+   Linux kernel uses retpoline or Enhanced IBRS to prevent consumption of
+   poisoned entries in branch target buffer left by rogue guests.  It also
+   flushes the return stack buffer on every VM exit to prevent a return
+   stack buffer underflow so poisoned branch target buffer could be used,
+   or attacker guests leaving poisoned entries in the return stack buffer.
+
+   To mitigate guest-to-guest attacks in the same CPU hardware thread,
+   the branch target buffer is sanitized by flushing before switching
+   to a new guest on a CPU.
+
+   The above mitigations are turned on by default on vulnerable CPUs.
+
+   To mitigate guest-to-guest attacks from sibling thread when SMT is
+   in use, an untrusted guest running in the sibling thread can have
+   its indirect branch speculation disabled by administrator via prctl().
+
+   The kernel also allows guests to use any microcode based mitigation
+   they choose to use (such as IBPB or STIBP on x86) to protect themselves.
+
+.. _spectre_mitigation_control_command_line:
+
+Mitigation control on the kernel command line
+---------------------------------------------
+
+Spectre variant 2 mitigation can be disabled or force enabled at the
+kernel command line.
+
+	nospectre_v2
+
+		[X86] Disable all mitigations for the Spectre variant 2
+		(indirect branch prediction) vulnerability. System may
+		allow data leaks with this option, which is equivalent
+		to spectre_v2=off.
+
+
+        spectre_v2=
+
+		[X86] Control mitigation of Spectre variant 2
+		(indirect branch speculation) vulnerability.
+		The default operation protects the kernel from
+		user space attacks.
+
+		on
+			unconditionally enable, implies
+			spectre_v2_user=on
+		off
+			unconditionally disable, implies
+		        spectre_v2_user=off
+		auto
+			kernel detects whether your CPU model is
+		        vulnerable
+
+		Selecting 'on' will, and 'auto' may, choose a
+		mitigation method at run time according to the
+		CPU, the available microcode, the setting of the
+		CONFIG_RETPOLINE configuration option, and the
+		compiler with which the kernel was built.
+
+		Selecting 'on' will also enable the mitigation
+		against user space to user space task attacks.
+
+		Selecting 'off' will disable both the kernel and
+		the user space protections.
+
+		Specific mitigations can also be selected manually:
+
+		retpoline
+					replace indirect branches
+		retpoline,generic
+					google's original retpoline
+		retpoline,amd
+					AMD-specific minimal thunk
+
+		Not specifying this option is equivalent to
+		spectre_v2=auto.
+
+For user space mitigation:
+
+        spectre_v2_user=
+
+		[X86] Control mitigation of Spectre variant 2
+		(indirect branch speculation) vulnerability between
+		user space tasks
+
+		on
+			Unconditionally enable mitigations. Is
+			enforced by spectre_v2=on
+
+		off
+			Unconditionally disable mitigations. Is
+			enforced by spectre_v2=off
+
+		prctl
+			Indirect branch speculation is enabled,
+			but mitigation can be enabled via prctl
+			per thread. The mitigation control state
+			is inherited on fork.
+
+		prctl,ibpb
+			Like "prctl" above, but only STIBP is
+			controlled per thread. IBPB is issued
+			always when switching between different user
+			space processes.
+
+		seccomp
+			Same as "prctl" above, but all seccomp
+			threads will enable the mitigation unless
+			they explicitly opt out.
+
+		seccomp,ibpb
+			Like "seccomp" above, but only STIBP is
+			controlled per thread. IBPB is issued
+			always when switching between different
+			user space processes.
+
+		auto
+			Kernel selects the mitigation depending on
+			the available CPU features and vulnerability.
+
+		Default mitigation:
+		If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl"
+
+		Not specifying this option is equivalent to
+		spectre_v2_user=auto.
+
+		In general the kernel by default selects
+		reasonable mitigations for the current CPU. To
+		disable Spectre variant 2 mitigations, boot with
+		spectre_v2=off. Spectre variant 1 mitigations
+		cannot be disabled.
+
+Mitigation selection guide
+--------------------------
+
+1. Trusted userspace
+^^^^^^^^^^^^^^^^^^^^
+
+   If all userspace applications are from trusted sources and do not
+   execute externally supplied untrusted code, then the mitigations can
+   be disabled.
+
+2. Protect sensitive programs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   For security-sensitive programs that have secrets (e.g. crypto
+   keys), protection against Spectre variant 2 can be put in place by
+   disabling indirect branch speculation when the program is running
+   (See :ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`).
+
+3. Sandbox untrusted programs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   Untrusted programs that could be a source of attacks can be cordoned
+   off by disabling their indirect branch speculation when they are run
+   (See :ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`).
+   This prevents untrusted programs from polluting the branch target
+   buffer.  All programs running in SECCOMP sandboxes have indirect
+   branch speculation restricted by default. This behavior can be
+   changed via the kernel command line and sysfs control files. See
+   :ref:`spectre_mitigation_control_command_line`.
+
+3. High security mode
+^^^^^^^^^^^^^^^^^^^^^
+
+   All Spectre variant 2 mitigations can be forced on
+   at boot time for all programs (See the "on" option in
+   :ref:`spectre_mitigation_control_command_line`).  This will add
+   overhead as indirect branch speculations for all programs will be
+   restricted.
+
+   On x86, branch target buffer will be flushed with IBPB when switching
+   to a new program. STIBP is left on all the time to protect programs
+   against variant 2 attacks originating from programs running on
+   sibling threads.
+
+   Alternatively, STIBP can be used only when running programs
+   whose indirect branch speculation is explicitly disabled,
+   while IBPB is still used all the time when switching to a new
+   program to clear the branch target buffer (See "ibpb" option in
+   :ref:`spectre_mitigation_control_command_line`).  This "ibpb" option
+   has less performance cost than the "on" option, which leaves STIBP
+   on all the time.
+
+References on Spectre
+---------------------
+
+Intel white papers:
+
+.. _spec_ref1:
+
+[1] `Intel analysis of speculative execution side channels <https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/Intel-Analysis-of-Speculative-Execution-Side-Channels.pdf>`_.
+
+.. _spec_ref2:
+
+[2] `Bounds check bypass <https://software.intel.com/security-software-guidance/software-guidance/bounds-check-bypass>`_.
+
+.. _spec_ref3:
+
+[3] `Deep dive: Retpoline: A branch target injection mitigation <https://software.intel.com/security-software-guidance/insights/deep-dive-retpoline-branch-target-injection-mitigation>`_.
+
+.. _spec_ref4:
+
+[4] `Deep Dive: Single Thread Indirect Branch Predictors <https://software.intel.com/security-software-guidance/insights/deep-dive-single-thread-indirect-branch-predictors>`_.
+
+AMD white papers:
+
+.. _spec_ref5:
+
+[5] `AMD64 technology indirect branch control extension <https://developer.amd.com/wp-content/resources/Architecture_Guidelines_Update_Indirect_Branch_Control.pdf>`_.
+
+.. _spec_ref6:
+
+[6] `Software techniques for managing speculation on AMD processors <https://developer.amd.com/wp-content/resources/90343-B_SoftwareTechniquesforManagingSpeculation_WP_7-18Update_FNL.pdf>`_.
+
+ARM white papers:
+
+.. _spec_ref7:
+
+[7] `Cache speculation side-channels <https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability/download-the-whitepaper>`_.
+
+.. _spec_ref8:
+
+[8] `Cache speculation issues update <https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability/latest-updates/cache-speculation-issues-update>`_.
+
+Google white paper:
+
+.. _spec_ref9:
+
+[9] `Retpoline: a software construct for preventing branch-target-injection <https://support.google.com/faqs/answer/7625886>`_.
+
+MIPS white paper:
+
+.. _spec_ref10:
+
+[10] `MIPS: response on speculative execution and side channel vulnerabilities <https://www.mips.com/blog/mips-response-on-speculative-execution-and-side-channel-vulnerabilities/>`_.
+
+Academic papers:
+
+.. _spec_ref11:
+
+[11] `Spectre Attacks: Exploiting Speculative Execution <https://spectreattack.com/spectre.pdf>`_.
+
+.. _spec_ref12:
+
+[12] `NetSpectre: Read Arbitrary Memory over Network <https://arxiv.org/abs/1807.10535>`_.
+
+.. _spec_ref13:
+
+[13] `Spectre Returns! Speculation Attacks using the Return Stack Buffer <https://www.usenix.org/system/files/conference/woot18/woot18-paper-koruyeh.pdf>`_.
diff --git a/Documentation/hw_random.txt b/Documentation/admin-guide/hw_random.rst
index 121de96e395e..121de96e395e 100644
--- a/Documentation/hw_random.txt
+++ b/Documentation/admin-guide/hw_random.rst
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 8001917ee012..33feab2f4084 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -16,6 +16,7 @@ etc.
    README
    kernel-parameters
    devices
+   sysctl/index
 
 This section describes CPU vulnerabilities and their mitigations.
 
@@ -38,6 +39,8 @@ problems and bugs in particular.
    ramoops
    dynamic-debug-howto
    init
+   kdump/index
+   perf/index
 
 This is the beginning of a section with information of interest to
 application developers.  Documents covering various aspects of the kernel
@@ -56,11 +59,13 @@ configure specific aspects of kernel behavior to your liking.
 
    initrd
    cgroup-v2
+   cgroup-v1/index
    serial-console
    braille-console
    parport
    md
    module-signing
+   rapidio
    sysrq
    unicode
    vga-softcursor
@@ -69,13 +74,38 @@ configure specific aspects of kernel behavior to your liking.
    java
    ras
    bcache
+   blockdev/index
    ext4
+   binderfs
+   xfs
    pm/index
    thunderbolt
    LSM/index
    mm/index
+   namespaces/index
    perf-security
    acpi/index
+   aoe/index
+   btmrvl
+   clearing-warn-once
+   cpu-load
+   cputopology
+   device-mapper/index
+   efi-stub
+   gpio/index
+   highuid
+   hw_random
+   iostats
+   kernel-per-CPU-kthreads
+   laptops/index
+   lcd-panel-cgram
+   ldm
+   lockup-watchdogs
+   numastat
+   pnp
+   rtc
+   svga
+   video-output
 
 .. only::  subproject and html
 
diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst
new file mode 100644
index 000000000000..5d63b18bd6d1
--- /dev/null
+++ b/Documentation/admin-guide/iostats.rst
@@ -0,0 +1,197 @@
+=====================
+I/O statistics fields
+=====================
+
+Since 2.4.20 (and some versions before, with patches), and 2.5.45,
+more extensive disk statistics have been introduced to help measure disk
+activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do
+the work for you, but in case you are interested in creating your own
+tools, the fields are explained here.
+
+In 2.4 now, the information is found as additional fields in
+``/proc/partitions``.  In 2.6 and upper, the same information is found in two
+places: one is in the file ``/proc/diskstats``, and the other is within
+the sysfs file system, which must be mounted in order to obtain
+the information. Throughout this document we'll assume that sysfs
+is mounted on ``/sys``, although of course it may be mounted anywhere.
+Both ``/proc/diskstats`` and sysfs use the same source for the information
+and so should not differ.
+
+Here are examples of these different formats::
+
+   2.4:
+      3     0   39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
+      3     1    9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030
+
+   2.6+ sysfs:
+      446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
+      35486    38030    38030    38030
+
+   2.6+ diskstats:
+      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
+      3    1   hda1 35486 38030 38030 38030
+
+   4.18+ diskstats:
+      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0
+
+On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
+a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
+
+The advantage of one over the other is that the sysfs choice works well
+if you are watching a known, small set of disks.  ``/proc/diskstats`` may
+be a better choice if you are watching a large number of disks because
+you'll avoid the overhead of 50, 100, or 500 or more opens/closes with
+each snapshot of your disk statistics.
+
+In 2.4, the statistics fields are those after the device name. In
+the above example, the first field of statistics would be 446216.
+By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll
+find just the eleven fields, beginning with 446216.  If you look at
+``/proc/diskstats``, the eleven fields will be preceded by the major and
+minor device numbers, and device name.  Each of these formats provides
+eleven fields of statistics, each meaning exactly the same things.
+All fields except field 9 are cumulative since boot.  Field 9 should
+go to zero as I/Os complete; all others only increase (unless they
+overflow and wrap).  Yes, these are (32-bit or 64-bit) unsigned long
+(native word size) numbers, and on a very busy or long-lived system they
+may wrap. Applications should be prepared to deal with that; unless
+your observations are measured in large numbers of minutes or hours,
+they should not wrap twice before you notice them.
+
+Each set of stats only applies to the indicated device; if you want
+system-wide stats you'll have to find all the devices and sum them all up.
+
+Field  1 -- # of reads completed
+    This is the total number of reads completed successfully.
+
+Field  2 -- # of reads merged, field 6 -- # of writes merged
+    Reads and writes which are adjacent to each other may be merged for
+    efficiency.  Thus two 4K reads may become one 8K read before it is
+    ultimately handed to the disk, and so it will be counted (and queued)
+    as only one I/O.  This field lets you know how often this was done.
+
+Field  3 -- # of sectors read
+    This is the total number of sectors read successfully.
+
+Field  4 -- # of milliseconds spent reading
+    This is the total number of milliseconds spent by all reads (as
+    measured from __make_request() to end_that_request_last()).
+
+Field  5 -- # of writes completed
+    This is the total number of writes completed successfully.
+
+Field  6 -- # of writes merged
+    See the description of field 2.
+
+Field  7 -- # of sectors written
+    This is the total number of sectors written successfully.
+
+Field  8 -- # of milliseconds spent writing
+    This is the total number of milliseconds spent by all writes (as
+    measured from __make_request() to end_that_request_last()).
+
+Field  9 -- # of I/Os currently in progress
+    The only field that should go to zero. Incremented as requests are
+    given to appropriate struct request_queue and decremented as they finish.
+
+Field 10 -- # of milliseconds spent doing I/Os
+    This field increases so long as field 9 is nonzero.
+
+    Since 5.0 this field counts jiffies when at least one request was
+    started or completed. If request runs more than 2 jiffies then some
+    I/O time will not be accounted unless there are other requests.
+
+Field 11 -- weighted # of milliseconds spent doing I/Os
+    This field is incremented at each I/O start, I/O completion, I/O
+    merge, or read of these stats by the number of I/Os in progress
+    (field 9) times the number of milliseconds spent doing I/O since the
+    last update of this field.  This can provide an easy measure of both
+    I/O completion time and the backlog that may be accumulating.
+
+Field 12 -- # of discards completed
+    This is the total number of discards completed successfully.
+
+Field 13 -- # of discards merged
+    See the description of field 2
+
+Field 14 -- # of sectors discarded
+    This is the total number of sectors discarded successfully.
+
+Field 15 -- # of milliseconds spent discarding
+    This is the total number of milliseconds spent by all discards (as
+    measured from __make_request() to end_that_request_last()).
+
+To avoid introducing performance bottlenecks, no locks are held while
+modifying these counters.  This implies that minor inaccuracies may be
+introduced when changes collide, so (for instance) adding up all the
+read I/Os issued per partition should equal those made to the disks ...
+but due to the lack of locking it may only be very close.
+
+In 2.6+, there are counters for each CPU, which make the lack of locking
+almost a non-issue.  When the statistics are read, the per-CPU counters
+are summed (possibly overflowing the unsigned long variable they are
+summed to) and the result given to the user.  There is no convenient
+user interface for accessing the per-CPU counters themselves.
+
+Disks vs Partitions
+-------------------
+
+There were significant changes between 2.4 and 2.6+ in the I/O subsystem.
+As a result, some statistic information disappeared. The translation from
+a disk address relative to a partition to the disk address relative to
+the host disk happens much earlier.  All merges and timings now happen
+at the disk level rather than at both the disk and partition level as
+in 2.4.  Consequently, you'll see a different statistics output on 2.6+ for
+partitions from that for disks.  There are only *four* fields available
+for partitions on 2.6+ machines.  This is reflected in the examples above.
+
+Field  1 -- # of reads issued
+    This is the total number of reads issued to this partition.
+
+Field  2 -- # of sectors read
+    This is the total number of sectors requested to be read from this
+    partition.
+
+Field  3 -- # of writes issued
+    This is the total number of writes issued to this partition.
+
+Field  4 -- # of sectors written
+    This is the total number of sectors requested to be written to
+    this partition.
+
+Note that since the address is translated to a disk-relative one, and no
+record of the partition-relative address is kept, the subsequent success
+or failure of the read cannot be attributed to the partition.  In other
+words, the number of reads for partitions is counted slightly before time
+of queuing for partitions, and at completion for whole disks.  This is
+a subtle distinction that is probably uninteresting for most cases.
+
+More significant is the error induced by counting the numbers of
+reads/writes before merges for partitions and after for disks. Since a
+typical workload usually contains a lot of successive and adjacent requests,
+the number of reads/writes issued can be several times higher than the
+number of reads/writes completed.
+
+In 2.6.25, the full statistic set is again available for partitions and
+disk and partition statistics are consistent again. Since we still don't
+keep record of the partition-relative address, an operation is attributed to
+the partition which contains the first sector of the request after the
+eventual merges. As requests can be merged across partition, this could lead
+to some (probably insignificant) inaccuracy.
+
+Additional notes
+----------------
+
+In 2.6+, sysfs is not mounted by default.  If your distribution of
+Linux hasn't added it already, here's the line you'll want to add to
+your ``/etc/fstab``::
+
+	none /sys sysfs defaults 0 0
+
+
+In 2.6+, all disk statistics were removed from ``/proc/stat``.  In 2.4, they
+appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in
+``/proc/stat`` take a very different format from those in ``/proc/partitions``
+(see proc(5), if your system has it.)
+
+-- ricklind@us.ibm.com
diff --git a/Documentation/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt
index 220d0a80ca2c..220d0a80ca2c 100644
--- a/Documentation/kdump/gdbmacros.txt
+++ b/Documentation/admin-guide/kdump/gdbmacros.txt
diff --git a/Documentation/admin-guide/kdump/index.rst b/Documentation/admin-guide/kdump/index.rst
new file mode 100644
index 000000000000..8e2ebd0383cd
--- /dev/null
+++ b/Documentation/admin-guide/kdump/index.rst
@@ -0,0 +1,20 @@
+
+================================================================
+Documentation for Kdump - The kexec-based Crash Dumping Solution
+================================================================
+
+This document includes overview, setup and installation, and analysis
+information.
+
+.. toctree::
+    :maxdepth: 1
+
+    kdump
+    vmcoreinfo
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst
new file mode 100644
index 000000000000..ac7e131d2935
--- /dev/null
+++ b/Documentation/admin-guide/kdump/kdump.rst
@@ -0,0 +1,534 @@
+================================================================
+Documentation for Kdump - The kexec-based Crash Dumping Solution
+================================================================
+
+This document includes overview, setup and installation, and analysis
+information.
+
+Overview
+========
+
+Kdump uses kexec to quickly boot to a dump-capture kernel whenever a
+dump of the system kernel's memory needs to be taken (for example, when
+the system panics). The system kernel's memory image is preserved across
+the reboot and is accessible to the dump-capture kernel.
+
+You can use common commands, such as cp and scp, to copy the
+memory image to a dump file on the local disk, or across the network to
+a remote system.
+
+Kdump and kexec are currently supported on the x86, x86_64, ppc64, ia64,
+s390x, arm and arm64 architectures.
+
+When the system kernel boots, it reserves a small section of memory for
+the dump-capture kernel. This ensures that ongoing Direct Memory Access
+(DMA) from the system kernel does not corrupt the dump-capture kernel.
+The kexec -p command loads the dump-capture kernel into this reserved
+memory.
+
+On x86 machines, the first 640 KB of physical memory is needed to boot,
+regardless of where the kernel loads. Therefore, kexec backs up this
+region just before rebooting into the dump-capture kernel.
+
+Similarly on PPC64 machines first 32KB of physical memory is needed for
+booting regardless of where the kernel is loaded and to support 64K page
+size kexec backs up the first 64KB memory.
+
+For s390x, when kdump is triggered, the crashkernel region is exchanged
+with the region [0, crashkernel region size] and then the kdump kernel
+runs in [0, crashkernel region size]. Therefore no relocatable kernel is
+needed for s390x.
+
+All of the necessary information about the system kernel's core image is
+encoded in the ELF format, and stored in a reserved area of memory
+before a crash. The physical address of the start of the ELF header is
+passed to the dump-capture kernel through the elfcorehdr= boot
+parameter. Optionally the size of the ELF header can also be passed
+when using the elfcorehdr=[size[KMG]@]offset[KMG] syntax.
+
+
+With the dump-capture kernel, you can access the memory image through
+/proc/vmcore. This exports the dump as an ELF-format file that you can
+write out using file copy commands such as cp or scp. Further, you can
+use analysis tools such as the GNU Debugger (GDB) and the Crash tool to
+debug the dump file. This method ensures that the dump pages are correctly
+ordered.
+
+
+Setup and Installation
+======================
+
+Install kexec-tools
+-------------------
+
+1) Login as the root user.
+
+2) Download the kexec-tools user-space package from the following URL:
+
+http://kernel.org/pub/linux/utils/kernel/kexec/kexec-tools.tar.gz
+
+This is a symlink to the latest version.
+
+The latest kexec-tools git tree is available at:
+
+- git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git
+- http://www.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git
+
+There is also a gitweb interface available at
+http://www.kernel.org/git/?p=utils/kernel/kexec/kexec-tools.git
+
+More information about kexec-tools can be found at
+http://horms.net/projects/kexec/
+
+3) Unpack the tarball with the tar command, as follows::
+
+	tar xvpzf kexec-tools.tar.gz
+
+4) Change to the kexec-tools directory, as follows::
+
+	cd kexec-tools-VERSION
+
+5) Configure the package, as follows::
+
+	./configure
+
+6) Compile the package, as follows::
+
+	make
+
+7) Install the package, as follows::
+
+	make install
+
+
+Build the system and dump-capture kernels
+-----------------------------------------
+There are two possible methods of using Kdump.
+
+1) Build a separate custom dump-capture kernel for capturing the
+   kernel core dump.
+
+2) Or use the system kernel binary itself as dump-capture kernel and there is
+   no need to build a separate dump-capture kernel. This is possible
+   only with the architectures which support a relocatable kernel. As
+   of today, i386, x86_64, ppc64, ia64, arm and arm64 architectures support
+   relocatable kernel.
+
+Building a relocatable kernel is advantageous from the point of view that
+one does not have to build a second kernel for capturing the dump. But
+at the same time one might want to build a custom dump capture kernel
+suitable to his needs.
+
+Following are the configuration setting required for system and
+dump-capture kernels for enabling kdump support.
+
+System kernel config options
+----------------------------
+
+1) Enable "kexec system call" in "Processor type and features."::
+
+	CONFIG_KEXEC=y
+
+2) Enable "sysfs file system support" in "Filesystem" -> "Pseudo
+   filesystems." This is usually enabled by default::
+
+	CONFIG_SYSFS=y
+
+   Note that "sysfs file system support" might not appear in the "Pseudo
+   filesystems" menu if "Configure standard kernel features (for small
+   systems)" is not enabled in "General Setup." In this case, check the
+   .config file itself to ensure that sysfs is turned on, as follows::
+
+	grep 'CONFIG_SYSFS' .config
+
+3) Enable "Compile the kernel with debug info" in "Kernel hacking."::
+
+	CONFIG_DEBUG_INFO=Y
+
+   This causes the kernel to be built with debug symbols. The dump
+   analysis tools require a vmlinux with debug symbols in order to read
+   and analyze a dump file.
+
+Dump-capture kernel config options (Arch Independent)
+-----------------------------------------------------
+
+1) Enable "kernel crash dumps" support under "Processor type and
+   features"::
+
+	CONFIG_CRASH_DUMP=y
+
+2) Enable "/proc/vmcore support" under "Filesystems" -> "Pseudo filesystems"::
+
+	CONFIG_PROC_VMCORE=y
+
+   (CONFIG_PROC_VMCORE is set by default when CONFIG_CRASH_DUMP is selected.)
+
+Dump-capture kernel config options (Arch Dependent, i386 and x86_64)
+--------------------------------------------------------------------
+
+1) On i386, enable high memory support under "Processor type and
+   features"::
+
+	CONFIG_HIGHMEM64G=y
+
+   or::
+
+	CONFIG_HIGHMEM4G
+
+2) On i386 and x86_64, disable symmetric multi-processing support
+   under "Processor type and features"::
+
+	CONFIG_SMP=n
+
+   (If CONFIG_SMP=y, then specify maxcpus=1 on the kernel command line
+   when loading the dump-capture kernel, see section "Load the Dump-capture
+   Kernel".)
+
+3) If one wants to build and use a relocatable kernel,
+   Enable "Build a relocatable kernel" support under "Processor type and
+   features"::
+
+	CONFIG_RELOCATABLE=y
+
+4) Use a suitable value for "Physical address where the kernel is
+   loaded" (under "Processor type and features"). This only appears when
+   "kernel crash dumps" is enabled. A suitable value depends upon
+   whether kernel is relocatable or not.
+
+   If you are using a relocatable kernel use CONFIG_PHYSICAL_START=0x100000
+   This will compile the kernel for physical address 1MB, but given the fact
+   kernel is relocatable, it can be run from any physical address hence
+   kexec boot loader will load it in memory region reserved for dump-capture
+   kernel.
+
+   Otherwise it should be the start of memory region reserved for
+   second kernel using boot parameter "crashkernel=Y@X". Here X is
+   start of memory region reserved for dump-capture kernel.
+   Generally X is 16MB (0x1000000). So you can set
+   CONFIG_PHYSICAL_START=0x1000000
+
+5) Make and install the kernel and its modules. DO NOT add this kernel
+   to the boot loader configuration files.
+
+Dump-capture kernel config options (Arch Dependent, ppc64)
+----------------------------------------------------------
+
+1) Enable "Build a kdump crash kernel" support under "Kernel" options::
+
+	CONFIG_CRASH_DUMP=y
+
+2)   Enable "Build a relocatable kernel" support::
+
+	CONFIG_RELOCATABLE=y
+
+   Make and install the kernel and its modules.
+
+Dump-capture kernel config options (Arch Dependent, ia64)
+----------------------------------------------------------
+
+- No specific options are required to create a dump-capture kernel
+  for ia64, other than those specified in the arch independent section
+  above. This means that it is possible to use the system kernel
+  as a dump-capture kernel if desired.
+
+  The crashkernel region can be automatically placed by the system
+  kernel at run time. This is done by specifying the base address as 0,
+  or omitting it all together::
+
+	crashkernel=256M@0
+
+  or::
+
+	crashkernel=256M
+
+  If the start address is specified, note that the start address of the
+  kernel will be aligned to 64Mb, so if the start address is not then
+  any space below the alignment point will be wasted.
+
+Dump-capture kernel config options (Arch Dependent, arm)
+----------------------------------------------------------
+
+-   To use a relocatable kernel,
+    Enable "AUTO_ZRELADDR" support under "Boot" options::
+
+	AUTO_ZRELADDR=y
+
+Dump-capture kernel config options (Arch Dependent, arm64)
+----------------------------------------------------------
+
+- Please note that kvm of the dump-capture kernel will not be enabled
+  on non-VHE systems even if it is configured. This is because the CPU
+  will not be reset to EL2 on panic.
+
+Extended crashkernel syntax
+===========================
+
+While the "crashkernel=size[@offset]" syntax is sufficient for most
+configurations, sometimes it's handy to have the reserved memory dependent
+on the value of System RAM -- that's mostly for distributors that pre-setup
+the kernel command line to avoid a unbootable system after some memory has
+been removed from the machine.
+
+The syntax is::
+
+    crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset]
+    range=start-[end]
+
+For example::
+
+    crashkernel=512M-2G:64M,2G-:128M
+
+This would mean:
+
+    1) if the RAM is smaller than 512M, then don't reserve anything
+       (this is the "rescue" case)
+    2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M
+    3) if the RAM size is larger than 2G, then reserve 128M
+
+
+
+Boot into System Kernel
+=======================
+
+1) Update the boot loader (such as grub, yaboot, or lilo) configuration
+   files as necessary.
+
+2) Boot the system kernel with the boot parameter "crashkernel=Y@X",
+   where Y specifies how much memory to reserve for the dump-capture kernel
+   and X specifies the beginning of this reserved memory. For example,
+   "crashkernel=64M@16M" tells the system kernel to reserve 64 MB of memory
+   starting at physical address 0x01000000 (16MB) for the dump-capture kernel.
+
+   On x86 and x86_64, use "crashkernel=64M@16M".
+
+   On ppc64, use "crashkernel=128M@32M".
+
+   On ia64, 256M@256M is a generous value that typically works.
+   The region may be automatically placed on ia64, see the
+   dump-capture kernel config option notes above.
+   If use sparse memory, the size should be rounded to GRANULE boundaries.
+
+   On s390x, typically use "crashkernel=xxM". The value of xx is dependent
+   on the memory consumption of the kdump system. In general this is not
+   dependent on the memory size of the production system.
+
+   On arm, the use of "crashkernel=Y@X" is no longer necessary; the
+   kernel will automatically locate the crash kernel image within the
+   first 512MB of RAM if X is not given.
+
+   On arm64, use "crashkernel=Y[@X]".  Note that the start address of
+   the kernel, X if explicitly specified, must be aligned to 2MiB (0x200000).
+
+Load the Dump-capture Kernel
+============================
+
+After booting to the system kernel, dump-capture kernel needs to be
+loaded.
+
+Based on the architecture and type of image (relocatable or not), one
+can choose to load the uncompressed vmlinux or compressed bzImage/vmlinuz
+of dump-capture kernel. Following is the summary.
+
+For i386 and x86_64:
+
+	- Use vmlinux if kernel is not relocatable.
+	- Use bzImage/vmlinuz if kernel is relocatable.
+
+For ppc64:
+
+	- Use vmlinux
+
+For ia64:
+
+	- Use vmlinux or vmlinuz.gz
+
+For s390x:
+
+	- Use image or bzImage
+
+For arm:
+
+	- Use zImage
+
+For arm64:
+
+	- Use vmlinux or Image
+
+If you are using an uncompressed vmlinux image then use following command
+to load dump-capture kernel::
+
+   kexec -p <dump-capture-kernel-vmlinux-image> \
+   --initrd=<initrd-for-dump-capture-kernel> --args-linux \
+   --append="root=<root-dev> <arch-specific-options>"
+
+If you are using a compressed bzImage/vmlinuz, then use following command
+to load dump-capture kernel::
+
+   kexec -p <dump-capture-kernel-bzImage> \
+   --initrd=<initrd-for-dump-capture-kernel> \
+   --append="root=<root-dev> <arch-specific-options>"
+
+If you are using a compressed zImage, then use following command
+to load dump-capture kernel::
+
+   kexec --type zImage -p <dump-capture-kernel-bzImage> \
+   --initrd=<initrd-for-dump-capture-kernel> \
+   --dtb=<dtb-for-dump-capture-kernel> \
+   --append="root=<root-dev> <arch-specific-options>"
+
+If you are using an uncompressed Image, then use following command
+to load dump-capture kernel::
+
+   kexec -p <dump-capture-kernel-Image> \
+   --initrd=<initrd-for-dump-capture-kernel> \
+   --append="root=<root-dev> <arch-specific-options>"
+
+Please note, that --args-linux does not need to be specified for ia64.
+It is planned to make this a no-op on that architecture, but for now
+it should be omitted
+
+Following are the arch specific command line options to be used while
+loading dump-capture kernel.
+
+For i386, x86_64 and ia64:
+
+	"1 irqpoll maxcpus=1 reset_devices"
+
+For ppc64:
+
+	"1 maxcpus=1 noirqdistrib reset_devices"
+
+For s390x:
+
+	"1 maxcpus=1 cgroup_disable=memory"
+
+For arm:
+
+	"1 maxcpus=1 reset_devices"
+
+For arm64:
+
+	"1 maxcpus=1 reset_devices"
+
+Notes on loading the dump-capture kernel:
+
+* By default, the ELF headers are stored in ELF64 format to support
+  systems with more than 4GB memory. On i386, kexec automatically checks if
+  the physical RAM size exceeds the 4 GB limit and if not, uses ELF32.
+  So, on non-PAE systems, ELF32 is always used.
+
+  The --elf32-core-headers option can be used to force the generation of ELF32
+  headers. This is necessary because GDB currently cannot open vmcore files
+  with ELF64 headers on 32-bit systems.
+
+* The "irqpoll" boot parameter reduces driver initialization failures
+  due to shared interrupts in the dump-capture kernel.
+
+* You must specify <root-dev> in the format corresponding to the root
+  device name in the output of mount command.
+
+* Boot parameter "1" boots the dump-capture kernel into single-user
+  mode without networking. If you want networking, use "3".
+
+* We generally don't have to bring up a SMP kernel just to capture the
+  dump. Hence generally it is useful either to build a UP dump-capture
+  kernel or specify maxcpus=1 option while loading dump-capture kernel.
+  Note, though maxcpus always works, you had better replace it with
+  nr_cpus to save memory if supported by the current ARCH, such as x86.
+
+* You should enable multi-cpu support in dump-capture kernel if you intend
+  to use multi-thread programs with it, such as parallel dump feature of
+  makedumpfile. Otherwise, the multi-thread program may have a great
+  performance degradation. To enable multi-cpu support, you should bring up an
+  SMP dump-capture kernel and specify maxcpus/nr_cpus, disable_cpu_apicid=[X]
+  options while loading it.
+
+* For s390x there are two kdump modes: If a ELF header is specified with
+  the elfcorehdr= kernel parameter, it is used by the kdump kernel as it
+  is done on all other architectures. If no elfcorehdr= kernel parameter is
+  specified, the s390x kdump kernel dynamically creates the header. The
+  second mode has the advantage that for CPU and memory hotplug, kdump has
+  not to be reloaded with kexec_load().
+
+* For s390x systems with many attached devices the "cio_ignore" kernel
+  parameter should be used for the kdump kernel in order to prevent allocation
+  of kernel memory for devices that are not relevant for kdump. The same
+  applies to systems that use SCSI/FCP devices. In that case the
+  "allow_lun_scan" zfcp module parameter should be set to zero before
+  setting FCP devices online.
+
+Kernel Panic
+============
+
+After successfully loading the dump-capture kernel as previously
+described, the system will reboot into the dump-capture kernel if a
+system crash is triggered.  Trigger points are located in panic(),
+die(), die_nmi() and in the sysrq handler (ALT-SysRq-c).
+
+The following conditions will execute a crash trigger point:
+
+If a hard lockup is detected and "NMI watchdog" is configured, the system
+will boot into the dump-capture kernel ( die_nmi() ).
+
+If die() is called, and it happens to be a thread with pid 0 or 1, or die()
+is called inside interrupt context or die() is called and panic_on_oops is set,
+the system will boot into the dump-capture kernel.
+
+On powerpc systems when a soft-reset is generated, die() is called by all cpus
+and the system will boot into the dump-capture kernel.
+
+For testing purposes, you can trigger a crash by using "ALT-SysRq-c",
+"echo c > /proc/sysrq-trigger" or write a module to force the panic.
+
+Write Out the Dump File
+=======================
+
+After the dump-capture kernel is booted, write out the dump file with
+the following command::
+
+   cp /proc/vmcore <dump-file>
+
+
+Analysis
+========
+
+Before analyzing the dump image, you should reboot into a stable kernel.
+
+You can do limited analysis using GDB on the dump file copied out of
+/proc/vmcore. Use the debug vmlinux built with -g and run the following
+command::
+
+   gdb vmlinux <dump-file>
+
+Stack trace for the task on processor 0, register display, and memory
+display work fine.
+
+Note: GDB cannot analyze core files generated in ELF64 format for x86.
+On systems with a maximum of 4GB of memory, you can generate
+ELF32-format headers using the --elf32-core-headers kernel option on the
+dump kernel.
+
+You can also use the Crash utility to analyze dump files in Kdump
+format. Crash is available on Dave Anderson's site at the following URL:
+
+   http://people.redhat.com/~anderson/
+
+Trigger Kdump on WARN()
+=======================
+
+The kernel parameter, panic_on_warn, calls panic() in all WARN() paths.  This
+will cause a kdump to occur at the panic() call.  In cases where a user wants
+to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1
+to achieve the same behaviour.
+
+Contact
+=======
+
+- Vivek Goyal (vgoyal@redhat.com)
+- Maneesh Soni (maneesh@in.ibm.com)
+
+GDB macros
+==========
+
+.. include:: gdbmacros.txt
+   :literal:
diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
new file mode 100644
index 000000000000..007a6b86e0ee
--- /dev/null
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -0,0 +1,488 @@
+==========
+VMCOREINFO
+==========
+
+What is it?
+===========
+
+VMCOREINFO is a special ELF note section. It contains various
+information from the kernel like structure size, page size, symbol
+values, field offsets, etc. These data are packed into an ELF note
+section and used by user-space tools like crash and makedumpfile to
+analyze a kernel's memory layout.
+
+Common variables
+================
+
+init_uts_ns.name.release
+------------------------
+
+The version of the Linux kernel. Used to find the corresponding source
+code from which the kernel has been built. For example, crash uses it to
+find the corresponding vmlinux in order to process vmcore.
+
+PAGE_SIZE
+---------
+
+The size of a page. It is the smallest unit of data used by the memory
+management facilities. It is usually 4096 bytes of size and a page is
+aligned on 4096 bytes. Used for computing page addresses.
+
+init_uts_ns
+-----------
+
+The UTS namespace which is used to isolate two specific elements of the
+system that relate to the uname(2) system call. It is named after the
+data structure used to store information returned by the uname(2) system
+call.
+
+User-space tools can get the kernel name, host name, kernel release
+number, kernel version, architecture name and OS type from it.
+
+node_online_map
+---------------
+
+An array node_states[N_ONLINE] which represents the set of online nodes
+in a system, one bit position per node number. Used to keep track of
+which nodes are in the system and online.
+
+swapper_pg_dir
+--------------
+
+The global page directory pointer of the kernel. Used to translate
+virtual to physical addresses.
+
+_stext
+------
+
+Defines the beginning of the text section. In general, _stext indicates
+the kernel start address. Used to convert a virtual address from the
+direct kernel map to a physical address.
+
+vmap_area_list
+--------------
+
+Stores the virtual area list. makedumpfile gets the vmalloc start value
+from this variable and its value is necessary for vmalloc translation.
+
+mem_map
+-------
+
+Physical addresses are translated to struct pages by treating them as
+an index into the mem_map array. Right-shifting a physical address
+PAGE_SHIFT bits converts it into a page frame number which is an index
+into that mem_map array.
+
+Used to map an address to the corresponding struct page.
+
+contig_page_data
+----------------
+
+Makedumpfile gets the pglist_data structure from this symbol, which is
+used to describe the memory layout.
+
+User-space tools use this to exclude free pages when dumping memory.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+--------------------------------------------------------------------------
+
+The address of the mem_section array, its length, structure size, and
+the section_mem_map offset.
+
+It exists in the sparse memory mapping model, and it is also somewhat
+similar to the mem_map variable, both of them are used to translate an
+address.
+
+page
+----
+
+The size of a page structure. struct page is an important data structure
+and it is widely used to compute contiguous memory.
+
+pglist_data
+-----------
+
+The size of a pglist_data structure. This value is used to check if the
+pglist_data structure is valid. It is also used for checking the memory
+type.
+
+zone
+----
+
+The size of a zone structure. This value is used to check if the zone
+structure has been found. It is also used for excluding free pages.
+
+free_area
+---------
+
+The size of a free_area structure. It indicates whether the free_area
+structure is valid or not. Useful when excluding free pages.
+
+list_head
+---------
+
+The size of a list_head structure. Used when iterating lists in a
+post-mortem analysis session.
+
+nodemask_t
+----------
+
+The size of a nodemask_t type. Used to compute the number of online
+nodes.
+
+(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|compound_order|compound_head)
+-------------------------------------------------------------------------------------------------
+
+User-space tools compute their values based on the offset of these
+variables. The variables are used when excluding unnecessary pages.
+
+(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_spanned_pages|node_id)
+-----------------------------------------------------------------------------------------
+
+On NUMA machines, each NUMA node has a pg_data_t to describe its memory
+layout. On UMA machines there is a single pglist_data which describes the
+whole memory.
+
+These values are used to check the memory type and to compute the
+virtual address for memory map.
+
+(zone, free_area|vm_stat|spanned_pages)
+---------------------------------------
+
+Each node is divided into a number of blocks called zones which
+represent ranges within memory. A zone is described by a structure zone.
+
+User-space tools compute required values based on the offset of these
+variables.
+
+(free_area, free_list)
+----------------------
+
+Offset of the free_list's member. This value is used to compute the number
+of free pages.
+
+Each zone has a free_area structure array called free_area[MAX_ORDER].
+The free_list represents a linked list of free page blocks.
+
+(list_head, next|prev)
+----------------------
+
+Offsets of the list_head's members. list_head is used to define a
+circular linked list. User-space tools need these in order to traverse
+lists.
+
+(vmap_area, va_start|list)
+--------------------------
+
+Offsets of the vmap_area's members. They carry vmalloc-specific
+information. Makedumpfile gets the start address of the vmalloc region
+from this.
+
+(zone.free_area, MAX_ORDER)
+---------------------------
+
+Free areas descriptor. User-space tools use this value to iterate the
+free_area ranges. MAX_ORDER is used by the zone buddy allocator.
+
+log_first_idx
+-------------
+
+Index of the first record stored in the buffer log_buf. Used by
+user-space tools to read the strings in the log_buf.
+
+log_buf
+-------
+
+Console output is written to the ring buffer log_buf at index
+log_first_idx. Used to get the kernel log.
+
+log_buf_len
+-----------
+
+log_buf's length.
+
+clear_idx
+---------
+
+The index that the next printk() record to read after the last clear
+command. It indicates the first record after the last SYSLOG_ACTION
+_CLEAR, like issued by 'dmesg -c'. Used by user-space tools to dump
+the dmesg log.
+
+log_next_idx
+------------
+
+The index of the next record to store in the buffer log_buf. Used to
+compute the index of the current buffer position.
+
+printk_log
+----------
+
+The size of a structure printk_log. Used to compute the size of
+messages, and extract dmesg log. It encapsulates header information for
+log_buf, such as timestamp, syslog level, etc.
+
+(printk_log, ts_nsec|len|text_len|dict_len)
+-------------------------------------------
+
+It represents field offsets in struct printk_log. User space tools
+parse it and check whether the values of printk_log's members have been
+changed.
+
+(free_area.free_list, MIGRATE_TYPES)
+------------------------------------
+
+The number of migrate types for pages. The free_list is described by the
+array. Used by tools to compute the number of free pages.
+
+NR_FREE_PAGES
+-------------
+
+On linux-2.6.21 or later, the number of free pages is in
+vm_stat[NR_FREE_PAGES]. Used to get the number of free pages.
+
+PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask
+------------------------------------------------------------------------------
+
+Page attributes. These flags are used to filter various unnecessary for
+dumping pages.
+
+PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline)
+-----------------------------------------------------------------------------
+
+More page attributes. These flags are used to filter various unnecessary for
+dumping pages.
+
+
+HUGETLB_PAGE_DTOR
+-----------------
+
+The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile
+excludes these pages.
+
+x86_64
+======
+
+phys_base
+---------
+
+Used to convert the virtual address of an exported kernel symbol to its
+corresponding physical address.
+
+init_top_pgt
+------------
+
+Used to walk through the whole page table and convert virtual addresses
+to physical addresses. The init_top_pgt is somewhat similar to
+swapper_pg_dir, but it is only used in x86_64.
+
+pgtable_l5_enabled
+------------------
+
+User-space tools need to know whether the crash kernel was in 5-level
+paging mode.
+
+node_data
+---------
+
+This is a struct pglist_data array and stores all NUMA nodes
+information. Makedumpfile gets the pglist_data structure from it.
+
+(node_data, MAX_NUMNODES)
+-------------------------
+
+The maximum number of nodes in system.
+
+KERNELOFFSET
+------------
+
+The kernel randomization offset. Used to compute the page offset. If
+KASLR is disabled, this value is zero.
+
+KERNEL_IMAGE_SIZE
+-----------------
+
+Currently unused by Makedumpfile. Used to compute the module virtual
+address by Crash.
+
+sme_mask
+--------
+
+AMD-specific with SME support: it indicates the secure memory encryption
+mask. Makedumpfile tools need to know whether the crash kernel was
+encrypted. If SME is enabled in the first kernel, the crash kernel's
+page table entries (pgd/pud/pmd/pte) contain the memory encryption
+mask. This is used to remove the SME mask and obtain the true physical
+address.
+
+Currently, sme_mask stores the value of the C-bit position. If needed,
+additional SME-relevant info can be placed in that variable.
+
+For example::
+
+  [ misc	        ][ enc bit  ][ other misc SME info       ]
+  0000_0000_0000_0000_1000_0000_0000_0000_0000_0000_..._0000
+  63   59   55   51   47   43   39   35   31   27   ... 3
+
+x86_32
+======
+
+X86_PAE
+-------
+
+Denotes whether physical address extensions are enabled. It has the cost
+of a higher page table lookup overhead, and also consumes more page
+table space per process. Used to check whether PAE was enabled in the
+crash kernel when converting virtual addresses to physical addresses.
+
+ia64
+====
+
+pgdat_list|(pgdat_list, MAX_NUMNODES)
+-------------------------------------
+
+pg_data_t array storing all NUMA nodes information. MAX_NUMNODES
+indicates the number of the nodes.
+
+node_memblk|(node_memblk, NR_NODE_MEMBLKS)
+------------------------------------------
+
+List of node memory chunks. Filled when parsing the SRAT table to obtain
+information about memory nodes. NR_NODE_MEMBLKS indicates the number of
+node memory chunks.
+
+These values are used to compute the number of nodes the crashed kernel used.
+
+node_memblk_s|(node_memblk_s, start_paddr)|(node_memblk_s, size)
+----------------------------------------------------------------
+
+The size of a struct node_memblk_s and the offsets of the
+node_memblk_s's members. Used to compute the number of nodes.
+
+PGTABLE_3|PGTABLE_4
+-------------------
+
+User-space tools need to know whether the crash kernel was in 3-level or
+4-level paging mode. Used to distinguish the page table.
+
+ARM64
+=====
+
+VA_BITS
+-------
+
+The maximum number of bits for virtual addresses. Used to compute the
+virtual memory ranges.
+
+kimage_voffset
+--------------
+
+The offset between the kernel virtual and physical mappings. Used to
+translate virtual to physical addresses.
+
+PHYS_OFFSET
+-----------
+
+Indicates the physical address of the start of memory. Similar to
+kimage_voffset, which is used to translate virtual to physical
+addresses.
+
+KERNELOFFSET
+------------
+
+The kernel randomization offset. Used to compute the page offset. If
+KASLR is disabled, this value is zero.
+
+arm
+===
+
+ARM_LPAE
+--------
+
+It indicates whether the crash kernel supports large physical address
+extensions. Used to translate virtual to physical addresses.
+
+s390
+====
+
+lowcore_ptr
+-----------
+
+An array with a pointer to the lowcore of every CPU. Used to print the
+psw and all registers information.
+
+high_memory
+-----------
+
+Used to get the vmalloc_start address from the high_memory symbol.
+
+(lowcore_ptr, NR_CPUS)
+----------------------
+
+The maximum number of CPUs.
+
+powerpc
+=======
+
+
+node_data|(node_data, MAX_NUMNODES)
+-----------------------------------
+
+See above.
+
+contig_page_data
+----------------
+
+See above.
+
+vmemmap_list
+------------
+
+The vmemmap_list maintains the entire vmemmap physical mapping. Used
+to get vmemmap list count and populated vmemmap regions info. If the
+vmemmap address translation information is stored in the crash kernel,
+it is used to translate vmemmap kernel virtual addresses.
+
+mmu_vmemmap_psize
+-----------------
+
+The size of a page. Used to translate virtual to physical addresses.
+
+mmu_psize_defs
+--------------
+
+Page size definitions, i.e. 4k, 64k, or 16M.
+
+Used to make vtop translations.
+
+vmemmap_backing|(vmemmap_backing, list)|(vmemmap_backing, phys)|(vmemmap_backing, virt_addr)
+--------------------------------------------------------------------------------------------
+
+The vmemmap virtual address space management does not have a traditional
+page table to track which virtual struct pages are backed by a physical
+mapping. The virtual to physical mappings are tracked in a simple linked
+list format.
+
+User-space tools need to know the offset of list, phys and virt_addr
+when computing the count of vmemmap regions.
+
+mmu_psize_def|(mmu_psize_def, shift)
+------------------------------------
+
+The size of a struct mmu_psize_def and the offset of mmu_psize_def's
+member.
+
+Used in vtop translations.
+
+sh
+==
+
+node_data|(node_data, MAX_NUMNODES)
+-----------------------------------
+
+See above.
+
+X2TLB
+-----
+
+Indicates whether the crashed kernel enabled SH extended mode.
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index 0124980dca2d..d05d531b4ec9 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -9,11 +9,11 @@ and sorted into English Dictionary order (defined as ignoring all
 punctuation and sorting digits before letters in a case insensitive
 manner), and with descriptions where known.
 
-The kernel parses parameters from the kernel command line up to "--";
+The kernel parses parameters from the kernel command line up to "``--``";
 if it doesn't recognize a parameter and it doesn't contain a '.', the
 parameter gets passed to init: parameters with '=' go into init's
 environment, others are passed as command line arguments to init.
-Everything after "--" is passed as an argument to init.
+Everything after "``--``" is passed as an argument to init.
 
 Module parameters can be specified in two ways: via the kernel command
 line with a module name prefix, or via modprobe, e.g.::
@@ -118,7 +118,7 @@ parameter is applicable::
 	LOOP	Loopback device support is enabled.
 	M68k	M68k architecture is enabled.
 			These options have more detailed description inside of
-			Documentation/m68k/kernel-options.txt.
+			Documentation/m68k/kernel-options.rst.
 	MDA	MDA console support is enabled.
 	MIPS	MIPS architecture is enabled.
 	MOUSE	Appropriate mouse support is enabled.
@@ -167,7 +167,7 @@ parameter is applicable::
 	X86-32	X86-32, aka i386 architecture is enabled.
 	X86-64	X86-64 architecture is enabled.
 			More X86-64 boot options can be found in
-			Documentation/x86/x86_64/boot-options.txt .
+			Documentation/x86/x86_64/boot-options.rst.
 	X86	Either 32-bit or 64-bit x86 (same as X86-32+X86-64)
 	X86_UV	SGI UV support is enabled.
 	XEN	Xen support is enabled
@@ -181,10 +181,10 @@ In addition, the following text indicates that the option::
 Parameters denoted with BOOT are actually interpreted by the boot
 loader, and have no meaning to the kernel directly.
 Do not modify the syntax of boot loader parameters without extreme
-need or coordination with <Documentation/x86/boot.txt>.
+need or coordination with <Documentation/x86/boot.rst>.
 
 There are also arch-specific kernel-parameters not documented here.
-See for example <Documentation/x86/x86_64/boot-options.txt>.
+See for example <Documentation/x86/x86_64/boot-options.rst>.
 
 Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
 a trailing = on the name of any parameter states that that parameter will
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 138f6664b2e2..7ccd158b3894 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -13,7 +13,7 @@
 			For ARM64, ONLY "acpi=off", "acpi=on" or "acpi=force"
 			are available
 
-			See also Documentation/power/runtime_pm.txt, pci=noacpi
+			See also Documentation/power/runtime_pm.rst, pci=noacpi
 
 	acpi_apic_instance=	[ACPI, IOAPIC]
 			Format: <int>
@@ -53,7 +53,7 @@
 			ACPI_DEBUG_PRINT statements, e.g.,
 			    ACPI_DEBUG_PRINT((ACPI_DB_INFO, ...
 			The debug_level mask defaults to "info".  See
-			Documentation/acpi/debug.txt for more information about
+			Documentation/firmware-guide/acpi/debug.rst for more information about
 			debug layers and levels.
 
 			Enable processor driver info messages:
@@ -223,7 +223,7 @@
 	acpi_sleep=	[HW,ACPI] Sleep options
 			Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
 				  old_ordering, nonvs, sci_force_enable, nobl }
-			See Documentation/power/video.txt for information on
+			See Documentation/power/video.rst for information on
 			s3_bios and s3_mode.
 			s3_beep is for debugging; it makes the PC's speaker beep
 			as soon as the kernel's real-mode entry point is called.
@@ -430,7 +430,7 @@
 
 	blkdevparts=	Manual partition parsing of block device(s) for
 			embedded devices based on command line input.
-			See Documentation/block/cmdline-partition.txt
+			See Documentation/block/cmdline-partition.rst
 
 	boot_delay=	Milliseconds to delay each printk during boot.
 			Values larger than 10 seconds (10000) are changed to
@@ -478,7 +478,7 @@
 			others).
 
 	ccw_timeout_log	[S390]
-			See Documentation/s390/CommonIO for details.
+			See Documentation/s390/common_io.rst for details.
 
 	cgroup_disable=	[KNL] Disable a particular controller
 			Format: {name of the controller(s) to disable}
@@ -516,7 +516,7 @@
 				/selinux/checkreqprot.
 
 	cio_ignore=	[S390]
-			See Documentation/s390/CommonIO for details.
+			See Documentation/s390/common_io.rst for details.
 	clk_ignore_unused
 			[CLK]
 			Prevents the clock framework from automatically gating
@@ -708,14 +708,14 @@
 			[KNL, x86_64] select a region under 4G first, and
 			fall back to reserve region above 4G when '@offset'
 			hasn't been specified.
-			See Documentation/kdump/kdump.txt for further details.
+			See Documentation/admin-guide/kdump/kdump.rst for further details.
 
 	crashkernel=range1:size1[,range2:size2,...][@offset]
 			[KNL] Same as above, but depends on the memory
 			in the running system. The syntax of range is
 			start-[end] where start and end are both
 			a memory unit (amount[KMG]). See also
-			Documentation/kdump/kdump.txt for an example.
+			Documentation/admin-guide/kdump/kdump.rst for an example.
 
 	crashkernel=size[KMG],high
 			[KNL, x86_64] range could be above 4G. Allow kernel
@@ -805,12 +805,10 @@
 			tracking down these problems.
 
 	debug_pagealloc=
-			[KNL] When CONFIG_DEBUG_PAGEALLOC is set, this
-			parameter enables the feature at boot time. In
-			default, it is disabled. We can avoid allocating huge
-			chunk of memory for debug pagealloc if we don't enable
-			it at boot time and the system will work mostly same
-			with the kernel built without CONFIG_DEBUG_PAGEALLOC.
+			[KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter
+			enables the feature at boot time. By default, it is
+			disabled and the system will work mostly the same as a
+			kernel built without CONFIG_DEBUG_PAGEALLOC.
 			on: enable the feature
 
 	debugpat	[X86] Enable PAT debugging
@@ -932,7 +930,7 @@
 			edid/1680x1050.bin, or edid/1920x1080.bin is given
 			and no file with the same name exists. Details and
 			instructions how to build your own EDID data are
-			available in Documentation/EDID/HOWTO.txt. An EDID
+			available in Documentation/driver-api/edid.rst. An EDID
 			data set will only be used for a particular connector,
 			if its name and a colon are prepended to the EDID
 			name. Each connector may use a unique EDID data
@@ -963,7 +961,7 @@
 			for details.
 
 	nompx		[X86] Disables Intel Memory Protection Extensions.
-			See Documentation/x86/intel_mpx.txt for more
+			See Documentation/x86/intel_mpx.rst for more
 			information about the feature.
 
 	nopku		[X86] Disable Memory Protection Keys CPU feature found
@@ -1189,7 +1187,7 @@
 			that is to be dynamically loaded by Linux. If there are
 			multiple variables with the same name but with different
 			vendor GUIDs, all of them will be loaded. See
-			Documentation/acpi/ssdt-overlays.txt for details.
+			Documentation/admin-guide/acpi/ssdt-overlays.rst for details.
 
 
 	eisa_irq_edge=	[PARISC,HW]
@@ -1201,15 +1199,15 @@
 
 	elevator=	[IOSCHED]
 			Format: { "mq-deadline" | "kyber" | "bfq" }
-			See Documentation/block/deadline-iosched.txt,
-			Documentation/block/kyber-iosched.txt and
-			Documentation/block/bfq-iosched.txt for details.
+			See Documentation/block/deadline-iosched.rst,
+			Documentation/block/kyber-iosched.rst and
+			Documentation/block/bfq-iosched.rst for details.
 
 	elfcorehdr=[size[KMG]@]offset[KMG] [IA64,PPC,SH,X86,S390]
 			Specifies physical address of start of kernel core
 			image elf header and optionally the size. Generally
 			kexec loader will pass this option to capture kernel.
-			See Documentation/kdump/kdump.txt for details.
+			See Documentation/admin-guide/kdump/kdump.rst for details.
 
 	enable_mtrr_cleanup [X86]
 			The kernel tries to adjust MTRR layout from continuous
@@ -1251,7 +1249,7 @@
 			See also Documentation/fault-injection/.
 
 	floppy=		[HW]
-			See Documentation/blockdev/floppy.txt.
+			See Documentation/admin-guide/blockdev/floppy.rst.
 
 	force_pal_cache_flush
 			[IA-64] Avoid check_sal_cache_flush which may hang on
@@ -1388,9 +1386,6 @@
 			Valid parameters: "on", "off"
 			Default: "on"
 
-	hisax=		[HW,ISDN]
-			See Documentation/isdn/README.HiSax.
-
 	hlt		[BUGS=ARM,SH]
 
 	hpet=		[X86-32,HPET] option to control HPET usage
@@ -1507,7 +1502,7 @@
 			Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc
 			.vlb_clock .pci_clock .noflush .nohpa .noprobe .nowerr
 			.cdrom .chs .ignore_cable are additional options
-			See Documentation/ide/ide.txt.
+			See Documentation/ide/ide.rst.
 
 	ide-generic.probe-mask= [HW] (E)IDE subsystem
 			Format: <int>
@@ -1673,6 +1668,15 @@
 
 	initrd=		[BOOT] Specify the location of the initial ramdisk
 
+	init_on_alloc=	[MM] Fill newly allocated pages and heap objects with
+			zeroes.
+			Format: 0 | 1
+			Default set by CONFIG_INIT_ON_ALLOC_DEFAULT_ON.
+
+	init_on_free=	[MM] Fill freed pages and heap objects with zeroes.
+			Format: 0 | 1
+			Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON.
+
 	init_pkru=	[x86] Specify the default memory protection keys rights
 			register contents for all processes.  0x55555554 by
 			default (disallow access to all but pkey 0).  Can
@@ -2007,6 +2011,19 @@
 			Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
 			the default is off.
 
+	kprobe_event=[probe-list]
+			[FTRACE] Add kprobe events and enable at boot time.
+			The probe-list is a semicolon delimited list of probe
+			definitions. Each definition is same as kprobe_events
+			interface, but the parameters are comma delimited.
+			For example, to add a kprobe event on vfs_read with
+			arg1 and arg2, add to the command line;
+
+			      kprobe_event=p,vfs_read,$arg1,$arg2
+
+			See also Documentation/trace/kprobetrace.rst "Kernel
+			Boot Parameter" section.
+
 	kpti=		[ARM64] Control page table isolation of user
 			and kernel address spaces.
 			Default: enabled on cores which need mitigation.
@@ -2230,7 +2247,7 @@
 	memblock=debug	[KNL] Enable memblock debug messages.
 
 	load_ramdisk=	[RAM] List of ramdisks to load from floppy
-			See Documentation/blockdev/ramdisk.txt.
+			See Documentation/admin-guide/blockdev/ramdisk.rst.
 
 	lockd.nlm_grace_period=P  [NFS] Assign grace period.
 			Format: <integer>
@@ -2383,7 +2400,7 @@
 
 	mce		[X86-32] Machine Check Exception
 
-	mce=option	[X86-64] See Documentation/x86/x86_64/boot-options.txt
+	mce=option	[X86-64] See Documentation/x86/x86_64/boot-options.rst
 
 	md=		[HW] RAID subsystems devices and level
 			See Documentation/admin-guide/md.rst.
@@ -2439,7 +2456,7 @@
 			set according to the
 			CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
 			option.
-			See Documentation/memory-hotplug.txt.
+			See Documentation/admin-guide/mm/memory-hotplug.rst.
 
 	memmap=exactmap	[KNL,X86] Enable setting of an exact
 			E820 memory map, as specified by the user.
@@ -2528,7 +2545,7 @@
 			mem_encrypt=on:		Activate SME
 			mem_encrypt=off:	Do not activate SME
 
-			Refer to Documentation/x86/amd-memory-encryption.txt
+			Refer to Documentation/virt/kvm/amd-memory-encryption.rst
 			for details on when memory encryption can be activated.
 
 	mem_sleep_default=	[SUSPEND] Default system suspend mode:
@@ -2836,8 +2853,9 @@
 			0 - turn hardlockup detector in nmi_watchdog off
 			1 - turn hardlockup detector in nmi_watchdog on
 			When panic is specified, panic when an NMI watchdog
-			timeout occurs (or 'nopanic' to override the opposite
-			default). To disable both hard and soft lockup detectors,
+			timeout occurs (or 'nopanic' to not panic on an NMI
+			watchdog, if CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is set)
+			To disable both hard and soft lockup detectors,
 			please see 'nowatchdog'.
 			This is useful when you use a panic=... timeout and
 			need the box quickly up again.
@@ -2872,6 +2890,17 @@
 			/sys/module/printk/parameters/console_suspend) to
 			turn on/off it dynamically.
 
+	novmcoredd	[KNL,KDUMP]
+			Disable device dump. Device dump allows drivers to
+			append dump data to vmcore so you can collect driver
+			specified debug info.  Drivers can append the data
+			without any limit and this data is stored in memory,
+			so this may cause significant memory stress.  Disabling
+			device dump can help save memory but the driver debug
+			data will be no longer available.  This parameter
+			is only available when CONFIG_PROC_VMCORE_DEVICE_DUMP
+			is set.
+
 	noaliencache	[MM, NUMA, SLAB] Disables the allocation of alien
 			caches in the slab allocator.  Saves per-node memory,
 			but will impact performance.
@@ -2927,7 +2956,7 @@
 			register save and restore. The kernel will only save
 			legacy floating-point registers on task switch.
 
-	nohugeiomap	[KNL,x86] Disable kernel huge I/O mappings.
+	nohugeiomap	[KNL,x86,PPC] Disable kernel huge I/O mappings.
 
 	nosmt		[KNL,S390] Disable symmetric multithreading (SMT).
 			Equivalent to smt=1.
@@ -3139,7 +3168,7 @@
 	numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
 			'node', 'default' can be specified
 			This can be set from sysctl after boot.
-			See Documentation/sysctl/vm.txt for details.
+			See Documentation/admin-guide/sysctl/vm.rst for details.
 
 	ohci1394_dma=early	[HW] enable debugging via the ohci1394 driver.
 			See Documentation/debugging-via-ohci1394.txt for more
@@ -3263,7 +3292,7 @@
 
 	pcd.		[PARIDE]
 			See header of drivers/block/paride/pcd.c.
-			See also Documentation/blockdev/paride.txt.
+			See also Documentation/admin-guide/blockdev/paride.rst.
 
 	pci=option[,option...]	[PCI] various PCI subsystem options.
 
@@ -3507,7 +3536,7 @@
 			needed on a platform with proper driver support.
 
 	pd.		[PARIDE]
-			See Documentation/blockdev/paride.txt.
+			See Documentation/admin-guide/blockdev/paride.rst.
 
 	pdcchassis=	[PARISC,HW] Disable/Enable PDC Chassis Status codes at
 			boot time.
@@ -3522,13 +3551,13 @@
 			and performance comparison.
 
 	pf.		[PARIDE]
-			See Documentation/blockdev/paride.txt.
+			See Documentation/admin-guide/blockdev/paride.rst.
 
 	pg.		[PARIDE]
-			See Documentation/blockdev/paride.txt.
+			See Documentation/admin-guide/blockdev/paride.rst.
 
 	pirq=		[SMP,APIC] Manual mp-table setup
-			See Documentation/x86/i386/IO-APIC.txt.
+			See Documentation/x86/i386/IO-APIC.rst.
 
 	plip=		[PPT,NET] Parallel port network link
 			Format: { parport<nr> | timid | 0 }
@@ -3637,7 +3666,7 @@
 
 	prompt_ramdisk=	[RAM] List of RAM disks to prompt for floppy disk
 			before loading.
-			See Documentation/blockdev/ramdisk.txt.
+			See Documentation/admin-guide/blockdev/ramdisk.rst.
 
 	psi=		[KNL] Enable or disable pressure stall information
 			tracking.
@@ -3659,7 +3688,7 @@
 	pstore.backend=	Specify the name of the pstore backend to use
 
 	pt.		[PARIDE]
-			See Documentation/blockdev/paride.txt.
+			See Documentation/admin-guide/blockdev/paride.rst.
 
 	pti=		[X86_64] Control Page Table Isolation of user and
 			kernel address spaces.  Disabling this feature
@@ -3688,7 +3717,7 @@
 			See Documentation/admin-guide/md.rst.
 
 	ramdisk_size=	[RAM] Sizes of RAM disks in kilobytes
-			See Documentation/blockdev/ramdisk.txt.
+			See Documentation/admin-guide/blockdev/ramdisk.rst.
 
 	random.trust_cpu={on,off}
 			[KNL] Enable or disable trusting the use of the
@@ -3752,6 +3781,12 @@
 			the propagation of recent CPU-hotplug changes up
 			the rcu_node combining tree.
 
+	rcutree.use_softirq=	[KNL]
+			If set to zero, move all RCU_SOFTIRQ processing to
+			per-CPU rcuc kthreads.  Defaults to a non-zero
+			value, meaning that RCU_SOFTIRQ is used by default.
+			Specify rcutree.use_softirq=0 to use rcuc kthreads.
+
 	rcutree.rcu_fanout_exact= [KNL]
 			Disable autobalancing of the rcu_node combining
 			tree.  This is used by rcutorture, and might
@@ -4078,7 +4113,7 @@
 
 	relax_domain_level=
 			[KNL, SMP] Set scheduler's default relax_domain_level.
-			See Documentation/cgroup-v1/cpusets.txt.
+			See Documentation/admin-guide/cgroup-v1/cpusets.rst.
 
 	reserve=	[KNL,BUGS] Force kernel to ignore I/O ports or memory
 			Format: <base1>,<size1>[,<base2>,<size2>,...]
@@ -4108,7 +4143,7 @@
 			Specify the offset from the beginning of the partition
 			given by "resume=" at which the swap header is located,
 			in <PAGE_SIZE> units (needed only for swap files).
-			See  Documentation/power/swsusp-and-swap-files.txt
+			See  Documentation/power/swsusp-and-swap-files.rst
 
 	resumedelay=	[HIBERNATION] Delay (in seconds) to pause before attempting to
 			read the resume files
@@ -4336,7 +4371,7 @@
 			Format: <integer>
 
 	sonypi.*=	[HW] Sony Programmable I/O Control Device driver
-			See Documentation/laptops/sonypi.txt
+			See Documentation/admin-guide/laptops/sonypi.rst
 
 	spectre_v2=	[X86] Control mitigation of Spectre variant 2
 			(indirect branch speculation) vulnerability.
@@ -4588,7 +4623,7 @@
 	swapaccount=[0|1]
 			[KNL] Enable accounting of swap in memory resource
 			controller if no parameter or 1 is given or disable
-			it if 0 is given (See Documentation/cgroup-v1/memory.txt)
+			it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
 
 	swiotlb=	[ARM,IA-64,PPC,MIPS,X86]
 			Format: { <int> | force | noforce }
@@ -4663,27 +4698,6 @@
 			Force threading of all interrupt handlers except those
 			marked explicitly IRQF_NO_THREAD.
 
-	tmem		[KNL,XEN]
-			Enable the Transcendent memory driver if built-in.
-
-	tmem.cleancache=0|1 [KNL, XEN]
-			Default is on (1). Disable the usage of the cleancache
-			API to send anonymous pages to the hypervisor.
-
-	tmem.frontswap=0|1 [KNL, XEN]
-			Default is on (1). Disable the usage of the frontswap
-			API to send swap pages to the hypervisor. If disabled
-			the selfballooning and selfshrinking are force disabled.
-
-	tmem.selfballooning=0|1 [KNL, XEN]
-			Default is on (1). Disable the driving of swap pages
-			to the hypervisor.
-
-	tmem.selfshrinking=0|1 [KNL, XEN]
-			Default is on (1). Partial swapoff that immediately
-			transfers pages from Xen hypervisor back to the
-			kernel based on different criteria.
-
 	topology=	[S390]
 			Format: {off | on}
 			Specify if the kernel should make use of the cpu
@@ -5026,7 +5040,7 @@
 			vector=percpu: enable percpu vector domain
 
 	video=		[FB] Frame buffer configuration
-			See Documentation/fb/modedb.txt.
+			See Documentation/fb/modedb.rst.
 
 	video.brightness_switch_enabled= [0,1]
 			If set to 1, on receiving an ACPI notify event
@@ -5054,8 +5068,8 @@
 			Can be used multiple times for multiple devices.
 
 	vga=		[BOOT,X86-32] Select a particular video mode
-			See Documentation/x86/boot.txt and
-			Documentation/svga.txt.
+			See Documentation/x86/boot.rst and
+			Documentation/admin-guide/svga.rst.
 			Use vga=ask for menu.
 			This is actually a boot loader parameter; the value is
 			passed to the kernel using a special protocol.
@@ -5100,13 +5114,12 @@
 			targets for exploits that can control RIP.
 
 			emulate     [default] Vsyscalls turn into traps and are
-			            emulated reasonably safely.
+			            emulated reasonably safely.  The vsyscall
+				    page is readable.
 
-			native      Vsyscalls are native syscall instructions.
-			            This is a little bit faster than trapping
-			            and makes a few dynamic recompilers work
-			            better than they would in emulation mode.
-			            It also makes exploits much easier to write.
+			xonly       Vsyscalls turn into traps and are
+			            emulated reasonably safely.  The vsyscall
+				    page is not readable.
 
 			none        Vsyscalls don't work at all.  This makes
 			            them quite hard to use for exploits but
@@ -5162,7 +5175,7 @@
 			Default: 3 = cyan.
 
 	watchdog timers	[HW,WDT] For information on watchdog timers,
-			see Documentation/watchdog/watchdog-parameters.txt
+			see Documentation/watchdog/watchdog-parameters.rst
 			or other driver-specific files in the
 			Documentation/watchdog/ directory.
 
@@ -5254,6 +5267,8 @@
 	xen_nopv	[X86]
 			Disables the PV optimizations forcing the HVM guest to
 			run as generic HVM guest with no PV drivers.
+			This option is obsoleted by the "nopv" option, which
+			has equivalent effect for XEN platform.
 
 	xen_scrub_pages=	[XEN]
 			Boolean option to control scrubbing pages before giving them back
@@ -5268,10 +5283,24 @@
 			improve timer resolution at the expense of processing
 			more timer interrupts.
 
+	nopv=		[X86,XEN,KVM,HYPER_V,VMWARE]
+			Disables the PV optimizations forcing the guest to run
+			as generic guest with no PV drivers. Currently support
+			XEN HVM, KVM, HYPER_V and VMWARE guest.
+
 	xirc2ps_cs=	[NET,PCMCIA]
 			Format:
 			<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
 
+	xive=		[PPC]
+			By default on POWER9 and above, the kernel will
+			natively use the XIVE interrupt controller. This option
+			allows the fallback firmware mode to be used:
+
+			off       Fallback to firmware control of XIVE interrupt
+				  controller on both pseries and powernv
+				  platforms. Only useful on POWER9 and above.
+
 	xhci-hcd.quirks		[USB,KNL]
 			A hex value specifying bitmask with supplemental xhci
 			host controller quirks. Meaning of each bit can be
diff --git a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
new file mode 100644
index 000000000000..4f18456dd3b1
--- /dev/null
+++ b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
@@ -0,0 +1,356 @@
+==========================================
+Reducing OS jitter due to per-cpu kthreads
+==========================================
+
+This document lists per-CPU kthreads in the Linux kernel and presents
+options to control their OS jitter.  Note that non-per-CPU kthreads are
+not listed here.  To reduce OS jitter from non-per-CPU kthreads, bind
+them to a "housekeeping" CPU dedicated to such work.
+
+References
+==========
+
+-	Documentation/IRQ-affinity.txt:  Binding interrupts to sets of CPUs.
+
+-	Documentation/admin-guide/cgroup-v1:  Using cgroups to bind tasks to sets of CPUs.
+
+-	man taskset:  Using the taskset command to bind tasks to sets
+	of CPUs.
+
+-	man sched_setaffinity:  Using the sched_setaffinity() system
+	call to bind tasks to sets of CPUs.
+
+-	/sys/devices/system/cpu/cpuN/online:  Control CPU N's hotplug state,
+	writing "0" to offline and "1" to online.
+
+-	In order to locate kernel-generated OS jitter on CPU N:
+
+		cd /sys/kernel/debug/tracing
+		echo 1 > max_graph_depth # Increase the "1" for more detail
+		echo function_graph > current_tracer
+		# run workload
+		cat per_cpu/cpuN/trace
+
+kthreads
+========
+
+Name:
+  ehca_comp/%u
+
+Purpose:
+  Periodically process Infiniband-related work.
+
+To reduce its OS jitter, do any of the following:
+
+1.	Don't use eHCA Infiniband hardware, instead choosing hardware
+	that does not require per-CPU kthreads.  This will prevent these
+	kthreads from being created in the first place.  (This will
+	work for most people, as this hardware, though important, is
+	relatively old and is produced in relatively low unit volumes.)
+2.	Do all eHCA-Infiniband-related work on other CPUs, including
+	interrupts.
+3.	Rework the eHCA driver so that its per-CPU kthreads are
+	provisioned only on selected CPUs.
+
+
+Name:
+  irq/%d-%s
+
+Purpose:
+  Handle threaded interrupts.
+
+To reduce its OS jitter, do the following:
+
+1.	Use irq affinity to force the irq threads to execute on
+	some other CPU.
+
+Name:
+  kcmtpd_ctr_%d
+
+Purpose:
+  Handle Bluetooth work.
+
+To reduce its OS jitter, do one of the following:
+
+1.	Don't use Bluetooth, in which case these kthreads won't be
+	created in the first place.
+2.	Use irq affinity to force Bluetooth-related interrupts to
+	occur on some other CPU and furthermore initiate all
+	Bluetooth activity on some other CPU.
+
+Name:
+  ksoftirqd/%u
+
+Purpose:
+  Execute softirq handlers when threaded or when under heavy load.
+
+To reduce its OS jitter, each softirq vector must be handled
+separately as follows:
+
+TIMER_SOFTIRQ
+-------------
+
+Do all of the following:
+
+1.	To the extent possible, keep the CPU out of the kernel when it
+	is non-idle, for example, by avoiding system calls and by forcing
+	both kernel threads and interrupts to execute elsewhere.
+2.	Build with CONFIG_HOTPLUG_CPU=y.  After boot completes, force
+	the CPU offline, then bring it back online.  This forces
+	recurring timers to migrate elsewhere.	If you are concerned
+	with multiple CPUs, force them all offline before bringing the
+	first one back online.  Once you have onlined the CPUs in question,
+	do not offline any other CPUs, because doing so could force the
+	timer back onto one of the CPUs in question.
+
+NET_TX_SOFTIRQ and NET_RX_SOFTIRQ
+---------------------------------
+
+Do all of the following:
+
+1.	Force networking interrupts onto other CPUs.
+2.	Initiate any network I/O on other CPUs.
+3.	Once your application has started, prevent CPU-hotplug operations
+	from being initiated from tasks that might run on the CPU to
+	be de-jittered.  (It is OK to force this CPU offline and then
+	bring it back online before you start your application.)
+
+BLOCK_SOFTIRQ
+-------------
+
+Do all of the following:
+
+1.	Force block-device interrupts onto some other CPU.
+2.	Initiate any block I/O on other CPUs.
+3.	Once your application has started, prevent CPU-hotplug operations
+	from being initiated from tasks that might run on the CPU to
+	be de-jittered.  (It is OK to force this CPU offline and then
+	bring it back online before you start your application.)
+
+IRQ_POLL_SOFTIRQ
+----------------
+
+Do all of the following:
+
+1.	Force block-device interrupts onto some other CPU.
+2.	Initiate any block I/O and block-I/O polling on other CPUs.
+3.	Once your application has started, prevent CPU-hotplug operations
+	from being initiated from tasks that might run on the CPU to
+	be de-jittered.  (It is OK to force this CPU offline and then
+	bring it back online before you start your application.)
+
+TASKLET_SOFTIRQ
+---------------
+
+Do one or more of the following:
+
+1.	Avoid use of drivers that use tasklets.  (Such drivers will contain
+	calls to things like tasklet_schedule().)
+2.	Convert all drivers that you must use from tasklets to workqueues.
+3.	Force interrupts for drivers using tasklets onto other CPUs,
+	and also do I/O involving these drivers on other CPUs.
+
+SCHED_SOFTIRQ
+-------------
+
+Do all of the following:
+
+1.	Avoid sending scheduler IPIs to the CPU to be de-jittered,
+	for example, ensure that at most one runnable kthread is present
+	on that CPU.  If a thread that expects to run on the de-jittered
+	CPU awakens, the scheduler will send an IPI that can result in
+	a subsequent SCHED_SOFTIRQ.
+2.	CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered
+	is marked as an adaptive-ticks CPU using the "nohz_full="
+	boot parameter.  This reduces the number of scheduler-clock
+	interrupts that the de-jittered CPU receives, minimizing its
+	chances of being selected to do the load balancing work that
+	runs in SCHED_SOFTIRQ context.
+3.	To the extent possible, keep the CPU out of the kernel when it
+	is non-idle, for example, by avoiding system calls and by
+	forcing both kernel threads and interrupts to execute elsewhere.
+	This further reduces the number of scheduler-clock interrupts
+	received by the de-jittered CPU.
+
+HRTIMER_SOFTIRQ
+---------------
+
+Do all of the following:
+
+1.	To the extent possible, keep the CPU out of the kernel when it
+	is non-idle.  For example, avoid system calls and force both
+	kernel threads and interrupts to execute elsewhere.
+2.	Build with CONFIG_HOTPLUG_CPU=y.  Once boot completes, force the
+	CPU offline, then bring it back online.  This forces recurring
+	timers to migrate elsewhere.  If you are concerned with multiple
+	CPUs, force them all offline before bringing the first one
+	back online.  Once you have onlined the CPUs in question, do not
+	offline any other CPUs, because doing so could force the timer
+	back onto one of the CPUs in question.
+
+RCU_SOFTIRQ
+-----------
+
+Do at least one of the following:
+
+1.	Offload callbacks and keep the CPU in either dyntick-idle or
+	adaptive-ticks state by doing all of the following:
+
+	a.	CONFIG_NO_HZ_FULL=y and ensure that the CPU to be
+		de-jittered is marked as an adaptive-ticks CPU using the
+		"nohz_full=" boot parameter.  Bind the rcuo kthreads to
+		housekeeping CPUs, which can tolerate OS jitter.
+	b.	To the extent possible, keep the CPU out of the kernel
+		when it is non-idle, for example, by avoiding system
+		calls and by forcing both kernel threads and interrupts
+		to execute elsewhere.
+
+2.	Enable RCU to do its processing remotely via dyntick-idle by
+	doing all of the following:
+
+	a.	Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
+	b.	Ensure that the CPU goes idle frequently, allowing other
+		CPUs to detect that it has passed through an RCU quiescent
+		state.	If the kernel is built with CONFIG_NO_HZ_FULL=y,
+		userspace execution also allows other CPUs to detect that
+		the CPU in question has passed through a quiescent state.
+	c.	To the extent possible, keep the CPU out of the kernel
+		when it is non-idle, for example, by avoiding system
+		calls and by forcing both kernel threads and interrupts
+		to execute elsewhere.
+
+Name:
+  kworker/%u:%d%s (cpu, id, priority)
+
+Purpose:
+  Execute workqueue requests
+
+To reduce its OS jitter, do any of the following:
+
+1.	Run your workload at a real-time priority, which will allow
+	preempting the kworker daemons.
+2.	A given workqueue can be made visible in the sysfs filesystem
+	by passing the WQ_SYSFS to that workqueue's alloc_workqueue().
+	Such a workqueue can be confined to a given subset of the
+	CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs
+	files.	The set of WQ_SYSFS workqueues can be displayed using
+	"ls sys/devices/virtual/workqueue".  That said, the workqueues
+	maintainer would like to caution people against indiscriminately
+	sprinkling WQ_SYSFS across all the workqueues.	The reason for
+	caution is that it is easy to add WQ_SYSFS, but because sysfs is
+	part of the formal user/kernel API, it can be nearly impossible
+	to remove it, even if its addition was a mistake.
+3.	Do any of the following needed to avoid jitter that your
+	application cannot tolerate:
+
+	a.	Build your kernel with CONFIG_SLUB=y rather than
+		CONFIG_SLAB=y, thus avoiding the slab allocator's periodic
+		use of each CPU's workqueues to run its cache_reap()
+		function.
+	b.	Avoid using oprofile, thus avoiding OS jitter from
+		wq_sync_buffer().
+	c.	Limit your CPU frequency so that a CPU-frequency
+		governor is not required, possibly enlisting the aid of
+		special heatsinks or other cooling technologies.  If done
+		correctly, and if you CPU architecture permits, you should
+		be able to build your kernel with CONFIG_CPU_FREQ=n to
+		avoid the CPU-frequency governor periodically running
+		on each CPU, including cs_dbs_timer() and od_dbs_timer().
+
+		WARNING:  Please check your CPU specifications to
+		make sure that this is safe on your particular system.
+	d.	As of v3.18, Christoph Lameter's on-demand vmstat workers
+		commit prevents OS jitter due to vmstat_update() on
+		CONFIG_SMP=y systems.  Before v3.18, is not possible
+		to entirely get rid of the OS jitter, but you can
+		decrease its frequency by writing a large value to
+		/proc/sys/vm/stat_interval.  The default value is HZ,
+		for an interval of one second.	Of course, larger values
+		will make your virtual-memory statistics update more
+		slowly.  Of course, you can also run your workload at
+		a real-time priority, thus preempting vmstat_update(),
+		but if your workload is CPU-bound, this is a bad idea.
+		However, there is an RFC patch from Christoph Lameter
+		(based on an earlier one from Gilad Ben-Yossef) that
+		reduces or even eliminates vmstat overhead for some
+		workloads at https://lkml.org/lkml/2013/9/4/379.
+	e.	Boot with "elevator=noop" to avoid workqueue use by
+		the block layer.
+	f.	If running on high-end powerpc servers, build with
+		CONFIG_PPC_RTAS_DAEMON=n.  This prevents the RTAS
+		daemon from running on each CPU every second or so.
+		(This will require editing Kconfig files and will defeat
+		this platform's RAS functionality.)  This avoids jitter
+		due to the rtas_event_scan() function.
+		WARNING:  Please check your CPU specifications to
+		make sure that this is safe on your particular system.
+	g.	If running on Cell Processor, build your kernel with
+		CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from
+		spu_gov_work().
+		WARNING:  Please check your CPU specifications to
+		make sure that this is safe on your particular system.
+	h.	If running on PowerMAC, build your kernel with
+		CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
+		avoiding OS jitter from rackmeter_do_timer().
+
+Name:
+  rcuc/%u
+
+Purpose:
+  Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
+
+To reduce its OS jitter, do at least one of the following:
+
+1.	Build the kernel with CONFIG_PREEMPT=n.  This prevents these
+	kthreads from being created in the first place, and also obviates
+	the need for RCU priority boosting.  This approach is feasible
+	for workloads that do not require high degrees of responsiveness.
+2.	Build the kernel with CONFIG_RCU_BOOST=n.  This prevents these
+	kthreads from being created in the first place.  This approach
+	is feasible only if your workload never requires RCU priority
+	boosting, for example, if you ensure frequent idle time on all
+	CPUs that might execute within the kernel.
+3.	Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs=
+	boot parameter offloading RCU callbacks from all CPUs susceptible
+	to OS jitter.  This approach prevents the rcuc/%u kthreads from
+	having any work to do, so that they are never awakened.
+4.	Ensure that the CPU never enters the kernel, and, in particular,
+	avoid initiating any CPU hotplug operations on this CPU.  This is
+	another way of preventing any callbacks from being queued on the
+	CPU, again preventing the rcuc/%u kthreads from having any work
+	to do.
+
+Name:
+  rcuop/%d and rcuos/%d
+
+Purpose:
+  Offload RCU callbacks from the corresponding CPU.
+
+To reduce its OS jitter, do at least one of the following:
+
+1.	Use affinity, cgroups, or other mechanism to force these kthreads
+	to execute on some other CPU.
+2.	Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these
+	kthreads from being created in the first place.  However, please
+	note that this will not eliminate OS jitter, but will instead
+	shift it to RCU_SOFTIRQ.
+
+Name:
+  watchdog/%u
+
+Purpose:
+  Detect software lockups on each CPU.
+
+To reduce its OS jitter, do at least one of the following:
+
+1.	Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
+	kthreads from being created in the first place.
+2.	Boot with "nosoftlockup=0", which will also prevent these kthreads
+	from being created.  Other related watchdog and softlockup boot
+	parameters may be found in Documentation/admin-guide/kernel-parameters.rst
+	and Documentation/watchdog/watchdog-parameters.rst.
+3.	Echo a zero to /proc/sys/kernel/watchdog to disable the
+	watchdog timer.
+4.	Echo a large number of /proc/sys/kernel/watchdog_thresh in
+	order to reduce the frequency of OS jitter due to the watchdog
+	timer down to a level that is acceptable for your workload.
diff --git a/Documentation/admin-guide/laptops/asus-laptop.rst b/Documentation/admin-guide/laptops/asus-laptop.rst
new file mode 100644
index 000000000000..95176321a25a
--- /dev/null
+++ b/Documentation/admin-guide/laptops/asus-laptop.rst
@@ -0,0 +1,271 @@
+==================
+Asus Laptop Extras
+==================
+
+Version 0.1
+
+August 6, 2009
+
+Corentin Chary <corentincj@iksaif.net>
+http://acpi4asus.sf.net/
+
+ This driver provides support for extra features of ACPI-compatible ASUS laptops.
+ It may also support some MEDION, JVC or VICTOR laptops (such as MEDION 9675 or
+ VICTOR XP7210 for example). It makes all the extra buttons generate input
+ events (like keyboards).
+
+ On some models adds support for changing the display brightness and output,
+ switching the LCD backlight on and off, and most importantly, allows you to
+ blink those fancy LEDs intended for reporting mail and wireless status.
+
+This driver supersedes the old asus_acpi driver.
+
+Requirements
+------------
+
+  Kernel 2.6.X sources, configured for your computer, with ACPI support.
+  You also need CONFIG_INPUT and CONFIG_ACPI.
+
+Status
+------
+
+ The features currently supported are the following (see below for
+ detailed description):
+
+ - Fn key combinations
+ - Bluetooth enable and disable
+ - Wlan enable and disable
+ - GPS enable and disable
+ - Video output switching
+ - Ambient Light Sensor on and off
+ - LED control
+ - LED Display control
+ - LCD brightness control
+ - LCD on and off
+
+ A compatibility table by model and feature is maintained on the web
+ site, http://acpi4asus.sf.net/.
+
+Usage
+-----
+
+  Try "modprobe asus-laptop". Check your dmesg (simply type dmesg). You should
+  see some lines like this :
+
+      Asus Laptop Extras version 0.42
+        - L2D model detected.
+
+  If it is not the output you have on your laptop, send it (and the laptop's
+  DSDT) to me.
+
+  That's all, now, all the events generated by the hotkeys of your laptop
+  should be reported via netlink events. You can check with
+  "acpi_genl monitor" (part of the acpica project).
+
+  Hotkeys are also reported as input keys (like keyboards) you can check
+  which key are supported using "xev" under X11.
+
+  You can get information on the version of your DSDT table by reading the
+  /sys/devices/platform/asus-laptop/infos entry. If you have a question or a
+  bug report to do, please include the output of this entry.
+
+LEDs
+----
+
+  You can modify LEDs be echoing values to `/sys/class/leds/asus/*/brightness`::
+
+    echo 1 >  /sys/class/leds/asus::mail/brightness
+
+  will switch the mail LED on.
+
+  You can also know if they are on/off by reading their content and use
+  kernel triggers like disk-activity or heartbeat.
+
+Backlight
+---------
+
+  You can control lcd backlight power and brightness with
+  /sys/class/backlight/asus-laptop/. Brightness Values are between 0 and 15.
+
+Wireless devices
+----------------
+
+  You can turn the internal Bluetooth adapter on/off with the bluetooth entry
+  (only on models with Bluetooth). This usually controls the associated LED.
+  Same for Wlan adapter.
+
+Display switching
+-----------------
+
+  Note: the display switching code is currently considered EXPERIMENTAL.
+
+  Switching works for the following models:
+
+    - L3800C
+    - A2500H
+    - L5800C
+    - M5200N
+    - W1000N (albeit with some glitches)
+    - M6700R
+    - A6JC
+    - F3J
+
+  Switching doesn't work for the following:
+
+    - M3700N
+    - L2X00D (locks the laptop under certain conditions)
+
+  To switch the displays, echo values from 0 to 15 to
+  /sys/devices/platform/asus-laptop/display. The significance of those values
+  is as follows:
+
+  +-------+-----+-----+-----+-----+-----+
+  | Bin   | Val | DVI | TV  | CRT | LCD |
+  +-------+-----+-----+-----+-----+-----+
+  | 0000  |   0 |     |     |     |     |
+  +-------+-----+-----+-----+-----+-----+
+  | 0001  |   1 |     |     |     |  X  |
+  +-------+-----+-----+-----+-----+-----+
+  | 0010  |   2 |     |     |  X  |     |
+  +-------+-----+-----+-----+-----+-----+
+  | 0011  |   3 |     |     |  X  |  X  |
+  +-------+-----+-----+-----+-----+-----+
+  | 0100  |   4 |     |  X  |     |     |
+  +-------+-----+-----+-----+-----+-----+
+  | 0101  |   5 |     |  X  |     | X   |
+  +-------+-----+-----+-----+-----+-----+
+  | 0110  |   6 |     |  X  |  X  |     |
+  +-------+-----+-----+-----+-----+-----+
+  | 0111  |   7 |     |  X  |  X  |  X  |
+  +-------+-----+-----+-----+-----+-----+
+  | 1000  |   8 |  X  |     |     |     |
+  +-------+-----+-----+-----+-----+-----+
+  | 1001  |   9 |  X  |     |     |  X  |
+  +-------+-----+-----+-----+-----+-----+
+  | 1010  |  10 |  X  |     |  X  |     |
+  +-------+-----+-----+-----+-----+-----+
+  | 1011  |  11 |  X  |     |  X  |  X  |
+  +-------+-----+-----+-----+-----+-----+
+  | 1100  |  12 |  X  |  X  |     |     |
+  +-------+-----+-----+-----+-----+-----+
+  | 1101  |  13 |  X  |  X  |     |  X  |
+  +-------+-----+-----+-----+-----+-----+
+  | 1110  |  14 |  X  |  X  |  X  |     |
+  +-------+-----+-----+-----+-----+-----+
+  | 1111  |  15 |  X  |  X  |  X  |  X  |
+  +-------+-----+-----+-----+-----+-----+
+
+  In most cases, the appropriate displays must be plugged in for the above
+  combinations to work. TV-Out may need to be initialized at boot time.
+
+  Debugging:
+
+  1) Check whether the Fn+F8 key:
+
+     a) does not lock the laptop (try a boot with noapic / nolapic if it does)
+     b) generates events (0x6n, where n is the value corresponding to the
+        configuration above)
+     c) actually works
+
+     Record the disp value at every configuration.
+  2) Echo values from 0 to 15 to /sys/devices/platform/asus-laptop/display.
+     Record its value, note any change. If nothing changes, try a broader range,
+     up to 65535.
+  3) Send ANY output (both positive and negative reports are needed, unless your
+     machine is already listed above) to the acpi4asus-user mailing list.
+
+  Note: on some machines (e.g. L3C), after the module has been loaded, only 0x6n
+  events are generated and no actual switching occurs. In such a case, a line
+  like::
+
+    echo $((10#$arg-60)) > /sys/devices/platform/asus-laptop/display
+
+  will usually do the trick ($arg is the 0000006n-like event passed to acpid).
+
+  Note: there is currently no reliable way to read display status on xxN
+  (Centrino) models.
+
+LED display
+-----------
+
+  Some models like the W1N have a LED display that can be used to display
+  several items of information.
+
+  LED display works for the following models:
+
+    - W1000N
+    - W1J
+
+  To control the LED display, use the following::
+
+    echo 0x0T000DDD > /sys/devices/platform/asus-laptop/
+
+  where T control the 3 letters display, and DDD the 3 digits display,
+  according to the tables below::
+
+         DDD (digits)
+         000 to 999 = display digits
+         AAA        = ---
+         BBB to FFF = turn-off
+
+         T  (type)
+         0 = off
+         1 = dvd
+         2 = vcd
+         3 = mp3
+         4 = cd
+         5 = tv
+         6 = cpu
+         7 = vol
+
+  For example "echo 0x01000001 >/sys/devices/platform/asus-laptop/ledd"
+  would display "DVD001".
+
+Driver options
+--------------
+
+ Options can be passed to the asus-laptop driver using the standard
+ module argument syntax (<param>=<value> when passing the option to the
+ module or asus-laptop.<param>=<value> on the kernel boot line when
+ asus-laptop is statically linked into the kernel).
+
+	     wapf: WAPF defines the behavior of the Fn+Fx wlan key
+		   The significance of values is yet to be found, but
+		   most of the time:
+
+		   - 0x0 should do nothing
+		   - 0x1 should allow to control the device with Fn+Fx key.
+		   - 0x4 should send an ACPI event (0x88) while pressing the Fn+Fx key
+		   - 0x5 like 0x1 or 0x4
+
+ The default value is 0x1.
+
+Unsupported models
+------------------
+
+ These models will never be supported by this module, as they use a completely
+ different mechanism to handle LEDs and extra stuff (meaning we have no clue
+ how it works):
+
+ - ASUS A1300 (A1B), A1370D
+ - ASUS L7300G
+ - ASUS L8400
+
+Patches, Errors, Questions
+--------------------------
+
+ I appreciate any success or failure
+ reports, especially if they add to or correct the compatibility table.
+ Please include the following information in your report:
+
+ - Asus model name
+ - a copy of your ACPI tables, using the "acpidump" utility
+ - a copy of /sys/devices/platform/asus-laptop/infos
+ - which driver features work and which don't
+ - the observed behavior of non-working features
+
+ Any other comments or patches are also more than welcome.
+
+ acpi4asus-user@lists.sourceforge.net
+
+ http://sourceforge.net/projects/acpi4asus
diff --git a/Documentation/admin-guide/laptops/disk-shock-protection.rst b/Documentation/admin-guide/laptops/disk-shock-protection.rst
new file mode 100644
index 000000000000..e97c5f78d8c3
--- /dev/null
+++ b/Documentation/admin-guide/laptops/disk-shock-protection.rst
@@ -0,0 +1,151 @@
+==========================
+Hard disk shock protection
+==========================
+
+Author: Elias Oltmanns <eo@nebensachen.de>
+
+Last modified: 2008-10-03
+
+
+.. 0. Contents
+
+   1. Intro
+   2. The interface
+   3. References
+   4. CREDITS
+
+
+1. Intro
+--------
+
+ATA/ATAPI-7 specifies the IDLE IMMEDIATE command with unload feature.
+Issuing this command should cause the drive to switch to idle mode and
+unload disk heads. This feature is being used in modern laptops in
+conjunction with accelerometers and appropriate software to implement
+a shock protection facility. The idea is to stop all I/O operations on
+the internal hard drive and park its heads on the ramp when critical
+situations are anticipated. The desire to have such a feature
+available on GNU/Linux systems has been the original motivation to
+implement a generic disk head parking interface in the Linux kernel.
+Please note, however, that other components have to be set up on your
+system in order to get disk shock protection working (see
+section 3. References below for pointers to more information about
+that).
+
+
+2. The interface
+----------------
+
+For each ATA device, the kernel exports the file
+`block/*/device/unload_heads` in sysfs (here assumed to be mounted under
+/sys). Access to `/sys/block/*/device/unload_heads` is denied with
+-EOPNOTSUPP if the device does not support the unload feature.
+Otherwise, writing an integer value to this file will take the heads
+of the respective drive off the platter and block all I/O operations
+for the specified number of milliseconds. When the timeout expires and
+no further disk head park request has been issued in the meantime,
+normal operation will be resumed. The maximal value accepted for a
+timeout is 30000 milliseconds. Exceeding this limit will return
+-EOVERFLOW, but heads will be parked anyway and the timeout will be
+set to 30 seconds. However, you can always change a timeout to any
+value between 0 and 30000 by issuing a subsequent head park request
+before the timeout of the previous one has expired. In particular, the
+total timeout can exceed 30 seconds and, more importantly, you can
+cancel a previously set timeout and resume normal operation
+immediately by specifying a timeout of 0. Values below -2 are rejected
+with -EINVAL (see below for the special meaning of -1 and -2). If the
+timeout specified for a recent head park request has not yet expired,
+reading from `/sys/block/*/device/unload_heads` will report the number
+of milliseconds remaining until normal operation will be resumed;
+otherwise, reading the unload_heads attribute will return 0.
+
+For example, do the following in order to park the heads of drive
+/dev/sda and stop all I/O operations for five seconds::
+
+	# echo 5000 > /sys/block/sda/device/unload_heads
+
+A simple::
+
+	# cat /sys/block/sda/device/unload_heads
+
+will show you how many milliseconds are left before normal operation
+will be resumed.
+
+A word of caution: The fact that the interface operates on a basis of
+milliseconds may raise expectations that cannot be satisfied in
+reality. In fact, the ATA specs clearly state that the time for an
+unload operation to complete is vendor specific. The hint in ATA-7
+that this will typically be within 500 milliseconds apparently has
+been dropped in ATA-8.
+
+There is a technical detail of this implementation that may cause some
+confusion and should be discussed here. When a head park request has
+been issued to a device successfully, all I/O operations on the
+controller port this device is attached to will be deferred. That is
+to say, any other device that may be connected to the same port will
+be affected too. The only exception is that a subsequent head unload
+request to that other device will be executed immediately. Further
+operations on that port will be deferred until the timeout specified
+for either device on the port has expired. As far as PATA (old style
+IDE) configurations are concerned, there can only be two devices
+attached to any single port. In SATA world we have port multipliers
+which means that a user-issued head parking request to one device may
+actually result in stopping I/O to a whole bunch of devices. However,
+since this feature is supposed to be used on laptops and does not seem
+to be very useful in any other environment, there will be mostly one
+device per port. Even if the CD/DVD writer happens to be connected to
+the same port as the hard drive, it generally *should* recover just
+fine from the occasional buffer under-run incurred by a head park
+request to the HD. Actually, when you are using an ide driver rather
+than its libata counterpart (i.e. your disk is called /dev/hda
+instead of /dev/sda), then parking the heads of one drive (drive X)
+will generally not affect the mode of operation of another drive
+(drive Y) on the same port as described above. It is only when a port
+reset is required to recover from an exception on drive Y that further
+I/O operations on that drive (and the reset itself) will be delayed
+until drive X is no longer in the parked state.
+
+Finally, there are some hard drives that only comply with an earlier
+version of the ATA standard than ATA-7, but do support the unload
+feature nonetheless. Unfortunately, there is no safe way Linux can
+detect these devices, so you won't be able to write to the
+unload_heads attribute. If you know that your device really does
+support the unload feature (for instance, because the vendor of your
+laptop or the hard drive itself told you so), then you can tell the
+kernel to enable the usage of this feature for that drive by writing
+the special value -1 to the unload_heads attribute::
+
+	# echo -1 > /sys/block/sda/device/unload_heads
+
+will enable the feature for /dev/sda, and giving -2 instead of -1 will
+disable it again.
+
+
+3. References
+-------------
+
+There are several laptops from different vendors featuring shock
+protection capabilities. As manufacturers have refused to support open
+source development of the required software components so far, Linux
+support for shock protection varies considerably between different
+hardware implementations. Ideally, this section should contain a list
+of pointers at different projects aiming at an implementation of shock
+protection on different systems. Unfortunately, I only know of a
+single project which, although still considered experimental, is fit
+for use. Please feel free to add projects that have been the victims
+of my ignorance.
+
+- http://www.thinkwiki.org/wiki/HDAPS
+
+  See this page for information about Linux support of the hard disk
+  active protection system as implemented in IBM/Lenovo Thinkpads.
+
+
+4. CREDITS
+----------
+
+This implementation of disk head parking has been inspired by a patch
+originally published by Jon Escombe <lists@dresco.co.uk>. My efforts
+to develop an implementation of this feature that is fit to be merged
+into mainline have been aided by various kernel developers, in
+particular by Tejun Heo and Bartlomiej Zolnierkiewicz.
diff --git a/Documentation/admin-guide/laptops/index.rst b/Documentation/admin-guide/laptops/index.rst
new file mode 100644
index 000000000000..cd9a1c2695fd
--- /dev/null
+++ b/Documentation/admin-guide/laptops/index.rst
@@ -0,0 +1,17 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============
+Laptop Drivers
+==============
+
+.. toctree::
+   :maxdepth: 1
+
+   asus-laptop
+   disk-shock-protection
+   laptop-mode
+   lg-laptop
+   sony-laptop
+   sonypi
+   thinkpad-acpi
+   toshiba_haps
diff --git a/Documentation/admin-guide/laptops/laptop-mode.rst b/Documentation/admin-guide/laptops/laptop-mode.rst
new file mode 100644
index 000000000000..c984c4262f2e
--- /dev/null
+++ b/Documentation/admin-guide/laptops/laptop-mode.rst
@@ -0,0 +1,781 @@
+===============================================
+How to conserve battery power using laptop-mode
+===============================================
+
+Document Author: Bart Samwel (bart@samwel.tk)
+
+Date created: January 2, 2004
+
+Last modified: December 06, 2004
+
+Introduction
+------------
+
+Laptop mode is used to minimize the time that the hard disk needs to be spun up,
+to conserve battery power on laptops. It has been reported to cause significant
+power savings.
+
+.. Contents
+
+   * Introduction
+   * Installation
+   * Caveats
+   * The Details
+   * Tips & Tricks
+   * Control script
+   * ACPI integration
+   * Monitoring tool
+
+
+Installation
+------------
+
+To use laptop mode, you don't need to set any kernel configuration options
+or anything. Simply install all the files included in this document, and
+laptop mode will automatically be started when you're on battery. For
+your convenience, a tarball containing an installer can be downloaded at:
+
+	http://www.samwel.tk/laptop_mode/laptop_mode/
+
+To configure laptop mode, you need to edit the configuration file, which is
+located in /etc/default/laptop-mode on Debian-based systems, or in
+/etc/sysconfig/laptop-mode on other systems.
+
+Unfortunately, automatic enabling of laptop mode does not work for
+laptops that don't have ACPI. On those laptops, you need to start laptop
+mode manually. To start laptop mode, run "laptop_mode start", and to
+stop it, run "laptop_mode stop". (Note: The laptop mode tools package now
+has experimental support for APM, you might want to try that first.)
+
+
+Caveats
+-------
+
+* The downside of laptop mode is that you have a chance of losing up to 10
+  minutes of work. If you cannot afford this, don't use it! The supplied ACPI
+  scripts automatically turn off laptop mode when the battery almost runs out,
+  so that you won't lose any data at the end of your battery life.
+
+* Most desktop hard drives have a very limited lifetime measured in spindown
+  cycles, typically about 50.000 times (it's usually listed on the spec sheet).
+  Check your drive's rating, and don't wear down your drive's lifetime if you
+  don't need to.
+
+* If you mount some of your ext3/reiserfs filesystems with the -n option, then
+  the control script will not be able to remount them correctly. You must set
+  DO_REMOUNTS=0 in the control script, otherwise it will remount them with the
+  wrong options -- or it will fail because it cannot write to /etc/mtab.
+
+* If you have your filesystems listed as type "auto" in fstab, like I did, then
+  the control script will not recognize them as filesystems that need remounting.
+  You must list the filesystems with their true type instead.
+
+* It has been reported that some versions of the mutt mail client use file access
+  times to determine whether a folder contains new mail. If you use mutt and
+  experience this, you must disable the noatime remounting by setting the option
+  DO_REMOUNT_NOATIME to 0 in the configuration file.
+
+
+The Details
+-----------
+
+Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is
+present for all kernels that have the laptop mode patch, regardless of any
+configuration options. When the knob is set, any physical disk I/O (that might
+have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The
+result of this is that after a disk has spun down, it will not be spun up
+anymore to write dirty blocks, because those blocks had already been written
+immediately after the most recent read operation. The value of the laptop_mode
+knob determines the time between the occurrence of disk I/O and when the flush
+is triggered. A sensible value for the knob is 5 seconds. Setting the knob to
+0 disables laptop mode.
+
+To increase the effectiveness of the laptop_mode strategy, the laptop_mode
+control script increases dirty_expire_centisecs and dirty_writeback_centisecs in
+/proc/sys/vm to about 10 minutes (by default), which means that pages that are
+dirtied are not forced to be written to disk as often. The control script also
+changes the dirty background ratio, so that background writeback of dirty pages
+is not done anymore. Combined with a higher commit value (also 10 minutes) for
+ext3 or ReiserFS filesystems (also done automatically by the control script),
+this results in concentration of disk activity in a small time interval which
+occurs only once every 10 minutes, or whenever the disk is forced to spin up by
+a cache miss. The disk can then be spun down in the periods of inactivity.
+
+If you want to find out which process caused the disk to spin up, you can
+gather information by setting the flag /proc/sys/vm/block_dump. When this flag
+is set, Linux reports all disk read and write operations that take place, and
+all block dirtyings done to files. This makes it possible to debug why a disk
+needs to spin up, and to increase battery life even more. The output of
+block_dump is written to the kernel output, and it can be retrieved using
+"dmesg". When you use block_dump and your kernel logging level also includes
+kernel debugging messages, you probably want to turn off klogd, otherwise
+the output of block_dump will be logged, causing disk activity that is not
+normally there.
+
+
+Configuration
+-------------
+
+The laptop mode configuration file is located in /etc/default/laptop-mode on
+Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. It
+contains the following options:
+
+MAX_AGE:
+
+Maximum time, in seconds, of hard drive spindown time that you are
+comfortable with. Worst case, it's possible that you could lose this
+amount of work if your battery fails while you're in laptop mode.
+
+MINIMUM_BATTERY_MINUTES:
+
+Automatically disable laptop mode if the remaining number of minutes of
+battery power is less than this value. Default is 10 minutes.
+
+AC_HD/BATT_HD:
+
+The idle timeout that should be set on your hard drive when laptop mode
+is active (BATT_HD) and when it is not active (AC_HD). The defaults are
+20 seconds (value 4) for BATT_HD  and 2 hours (value 244) for AC_HD. The
+possible values are those listed in the manual page for "hdparm" for the
+"-S" option.
+
+HD:
+
+The devices for which the spindown timeout should be adjusted by laptop mode.
+Default is /dev/hda. If you specify multiple devices, separate them by a space.
+
+READAHEAD:
+
+Disk readahead, in 512-byte sectors, while laptop mode is active. A large
+readahead can prevent disk accesses for things like executable pages (which are
+loaded on demand while the application executes) and sequentially accessed data
+(MP3s).
+
+DO_REMOUNTS:
+
+The control script automatically remounts any mounted journaled filesystems
+with appropriate commit interval options. When this option is set to 0, this
+feature is disabled.
+
+DO_REMOUNT_NOATIME:
+
+When remounting, should the filesystems be remounted with the noatime option?
+Normally, this is set to "1" (enabled), but there may be programs that require
+access time recording.
+
+DIRTY_RATIO:
+
+The percentage of memory that is allowed to contain "dirty" or unsaved data
+before a writeback is forced, while laptop mode is active. Corresponds to
+the /proc/sys/vm/dirty_ratio sysctl.
+
+DIRTY_BACKGROUND_RATIO:
+
+The percentage of memory that is allowed to contain "dirty" or unsaved data
+after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set
+this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio
+sysctl.
+
+Note that the behaviour of dirty_background_ratio is quite different
+when laptop mode is active and when it isn't. When laptop mode is inactive,
+dirty_background_ratio is the threshold percentage at which background writeouts
+start taking place. When laptop mode is active, however, background writeouts
+are disabled, and the dirty_background_ratio only determines how much writeback
+is done when dirty_ratio is reached.
+
+DO_CPU:
+
+Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup.
+See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.)
+
+CPU_MAXFREQ:
+
+When on battery, what is the maximum CPU speed that the system should use? Legal
+values are "slowest" for the slowest speed that your CPU is able to operate at,
+or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies.
+
+
+Tips & Tricks
+-------------
+
+* Bartek Kania reports getting up to 50 minutes of extra battery life (on top
+  of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1).
+
+* You can spin down the disk while playing MP3, by setting disk readahead
+  to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at
+  once, and will then spin down while the MP3 is playing. (Thanks to Bartek
+  Kania.)
+
+* Drew Scott Daniels observed: "I don't know why, but when I decrease the number
+  of colours that my display uses it consumes less battery power. I've seen
+  this on powerbooks too. I hope that this is a piece of information that
+  might be useful to the Laptop Mode patch or its users."
+
+* In syslog.conf, you can prefix entries with a dash `-` to omit syncing the
+  file after every logging. When you're using laptop-mode and your disk doesn't
+  spin down, this is a likely culprit.
+
+* Richard Atterer observed that laptop mode does not work well with noflushd
+  (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode
+  from doing its thing.
+
+* If you're worried about your data, you might want to consider using a USB
+  memory stick or something like that as a "working area". (Be aware though
+  that flash memory can only handle a limited number of writes, and overuse
+  may wear out your memory stick pretty quickly. Do _not_ use journalling
+  filesystems on flash memory sticks.)
+
+
+Configuration file for control and ACPI battery scripts
+-------------------------------------------------------
+
+This allows the tunables to be changed for the scripts via an external
+configuration file
+
+It should be installed as /etc/default/laptop-mode on Debian, and as
+/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes.
+
+Config file::
+
+  # Maximum time, in seconds, of hard drive spindown time that you are
+  # comfortable with. Worst case, it's possible that you could lose this
+  # amount of work if your battery fails you while in laptop mode.
+  #MAX_AGE=600
+
+  # Automatically disable laptop mode when the number of minutes of battery
+  # that you have left goes below this threshold.
+  MINIMUM_BATTERY_MINUTES=10
+
+  # Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG
+  # by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk
+  # will read a complete MP3 at once, and will then spin down while the MP3/OGG is
+  # playing.
+  #READAHEAD=4096
+
+  # Shall we remount journaled fs. with appropriate commit interval? (1=yes)
+  #DO_REMOUNTS=1
+
+  # And shall we add the "noatime" option to that as well? (1=yes)
+  #DO_REMOUNT_NOATIME=1
+
+  # Dirty synchronous ratio.  At this percentage of dirty pages the process
+  # which
+  # calls write() does its own writeback
+  #DIRTY_RATIO=40
+
+  #
+  # Allowed dirty background ratio, in percent.  Once DIRTY_RATIO has been
+  # exceeded, the kernel will wake flusher threads which will then reduce the
+  # amount of dirty memory to dirty_background_ratio.  Set this nice and low,
+  # so once some writeout has commenced, we do a lot of it.
+  #
+  #DIRTY_BACKGROUND_RATIO=5
+
+  # kernel default dirty buffer age
+  #DEF_AGE=30
+  #DEF_UPDATE=5
+  #DEF_DIRTY_BACKGROUND_RATIO=10
+  #DEF_DIRTY_RATIO=40
+  #DEF_XFS_AGE_BUFFER=15
+  #DEF_XFS_SYNC_INTERVAL=30
+  #DEF_XFS_BUFD_INTERVAL=1
+
+  # This must be adjusted manually to the value of HZ in the running kernel
+  # on 2.4, until the XFS people change their 2.4 external interfaces to work in
+  # centisecs. This can be automated, but it's a work in progress that still
+  # needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for
+  # external interfaces, and that is currently always set to 100. So you don't
+  # need to change this on 2.6.
+  #XFS_HZ=100
+
+  # Should the maximum CPU frequency be adjusted down while on battery?
+  # Requires CPUFreq to be setup.
+  # See Documentation/admin-guide/pm/cpufreq.rst for more info
+  #DO_CPU=0
+
+  # When on battery what is the maximum CPU speed that the system should
+  # use? Legal values are "slowest" for the slowest speed that your
+  # CPU is able to operate at, or a value listed in:
+  # /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
+  # Only applicable if DO_CPU=1.
+  #CPU_MAXFREQ=slowest
+
+  # Idle timeout for your hard drive (man hdparm for valid values, -S option)
+  # Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4).
+  #AC_HD=244
+  #BATT_HD=4
+
+  # The drives for which to adjust the idle timeout. Separate them by a space,
+  # e.g. HD="/dev/hda /dev/hdb".
+  #HD="/dev/hda"
+
+  # Set the spindown timeout on a hard drive?
+  #DO_HD=1
+
+
+Control script
+--------------
+
+Please note that this control script works for the Linux 2.4 and 2.6 series (thanks
+to Kiko Piris).
+
+Control script::
+
+  #!/bin/bash
+
+  # start or stop laptop_mode, best run by a power management daemon when
+  # ac gets connected/disconnected from a laptop
+  #
+  # install as /sbin/laptop_mode
+  #
+  # Contributors to this script:   Kiko Piris
+  #				 Bart Samwel
+  #				 Micha Feigin
+  #				 Andrew Morton
+  #				 Herve Eychenne
+  #				 Dax Kelson
+  #
+  # Original Linux 2.4 version by: Jens Axboe
+
+  #############################################################################
+
+  # Source config
+  if [ -f /etc/default/laptop-mode ] ; then
+	# Debian
+	. /etc/default/laptop-mode
+  elif [ -f /etc/sysconfig/laptop-mode ] ; then
+	# Others
+          . /etc/sysconfig/laptop-mode
+  fi
+
+  # Don't raise an error if the config file is incomplete
+  # set defaults instead:
+
+  # Maximum time, in seconds, of hard drive spindown time that you are
+  # comfortable with. Worst case, it's possible that you could lose this
+  # amount of work if your battery fails you while in laptop mode.
+  MAX_AGE=${MAX_AGE:-'600'}
+
+  # Read-ahead, in kilobytes
+  READAHEAD=${READAHEAD:-'4096'}
+
+  # Shall we remount journaled fs. with appropriate commit interval? (1=yes)
+  DO_REMOUNTS=${DO_REMOUNTS:-'1'}
+
+  # And shall we add the "noatime" option to that as well? (1=yes)
+  DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'}
+
+  # Shall we adjust the idle timeout on a hard drive?
+  DO_HD=${DO_HD:-'1'}
+
+  # Adjust idle timeout on which hard drive?
+  HD="${HD:-'/dev/hda'}"
+
+  # spindown time for HD (hdparm -S values)
+  AC_HD=${AC_HD:-'244'}
+  BATT_HD=${BATT_HD:-'4'}
+
+  # Dirty synchronous ratio.  At this percentage of dirty pages the process which
+  # calls write() does its own writeback
+  DIRTY_RATIO=${DIRTY_RATIO:-'40'}
+
+  # cpu frequency scaling
+  # See Documentation/admin-guide/pm/cpufreq.rst for more info
+  DO_CPU=${CPU_MANAGE:-'0'}
+  CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'}
+
+  #
+  # Allowed dirty background ratio, in percent.  Once DIRTY_RATIO has been
+  # exceeded, the kernel will wake flusher threads which will then reduce the
+  # amount of dirty memory to dirty_background_ratio.  Set this nice and low,
+  # so once some writeout has commenced, we do a lot of it.
+  #
+  DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'}
+
+  # kernel default dirty buffer age
+  DEF_AGE=${DEF_AGE:-'30'}
+  DEF_UPDATE=${DEF_UPDATE:-'5'}
+  DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'}
+  DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'}
+  DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'}
+  DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'}
+  DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'}
+
+  # This must be adjusted manually to the value of HZ in the running kernel
+  # on 2.4, until the XFS people change their 2.4 external interfaces to work in
+  # centisecs. This can be automated, but it's a work in progress that still needs
+  # some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external
+  # interfaces, and that is currently always set to 100. So you don't need to
+  # change this on 2.6.
+  XFS_HZ=${XFS_HZ:-'100'}
+
+  #############################################################################
+
+  KLEVEL="$(uname -r |
+               {
+	       IFS='.' read a b c
+	       echo $a.$b
+	     }
+  )"
+  case "$KLEVEL" in
+	"2.4"|"2.6")
+		;;
+	*)
+		echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2
+		exit 1
+		;;
+  esac
+
+  if [ ! -e /proc/sys/vm/laptop_mode ] ; then
+	echo "Kernel is not patched with laptop_mode patch." >&2
+	exit 1
+  fi
+
+  if [ ! -w /proc/sys/vm/laptop_mode ] ; then
+	echo "You do not have enough privileges to enable laptop_mode." >&2
+	exit 1
+  fi
+
+  # Remove an option (the first parameter) of the form option=<number> from
+  # a mount options string (the rest of the parameters).
+  parse_mount_opts () {
+	OPT="$1"
+	shift
+	echo ",$*," | sed		\
+	 -e 's/,'"$OPT"'=[0-9]*,/,/g'	\
+	 -e 's/,,*/,/g'			\
+	 -e 's/^,//'			\
+	 -e 's/,$//'
+  }
+
+  # Remove an option (the first parameter) without any arguments from
+  # a mount option string (the rest of the parameters).
+  parse_nonumber_mount_opts () {
+	OPT="$1"
+	shift
+	echo ",$*," | sed		\
+	 -e 's/,'"$OPT"',/,/g'		\
+	 -e 's/,,*/,/g'			\
+	 -e 's/^,//'			\
+	 -e 's/,$//'
+  }
+
+  # Find out the state of a yes/no option (e.g. "atime"/"noatime") in
+  # fstab for a given filesystem, and use this state to replace the
+  # value of the option in another mount options string. The device
+  # is the first argument, the option name the second, and the default
+  # value the third. The remainder is the mount options string.
+  #
+  # Example:
+  # parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime
+  #
+  # If fstab contains, say, "rw" for this filesystem, then the result
+  # will be "defaults,atime".
+  parse_yesno_opts_wfstab () {
+	L_DEV="$1"
+	OPT="$2"
+	DEF_OPT="$3"
+	shift 3
+	L_OPTS="$*"
+	PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)"
+	PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)"
+	# Watch for a default atime in fstab
+	FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
+	if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then
+		# option specified in fstab: extract the value and use it
+		if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then
+			echo "$PARSEDOPTS1,no$OPT"
+		else
+			# no$OPT not found -- so we must have $OPT.
+			echo "$PARSEDOPTS1,$OPT"
+		fi
+	else
+		# option not specified in fstab -- choose the default.
+		echo "$PARSEDOPTS1,$DEF_OPT"
+	fi
+  }
+
+  # Find out the state of a numbered option (e.g. "commit=NNN") in
+  # fstab for a given filesystem, and use this state to replace the
+  # value of the option in another mount options string. The device
+  # is the first argument, and the option name the second. The
+  # remainder is the mount options string in which the replacement
+  # must be done.
+  #
+  # Example:
+  # parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7
+  #
+  # If fstab contains, say, "commit=3,rw" for this filesystem, then the
+  # result will be "rw,commit=3".
+  parse_mount_opts_wfstab () {
+	L_DEV="$1"
+	OPT="$2"
+	shift 2
+	L_OPTS="$*"
+	PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)"
+	# Watch for a default commit in fstab
+	FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
+	if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then
+		# option specified in fstab: extract the value, and use it
+		echo -n "$PARSEDOPTS1,$OPT="
+		echo ",$FSTAB_OPTS," | sed \
+		 -e 's/.*,'"$OPT"'=//'	\
+		 -e 's/,.*//'
+	else
+		# option not specified in fstab: set it to 0
+		echo "$PARSEDOPTS1,$OPT=0"
+	fi
+  }
+
+  deduce_fstype () {
+	MP="$1"
+	# My root filesystem unfortunately has
+	# type "unknown" in /etc/mtab. If we encounter
+	# "unknown", we try to get the type from fstab.
+	cat /etc/fstab |
+	grep -v '^#' |
+	while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do
+		if [ "$FSTAB_MP" = "$MP" ]; then
+			echo $FSTAB_FST
+			exit 0
+		fi
+	done
+  }
+
+  if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then
+	NOATIME_OPT=",noatime"
+  fi
+
+  case "$1" in
+	start)
+		AGE=$((100*$MAX_AGE))
+		XFS_AGE=$(($XFS_HZ*$MAX_AGE))
+		echo -n "Starting laptop_mode"
+
+		if [ -d /proc/sys/vm/pagebuf ] ; then
+			# (For 2.4 and early 2.6.)
+			# This only needs to be set, not reset -- it is only used when
+			# laptop mode is enabled.
+			echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age
+			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
+		elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
+			# (A couple of early 2.6 laptop mode patches had these.)
+			# The same goes for these.
+			echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer
+			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
+		elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then
+			# (2.6.6)
+			# But not for these -- they are also used in normal
+			# operation.
+			echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer
+			echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval
+		elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
+			# (2.6.7 upwards)
+			# And not for these either. These are in centisecs,
+			# not USER_HZ, so we have to use $AGE, not $XFS_AGE.
+			echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs
+			echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs
+			echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs
+		fi
+
+		case "$KLEVEL" in
+			"2.4")
+				echo 1					> /proc/sys/vm/laptop_mode
+				echo "30 500 0 0 $AGE $AGE 60 20 0"	> /proc/sys/vm/bdflush
+				;;
+			"2.6")
+				echo 5					> /proc/sys/vm/laptop_mode
+				echo "$AGE"				> /proc/sys/vm/dirty_writeback_centisecs
+				echo "$AGE"				> /proc/sys/vm/dirty_expire_centisecs
+				echo "$DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
+				echo "$DIRTY_BACKGROUND_RATIO"		> /proc/sys/vm/dirty_background_ratio
+				;;
+		esac
+		if [ $DO_REMOUNTS -eq 1 ]; then
+			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
+				PARSEDOPTS="$(parse_mount_opts "$OPTS")"
+				if [ "$FST" = 'unknown' ]; then
+					FST=$(deduce_fstype $MP)
+				fi
+				case "$FST" in
+					"ext3"|"reiserfs")
+						PARSEDOPTS="$(parse_mount_opts commit "$OPTS")"
+						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT
+						;;
+					"xfs")
+						mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT
+						;;
+				esac
+				if [ -b $DEV ] ; then
+					blockdev --setra $(($READAHEAD * 2)) $DEV
+				fi
+			done
+		fi
+		if [ $DO_HD -eq 1 ] ; then
+			for THISHD in $HD ; do
+				/sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1
+				/sbin/hdparm -B 1 $THISHD > /dev/null 2>&1
+			done
+		fi
+		if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
+			if [ $CPU_MAXFREQ = 'slowest' ]; then
+				CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq`
+			fi
+			echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
+		fi
+		echo "."
+		;;
+	stop)
+		U_AGE=$((100*$DEF_UPDATE))
+		B_AGE=$((100*$DEF_AGE))
+		echo -n "Stopping laptop_mode"
+		echo 0 > /proc/sys/vm/laptop_mode
+		if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
+			# These need to be restored, if there are no lm_*.
+			echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER))	 	> /proc/sys/fs/xfs/age_buffer
+			echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) 	> /proc/sys/fs/xfs/sync_interval
+		elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
+			# These need to be restored as well.
+			echo $((100*$DEF_XFS_AGE_BUFFER))	> /proc/sys/fs/xfs/age_buffer_centisecs
+			echo $((100*$DEF_XFS_SYNC_INTERVAL))	> /proc/sys/fs/xfs/xfssyncd_centisecs
+			echo $((100*$DEF_XFS_BUFD_INTERVAL))	> /proc/sys/fs/xfs/xfsbufd_centisecs
+		fi
+		case "$KLEVEL" in
+			"2.4")
+				echo "30 500 0 0 $U_AGE $B_AGE 60 20 0"	> /proc/sys/vm/bdflush
+				;;
+			"2.6")
+				echo "$U_AGE"				> /proc/sys/vm/dirty_writeback_centisecs
+				echo "$B_AGE"				> /proc/sys/vm/dirty_expire_centisecs
+				echo "$DEF_DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
+				echo "$DEF_DIRTY_BACKGROUND_RATIO"	> /proc/sys/vm/dirty_background_ratio
+				;;
+		esac
+		if [ $DO_REMOUNTS -eq 1 ] ; then
+			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
+				# Reset commit and atime options to defaults.
+				if [ "$FST" = 'unknown' ]; then
+					FST=$(deduce_fstype $MP)
+				fi
+				case "$FST" in
+					"ext3"|"reiserfs")
+						PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)"
+						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)"
+						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
+						;;
+					"xfs")
+						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)"
+						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
+						;;
+				esac
+				if [ -b $DEV ] ; then
+					blockdev --setra 256 $DEV
+				fi
+			done
+		fi
+		if [ $DO_HD -eq 1 ] ; then
+			for THISHD in $HD ; do
+				/sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1
+				/sbin/hdparm -B 255 $THISHD > /dev/null 2>&1
+			done
+		fi
+		if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
+			echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
+		fi
+		echo "."
+		;;
+	*)
+		echo "Usage: $0 {start|stop}" 2>&1
+		exit 1
+		;;
+
+  esac
+
+  exit 0
+
+
+ACPI integration
+----------------
+
+Dax Kelson submitted this so that the ACPI acpid daemon will
+kick off the laptop_mode script and run hdparm. The part that
+automatically disables laptop mode when the battery is low was
+written by Jan Topinski.
+
+/etc/acpi/events/ac_adapter::
+
+	event=ac_adapter
+	action=/etc/acpi/actions/ac.sh %e
+
+/etc/acpi/events/battery::
+
+	event=battery.*
+	action=/etc/acpi/actions/battery.sh %e
+
+/etc/acpi/actions/ac.sh::
+
+  #!/bin/bash
+
+  # ac on/offline event handler
+
+  status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state`
+
+  case $status in
+          "on-line")
+                  /sbin/laptop_mode stop
+                  exit 0
+          ;;
+          "off-line")
+                  /sbin/laptop_mode start
+                  exit 0
+          ;;
+  esac
+
+
+/etc/acpi/actions/battery.sh::
+
+  #! /bin/bash
+
+  # Automatically disable laptop mode when the battery almost runs out.
+
+  BATT_INFO=/proc/acpi/battery/$2/state
+
+  if [[ -f /proc/sys/vm/laptop_mode ]]
+  then
+     LM=`cat /proc/sys/vm/laptop_mode`
+     if [[ $LM -gt 0 ]]
+     then
+       if [[ -f $BATT_INFO ]]
+       then
+          # Source the config file only now that we know we need
+          if [ -f /etc/default/laptop-mode ] ; then
+                  # Debian
+                  . /etc/default/laptop-mode
+          elif [ -f /etc/sysconfig/laptop-mode ] ; then
+                  # Others
+                  . /etc/sysconfig/laptop-mode
+          fi
+          MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'}
+
+          ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`"
+          if [[ ACTION -eq "discharging" ]]
+          then
+             PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed  "s/.* \([0-9][0-9]* \).*/\1/" `
+             REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed  "s/.* \([0-9][0-9]* \).*/\1/" `
+          fi
+          if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES))
+          then
+             /sbin/laptop_mode stop
+          fi
+       else
+         logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path."
+       fi
+     fi
+  fi
+
+
+Monitoring tool
+---------------
+
+Bartek Kania submitted this, it can be used to measure how much time your disk
+spends spun up/down.  See tools/laptop/dslm/dslm.c
diff --git a/Documentation/laptops/lg-laptop.rst b/Documentation/admin-guide/laptops/lg-laptop.rst
index aa503ee9b3bc..ce9b14671cb9 100644
--- a/Documentation/laptops/lg-laptop.rst
+++ b/Documentation/admin-guide/laptops/lg-laptop.rst
@@ -1,5 +1,6 @@
 .. SPDX-License-Identifier: GPL-2.0+
 
+
 LG Gram laptop extra features
 =============================
 
diff --git a/Documentation/admin-guide/laptops/sony-laptop.rst b/Documentation/admin-guide/laptops/sony-laptop.rst
new file mode 100644
index 000000000000..9edcc7f6612f
--- /dev/null
+++ b/Documentation/admin-guide/laptops/sony-laptop.rst
@@ -0,0 +1,174 @@
+=========================================
+Sony Notebook Control Driver (SNC) Readme
+=========================================
+
+	- Copyright (C) 2004- 2005 Stelian Pop <stelian@popies.net>
+	- Copyright (C) 2007 Mattia Dongili <malattia@linux.it>
+
+This mini-driver drives the SNC and SPIC device present in the ACPI BIOS of the
+Sony Vaio laptops. This driver mixes both devices functions under the same
+(hopefully consistent) interface. This also means that the sonypi driver is
+obsoleted by sony-laptop now.
+
+Fn keys (hotkeys):
+------------------
+
+Some models report hotkeys through the SNC or SPIC devices, such events are
+reported both through the ACPI subsystem as acpi events and through the INPUT
+subsystem. See the logs of /proc/bus/input/devices to find out what those
+events are and which input devices are created by the driver.
+Additionally, loading the driver with the debug option will report all events
+in the kernel log.
+
+The "scancodes" passed to the input system (that can be remapped with udev)
+are indexes to the table "sony_laptop_input_keycode_map" in the sony-laptop.c
+module.  For example the "FN/E" key combination (EJECTCD on some models)
+generates the scancode 20 (0x14).
+
+Backlight control:
+------------------
+If your laptop model supports it, you will find sysfs files in the
+/sys/class/backlight/sony/
+directory. You will be able to query and set the current screen
+brightness:
+
+	======================	=========================================
+	brightness		get/set screen brightness (an integer
+				between 0 and 7)
+	actual_brightness	reading from this file will query the HW
+				to get real brightness value
+	max_brightness		the maximum brightness value
+	======================	=========================================
+
+
+Platform specific:
+------------------
+Loading the sony-laptop module will create a
+/sys/devices/platform/sony-laptop/
+directory populated with some files.
+
+You then read/write integer values from/to those files by using
+standard UNIX tools.
+
+The files are:
+
+	======================	==========================================
+	brightness_default	screen brightness which will be set
+				when the laptop will be rebooted
+	cdpower			power on/off the internal CD drive
+	audiopower		power on/off the internal sound card
+	lanpower		power on/off the internal ethernet card
+				(only in debug mode)
+	bluetoothpower		power on/off the internal bluetooth device
+	fanspeed		get/set the fan speed
+	======================	==========================================
+
+Note that some files may be missing if they are not supported
+by your particular laptop model.
+
+Example usage::
+
+	# echo "1" > /sys/devices/platform/sony-laptop/brightness_default
+
+sets the lowest screen brightness for the next and later reboots
+
+::
+
+	# echo "8" > /sys/devices/platform/sony-laptop/brightness_default
+
+sets the highest screen brightness for the next and later reboots
+
+::
+
+	# cat /sys/devices/platform/sony-laptop/brightness_default
+
+retrieves the value
+
+::
+
+	# echo "0" > /sys/devices/platform/sony-laptop/audiopower
+
+powers off the sound card
+
+::
+
+	# echo "1" > /sys/devices/platform/sony-laptop/audiopower
+
+powers on the sound card.
+
+
+RFkill control:
+---------------
+More recent Vaio models expose a consistent set of ACPI methods to
+control radio frequency emitting devices. If you are a lucky owner of
+such a laptop you will find the necessary rfkill devices under
+/sys/class/rfkill. Check those starting with sony-* in::
+
+	# grep . /sys/class/rfkill/*/{state,name}
+
+
+Development:
+------------
+
+If you want to help with the development of this driver (and
+you are not afraid of any side effects doing strange things with
+your ACPI BIOS could have on your laptop), load the driver and
+pass the option 'debug=1'.
+
+REPEAT:
+	**DON'T DO THIS IF YOU DON'T LIKE RISKY BUSINESS.**
+
+In your kernel logs you will find the list of all ACPI methods
+the SNC device has on your laptop.
+
+* For new models you will see a long list of meaningless method names,
+  reading the DSDT table source should reveal that:
+
+(1) the SNC device uses an internal capability lookup table
+(2) SN00 is used to find values in the lookup table
+(3) SN06 and SN07 are used to call into the real methods based on
+    offsets you can obtain iterating the table using SN00
+(4) SN02 used to enable events.
+
+Some values in the capability lookup table are more or less known, see
+the code for all sony_call_snc_handle calls, others are more obscure.
+
+* For old models you can see the GCDP/GCDP methods used to pwer on/off
+  the CD drive, but there are others and they are usually different from
+  model to model.
+
+**I HAVE NO IDEA WHAT THOSE METHODS DO.**
+
+The sony-laptop driver creates, for some of those methods (the most
+current ones found on several Vaio models), an entry under
+/sys/devices/platform/sony-laptop, just like the 'cdpower' one.
+You can create other entries corresponding to your own laptop methods by
+further editing the source (see the 'sony_nc_values' table, and add a new
+entry to this table with your get/set method names using the
+SNC_HANDLE_NAMES macro).
+
+Your mission, should you accept it, is to try finding out what
+those entries are for, by reading/writing random values from/to those
+files and find out what is the impact on your laptop.
+
+Should you find anything interesting, please report it back to me,
+I will not disavow all knowledge of your actions :)
+
+See also http://www.linux.it/~malattia/wiki/index.php/Sony_drivers for other
+useful info.
+
+Bugs/Limitations:
+-----------------
+
+* This driver is not based on official documentation from Sony
+  (because there is none), so there is no guarantee this driver
+  will work at all, or do the right thing. Although this hasn't
+  happened to me, this driver could do very bad things to your
+  laptop, including permanent damage.
+
+* The sony-laptop and sonypi drivers do not interact at all. In the
+  future, sonypi will be removed and replaced by sony-laptop.
+
+* spicctrl, which is the userspace tool used to communicate with the
+  sonypi driver (through /dev/sonypi) is deprecated as well since all
+  its features are now available under the sysfs tree via sony-laptop.
diff --git a/Documentation/admin-guide/laptops/sonypi.rst b/Documentation/admin-guide/laptops/sonypi.rst
new file mode 100644
index 000000000000..c6eaaf48f7c1
--- /dev/null
+++ b/Documentation/admin-guide/laptops/sonypi.rst
@@ -0,0 +1,158 @@
+==================================================
+Sony Programmable I/O Control Device Driver Readme
+==================================================
+
+	- Copyright (C) 2001-2004 Stelian Pop <stelian@popies.net>
+	- Copyright (C) 2001-2002 Alcôve <www.alcove.com>
+	- Copyright (C) 2001 Michael Ashley <m.ashley@unsw.edu.au>
+	- Copyright (C) 2001 Junichi Morita <jun1m@mars.dti.ne.jp>
+	- Copyright (C) 2000 Takaya Kinjo <t-kinjo@tc4.so-net.ne.jp>
+	- Copyright (C) 2000 Andrew Tridgell <tridge@samba.org>
+
+This driver enables access to the Sony Programmable I/O Control Device which
+can be found in many Sony Vaio laptops. Some newer Sony laptops (seems to be
+limited to new FX series laptops, at least the FX501 and the FX702) lack a
+sonypi device and are not supported at all by this driver.
+
+It will give access (through a user space utility) to some events those laptops
+generate, like:
+
+	- jogdial events (the small wheel on the side of Vaios)
+	- capture button events (only on Vaio Picturebook series)
+	- Fn keys
+	- bluetooth button (only on C1VR model)
+	- programmable keys, back, help, zoom, thumbphrase buttons, etc.
+	  (when available)
+
+Those events (see linux/sonypi.h) can be polled using the character device node
+/dev/sonypi (major 10, minor auto allocated or specified as a option).
+A simple daemon which translates the jogdial movements into mouse wheel events
+can be downloaded at: <http://popies.net/sonypi/>
+
+Another option to intercept the events is to get them directly through the
+input layer.
+
+This driver supports also some ioctl commands for setting the LCD screen
+brightness and querying the batteries charge information (some more
+commands may be added in the future).
+
+This driver can also be used to set the camera controls on Picturebook series
+(brightness, contrast etc), and is used by the video4linux driver for the
+Motion Eye camera.
+
+Please note that this driver was created by reverse engineering the Windows
+driver and the ACPI BIOS, because Sony doesn't agree to release any programming
+specs for its laptops. If someone convinces them to do so, drop me a note.
+
+Driver options:
+---------------
+
+Several options can be passed to the sonypi driver using the standard
+module argument syntax (<param>=<value> when passing the option to the
+module or sonypi.<param>=<value> on the kernel boot line when sonypi is
+statically linked into the kernel). Those options are:
+
+	=============== =======================================================
+	minor:		minor number of the misc device /dev/sonypi,
+			default is -1 (automatic allocation, see /proc/misc
+			or kernel logs)
+
+	camera:		if you have a PictureBook series Vaio (with the
+			integrated MotionEye camera), set this parameter to 1
+			in order to let the driver access to the camera
+
+	fnkeyinit:	on some Vaios (C1VE, C1VR etc), the Fn key events don't
+			get enabled unless you set this parameter to 1.
+			Do not use this option unless it's actually necessary,
+			some Vaio models don't deal well with this option.
+			This option is available only if the kernel is
+			compiled without ACPI support (since it conflicts
+			with it and it shouldn't be required anyway if
+			ACPI is already enabled).
+
+	verbose:	set to 1 to print unknown events received from the
+			sonypi device.
+			set to 2 to print all events received from the
+			sonypi device.
+
+	compat:		uses some compatibility code for enabling the sonypi
+			events. If the driver worked for you in the past
+			(prior to version 1.5) and does not work anymore,
+			add this option and report to the author.
+
+	mask:		event mask telling the driver what events will be
+			reported to the user. This parameter is required for
+			some Vaio models where the hardware reuses values
+			used in other Vaio models (like the FX series who does
+			not have a jogdial but reuses the jogdial events for
+			programmable keys events). The default event mask is
+			set to 0xffffffff, meaning that all possible events
+			will be tried. You can use the following bits to
+			construct your own event mask (from
+			drivers/char/sonypi.h)::
+
+				SONYPI_JOGGER_MASK		0x0001
+				SONYPI_CAPTURE_MASK		0x0002
+				SONYPI_FNKEY_MASK		0x0004
+				SONYPI_BLUETOOTH_MASK		0x0008
+				SONYPI_PKEY_MASK		0x0010
+				SONYPI_BACK_MASK		0x0020
+				SONYPI_HELP_MASK		0x0040
+				SONYPI_LID_MASK			0x0080
+				SONYPI_ZOOM_MASK		0x0100
+				SONYPI_THUMBPHRASE_MASK		0x0200
+				SONYPI_MEYE_MASK		0x0400
+				SONYPI_MEMORYSTICK_MASK		0x0800
+				SONYPI_BATTERY_MASK		0x1000
+				SONYPI_WIRELESS_MASK		0x2000
+
+	useinput:	if set (which is the default) two input devices are
+			created, one which interprets the jogdial events as
+			mouse events, the other one which acts like a
+			keyboard reporting the pressing of the special keys.
+	=============== =======================================================
+
+Module use:
+-----------
+
+In order to automatically load the sonypi module on use, you can put those
+lines a configuration file in /etc/modprobe.d/::
+
+	alias char-major-10-250 sonypi
+	options sonypi minor=250
+
+This supposes the use of minor 250 for the sonypi device::
+
+	# mknod /dev/sonypi c 10 250
+
+Bugs:
+-----
+
+	- several users reported that this driver disables the BIOS-managed
+	  Fn-keys which put the laptop in sleeping state, or switch the
+	  external monitor on/off. There is no workaround yet, since this
+	  driver disables all APM management for those keys, by enabling the
+	  ACPI management (and the ACPI core stuff is not complete yet). If
+	  you have one of those laptops with working Fn keys and want to
+	  continue to use them, don't use this driver.
+
+	- some users reported that the laptop speed is lower (dhrystone
+	  tested) when using the driver with the fnkeyinit parameter. I cannot
+	  reproduce it on my laptop and not all users have this problem.
+	  This happens because the fnkeyinit parameter enables the ACPI
+	  mode (but without additional ACPI control, like processor
+	  speed handling etc). Use ACPI instead of APM if it works on your
+	  laptop.
+
+	- sonypi lacks the ability to distinguish between certain key
+	  events on some models.
+
+	- some models with the nvidia card (geforce go 6200 tc) uses a
+	  different way to adjust the backlighting of the screen. There
+	  is a userspace utility to adjust the brightness on those models,
+	  which can be downloaded from
+	  http://www.acc.umu.se/~erikw/program/smartdimmer-0.1.tar.bz2
+
+	- since all development was done by reverse engineering, there is
+	  *absolutely no guarantee* that this driver will not crash your
+	  laptop. Permanently.
diff --git a/Documentation/admin-guide/laptops/thinkpad-acpi.rst b/Documentation/admin-guide/laptops/thinkpad-acpi.rst
new file mode 100644
index 000000000000..adea0bf2acc5
--- /dev/null
+++ b/Documentation/admin-guide/laptops/thinkpad-acpi.rst
@@ -0,0 +1,1562 @@
+===========================
+ThinkPad ACPI Extras Driver
+===========================
+
+Version 0.25
+
+October 16th,  2013
+
+- Borislav Deianov <borislav@users.sf.net>
+- Henrique de Moraes Holschuh <hmh@hmh.eng.br>
+
+http://ibm-acpi.sf.net/
+
+This is a Linux driver for the IBM and Lenovo ThinkPad laptops. It
+supports various features of these laptops which are accessible
+through the ACPI and ACPI EC framework, but not otherwise fully
+supported by the generic Linux ACPI drivers.
+
+This driver used to be named ibm-acpi until kernel 2.6.21 and release
+0.13-20070314.  It used to be in the drivers/acpi tree, but it was
+moved to the drivers/misc tree and renamed to thinkpad-acpi for kernel
+2.6.22, and release 0.14.  It was moved to drivers/platform/x86 for
+kernel 2.6.29 and release 0.22.
+
+The driver is named "thinkpad-acpi".  In some places, like module
+names and log messages, "thinkpad_acpi" is used because of userspace
+issues.
+
+"tpacpi" is used as a shorthand where "thinkpad-acpi" would be too
+long due to length limitations on some Linux kernel versions.
+
+Status
+------
+
+The features currently supported are the following (see below for
+detailed description):
+
+	- Fn key combinations
+	- Bluetooth enable and disable
+	- video output switching, expansion control
+	- ThinkLight on and off
+	- CMOS/UCMS control
+	- LED control
+	- ACPI sounds
+	- temperature sensors
+	- Experimental: embedded controller register dump
+	- LCD brightness control
+	- Volume control
+	- Fan control and monitoring: fan speed, fan enable/disable
+	- WAN enable and disable
+	- UWB enable and disable
+
+A compatibility table by model and feature is maintained on the web
+site, http://ibm-acpi.sf.net/. I appreciate any success or failure
+reports, especially if they add to or correct the compatibility table.
+Please include the following information in your report:
+
+	- ThinkPad model name
+	- a copy of your ACPI tables, using the "acpidump" utility
+	- a copy of the output of dmidecode, with serial numbers
+	  and UUIDs masked off
+	- which driver features work and which don't
+	- the observed behavior of non-working features
+
+Any other comments or patches are also more than welcome.
+
+
+Installation
+------------
+
+If you are compiling this driver as included in the Linux kernel
+sources, look for the CONFIG_THINKPAD_ACPI Kconfig option.
+It is located on the menu path: "Device Drivers" -> "X86 Platform
+Specific Device Drivers" -> "ThinkPad ACPI Laptop Extras".
+
+
+Features
+--------
+
+The driver exports two different interfaces to userspace, which can be
+used to access the features it provides.  One is a legacy procfs-based
+interface, which will be removed at some time in the future.  The other
+is a new sysfs-based interface which is not complete yet.
+
+The procfs interface creates the /proc/acpi/ibm directory.  There is a
+file under that directory for each feature it supports.  The procfs
+interface is mostly frozen, and will change very little if at all: it
+will not be extended to add any new functionality in the driver, instead
+all new functionality will be implemented on the sysfs interface.
+
+The sysfs interface tries to blend in the generic Linux sysfs subsystems
+and classes as much as possible.  Since some of these subsystems are not
+yet ready or stabilized, it is expected that this interface will change,
+and any and all userspace programs must deal with it.
+
+
+Notes about the sysfs interface
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Unlike what was done with the procfs interface, correctness when talking
+to the sysfs interfaces will be enforced, as will correctness in the
+thinkpad-acpi's implementation of sysfs interfaces.
+
+Also, any bugs in the thinkpad-acpi sysfs driver code or in the
+thinkpad-acpi's implementation of the sysfs interfaces will be fixed for
+maximum correctness, even if that means changing an interface in
+non-compatible ways.  As these interfaces mature both in the kernel and
+in thinkpad-acpi, such changes should become quite rare.
+
+Applications interfacing to the thinkpad-acpi sysfs interfaces must
+follow all sysfs guidelines and correctly process all errors (the sysfs
+interface makes extensive use of errors).  File descriptors and open /
+close operations to the sysfs inodes must also be properly implemented.
+
+The version of thinkpad-acpi's sysfs interface is exported by the driver
+as a driver attribute (see below).
+
+Sysfs driver attributes are on the driver's sysfs attribute space,
+for 2.6.23+ this is /sys/bus/platform/drivers/thinkpad_acpi/ and
+/sys/bus/platform/drivers/thinkpad_hwmon/
+
+Sysfs device attributes are on the thinkpad_acpi device sysfs attribute
+space, for 2.6.23+ this is /sys/devices/platform/thinkpad_acpi/.
+
+Sysfs device attributes for the sensors and fan are on the
+thinkpad_hwmon device's sysfs attribute space, but you should locate it
+looking for a hwmon device with the name attribute of "thinkpad", or
+better yet, through libsensors. For 4.14+ sysfs attributes were moved to the
+hwmon device (/sys/bus/platform/devices/thinkpad_hwmon/hwmon/hwmon? or
+/sys/class/hwmon/hwmon?).
+
+Driver version
+--------------
+
+procfs: /proc/acpi/ibm/driver
+
+sysfs driver attribute: version
+
+The driver name and version. No commands can be written to this file.
+
+
+Sysfs interface version
+-----------------------
+
+sysfs driver attribute: interface_version
+
+Version of the thinkpad-acpi sysfs interface, as an unsigned long
+(output in hex format: 0xAAAABBCC), where:
+
+	AAAA
+	  - major revision
+	BB
+	  - minor revision
+	CC
+	  - bugfix revision
+
+The sysfs interface version changelog for the driver can be found at the
+end of this document.  Changes to the sysfs interface done by the kernel
+subsystems are not documented here, nor are they tracked by this
+attribute.
+
+Changes to the thinkpad-acpi sysfs interface are only considered
+non-experimental when they are submitted to Linux mainline, at which
+point the changes in this interface are documented and interface_version
+may be updated.  If you are using any thinkpad-acpi features not yet
+sent to mainline for merging, you do so on your own risk: these features
+may disappear, or be implemented in a different and incompatible way by
+the time they are merged in Linux mainline.
+
+Changes that are backwards-compatible by nature (e.g. the addition of
+attributes that do not change the way the other attributes work) do not
+always warrant an update of interface_version.  Therefore, one must
+expect that an attribute might not be there, and deal with it properly
+(an attribute not being there *is* a valid way to make it clear that a
+feature is not available in sysfs).
+
+
+Hot keys
+--------
+
+procfs: /proc/acpi/ibm/hotkey
+
+sysfs device attribute: hotkey_*
+
+In a ThinkPad, the ACPI HKEY handler is responsible for communicating
+some important events and also keyboard hot key presses to the operating
+system.  Enabling the hotkey functionality of thinkpad-acpi signals the
+firmware that such a driver is present, and modifies how the ThinkPad
+firmware will behave in many situations.
+
+The driver enables the HKEY ("hot key") event reporting automatically
+when loaded, and disables it when it is removed.
+
+The driver will report HKEY events in the following format::
+
+	ibm/hotkey HKEY 00000080 0000xxxx
+
+Some of these events refer to hot key presses, but not all of them.
+
+The driver will generate events over the input layer for hot keys and
+radio switches, and over the ACPI netlink layer for other events.  The
+input layer support accepts the standard IOCTLs to remap the keycodes
+assigned to each hot key.
+
+The hot key bit mask allows some control over which hot keys generate
+events.  If a key is "masked" (bit set to 0 in the mask), the firmware
+will handle it.  If it is "unmasked", it signals the firmware that
+thinkpad-acpi would prefer to handle it, if the firmware would be so
+kind to allow it (and it often doesn't!).
+
+Not all bits in the mask can be modified.  Not all bits that can be
+modified do anything.  Not all hot keys can be individually controlled
+by the mask.  Some models do not support the mask at all.  The behaviour
+of the mask is, therefore, highly dependent on the ThinkPad model.
+
+The driver will filter out any unmasked hotkeys, so even if the firmware
+doesn't allow disabling an specific hotkey, the driver will not report
+events for unmasked hotkeys.
+
+Note that unmasking some keys prevents their default behavior.  For
+example, if Fn+F5 is unmasked, that key will no longer enable/disable
+Bluetooth by itself in firmware.
+
+Note also that not all Fn key combinations are supported through ACPI
+depending on the ThinkPad model and firmware version.  On those
+ThinkPads, it is still possible to support some extra hotkeys by
+polling the "CMOS NVRAM" at least 10 times per second.  The driver
+attempts to enables this functionality automatically when required.
+
+procfs notes
+^^^^^^^^^^^^
+
+The following commands can be written to the /proc/acpi/ibm/hotkey file::
+
+	echo 0xffffffff > /proc/acpi/ibm/hotkey -- enable all hot keys
+	echo 0 > /proc/acpi/ibm/hotkey -- disable all possible hot keys
+	... any other 8-hex-digit mask ...
+	echo reset > /proc/acpi/ibm/hotkey -- restore the recommended mask
+
+The following commands have been deprecated and will cause the kernel
+to log a warning::
+
+	echo enable > /proc/acpi/ibm/hotkey -- does nothing
+	echo disable > /proc/acpi/ibm/hotkey -- returns an error
+
+The procfs interface does not support NVRAM polling control.  So as to
+maintain maximum bug-to-bug compatibility, it does not report any masks,
+nor does it allow one to manipulate the hot key mask when the firmware
+does not support masks at all, even if NVRAM polling is in use.
+
+sysfs notes
+^^^^^^^^^^^
+
+	hotkey_bios_enabled:
+		DEPRECATED, WILL BE REMOVED SOON.
+
+		Returns 0.
+
+	hotkey_bios_mask:
+		DEPRECATED, DON'T USE, WILL BE REMOVED IN THE FUTURE.
+
+		Returns the hot keys mask when thinkpad-acpi was loaded.
+		Upon module unload, the hot keys mask will be restored
+		to this value.   This is always 0x80c, because those are
+		the hotkeys that were supported by ancient firmware
+		without mask support.
+
+	hotkey_enable:
+		DEPRECATED, WILL BE REMOVED SOON.
+
+		0: returns -EPERM
+		1: does nothing
+
+	hotkey_mask:
+		bit mask to enable reporting (and depending on
+		the firmware, ACPI event generation) for each hot key
+		(see above).  Returns the current status of the hot keys
+		mask, and allows one to modify it.
+
+	hotkey_all_mask:
+		bit mask that should enable event reporting for all
+		supported hot keys, when echoed to hotkey_mask above.
+		Unless you know which events need to be handled
+		passively (because the firmware *will* handle them
+		anyway), do *not* use hotkey_all_mask.  Use
+		hotkey_recommended_mask, instead. You have been warned.
+
+	hotkey_recommended_mask:
+		bit mask that should enable event reporting for all
+		supported hot keys, except those which are always
+		handled by the firmware anyway.  Echo it to
+		hotkey_mask above, to use.  This is the default mask
+		used by the driver.
+
+	hotkey_source_mask:
+		bit mask that selects which hot keys will the driver
+		poll the NVRAM for.  This is auto-detected by the driver
+		based on the capabilities reported by the ACPI firmware,
+		but it can be overridden at runtime.
+
+		Hot keys whose bits are set in hotkey_source_mask are
+		polled for in NVRAM, and reported as hotkey events if
+		enabled in hotkey_mask.  Only a few hot keys are
+		available through CMOS NVRAM polling.
+
+		Warning: when in NVRAM mode, the volume up/down/mute
+		keys are synthesized according to changes in the mixer,
+		which uses a single volume up or volume down hotkey
+		press to unmute, as per the ThinkPad volume mixer user
+		interface.  When in ACPI event mode, volume up/down/mute
+		events are reported by the firmware and can behave
+		differently (and that behaviour changes with firmware
+		version -- not just with firmware models -- as well as
+		OSI(Linux) state).
+
+	hotkey_poll_freq:
+		frequency in Hz for hot key polling. It must be between
+		0 and 25 Hz.  Polling is only carried out when strictly
+		needed.
+
+		Setting hotkey_poll_freq to zero disables polling, and
+		will cause hot key presses that require NVRAM polling
+		to never be reported.
+
+		Setting hotkey_poll_freq too low may cause repeated
+		pressings of the same hot key to be misreported as a
+		single key press, or to not even be detected at all.
+		The recommended polling frequency is 10Hz.
+
+	hotkey_radio_sw:
+		If the ThinkPad has a hardware radio switch, this
+		attribute will read 0 if the switch is in the "radios
+		disabled" position, and 1 if the switch is in the
+		"radios enabled" position.
+
+		This attribute has poll()/select() support.
+
+	hotkey_tablet_mode:
+		If the ThinkPad has tablet capabilities, this attribute
+		will read 0 if the ThinkPad is in normal mode, and
+		1 if the ThinkPad is in tablet mode.
+
+		This attribute has poll()/select() support.
+
+	wakeup_reason:
+		Set to 1 if the system is waking up because the user
+		requested a bay ejection.  Set to 2 if the system is
+		waking up because the user requested the system to
+		undock.  Set to zero for normal wake-ups or wake-ups
+		due to unknown reasons.
+
+		This attribute has poll()/select() support.
+
+	wakeup_hotunplug_complete:
+		Set to 1 if the system was waken up because of an
+		undock or bay ejection request, and that request
+		was successfully completed.  At this point, it might
+		be useful to send the system back to sleep, at the
+		user's choice.  Refer to HKEY events 0x4003 and
+		0x3003, below.
+
+		This attribute has poll()/select() support.
+
+input layer notes
+^^^^^^^^^^^^^^^^^
+
+A Hot key is mapped to a single input layer EV_KEY event, possibly
+followed by an EV_MSC MSC_SCAN event that shall contain that key's scan
+code.  An EV_SYN event will always be generated to mark the end of the
+event block.
+
+Do not use the EV_MSC MSC_SCAN events to process keys.  They are to be
+used as a helper to remap keys, only.  They are particularly useful when
+remapping KEY_UNKNOWN keys.
+
+The events are available in an input device, with the following id:
+
+	==============  ==============================
+	Bus		BUS_HOST
+	vendor		0x1014 (PCI_VENDOR_ID_IBM)  or
+			0x17aa (PCI_VENDOR_ID_LENOVO)
+	product		0x5054 ("TP")
+	version		0x4101
+	==============  ==============================
+
+The version will have its LSB incremented if the keymap changes in a
+backwards-compatible way.  The MSB shall always be 0x41 for this input
+device.  If the MSB is not 0x41, do not use the device as described in
+this section, as it is either something else (e.g. another input device
+exported by a thinkpad driver, such as HDAPS) or its functionality has
+been changed in a non-backwards compatible way.
+
+Adding other event types for other functionalities shall be considered a
+backwards-compatible change for this input device.
+
+Thinkpad-acpi Hot Key event map (version 0x4101):
+
+=======	=======	==============	==============================================
+ACPI	Scan
+event	code	Key		Notes
+=======	=======	==============	==============================================
+0x1001	0x00	FN+F1		-
+
+0x1002	0x01	FN+F2		IBM: battery (rare)
+				Lenovo: Screen lock
+
+0x1003	0x02	FN+F3		Many IBM models always report
+				this hot key, even with hot keys
+				disabled or with Fn+F3 masked
+				off
+				IBM: screen lock, often turns
+				off the ThinkLight as side-effect
+				Lenovo: battery
+
+0x1004	0x03	FN+F4		Sleep button (ACPI sleep button
+				semantics, i.e. sleep-to-RAM).
+				It always generates some kind
+				of event, either the hot key
+				event or an ACPI sleep button
+				event. The firmware may
+				refuse to generate further FN+F4
+				key presses until a S3 or S4 ACPI
+				sleep cycle is performed or some
+				time passes.
+
+0x1005	0x04	FN+F5		Radio.  Enables/disables
+				the internal Bluetooth hardware
+				and W-WAN card if left in control
+				of the firmware.  Does not affect
+				the WLAN card.
+				Should be used to turn on/off all
+				radios (Bluetooth+W-WAN+WLAN),
+				really.
+
+0x1006	0x05	FN+F6		-
+
+0x1007	0x06	FN+F7		Video output cycle.
+				Do you feel lucky today?
+
+0x1008	0x07	FN+F8		IBM: toggle screen expand
+				Lenovo: configure UltraNav,
+				or toggle screen expand
+
+0x1009	0x08	FN+F9		-
+
+...	...	...		...
+
+0x100B	0x0A	FN+F11		-
+
+0x100C	0x0B	FN+F12		Sleep to disk.  You are always
+				supposed to handle it yourself,
+				either through the ACPI event,
+				or through a hotkey event.
+				The firmware may refuse to
+				generate further FN+F12 key
+				press events until a S3 or S4
+				ACPI sleep cycle is performed,
+				or some time passes.
+
+0x100D	0x0C	FN+BACKSPACE	-
+0x100E	0x0D	FN+INSERT	-
+0x100F	0x0E	FN+DELETE	-
+
+0x1010	0x0F	FN+HOME		Brightness up.  This key is
+				always handled by the firmware
+				in IBM ThinkPads, even when
+				unmasked.  Just leave it alone.
+				For Lenovo ThinkPads with a new
+				BIOS, it has to be handled either
+				by the ACPI OSI, or by userspace.
+				The driver does the right thing,
+				never mess with this.
+0x1011	0x10	FN+END		Brightness down.  See brightness
+				up for details.
+
+0x1012	0x11	FN+PGUP		ThinkLight toggle.  This key is
+				always handled by the firmware,
+				even when unmasked.
+
+0x1013	0x12	FN+PGDOWN	-
+
+0x1014	0x13	FN+SPACE	Zoom key
+
+0x1015	0x14	VOLUME UP	Internal mixer volume up. This
+				key is always handled by the
+				firmware, even when unmasked.
+				NOTE: Lenovo seems to be changing
+				this.
+0x1016	0x15	VOLUME DOWN	Internal mixer volume up. This
+				key is always handled by the
+				firmware, even when unmasked.
+				NOTE: Lenovo seems to be changing
+				this.
+0x1017	0x16	MUTE		Mute internal mixer. This
+				key is always handled by the
+				firmware, even when unmasked.
+
+0x1018	0x17	THINKPAD	ThinkPad/Access IBM/Lenovo key
+
+0x1019	0x18	unknown
+
+...	...	...
+
+0x1020	0x1F	unknown
+=======	=======	==============	==============================================
+
+The ThinkPad firmware does not allow one to differentiate when most hot
+keys are pressed or released (either that, or we don't know how to, yet).
+For these keys, the driver generates a set of events for a key press and
+immediately issues the same set of events for a key release.  It is
+unknown by the driver if the ThinkPad firmware triggered these events on
+hot key press or release, but the firmware will do it for either one, not
+both.
+
+If a key is mapped to KEY_RESERVED, it generates no input events at all.
+If a key is mapped to KEY_UNKNOWN, it generates an input event that
+includes an scan code.  If a key is mapped to anything else, it will
+generate input device EV_KEY events.
+
+In addition to the EV_KEY events, thinkpad-acpi may also issue EV_SW
+events for switches:
+
+==============	==============================================
+SW_RFKILL_ALL	T60 and later hardware rfkill rocker switch
+SW_TABLET_MODE	Tablet ThinkPads HKEY events 0x5009 and 0x500A
+==============	==============================================
+
+Non hotkey ACPI HKEY event map
+------------------------------
+
+Events that are never propagated by the driver:
+
+======		==================================================
+0x2304		System is waking up from suspend to undock
+0x2305		System is waking up from suspend to eject bay
+0x2404		System is waking up from hibernation to undock
+0x2405		System is waking up from hibernation to eject bay
+0x5001		Lid closed
+0x5002		Lid opened
+0x5009		Tablet swivel: switched to tablet mode
+0x500A		Tablet swivel: switched to normal mode
+0x5010		Brightness level changed/control event
+0x6000		KEYBOARD: Numlock key pressed
+0x6005		KEYBOARD: Fn key pressed (TO BE VERIFIED)
+0x7000		Radio Switch may have changed state
+======		==================================================
+
+
+Events that are propagated by the driver to userspace:
+
+======		=====================================================
+0x2313		ALARM: System is waking up from suspend because
+		the battery is nearly empty
+0x2413		ALARM: System is waking up from hibernation because
+		the battery is nearly empty
+0x3003		Bay ejection (see 0x2x05) complete, can sleep again
+0x3006		Bay hotplug request (hint to power up SATA link when
+		the optical drive tray is ejected)
+0x4003		Undocked (see 0x2x04), can sleep again
+0x4010		Docked into hotplug port replicator (non-ACPI dock)
+0x4011		Undocked from hotplug port replicator (non-ACPI dock)
+0x500B		Tablet pen inserted into its storage bay
+0x500C		Tablet pen removed from its storage bay
+0x6011		ALARM: battery is too hot
+0x6012		ALARM: battery is extremely hot
+0x6021		ALARM: a sensor is too hot
+0x6022		ALARM: a sensor is extremely hot
+0x6030		System thermal table changed
+0x6032		Thermal Control command set completion  (DYTC, Windows)
+0x6040		Nvidia Optimus/AC adapter related (TO BE VERIFIED)
+0x60C0		X1 Yoga 2016, Tablet mode status changed
+0x60F0		Thermal Transformation changed (GMTS, Windows)
+======		=====================================================
+
+Battery nearly empty alarms are a last resort attempt to get the
+operating system to hibernate or shutdown cleanly (0x2313), or shutdown
+cleanly (0x2413) before power is lost.  They must be acted upon, as the
+wake up caused by the firmware will have negated most safety nets...
+
+When any of the "too hot" alarms happen, according to Lenovo the user
+should suspend or hibernate the laptop (and in the case of battery
+alarms, unplug the AC adapter) to let it cool down.  These alarms do
+signal that something is wrong, they should never happen on normal
+operating conditions.
+
+The "extremely hot" alarms are emergencies.  According to Lenovo, the
+operating system is to force either an immediate suspend or hibernate
+cycle, or a system shutdown.  Obviously, something is very wrong if this
+happens.
+
+
+Brightness hotkey notes
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Don't mess with the brightness hotkeys in a Thinkpad.  If you want
+notifications for OSD, use the sysfs backlight class event support.
+
+The driver will issue KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN events
+automatically for the cases were userspace has to do something to
+implement brightness changes.  When you override these events, you will
+either fail to handle properly the ThinkPads that require explicit
+action to change backlight brightness, or the ThinkPads that require
+that no action be taken to work properly.
+
+
+Bluetooth
+---------
+
+procfs: /proc/acpi/ibm/bluetooth
+
+sysfs device attribute: bluetooth_enable (deprecated)
+
+sysfs rfkill class: switch "tpacpi_bluetooth_sw"
+
+This feature shows the presence and current state of a ThinkPad
+Bluetooth device in the internal ThinkPad CDC slot.
+
+If the ThinkPad supports it, the Bluetooth state is stored in NVRAM,
+so it is kept across reboots and power-off.
+
+Procfs notes
+^^^^^^^^^^^^
+
+If Bluetooth is installed, the following commands can be used::
+
+	echo enable > /proc/acpi/ibm/bluetooth
+	echo disable > /proc/acpi/ibm/bluetooth
+
+Sysfs notes
+^^^^^^^^^^^
+
+	If the Bluetooth CDC card is installed, it can be enabled /
+	disabled through the "bluetooth_enable" thinkpad-acpi device
+	attribute, and its current status can also be queried.
+
+	enable:
+
+		- 0: disables Bluetooth / Bluetooth is disabled
+		- 1: enables Bluetooth / Bluetooth is enabled.
+
+	Note: this interface has been superseded by the	generic rfkill
+	class.  It has been deprecated, and it will be removed in year
+	2010.
+
+	rfkill controller switch "tpacpi_bluetooth_sw": refer to
+	Documentation/driver-api/rfkill.rst for details.
+
+
+Video output control -- /proc/acpi/ibm/video
+--------------------------------------------
+
+This feature allows control over the devices used for video output -
+LCD, CRT or DVI (if available). The following commands are available::
+
+	echo lcd_enable > /proc/acpi/ibm/video
+	echo lcd_disable > /proc/acpi/ibm/video
+	echo crt_enable > /proc/acpi/ibm/video
+	echo crt_disable > /proc/acpi/ibm/video
+	echo dvi_enable > /proc/acpi/ibm/video
+	echo dvi_disable > /proc/acpi/ibm/video
+	echo auto_enable > /proc/acpi/ibm/video
+	echo auto_disable > /proc/acpi/ibm/video
+	echo expand_toggle > /proc/acpi/ibm/video
+	echo video_switch > /proc/acpi/ibm/video
+
+NOTE:
+  Access to this feature is restricted to processes owning the
+  CAP_SYS_ADMIN capability for safety reasons, as it can interact badly
+  enough with some versions of X.org to crash it.
+
+Each video output device can be enabled or disabled individually.
+Reading /proc/acpi/ibm/video shows the status of each device.
+
+Automatic video switching can be enabled or disabled.  When automatic
+video switching is enabled, certain events (e.g. opening the lid,
+docking or undocking) cause the video output device to change
+automatically. While this can be useful, it also causes flickering
+and, on the X40, video corruption. By disabling automatic switching,
+the flickering or video corruption can be avoided.
+
+The video_switch command cycles through the available video outputs
+(it simulates the behavior of Fn-F7).
+
+Video expansion can be toggled through this feature. This controls
+whether the display is expanded to fill the entire LCD screen when a
+mode with less than full resolution is used. Note that the current
+video expansion status cannot be determined through this feature.
+
+Note that on many models (particularly those using Radeon graphics
+chips) the X driver configures the video card in a way which prevents
+Fn-F7 from working. This also disables the video output switching
+features of this driver, as it uses the same ACPI methods as
+Fn-F7. Video switching on the console should still work.
+
+UPDATE: refer to https://bugs.freedesktop.org/show_bug.cgi?id=2000
+
+
+ThinkLight control
+------------------
+
+procfs: /proc/acpi/ibm/light
+
+sysfs attributes: as per LED class, for the "tpacpi::thinklight" LED
+
+procfs notes
+^^^^^^^^^^^^
+
+The ThinkLight status can be read and set through the procfs interface.  A
+few models which do not make the status available will show the ThinkLight
+status as "unknown". The available commands are::
+
+	echo on  > /proc/acpi/ibm/light
+	echo off > /proc/acpi/ibm/light
+
+sysfs notes
+^^^^^^^^^^^
+
+The ThinkLight sysfs interface is documented by the LED class
+documentation, in Documentation/leds/leds-class.rst.  The ThinkLight LED name
+is "tpacpi::thinklight".
+
+Due to limitations in the sysfs LED class, if the status of the ThinkLight
+cannot be read or if it is unknown, thinkpad-acpi will report it as "off".
+It is impossible to know if the status returned through sysfs is valid.
+
+
+CMOS/UCMS control
+-----------------
+
+procfs: /proc/acpi/ibm/cmos
+
+sysfs device attribute: cmos_command
+
+This feature is mostly used internally by the ACPI firmware to keep the legacy
+CMOS NVRAM bits in sync with the current machine state, and to record this
+state so that the ThinkPad will retain such settings across reboots.
+
+Some of these commands actually perform actions in some ThinkPad models, but
+this is expected to disappear more and more in newer models.  As an example, in
+a T43 and in a X40, commands 12 and 13 still control the ThinkLight state for
+real, but commands 0 to 2 don't control the mixer anymore (they have been
+phased out) and just update the NVRAM.
+
+The range of valid cmos command numbers is 0 to 21, but not all have an
+effect and the behavior varies from model to model.  Here is the behavior
+on the X40 (tpb is the ThinkPad Buttons utility):
+
+	- 0 - Related to "Volume down" key press
+	- 1 - Related to "Volume up" key press
+	- 2 - Related to "Mute on" key press
+	- 3 - Related to "Access IBM" key press
+	- 4 - Related to "LCD brightness up" key press
+	- 5 - Related to "LCD brightness down" key press
+	- 11 - Related to "toggle screen expansion" key press/function
+	- 12 - Related to "ThinkLight on"
+	- 13 - Related to "ThinkLight off"
+	- 14 - Related to "ThinkLight" key press (toggle ThinkLight)
+
+The cmos command interface is prone to firmware split-brain problems, as
+in newer ThinkPads it is just a compatibility layer.  Do not use it, it is
+exported just as a debug tool.
+
+
+LED control
+-----------
+
+procfs: /proc/acpi/ibm/led
+sysfs attributes: as per LED class, see below for names
+
+Some of the LED indicators can be controlled through this feature.  On
+some older ThinkPad models, it is possible to query the status of the
+LED indicators as well.  Newer ThinkPads cannot query the real status
+of the LED indicators.
+
+Because misuse of the LEDs could induce an unaware user to perform
+dangerous actions (like undocking or ejecting a bay device while the
+buses are still active), or mask an important alarm (such as a nearly
+empty battery, or a broken battery), access to most LEDs is
+restricted.
+
+Unrestricted access to all LEDs requires that thinkpad-acpi be
+compiled with the CONFIG_THINKPAD_ACPI_UNSAFE_LEDS option enabled.
+Distributions must never enable this option.  Individual users that
+are aware of the consequences are welcome to enabling it.
+
+Audio mute and microphone mute LEDs are supported, but currently not
+visible to userspace. They are used by the snd-hda-intel audio driver.
+
+procfs notes
+^^^^^^^^^^^^
+
+The available commands are::
+
+	echo '<LED number> on' >/proc/acpi/ibm/led
+	echo '<LED number> off' >/proc/acpi/ibm/led
+	echo '<LED number> blink' >/proc/acpi/ibm/led
+
+The <LED number> range is 0 to 15. The set of LEDs that can be
+controlled varies from model to model. Here is the common ThinkPad
+mapping:
+
+	- 0 - power
+	- 1 - battery (orange)
+	- 2 - battery (green)
+	- 3 - UltraBase/dock
+	- 4 - UltraBay
+	- 5 - UltraBase battery slot
+	- 6 - (unknown)
+	- 7 - standby
+	- 8 - dock status 1
+	- 9 - dock status 2
+	- 10, 11 - (unknown)
+	- 12 - thinkvantage
+	- 13, 14, 15 - (unknown)
+
+All of the above can be turned on and off and can be made to blink.
+
+sysfs notes
+^^^^^^^^^^^
+
+The ThinkPad LED sysfs interface is described in detail by the LED class
+documentation, in Documentation/leds/leds-class.rst.
+
+The LEDs are named (in LED ID order, from 0 to 12):
+"tpacpi::power", "tpacpi:orange:batt", "tpacpi:green:batt",
+"tpacpi::dock_active", "tpacpi::bay_active", "tpacpi::dock_batt",
+"tpacpi::unknown_led", "tpacpi::standby", "tpacpi::dock_status1",
+"tpacpi::dock_status2", "tpacpi::unknown_led2", "tpacpi::unknown_led3",
+"tpacpi::thinkvantage".
+
+Due to limitations in the sysfs LED class, if the status of the LED
+indicators cannot be read due to an error, thinkpad-acpi will report it as
+a brightness of zero (same as LED off).
+
+If the thinkpad firmware doesn't support reading the current status,
+trying to read the current LED brightness will just return whatever
+brightness was last written to that attribute.
+
+These LEDs can blink using hardware acceleration.  To request that a
+ThinkPad indicator LED should blink in hardware accelerated mode, use the
+"timer" trigger, and leave the delay_on and delay_off parameters set to
+zero (to request hardware acceleration autodetection).
+
+LEDs that are known not to exist in a given ThinkPad model are not
+made available through the sysfs interface.  If you have a dock and you
+notice there are LEDs listed for your ThinkPad that do not exist (and
+are not in the dock), or if you notice that there are missing LEDs,
+a report to ibm-acpi-devel@lists.sourceforge.net is appreciated.
+
+
+ACPI sounds -- /proc/acpi/ibm/beep
+----------------------------------
+
+The BEEP method is used internally by the ACPI firmware to provide
+audible alerts in various situations. This feature allows the same
+sounds to be triggered manually.
+
+The commands are non-negative integer numbers::
+
+	echo <number> >/proc/acpi/ibm/beep
+
+The valid <number> range is 0 to 17. Not all numbers trigger sounds
+and the sounds vary from model to model. Here is the behavior on the
+X40:
+
+	- 0 - stop a sound in progress (but use 17 to stop 16)
+	- 2 - two beeps, pause, third beep ("low battery")
+	- 3 - single beep
+	- 4 - high, followed by low-pitched beep ("unable")
+	- 5 - single beep
+	- 6 - very high, followed by high-pitched beep ("AC/DC")
+	- 7 - high-pitched beep
+	- 9 - three short beeps
+	- 10 - very long beep
+	- 12 - low-pitched beep
+	- 15 - three high-pitched beeps repeating constantly, stop with 0
+	- 16 - one medium-pitched beep repeating constantly, stop with 17
+	- 17 - stop 16
+
+
+Temperature sensors
+-------------------
+
+procfs: /proc/acpi/ibm/thermal
+
+sysfs device attributes: (hwmon "thinkpad") temp*_input
+
+Most ThinkPads include six or more separate temperature sensors but only
+expose the CPU temperature through the standard ACPI methods.  This
+feature shows readings from up to eight different sensors on older
+ThinkPads, and up to sixteen different sensors on newer ThinkPads.
+
+For example, on the X40, a typical output may be:
+
+temperatures:
+	42 42 45 41 36 -128 33 -128
+
+On the T43/p, a typical output may be:
+
+temperatures:
+	48 48 36 52 38 -128 31 -128 48 52 48 -128 -128 -128 -128 -128
+
+The mapping of thermal sensors to physical locations varies depending on
+system-board model (and thus, on ThinkPad model).
+
+http://thinkwiki.org/wiki/Thermal_Sensors is a public wiki page that
+tries to track down these locations for various models.
+
+Most (newer?) models seem to follow this pattern:
+
+- 1:  CPU
+- 2:  (depends on model)
+- 3:  (depends on model)
+- 4:  GPU
+- 5:  Main battery: main sensor
+- 6:  Bay battery: main sensor
+- 7:  Main battery: secondary sensor
+- 8:  Bay battery: secondary sensor
+- 9-15: (depends on model)
+
+For the R51 (source: Thomas Gruber):
+
+- 2:  Mini-PCI
+- 3:  Internal HDD
+
+For the T43, T43/p (source: Shmidoax/Thinkwiki.org)
+http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_T43.2C_T43p
+
+- 2:  System board, left side (near PCMCIA slot), reported as HDAPS temp
+- 3:  PCMCIA slot
+- 9:  MCH (northbridge) to DRAM Bus
+- 10: Clock-generator, mini-pci card and ICH (southbridge), under Mini-PCI
+      card, under touchpad
+- 11: Power regulator, underside of system board, below F2 key
+
+The A31 has a very atypical layout for the thermal sensors
+(source: Milos Popovic, http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_A31)
+
+- 1:  CPU
+- 2:  Main Battery: main sensor
+- 3:  Power Converter
+- 4:  Bay Battery: main sensor
+- 5:  MCH (northbridge)
+- 6:  PCMCIA/ambient
+- 7:  Main Battery: secondary sensor
+- 8:  Bay Battery: secondary sensor
+
+
+Procfs notes
+^^^^^^^^^^^^
+
+	Readings from sensors that are not available return -128.
+	No commands can be written to this file.
+
+Sysfs notes
+^^^^^^^^^^^
+
+	Sensors that are not available return the ENXIO error.  This
+	status may change at runtime, as there are hotplug thermal
+	sensors, like those inside the batteries and docks.
+
+	thinkpad-acpi thermal sensors are reported through the hwmon
+	subsystem, and follow all of the hwmon guidelines at
+	Documentation/hwmon.
+
+EXPERIMENTAL: Embedded controller register dump
+-----------------------------------------------
+
+This feature is not included in the thinkpad driver anymore.
+Instead the EC can be accessed through /sys/kernel/debug/ec with
+a userspace tool which can be found here:
+ftp://ftp.suse.com/pub/people/trenn/sources/ec
+
+Use it to determine the register holding the fan
+speed on some models. To do that, do the following:
+
+	- make sure the battery is fully charged
+	- make sure the fan is running
+	- use above mentioned tool to read out the EC
+
+Often fan and temperature values vary between
+readings. Since temperatures don't change vary fast, you can take
+several quick dumps to eliminate them.
+
+You can use a similar method to figure out the meaning of other
+embedded controller registers - e.g. make sure nothing else changes
+except the charging or discharging battery to determine which
+registers contain the current battery capacity, etc. If you experiment
+with this, do send me your results (including some complete dumps with
+a description of the conditions when they were taken.)
+
+
+LCD brightness control
+----------------------
+
+procfs: /proc/acpi/ibm/brightness
+
+sysfs backlight device "thinkpad_screen"
+
+This feature allows software control of the LCD brightness on ThinkPad
+models which don't have a hardware brightness slider.
+
+It has some limitations: the LCD backlight cannot be actually turned
+on or off by this interface, it just controls the backlight brightness
+level.
+
+On IBM (and some of the earlier Lenovo) ThinkPads, the backlight control
+has eight brightness levels, ranging from 0 to 7.  Some of the levels
+may not be distinct.  Later Lenovo models that implement the ACPI
+display backlight brightness control methods have 16 levels, ranging
+from 0 to 15.
+
+For IBM ThinkPads, there are two interfaces to the firmware for direct
+brightness control, EC and UCMS (or CMOS).  To select which one should be
+used, use the brightness_mode module parameter: brightness_mode=1 selects
+EC mode, brightness_mode=2 selects UCMS mode, brightness_mode=3 selects EC
+mode with NVRAM backing (so that brightness changes are remembered across
+shutdown/reboot).
+
+The driver tries to select which interface to use from a table of
+defaults for each ThinkPad model.  If it makes a wrong choice, please
+report this as a bug, so that we can fix it.
+
+Lenovo ThinkPads only support brightness_mode=2 (UCMS).
+
+When display backlight brightness controls are available through the
+standard ACPI interface, it is best to use it instead of this direct
+ThinkPad-specific interface.  The driver will disable its native
+backlight brightness control interface if it detects that the standard
+ACPI interface is available in the ThinkPad.
+
+If you want to use the thinkpad-acpi backlight brightness control
+instead of the generic ACPI video backlight brightness control for some
+reason, you should use the acpi_backlight=vendor kernel parameter.
+
+The brightness_enable module parameter can be used to control whether
+the LCD brightness control feature will be enabled when available.
+brightness_enable=0 forces it to be disabled.  brightness_enable=1
+forces it to be enabled when available, even if the standard ACPI
+interface is also available.
+
+Procfs notes
+^^^^^^^^^^^^
+
+The available commands are::
+
+	echo up   >/proc/acpi/ibm/brightness
+	echo down >/proc/acpi/ibm/brightness
+	echo 'level <level>' >/proc/acpi/ibm/brightness
+
+Sysfs notes
+^^^^^^^^^^^
+
+The interface is implemented through the backlight sysfs class, which is
+poorly documented at this time.
+
+Locate the thinkpad_screen device under /sys/class/backlight, and inside
+it there will be the following attributes:
+
+	max_brightness:
+		Reads the maximum brightness the hardware can be set to.
+		The minimum is always zero.
+
+	actual_brightness:
+		Reads what brightness the screen is set to at this instant.
+
+	brightness:
+		Writes request the driver to change brightness to the
+		given value.  Reads will tell you what brightness the
+		driver is trying to set the display to when "power" is set
+		to zero and the display has not been dimmed by a kernel
+		power management event.
+
+	power:
+		power management mode, where 0 is "display on", and 1 to 3
+		will dim the display backlight to brightness level 0
+		because thinkpad-acpi cannot really turn the backlight
+		off.  Kernel power management events can temporarily
+		increase the current power management level, i.e. they can
+		dim the display.
+
+
+WARNING:
+
+    Whatever you do, do NOT ever call thinkpad-acpi backlight-level change
+    interface and the ACPI-based backlight level change interface
+    (available on newer BIOSes, and driven by the Linux ACPI video driver)
+    at the same time.  The two will interact in bad ways, do funny things,
+    and maybe reduce the life of the backlight lamps by needlessly kicking
+    its level up and down at every change.
+
+
+Volume control (Console Audio control)
+--------------------------------------
+
+procfs: /proc/acpi/ibm/volume
+
+ALSA: "ThinkPad Console Audio Control", default ID: "ThinkPadEC"
+
+NOTE: by default, the volume control interface operates in read-only
+mode, as it is supposed to be used for on-screen-display purposes.
+The read/write mode can be enabled through the use of the
+"volume_control=1" module parameter.
+
+NOTE: distros are urged to not enable volume_control by default, this
+should be done by the local admin only.  The ThinkPad UI is for the
+console audio control to be done through the volume keys only, and for
+the desktop environment to just provide on-screen-display feedback.
+Software volume control should be done only in the main AC97/HDA
+mixer.
+
+
+About the ThinkPad Console Audio control
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ThinkPads have a built-in amplifier and muting circuit that drives the
+console headphone and speakers.  This circuit is after the main AC97
+or HDA mixer in the audio path, and under exclusive control of the
+firmware.
+
+ThinkPads have three special hotkeys to interact with the console
+audio control: volume up, volume down and mute.
+
+It is worth noting that the normal way the mute function works (on
+ThinkPads that do not have a "mute LED") is:
+
+1. Press mute to mute.  It will *always* mute, you can press it as
+   many times as you want, and the sound will remain mute.
+
+2. Press either volume key to unmute the ThinkPad (it will _not_
+   change the volume, it will just unmute).
+
+This is a very superior design when compared to the cheap software-only
+mute-toggle solution found on normal consumer laptops:  you can be
+absolutely sure the ThinkPad will not make noise if you press the mute
+button, no matter the previous state.
+
+The IBM ThinkPads, and the earlier Lenovo ThinkPads have variable-gain
+amplifiers driving the speakers and headphone output, and the firmware
+also handles volume control for the headphone and speakers on these
+ThinkPads without any help from the operating system (this volume
+control stage exists after the main AC97 or HDA mixer in the audio
+path).
+
+The newer Lenovo models only have firmware mute control, and depend on
+the main HDA mixer to do volume control (which is done by the operating
+system).  In this case, the volume keys are filtered out for unmute
+key press (there are some firmware bugs in this area) and delivered as
+normal key presses to the operating system (thinkpad-acpi is not
+involved).
+
+
+The ThinkPad-ACPI volume control
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The preferred way to interact with the Console Audio control is the
+ALSA interface.
+
+The legacy procfs interface allows one to read the current state,
+and if volume control is enabled, accepts the following commands::
+
+	echo up   >/proc/acpi/ibm/volume
+	echo down >/proc/acpi/ibm/volume
+	echo mute >/proc/acpi/ibm/volume
+	echo unmute >/proc/acpi/ibm/volume
+	echo 'level <level>' >/proc/acpi/ibm/volume
+
+The <level> number range is 0 to 14 although not all of them may be
+distinct. To unmute the volume after the mute command, use either the
+up or down command (the level command will not unmute the volume), or
+the unmute command.
+
+You can use the volume_capabilities parameter to tell the driver
+whether your thinkpad has volume control or mute-only control:
+volume_capabilities=1 for mixers with mute and volume control,
+volume_capabilities=2 for mixers with only mute control.
+
+If the driver misdetects the capabilities for your ThinkPad model,
+please report this to ibm-acpi-devel@lists.sourceforge.net, so that we
+can update the driver.
+
+There are two strategies for volume control.  To select which one
+should be used, use the volume_mode module parameter: volume_mode=1
+selects EC mode, and volume_mode=3 selects EC mode with NVRAM backing
+(so that volume/mute changes are remembered across shutdown/reboot).
+
+The driver will operate in volume_mode=3 by default. If that does not
+work well on your ThinkPad model, please report this to
+ibm-acpi-devel@lists.sourceforge.net.
+
+The driver supports the standard ALSA module parameters.  If the ALSA
+mixer is disabled, the driver will disable all volume functionality.
+
+
+Fan control and monitoring: fan speed, fan enable/disable
+---------------------------------------------------------
+
+procfs: /proc/acpi/ibm/fan
+
+sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1, pwm1_enable, fan2_input
+
+sysfs hwmon driver attributes: fan_watchdog
+
+NOTE NOTE NOTE:
+   fan control operations are disabled by default for
+   safety reasons.  To enable them, the module parameter "fan_control=1"
+   must be given to thinkpad-acpi.
+
+This feature attempts to show the current fan speed, control mode and
+other fan data that might be available.  The speed is read directly
+from the hardware registers of the embedded controller.  This is known
+to work on later R, T, X and Z series ThinkPads but may show a bogus
+value on other models.
+
+Some Lenovo ThinkPads support a secondary fan.  This fan cannot be
+controlled separately, it shares the main fan control.
+
+Fan levels
+^^^^^^^^^^
+
+Most ThinkPad fans work in "levels" at the firmware interface.  Level 0
+stops the fan.  The higher the level, the higher the fan speed, although
+adjacent levels often map to the same fan speed.  7 is the highest
+level, where the fan reaches the maximum recommended speed.
+
+Level "auto" means the EC changes the fan level according to some
+internal algorithm, usually based on readings from the thermal sensors.
+
+There is also a "full-speed" level, also known as "disengaged" level.
+In this level, the EC disables the speed-locked closed-loop fan control,
+and drives the fan as fast as it can go, which might exceed hardware
+limits, so use this level with caution.
+
+The fan usually ramps up or down slowly from one speed to another, and
+it is normal for the EC to take several seconds to react to fan
+commands.  The full-speed level may take up to two minutes to ramp up to
+maximum speed, and in some ThinkPads, the tachometer readings go stale
+while the EC is transitioning to the full-speed level.
+
+WARNING WARNING WARNING: do not leave the fan disabled unless you are
+monitoring all of the temperature sensor readings and you are ready to
+enable it if necessary to avoid overheating.
+
+An enabled fan in level "auto" may stop spinning if the EC decides the
+ThinkPad is cool enough and doesn't need the extra airflow.  This is
+normal, and the EC will spin the fan up if the various thermal readings
+rise too much.
+
+On the X40, this seems to depend on the CPU and HDD temperatures.
+Specifically, the fan is turned on when either the CPU temperature
+climbs to 56 degrees or the HDD temperature climbs to 46 degrees.  The
+fan is turned off when the CPU temperature drops to 49 degrees and the
+HDD temperature drops to 41 degrees.  These thresholds cannot
+currently be controlled.
+
+The ThinkPad's ACPI DSDT code will reprogram the fan on its own when
+certain conditions are met.  It will override any fan programming done
+through thinkpad-acpi.
+
+The thinkpad-acpi kernel driver can be programmed to revert the fan
+level to a safe setting if userspace does not issue one of the procfs
+fan commands: "enable", "disable", "level" or "watchdog", or if there
+are no writes to pwm1_enable (or to pwm1 *if and only if* pwm1_enable is
+set to 1, manual mode) within a configurable amount of time of up to
+120 seconds.  This functionality is called fan safety watchdog.
+
+Note that the watchdog timer stops after it enables the fan.  It will be
+rearmed again automatically (using the same interval) when one of the
+above mentioned fan commands is received.  The fan watchdog is,
+therefore, not suitable to protect against fan mode changes made through
+means other than the "enable", "disable", and "level" procfs fan
+commands, or the hwmon fan control sysfs interface.
+
+Procfs notes
+^^^^^^^^^^^^
+
+The fan may be enabled or disabled with the following commands::
+
+	echo enable  >/proc/acpi/ibm/fan
+	echo disable >/proc/acpi/ibm/fan
+
+Placing a fan on level 0 is the same as disabling it.  Enabling a fan
+will try to place it in a safe level if it is too slow or disabled.
+
+The fan level can be controlled with the command::
+
+	echo 'level <level>' > /proc/acpi/ibm/fan
+
+Where <level> is an integer from 0 to 7, or one of the words "auto" or
+"full-speed" (without the quotes).  Not all ThinkPads support the "auto"
+and "full-speed" levels.  The driver accepts "disengaged" as an alias for
+"full-speed", and reports it as "disengaged" for backwards
+compatibility.
+
+On the X31 and X40 (and ONLY on those models), the fan speed can be
+controlled to a certain degree.  Once the fan is running, it can be
+forced to run faster or slower with the following command::
+
+	echo 'speed <speed>' > /proc/acpi/ibm/fan
+
+The sustainable range of fan speeds on the X40 appears to be from about
+3700 to about 7350. Values outside this range either do not have any
+effect or the fan speed eventually settles somewhere in that range.  The
+fan cannot be stopped or started with this command.  This functionality
+is incomplete, and not available through the sysfs interface.
+
+To program the safety watchdog, use the "watchdog" command::
+
+	echo 'watchdog <interval in seconds>' > /proc/acpi/ibm/fan
+
+If you want to disable the watchdog, use 0 as the interval.
+
+Sysfs notes
+^^^^^^^^^^^
+
+The sysfs interface follows the hwmon subsystem guidelines for the most
+part, and the exception is the fan safety watchdog.
+
+Writes to any of the sysfs attributes may return the EINVAL error if
+that operation is not supported in a given ThinkPad or if the parameter
+is out-of-bounds, and EPERM if it is forbidden.  They may also return
+EINTR (interrupted system call), and EIO (I/O error while trying to talk
+to the firmware).
+
+Features not yet implemented by the driver return ENOSYS.
+
+hwmon device attribute pwm1_enable:
+	- 0: PWM offline (fan is set to full-speed mode)
+	- 1: Manual PWM control (use pwm1 to set fan level)
+	- 2: Hardware PWM control (EC "auto" mode)
+	- 3: reserved (Software PWM control, not implemented yet)
+
+	Modes 0 and 2 are not supported by all ThinkPads, and the
+	driver is not always able to detect this.  If it does know a
+	mode is unsupported, it will return -EINVAL.
+
+hwmon device attribute pwm1:
+	Fan level, scaled from the firmware values of 0-7 to the hwmon
+	scale of 0-255.  0 means fan stopped, 255 means highest normal
+	speed (level 7).
+
+	This attribute only commands the fan if pmw1_enable is set to 1
+	(manual PWM control).
+
+hwmon device attribute fan1_input:
+	Fan tachometer reading, in RPM.  May go stale on certain
+	ThinkPads while the EC transitions the PWM to offline mode,
+	which can take up to two minutes.  May return rubbish on older
+	ThinkPads.
+
+hwmon device attribute fan2_input:
+	Fan tachometer reading, in RPM, for the secondary fan.
+	Available only on some ThinkPads.  If the secondary fan is
+	not installed, will always read 0.
+
+hwmon driver attribute fan_watchdog:
+	Fan safety watchdog timer interval, in seconds.  Minimum is
+	1 second, maximum is 120 seconds.  0 disables the watchdog.
+
+To stop the fan: set pwm1 to zero, and pwm1_enable to 1.
+
+To start the fan in a safe mode: set pwm1_enable to 2.  If that fails
+with EINVAL, try to set pwm1_enable to 1 and pwm1 to at least 128 (255
+would be the safest choice, though).
+
+
+WAN
+---
+
+procfs: /proc/acpi/ibm/wan
+
+sysfs device attribute: wwan_enable (deprecated)
+
+sysfs rfkill class: switch "tpacpi_wwan_sw"
+
+This feature shows the presence and current state of the built-in
+Wireless WAN device.
+
+If the ThinkPad supports it, the WWAN state is stored in NVRAM,
+so it is kept across reboots and power-off.
+
+It was tested on a Lenovo ThinkPad X60. It should probably work on other
+ThinkPad models which come with this module installed.
+
+Procfs notes
+^^^^^^^^^^^^
+
+If the W-WAN card is installed, the following commands can be used::
+
+	echo enable > /proc/acpi/ibm/wan
+	echo disable > /proc/acpi/ibm/wan
+
+Sysfs notes
+^^^^^^^^^^^
+
+	If the W-WAN card is installed, it can be enabled /
+	disabled through the "wwan_enable" thinkpad-acpi device
+	attribute, and its current status can also be queried.
+
+	enable:
+		- 0: disables WWAN card / WWAN card is disabled
+		- 1: enables WWAN card / WWAN card is enabled.
+
+	Note: this interface has been superseded by the	generic rfkill
+	class.  It has been deprecated, and it will be removed in year
+	2010.
+
+	rfkill controller switch "tpacpi_wwan_sw": refer to
+	Documentation/driver-api/rfkill.rst for details.
+
+
+EXPERIMENTAL: UWB
+-----------------
+
+This feature is considered EXPERIMENTAL because it has not been extensively
+tested and validated in various ThinkPad models yet.  The feature may not
+work as expected. USE WITH CAUTION! To use this feature, you need to supply
+the experimental=1 parameter when loading the module.
+
+sysfs rfkill class: switch "tpacpi_uwb_sw"
+
+This feature exports an rfkill controller for the UWB device, if one is
+present and enabled in the BIOS.
+
+Sysfs notes
+^^^^^^^^^^^
+
+	rfkill controller switch "tpacpi_uwb_sw": refer to
+	Documentation/driver-api/rfkill.rst for details.
+
+Adaptive keyboard
+-----------------
+
+sysfs device attribute: adaptive_kbd_mode
+
+This sysfs attribute controls the keyboard "face" that will be shown on the
+Lenovo X1 Carbon 2nd gen (2014)'s adaptive keyboard. The value can be read
+and set.
+
+- 1 = Home mode
+- 2 = Web-browser mode
+- 3 = Web-conference mode
+- 4 = Function mode
+- 5 = Layflat mode
+
+For more details about which buttons will appear depending on the mode, please
+review the laptop's user guide:
+http://www.lenovo.com/shop/americas/content/user_guides/x1carbon_2_ug_en.pdf
+
+Multiple Commands, Module Parameters
+------------------------------------
+
+Multiple commands can be written to the proc files in one shot by
+separating them with commas, for example::
+
+	echo enable,0xffff > /proc/acpi/ibm/hotkey
+	echo lcd_disable,crt_enable > /proc/acpi/ibm/video
+
+Commands can also be specified when loading the thinkpad-acpi module,
+for example::
+
+	modprobe thinkpad_acpi hotkey=enable,0xffff video=auto_disable
+
+
+Enabling debugging output
+-------------------------
+
+The module takes a debug parameter which can be used to selectively
+enable various classes of debugging output, for example::
+
+	 modprobe thinkpad_acpi debug=0xffff
+
+will enable all debugging output classes.  It takes a bitmask, so
+to enable more than one output class, just add their values.
+
+	=============		======================================
+	Debug bitmask		Description
+	=============		======================================
+	0x8000			Disclose PID of userspace programs
+				accessing some functions of the driver
+	0x0001			Initialization and probing
+	0x0002			Removal
+	0x0004			RF Transmitter control (RFKILL)
+				(bluetooth, WWAN, UWB...)
+	0x0008			HKEY event interface, hotkeys
+	0x0010			Fan control
+	0x0020			Backlight brightness
+	0x0040			Audio mixer/volume control
+	=============		======================================
+
+There is also a kernel build option to enable more debugging
+information, which may be necessary to debug driver problems.
+
+The level of debugging information output by the driver can be changed
+at runtime through sysfs, using the driver attribute debug_level.  The
+attribute takes the same bitmask as the debug module parameter above.
+
+
+Force loading of module
+-----------------------
+
+If thinkpad-acpi refuses to detect your ThinkPad, you can try to specify
+the module parameter force_load=1.  Regardless of whether this works or
+not, please contact ibm-acpi-devel@lists.sourceforge.net with a report.
+
+
+Sysfs interface changelog
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+=========	===============================================================
+0x000100:	Initial sysfs support, as a single platform driver and
+		device.
+0x000200:	Hot key support for 32 hot keys, and radio slider switch
+		support.
+0x010000:	Hot keys are now handled by default over the input
+		layer, the radio switch generates input event EV_RADIO,
+		and the driver enables hot key handling by default in
+		the firmware.
+
+0x020000:	ABI fix: added a separate hwmon platform device and
+		driver, which must be located by name (thinkpad)
+		and the hwmon class for libsensors4 (lm-sensors 3)
+		compatibility.  Moved all hwmon attributes to this
+		new platform device.
+
+0x020100:	Marker for thinkpad-acpi with hot key NVRAM polling
+		support.  If you must, use it to know you should not
+		start a userspace NVRAM poller (allows to detect when
+		NVRAM is compiled out by the user because it is
+		unneeded/undesired in the first place).
+0x020101:	Marker for thinkpad-acpi with hot key NVRAM polling
+		and proper hotkey_mask semantics (version 8 of the
+		NVRAM polling patch).  Some development snapshots of
+		0.18 had an earlier version that did strange things
+		to hotkey_mask.
+
+0x020200:	Add poll()/select() support to the following attributes:
+		hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason
+
+0x020300:	hotkey enable/disable support removed, attributes
+		hotkey_bios_enabled and hotkey_enable deprecated and
+		marked for removal.
+
+0x020400:	Marker for 16 LEDs support.  Also, LEDs that are known
+		to not exist in a given model are not registered with
+		the LED sysfs class anymore.
+
+0x020500:	Updated hotkey driver, hotkey_mask is always available
+		and it is always able to disable hot keys.  Very old
+		thinkpads are properly supported.  hotkey_bios_mask
+		is deprecated and marked for removal.
+
+0x020600:	Marker for backlight change event support.
+
+0x020700:	Support for mute-only mixers.
+		Volume control in read-only mode by default.
+		Marker for ALSA mixer support.
+
+0x030000:	Thermal and fan sysfs attributes were moved to the hwmon
+		device instead of being attached to the backing platform
+		device.
+=========	===============================================================
diff --git a/Documentation/admin-guide/laptops/toshiba_haps.rst b/Documentation/admin-guide/laptops/toshiba_haps.rst
new file mode 100644
index 000000000000..d28b6c3f2849
--- /dev/null
+++ b/Documentation/admin-guide/laptops/toshiba_haps.rst
@@ -0,0 +1,87 @@
+====================================
+Toshiba HDD Active Protection Sensor
+====================================
+
+Kernel driver: toshiba_haps
+
+Author: Azael Avalos <coproscefalo@gmail.com>
+
+
+.. 0. Contents
+
+   1. Description
+   2. Interface
+   3. Accelerometer axes
+   4. Supported devices
+   5. Usage
+
+
+1. Description
+--------------
+
+This driver provides support for the accelerometer found in various Toshiba
+laptops, being called "Toshiba HDD Protection - Shock Sensor" officially,
+and detects laptops automatically with this device.
+On Windows, Toshiba provided software monitors this device and provides
+automatic HDD protection (head unload) on sudden moves or harsh vibrations,
+however, this driver only provides a notification via a sysfs file to let
+userspace tools or daemons act accordingly, as well as providing a sysfs
+file to set the desired protection level or sensor sensibility.
+
+
+2. Interface
+------------
+
+This device comes with 3 methods:
+
+====	=====================================================================
+_STA    Checks existence of the device, returning Zero if the device does not
+	exists or is not supported.
+PTLV    Sets the desired protection level.
+RSSS    Shuts down the HDD protection interface for a few seconds,
+	then restores normal operation.
+====	=====================================================================
+
+Note:
+  The presence of Solid State Drives (SSD) can make this driver to fail loading,
+  given the fact that such drives have no movable parts, and thus, not requiring
+  any "protection" as well as failing during the evaluation of the _STA method
+  found under this device.
+
+
+3. Accelerometer axes
+---------------------
+
+This device does not report any axes, however, to query the sensor position
+a couple HCI (Hardware Configuration Interface) calls (0x6D and 0xA6) are
+provided to query such information, handled by the kernel module toshiba_acpi
+since kernel version 3.15.
+
+
+4. Supported devices
+--------------------
+
+This driver binds itself to the ACPI device TOS620A, and any Toshiba laptop
+with this device is supported, given the fact that they have the presence of
+conventional HDD and not only SSD, or a combination of both HDD and SSD.
+
+
+5. Usage
+--------
+
+The sysfs files under /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS620A:00/ are:
+
+================   ============================================================
+protection_level   The protection_level is readable and writeable, and
+		   provides a way to let userspace query the current protection
+		   level, as well as set the desired protection level, the
+		   available protection levels are::
+
+		     ============   =======   ==========   ========
+		     0 - Disabled   1 - Low   2 - Medium   3 - High
+		     ============   =======   ==========   ========
+
+reset_protection   The reset_protection entry is writeable only, being "1"
+		   the only parameter it accepts, it is used to trigger
+		   a reset of the protection interface.
+================   ============================================================
diff --git a/Documentation/admin-guide/lcd-panel-cgram.rst b/Documentation/admin-guide/lcd-panel-cgram.rst
new file mode 100644
index 000000000000..a3eb00c62f53
--- /dev/null
+++ b/Documentation/admin-guide/lcd-panel-cgram.rst
@@ -0,0 +1,27 @@
+======================================
+Parallel port LCD/Keypad Panel support
+======================================
+
+Some LCDs allow you to define up to 8 characters, mapped to ASCII
+characters 0 to 7. The escape code to define a new character is
+'\e[LG' followed by one digit from 0 to 7, representing the character
+number, and up to 8 couples of hex digits terminated by a semi-colon
+(';'). Each couple of digits represents a line, with 1-bits for each
+illuminated pixel with LSB on the right. Lines are numbered from the
+top of the character to the bottom. On a 5x7 matrix, only the 5 lower
+bits of the 7 first bytes are used for each character. If the string
+is incomplete, only complete lines will be redefined. Here are some
+examples::
+
+  printf "\e[LG0010101050D1F0C04;"  => 0 = [enter]
+  printf "\e[LG1040E1F0000000000;"  => 1 = [up]
+  printf "\e[LG2000000001F0E0400;"  => 2 = [down]
+  printf "\e[LG3040E1F001F0E0400;"  => 3 = [up-down]
+  printf "\e[LG40002060E1E0E0602;"  => 4 = [left]
+  printf "\e[LG500080C0E0F0E0C08;"  => 5 = [right]
+  printf "\e[LG60016051516141400;"  => 6 = "IP"
+
+  printf "\e[LG00103071F1F070301;"  => big speaker
+  printf "\e[LG00002061E1E060200;"  => small speaker
+
+Willy
diff --git a/Documentation/ldm.txt b/Documentation/admin-guide/ldm.rst
index 12c571368e73..12c571368e73 100644
--- a/Documentation/ldm.txt
+++ b/Documentation/admin-guide/ldm.rst
diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/admin-guide/lockup-watchdogs.rst
index 290840c160af..290840c160af 100644
--- a/Documentation/lockup-watchdogs.txt
+++ b/Documentation/admin-guide/lockup-watchdogs.rst
diff --git a/Documentation/admin-guide/mm/cma_debugfs.rst b/Documentation/admin-guide/mm/cma_debugfs.rst
new file mode 100644
index 000000000000..4e06ffabd78a
--- /dev/null
+++ b/Documentation/admin-guide/mm/cma_debugfs.rst
@@ -0,0 +1,25 @@
+=====================
+CMA Debugfs Interface
+=====================
+
+The CMA debugfs interface is useful to retrieve basic information out of the
+different CMA areas and to test allocation/release in each of the areas.
+
+Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
+kernel's CMA index. So the first CMA zone would be:
+
+	<debugfs>/cma/cma-0
+
+The structure of the files created under that directory is as follows:
+
+ - [RO] base_pfn: The base PFN (Page Frame Number) of the zone.
+ - [RO] count: Amount of memory in the CMA area.
+ - [RO] order_per_bit: Order of pages represented by one bit.
+ - [RO] bitmap: The bitmap of page states in the zone.
+ - [WO] alloc: Allocate N pages from that CMA area. For example::
+
+	echo 5 > <debugfs>/cma/cma-2/alloc
+
+would try to allocate 5 pages from the cma-2 area.
+
+ - [WO] free: Free N pages from that CMA area, similar to the above.
diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
index 8edb35f11317..11db46448354 100644
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -11,7 +11,7 @@ processes address space and many other cool things.
 Linux memory management is a complex system with many configurable
 settings. Most of these settings are available via ``/proc``
 filesystem and can be quired and adjusted using ``sysctl``. These APIs
-are described in Documentation/sysctl/vm.txt and in `man 5 proc`_.
+are described in Documentation/admin-guide/sysctl/vm.rst and in `man 5 proc`_.
 
 .. _man 5 proc: http://man7.org/linux/man-pages/man5/proc.5.html
 
@@ -26,11 +26,13 @@ the Linux memory management.
    :maxdepth: 1
 
    concepts
+   cma_debugfs
    hugetlbpage
    idle_page_tracking
    ksm
    memory-hotplug
    numa_memory_policy
+   numaperf
    pagemap
    soft-dirty
    transhuge
diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst
index 9303786632d1..874eb0c77d34 100644
--- a/Documentation/admin-guide/mm/ksm.rst
+++ b/Documentation/admin-guide/mm/ksm.rst
@@ -59,7 +59,7 @@ MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
 
 If a region of memory must be split into at least one new MADV_MERGEABLE
 or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process
-will exceed ``vm.max_map_count`` (see Documentation/sysctl/vm.txt).
+will exceed ``vm.max_map_count`` (see Documentation/admin-guide/sysctl/vm.rst).
 
 Like other madvise calls, they are intended for use on mapped areas of
 the user address space: they will report ENOMEM if the specified range
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index d78c5b315f72..8463f5538fda 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -15,7 +15,7 @@ document attempts to describe the concepts and APIs of the 2.6 memory policy
 support.
 
 Memory policies should not be confused with cpusets
-(``Documentation/cgroup-v1/cpusets.txt``)
+(``Documentation/admin-guide/cgroup-v1/cpusets.rst``)
 which is an administrative mechanism for restricting the nodes from which
 memory may be allocated by a set of processes. Memory policies are a
 programming interface that a NUMA-aware application can take advantage of.  When
diff --git a/Documentation/admin-guide/mm/numaperf.rst b/Documentation/admin-guide/mm/numaperf.rst
index b79f70c04397..a80c3c37226e 100644
--- a/Documentation/admin-guide/mm/numaperf.rst
+++ b/Documentation/admin-guide/mm/numaperf.rst
@@ -15,7 +15,7 @@ characteristics.  Some memory may share the same node as a CPU, and others
 are provided as memory only nodes. While memory only nodes do not provide
 CPUs, they may still be local to one or more compute nodes relative to
 other nodes. The following diagram shows one such example of two compute
-nodes with local memory and a memory only node for each of compute node:
+nodes with local memory and a memory only node for each of compute node::
 
  +------------------+     +------------------+
  | Compute Node 0   +-----+ Compute Node 1   |
@@ -165,5 +165,6 @@ write-through caching.
 ========
 See Also
 ========
-.. [1] https://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf
-       Section 5.2.27
+
+[1] https://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf
+- Section 5.2.27
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 7ab93a8404b9..bd5714547cee 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -53,7 +53,7 @@ disabled, there is ``khugepaged`` daemon that scans memory and
 collapses sequences of basic pages into huge pages.
 
 The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
-interface and using madivse(2) and prctl(2) system calls.
+interface and using madvise(2) and prctl(2) system calls.
 
 Transparent Hugepage Support maximizes the usefulness of free memory
 if compared to the reservation approach of hugetlbfs by allowing all
diff --git a/Documentation/admin-guide/namespaces/compatibility-list.rst b/Documentation/admin-guide/namespaces/compatibility-list.rst
new file mode 100644
index 000000000000..318800b2a943
--- /dev/null
+++ b/Documentation/admin-guide/namespaces/compatibility-list.rst
@@ -0,0 +1,43 @@
+=============================
+Namespaces compatibility list
+=============================
+
+This document contains the information about the problems user
+may have when creating tasks living in different namespaces.
+
+Here's the summary. This matrix shows the known problems, that
+occur when tasks share some namespace (the columns) while living
+in different other namespaces (the rows):
+
+====	===	===	===	===	====	===
+-	UTS	IPC	VFS	PID	User	Net
+====	===	===	===	===	====	===
+UTS	 X
+IPC		 X	 1
+VFS			 X
+PID		 1	 1	 X
+User		 2	 2		 X
+Net						 X
+====	===	===	===	===	====	===
+
+1. Both the IPC and the PID namespaces provide IDs to address
+   object inside the kernel. E.g. semaphore with IPCID or
+   process group with pid.
+
+   In both cases, tasks shouldn't try exposing this ID to some
+   other task living in a different namespace via a shared filesystem
+   or IPC shmem/message. The fact is that this ID is only valid
+   within the namespace it was obtained in and may refer to some
+   other object in another namespace.
+
+2. Intentionally, two equal user IDs in different user namespaces
+   should not be equal from the VFS point of view. In other
+   words, user 10 in one user namespace shouldn't have the same
+   access permissions to files, belonging to user 10 in another
+   namespace.
+
+   The same is true for the IPC namespaces being shared - two users
+   from different user namespaces should not access the same IPC objects
+   even having equal UIDs.
+
+   But currently this is not so.
diff --git a/Documentation/admin-guide/namespaces/index.rst b/Documentation/admin-guide/namespaces/index.rst
new file mode 100644
index 000000000000..384f2e0f33d2
--- /dev/null
+++ b/Documentation/admin-guide/namespaces/index.rst
@@ -0,0 +1,11 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+Namespaces
+==========
+
+.. toctree::
+   :maxdepth: 1
+
+   compatibility-list
+   resource-control
diff --git a/Documentation/admin-guide/namespaces/resource-control.rst b/Documentation/admin-guide/namespaces/resource-control.rst
new file mode 100644
index 000000000000..369556e00f0c
--- /dev/null
+++ b/Documentation/admin-guide/namespaces/resource-control.rst
@@ -0,0 +1,18 @@
+===========================
+Namespaces research control
+===========================
+
+There are a lot of kinds of objects in the kernel that don't have
+individual limits or that have limits that are ineffective when a set
+of processes is allowed to switch user ids.  With user namespaces
+enabled in a kernel for people who don't trust their users or their
+users programs to play nice this problems becomes more acute.
+
+Therefore it is recommended that memory control groups be enabled in
+kernels that enable user namespaces, and it is further recommended
+that userspace configure memory control groups to limit how much
+memory user's they don't trust to play nice can use.
+
+Memory control groups can be configured by installing the libcgroup
+package present on most distros editing /etc/cgrules.conf,
+/etc/cgconfig.conf and setting up libpam-cgroup.
diff --git a/Documentation/numastat.txt b/Documentation/admin-guide/numastat.rst
index aaf1667489f8..aaf1667489f8 100644
--- a/Documentation/numastat.txt
+++ b/Documentation/admin-guide/numastat.rst
diff --git a/Documentation/admin-guide/perf/arm-ccn.rst b/Documentation/admin-guide/perf/arm-ccn.rst
new file mode 100644
index 000000000000..832b0c64023a
--- /dev/null
+++ b/Documentation/admin-guide/perf/arm-ccn.rst
@@ -0,0 +1,61 @@
+==========================
+ARM Cache Coherent Network
+==========================
+
+CCN-504 is a ring-bus interconnect consisting of 11 crosspoints
+(XPs), with each crosspoint supporting up to two device ports,
+so nodes (devices) 0 and 1 are connected to crosspoint 0,
+nodes 2 and 3 to crosspoint 1 etc.
+
+PMU (perf) driver
+-----------------
+
+The CCN driver registers a perf PMU driver, which provides
+description of available events and configuration options
+in sysfs, see /sys/bus/event_source/devices/ccn*.
+
+The "format" directory describes format of the config, config1
+and config2 fields of the perf_event_attr structure. The "events"
+directory provides configuration templates for all documented
+events, that can be used with perf tool. For example "xp_valid_flit"
+is an equivalent of "type=0x8,event=0x4". Other parameters must be
+explicitly specified.
+
+For events originating from device, "node" defines its index.
+
+Crosspoint PMU events require "xp" (index), "bus" (bus number)
+and "vc" (virtual channel ID).
+
+Crosspoint watchpoint-based events (special "event" value 0xfe)
+require "xp" and "vc" as as above plus "port" (device port index),
+"dir" (transmit/receive direction), comparator values ("cmp_l"
+and "cmp_h") and "mask", being index of the comparator mask.
+
+Masks are defined separately from the event description
+(due to limited number of the config values) in the "cmp_mask"
+directory, with first 8 configurable by user and additional
+4 hardcoded for the most frequent use cases.
+
+Cycle counter is described by a "type" value 0xff and does
+not require any other settings.
+
+The driver also provides a "cpumask" sysfs attribute, which contains
+a single CPU ID, of the processor which will be used to handle all
+the CCN PMU events. It is recommended that the user space tools
+request the events on this processor (if not, the perf_event->cpu value
+will be overwritten anyway). In case of this processor being offlined,
+the events are migrated to another one and the attribute is updated.
+
+Example of perf tool use::
+
+  / # perf list | grep ccn
+    ccn/cycles/                                        [Kernel PMU event]
+  <...>
+    ccn/xp_valid_flit,xp=?,port=?,vc=?,dir=?/          [Kernel PMU event]
+  <...>
+
+  / # perf stat -a -e ccn/cycles/,ccn/xp_valid_flit,xp=1,port=0,vc=1,dir=1/ \
+                                                                         sleep 1
+
+The driver does not support sampling, therefore "perf record" will
+not work. Per-task (without "-a") perf sessions are not supported.
diff --git a/Documentation/admin-guide/perf/arm_dsu_pmu.rst b/Documentation/admin-guide/perf/arm_dsu_pmu.rst
new file mode 100644
index 000000000000..7fd34db75d13
--- /dev/null
+++ b/Documentation/admin-guide/perf/arm_dsu_pmu.rst
@@ -0,0 +1,29 @@
+==================================
+ARM DynamIQ Shared Unit (DSU) PMU
+==================================
+
+ARM DynamIQ Shared Unit integrates one or more cores with an L3 memory system,
+control logic and external interfaces to form a multicore cluster. The PMU
+allows counting the various events related to the L3 cache, Snoop Control Unit
+etc, using 32bit independent counters. It also provides a 64bit cycle counter.
+
+The PMU can only be accessed via CPU system registers and are common to the
+cores connected to the same DSU. Like most of the other uncore PMUs, DSU
+PMU doesn't support process specific events and cannot be used in sampling mode.
+
+The DSU provides a bitmap for a subset of implemented events via hardware
+registers. There is no way for the driver to determine if the other events
+are available or not. Hence the driver exposes only those events advertised
+by the DSU, in "events" directory under::
+
+  /sys/bus/event_sources/devices/arm_dsu_<N>/
+
+The user should refer to the TRM of the product to figure out the supported events
+and use the raw event code for the unlisted events.
+
+The driver also exposes the CPUs connected to the DSU instance in "associated_cpus".
+
+
+e.g usage::
+
+	perf stat -a -e arm_dsu_0/cycles/
diff --git a/Documentation/admin-guide/perf/hisi-pmu.rst b/Documentation/admin-guide/perf/hisi-pmu.rst
new file mode 100644
index 000000000000..404a5c3d9d00
--- /dev/null
+++ b/Documentation/admin-guide/perf/hisi-pmu.rst
@@ -0,0 +1,60 @@
+======================================================
+HiSilicon SoC uncore Performance Monitoring Unit (PMU)
+======================================================
+
+The HiSilicon SoC chip includes various independent system device PMUs
+such as L3 cache (L3C), Hydra Home Agent (HHA) and DDRC. These PMUs are
+independent and have hardware logic to gather statistics and performance
+information.
+
+The HiSilicon SoC encapsulates multiple CPU and IO dies. Each CPU cluster
+(CCL) is made up of 4 cpu cores sharing one L3 cache; each CPU die is
+called Super CPU cluster (SCCL) and is made up of 6 CCLs. Each SCCL has
+two HHAs (0 - 1) and four DDRCs (0 - 3), respectively.
+
+HiSilicon SoC uncore PMU driver
+-------------------------------
+
+Each device PMU has separate registers for event counting, control and
+interrupt, and the PMU driver shall register perf PMU drivers like L3C,
+HHA and DDRC etc. The available events and configuration options shall
+be described in the sysfs, see:
+
+/sys/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>/, or
+/sys/bus/event_source/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>.
+The "perf list" command shall list the available events from sysfs.
+
+Each L3C, HHA and DDRC is registered as a separate PMU with perf. The PMU
+name will appear in event listing as hisi_sccl<sccl-id>_module<index-id>.
+where "sccl-id" is the identifier of the SCCL and "index-id" is the index of
+module.
+
+e.g. hisi_sccl3_l3c0/rd_hit_cpipe is READ_HIT_CPIPE event of L3C index #0 in
+SCCL ID #3.
+
+e.g. hisi_sccl1_hha0/rx_operations is RX_OPERATIONS event of HHA index #0 in
+SCCL ID #1.
+
+The driver also provides a "cpumask" sysfs attribute, which shows the CPU core
+ID used to count the uncore PMU event.
+
+Example usage of perf::
+
+  $# perf list
+  hisi_sccl3_l3c0/rd_hit_cpipe/ [kernel PMU event]
+  ------------------------------------------
+  hisi_sccl3_l3c0/wr_hit_cpipe/ [kernel PMU event]
+  ------------------------------------------
+  hisi_sccl1_l3c0/rd_hit_cpipe/ [kernel PMU event]
+  ------------------------------------------
+  hisi_sccl1_l3c0/wr_hit_cpipe/ [kernel PMU event]
+  ------------------------------------------
+
+  $# perf stat -a -e hisi_sccl3_l3c0/rd_hit_cpipe/ sleep 5
+  $# perf stat -a -e hisi_sccl3_l3c0/config=0x02/ sleep 5
+
+The current driver does not support sampling. So "perf record" is unsupported.
+Also attach to a task is unsupported as the events are all uncore.
+
+Note: Please contact the maintainer for a complete list of events supported for
+the PMU devices in the SoC and its information if needed.
diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst
new file mode 100644
index 000000000000..ee4bfd2a740f
--- /dev/null
+++ b/Documentation/admin-guide/perf/index.rst
@@ -0,0 +1,16 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================
+Performance monitor support
+===========================
+
+.. toctree::
+   :maxdepth: 1
+
+   hisi-pmu
+   qcom_l2_pmu
+   qcom_l3_pmu
+   arm-ccn
+   xgene-pmu
+   arm_dsu_pmu
+   thunderx2-pmu
diff --git a/Documentation/admin-guide/perf/qcom_l2_pmu.rst b/Documentation/admin-guide/perf/qcom_l2_pmu.rst
new file mode 100644
index 000000000000..c130178a4a55
--- /dev/null
+++ b/Documentation/admin-guide/perf/qcom_l2_pmu.rst
@@ -0,0 +1,39 @@
+=====================================================================
+Qualcomm Technologies Level-2 Cache Performance Monitoring Unit (PMU)
+=====================================================================
+
+This driver supports the L2 cache clusters found in Qualcomm Technologies
+Centriq SoCs. There are multiple physical L2 cache clusters, each with their
+own PMU. Each cluster has one or more CPUs associated with it.
+
+There is one logical L2 PMU exposed, which aggregates the results from
+the physical PMUs.
+
+The driver provides a description of its available events and configuration
+options in sysfs, see /sys/devices/l2cache_0.
+
+The "format" directory describes the format of the events.
+
+Events can be envisioned as a 2-dimensional array. Each column represents
+a group of events. There are 8 groups. Only one entry from each
+group can be in use at a time. If multiple events from the same group
+are specified, the conflicting events cannot be counted at the same time.
+
+Events are specified as 0xCCG, where CC is 2 hex digits specifying
+the code (array row) and G specifies the group (column) 0-7.
+
+In addition there is a cycle counter event specified by the value 0xFE
+which is outside the above scheme.
+
+The driver provides a "cpumask" sysfs attribute which contains a mask
+consisting of one CPU per cluster which will be used to handle all the PMU
+events on that cluster.
+
+Examples for use with perf::
+
+  perf stat -e l2cache_0/config=0x001/,l2cache_0/config=0x042/ -a sleep 1
+
+  perf stat -e l2cache_0/config=0xfe/ -C 2 sleep 1
+
+The driver does not support sampling, therefore "perf record" will
+not work. Per-task perf sessions are not supported.
diff --git a/Documentation/admin-guide/perf/qcom_l3_pmu.rst b/Documentation/admin-guide/perf/qcom_l3_pmu.rst
new file mode 100644
index 000000000000..a3d014a46bfd
--- /dev/null
+++ b/Documentation/admin-guide/perf/qcom_l3_pmu.rst
@@ -0,0 +1,26 @@
+===========================================================================
+Qualcomm Datacenter Technologies L3 Cache Performance Monitoring Unit (PMU)
+===========================================================================
+
+This driver supports the L3 cache PMUs found in Qualcomm Datacenter Technologies
+Centriq SoCs. The L3 cache on these SOCs is composed of multiple slices, shared
+by all cores within a socket. Each slice is exposed as a separate uncore perf
+PMU with device name l3cache_<socket>_<instance>. User space is responsible
+for aggregating across slices.
+
+The driver provides a description of its available events and configuration
+options in sysfs, see /sys/devices/l3cache*. Given that these are uncore PMUs
+the driver also exposes a "cpumask" sysfs attribute which contains a mask
+consisting of one CPU per socket which will be used to handle all the PMU
+events on that socket.
+
+The hardware implements 32bit event counters and has a flat 8bit event space
+exposed via the "event" format attribute. In addition to the 32bit physical
+counters the driver supports virtual 64bit hardware counters by using hardware
+counter chaining. This feature is exposed via the "lc" (long counter) format
+flag. E.g.::
+
+  perf stat -e l3cache_0_0/read-miss,lc/
+
+Given that these are uncore PMUs the driver does not support sampling, therefore
+"perf record" will not work. Per-task perf sessions are not supported.
diff --git a/Documentation/admin-guide/perf/thunderx2-pmu.rst b/Documentation/admin-guide/perf/thunderx2-pmu.rst
new file mode 100644
index 000000000000..08e33675853a
--- /dev/null
+++ b/Documentation/admin-guide/perf/thunderx2-pmu.rst
@@ -0,0 +1,42 @@
+=============================================================
+Cavium ThunderX2 SoC Performance Monitoring Unit (PMU UNCORE)
+=============================================================
+
+The ThunderX2 SoC PMU consists of independent, system-wide, per-socket
+PMUs such as the Level 3 Cache (L3C) and DDR4 Memory Controller (DMC).
+
+The DMC has 8 interleaved channels and the L3C has 16 interleaved tiles.
+Events are counted for the default channel (i.e. channel 0) and prorated
+to the total number of channels/tiles.
+
+The DMC and L3C support up to 4 counters. Counters are independently
+programmable and can be started and stopped individually. Each counter
+can be set to a different event. Counters are 32-bit and do not support
+an overflow interrupt; they are read every 2 seconds.
+
+PMU UNCORE (perf) driver:
+
+The thunderx2_pmu driver registers per-socket perf PMUs for the DMC and
+L3C devices.  Each PMU can be used to count up to 4 events
+simultaneously. The PMUs provide a description of their available events
+and configuration options under sysfs, see
+/sys/devices/uncore_<l3c_S/dmc_S/>; S is the socket id.
+
+The driver does not support sampling, therefore "perf record" will not
+work. Per-task perf sessions are also not supported.
+
+Examples::
+
+  # perf stat -a -e uncore_dmc_0/cnt_cycles/ sleep 1
+
+  # perf stat -a -e \
+  uncore_dmc_0/cnt_cycles/,\
+  uncore_dmc_0/data_transfers/,\
+  uncore_dmc_0/read_txns/,\
+  uncore_dmc_0/write_txns/ sleep 1
+
+  # perf stat -a -e \
+  uncore_l3c_0/read_request/,\
+  uncore_l3c_0/read_hit/,\
+  uncore_l3c_0/inv_request/,\
+  uncore_l3c_0/inv_hit/ sleep 1
diff --git a/Documentation/admin-guide/perf/xgene-pmu.rst b/Documentation/admin-guide/perf/xgene-pmu.rst
new file mode 100644
index 000000000000..644f8ed89152
--- /dev/null
+++ b/Documentation/admin-guide/perf/xgene-pmu.rst
@@ -0,0 +1,49 @@
+================================================
+APM X-Gene SoC Performance Monitoring Unit (PMU)
+================================================
+
+X-Gene SoC PMU consists of various independent system device PMUs such as
+L3 cache(s), I/O bridge(s), memory controller bridge(s) and memory
+controller(s). These PMU devices are loosely architected to follow the
+same model as the PMU for ARM cores. The PMUs share the same top level
+interrupt and status CSR region.
+
+PMU (perf) driver
+-----------------
+
+The xgene-pmu driver registers several perf PMU drivers. Each of the perf
+driver provides description of its available events and configuration options
+in sysfs, see /sys/devices/<l3cX/iobX/mcbX/mcX>/.
+
+The "format" directory describes format of the config (event ID),
+config1 (agent ID) fields of the perf_event_attr structure. The "events"
+directory provides configuration templates for all supported event types that
+can be used with perf tool. For example, "l3c0/bank-fifo-full/" is an
+equivalent of "l3c0/config=0x0b/".
+
+Most of the SoC PMU has a specific list of agent ID used for monitoring
+performance of a specific datapath. For example, agents of a L3 cache can be
+a specific CPU or an I/O bridge. Each PMU has a set of 2 registers capable of
+masking the agents from which the request come from. If the bit with
+the bit number corresponding to the agent is set, the event is counted only if
+it is caused by a request from that agent. Each agent ID bit is inversely mapped
+to a corresponding bit in "config1" field. By default, the event will be
+counted for all agent requests (config1 = 0x0). For all the supported agents of
+each PMU, please refer to APM X-Gene User Manual.
+
+Each perf driver also provides a "cpumask" sysfs attribute, which contains a
+single CPU ID of the processor which will be used to handle all the PMU events.
+
+Example for perf tool use::
+
+ / # perf list | grep -e l3c -e iob -e mcb -e mc
+   l3c0/ackq-full/                                    [Kernel PMU event]
+ <...>
+   mcb1/mcb-csw-stall/                                [Kernel PMU event]
+
+ / # perf stat -a -e l3c0/read-miss/,mcb1/csw-write-request/ sleep 1
+
+ / # perf stat -a -e l3c0/read-miss,config1=0xfffffffffffffffe/ sleep 1
+
+The driver does not support sampling, therefore "perf record" will
+not work. Per-task (without "-a") perf sessions are not supported.
diff --git a/Documentation/pnp.txt b/Documentation/admin-guide/pnp.rst
index bab2d10631f0..bab2d10631f0 100644
--- a/Documentation/pnp.txt
+++ b/Documentation/admin-guide/pnp.rst
diff --git a/Documentation/driver-api/rapidio.rst b/Documentation/admin-guide/rapidio.rst
index 71ff658ab78e..71ff658ab78e 100644
--- a/Documentation/driver-api/rapidio.rst
+++ b/Documentation/admin-guide/rapidio.rst
diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst
index c7495e42e6f4..2b20f5f7380d 100644
--- a/Documentation/admin-guide/ras.rst
+++ b/Documentation/admin-guide/ras.rst
@@ -199,7 +199,7 @@ Architecture (MCA)\ [#f3]_.
   mode).
 
 .. [#f3] For more details about the Machine Check Architecture (MCA),
-  please read Documentation/x86/x86_64/machinecheck at the Kernel tree.
+  please read Documentation/x86/x86_64/machinecheck.rst at the Kernel tree.
 
 EDAC - Error Detection And Correction
 *************************************
diff --git a/Documentation/rtc.txt b/Documentation/admin-guide/rtc.rst
index 688c95b11919..688c95b11919 100644
--- a/Documentation/rtc.txt
+++ b/Documentation/admin-guide/rtc.rst
diff --git a/Documentation/svga.txt b/Documentation/admin-guide/svga.rst
index b6c2f9acca92..b6c2f9acca92 100644
--- a/Documentation/svga.txt
+++ b/Documentation/admin-guide/svga.rst
diff --git a/Documentation/admin-guide/sysctl/abi.rst b/Documentation/admin-guide/sysctl/abi.rst
new file mode 100644
index 000000000000..599bcde7f0b7
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/abi.rst
@@ -0,0 +1,67 @@
+================================
+Documentation for /proc/sys/abi/
+================================
+
+kernel version 2.6.0.test2
+
+Copyright (c) 2003,  Fabian Frederick <ffrederick@users.sourceforge.net>
+
+For general info: index.rst.
+
+------------------------------------------------------------------------------
+
+This path is binary emulation relevant aka personality types aka abi.
+When a process is executed, it's linked to an exec_domain whose
+personality is defined using values available from /proc/sys/abi.
+You can find further details about abi in include/linux/personality.h.
+
+Here are the files featuring in 2.6 kernel:
+
+- defhandler_coff
+- defhandler_elf
+- defhandler_lcall7
+- defhandler_libcso
+- fake_utsname
+- trace
+
+defhandler_coff
+---------------
+
+defined value:
+	PER_SCOSVR3::
+
+		0x0003 | STICKY_TIMEOUTS | WHOLE_SECONDS | SHORT_INODE
+
+defhandler_elf
+--------------
+
+defined value:
+	PER_LINUX::
+
+		0
+
+defhandler_lcall7
+-----------------
+
+defined value :
+	PER_SVR4::
+
+		0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO,
+
+defhandler_libsco
+-----------------
+
+defined value:
+	PER_SVR4::
+
+		0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO,
+
+fake_utsname
+------------
+
+Unused
+
+trace
+-----
+
+Unused
diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst
new file mode 100644
index 000000000000..2a45119e3331
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -0,0 +1,384 @@
+===============================
+Documentation for /proc/sys/fs/
+===============================
+
+kernel version 2.2.10
+
+Copyright (c) 1998, 1999,  Rik van Riel <riel@nl.linux.org>
+
+Copyright (c) 2009,        Shen Feng<shen@cn.fujitsu.com>
+
+For general info and legal blurb, please look in intro.rst.
+
+------------------------------------------------------------------------------
+
+This file contains documentation for the sysctl files in
+/proc/sys/fs/ and is valid for Linux kernel version 2.2.
+
+The files in this directory can be used to tune and monitor
+miscellaneous and general things in the operation of the Linux
+kernel. Since some of the files _can_ be used to screw up your
+system, it is advisable to read both documentation and source
+before actually making adjustments.
+
+1. /proc/sys/fs
+===============
+
+Currently, these files are in /proc/sys/fs:
+
+- aio-max-nr
+- aio-nr
+- dentry-state
+- dquot-max
+- dquot-nr
+- file-max
+- file-nr
+- inode-max
+- inode-nr
+- inode-state
+- nr_open
+- overflowuid
+- overflowgid
+- pipe-user-pages-hard
+- pipe-user-pages-soft
+- protected_fifos
+- protected_hardlinks
+- protected_regular
+- protected_symlinks
+- suid_dumpable
+- super-max
+- super-nr
+
+
+aio-nr & aio-max-nr
+-------------------
+
+aio-nr is the running total of the number of events specified on the
+io_setup system call for all currently active aio contexts.  If aio-nr
+reaches aio-max-nr then io_setup will fail with EAGAIN.  Note that
+raising aio-max-nr does not result in the pre-allocation or re-sizing
+of any kernel data structures.
+
+
+dentry-state
+------------
+
+From linux/include/linux/dcache.h::
+
+  struct dentry_stat_t dentry_stat {
+        int nr_dentry;
+        int nr_unused;
+        int age_limit;         /* age in seconds */
+        int want_pages;        /* pages requested by system */
+        int nr_negative;       /* # of unused negative dentries */
+        int dummy;             /* Reserved for future use */
+  };
+
+Dentries are dynamically allocated and deallocated.
+
+nr_dentry shows the total number of dentries allocated (active
++ unused). nr_unused shows the number of dentries that are not
+actively used, but are saved in the LRU list for future reuse.
+
+Age_limit is the age in seconds after which dcache entries
+can be reclaimed when memory is short and want_pages is
+nonzero when shrink_dcache_pages() has been called and the
+dcache isn't pruned yet.
+
+nr_negative shows the number of unused dentries that are also
+negative dentries which do not map to any files. Instead,
+they help speeding up rejection of non-existing files provided
+by the users.
+
+
+dquot-max & dquot-nr
+--------------------
+
+The file dquot-max shows the maximum number of cached disk
+quota entries.
+
+The file dquot-nr shows the number of allocated disk quota
+entries and the number of free disk quota entries.
+
+If the number of free cached disk quotas is very low and
+you have some awesome number of simultaneous system users,
+you might want to raise the limit.
+
+
+file-max & file-nr
+------------------
+
+The value in file-max denotes the maximum number of file-
+handles that the Linux kernel will allocate. When you get lots
+of error messages about running out of file handles, you might
+want to increase this limit.
+
+Historically,the kernel was able to allocate file handles
+dynamically, but not to free them again. The three values in
+file-nr denote the number of allocated file handles, the number
+of allocated but unused file handles, and the maximum number of
+file handles. Linux 2.6 always reports 0 as the number of free
+file handles -- this is not an error, it just means that the
+number of allocated file handles exactly matches the number of
+used file handles.
+
+Attempts to allocate more file descriptors than file-max are
+reported with printk, look for "VFS: file-max limit <number>
+reached".
+
+
+nr_open
+-------
+
+This denotes the maximum number of file-handles a process can
+allocate. Default value is 1024*1024 (1048576) which should be
+enough for most machines. Actual limit depends on RLIMIT_NOFILE
+resource limit.
+
+
+inode-max, inode-nr & inode-state
+---------------------------------
+
+As with file handles, the kernel allocates the inode structures
+dynamically, but can't free them yet.
+
+The value in inode-max denotes the maximum number of inode
+handlers. This value should be 3-4 times larger than the value
+in file-max, since stdin, stdout and network sockets also
+need an inode struct to handle them. When you regularly run
+out of inodes, you need to increase this value.
+
+The file inode-nr contains the first two items from
+inode-state, so we'll skip to that file...
+
+Inode-state contains three actual numbers and four dummies.
+The actual numbers are, in order of appearance, nr_inodes,
+nr_free_inodes and preshrink.
+
+Nr_inodes stands for the number of inodes the system has
+allocated, this can be slightly more than inode-max because
+Linux allocates them one pageful at a time.
+
+Nr_free_inodes represents the number of free inodes (?) and
+preshrink is nonzero when the nr_inodes > inode-max and the
+system needs to prune the inode list instead of allocating
+more.
+
+
+overflowgid & overflowuid
+-------------------------
+
+Some filesystems only support 16-bit UIDs and GIDs, although in Linux
+UIDs and GIDs are 32 bits. When one of these filesystems is mounted
+with writes enabled, any UID or GID that would exceed 65535 is translated
+to a fixed value before being written to disk.
+
+These sysctls allow you to change the value of the fixed UID and GID.
+The default is 65534.
+
+
+pipe-user-pages-hard
+--------------------
+
+Maximum total number of pages a non-privileged user may allocate for pipes.
+Once this limit is reached, no new pipes may be allocated until usage goes
+below the limit again. When set to 0, no limit is applied, which is the default
+setting.
+
+
+pipe-user-pages-soft
+--------------------
+
+Maximum total number of pages a non-privileged user may allocate for pipes
+before the pipe size gets limited to a single page. Once this limit is reached,
+new pipes will be limited to a single page in size for this user in order to
+limit total memory usage, and trying to increase them using fcntl() will be
+denied until usage goes below the limit again. The default value allows to
+allocate up to 1024 pipes at their default size. When set to 0, no limit is
+applied.
+
+
+protected_fifos
+---------------
+
+The intent of this protection is to avoid unintentional writes to
+an attacker-controlled FIFO, where a program expected to create a regular
+file.
+
+When set to "0", writing to FIFOs is unrestricted.
+
+When set to "1" don't allow O_CREAT open on FIFOs that we don't own
+in world writable sticky directories, unless they are owned by the
+owner of the directory.
+
+When set to "2" it also applies to group writable sticky directories.
+
+This protection is based on the restrictions in Openwall.
+
+
+protected_hardlinks
+--------------------
+
+A long-standing class of security issues is the hardlink-based
+time-of-check-time-of-use race, most commonly seen in world-writable
+directories like /tmp. The common method of exploitation of this flaw
+is to cross privilege boundaries when following a given hardlink (i.e. a
+root process follows a hardlink created by another user). Additionally,
+on systems without separated partitions, this stops unauthorized users
+from "pinning" vulnerable setuid/setgid files against being upgraded by
+the administrator, or linking to special files.
+
+When set to "0", hardlink creation behavior is unrestricted.
+
+When set to "1" hardlinks cannot be created by users if they do not
+already own the source file, or do not have read/write access to it.
+
+This protection is based on the restrictions in Openwall and grsecurity.
+
+
+protected_regular
+-----------------
+
+This protection is similar to protected_fifos, but it
+avoids writes to an attacker-controlled regular file, where a program
+expected to create one.
+
+When set to "0", writing to regular files is unrestricted.
+
+When set to "1" don't allow O_CREAT open on regular files that we
+don't own in world writable sticky directories, unless they are
+owned by the owner of the directory.
+
+When set to "2" it also applies to group writable sticky directories.
+
+
+protected_symlinks
+------------------
+
+A long-standing class of security issues is the symlink-based
+time-of-check-time-of-use race, most commonly seen in world-writable
+directories like /tmp. The common method of exploitation of this flaw
+is to cross privilege boundaries when following a given symlink (i.e. a
+root process follows a symlink belonging to another user). For a likely
+incomplete list of hundreds of examples across the years, please see:
+http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp
+
+When set to "0", symlink following behavior is unrestricted.
+
+When set to "1" symlinks are permitted to be followed only when outside
+a sticky world-writable directory, or when the uid of the symlink and
+follower match, or when the directory owner matches the symlink's owner.
+
+This protection is based on the restrictions in Openwall and grsecurity.
+
+
+suid_dumpable:
+--------------
+
+This value can be used to query and set the core dump mode for setuid
+or otherwise protected/tainted binaries. The modes are
+
+=   ==========  ===============================================================
+0   (default)	traditional behaviour. Any process which has changed
+		privilege levels or is execute only will not be dumped.
+1   (debug)	all processes dump core when possible. The core dump is
+		owned by the current user and no security is applied. This is
+		intended for system debugging situations only.
+		Ptrace is unchecked.
+		This is insecure as it allows regular users to examine the
+		memory contents of privileged processes.
+2   (suidsafe)	any binary which normally would not be dumped is dumped
+		anyway, but only if the "core_pattern" kernel sysctl is set to
+		either a pipe handler or a fully qualified path. (For more
+		details on this limitation, see CVE-2006-2451.) This mode is
+		appropriate when administrators are attempting to debug
+		problems in a normal environment, and either have a core dump
+		pipe handler that knows to treat privileged core dumps with
+		care, or specific directory defined for catching core dumps.
+		If a core dump happens without a pipe handler or fully
+		qualified path, a message will be emitted to syslog warning
+		about the lack of a correct setting.
+=   ==========  ===============================================================
+
+
+super-max & super-nr
+--------------------
+
+These numbers control the maximum number of superblocks, and
+thus the maximum number of mounted filesystems the kernel
+can have. You only need to increase super-max if you need to
+mount more filesystems than the current value in super-max
+allows you to.
+
+
+aio-nr & aio-max-nr
+-------------------
+
+aio-nr shows the current system-wide number of asynchronous io
+requests.  aio-max-nr allows you to change the maximum value
+aio-nr can grow to.
+
+
+mount-max
+---------
+
+This denotes the maximum number of mounts that may exist
+in a mount namespace.
+
+
+
+2. /proc/sys/fs/binfmt_misc
+===========================
+
+Documentation for the files in /proc/sys/fs/binfmt_misc is
+in Documentation/admin-guide/binfmt-misc.rst.
+
+
+3. /proc/sys/fs/mqueue - POSIX message queues filesystem
+========================================================
+
+
+The "mqueue"  filesystem provides  the necessary kernel features to enable the
+creation of a  user space  library that  implements  the  POSIX message queues
+API (as noted by the  MSG tag in the  POSIX 1003.1-2001 version  of the System
+Interfaces specification.)
+
+The "mqueue" filesystem contains values for determining/setting  the amount of
+resources used by the file system.
+
+/proc/sys/fs/mqueue/queues_max is a read/write  file for  setting/getting  the
+maximum number of message queues allowed on the system.
+
+/proc/sys/fs/mqueue/msg_max  is  a  read/write file  for  setting/getting  the
+maximum number of messages in a queue value.  In fact it is the limiting value
+for another (user) limit which is set in mq_open invocation. This attribute of
+a queue must be less or equal then msg_max.
+
+/proc/sys/fs/mqueue/msgsize_max is  a read/write  file for setting/getting the
+maximum  message size value (it is every  message queue's attribute set during
+its creation).
+
+/proc/sys/fs/mqueue/msg_default is  a read/write  file for setting/getting the
+default number of messages in a queue value if attr parameter of mq_open(2) is
+NULL. If it exceed msg_max, the default value is initialized msg_max.
+
+/proc/sys/fs/mqueue/msgsize_default is a read/write file for setting/getting
+the default message size value if attr parameter of mq_open(2) is NULL. If it
+exceed msgsize_max, the default value is initialized msgsize_max.
+
+4. /proc/sys/fs/epoll - Configuration options for the epoll interface
+=====================================================================
+
+This directory contains configuration options for the epoll(7) interface.
+
+max_user_watches
+----------------
+
+Every epoll file descriptor can store a number of files to be monitored
+for event readiness. Each one of these monitored files constitutes a "watch".
+This configuration option sets the maximum number of "watches" that are
+allowed for each user.
+Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
+on a 64bit one.
+The current default value for  max_user_watches  is the 1/32 of the available
+low memory, divided for the "watch" cost in bytes.
diff --git a/Documentation/admin-guide/sysctl/index.rst b/Documentation/admin-guide/sysctl/index.rst
new file mode 100644
index 000000000000..03346f98c7b9
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/index.rst
@@ -0,0 +1,98 @@
+===========================
+Documentation for /proc/sys
+===========================
+
+Copyright (c) 1998, 1999,  Rik van Riel <riel@nl.linux.org>
+
+------------------------------------------------------------------------------
+
+'Why', I hear you ask, 'would anyone even _want_ documentation
+for them sysctl files? If anybody really needs it, it's all in
+the source...'
+
+Well, this documentation is written because some people either
+don't know they need to tweak something, or because they don't
+have the time or knowledge to read the source code.
+
+Furthermore, the programmers who built sysctl have built it to
+be actually used, not just for the fun of programming it :-)
+
+------------------------------------------------------------------------------
+
+Legal blurb:
+
+As usual, there are two main things to consider:
+
+1. you get what you pay for
+2. it's free
+
+The consequences are that I won't guarantee the correctness of
+this document, and if you come to me complaining about how you
+screwed up your system because of wrong documentation, I won't
+feel sorry for you. I might even laugh at you...
+
+But of course, if you _do_ manage to screw up your system using
+only the sysctl options used in this file, I'd like to hear of
+it. Not only to have a great laugh, but also to make sure that
+you're the last RTFMing person to screw up.
+
+In short, e-mail your suggestions, corrections and / or horror
+stories to: <riel@nl.linux.org>
+
+Rik van Riel.
+
+--------------------------------------------------------------
+
+Introduction
+============
+
+Sysctl is a means of configuring certain aspects of the kernel
+at run-time, and the /proc/sys/ directory is there so that you
+don't even need special tools to do it!
+In fact, there are only four things needed to use these config
+facilities:
+
+- a running Linux system
+- root access
+- common sense (this is especially hard to come by these days)
+- knowledge of what all those values mean
+
+As a quick 'ls /proc/sys' will show, the directory consists of
+several (arch-dependent?) subdirs. Each subdir is mainly about
+one part of the kernel, so you can do configuration on a piece
+by piece basis, or just some 'thematic frobbing'.
+
+This documentation is about:
+
+=============== ===============================================================
+abi/		execution domains & personalities
+debug/		<empty>
+dev/		device specific information (eg dev/cdrom/info)
+fs/		specific filesystems
+		filehandle, inode, dentry and quota tuning
+		binfmt_misc <Documentation/admin-guide/binfmt-misc.rst>
+kernel/		global kernel info / tuning
+		miscellaneous stuff
+net/		networking stuff, for documentation look in:
+		<Documentation/networking/>
+proc/		<empty>
+sunrpc/		SUN Remote Procedure Call (NFS)
+vm/		memory management tuning
+		buffer and cache management
+user/		Per user per user namespace limits
+=============== ===============================================================
+
+These are the subdirs I have on my system. There might be more
+or other subdirs in another setup. If you see another dir, I'd
+really like to hear about it :-)
+
+.. toctree::
+   :maxdepth: 1
+
+   abi
+   fs
+   kernel
+   net
+   sunrpc
+   user
+   vm
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
new file mode 100644
index 000000000000..032c7cd3cede
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -0,0 +1,1177 @@
+===================================
+Documentation for /proc/sys/kernel/
+===================================
+
+kernel version 2.2.10
+
+Copyright (c) 1998, 1999,  Rik van Riel <riel@nl.linux.org>
+
+Copyright (c) 2009,        Shen Feng<shen@cn.fujitsu.com>
+
+For general info and legal blurb, please look in index.rst.
+
+------------------------------------------------------------------------------
+
+This file contains documentation for the sysctl files in
+/proc/sys/kernel/ and is valid for Linux kernel version 2.2.
+
+The files in this directory can be used to tune and monitor
+miscellaneous and general things in the operation of the Linux
+kernel. Since some of the files _can_ be used to screw up your
+system, it is advisable to read both documentation and source
+before actually making adjustments.
+
+Currently, these files might (depending on your configuration)
+show up in /proc/sys/kernel:
+
+- acct
+- acpi_video_flags
+- auto_msgmni
+- bootloader_type	     [ X86 only ]
+- bootloader_version	     [ X86 only ]
+- cap_last_cap
+- core_pattern
+- core_pipe_limit
+- core_uses_pid
+- ctrl-alt-del
+- dmesg_restrict
+- domainname
+- hostname
+- hotplug
+- hardlockup_all_cpu_backtrace
+- hardlockup_panic
+- hung_task_panic
+- hung_task_check_count
+- hung_task_timeout_secs
+- hung_task_check_interval_secs
+- hung_task_warnings
+- hyperv_record_panic_msg
+- kexec_load_disabled
+- kptr_restrict
+- l2cr                        [ PPC only ]
+- modprobe                    ==> Documentation/debugging-modules.txt
+- modules_disabled
+- msg_next_id		      [ sysv ipc ]
+- msgmax
+- msgmnb
+- msgmni
+- nmi_watchdog
+- osrelease
+- ostype
+- overflowgid
+- overflowuid
+- panic
+- panic_on_oops
+- panic_on_stackoverflow
+- panic_on_unrecovered_nmi
+- panic_on_warn
+- panic_print
+- panic_on_rcu_stall
+- perf_cpu_time_max_percent
+- perf_event_paranoid
+- perf_event_max_stack
+- perf_event_mlock_kb
+- perf_event_max_contexts_per_stack
+- pid_max
+- powersave-nap               [ PPC only ]
+- printk
+- printk_delay
+- printk_ratelimit
+- printk_ratelimit_burst
+- pty                         ==> Documentation/filesystems/devpts.txt
+- randomize_va_space
+- real-root-dev               ==> Documentation/admin-guide/initrd.rst
+- reboot-cmd                  [ SPARC only ]
+- rtsig-max
+- rtsig-nr
+- sched_energy_aware
+- seccomp/                    ==> Documentation/userspace-api/seccomp_filter.rst
+- sem
+- sem_next_id		      [ sysv ipc ]
+- sg-big-buff                 [ generic SCSI device (sg) ]
+- shm_next_id		      [ sysv ipc ]
+- shm_rmid_forced
+- shmall
+- shmmax                      [ sysv ipc ]
+- shmmni
+- softlockup_all_cpu_backtrace
+- soft_watchdog
+- stack_erasing
+- stop-a                      [ SPARC only ]
+- sysrq                       ==> Documentation/admin-guide/sysrq.rst
+- sysctl_writes_strict
+- tainted                     ==> Documentation/admin-guide/tainted-kernels.rst
+- threads-max
+- unknown_nmi_panic
+- watchdog
+- watchdog_thresh
+- version
+
+
+acct:
+=====
+
+highwater lowwater frequency
+
+If BSD-style process accounting is enabled these values control
+its behaviour. If free space on filesystem where the log lives
+goes below <lowwater>% accounting suspends. If free space gets
+above <highwater>% accounting resumes. <Frequency> determines
+how often do we check the amount of free space (value is in
+seconds). Default:
+4 2 30
+That is, suspend accounting if there left <= 2% free; resume it
+if we got >=4%; consider information about amount of free space
+valid for 30 seconds.
+
+
+acpi_video_flags:
+=================
+
+flags
+
+See Doc*/kernel/power/video.txt, it allows mode of video boot to be
+set during run time.
+
+
+auto_msgmni:
+============
+
+This variable has no effect and may be removed in future kernel
+releases. Reading it always returns 0.
+Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni
+upon memory add/remove or upon ipc namespace creation/removal.
+Echoing "1" into this file enabled msgmni automatic recomputing.
+Echoing "0" turned it off. auto_msgmni default value was 1.
+
+
+bootloader_type:
+================
+
+x86 bootloader identification
+
+This gives the bootloader type number as indicated by the bootloader,
+shifted left by 4, and OR'd with the low four bits of the bootloader
+version.  The reason for this encoding is that this used to match the
+type_of_loader field in the kernel header; the encoding is kept for
+backwards compatibility.  That is, if the full bootloader type number
+is 0x15 and the full version number is 0x234, this file will contain
+the value 340 = 0x154.
+
+See the type_of_loader and ext_loader_type fields in
+Documentation/x86/boot.rst for additional information.
+
+
+bootloader_version:
+===================
+
+x86 bootloader version
+
+The complete bootloader version number.  In the example above, this
+file will contain the value 564 = 0x234.
+
+See the type_of_loader and ext_loader_ver fields in
+Documentation/x86/boot.rst for additional information.
+
+
+cap_last_cap:
+=============
+
+Highest valid capability of the running kernel.  Exports
+CAP_LAST_CAP from the kernel.
+
+
+core_pattern:
+=============
+
+core_pattern is used to specify a core dumpfile pattern name.
+
+* max length 127 characters; default value is "core"
+* core_pattern is used as a pattern template for the output filename;
+  certain string patterns (beginning with '%') are substituted with
+  their actual values.
+* backward compatibility with core_uses_pid:
+
+	If core_pattern does not include "%p" (default does not)
+	and core_uses_pid is set, then .PID will be appended to
+	the filename.
+
+* corename format specifiers::
+
+	%<NUL>	'%' is dropped
+	%%	output one '%'
+	%p	pid
+	%P	global pid (init PID namespace)
+	%i	tid
+	%I	global tid (init PID namespace)
+	%u	uid (in initial user namespace)
+	%g	gid (in initial user namespace)
+	%d	dump mode, matches PR_SET_DUMPABLE and
+		/proc/sys/fs/suid_dumpable
+	%s	signal number
+	%t	UNIX time of dump
+	%h	hostname
+	%e	executable filename (may be shortened)
+	%E	executable path
+	%<OTHER> both are dropped
+
+* If the first character of the pattern is a '|', the kernel will treat
+  the rest of the pattern as a command to run.  The core dump will be
+  written to the standard input of that program instead of to a file.
+
+
+core_pipe_limit:
+================
+
+This sysctl is only applicable when core_pattern is configured to pipe
+core files to a user space helper (when the first character of
+core_pattern is a '|', see above).  When collecting cores via a pipe
+to an application, it is occasionally useful for the collecting
+application to gather data about the crashing process from its
+/proc/pid directory.  In order to do this safely, the kernel must wait
+for the collecting process to exit, so as not to remove the crashing
+processes proc files prematurely.  This in turn creates the
+possibility that a misbehaving userspace collecting process can block
+the reaping of a crashed process simply by never exiting.  This sysctl
+defends against that.  It defines how many concurrent crashing
+processes may be piped to user space applications in parallel.  If
+this value is exceeded, then those crashing processes above that value
+are noted via the kernel log and their cores are skipped.  0 is a
+special value, indicating that unlimited processes may be captured in
+parallel, but that no waiting will take place (i.e. the collecting
+process is not guaranteed access to /proc/<crashing pid>/).  This
+value defaults to 0.
+
+
+core_uses_pid:
+==============
+
+The default coredump filename is "core".  By setting
+core_uses_pid to 1, the coredump filename becomes core.PID.
+If core_pattern does not include "%p" (default does not)
+and core_uses_pid is set, then .PID will be appended to
+the filename.
+
+
+ctrl-alt-del:
+=============
+
+When the value in this file is 0, ctrl-alt-del is trapped and
+sent to the init(1) program to handle a graceful restart.
+When, however, the value is > 0, Linux's reaction to a Vulcan
+Nerve Pinch (tm) will be an immediate reboot, without even
+syncing its dirty buffers.
+
+Note:
+  when a program (like dosemu) has the keyboard in 'raw'
+  mode, the ctrl-alt-del is intercepted by the program before it
+  ever reaches the kernel tty layer, and it's up to the program
+  to decide what to do with it.
+
+
+dmesg_restrict:
+===============
+
+This toggle indicates whether unprivileged users are prevented
+from using dmesg(8) to view messages from the kernel's log buffer.
+When dmesg_restrict is set to (0) there are no restrictions. When
+dmesg_restrict is set set to (1), users must have CAP_SYSLOG to use
+dmesg(8).
+
+The kernel config option CONFIG_SECURITY_DMESG_RESTRICT sets the
+default value of dmesg_restrict.
+
+
+domainname & hostname:
+======================
+
+These files can be used to set the NIS/YP domainname and the
+hostname of your box in exactly the same way as the commands
+domainname and hostname, i.e.::
+
+	# echo "darkstar" > /proc/sys/kernel/hostname
+	# echo "mydomain" > /proc/sys/kernel/domainname
+
+has the same effect as::
+
+	# hostname "darkstar"
+	# domainname "mydomain"
+
+Note, however, that the classic darkstar.frop.org has the
+hostname "darkstar" and DNS (Internet Domain Name Server)
+domainname "frop.org", not to be confused with the NIS (Network
+Information Service) or YP (Yellow Pages) domainname. These two
+domain names are in general different. For a detailed discussion
+see the hostname(1) man page.
+
+
+hardlockup_all_cpu_backtrace:
+=============================
+
+This value controls the hard lockup detector behavior when a hard
+lockup condition is detected as to whether or not to gather further
+debug information. If enabled, arch-specific all-CPU stack dumping
+will be initiated.
+
+0: do nothing. This is the default behavior.
+
+1: on detection capture more debug information.
+
+
+hardlockup_panic:
+=================
+
+This parameter can be used to control whether the kernel panics
+when a hard lockup is detected.
+
+   0 - don't panic on hard lockup
+   1 - panic on hard lockup
+
+See Documentation/admin-guide/lockup-watchdogs.rst for more information.  This can
+also be set using the nmi_watchdog kernel parameter.
+
+
+hotplug:
+========
+
+Path for the hotplug policy agent.
+Default value is "/sbin/hotplug".
+
+
+hung_task_panic:
+================
+
+Controls the kernel's behavior when a hung task is detected.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+0: continue operation. This is the default behavior.
+
+1: panic immediately.
+
+
+hung_task_check_count:
+======================
+
+The upper bound on the number of tasks that are checked.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+
+hung_task_timeout_secs:
+=======================
+
+When a task in D state did not get scheduled
+for more than this value report a warning.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+0: means infinite timeout - no checking done.
+
+Possible values to set are in range {0..LONG_MAX/HZ}.
+
+
+hung_task_check_interval_secs:
+==============================
+
+Hung task check interval. If hung task checking is enabled
+(see hung_task_timeout_secs), the check is done every
+hung_task_check_interval_secs seconds.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+0 (default): means use hung_task_timeout_secs as checking interval.
+Possible values to set are in range {0..LONG_MAX/HZ}.
+
+
+hung_task_warnings:
+===================
+
+The maximum number of warnings to report. During a check interval
+if a hung task is detected, this value is decreased by 1.
+When this value reaches 0, no more warnings will be reported.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+-1: report an infinite number of warnings.
+
+
+hyperv_record_panic_msg:
+========================
+
+Controls whether the panic kmsg data should be reported to Hyper-V.
+
+0: do not report panic kmsg data.
+
+1: report the panic kmsg data. This is the default behavior.
+
+
+kexec_load_disabled:
+====================
+
+A toggle indicating if the kexec_load syscall has been disabled. This
+value defaults to 0 (false: kexec_load enabled), but can be set to 1
+(true: kexec_load disabled). Once true, kexec can no longer be used, and
+the toggle cannot be set back to false. This allows a kexec image to be
+loaded before disabling the syscall, allowing a system to set up (and
+later use) an image without it being altered. Generally used together
+with the "modules_disabled" sysctl.
+
+
+kptr_restrict:
+==============
+
+This toggle indicates whether restrictions are placed on
+exposing kernel addresses via /proc and other interfaces.
+
+When kptr_restrict is set to 0 (the default) the address is hashed before
+printing. (This is the equivalent to %p.)
+
+When kptr_restrict is set to (1), kernel pointers printed using the %pK
+format specifier will be replaced with 0's unless the user has CAP_SYSLOG
+and effective user and group ids are equal to the real ids. This is
+because %pK checks are done at read() time rather than open() time, so
+if permissions are elevated between the open() and the read() (e.g via
+a setuid binary) then %pK will not leak kernel pointers to unprivileged
+users. Note, this is a temporary solution only. The correct long-term
+solution is to do the permission checks at open() time. Consider removing
+world read permissions from files that use %pK, and using dmesg_restrict
+to protect against uses of %pK in dmesg(8) if leaking kernel pointer
+values to unprivileged users is a concern.
+
+When kptr_restrict is set to (2), kernel pointers printed using
+%pK will be replaced with 0's regardless of privileges.
+
+
+l2cr: (PPC only)
+================
+
+This flag controls the L2 cache of G3 processor boards. If
+0, the cache is disabled. Enabled if nonzero.
+
+
+modules_disabled:
+=================
+
+A toggle value indicating if modules are allowed to be loaded
+in an otherwise modular kernel.  This toggle defaults to off
+(0), but can be set true (1).  Once true, modules can be
+neither loaded nor unloaded, and the toggle cannot be set back
+to false.  Generally used with the "kexec_load_disabled" toggle.
+
+
+msg_next_id, sem_next_id, and shm_next_id:
+==========================================
+
+These three toggles allows to specify desired id for next allocated IPC
+object: message, semaphore or shared memory respectively.
+
+By default they are equal to -1, which means generic allocation logic.
+Possible values to set are in range {0..INT_MAX}.
+
+Notes:
+  1) kernel doesn't guarantee, that new object will have desired id. So,
+     it's up to userspace, how to handle an object with "wrong" id.
+  2) Toggle with non-default value will be set back to -1 by kernel after
+     successful IPC object allocation. If an IPC object allocation syscall
+     fails, it is undefined if the value remains unmodified or is reset to -1.
+
+
+nmi_watchdog:
+=============
+
+This parameter can be used to control the NMI watchdog
+(i.e. the hard lockup detector) on x86 systems.
+
+0 - disable the hard lockup detector
+
+1 - enable the hard lockup detector
+
+The hard lockup detector monitors each CPU for its ability to respond to
+timer interrupts. The mechanism utilizes CPU performance counter registers
+that are programmed to generate Non-Maskable Interrupts (NMIs) periodically
+while a CPU is busy. Hence, the alternative name 'NMI watchdog'.
+
+The NMI watchdog is disabled by default if the kernel is running as a guest
+in a KVM virtual machine. This default can be overridden by adding::
+
+   nmi_watchdog=1
+
+to the guest kernel command line (see Documentation/admin-guide/kernel-parameters.rst).
+
+
+numa_balancing:
+===============
+
+Enables/disables automatic page fault based NUMA memory
+balancing. Memory is moved automatically to nodes
+that access it often.
+
+Enables/disables automatic NUMA memory balancing. On NUMA machines, there
+is a performance penalty if remote memory is accessed by a CPU. When this
+feature is enabled the kernel samples what task thread is accessing memory
+by periodically unmapping pages and later trapping a page fault. At the
+time of the page fault, it is determined if the data being accessed should
+be migrated to a local memory node.
+
+The unmapping of pages and trapping faults incur additional overhead that
+ideally is offset by improved memory locality but there is no universal
+guarantee. If the target workload is already bound to NUMA nodes then this
+feature should be disabled. Otherwise, if the system overhead from the
+feature is too high then the rate the kernel samples for NUMA hinting
+faults may be controlled by the numa_balancing_scan_period_min_ms,
+numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
+numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
+
+numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb
+===============================================================================================================================
+
+
+Automatic NUMA balancing scans tasks address space and unmaps pages to
+detect if pages are properly placed or if the data should be migrated to a
+memory node local to where the task is running.  Every "scan delay" the task
+scans the next "scan size" number of pages in its address space. When the
+end of the address space is reached the scanner restarts from the beginning.
+
+In combination, the "scan delay" and "scan size" determine the scan rate.
+When "scan delay" decreases, the scan rate increases.  The scan delay and
+hence the scan rate of every task is adaptive and depends on historical
+behaviour. If pages are properly placed then the scan delay increases,
+otherwise the scan delay decreases.  The "scan size" is not adaptive but
+the higher the "scan size", the higher the scan rate.
+
+Higher scan rates incur higher system overhead as page faults must be
+trapped and potentially data must be migrated. However, the higher the scan
+rate, the more quickly a tasks memory is migrated to a local node if the
+workload pattern changes and minimises performance impact due to remote
+memory accesses. These sysctls control the thresholds for scan delays and
+the number of pages scanned.
+
+numa_balancing_scan_period_min_ms is the minimum time in milliseconds to
+scan a tasks virtual memory. It effectively controls the maximum scanning
+rate for each task.
+
+numa_balancing_scan_delay_ms is the starting "scan delay" used for a task
+when it initially forks.
+
+numa_balancing_scan_period_max_ms is the maximum time in milliseconds to
+scan a tasks virtual memory. It effectively controls the minimum scanning
+rate for each task.
+
+numa_balancing_scan_size_mb is how many megabytes worth of pages are
+scanned for a given scan.
+
+
+osrelease, ostype & version:
+============================
+
+::
+
+  # cat osrelease
+  2.1.88
+  # cat ostype
+  Linux
+  # cat version
+  #5 Wed Feb 25 21:49:24 MET 1998
+
+The files osrelease and ostype should be clear enough. Version
+needs a little more clarification however. The '#5' means that
+this is the fifth kernel built from this source base and the
+date behind it indicates the time the kernel was built.
+The only way to tune these values is to rebuild the kernel :-)
+
+
+overflowgid & overflowuid:
+==========================
+
+if your architecture did not always support 32-bit UIDs (i.e. arm,
+i386, m68k, sh, and sparc32), a fixed UID and GID will be returned to
+applications that use the old 16-bit UID/GID system calls, if the
+actual UID or GID would exceed 65535.
+
+These sysctls allow you to change the value of the fixed UID and GID.
+The default is 65534.
+
+
+panic:
+======
+
+The value in this file represents the number of seconds the kernel
+waits before rebooting on a panic. When you use the software watchdog,
+the recommended setting is 60.
+
+
+panic_on_io_nmi:
+================
+
+Controls the kernel's behavior when a CPU receives an NMI caused by
+an IO error.
+
+0: try to continue operation (default)
+
+1: panic immediately. The IO error triggered an NMI. This indicates a
+   serious system condition which could result in IO data corruption.
+   Rather than continuing, panicking might be a better choice. Some
+   servers issue this sort of NMI when the dump button is pushed,
+   and you can use this option to take a crash dump.
+
+
+panic_on_oops:
+==============
+
+Controls the kernel's behaviour when an oops or BUG is encountered.
+
+0: try to continue operation
+
+1: panic immediately.  If the `panic` sysctl is also non-zero then the
+   machine will be rebooted.
+
+
+panic_on_stackoverflow:
+=======================
+
+Controls the kernel's behavior when detecting the overflows of
+kernel, IRQ and exception stacks except a user stack.
+This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
+
+0: try to continue operation.
+
+1: panic immediately.
+
+
+panic_on_unrecovered_nmi:
+=========================
+
+The default Linux behaviour on an NMI of either memory or unknown is
+to continue operation. For many environments such as scientific
+computing it is preferable that the box is taken out and the error
+dealt with than an uncorrected parity/ECC error get propagated.
+
+A small number of systems do generate NMI's for bizarre random reasons
+such as power management so the default is off. That sysctl works like
+the existing panic controls already in that directory.
+
+
+panic_on_warn:
+==============
+
+Calls panic() in the WARN() path when set to 1.  This is useful to avoid
+a kernel rebuild when attempting to kdump at the location of a WARN().
+
+0: only WARN(), default behaviour.
+
+1: call panic() after printing out WARN() location.
+
+
+panic_print:
+============
+
+Bitmask for printing system info when panic happens. User can chose
+combination of the following bits:
+
+=====  ========================================
+bit 0  print all tasks info
+bit 1  print system memory info
+bit 2  print timer info
+bit 3  print locks info if CONFIG_LOCKDEP is on
+bit 4  print ftrace buffer
+=====  ========================================
+
+So for example to print tasks and memory info on panic, user can::
+
+  echo 3 > /proc/sys/kernel/panic_print
+
+
+panic_on_rcu_stall:
+===================
+
+When set to 1, calls panic() after RCU stall detection messages. This
+is useful to define the root cause of RCU stalls using a vmcore.
+
+0: do not panic() when RCU stall takes place, default behavior.
+
+1: panic() after printing RCU stall messages.
+
+
+perf_cpu_time_max_percent:
+==========================
+
+Hints to the kernel how much CPU time it should be allowed to
+use to handle perf sampling events.  If the perf subsystem
+is informed that its samples are exceeding this limit, it
+will drop its sampling frequency to attempt to reduce its CPU
+usage.
+
+Some perf sampling happens in NMIs.  If these samples
+unexpectedly take too long to execute, the NMIs can become
+stacked up next to each other so much that nothing else is
+allowed to execute.
+
+0:
+   disable the mechanism.  Do not monitor or correct perf's
+   sampling rate no matter how CPU time it takes.
+
+1-100:
+   attempt to throttle perf's sample rate to this
+   percentage of CPU.  Note: the kernel calculates an
+   "expected" length of each sample event.  100 here means
+   100% of that expected length.  Even if this is set to
+   100, you may still see sample throttling if this
+   length is exceeded.  Set to 0 if you truly do not care
+   how much CPU is consumed.
+
+
+perf_event_paranoid:
+====================
+
+Controls use of the performance events system by unprivileged
+users (without CAP_SYS_ADMIN).  The default value is 2.
+
+===  ==================================================================
+ -1  Allow use of (almost) all events by all users
+
+     Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
+
+>=0  Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN
+
+     Disallow raw tracepoint access by users without CAP_SYS_ADMIN
+
+>=1  Disallow CPU event access by users without CAP_SYS_ADMIN
+
+>=2  Disallow kernel profiling by users without CAP_SYS_ADMIN
+===  ==================================================================
+
+
+perf_event_max_stack:
+=====================
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+
+perf_event_mlock_kb:
+====================
+
+Control size of per-cpu ring buffer not counted agains mlock limit.
+
+The default value is 512 + 1 page
+
+
+perf_event_max_contexts_per_stack:
+==================================
+
+Controls maximum number of stack frame context entries for
+(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for
+instance, when using 'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 8.
+
+
+pid_max:
+========
+
+PID allocation wrap value.  When the kernel's next PID value
+reaches this value, it wraps back to a minimum PID value.
+PIDs of value pid_max or larger are not allocated.
+
+
+ns_last_pid:
+============
+
+The last pid allocated in the current (the one task using this sysctl
+lives in) pid namespace. When selecting a pid for a next task on fork
+kernel tries to allocate a number starting from this one.
+
+
+powersave-nap: (PPC only)
+=========================
+
+If set, Linux-PPC will use the 'nap' mode of powersaving,
+otherwise the 'doze' mode will be used.
+
+==============================================================
+
+printk:
+=======
+
+The four values in printk denote: console_loglevel,
+default_message_loglevel, minimum_console_loglevel and
+default_console_loglevel respectively.
+
+These values influence printk() behavior when printing or
+logging error messages. See 'man 2 syslog' for more info on
+the different loglevels.
+
+- console_loglevel:
+	messages with a higher priority than
+	this will be printed to the console
+- default_message_loglevel:
+	messages without an explicit priority
+	will be printed with this priority
+- minimum_console_loglevel:
+	minimum (highest) value to which
+	console_loglevel can be set
+- default_console_loglevel:
+	default value for console_loglevel
+
+
+printk_delay:
+=============
+
+Delay each printk message in printk_delay milliseconds
+
+Value from 0 - 10000 is allowed.
+
+
+printk_ratelimit:
+=================
+
+Some warning messages are rate limited. printk_ratelimit specifies
+the minimum length of time between these messages (in jiffies), by
+default we allow one every 5 seconds.
+
+A value of 0 will disable rate limiting.
+
+
+printk_ratelimit_burst:
+=======================
+
+While long term we enforce one message per printk_ratelimit
+seconds, we do allow a burst of messages to pass through.
+printk_ratelimit_burst specifies the number of messages we can
+send before ratelimiting kicks in.
+
+
+printk_devkmsg:
+===============
+
+Control the logging to /dev/kmsg from userspace:
+
+ratelimit:
+	default, ratelimited
+
+on: unlimited logging to /dev/kmsg from userspace
+
+off: logging to /dev/kmsg disabled
+
+The kernel command line parameter printk.devkmsg= overrides this and is
+a one-time setting until next reboot: once set, it cannot be changed by
+this sysctl interface anymore.
+
+
+randomize_va_space:
+===================
+
+This option can be used to select the type of process address
+space randomization that is used in the system, for architectures
+that support this feature.
+
+==  ===========================================================================
+0   Turn the process address space randomization off.  This is the
+    default for architectures that do not support this feature anyways,
+    and kernels that are booted with the "norandmaps" parameter.
+
+1   Make the addresses of mmap base, stack and VDSO page randomized.
+    This, among other things, implies that shared libraries will be
+    loaded to random addresses.  Also for PIE-linked binaries, the
+    location of code start is randomized.  This is the default if the
+    CONFIG_COMPAT_BRK option is enabled.
+
+2   Additionally enable heap randomization.  This is the default if
+    CONFIG_COMPAT_BRK is disabled.
+
+    There are a few legacy applications out there (such as some ancient
+    versions of libc.so.5 from 1996) that assume that brk area starts
+    just after the end of the code+bss.  These applications break when
+    start of the brk area is randomized.  There are however no known
+    non-legacy applications that would be broken this way, so for most
+    systems it is safe to choose full randomization.
+
+    Systems with ancient and/or broken binaries should be configured
+    with CONFIG_COMPAT_BRK enabled, which excludes the heap from process
+    address space randomization.
+==  ===========================================================================
+
+
+reboot-cmd: (Sparc only)
+========================
+
+??? This seems to be a way to give an argument to the Sparc
+ROM/Flash boot loader. Maybe to tell it what to do after
+rebooting. ???
+
+
+rtsig-max & rtsig-nr:
+=====================
+
+The file rtsig-max can be used to tune the maximum number
+of POSIX realtime (queued) signals that can be outstanding
+in the system.
+
+rtsig-nr shows the number of RT signals currently queued.
+
+
+sched_energy_aware:
+===================
+
+Enables/disables Energy Aware Scheduling (EAS). EAS starts
+automatically on platforms where it can run (that is,
+platforms with asymmetric CPU topologies and having an Energy
+Model available). If your platform happens to meet the
+requirements for EAS but you do not want to use it, change
+this value to 0.
+
+
+sched_schedstats:
+=================
+
+Enables/disables scheduler statistics. Enabling this feature
+incurs a small amount of overhead in the scheduler but is
+useful for debugging and performance tuning.
+
+
+sg-big-buff:
+============
+
+This file shows the size of the generic SCSI (sg) buffer.
+You can't tune it just yet, but you could change it on
+compile time by editing include/scsi/sg.h and changing
+the value of SG_BIG_BUFF.
+
+There shouldn't be any reason to change this value. If
+you can come up with one, you probably know what you
+are doing anyway :)
+
+
+shmall:
+=======
+
+This parameter sets the total amount of shared memory pages that
+can be used system wide. Hence, SHMALL should always be at least
+ceil(shmmax/PAGE_SIZE).
+
+If you are not sure what the default PAGE_SIZE is on your Linux
+system, you can run the following command:
+
+	# getconf PAGE_SIZE
+
+
+shmmax:
+=======
+
+This value can be used to query and set the run time limit
+on the maximum shared memory segment size that can be created.
+Shared memory segments up to 1Gb are now supported in the
+kernel.  This value defaults to SHMMAX.
+
+
+shm_rmid_forced:
+================
+
+Linux lets you set resource limits, including how much memory one
+process can consume, via setrlimit(2).  Unfortunately, shared memory
+segments are allowed to exist without association with any process, and
+thus might not be counted against any resource limits.  If enabled,
+shared memory segments are automatically destroyed when their attach
+count becomes zero after a detach or a process termination.  It will
+also destroy segments that were created, but never attached to, on exit
+from the process.  The only use left for IPC_RMID is to immediately
+destroy an unattached segment.  Of course, this breaks the way things are
+defined, so some applications might stop working.  Note that this
+feature will do you no good unless you also configure your resource
+limits (in particular, RLIMIT_AS and RLIMIT_NPROC).  Most systems don't
+need this.
+
+Note that if you change this from 0 to 1, already created segments
+without users and with a dead originative process will be destroyed.
+
+
+sysctl_writes_strict:
+=====================
+
+Control how file position affects the behavior of updating sysctl values
+via the /proc/sys interface:
+
+  ==   ======================================================================
+  -1   Legacy per-write sysctl value handling, with no printk warnings.
+       Each write syscall must fully contain the sysctl value to be
+       written, and multiple writes on the same sysctl file descriptor
+       will rewrite the sysctl value, regardless of file position.
+   0   Same behavior as above, but warn about processes that perform writes
+       to a sysctl file descriptor when the file position is not 0.
+   1   (default) Respect file position when writing sysctl strings. Multiple
+       writes will append to the sysctl value buffer. Anything past the max
+       length of the sysctl value buffer will be ignored. Writes to numeric
+       sysctl entries must always be at file position 0 and the value must
+       be fully contained in the buffer sent in the write syscall.
+  ==   ======================================================================
+
+
+softlockup_all_cpu_backtrace:
+=============================
+
+This value controls the soft lockup detector thread's behavior
+when a soft lockup condition is detected as to whether or not
+to gather further debug information. If enabled, each cpu will
+be issued an NMI and instructed to capture stack trace.
+
+This feature is only applicable for architectures which support
+NMI.
+
+0: do nothing. This is the default behavior.
+
+1: on detection capture more debug information.
+
+
+soft_watchdog:
+==============
+
+This parameter can be used to control the soft lockup detector.
+
+   0 - disable the soft lockup detector
+
+   1 - enable the soft lockup detector
+
+The soft lockup detector monitors CPUs for threads that are hogging the CPUs
+without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads
+from running. The mechanism depends on the CPUs ability to respond to timer
+interrupts which are needed for the 'watchdog/N' threads to be woken up by
+the watchdog timer function, otherwise the NMI watchdog - if enabled - can
+detect a hard lockup condition.
+
+
+stack_erasing:
+==============
+
+This parameter can be used to control kernel stack erasing at the end
+of syscalls for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK.
+
+That erasing reduces the information which kernel stack leak bugs
+can reveal and blocks some uninitialized stack variable attacks.
+The tradeoff is the performance impact: on a single CPU system kernel
+compilation sees a 1% slowdown, other systems and workloads may vary.
+
+  0: kernel stack erasing is disabled, STACKLEAK_METRICS are not updated.
+
+  1: kernel stack erasing is enabled (default), it is performed before
+     returning to the userspace at the end of syscalls.
+
+
+tainted
+=======
+
+Non-zero if the kernel has been tainted. Numeric values, which can be
+ORed together. The letters are seen in "Tainted" line of Oops reports.
+
+======  =====  ==============================================================
+     1  `(P)`  proprietary module was loaded
+     2  `(F)`  module was force loaded
+     4  `(S)`  SMP kernel oops on an officially SMP incapable processor
+     8  `(R)`  module was force unloaded
+    16  `(M)`  processor reported a Machine Check Exception (MCE)
+    32  `(B)`  bad page referenced or some unexpected page flags
+    64  `(U)`  taint requested by userspace application
+   128  `(D)`  kernel died recently, i.e. there was an OOPS or BUG
+   256  `(A)`  an ACPI table was overridden by user
+   512  `(W)`  kernel issued warning
+  1024  `(C)`  staging driver was loaded
+  2048  `(I)`  workaround for bug in platform firmware applied
+  4096  `(O)`  externally-built ("out-of-tree") module was loaded
+  8192  `(E)`  unsigned module was loaded
+ 16384  `(L)`  soft lockup occurred
+ 32768  `(K)`  kernel has been live patched
+ 65536  `(X)`  Auxiliary taint, defined and used by for distros
+131072  `(T)`  The kernel was built with the struct randomization plugin
+======  =====  ==============================================================
+
+See Documentation/admin-guide/tainted-kernels.rst for more information.
+
+
+threads-max:
+============
+
+This value controls the maximum number of threads that can be created
+using fork().
+
+During initialization the kernel sets this value such that even if the
+maximum number of threads is created, the thread structures occupy only
+a part (1/8th) of the available RAM pages.
+
+The minimum value that can be written to threads-max is 20.
+
+The maximum value that can be written to threads-max is given by the
+constant FUTEX_TID_MASK (0x3fffffff).
+
+If a value outside of this range is written to threads-max an error
+EINVAL occurs.
+
+The value written is checked against the available RAM pages. If the
+thread structures would occupy too much (more than 1/8th) of the
+available RAM pages threads-max is reduced accordingly.
+
+
+unknown_nmi_panic:
+==================
+
+The value in this file affects behavior of handling NMI. When the
+value is non-zero, unknown NMI is trapped and then panic occurs. At
+that time, kernel debugging information is displayed on console.
+
+NMI switch that most IA32 servers have fires unknown NMI up, for
+example.  If a system hangs up, try pressing the NMI switch.
+
+
+watchdog:
+=========
+
+This parameter can be used to disable or enable the soft lockup detector
+_and_ the NMI watchdog (i.e. the hard lockup detector) at the same time.
+
+   0 - disable both lockup detectors
+
+   1 - enable both lockup detectors
+
+The soft lockup detector and the NMI watchdog can also be disabled or
+enabled individually, using the soft_watchdog and nmi_watchdog parameters.
+If the watchdog parameter is read, for example by executing::
+
+   cat /proc/sys/kernel/watchdog
+
+the output of this command (0 or 1) shows the logical OR of soft_watchdog
+and nmi_watchdog.
+
+
+watchdog_cpumask:
+=================
+
+This value can be used to control on which cpus the watchdog may run.
+The default cpumask is all possible cores, but if NO_HZ_FULL is
+enabled in the kernel config, and cores are specified with the
+nohz_full= boot argument, those cores are excluded by default.
+Offline cores can be included in this mask, and if the core is later
+brought online, the watchdog will be started based on the mask value.
+
+Typically this value would only be touched in the nohz_full case
+to re-enable cores that by default were not running the watchdog,
+if a kernel lockup was suspected on those cores.
+
+The argument value is the standard cpulist format for cpumasks,
+so for example to enable the watchdog on cores 0, 2, 3, and 4 you
+might say::
+
+  echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask
+
+
+watchdog_thresh:
+================
+
+This value can be used to control the frequency of hrtimer and NMI
+events and the soft and hard lockup thresholds. The default threshold
+is 10 seconds.
+
+The softlockup threshold is (2 * watchdog_thresh). Setting this
+tunable to zero will disable lockup detection altogether.
diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
new file mode 100644
index 000000000000..a7d44e71019d
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -0,0 +1,461 @@
+================================
+Documentation for /proc/sys/net/
+================================
+
+Copyright
+
+Copyright (c) 1999
+
+	- Terrehon Bowden <terrehon@pacbell.net>
+	- Bodo Bauer <bb@ricochet.net>
+
+Copyright (c) 2000
+
+	- Jorge Nerin <comandante@zaralinux.com>
+
+Copyright (c) 2009
+
+	- Shen Feng <shen@cn.fujitsu.com>
+
+For general info and legal blurb, please look in index.rst.
+
+------------------------------------------------------------------------------
+
+This file contains the documentation for the sysctl files in
+/proc/sys/net
+
+The interface  to  the  networking  parts  of  the  kernel  is  located  in
+/proc/sys/net. The following table shows all possible subdirectories.  You may
+see only some of them, depending on your kernel's configuration.
+
+
+Table : Subdirectories in /proc/sys/net
+
+ ========= =================== = ========== ==================
+ Directory Content               Directory  Content
+ ========= =================== = ========== ==================
+ core      General parameter     appletalk  Appletalk protocol
+ unix      Unix domain sockets   netrom     NET/ROM
+ 802       E802 protocol         ax25       AX25
+ ethernet  Ethernet protocol     rose       X.25 PLP layer
+ ipv4      IP version 4          x25        X.25 protocol
+ ipx       IPX                   token-ring IBM token ring
+ bridge    Bridging              decnet     DEC net
+ ipv6      IP version 6          tipc       TIPC
+ ========= =================== = ========== ==================
+
+1. /proc/sys/net/core - Network core options
+============================================
+
+bpf_jit_enable
+--------------
+
+This enables the BPF Just in Time (JIT) compiler. BPF is a flexible
+and efficient infrastructure allowing to execute bytecode at various
+hook points. It is used in a number of Linux kernel subsystems such
+as networking (e.g. XDP, tc), tracing (e.g. kprobes, uprobes, tracepoints)
+and security (e.g. seccomp). LLVM has a BPF back end that can compile
+restricted C into a sequence of BPF instructions. After program load
+through bpf(2) and passing a verifier in the kernel, a JIT will then
+translate these BPF proglets into native CPU instructions. There are
+two flavors of JITs, the newer eBPF JIT currently supported on:
+
+  - x86_64
+  - x86_32
+  - arm64
+  - arm32
+  - ppc64
+  - sparc64
+  - mips64
+  - s390x
+  - riscv
+
+And the older cBPF JIT supported on the following archs:
+
+  - mips
+  - ppc
+  - sparc
+
+eBPF JITs are a superset of cBPF JITs, meaning the kernel will
+migrate cBPF instructions into eBPF instructions and then JIT
+compile them transparently. Older cBPF JITs can only translate
+tcpdump filters, seccomp rules, etc, but not mentioned eBPF
+programs loaded through bpf(2).
+
+Values:
+
+	- 0 - disable the JIT (default value)
+	- 1 - enable the JIT
+	- 2 - enable the JIT and ask the compiler to emit traces on kernel log.
+
+bpf_jit_harden
+--------------
+
+This enables hardening for the BPF JIT compiler. Supported are eBPF
+JIT backends. Enabling hardening trades off performance, but can
+mitigate JIT spraying.
+
+Values:
+
+	- 0 - disable JIT hardening (default value)
+	- 1 - enable JIT hardening for unprivileged users only
+	- 2 - enable JIT hardening for all users
+
+bpf_jit_kallsyms
+----------------
+
+When BPF JIT compiler is enabled, then compiled images are unknown
+addresses to the kernel, meaning they neither show up in traces nor
+in /proc/kallsyms. This enables export of these addresses, which can
+be used for debugging/tracing. If bpf_jit_harden is enabled, this
+feature is disabled.
+
+Values :
+
+	- 0 - disable JIT kallsyms export (default value)
+	- 1 - enable JIT kallsyms export for privileged users only
+
+bpf_jit_limit
+-------------
+
+This enforces a global limit for memory allocations to the BPF JIT
+compiler in order to reject unprivileged JIT requests once it has
+been surpassed. bpf_jit_limit contains the value of the global limit
+in bytes.
+
+dev_weight
+----------
+
+The maximum number of packets that kernel can handle on a NAPI interrupt,
+it's a Per-CPU variable. For drivers that support LRO or GRO_HW, a hardware
+aggregated packet is counted as one packet in this context.
+
+Default: 64
+
+dev_weight_rx_bias
+------------------
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function
+of the driver for the per softirq cycle netdev_budget. This parameter influences
+the proportion of the configured netdev_budget that is spent on RPS based packet
+processing during RX softirq cycles. It is further meant for making current
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based
+on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias).
+
+Default: 1
+
+dev_weight_tx_bias
+------------------
+
+Scales the maximum number of packets that can be processed during a TX softirq cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric
+net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog.
+
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+
+Default: 1
+
+default_qdisc
+-------------
+
+The default queuing discipline to use for network devices. This allows
+overriding the default of pfifo_fast with an alternative. Since the default
+queuing discipline is created without additional parameters so is best suited
+to queuing disciplines that work well without configuration like stochastic
+fair queue (sfq), CoDel (codel) or fair queue CoDel (fq_codel). Don't use
+queuing disciplines like Hierarchical Token Bucket or Deficit Round Robin
+which require setting up classes and bandwidths. Note that physical multiqueue
+interfaces still use mq as root qdisc, which in turn uses this default for its
+leaves. Virtual devices (like e.g. lo or veth) ignore this setting and instead
+default to noqueue.
+
+Default: pfifo_fast
+
+busy_read
+---------
+
+Low latency busy poll timeout for socket reads. (needs CONFIG_NET_RX_BUSY_POLL)
+Approximate time in us to busy loop waiting for packets on the device queue.
+This sets the default value of the SO_BUSY_POLL socket option.
+Can be set or overridden per socket by setting socket option SO_BUSY_POLL,
+which is the preferred method of enabling. If you need to enable the feature
+globally via sysctl, a value of 50 is recommended.
+
+Will increase power usage.
+
+Default: 0 (off)
+
+busy_poll
+----------------
+Low latency busy poll timeout for poll and select. (needs CONFIG_NET_RX_BUSY_POLL)
+Approximate time in us to busy loop waiting for events.
+Recommended value depends on the number of sockets you poll on.
+For several sockets 50, for several hundreds 100.
+For more than that you probably want to use epoll.
+Note that only sockets with SO_BUSY_POLL set will be busy polled,
+so you want to either selectively set SO_BUSY_POLL on those sockets or set
+sysctl.net.busy_read globally.
+
+Will increase power usage.
+
+Default: 0 (off)
+
+rmem_default
+------------
+
+The default setting of the socket receive buffer in bytes.
+
+rmem_max
+--------
+
+The maximum receive socket buffer size in bytes.
+
+tstamp_allow_data
+-----------------
+Allow processes to receive tx timestamps looped together with the original
+packet contents. If disabled, transmit timestamp requests from unprivileged
+processes are dropped unless socket option SOF_TIMESTAMPING_OPT_TSONLY is set.
+
+Default: 1 (on)
+
+
+wmem_default
+------------
+
+The default setting (in bytes) of the socket send buffer.
+
+wmem_max
+--------
+
+The maximum send socket buffer size in bytes.
+
+message_burst and message_cost
+------------------------------
+
+These parameters  are used to limit the warning messages written to the kernel
+log from  the  networking  code.  They  enforce  a  rate  limit  to  make  a
+denial-of-service attack  impossible. A higher message_cost factor, results in
+fewer messages that will be written. Message_burst controls when messages will
+be dropped.  The  default  settings  limit  warning messages to one every five
+seconds.
+
+warnings
+--------
+
+This sysctl is now unused.
+
+This was used to control console messages from the networking stack that
+occur because of problems on the network like duplicate address or bad
+checksums.
+
+These messages are now emitted at KERN_DEBUG and can generally be enabled
+and controlled by the dynamic_debug facility.
+
+netdev_budget
+-------------
+
+Maximum number of packets taken from all interfaces in one polling cycle (NAPI
+poll). In one polling cycle interfaces which are registered to polling are
+probed in a round-robin manner. Also, a polling cycle may not exceed
+netdev_budget_usecs microseconds, even if netdev_budget has not been
+exhausted.
+
+netdev_budget_usecs
+---------------------
+
+Maximum number of microseconds in one NAPI polling cycle. Polling
+will exit when either netdev_budget_usecs have elapsed during the
+poll cycle or the number of packets processed reaches netdev_budget.
+
+netdev_max_backlog
+------------------
+
+Maximum number  of  packets,  queued  on  the  INPUT  side, when the interface
+receives packets faster than kernel can process them.
+
+netdev_rss_key
+--------------
+
+RSS (Receive Side Scaling) enabled drivers use a 40 bytes host key that is
+randomly generated.
+Some user space might need to gather its content even if drivers do not
+provide ethtool -x support yet.
+
+::
+
+  myhost:~# cat /proc/sys/net/core/netdev_rss_key
+  84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8: ... (52 bytes total)
+
+File contains nul bytes if no driver ever called netdev_rss_key_fill() function.
+
+Note:
+  /proc/sys/net/core/netdev_rss_key contains 52 bytes of key,
+  but most drivers only use 40 bytes of it.
+
+::
+
+  myhost:~# ethtool -x eth0
+  RX flow hash indirection table for eth0 with 8 RX ring(s):
+      0:    0     1     2     3     4     5     6     7
+  RSS hash key:
+  84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8:43:e3:c9:0c:fd:17:55:c2:3a:4d:69:ed:f1:42:89
+
+netdev_tstamp_prequeue
+----------------------
+
+If set to 0, RX packet timestamps can be sampled after RPS processing, when
+the target CPU processes packets. It might give some delay on timestamps, but
+permit to distribute the load on several cpus.
+
+If set to 1 (default), timestamps are sampled as soon as possible, before
+queueing.
+
+optmem_max
+----------
+
+Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
+of struct cmsghdr structures with appended data.
+
+fb_tunnels_only_for_init_net
+----------------------------
+
+Controls if fallback tunnels (like tunl0, gre0, gretap0, erspan0,
+sit0, ip6tnl0, ip6gre0) are automatically created when a new
+network namespace is created, if corresponding tunnel is present
+in initial network namespace.
+If set to 1, these devices are not automatically created, and
+user space is responsible for creating them if needed.
+
+Default : 0  (for compatibility reasons)
+
+devconf_inherit_init_net
+------------------------
+
+Controls if a new network namespace should inherit all current
+settings under /proc/sys/net/{ipv4,ipv6}/conf/{all,default}/. By
+default, we keep the current behavior: for IPv4 we inherit all current
+settings from init_net and for IPv6 we reset all settings to default.
+
+If set to 1, both IPv4 and IPv6 settings are forced to inherit from
+current ones in init_net. If set to 2, both IPv4 and IPv6 settings are
+forced to reset to their default values.
+
+Default : 0  (for compatibility reasons)
+
+2. /proc/sys/net/unix - Parameters for Unix domain sockets
+----------------------------------------------------------
+
+There is only one file in this directory.
+unix_dgram_qlen limits the max number of datagrams queued in Unix domain
+socket's buffer. It will not take effect unless PF_UNIX flag is specified.
+
+
+3. /proc/sys/net/ipv4 - IPV4 settings
+-------------------------------------
+Please see: Documentation/networking/ip-sysctl.txt and ipvs-sysctl.txt for
+descriptions of these entries.
+
+
+4. Appletalk
+------------
+
+The /proc/sys/net/appletalk  directory  holds the Appletalk configuration data
+when Appletalk is loaded. The configurable parameters are:
+
+aarp-expiry-time
+----------------
+
+The amount  of  time  we keep an ARP entry before expiring it. Used to age out
+old hosts.
+
+aarp-resolve-time
+-----------------
+
+The amount of time we will spend trying to resolve an Appletalk address.
+
+aarp-retransmit-limit
+---------------------
+
+The number of times we will retransmit a query before giving up.
+
+aarp-tick-time
+--------------
+
+Controls the rate at which expires are checked.
+
+The directory  /proc/net/appletalk  holds the list of active Appletalk sockets
+on a machine.
+
+The fields  indicate  the DDP type, the local address (in network:node format)
+the remote  address,  the  size of the transmit pending queue, the size of the
+received queue  (bytes waiting for applications to read) the state and the uid
+owning the socket.
+
+/proc/net/atalk_iface lists  all  the  interfaces  configured for appletalk.It
+shows the  name  of the interface, its Appletalk address, the network range on
+that address  (or  network number for phase 1 networks), and the status of the
+interface.
+
+/proc/net/atalk_route lists  each  known  network  route.  It lists the target
+(network) that the route leads to, the router (may be directly connected), the
+route flags, and the device the route is using.
+
+
+5. IPX
+------
+
+The IPX protocol has no tunable values in proc/sys/net.
+
+The IPX  protocol  does,  however,  provide  proc/net/ipx. This lists each IPX
+socket giving  the  local  and  remote  addresses  in  Novell  format (that is
+network:node:port). In  accordance  with  the  strange  Novell  tradition,
+everything but the port is in hex. Not_Connected is displayed for sockets that
+are not  tied to a specific remote address. The Tx and Rx queue sizes indicate
+the number  of  bytes  pending  for  transmission  and  reception.  The  state
+indicates the  state  the  socket  is  in and the uid is the owning uid of the
+socket.
+
+The /proc/net/ipx_interface  file lists all IPX interfaces. For each interface
+it gives  the network number, the node number, and indicates if the network is
+the primary  network.  It  also  indicates  which  device  it  is bound to (or
+Internal for  internal  networks)  and  the  Frame  Type if appropriate. Linux
+supports 802.3,  802.2,  802.2  SNAP  and DIX (Blue Book) ethernet framing for
+IPX.
+
+The /proc/net/ipx_route  table  holds  a list of IPX routes. For each route it
+gives the  destination  network, the router node (or Directly) and the network
+address of the router (or Connected) for internal networks.
+
+6. TIPC
+-------
+
+tipc_rmem
+---------
+
+The TIPC protocol now has a tunable for the receive memory, similar to the
+tcp_rmem - i.e. a vector of 3 INTEGERs: (min, default, max)
+
+::
+
+    # cat /proc/sys/net/tipc/tipc_rmem
+    4252725 34021800        68043600
+    #
+
+The max value is set to CONN_OVERLOAD_LIMIT, and the default and min values
+are scaled (shifted) versions of that same value.  Note that the min value
+is not at this point in time used in any meaningful way, but the triplet is
+preserved in order to be consistent with things like tcp_rmem.
+
+named_timeout
+-------------
+
+TIPC name table updates are distributed asynchronously in a cluster, without
+any form of transaction handling. This means that different race scenarios are
+possible. One such is that a name withdrawal sent out by one node and received
+by another node may arrive after a second, overlapping name publication already
+has been accepted from a third node, although the conflicting updates
+originally may have been issued in the correct sequential order.
+If named_timeout is nonzero, failed topology updates will be placed on a defer
+queue until another event arrives that clears the error, or until the timeout
+expires. Value is in milliseconds.
diff --git a/Documentation/admin-guide/sysctl/sunrpc.rst b/Documentation/admin-guide/sysctl/sunrpc.rst
new file mode 100644
index 000000000000..09780a682afd
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/sunrpc.rst
@@ -0,0 +1,25 @@
+===================================
+Documentation for /proc/sys/sunrpc/
+===================================
+
+kernel version 2.2.10
+
+Copyright (c) 1998, 1999,  Rik van Riel <riel@nl.linux.org>
+
+For general info and legal blurb, please look in index.rst.
+
+------------------------------------------------------------------------------
+
+This file contains the documentation for the sysctl files in
+/proc/sys/sunrpc and is valid for Linux kernel version 2.2.
+
+The files in this directory can be used to (re)set the debug
+flags of the SUN Remote Procedure Call (RPC) subsystem in
+the Linux kernel. This stuff is used for NFS, KNFSD and
+maybe a few other things as well.
+
+The files in there are used to control the debugging flags:
+rpc_debug, nfs_debug, nfsd_debug and nlm_debug.
+
+These flags are for kernel hackers only. You should read the
+source code in net/sunrpc/ for more information.
diff --git a/Documentation/admin-guide/sysctl/user.rst b/Documentation/admin-guide/sysctl/user.rst
new file mode 100644
index 000000000000..650eaa03f15e
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/user.rst
@@ -0,0 +1,78 @@
+=================================
+Documentation for /proc/sys/user/
+=================================
+
+kernel version 4.9.0
+
+Copyright (c) 2016		Eric Biederman <ebiederm@xmission.com>
+
+------------------------------------------------------------------------------
+
+This file contains the documentation for the sysctl files in
+/proc/sys/user.
+
+The files in this directory can be used to override the default
+limits on the number of namespaces and other objects that have
+per user per user namespace limits.
+
+The primary purpose of these limits is to stop programs that
+malfunction and attempt to create a ridiculous number of objects,
+before the malfunction becomes a system wide problem.  It is the
+intention that the defaults of these limits are set high enough that
+no program in normal operation should run into these limits.
+
+The creation of per user per user namespace objects are charged to
+the user in the user namespace who created the object and
+verified to be below the per user limit in that user namespace.
+
+The creation of objects is also charged to all of the users
+who created user namespaces the creation of the object happens
+in (user namespaces can be nested) and verified to be below the per user
+limits in the user namespaces of those users.
+
+This recursive counting of created objects ensures that creating a
+user namespace does not allow a user to escape their current limits.
+
+Currently, these files are in /proc/sys/user:
+
+max_cgroup_namespaces
+=====================
+
+  The maximum number of cgroup namespaces that any user in the current
+  user namespace may create.
+
+max_ipc_namespaces
+==================
+
+  The maximum number of ipc namespaces that any user in the current
+  user namespace may create.
+
+max_mnt_namespaces
+==================
+
+  The maximum number of mount namespaces that any user in the current
+  user namespace may create.
+
+max_net_namespaces
+==================
+
+  The maximum number of network namespaces that any user in the
+  current user namespace may create.
+
+max_pid_namespaces
+==================
+
+  The maximum number of pid namespaces that any user in the current
+  user namespace may create.
+
+max_user_namespaces
+===================
+
+  The maximum number of user namespaces that any user in the current
+  user namespace may create.
+
+max_uts_namespaces
+==================
+
+  The maximum number of user namespaces that any user in the current
+  user namespace may create.
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
new file mode 100644
index 000000000000..64aeee1009ca
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -0,0 +1,964 @@
+===============================
+Documentation for /proc/sys/vm/
+===============================
+
+kernel version 2.6.29
+
+Copyright (c) 1998, 1999,  Rik van Riel <riel@nl.linux.org>
+
+Copyright (c) 2008         Peter W. Morreale <pmorreale@novell.com>
+
+For general info and legal blurb, please look in index.rst.
+
+------------------------------------------------------------------------------
+
+This file contains the documentation for the sysctl files in
+/proc/sys/vm and is valid for Linux kernel version 2.6.29.
+
+The files in this directory can be used to tune the operation
+of the virtual memory (VM) subsystem of the Linux kernel and
+the writeout of dirty data to disk.
+
+Default values and initialization routines for most of these
+files can be found in mm/swap.c.
+
+Currently, these files are in /proc/sys/vm:
+
+- admin_reserve_kbytes
+- block_dump
+- compact_memory
+- compact_unevictable_allowed
+- dirty_background_bytes
+- dirty_background_ratio
+- dirty_bytes
+- dirty_expire_centisecs
+- dirty_ratio
+- dirtytime_expire_seconds
+- dirty_writeback_centisecs
+- drop_caches
+- extfrag_threshold
+- hugetlb_shm_group
+- laptop_mode
+- legacy_va_layout
+- lowmem_reserve_ratio
+- max_map_count
+- memory_failure_early_kill
+- memory_failure_recovery
+- min_free_kbytes
+- min_slab_ratio
+- min_unmapped_ratio
+- mmap_min_addr
+- mmap_rnd_bits
+- mmap_rnd_compat_bits
+- nr_hugepages
+- nr_hugepages_mempolicy
+- nr_overcommit_hugepages
+- nr_trim_pages         (only if CONFIG_MMU=n)
+- numa_zonelist_order
+- oom_dump_tasks
+- oom_kill_allocating_task
+- overcommit_kbytes
+- overcommit_memory
+- overcommit_ratio
+- page-cluster
+- panic_on_oom
+- percpu_pagelist_fraction
+- stat_interval
+- stat_refresh
+- numa_stat
+- swappiness
+- unprivileged_userfaultfd
+- user_reserve_kbytes
+- vfs_cache_pressure
+- watermark_boost_factor
+- watermark_scale_factor
+- zone_reclaim_mode
+
+
+admin_reserve_kbytes
+====================
+
+The amount of free memory in the system that should be reserved for users
+with the capability cap_sys_admin.
+
+admin_reserve_kbytes defaults to min(3% of free pages, 8MB)
+
+That should provide enough for the admin to log in and kill a process,
+if necessary, under the default overcommit 'guess' mode.
+
+Systems running under overcommit 'never' should increase this to account
+for the full Virtual Memory Size of programs used to recover. Otherwise,
+root may not be able to log in to recover the system.
+
+How do you calculate a minimum useful reserve?
+
+sshd or login + bash (or some other shell) + top (or ps, kill, etc.)
+
+For overcommit 'guess', we can sum resident set sizes (RSS).
+On x86_64 this is about 8MB.
+
+For overcommit 'never', we can take the max of their virtual sizes (VSZ)
+and add the sum of their RSS.
+On x86_64 this is about 128MB.
+
+Changing this takes effect whenever an application requests memory.
+
+
+block_dump
+==========
+
+block_dump enables block I/O debugging when set to a nonzero value. More
+information on block I/O debugging is in Documentation/admin-guide/laptops/laptop-mode.rst.
+
+
+compact_memory
+==============
+
+Available only when CONFIG_COMPACTION is set. When 1 is written to the file,
+all zones are compacted such that free memory is available in contiguous
+blocks where possible. This can be important for example in the allocation of
+huge pages although processes will also directly compact memory as required.
+
+
+compact_unevictable_allowed
+===========================
+
+Available only when CONFIG_COMPACTION is set. When set to 1, compaction is
+allowed to examine the unevictable lru (mlocked pages) for pages to compact.
+This should be used on systems where stalls for minor page faults are an
+acceptable trade for large contiguous free memory.  Set to 0 to prevent
+compaction from moving pages that are unevictable.  Default value is 1.
+
+
+dirty_background_bytes
+======================
+
+Contains the amount of dirty memory at which the background kernel
+flusher threads will start writeback.
+
+Note:
+  dirty_background_bytes is the counterpart of dirty_background_ratio. Only
+  one of them may be specified at a time. When one sysctl is written it is
+  immediately taken into account to evaluate the dirty memory limits and the
+  other appears as 0 when read.
+
+
+dirty_background_ratio
+======================
+
+Contains, as a percentage of total available memory that contains free pages
+and reclaimable pages, the number of pages at which the background kernel
+flusher threads will start writing out dirty data.
+
+The total available memory is not equal to total system memory.
+
+
+dirty_bytes
+===========
+
+Contains the amount of dirty memory at which a process generating disk writes
+will itself start writeback.
+
+Note: dirty_bytes is the counterpart of dirty_ratio. Only one of them may be
+specified at a time. When one sysctl is written it is immediately taken into
+account to evaluate the dirty memory limits and the other appears as 0 when
+read.
+
+Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any
+value lower than this limit will be ignored and the old configuration will be
+retained.
+
+
+dirty_expire_centisecs
+======================
+
+This tunable is used to define when dirty data is old enough to be eligible
+for writeout by the kernel flusher threads.  It is expressed in 100'ths
+of a second.  Data which has been dirty in-memory for longer than this
+interval will be written out next time a flusher thread wakes up.
+
+
+dirty_ratio
+===========
+
+Contains, as a percentage of total available memory that contains free pages
+and reclaimable pages, the number of pages at which a process which is
+generating disk writes will itself start writing out dirty data.
+
+The total available memory is not equal to total system memory.
+
+
+dirtytime_expire_seconds
+========================
+
+When a lazytime inode is constantly having its pages dirtied, the inode with
+an updated timestamp will never get chance to be written out.  And, if the
+only thing that has happened on the file system is a dirtytime inode caused
+by an atime update, a worker will be scheduled to make sure that inode
+eventually gets pushed out to disk.  This tunable is used to define when dirty
+inode is old enough to be eligible for writeback by the kernel flusher threads.
+And, it is also used as the interval to wakeup dirtytime_writeback thread.
+
+
+dirty_writeback_centisecs
+=========================
+
+The kernel flusher threads will periodically wake up and write `old` data
+out to disk.  This tunable expresses the interval between those wakeups, in
+100'ths of a second.
+
+Setting this to zero disables periodic writeback altogether.
+
+
+drop_caches
+===========
+
+Writing to this will cause the kernel to drop clean caches, as well as
+reclaimable slab objects like dentries and inodes.  Once dropped, their
+memory becomes free.
+
+To free pagecache::
+
+	echo 1 > /proc/sys/vm/drop_caches
+
+To free reclaimable slab objects (includes dentries and inodes)::
+
+	echo 2 > /proc/sys/vm/drop_caches
+
+To free slab objects and pagecache::
+
+	echo 3 > /proc/sys/vm/drop_caches
+
+This is a non-destructive operation and will not free any dirty objects.
+To increase the number of objects freed by this operation, the user may run
+`sync` prior to writing to /proc/sys/vm/drop_caches.  This will minimize the
+number of dirty objects on the system and create more candidates to be
+dropped.
+
+This file is not a means to control the growth of the various kernel caches
+(inodes, dentries, pagecache, etc...)  These objects are automatically
+reclaimed by the kernel when memory is needed elsewhere on the system.
+
+Use of this file can cause performance problems.  Since it discards cached
+objects, it may cost a significant amount of I/O and CPU to recreate the
+dropped objects, especially if they were under heavy use.  Because of this,
+use outside of a testing or debugging environment is not recommended.
+
+You may see informational messages in your kernel log when this file is
+used::
+
+	cat (1234): drop_caches: 3
+
+These are informational only.  They do not mean that anything is wrong
+with your system.  To disable them, echo 4 (bit 2) into drop_caches.
+
+
+extfrag_threshold
+=================
+
+This parameter affects whether the kernel will compact memory or direct
+reclaim to satisfy a high-order allocation. The extfrag/extfrag_index file in
+debugfs shows what the fragmentation index for each order is in each zone in
+the system. Values tending towards 0 imply allocations would fail due to lack
+of memory, values towards 1000 imply failures are due to fragmentation and -1
+implies that the allocation will succeed as long as watermarks are met.
+
+The kernel will not compact memory in a zone if the
+fragmentation index is <= extfrag_threshold. The default value is 500.
+
+
+highmem_is_dirtyable
+====================
+
+Available only for systems with CONFIG_HIGHMEM enabled (32b systems).
+
+This parameter controls whether the high memory is considered for dirty
+writers throttling.  This is not the case by default which means that
+only the amount of memory directly visible/usable by the kernel can
+be dirtied. As a result, on systems with a large amount of memory and
+lowmem basically depleted writers might be throttled too early and
+streaming writes can get very slow.
+
+Changing the value to non zero would allow more memory to be dirtied
+and thus allow writers to write more data which can be flushed to the
+storage more effectively. Note this also comes with a risk of pre-mature
+OOM killer because some writers (e.g. direct block device writes) can
+only use the low memory and they can fill it up with dirty data without
+any throttling.
+
+
+hugetlb_shm_group
+=================
+
+hugetlb_shm_group contains group id that is allowed to create SysV
+shared memory segment using hugetlb page.
+
+
+laptop_mode
+===========
+
+laptop_mode is a knob that controls "laptop mode". All the things that are
+controlled by this knob are discussed in Documentation/admin-guide/laptops/laptop-mode.rst.
+
+
+legacy_va_layout
+================
+
+If non-zero, this sysctl disables the new 32-bit mmap layout - the kernel
+will use the legacy (2.4) layout for all processes.
+
+
+lowmem_reserve_ratio
+====================
+
+For some specialised workloads on highmem machines it is dangerous for
+the kernel to allow process memory to be allocated from the "lowmem"
+zone.  This is because that memory could then be pinned via the mlock()
+system call, or by unavailability of swapspace.
+
+And on large highmem machines this lack of reclaimable lowmem memory
+can be fatal.
+
+So the Linux page allocator has a mechanism which prevents allocations
+which *could* use highmem from using too much lowmem.  This means that
+a certain amount of lowmem is defended from the possibility of being
+captured into pinned user memory.
+
+(The same argument applies to the old 16 megabyte ISA DMA region.  This
+mechanism will also defend that region from allocations which could use
+highmem or lowmem).
+
+The `lowmem_reserve_ratio` tunable determines how aggressive the kernel is
+in defending these lower zones.
+
+If you have a machine which uses highmem or ISA DMA and your
+applications are using mlock(), or if you are running with no swap then
+you probably should change the lowmem_reserve_ratio setting.
+
+The lowmem_reserve_ratio is an array. You can see them by reading this file::
+
+	% cat /proc/sys/vm/lowmem_reserve_ratio
+	256     256     32
+
+But, these values are not used directly. The kernel calculates # of protection
+pages for each zones from them. These are shown as array of protection pages
+in /proc/zoneinfo like followings. (This is an example of x86-64 box).
+Each zone has an array of protection pages like this::
+
+  Node 0, zone      DMA
+    pages free     1355
+          min      3
+          low      3
+          high     4
+	:
+	:
+      numa_other   0
+          protection: (0, 2004, 2004, 2004)
+	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+    pagesets
+      cpu: 0 pcp: 0
+          :
+
+These protections are added to score to judge whether this zone should be used
+for page allocation or should be reclaimed.
+
+In this example, if normal pages (index=2) are required to this DMA zone and
+watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should
+not be used because pages_free(1355) is smaller than watermark + protection[2]
+(4 + 2004 = 2008). If this protection value is 0, this zone would be used for
+normal page requirement. If requirement is DMA zone(index=0), protection[0]
+(=0) is used.
+
+zone[i]'s protection[j] is calculated by following expression::
+
+  (i < j):
+    zone[i]->protection[j]
+    = (total sums of managed_pages from zone[i+1] to zone[j] on the node)
+      / lowmem_reserve_ratio[i];
+  (i = j):
+     (should not be protected. = 0;
+  (i > j):
+     (not necessary, but looks 0)
+
+The default values of lowmem_reserve_ratio[i] are
+
+    === ====================================
+    256 (if zone[i] means DMA or DMA32 zone)
+    32  (others)
+    === ====================================
+
+As above expression, they are reciprocal number of ratio.
+256 means 1/256. # of protection pages becomes about "0.39%" of total managed
+pages of higher zones on the node.
+
+If you would like to protect more pages, smaller values are effective.
+The minimum value is 1 (1/1 -> 100%). The value less than 1 completely
+disables protection of the pages.
+
+
+max_map_count:
+==============
+
+This file contains the maximum number of memory map areas a process
+may have. Memory map areas are used as a side-effect of calling
+malloc, directly by mmap, mprotect, and madvise, and also when loading
+shared libraries.
+
+While most applications need less than a thousand maps, certain
+programs, particularly malloc debuggers, may consume lots of them,
+e.g., up to one or two maps per allocation.
+
+The default value is 65536.
+
+
+memory_failure_early_kill:
+==========================
+
+Control how to kill processes when uncorrected memory error (typically
+a 2bit error in a memory module) is detected in the background by hardware
+that cannot be handled by the kernel. In some cases (like the page
+still having a valid copy on disk) the kernel will handle the failure
+transparently without affecting any applications. But if there is
+no other uptodate copy of the data it will kill to prevent any data
+corruptions from propagating.
+
+1: Kill all processes that have the corrupted and not reloadable page mapped
+as soon as the corruption is detected.  Note this is not supported
+for a few types of pages, like kernel internally allocated data or
+the swap cache, but works for the majority of user pages.
+
+0: Only unmap the corrupted page from all processes and only kill a process
+who tries to access it.
+
+The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
+handle this if they want to.
+
+This is only active on architectures/platforms with advanced machine
+check handling and depends on the hardware capabilities.
+
+Applications can override this setting individually with the PR_MCE_KILL prctl
+
+
+memory_failure_recovery
+=======================
+
+Enable memory failure recovery (when supported by the platform)
+
+1: Attempt recovery.
+
+0: Always panic on a memory failure.
+
+
+min_free_kbytes
+===============
+
+This is used to force the Linux VM to keep a minimum number
+of kilobytes free.  The VM uses this number to compute a
+watermark[WMARK_MIN] value for each lowmem zone in the system.
+Each lowmem zone gets a number of reserved free pages based
+proportionally on its size.
+
+Some minimal amount of memory is needed to satisfy PF_MEMALLOC
+allocations; if you set this to lower than 1024KB, your system will
+become subtly broken, and prone to deadlock under high loads.
+
+Setting this too high will OOM your machine instantly.
+
+
+min_slab_ratio
+==============
+
+This is available only on NUMA kernels.
+
+A percentage of the total pages in each zone.  On Zone reclaim
+(fallback from the local zone occurs) slabs will be reclaimed if more
+than this percentage of pages in a zone are reclaimable slab pages.
+This insures that the slab growth stays under control even in NUMA
+systems that rarely perform global reclaim.
+
+The default is 5 percent.
+
+Note that slab reclaim is triggered in a per zone / node fashion.
+The process of reclaiming slab memory is currently not node specific
+and may not be fast.
+
+
+min_unmapped_ratio
+==================
+
+This is available only on NUMA kernels.
+
+This is a percentage of the total pages in each zone. Zone reclaim will
+only occur if more than this percentage of pages are in a state that
+zone_reclaim_mode allows to be reclaimed.
+
+If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared
+against all file-backed unmapped pages including swapcache pages and tmpfs
+files. Otherwise, only unmapped pages backed by normal files but not tmpfs
+files and similar are considered.
+
+The default is 1 percent.
+
+
+mmap_min_addr
+=============
+
+This file indicates the amount of address space  which a user process will
+be restricted from mmapping.  Since kernel null dereference bugs could
+accidentally operate based on the information in the first couple of pages
+of memory userspace processes should not be allowed to write to them.  By
+default this value is set to 0 and no protections will be enforced by the
+security module.  Setting this value to something like 64k will allow the
+vast majority of applications to work correctly and provide defense in depth
+against future potential kernel bugs.
+
+
+mmap_rnd_bits
+=============
+
+This value can be used to select the number of bits to use to
+determine the random offset to the base address of vma regions
+resulting from mmap allocations on architectures which support
+tuning address space randomization.  This value will be bounded
+by the architecture's minimum and maximum supported values.
+
+This value can be changed after boot using the
+/proc/sys/vm/mmap_rnd_bits tunable
+
+
+mmap_rnd_compat_bits
+====================
+
+This value can be used to select the number of bits to use to
+determine the random offset to the base address of vma regions
+resulting from mmap allocations for applications run in
+compatibility mode on architectures which support tuning address
+space randomization.  This value will be bounded by the
+architecture's minimum and maximum supported values.
+
+This value can be changed after boot using the
+/proc/sys/vm/mmap_rnd_compat_bits tunable
+
+
+nr_hugepages
+============
+
+Change the minimum size of the hugepage pool.
+
+See Documentation/admin-guide/mm/hugetlbpage.rst
+
+
+nr_hugepages_mempolicy
+======================
+
+Change the size of the hugepage pool at run-time on a specific
+set of NUMA nodes.
+
+See Documentation/admin-guide/mm/hugetlbpage.rst
+
+
+nr_overcommit_hugepages
+=======================
+
+Change the maximum size of the hugepage pool. The maximum is
+nr_hugepages + nr_overcommit_hugepages.
+
+See Documentation/admin-guide/mm/hugetlbpage.rst
+
+
+nr_trim_pages
+=============
+
+This is available only on NOMMU kernels.
+
+This value adjusts the excess page trimming behaviour of power-of-2 aligned
+NOMMU mmap allocations.
+
+A value of 0 disables trimming of allocations entirely, while a value of 1
+trims excess pages aggressively. Any value >= 1 acts as the watermark where
+trimming of allocations is initiated.
+
+The default value is 1.
+
+See Documentation/nommu-mmap.txt for more information.
+
+
+numa_zonelist_order
+===================
+
+This sysctl is only for NUMA and it is deprecated. Anything but
+Node order will fail!
+
+'where the memory is allocated from' is controlled by zonelists.
+
+(This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation.
+you may be able to read ZONE_DMA as ZONE_DMA32...)
+
+In non-NUMA case, a zonelist for GFP_KERNEL is ordered as following.
+ZONE_NORMAL -> ZONE_DMA
+This means that a memory allocation request for GFP_KERNEL will
+get memory from ZONE_DMA only when ZONE_NORMAL is not available.
+
+In NUMA case, you can think of following 2 types of order.
+Assume 2 node NUMA and below is zonelist of Node(0)'s GFP_KERNEL::
+
+  (A) Node(0) ZONE_NORMAL -> Node(0) ZONE_DMA -> Node(1) ZONE_NORMAL
+  (B) Node(0) ZONE_NORMAL -> Node(1) ZONE_NORMAL -> Node(0) ZONE_DMA.
+
+Type(A) offers the best locality for processes on Node(0), but ZONE_DMA
+will be used before ZONE_NORMAL exhaustion. This increases possibility of
+out-of-memory(OOM) of ZONE_DMA because ZONE_DMA is tend to be small.
+
+Type(B) cannot offer the best locality but is more robust against OOM of
+the DMA zone.
+
+Type(A) is called as "Node" order. Type (B) is "Zone" order.
+
+"Node order" orders the zonelists by node, then by zone within each node.
+Specify "[Nn]ode" for node order
+
+"Zone Order" orders the zonelists by zone type, then by node within each
+zone.  Specify "[Zz]one" for zone order.
+
+Specify "[Dd]efault" to request automatic configuration.
+
+On 32-bit, the Normal zone needs to be preserved for allocations accessible
+by the kernel, so "zone" order will be selected.
+
+On 64-bit, devices that require DMA32/DMA are relatively rare, so "node"
+order will be selected.
+
+Default order is recommended unless this is causing problems for your
+system/application.
+
+
+oom_dump_tasks
+==============
+
+Enables a system-wide task dump (excluding kernel threads) to be produced
+when the kernel performs an OOM-killing and includes such information as
+pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj
+score, and name.  This is helpful to determine why the OOM killer was
+invoked, to identify the rogue task that caused it, and to determine why
+the OOM killer chose the task it did to kill.
+
+If this is set to zero, this information is suppressed.  On very
+large systems with thousands of tasks it may not be feasible to dump
+the memory state information for each one.  Such systems should not
+be forced to incur a performance penalty in OOM conditions when the
+information may not be desired.
+
+If this is set to non-zero, this information is shown whenever the
+OOM killer actually kills a memory-hogging task.
+
+The default value is 1 (enabled).
+
+
+oom_kill_allocating_task
+========================
+
+This enables or disables killing the OOM-triggering task in
+out-of-memory situations.
+
+If this is set to zero, the OOM killer will scan through the entire
+tasklist and select a task based on heuristics to kill.  This normally
+selects a rogue memory-hogging task that frees up a large amount of
+memory when killed.
+
+If this is set to non-zero, the OOM killer simply kills the task that
+triggered the out-of-memory condition.  This avoids the expensive
+tasklist scan.
+
+If panic_on_oom is selected, it takes precedence over whatever value
+is used in oom_kill_allocating_task.
+
+The default value is 0.
+
+
+overcommit_kbytes
+=================
+
+When overcommit_memory is set to 2, the committed address space is not
+permitted to exceed swap plus this amount of physical RAM. See below.
+
+Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one
+of them may be specified at a time. Setting one disables the other (which
+then appears as 0 when read).
+
+
+overcommit_memory
+=================
+
+This value contains a flag that enables memory overcommitment.
+
+When this flag is 0, the kernel attempts to estimate the amount
+of free memory left when userspace requests more memory.
+
+When this flag is 1, the kernel pretends there is always enough
+memory until it actually runs out.
+
+When this flag is 2, the kernel uses a "never overcommit"
+policy that attempts to prevent any overcommit of memory.
+Note that user_reserve_kbytes affects this policy.
+
+This feature can be very useful because there are a lot of
+programs that malloc() huge amounts of memory "just-in-case"
+and don't use much of it.
+
+The default value is 0.
+
+See Documentation/vm/overcommit-accounting.rst and
+mm/util.c::__vm_enough_memory() for more information.
+
+
+overcommit_ratio
+================
+
+When overcommit_memory is set to 2, the committed address
+space is not permitted to exceed swap plus this percentage
+of physical RAM.  See above.
+
+
+page-cluster
+============
+
+page-cluster controls the number of pages up to which consecutive pages
+are read in from swap in a single attempt. This is the swap counterpart
+to page cache readahead.
+The mentioned consecutivity is not in terms of virtual/physical addresses,
+but consecutive on swap space - that means they were swapped out together.
+
+It is a logarithmic value - setting it to zero means "1 page", setting
+it to 1 means "2 pages", setting it to 2 means "4 pages", etc.
+Zero disables swap readahead completely.
+
+The default value is three (eight pages at a time).  There may be some
+small benefits in tuning this to a different value if your workload is
+swap-intensive.
+
+Lower values mean lower latencies for initial faults, but at the same time
+extra faults and I/O delays for following faults if they would have been part of
+that consecutive pages readahead would have brought in.
+
+
+panic_on_oom
+============
+
+This enables or disables panic on out-of-memory feature.
+
+If this is set to 0, the kernel will kill some rogue process,
+called oom_killer.  Usually, oom_killer can kill rogue processes and
+system will survive.
+
+If this is set to 1, the kernel panics when out-of-memory happens.
+However, if a process limits using nodes by mempolicy/cpusets,
+and those nodes become memory exhaustion status, one process
+may be killed by oom-killer. No panic occurs in this case.
+Because other nodes' memory may be free. This means system total status
+may be not fatal yet.
+
+If this is set to 2, the kernel panics compulsorily even on the
+above-mentioned. Even oom happens under memory cgroup, the whole
+system panics.
+
+The default value is 0.
+
+1 and 2 are for failover of clustering. Please select either
+according to your policy of failover.
+
+panic_on_oom=2+kdump gives you very strong tool to investigate
+why oom happens. You can get snapshot.
+
+
+percpu_pagelist_fraction
+========================
+
+This is the fraction of pages at most (high mark pcp->high) in each zone that
+are allocated for each per cpu page list.  The min value for this is 8.  It
+means that we don't allow more than 1/8th of pages in each zone to be
+allocated in any single per_cpu_pagelist.  This entry only changes the value
+of hot per cpu pagelists.  User can specify a number like 100 to allocate
+1/100th of each zone to each per cpu page list.
+
+The batch value of each per cpu pagelist is also updated as a result.  It is
+set to pcp->high/4.  The upper limit of batch is (PAGE_SHIFT * 8)
+
+The initial value is zero.  Kernel does not use this value at boot time to set
+the high water marks for each per cpu page list.  If the user writes '0' to this
+sysctl, it will revert to this default behavior.
+
+
+stat_interval
+=============
+
+The time interval between which vm statistics are updated.  The default
+is 1 second.
+
+
+stat_refresh
+============
+
+Any read or write (by root only) flushes all the per-cpu vm statistics
+into their global totals, for more accurate reports when testing
+e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
+
+As a side-effect, it also checks for negative totals (elsewhere reported
+as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
+(At time of writing, a few stats are known sometimes to be found negative,
+with no ill effects: errors and warnings on these stats are suppressed.)
+
+
+numa_stat
+=========
+
+This interface allows runtime configuration of numa statistics.
+
+When page allocation performance becomes a bottleneck and you can tolerate
+some possible tool breakage and decreased numa counter precision, you can
+do::
+
+	echo 0 > /proc/sys/vm/numa_stat
+
+When page allocation performance is not a bottleneck and you want all
+tooling to work, you can do::
+
+	echo 1 > /proc/sys/vm/numa_stat
+
+
+swappiness
+==========
+
+This control is used to define how aggressive the kernel will swap
+memory pages.  Higher values will increase aggressiveness, lower values
+decrease the amount of swap.  A value of 0 instructs the kernel not to
+initiate swap until the amount of free and file-backed pages is less
+than the high water mark in a zone.
+
+The default value is 60.
+
+
+unprivileged_userfaultfd
+========================
+
+This flag controls whether unprivileged users can use the userfaultfd
+system calls.  Set this to 1 to allow unprivileged users to use the
+userfaultfd system calls, or set this to 0 to restrict userfaultfd to only
+privileged users (with SYS_CAP_PTRACE capability).
+
+The default value is 1.
+
+
+user_reserve_kbytes
+===================
+
+When overcommit_memory is set to 2, "never overcommit" mode, reserve
+min(3% of current process size, user_reserve_kbytes) of free memory.
+This is intended to prevent a user from starting a single memory hogging
+process, such that they cannot recover (kill the hog).
+
+user_reserve_kbytes defaults to min(3% of the current process size, 128MB).
+
+If this is reduced to zero, then the user will be allowed to allocate
+all free memory with a single process, minus admin_reserve_kbytes.
+Any subsequent attempts to execute a command will result in
+"fork: Cannot allocate memory".
+
+Changing this takes effect whenever an application requests memory.
+
+
+vfs_cache_pressure
+==================
+
+This percentage value controls the tendency of the kernel to reclaim
+the memory which is used for caching of directory and inode objects.
+
+At the default value of vfs_cache_pressure=100 the kernel will attempt to
+reclaim dentries and inodes at a "fair" rate with respect to pagecache and
+swapcache reclaim.  Decreasing vfs_cache_pressure causes the kernel to prefer
+to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will
+never reclaim dentries and inodes due to memory pressure and this can easily
+lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
+causes the kernel to prefer to reclaim dentries and inodes.
+
+Increasing vfs_cache_pressure significantly beyond 100 may have negative
+performance impact. Reclaim code needs to take various locks to find freeable
+directory and inode objects. With vfs_cache_pressure=1000, it will look for
+ten times more freeable objects than there are.
+
+
+watermark_boost_factor
+======================
+
+This factor controls the level of reclaim when memory is being fragmented.
+It defines the percentage of the high watermark of a zone that will be
+reclaimed if pages of different mobility are being mixed within pageblocks.
+The intent is that compaction has less work to do in the future and to
+increase the success rate of future high-order allocations such as SLUB
+allocations, THP and hugetlbfs pages.
+
+To make it sensible with respect to the watermark_scale_factor
+parameter, the unit is in fractions of 10,000. The default value of
+15,000 on !DISCONTIGMEM configurations means that up to 150% of the high
+watermark will be reclaimed in the event of a pageblock being mixed due
+to fragmentation. The level of reclaim is determined by the number of
+fragmentation events that occurred in the recent past. If this value is
+smaller than a pageblock then a pageblocks worth of pages will be reclaimed
+(e.g.  2MB on 64-bit x86). A boost factor of 0 will disable the feature.
+
+
+watermark_scale_factor
+======================
+
+This factor controls the aggressiveness of kswapd. It defines the
+amount of memory left in a node/system before kswapd is woken up and
+how much memory needs to be free before kswapd goes back to sleep.
+
+The unit is in fractions of 10,000. The default value of 10 means the
+distances between watermarks are 0.1% of the available memory in the
+node/system. The maximum value is 1000, or 10% of memory.
+
+A high rate of threads entering direct reclaim (allocstall) or kswapd
+going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
+that the number of free pages kswapd maintains for latency reasons is
+too small for the allocation bursts occurring in the system. This knob
+can then be used to tune kswapd aggressiveness accordingly.
+
+
+zone_reclaim_mode
+=================
+
+Zone_reclaim_mode allows someone to set more or less aggressive approaches to
+reclaim memory when a zone runs out of memory. If it is set to zero then no
+zone reclaim occurs. Allocations will be satisfied from other zones / nodes
+in the system.
+
+This is value OR'ed together of
+
+=	===================================
+1	Zone reclaim on
+2	Zone reclaim writes dirty pages out
+4	Zone reclaim swaps pages
+=	===================================
+
+zone_reclaim_mode is disabled by default.  For file servers or workloads
+that benefit from having their data cached, zone_reclaim_mode should be
+left disabled as the caching effect is likely to be more important than
+data locality.
+
+zone_reclaim may be enabled if it's known that the workload is partitioned
+such that each partition fits within a NUMA node and that accessing remote
+memory would cause a measurable performance reduction.  The page allocator
+will then reclaim easily reusable pages (those page cache pages that are
+currently not used) before allocating off node pages.
+
+Allowing zone reclaim to write out pages stops processes that are
+writing large amounts of data from dirtying pages on other nodes. Zone
+reclaim will write out dirty pages if a zone fills up and so effectively
+throttle the process. This may decrease the performance of a single process
+since it cannot use all of system memory to buffer the outgoing writes
+anymore but it preserve the memory on other nodes so that the performance
+of other processes running on other nodes will not be affected.
+
+Allowing regular swap effectively restricts allocations to the local
+node unless explicitly overridden by memory policies or cpuset
+configurations.
diff --git a/Documentation/video-output.txt b/Documentation/admin-guide/video-output.rst
index 56d6fa2e2368..56d6fa2e2368 100644
--- a/Documentation/video-output.txt
+++ b/Documentation/admin-guide/video-output.rst
diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst
new file mode 100644
index 000000000000..e76665a8f2f2
--- /dev/null
+++ b/Documentation/admin-guide/xfs.rst
@@ -0,0 +1,466 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+The SGI XFS Filesystem
+======================
+
+XFS is a high performance journaling filesystem which originated
+on the SGI IRIX platform.  It is completely multi-threaded, can
+support large files and large filesystems, extended attributes,
+variable block sizes, is extent based, and makes extensive use of
+Btrees (directories, extents, free space) to aid both performance
+and scalability.
+
+Refer to the documentation at https://xfs.wiki.kernel.org/
+for further details.  This implementation is on-disk compatible
+with the IRIX version of XFS.
+
+
+Mount Options
+=============
+
+When mounting an XFS filesystem, the following options are accepted.
+
+  allocsize=size
+	Sets the buffered I/O end-of-file preallocation size when
+	doing delayed allocation writeout (default size is 64KiB).
+	Valid values for this option are page size (typically 4KiB)
+	through to 1GiB, inclusive, in power-of-2 increments.
+
+	The default behaviour is for dynamic end-of-file
+	preallocation size, which uses a set of heuristics to
+	optimise the preallocation size based on the current
+	allocation patterns within the file and the access patterns
+	to the file. Specifying a fixed ``allocsize`` value turns off
+	the dynamic behaviour.
+
+  attr2 or noattr2
+	The options enable/disable an "opportunistic" improvement to
+	be made in the way inline extended attributes are stored
+	on-disk.  When the new form is used for the first time when
+	``attr2`` is selected (either when setting or removing extended
+	attributes) the on-disk superblock feature bit field will be
+	updated to reflect this format being in use.
+
+	The default behaviour is determined by the on-disk feature
+	bit indicating that ``attr2`` behaviour is active. If either
+	mount option is set, then that becomes the new default used
+	by the filesystem.
+
+	CRC enabled filesystems always use the ``attr2`` format, and so
+	will reject the ``noattr2`` mount option if it is set.
+
+  discard or nodiscard (default)
+	Enable/disable the issuing of commands to let the block
+	device reclaim space freed by the filesystem.  This is
+	useful for SSD devices, thinly provisioned LUNs and virtual
+	machine images, but may have a performance impact.
+
+	Note: It is currently recommended that you use the ``fstrim``
+	application to ``discard`` unused blocks rather than the ``discard``
+	mount option because the performance impact of this option
+	is quite severe.
+
+  grpid/bsdgroups or nogrpid/sysvgroups (default)
+	These options define what group ID a newly created file
+	gets.  When ``grpid`` is set, it takes the group ID of the
+	directory in which it is created; otherwise it takes the
+	``fsgid`` of the current process, unless the directory has the
+	``setgid`` bit set, in which case it takes the ``gid`` from the
+	parent directory, and also gets the ``setgid`` bit set if it is
+	a directory itself.
+
+  filestreams
+	Make the data allocator use the filestreams allocation mode
+	across the entire filesystem rather than just on directories
+	configured to use it.
+
+  ikeep or noikeep (default)
+	When ``ikeep`` is specified, XFS does not delete empty inode
+	clusters and keeps them around on disk.  When ``noikeep`` is
+	specified, empty inode clusters are returned to the free
+	space pool.
+
+  inode32 or inode64 (default)
+	When ``inode32`` is specified, it indicates that XFS limits
+	inode creation to locations which will not result in inode
+	numbers with more than 32 bits of significance.
+
+	When ``inode64`` is specified, it indicates that XFS is allowed
+	to create inodes at any location in the filesystem,
+	including those which will result in inode numbers occupying
+	more than 32 bits of significance.
+
+	``inode32`` is provided for backwards compatibility with older
+	systems and applications, since 64 bits inode numbers might
+	cause problems for some applications that cannot handle
+	large inode numbers.  If applications are in use which do
+	not handle inode numbers bigger than 32 bits, the ``inode32``
+	option should be specified.
+
+  largeio or nolargeio (default)
+	If ``nolargeio`` is specified, the optimal I/O reported in
+	``st_blksize`` by **stat(2)** will be as small as possible to allow
+	user applications to avoid inefficient read/modify/write
+	I/O.  This is typically the page size of the machine, as
+	this is the granularity of the page cache.
+
+	If ``largeio`` is specified, a filesystem that was created with a
+	``swidth`` specified will return the ``swidth`` value (in bytes)
+	in ``st_blksize``. If the filesystem does not have a ``swidth``
+	specified but does specify an ``allocsize`` then ``allocsize``
+	(in bytes) will be returned instead. Otherwise the behaviour
+	is the same as if ``nolargeio`` was specified.
+
+  logbufs=value
+	Set the number of in-memory log buffers.  Valid numbers
+	range from 2-8 inclusive.
+
+	The default value is 8 buffers.
+
+	If the memory cost of 8 log buffers is too high on small
+	systems, then it may be reduced at some cost to performance
+	on metadata intensive workloads. The ``logbsize`` option below
+	controls the size of each buffer and so is also relevant to
+	this case.
+
+  logbsize=value
+	Set the size of each in-memory log buffer.  The size may be
+	specified in bytes, or in kilobytes with a "k" suffix.
+	Valid sizes for version 1 and version 2 logs are 16384 (16k)
+	and 32768 (32k).  Valid sizes for version 2 logs also
+	include 65536 (64k), 131072 (128k) and 262144 (256k). The
+	logbsize must be an integer multiple of the log
+	stripe unit configured at **mkfs(8)** time.
+
+	The default value for for version 1 logs is 32768, while the
+	default value for version 2 logs is MAX(32768, log_sunit).
+
+  logdev=device and rtdev=device
+	Use an external log (metadata journal) and/or real-time device.
+	An XFS filesystem has up to three parts: a data section, a log
+	section, and a real-time section.  The real-time section is
+	optional, and the log section can be separate from the data
+	section or contained within it.
+
+  noalign
+	Data allocations will not be aligned at stripe unit
+	boundaries. This is only relevant to filesystems created
+	with non-zero data alignment parameters (``sunit``, ``swidth``) by
+	**mkfs(8)**.
+
+  norecovery
+	The filesystem will be mounted without running log recovery.
+	If the filesystem was not cleanly unmounted, it is likely to
+	be inconsistent when mounted in ``norecovery`` mode.
+	Some files or directories may not be accessible because of this.
+	Filesystems mounted ``norecovery`` must be mounted read-only or
+	the mount will fail.
+
+  nouuid
+	Don't check for double mounted file systems using the file
+	system ``uuid``.  This is useful to mount LVM snapshot volumes,
+	and often used in combination with ``norecovery`` for mounting
+	read-only snapshots.
+
+  noquota
+	Forcibly turns off all quota accounting and enforcement
+	within the filesystem.
+
+  uquota/usrquota/uqnoenforce/quota
+	User disk quota accounting enabled, and limits (optionally)
+	enforced.  Refer to **xfs_quota(8)** for further details.
+
+  gquota/grpquota/gqnoenforce
+	Group disk quota accounting enabled and limits (optionally)
+	enforced.  Refer to **xfs_quota(8)** for further details.
+
+  pquota/prjquota/pqnoenforce
+	Project disk quota accounting enabled and limits (optionally)
+	enforced.  Refer to **xfs_quota(8)** for further details.
+
+  sunit=value and swidth=value
+	Used to specify the stripe unit and width for a RAID device
+	or a stripe volume.  "value" must be specified in 512-byte
+	block units. These options are only relevant to filesystems
+	that were created with non-zero data alignment parameters.
+
+	The ``sunit`` and ``swidth`` parameters specified must be compatible
+	with the existing filesystem alignment characteristics.  In
+	general, that means the only valid changes to ``sunit`` are
+	increasing it by a power-of-2 multiple. Valid ``swidth`` values
+	are any integer multiple of a valid ``sunit`` value.
+
+	Typically the only time these mount options are necessary if
+	after an underlying RAID device has had it's geometry
+	modified, such as adding a new disk to a RAID5 lun and
+	reshaping it.
+
+  swalloc
+	Data allocations will be rounded up to stripe width boundaries
+	when the current end of file is being extended and the file
+	size is larger than the stripe width size.
+
+  wsync
+	When specified, all filesystem namespace operations are
+	executed synchronously. This ensures that when the namespace
+	operation (create, unlink, etc) completes, the change to the
+	namespace is on stable storage. This is useful in HA setups
+	where failover must not result in clients seeing
+	inconsistent namespace presentation during or after a
+	failover event.
+
+
+Deprecated Mount Options
+========================
+
+===========================     ================
+  Name				Removal Schedule
+===========================     ================
+===========================     ================
+
+
+Removed Mount Options
+=====================
+
+===========================     =======
+  Name				Removed
+===========================	=======
+  delaylog/nodelaylog		v4.0
+  ihashsize			v4.0
+  irixsgid			v4.0
+  osyncisdsync/osyncisosync	v4.0
+  barrier			v4.19
+  nobarrier			v4.19
+===========================     =======
+
+sysctls
+=======
+
+The following sysctls are available for the XFS filesystem:
+
+  fs.xfs.stats_clear		(Min: 0  Default: 0  Max: 1)
+	Setting this to "1" clears accumulated XFS statistics
+	in /proc/fs/xfs/stat.  It then immediately resets to "0".
+
+  fs.xfs.xfssyncd_centisecs	(Min: 100  Default: 3000  Max: 720000)
+	The interval at which the filesystem flushes metadata
+	out to disk and runs internal cache cleanup routines.
+
+  fs.xfs.filestream_centisecs	(Min: 1  Default: 3000  Max: 360000)
+	The interval at which the filesystem ages filestreams cache
+	references and returns timed-out AGs back to the free stream
+	pool.
+
+  fs.xfs.speculative_prealloc_lifetime
+		(Units: seconds   Min: 1  Default: 300  Max: 86400)
+	The interval at which the background scanning for inodes
+	with unused speculative preallocation runs. The scan
+	removes unused preallocation from clean inodes and releases
+	the unused space back to the free pool.
+
+  fs.xfs.error_level		(Min: 0  Default: 3  Max: 11)
+	A volume knob for error reporting when internal errors occur.
+	This will generate detailed messages & backtraces for filesystem
+	shutdowns, for example.  Current threshold values are:
+
+		XFS_ERRLEVEL_OFF:       0
+		XFS_ERRLEVEL_LOW:       1
+		XFS_ERRLEVEL_HIGH:      5
+
+  fs.xfs.panic_mask		(Min: 0  Default: 0  Max: 256)
+	Causes certain error conditions to call BUG(). Value is a bitmask;
+	OR together the tags which represent errors which should cause panics:
+
+		XFS_NO_PTAG                     0
+		XFS_PTAG_IFLUSH                 0x00000001
+		XFS_PTAG_LOGRES                 0x00000002
+		XFS_PTAG_AILDELETE              0x00000004
+		XFS_PTAG_ERROR_REPORT           0x00000008
+		XFS_PTAG_SHUTDOWN_CORRUPT       0x00000010
+		XFS_PTAG_SHUTDOWN_IOERROR       0x00000020
+		XFS_PTAG_SHUTDOWN_LOGERROR      0x00000040
+		XFS_PTAG_FSBLOCK_ZERO           0x00000080
+		XFS_PTAG_VERIFIER_ERROR         0x00000100
+
+	This option is intended for debugging only.
+
+  fs.xfs.irix_symlink_mode	(Min: 0  Default: 0  Max: 1)
+	Controls whether symlinks are created with mode 0777 (default)
+	or whether their mode is affected by the umask (irix mode).
+
+  fs.xfs.irix_sgid_inherit	(Min: 0  Default: 0  Max: 1)
+	Controls files created in SGID directories.
+	If the group ID of the new file does not match the effective group
+	ID or one of the supplementary group IDs of the parent dir, the
+	ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl
+	is set.
+
+  fs.xfs.inherit_sync		(Min: 0  Default: 1  Max: 1)
+	Setting this to "1" will cause the "sync" flag set
+	by the **xfs_io(8)** chattr command on a directory to be
+	inherited by files in that directory.
+
+  fs.xfs.inherit_nodump		(Min: 0  Default: 1  Max: 1)
+	Setting this to "1" will cause the "nodump" flag set
+	by the **xfs_io(8)** chattr command on a directory to be
+	inherited by files in that directory.
+
+  fs.xfs.inherit_noatime	(Min: 0  Default: 1  Max: 1)
+	Setting this to "1" will cause the "noatime" flag set
+	by the **xfs_io(8)** chattr command on a directory to be
+	inherited by files in that directory.
+
+  fs.xfs.inherit_nosymlinks	(Min: 0  Default: 1  Max: 1)
+	Setting this to "1" will cause the "nosymlinks" flag set
+	by the **xfs_io(8)** chattr command on a directory to be
+	inherited by files in that directory.
+
+  fs.xfs.inherit_nodefrag	(Min: 0  Default: 1  Max: 1)
+	Setting this to "1" will cause the "nodefrag" flag set
+	by the **xfs_io(8)** chattr command on a directory to be
+	inherited by files in that directory.
+
+  fs.xfs.rotorstep		(Min: 1  Default: 1  Max: 256)
+	In "inode32" allocation mode, this option determines how many
+	files the allocator attempts to allocate in the same allocation
+	group before moving to the next allocation group.  The intent
+	is to control the rate at which the allocator moves between
+	allocation groups when allocating extents for new files.
+
+Deprecated Sysctls
+==================
+
+None at present.
+
+
+Removed Sysctls
+===============
+
+  Name				Removed
+  ----				-------
+  fs.xfs.xfsbufd_centisec	v4.0
+  fs.xfs.age_buffer_centisecs	v4.0
+
+
+Error handling
+==============
+
+XFS can act differently according to the type of error found during its
+operation. The implementation introduces the following concepts to the error
+handler:
+
+ -failure speed:
+	Defines how fast XFS should propagate an error upwards when a specific
+	error is found during the filesystem operation. It can propagate
+	immediately, after a defined number of retries, after a set time period,
+	or simply retry forever.
+
+ -error classes:
+	Specifies the subsystem the error configuration will apply to, such as
+	metadata IO or memory allocation. Different subsystems will have
+	different error handlers for which behaviour can be configured.
+
+ -error handlers:
+	Defines the behavior for a specific error.
+
+The filesystem behavior during an error can be set via ``sysfs`` files. Each
+error handler works independently - the first condition met by an error handler
+for a specific class will cause the error to be propagated rather than reset and
+retried.
+
+The action taken by the filesystem when the error is propagated is context
+dependent - it may cause a shut down in the case of an unrecoverable error,
+it may be reported back to userspace, or it may even be ignored because
+there's nothing useful we can with the error or anyone we can report it to (e.g.
+during unmount).
+
+The configuration files are organized into the following hierarchy for each
+mounted filesystem:
+
+  /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+Where:
+  <dev>
+	The short device name of the mounted filesystem. This is the same device
+	name that shows up in XFS kernel error messages as "XFS(<dev>): ..."
+
+  <class>
+	The subsystem the error configuration belongs to. As of 4.9, the defined
+	classes are:
+
+		- "metadata": applies metadata buffer write IO
+
+  <error>
+	The individual error handler configurations.
+
+
+Each filesystem has "global" error configuration options defined in their top
+level directory:
+
+  /sys/fs/xfs/<dev>/error/
+
+  fail_at_unmount		(Min:  0  Default:  1  Max: 1)
+	Defines the filesystem error behavior at unmount time.
+
+	If set to a value of 1, XFS will override all other error configurations
+	during unmount and replace them with "immediate fail" characteristics.
+	i.e. no retries, no retry timeout. This will always allow unmount to
+	succeed when there are persistent errors present.
+
+	If set to 0, the configured retry behaviour will continue until all
+	retries and/or timeouts have been exhausted. This will delay unmount
+	completion when there are persistent errors, and it may prevent the
+	filesystem from ever unmounting fully in the case of "retry forever"
+	handler configurations.
+
+	Note: there is no guarantee that fail_at_unmount can be set while an
+	unmount is in progress. It is possible that the ``sysfs`` entries are
+	removed by the unmounting filesystem before a "retry forever" error
+	handler configuration causes unmount to hang, and hence the filesystem
+	must be configured appropriately before unmount begins to prevent
+	unmount hangs.
+
+Each filesystem has specific error class handlers that define the error
+propagation behaviour for specific errors. There is also a "default" error
+handler defined, which defines the behaviour for all errors that don't have
+specific handlers defined. Where multiple retry constraints are configured for
+a single error, the first retry configuration that expires will cause the error
+to be propagated. The handler configurations are found in the directory:
+
+  /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+  max_retries			(Min: -1  Default: Varies  Max: INTMAX)
+	Defines the allowed number of retries of a specific error before
+	the filesystem will propagate the error. The retry count for a given
+	error context (e.g. a specific metadata buffer) is reset every time
+	there is a successful completion of the operation.
+
+	Setting the value to "-1" will cause XFS to retry forever for this
+	specific error.
+
+	Setting the value to "0" will cause XFS to fail immediately when the
+	specific error is reported.
+
+	Setting the value to "N" (where 0 < N < Max) will make XFS retry the
+	operation "N" times before propagating the error.
+
+  retry_timeout_seconds		(Min:  -1  Default:  Varies  Max: 1 day)
+	Define the amount of time (in seconds) that the filesystem is
+	allowed to retry its operations when the specific error is
+	found.
+
+	Setting the value to "-1" will allow XFS to retry forever for this
+	specific error.
+
+	Setting the value to "0" will cause XFS to fail immediately when the
+	specific error is reported.
+
+	Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the
+	operation for up to "N" seconds before propagating the error.
+
+**Note:** The default behaviour for a specific error handler is dependent on both
+the class and error context. For example, the default values for
+"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults
+to "fail immediately" behaviour. This is done because ENODEV is a fatal,
+unrecoverable error no matter how many times the metadata IO is retried.
diff --git a/Documentation/aoe/aoe.txt b/Documentation/aoe/aoe.txt
deleted file mode 100644
index c71487d399d1..000000000000
--- a/Documentation/aoe/aoe.txt
+++ /dev/null
@@ -1,143 +0,0 @@
-ATA over Ethernet is a network protocol that provides simple access to
-block storage on the LAN.
-
-  http://support.coraid.com/documents/AoEr11.txt
-
-The EtherDrive (R) HOWTO for 2.6 and 3.x kernels is found at ...
-
-  http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO.html
-
-It has many tips and hints!  Please see, especially, recommended
-tunings for virtual memory:
-
-  http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO-5.html#ss5.19
-
-The aoetools are userland programs that are designed to work with this
-driver.  The aoetools are on sourceforge.
-
-  http://aoetools.sourceforge.net/
-
-The scripts in this Documentation/aoe directory are intended to
-document the use of the driver and are not necessary if you install
-the aoetools.
-
-
-CREATING DEVICE NODES
-
-  Users of udev should find the block device nodes created
-  automatically, but to create all the necessary device nodes, use the
-  udev configuration rules provided in udev.txt (in this directory).
-
-  There is a udev-install.sh script that shows how to install these
-  rules on your system.
-
-  There is also an autoload script that shows how to edit
-  /etc/modprobe.d/aoe.conf to ensure that the aoe module is loaded when
-  necessary.  Preloading the aoe module is preferable to autoloading,
-  however, because AoE discovery takes a few seconds.  It can be
-  confusing when an AoE device is not present the first time the a
-  command is run but appears a second later.
-
-USING DEVICE NODES
-
-  "cat /dev/etherd/err" blocks, waiting for error diagnostic output,
-  like any retransmitted packets.
-
-  "echo eth2 eth4 > /dev/etherd/interfaces" tells the aoe driver to
-  limit ATA over Ethernet traffic to eth2 and eth4.  AoE traffic from
-  untrusted networks should be ignored as a matter of security.  See
-  also the aoe_iflist driver option described below.
-
-  "echo > /dev/etherd/discover" tells the driver to find out what AoE
-  devices are available.
-
-  In the future these character devices may disappear and be replaced
-  by sysfs counterparts.  Using the commands in aoetools insulates
-  users from these implementation details.
-
-  The block devices are named like this:
-
-	e{shelf}.{slot}
-	e{shelf}.{slot}p{part}
-
-  ... so that "e0.2" is the third blade from the left (slot 2) in the
-  first shelf (shelf address zero).  That's the whole disk.  The first
-  partition on that disk would be "e0.2p1".
-
-USING SYSFS
-
-  Each aoe block device in /sys/block has the extra attributes of
-  state, mac, and netif.  The state attribute is "up" when the device
-  is ready for I/O and "down" if detected but unusable.  The
-  "down,closewait" state shows that the device is still open and
-  cannot come up again until it has been closed.
-
-  The mac attribute is the ethernet address of the remote AoE device.
-  The netif attribute is the network interface on the localhost
-  through which we are communicating with the remote AoE device.
-
-  There is a script in this directory that formats this information in
-  a convenient way.  Users with aoetools should use the aoe-stat
-  command.
-
-  root@makki root# sh Documentation/aoe/status.sh 
-     e10.0            eth3              up
-     e10.1            eth3              up
-     e10.2            eth3              up
-     e10.3            eth3              up
-     e10.4            eth3              up
-     e10.5            eth3              up
-     e10.6            eth3              up
-     e10.7            eth3              up
-     e10.8            eth3              up
-     e10.9            eth3              up
-      e4.0            eth1              up
-      e4.1            eth1              up
-      e4.2            eth1              up
-      e4.3            eth1              up
-      e4.4            eth1              up
-      e4.5            eth1              up
-      e4.6            eth1              up
-      e4.7            eth1              up
-      e4.8            eth1              up
-      e4.9            eth1              up
-
-  Use /sys/module/aoe/parameters/aoe_iflist (or better, the driver
-  option discussed below) instead of /dev/etherd/interfaces to limit
-  AoE traffic to the network interfaces in the given
-  whitespace-separated list.  Unlike the old character device, the
-  sysfs entry can be read from as well as written to.
-
-  It's helpful to trigger discovery after setting the list of allowed
-  interfaces.  The aoetools package provides an aoe-discover script
-  for this purpose.  You can also directly use the
-  /dev/etherd/discover special file described above.
-
-DRIVER OPTIONS
-
-  There is a boot option for the built-in aoe driver and a
-  corresponding module parameter, aoe_iflist.  Without this option,
-  all network interfaces may be used for ATA over Ethernet.  Here is a
-  usage example for the module parameter.
-
-    modprobe aoe_iflist="eth1 eth3"
-
-  The aoe_deadsecs module parameter determines the maximum number of
-  seconds that the driver will wait for an AoE device to provide a
-  response to an AoE command.  After aoe_deadsecs seconds have
-  elapsed, the AoE device will be marked as "down".  A value of zero
-  is supported for testing purposes and makes the aoe driver keep
-  trying AoE commands forever.
-
-  The aoe_maxout module parameter has a default of 128.  This is the
-  maximum number of unresponded packets that will be sent to an AoE
-  target at one time.
-
-  The aoe_dyndevs module parameter defaults to 1, meaning that the
-  driver will assign a block device minor number to a discovered AoE
-  target based on the order of its discovery.  With dynamic minor
-  device numbers in use, a greater range of AoE shelf and slot
-  addresses can be supported.  Users with udev will never have to
-  think about minor numbers.  Using aoe_dyndevs=0 allows device nodes
-  to be pre-created using a static minor-number scheme with the
-  aoe-mkshelf script in the aoetools.
diff --git a/Documentation/aoe/todo.txt b/Documentation/aoe/todo.txt
deleted file mode 100644
index c09dfad4aed8..000000000000
--- a/Documentation/aoe/todo.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-There is a potential for deadlock when allocating a struct sk_buff for
-data that needs to be written out to aoe storage.  If the data is
-being written from a dirty page in order to free that page, and if
-there are no other pages available, then deadlock may occur when a
-free page is needed for the sk_buff allocation.  This situation has
-not been observed, but it would be nice to eliminate any potential for
-deadlock under memory pressure.
-
-Because ATA over Ethernet is not fragmented by the kernel's IP code,
-the destructor member of the struct sk_buff is available to the aoe
-driver.  By using a mempool for allocating all but the first few
-sk_buffs, and by registering a destructor, we should be able to
-efficiently allocate sk_buffs without introducing any potential for
-deadlock.
diff --git a/Documentation/arm/Booting b/Documentation/arm/Booting
deleted file mode 100644
index f1f965ce93d6..000000000000
--- a/Documentation/arm/Booting
+++ /dev/null
@@ -1,218 +0,0 @@
-			Booting ARM Linux
-			=================
-
-Author:	Russell King
-Date  : 18 May 2002
-
-The following documentation is relevant to 2.4.18-rmk6 and beyond.
-
-In order to boot ARM Linux, you require a boot loader, which is a small
-program that runs before the main kernel.  The boot loader is expected
-to initialise various devices, and eventually call the Linux kernel,
-passing information to the kernel.
-
-Essentially, the boot loader should provide (as a minimum) the
-following:
-
-1. Setup and initialise the RAM.
-2. Initialise one serial port.
-3. Detect the machine type.
-4. Setup the kernel tagged list.
-5. Load initramfs.
-6. Call the kernel image.
-
-
-1. Setup and initialise RAM
----------------------------
-
-Existing boot loaders:		MANDATORY
-New boot loaders:		MANDATORY
-
-The boot loader is expected to find and initialise all RAM that the
-kernel will use for volatile data storage in the system.  It performs
-this in a machine dependent manner.  (It may use internal algorithms
-to automatically locate and size all RAM, or it may use knowledge of
-the RAM in the machine, or any other method the boot loader designer
-sees fit.)
-
-
-2. Initialise one serial port
------------------------------
-
-Existing boot loaders:		OPTIONAL, RECOMMENDED
-New boot loaders:		OPTIONAL, RECOMMENDED
-
-The boot loader should initialise and enable one serial port on the
-target.  This allows the kernel serial driver to automatically detect
-which serial port it should use for the kernel console (generally
-used for debugging purposes, or communication with the target.)
-
-As an alternative, the boot loader can pass the relevant 'console='
-option to the kernel via the tagged lists specifying the port, and
-serial format options as described in
-
-       Documentation/admin-guide/kernel-parameters.rst.
-
-
-3. Detect the machine type
---------------------------
-
-Existing boot loaders:		OPTIONAL
-New boot loaders:		MANDATORY except for DT-only platforms
-
-The boot loader should detect the machine type its running on by some
-method.  Whether this is a hard coded value or some algorithm that
-looks at the connected hardware is beyond the scope of this document.
-The boot loader must ultimately be able to provide a MACH_TYPE_xxx
-value to the kernel. (see linux/arch/arm/tools/mach-types).  This
-should be passed to the kernel in register r1.
-
-For DT-only platforms, the machine type will be determined by device
-tree.  set the machine type to all ones (~0).  This is not strictly
-necessary, but assures that it will not match any existing types.
-
-4. Setup boot data
-------------------
-
-Existing boot loaders:		OPTIONAL, HIGHLY RECOMMENDED
-New boot loaders:		MANDATORY
-
-The boot loader must provide either a tagged list or a dtb image for
-passing configuration data to the kernel.  The physical address of the
-boot data is passed to the kernel in register r2.
-
-4a. Setup the kernel tagged list
---------------------------------
-
-The boot loader must create and initialise the kernel tagged list.
-A valid tagged list starts with ATAG_CORE and ends with ATAG_NONE.
-The ATAG_CORE tag may or may not be empty.  An empty ATAG_CORE tag
-has the size field set to '2' (0x00000002).  The ATAG_NONE must set
-the size field to zero.
-
-Any number of tags can be placed in the list.  It is undefined
-whether a repeated tag appends to the information carried by the
-previous tag, or whether it replaces the information in its
-entirety; some tags behave as the former, others the latter.
-
-The boot loader must pass at a minimum the size and location of
-the system memory, and root filesystem location.  Therefore, the
-minimum tagged list should look:
-
-	+-----------+
-base ->	| ATAG_CORE |  |
-	+-----------+  |
-	| ATAG_MEM  |  | increasing address
-	+-----------+  |
-	| ATAG_NONE |  |
-	+-----------+  v
-
-The tagged list should be stored in system RAM.
-
-The tagged list must be placed in a region of memory where neither
-the kernel decompressor nor initrd 'bootp' program will overwrite
-it.  The recommended placement is in the first 16KiB of RAM.
-
-4b. Setup the device tree
--------------------------
-
-The boot loader must load a device tree image (dtb) into system ram
-at a 64bit aligned address and initialize it with the boot data.  The
-dtb format is documented in Documentation/devicetree/booting-without-of.txt.
-The kernel will look for the dtb magic value of 0xd00dfeed at the dtb
-physical address to determine if a dtb has been passed instead of a
-tagged list.
-
-The boot loader must pass at a minimum the size and location of the
-system memory, and the root filesystem location.  The dtb must be
-placed in a region of memory where the kernel decompressor will not
-overwrite it, while remaining within the region which will be covered
-by the kernel's low-memory mapping.
-
-A safe location is just above the 128MiB boundary from start of RAM.
-
-5. Load initramfs.
-------------------
-
-Existing boot loaders:		OPTIONAL
-New boot loaders:		OPTIONAL
-
-If an initramfs is in use then, as with the dtb, it must be placed in
-a region of memory where the kernel decompressor will not overwrite it
-while also with the region which will be covered by the kernel's
-low-memory mapping.
-
-A safe location is just above the device tree blob which itself will
-be loaded just above the 128MiB boundary from the start of RAM as
-recommended above.
-
-6. Calling the kernel image
----------------------------
-
-Existing boot loaders:		MANDATORY
-New boot loaders:		MANDATORY
-
-There are two options for calling the kernel zImage.  If the zImage
-is stored in flash, and is linked correctly to be run from flash,
-then it is legal for the boot loader to call the zImage in flash
-directly.
-
-The zImage may also be placed in system RAM and called there.  The
-kernel should be placed in the first 128MiB of RAM.  It is recommended
-that it is loaded above 32MiB in order to avoid the need to relocate
-prior to decompression, which will make the boot process slightly
-faster.
-
-When booting a raw (non-zImage) kernel the constraints are tighter.
-In this case the kernel must be loaded at an offset into system equal
-to TEXT_OFFSET - PAGE_OFFSET.
-
-In any case, the following conditions must be met:
-
-- Quiesce all DMA capable devices so that memory does not get
-  corrupted by bogus network packets or disk data. This will save
-  you many hours of debug.
-
-- CPU register settings
-  r0 = 0,
-  r1 = machine type number discovered in (3) above.
-  r2 = physical address of tagged list in system RAM, or
-       physical address of device tree block (dtb) in system RAM
-
-- CPU mode
-  All forms of interrupts must be disabled (IRQs and FIQs)
-
-  For CPUs which do not include the ARM virtualization extensions, the
-  CPU must be in SVC mode.  (A special exception exists for Angel)
-
-  CPUs which include support for the virtualization extensions can be
-  entered in HYP mode in order to enable the kernel to make full use of
-  these extensions.  This is the recommended boot method for such CPUs,
-  unless the virtualisations are already in use by a pre-installed
-  hypervisor.
-
-  If the kernel is not entered in HYP mode for any reason, it must be
-  entered in SVC mode.
-
-- Caches, MMUs
-  The MMU must be off.
-  Instruction cache may be on or off.
-  Data cache must be off.
-
-  If the kernel is entered in HYP mode, the above requirements apply to
-  the HYP mode configuration in addition to the ordinary PL1 (privileged
-  kernel modes) configuration.  In addition, all traps into the
-  hypervisor must be disabled, and PL1 access must be granted for all
-  peripherals and CPU resources for which this is architecturally
-  possible.  Except for entering in HYP mode, the system configuration
-  should be such that a kernel which does not include support for the
-  virtualization extensions can boot correctly without extra help.
-
-- The boot loader is expected to call the kernel image by jumping
-  directly to the first instruction of the kernel image.
-
-  On CPUs supporting the ARM instruction set, the entry must be
-  made in ARM state, even for a Thumb-2 kernel.
-
-  On CPUs supporting only the Thumb instruction set such as
-  Cortex-M class CPUs, the entry must be made in Thumb state.
diff --git a/Documentation/arm/IXP4xx b/Documentation/arm/IXP4xx
deleted file mode 100644
index e48b74de6ac0..000000000000
--- a/Documentation/arm/IXP4xx
+++ /dev/null
@@ -1,172 +0,0 @@
-
--------------------------------------------------------------------------
-Release Notes for Linux on Intel's IXP4xx Network Processor
-
-Maintained by Deepak Saxena <dsaxena@plexity.net>
--------------------------------------------------------------------------
-
-1. Overview
-
-Intel's IXP4xx network processor is a highly integrated SOC that
-is targeted for network applications, though it has become popular 
-in industrial control and other areas due to low cost and power
-consumption. The IXP4xx family currently consists of several processors
-that support different network offload functions such as encryption,
-routing, firewalling, etc. The IXP46x family is an updated version which
-supports faster speeds, new memory and flash configurations, and more
-integration such as an on-chip I2C controller.
-
-For more information on the various versions of the CPU, see:
-
-   http://developer.intel.com/design/network/products/npfamily/ixp4xx.htm
-
-Intel also made the IXCP1100 CPU for sometime which is an IXP4xx 
-stripped of much of the network intelligence.
-
-2. Linux Support
-
-Linux currently supports the following features on the IXP4xx chips:
-
-- Dual serial ports
-- PCI interface
-- Flash access (MTD/JFFS)
-- I2C through GPIO on IXP42x
-- GPIO for input/output/interrupts 
-  See arch/arm/mach-ixp4xx/include/mach/platform.h for access functions.
-- Timers (watchdog, OS)
-
-The following components of the chips are not supported by Linux and
-require the use of Intel's proprietary CSR software:
-
-- USB device interface
-- Network interfaces (HSS, Utopia, NPEs, etc)
-- Network offload functionality
-
-If you need to use any of the above, you need to download Intel's
-software from:
-
-   http://developer.intel.com/design/network/products/npfamily/ixp425.htm    
-
-DO NOT POST QUESTIONS TO THE LINUX MAILING LISTS REGARDING THE PROPRIETARY
-SOFTWARE.
-
-There are several websites that provide directions/pointers on using
-Intel's software:
-
-   http://sourceforge.net/projects/ixp4xx-osdg/
-   Open Source Developer's Guide for using uClinux and the Intel libraries 
-
-http://gatewaymaker.sourceforge.net/ 
-   Simple one page summary of building a gateway using an IXP425 and Linux
-
-http://ixp425.sourceforge.net/
-   ATM device driver for IXP425 that relies on Intel's libraries
-
-3. Known Issues/Limitations
-
-3a. Limited inbound PCI window
-
-The IXP4xx family allows for up to 256MB of memory but the PCI interface
-can only expose 64MB of that memory to the PCI bus. This means that if
-you are running with > 64MB, all PCI buffers outside of the accessible
-range will be bounced using the routines in arch/arm/common/dmabounce.c.
-   
-3b. Limited outbound PCI window
-
-IXP4xx provides two methods of accessing PCI memory space:
-
-1) A direct mapped window from 0x48000000 to 0x4bffffff (64MB).
-   To access PCI via this space, we simply ioremap() the BAR
-   into the kernel and we can use the standard read[bwl]/write[bwl]
-   macros. This is the preffered method due to speed but it
-   limits the system to just 64MB of PCI memory. This can be 
-   problamatic if using video cards and other memory-heavy devices.
-          
-2) If > 64MB of memory space is required, the IXP4xx can be 
-   configured to use indirect registers to access PCI This allows 
-   for up to 128MB (0x48000000 to 0x4fffffff) of memory on the bus. 
-   The disadvantage of this is that every PCI access requires 
-   three local register accesses plus a spinlock, but in some 
-   cases the performance hit is acceptable. In addition, you cannot 
-   mmap() PCI devices in this case due to the indirect nature
-   of the PCI window.
-
-By default, the direct method is used for performance reasons. If
-you need more PCI memory, enable the IXP4XX_INDIRECT_PCI config option.
-
-3c. GPIO as Interrupts
-
-Currently the code only handles level-sensitive GPIO interrupts 
-
-4. Supported platforms
-
-ADI Engineering Coyote Gateway Reference Platform
-http://www.adiengineering.com/productsCoyote.html
-
-   The ADI Coyote platform is reference design for those building 
-   small residential/office gateways. One NPE is connected to a 10/100
-   interface, one to 4-port 10/100 switch, and the third to and ADSL
-   interface. In addition, it also supports to POTs interfaces connected
-   via SLICs. Note that those are not supported by Linux ATM. Finally,
-   the platform has two mini-PCI slots used for 802.11[bga] cards.
-   Finally, there is an IDE port hanging off the expansion bus.
-
-Gateworks Avila Network Platform
-http://www.gateworks.com/support/overview.php
-
-   The Avila platform is basically and IXDP425 with the 4 PCI slots
-   replaced with mini-PCI slots and a CF IDE interface hanging off
-   the expansion bus.
-
-Intel IXDP425 Development Platform
-http://www.intel.com/design/network/products/npfamily/ixdpg425.htm  
-
-   This is Intel's standard reference platform for the IXDP425 and is 
-   also known as the Richfield board. It contains 4 PCI slots, 16MB
-   of flash, two 10/100 ports and one ADSL port.
-
-Intel IXDP465 Development Platform
-http://www.intel.com/design/network/products/npfamily/ixdp465.htm
-
-   This is basically an IXDP425 with an IXP465 and 32M of flash instead
-   of just 16.
-
-Intel IXDPG425 Development Platform
-
-   This is basically and ADI Coyote board with a NEC EHCI controller
-   added. One issue with this board is that the mini-PCI slots only
-   have the 3.3v line connected, so you can't use a PCI to mini-PCI
-   adapter with an E100 card. So to NFS root you need to use either
-   the CSR or a WiFi card and a ramdisk that BOOTPs and then does
-   a pivot_root to NFS.
-
-Motorola PrPMC1100 Processor Mezanine Card
-http://www.fountainsys.com
-
-   The PrPMC1100 is based on the IXCP1100 and is meant to plug into
-   and IXP2400/2800 system to act as the system controller. It simply
-   contains a CPU and 16MB of flash on the board and needs to be
-   plugged into a carrier board to function. Currently Linux only
-   supports the Motorola PrPMC carrier board for this platform.
-
-5. TODO LIST
-
-- Add support for Coyote IDE
-- Add support for edge-based GPIO interrupts
-- Add support for CF IDE on expansion bus
-
-6. Thanks
-
-The IXP4xx work has been funded by Intel Corp. and MontaVista Software, Inc.
-
-The following people have contributed patches/comments/etc:
-
-Lennerty Buytenhek
-Lutz Jaenicke
-Justin Mayfield
-Robert E. Ranslam
-[I know I've forgotten others, please email me to be added] 
-
--------------------------------------------------------------------------
-
-Last Update: 01/04/2005
diff --git a/Documentation/arm/Interrupts b/Documentation/arm/Interrupts
deleted file mode 100644
index f09ab1b90ef1..000000000000
--- a/Documentation/arm/Interrupts
+++ /dev/null
@@ -1,167 +0,0 @@
-2.5.2-rmk5
-----------
-
-This is the first kernel that contains a major shake up of some of the
-major architecture-specific subsystems.
-
-Firstly, it contains some pretty major changes to the way we handle the
-MMU TLB.  Each MMU TLB variant is now handled completely separately -
-we have TLB v3, TLB v4 (without write buffer), TLB v4 (with write buffer),
-and finally TLB v4 (with write buffer, with I TLB invalidate entry).
-There is more assembly code inside each of these functions, mainly to
-allow more flexible TLB handling for the future.
-
-Secondly, the IRQ subsystem.
-
-The 2.5 kernels will be having major changes to the way IRQs are handled.
-Unfortunately, this means that machine types that touch the irq_desc[]
-array (basically all machine types) will break, and this means every
-machine type that we currently have.
-
-Lets take an example.  On the Assabet with Neponset, we have:
-
-                  GPIO25                 IRR:2
-        SA1100 ------------> Neponset -----------> SA1111
-                                         IIR:1
-                                      -----------> USAR
-                                         IIR:0
-                                      -----------> SMC9196
-
-The way stuff currently works, all SA1111 interrupts are mutually
-exclusive of each other - if you're processing one interrupt from the
-SA1111 and another comes in, you have to wait for that interrupt to
-finish processing before you can service the new interrupt.  Eg, an
-IDE PIO-based interrupt on the SA1111 excludes all other SA1111 and
-SMC9196 interrupts until it has finished transferring its multi-sector
-data, which can be a long time.  Note also that since we loop in the
-SA1111 IRQ handler, SA1111 IRQs can hold off SMC9196 IRQs indefinitely.
-
-
-The new approach brings several new ideas...
-
-We introduce the concept of a "parent" and a "child".  For example,
-to the Neponset handler, the "parent" is GPIO25, and the "children"d
-are SA1111, SMC9196 and USAR.
-
-We also bring the idea of an IRQ "chip" (mainly to reduce the size of
-the irqdesc array).  This doesn't have to be a real "IC"; indeed the
-SA11x0 IRQs are handled by two separate "chip" structures, one for
-GPIO0-10, and another for all the rest.  It is just a container for
-the various operations (maybe this'll change to a better name).
-This structure has the following operations:
-
-struct irqchip {
-        /*
-         * Acknowledge the IRQ.
-         * If this is a level-based IRQ, then it is expected to mask the IRQ
-         * as well.
-         */
-        void (*ack)(unsigned int irq);
-        /*
-         * Mask the IRQ in hardware.
-         */
-        void (*mask)(unsigned int irq);
-        /*
-         * Unmask the IRQ in hardware.
-         */
-        void (*unmask)(unsigned int irq);
-        /*
-         * Re-run the IRQ
-         */
-        void (*rerun)(unsigned int irq);
-        /*
-         * Set the type of the IRQ.
-         */
-        int (*type)(unsigned int irq, unsigned int, type);
-};
-
-ack    - required.  May be the same function as mask for IRQs
-         handled by do_level_IRQ.
-mask   - required.
-unmask - required.
-rerun  - optional.  Not required if you're using do_level_IRQ for all
-         IRQs that use this 'irqchip'.  Generally expected to re-trigger
-         the hardware IRQ if possible.  If not, may call the handler
-	 directly.
-type   - optional.  If you don't support changing the type of an IRQ,
-         it should be null so people can detect if they are unable to
-         set the IRQ type.
-
-For each IRQ, we keep the following information:
-
-        - "disable" depth (number of disable_irq()s without enable_irq()s)
-        - flags indicating what we can do with this IRQ (valid, probe,
-          noautounmask) as before
-        - status of the IRQ (probing, enable, etc)
-        - chip
-        - per-IRQ handler
-        - irqaction structure list
-
-The handler can be one of the 3 standard handlers - "level", "edge" and
-"simple", or your own specific handler if you need to do something special.
-
-The "level" handler is what we currently have - its pretty simple.
-"edge" knows about the brokenness of such IRQ implementations - that you
-need to leave the hardware IRQ enabled while processing it, and queueing
-further IRQ events should the IRQ happen again while processing.  The
-"simple" handler is very basic, and does not perform any hardware
-manipulation, nor state tracking.  This is useful for things like the
-SMC9196 and USAR above.
-
-So, what's changed?
-
-1. Machine implementations must not write to the irqdesc array.
-
-2. New functions to manipulate the irqdesc array.  The first 4 are expected
-   to be useful only to machine specific code.  The last is recommended to
-   only be used by machine specific code, but may be used in drivers if
-   absolutely necessary.
-
-        set_irq_chip(irq,chip)
-
-                Set the mask/unmask methods for handling this IRQ
-
-        set_irq_handler(irq,handler)
-
-                Set the handler for this IRQ (level, edge, simple)
-
-        set_irq_chained_handler(irq,handler)
-
-                Set a "chained" handler for this IRQ - automatically
-                enables this IRQ (eg, Neponset and SA1111 handlers).
-
-        set_irq_flags(irq,flags)
-
-                Set the valid/probe/noautoenable flags.
-
-        set_irq_type(irq,type)
-
-                Set active the IRQ edge(s)/level.  This replaces the
-                SA1111 INTPOL manipulation, and the set_GPIO_IRQ_edge()
-                function.  Type should be one of IRQ_TYPE_xxx defined in
-		<linux/irq.h>
-
-3. set_GPIO_IRQ_edge() is obsolete, and should be replaced by set_irq_type.
-
-4. Direct access to SA1111 INTPOL is deprecated.  Use set_irq_type instead.
-
-5. A handler is expected to perform any necessary acknowledgement of the
-   parent IRQ via the correct chip specific function.  For instance, if
-   the SA1111 is directly connected to a SA1110 GPIO, then you should
-   acknowledge the SA1110 IRQ each time you re-read the SA1111 IRQ status.
-
-6. For any child which doesn't have its own IRQ enable/disable controls
-   (eg, SMC9196), the handler must mask or acknowledge the parent IRQ
-   while the child handler is called, and the child handler should be the
-   "simple" handler (not "edge" nor "level").  After the handler completes,
-   the parent IRQ should be unmasked, and the status of all children must
-   be re-checked for pending events.  (see the Neponset IRQ handler for
-   details).
-
-7. fixup_irq() is gone, as is arch/arm/mach-*/include/mach/irq.h
-
-Please note that this will not solve all problems - some of them are
-hardware based.  Mixing level-based and edge-based IRQs on the same
-parent signal (eg neponset) is one such area where a software based
-solution can't provide the full answer to low IRQ latency.
-
diff --git a/Documentation/arm/Marvell/README b/Documentation/arm/Marvell/README
deleted file mode 100644
index 56ada27c53be..000000000000
--- a/Documentation/arm/Marvell/README
+++ /dev/null
@@ -1,395 +0,0 @@
-ARM Marvell SoCs
-================
-
-This document lists all the ARM Marvell SoCs that are currently
-supported in mainline by the Linux kernel. As the Marvell families of
-SoCs are large and complex, it is hard to understand where the support
-for a particular SoC is available in the Linux kernel. This document
-tries to help in understanding where those SoCs are supported, and to
-match them with their corresponding public datasheet, when available.
-
-Orion family
-------------
-
-  Flavors:
-        88F5082
-        88F5181
-        88F5181L
-        88F5182
-               Datasheet               : http://www.embeddedarm.com/documentation/third-party/MV88F5182-datasheet.pdf
-               Programmer's User Guide : http://www.embeddedarm.com/documentation/third-party/MV88F5182-opensource-manual.pdf
-               User Manual             : http://www.embeddedarm.com/documentation/third-party/MV88F5182-usermanual.pdf
-        88F5281
-               Datasheet               : http://www.ocmodshop.com/images/reviews/networking/qnap_ts409u/marvel_88f5281_data_sheet.pdf
-        88F6183
-  Core: Feroceon 88fr331 (88f51xx) or 88fr531-vd (88f52xx) ARMv5 compatible
-  Linux kernel mach directory: arch/arm/mach-orion5x
-  Linux kernel plat directory: arch/arm/plat-orion
-
-Kirkwood family
----------------
-
-  Flavors:
-        88F6282 a.k.a Armada 300
-                Product Brief  : http://www.marvell.com/embedded-processors/armada-300/assets/armada_310.pdf
-        88F6283 a.k.a Armada 310
-                Product Brief  : http://www.marvell.com/embedded-processors/armada-300/assets/armada_310.pdf
-        88F6190
-                Product Brief  : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6190-003_WEB.pdf
-                Hardware Spec  : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F619x_OpenSource.pdf
-                Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
-        88F6192
-                Product Brief  : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6192-003_ver1.pdf
-                Hardware Spec  : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F619x_OpenSource.pdf
-                Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
-        88F6182
-        88F6180
-                Product Brief  : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6180-003_ver1.pdf
-                Hardware Spec  : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F6180_OpenSource.pdf
-                Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
-        88F6281
-                Product Brief  : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6281-004_ver1.pdf
-                Hardware Spec  : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F6281_OpenSource.pdf
-                Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
-  Homepage: http://www.marvell.com/embedded-processors/kirkwood/
-  Core: Feroceon 88fr131 ARMv5 compatible
-  Linux kernel mach directory: arch/arm/mach-mvebu
-  Linux kernel plat directory: none
-
-Discovery family
-----------------
-
-  Flavors:
-        MV78100
-                Product Brief  : http://www.marvell.com/embedded-processors/discovery-innovation/assets/MV78100-003_WEB.pdf
-                Hardware Spec  : http://www.marvell.com/embedded-processors/discovery-innovation/assets/HW_MV78100_OpenSource.pdf
-                Functional Spec: http://www.marvell.com/embedded-processors/discovery-innovation/assets/FS_MV76100_78100_78200_OpenSource.pdf
-        MV78200
-                Product Brief  : http://www.marvell.com/embedded-processors/discovery-innovation/assets/MV78200-002_WEB.pdf
-                Hardware Spec  : http://www.marvell.com/embedded-processors/discovery-innovation/assets/HW_MV78200_OpenSource.pdf
-                Functional Spec: http://www.marvell.com/embedded-processors/discovery-innovation/assets/FS_MV76100_78100_78200_OpenSource.pdf
-        MV76100
-                Not supported by the Linux kernel.
-
-  Core: Feroceon 88fr571-vd ARMv5 compatible
-
-  Linux kernel mach directory: arch/arm/mach-mv78xx0
-  Linux kernel plat directory: arch/arm/plat-orion
-
-EBU Armada family
------------------
-
-  Armada 370 Flavors:
-        88F6710
-        88F6707
-        88F6W11
-    Product Brief:   http://www.marvell.com/embedded-processors/armada-300/assets/Marvell_ARMADA_370_SoC.pdf
-    Hardware Spec:   http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-datasheet.pdf
-    Functional Spec: http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-FunctionalSpec-datasheet.pdf
-    Core: Sheeva ARMv7 compatible PJ4B
-
-  Armada 375 Flavors:
-	88F6720
-    Product Brief: http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA_375_SoC-01_product_brief.pdf
-    Core: ARM Cortex-A9
-
-  Armada 38x Flavors:
-	88F6810	Armada 380
-	88F6820 Armada 385
-	88F6828 Armada 388
-    Product infos:   http://www.marvell.com/embedded-processors/armada-38x/
-    Functional Spec: https://marvellcorp.wufoo.com/forms/marvell-armada-38x-functional-specifications/
-    Core: ARM Cortex-A9
-
-  Armada 39x Flavors:
-	88F6920 Armada 390
-	88F6928 Armada 398
-    Product infos: http://www.marvell.com/embedded-processors/armada-39x/
-    Core: ARM Cortex-A9
-
-  Armada XP Flavors:
-        MV78230
-        MV78260
-        MV78460
-    NOTE: not to be confused with the non-SMP 78xx0 SoCs
-    Product Brief: http://www.marvell.com/embedded-processors/armada-xp/assets/Marvell-ArmadaXP-SoC-product%20brief.pdf
-    Functional Spec: http://www.marvell.com/embedded-processors/armada-xp/assets/ARMADA-XP-Functional-SpecDatasheet.pdf
-    Hardware Specs:
-      http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78230_OS.PDF
-      http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78260_OS.PDF
-      http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78460_OS.PDF
-    Core: Sheeva ARMv7 compatible Dual-core or Quad-core PJ4B-MP
-
-  Linux kernel mach directory: arch/arm/mach-mvebu
-  Linux kernel plat directory: none
-
-EBU Armada family ARMv8
------------------------
-
-  Armada 3710/3720 Flavors:
-	88F3710
-	88F3720
-	Core: ARM Cortex A53 (ARMv8)
-
-	Homepage: http://www.marvell.com/embedded-processors/armada-3700/
-	Product Brief: http://www.marvell.com/embedded-processors/assets/PB-88F3700-FNL.pdf
-	Device tree files: arch/arm64/boot/dts/marvell/armada-37*
-
-  Armada 7K Flavors:
-	88F7020 (AP806 Dual + one CP110)
-	88F7040 (AP806 Quad + one CP110)
-	Core: ARM Cortex A72
-
-	Homepage: http://www.marvell.com/embedded-processors/armada-70xx/
-	Product Brief: http://www.marvell.com/embedded-processors/assets/Armada7020PB-Jan2016.pdf
-		       http://www.marvell.com/embedded-processors/assets/Armada7040PB-Jan2016.pdf
-	Device tree files: arch/arm64/boot/dts/marvell/armada-70*
-
-  Armada 8K Flavors:
-	88F8020 (AP806 Dual + two CP110)
-	88F8040 (AP806 Quad + two CP110)
-	Core: ARM Cortex A72
-
-	Homepage: http://www.marvell.com/embedded-processors/armada-80xx/
-	Product Brief: http://www.marvell.com/embedded-processors/assets/Armada8020PB-Jan2016.pdf
-		       http://www.marvell.com/embedded-processors/assets/Armada8040PB-Jan2016.pdf
-	Device tree files: arch/arm64/boot/dts/marvell/armada-80*
-
-Avanta family
--------------
-
-  Flavors:
-       88F6510
-       88F6530P
-       88F6550
-       88F6560
-  Homepage     : http://www.marvell.com/broadband/
-  Product Brief: http://www.marvell.com/broadband/assets/Marvell_Avanta_88F6510_305_060-001_product_brief.pdf
-  No public datasheet available.
-
-  Core: ARMv5 compatible
-
-  Linux kernel mach directory: no code in mainline yet, planned for the future
-  Linux kernel plat directory: no code in mainline yet, planned for the future
-
-Storage family
---------------
-
-  Armada SP:
-	88RC1580
-    Product infos: http://www.marvell.com/storage/armada-sp/
-    Core: Sheeva ARMv7 comatible Quad-core PJ4C
-    (not supported in upstream Linux kernel)
-
-Dove family (application processor)
------------------------------------
-
-  Flavors:
-        88AP510 a.k.a Armada 510
-                Product Brief   : http://www.marvell.com/application-processors/armada-500/assets/Marvell_Armada510_SoC.pdf
-                Hardware Spec   : http://www.marvell.com/application-processors/armada-500/assets/Armada-510-Hardware-Spec.pdf
-                Functional Spec : http://www.marvell.com/application-processors/armada-500/assets/Armada-510-Functional-Spec.pdf
-  Homepage: http://www.marvell.com/application-processors/armada-500/
-  Core: ARMv7 compatible
-
-  Directory: arch/arm/mach-mvebu (DT enabled platforms)
-             arch/arm/mach-dove (non-DT enabled platforms)
-
-PXA 2xx/3xx/93x/95x family
---------------------------
-
-  Flavors:
-        PXA21x, PXA25x, PXA26x
-             Application processor only
-             Core: ARMv5 XScale1 core
-        PXA270, PXA271, PXA272
-             Product Brief         : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_pb.pdf
-             Design guide          : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_design_guide.pdf
-             Developers manual     : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_dev_man.pdf
-             Specification         : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_emts.pdf
-             Specification update  : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_spec_update.pdf
-             Application processor only
-             Core: ARMv5 XScale2 core
-        PXA300, PXA310, PXA320
-             PXA 300 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA300_PB_R4.pdf
-             PXA 310 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA310_PB_R4.pdf
-             PXA 320 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA320_PB_R4.pdf
-             Design guide          : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Design_Guide.pdf
-             Developers manual     : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Developers_Manual.zip
-             Specifications        : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_EMTS.pdf
-             Specification Update  : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Spec_Update.zip
-             Reference Manual      : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_TavorP_BootROM_Ref_Manual.pdf
-             Application processor only
-             Core: ARMv5 XScale3 core
-        PXA930, PXA935
-             Application processor with Communication processor
-             Core: ARMv5 XScale3 core
-        PXA955
-             Application processor with Communication processor
-             Core: ARMv7 compatible Sheeva PJ4 core
-
-   Comments:
-
-    * This line of SoCs originates from the XScale family developed by
-      Intel and acquired by Marvell in ~2006. The PXA21x, PXA25x,
-      PXA26x, PXA27x, PXA3xx and PXA93x were developed by Intel, while
-      the later PXA95x were developed by Marvell.
-
-    * Due to their XScale origin, these SoCs have virtually nothing in
-      common with the other (Kirkwood, Dove, etc.) families of Marvell
-      SoCs, except with the MMP/MMP2 family of SoCs.
-
-   Linux kernel mach directory: arch/arm/mach-pxa
-   Linux kernel plat directory: arch/arm/plat-pxa
-
-MMP/MMP2/MMP3 family (communication processor)
------------------------------------------
-
-   Flavors:
-        PXA168, a.k.a Armada 168
-             Homepage             : http://www.marvell.com/application-processors/armada-100/armada-168.jsp
-             Product brief        : http://www.marvell.com/application-processors/armada-100/assets/pxa_168_pb.pdf
-             Hardware manual      : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_datasheet.pdf
-             Software manual      : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_software_manual.pdf
-             Specification update : http://www.marvell.com/application-processors/armada-100/assets/ARMADA16x_Spec_update.pdf
-             Boot ROM manual      : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_ref_manual.pdf
-             App node package     : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_app_note_package.pdf
-             Application processor only
-             Core: ARMv5 compatible Marvell PJ1 88sv331 (Mohawk)
-        PXA910/PXA920
-             Homepage             : http://www.marvell.com/communication-processors/pxa910/
-             Product Brief        : http://www.marvell.com/communication-processors/pxa910/assets/Marvell_PXA910_Platform-001_PB_final.pdf
-             Application processor with Communication processor
-             Core: ARMv5 compatible Marvell PJ1 88sv331 (Mohawk)
-        PXA688, a.k.a. MMP2, a.k.a Armada 610
-             Product Brief        : http://www.marvell.com/application-processors/armada-600/assets/armada610_pb.pdf
-             Application processor only
-             Core: ARMv7 compatible Sheeva PJ4 88sv581x core
-	PXA2128, a.k.a. MMP3 (OLPC XO4, Linux support not upstream)
-	     Product Brief	  : http://www.marvell.com/application-processors/armada/pxa2128/assets/Marvell-ARMADA-PXA2128-SoC-PB.pdf
-	     Application processor only
-	     Core: Dual-core ARMv7 compatible Sheeva PJ4C core
-	PXA960/PXA968/PXA978 (Linux support not upstream)
-	     Application processor with Communication Processor
-	     Core: ARMv7 compatible Sheeva PJ4 core
-	PXA986/PXA988 (Linux support not upstream)
-	     Application processor with Communication Processor
-	     Core: Dual-core ARMv7 compatible Sheeva PJ4B-MP core
-	PXA1088/PXA1920 (Linux support not upstream)
-	     Application processor with Communication Processor
-	     Core: quad-core ARMv7 Cortex-A7
-	PXA1908/PXA1928/PXA1936
-	     Application processor with Communication Processor
-	     Core: multi-core ARMv8 Cortex-A53
-
-   Comments:
-
-    * This line of SoCs originates from the XScale family developed by
-      Intel and acquired by Marvell in ~2006. All the processors of
-      this MMP/MMP2 family were developed by Marvell.
-
-    * Due to their XScale origin, these SoCs have virtually nothing in
-      common with the other (Kirkwood, Dove, etc.) families of Marvell
-      SoCs, except with the PXA family of SoCs listed above.
-
-   Linux kernel mach directory: arch/arm/mach-mmp
-   Linux kernel plat directory: arch/arm/plat-pxa
-
-Berlin family (Multimedia Solutions)
--------------------------------------
-
-  Flavors:
-	88DE3010, Armada 1000 (no Linux support)
-		Core:		Marvell PJ1 (ARMv5TE), Dual-core
-		Product Brief:	http://www.marvell.com.cn/digital-entertainment/assets/armada_1000_pb.pdf
-	88DE3005, Armada 1500 Mini
-		Design name:	BG2CD
-		Core:		ARM Cortex-A9, PL310 L2CC
-	88DE3006, Armada 1500 Mini Plus
-		Design name:	BG2CDP
-		Core:		Dual Core ARM Cortex-A7
-	88DE3100, Armada 1500
-		Design name:	BG2
-		Core:		Marvell PJ4B-MP (ARMv7), Tauros3 L2CC
-	88DE3114, Armada 1500 Pro
-		Design name:	BG2Q
-		Core:		Quad Core ARM Cortex-A9, PL310 L2CC
-	88DE3214, Armada 1500 Pro 4K
-		Design name:	BG3
-		Core:		ARM Cortex-A15, CA15 integrated L2CC
-	88DE3218, ARMADA 1500 Ultra
-		Core:		ARM Cortex-A53
-
-  Homepage: https://www.synaptics.com/products/multimedia-solutions
-  Directory: arch/arm/mach-berlin
-
-  Comments:
-
-   * This line of SoCs is based on Marvell Sheeva or ARM Cortex CPUs
-     with Synopsys DesignWare (IRQ, GPIO, Timers, ...) and PXA IP (SDHCI, USB, ETH, ...).
-
-   * The Berlin family was acquired by Synaptics from Marvell in 2017.
-
-CPU Cores
----------
-
-The XScale cores were designed by Intel, and shipped by Marvell in the older
-PXA processors. Feroceon is a Marvell designed core that developed in-house,
-and that evolved into Sheeva. The XScale and Feroceon cores were phased out
-over time and replaced with Sheeva cores in later products, which subsequently
-got replaced with licensed ARM Cortex-A cores.
-
-  XScale 1
-	CPUID 0x69052xxx
-	ARMv5, iWMMXt
-  XScale 2
-	CPUID 0x69054xxx
-	ARMv5, iWMMXt
-  XScale 3
-	CPUID 0x69056xxx or 0x69056xxx
-	ARMv5, iWMMXt
-  Feroceon-1850 88fr331 "Mohawk"
-	CPUID 0x5615331x or 0x41xx926x
-	ARMv5TE, single issue
-  Feroceon-2850 88fr531-vd "Jolteon"
-	CPUID 0x5605531x or 0x41xx926x
-	ARMv5TE, VFP, dual-issue
-  Feroceon 88fr571-vd "Jolteon"
-	CPUID 0x5615571x
-	ARMv5TE, VFP, dual-issue
-  Feroceon 88fr131 "Mohawk-D"
-	CPUID 0x5625131x
-	ARMv5TE, single-issue in-order
-  Sheeva PJ1 88sv331 "Mohawk"
-	CPUID 0x561584xx
-	ARMv5, single-issue iWMMXt v2
-  Sheeva PJ4 88sv581x "Flareon"
-	CPUID 0x560f581x
-	ARMv7, idivt, optional iWMMXt v2
-  Sheeva PJ4B 88sv581x
-	CPUID 0x561f581x
-	ARMv7, idivt, optional iWMMXt v2
-  Sheeva PJ4B-MP / PJ4C
-	CPUID 0x562f584x
-	ARMv7, idivt/idiva, LPAE, optional iWMMXt v2 and/or NEON
-
-Long-term plans
----------------
-
- * Unify the mach-dove/, mach-mv78xx0/, mach-orion5x/ into the
-   mach-mvebu/ to support all SoCs from the Marvell EBU (Engineering
-   Business Unit) in a single mach-<foo> directory. The plat-orion/
-   would therefore disappear.
-
- * Unify the mach-mmp/ and mach-pxa/ into the same mach-pxa
-   directory. The plat-pxa/ would therefore disappear.
-
-Credits
--------
-
- Maen Suleiman <maen@marvell.com>
- Lior Amsalem <alior@marvell.com>
- Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
- Andrew Lunn <andrew@lunn.ch>
- Nicolas Pitre <nico@fluxnic.net>
- Eric Miao <eric.y.miao@gmail.com>
diff --git a/Documentation/arm/Microchip/README b/Documentation/arm/Microchip/README
deleted file mode 100644
index a366f37d38f1..000000000000
--- a/Documentation/arm/Microchip/README
+++ /dev/null
@@ -1,169 +0,0 @@
-ARM Microchip SoCs (aka AT91)
-=============================
-
-
-Introduction
-------------
-This document gives useful information about the ARM Microchip SoCs that are
-currently supported in Linux Mainline (you know, the one on kernel.org).
-
-It is important to note that the Microchip (previously Atmel) ARM-based MPU
-product line is historically named "AT91" or "at91" throughout the Linux kernel
-development process even if this product prefix has completely disappeared from
-the official Microchip product name. Anyway, files, directories, git trees,
-git branches/tags and email subject always contain this "at91" sub-string.
-
-
-AT91 SoCs
----------
-Documentation and detailed datasheet for each product are available on
-the Microchip website: http://www.microchip.com.
-
-  Flavors:
-    * ARM 920 based SoC
-      - at91rm9200
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-1768-32-bit-ARM920T-Embedded-Microprocessor-AT91RM9200_Datasheet.pdf
-
-    * ARM 926 based SoCs
-      - at91sam9260
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6221-32-bit-ARM926EJ-S-Embedded-Microprocessor-SAM9260_Datasheet.pdf
-
-      - at91sam9xe
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6254-32-bit-ARM926EJ-S-Embedded-Microprocessor-SAM9XE_Datasheet.pdf
-
-      - at91sam9261
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6062-ARM926EJ-S-Microprocessor-SAM9261_Datasheet.pdf
-
-      - at91sam9263
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6249-32-bit-ARM926EJ-S-Embedded-Microprocessor-SAM9263_Datasheet.pdf
-
-      - at91sam9rl
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/doc6289.pdf
-
-      - at91sam9g20
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001516A.pdf
-
-      - at91sam9g45 family
-        - at91sam9g45
-        - at91sam9g46
-        - at91sam9m10
-        - at91sam9m11 (device superset)
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6437-32-bit-ARM926-Embedded-Microprocessor-SAM9M11_Datasheet.pdf
-
-      - at91sam9x5 family (aka "The 5 series")
-        - at91sam9g15
-        - at91sam9g25
-        - at91sam9g35
-        - at91sam9x25
-        - at91sam9x35
-        + Datasheet (can be considered as covering the whole family)
-          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-11055-32-bit-ARM926EJ-S-Microcontroller-SAM9X35_Datasheet.pdf
-
-      - at91sam9n12
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001517A.pdf
-
-    * ARM Cortex-A5 based SoCs
-      - sama5d3 family
-        - sama5d31
-        - sama5d33
-        - sama5d34
-        - sama5d35
-        - sama5d36 (device superset)
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-11121-32-bit-Cortex-A5-Microcontroller-SAMA5D3_Datasheet.pdf
-
-    * ARM Cortex-A5 + NEON based SoCs
-      - sama5d4 family
-        - sama5d41
-        - sama5d42
-        - sama5d43
-        - sama5d44 (device superset)
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/60001525A.pdf
-
-      - sama5d2 family
-        - sama5d21
-        - sama5d22
-        - sama5d23
-        - sama5d24
-        - sama5d26
-        - sama5d27 (device superset)
-        - sama5d28 (device superset + environmental monitors)
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001476B.pdf
-
-    * ARM Cortex-M7 MCUs
-      - sams70 family
-        - sams70j19
-        - sams70j20
-        - sams70j21
-        - sams70n19
-        - sams70n20
-        - sams70n21
-        - sams70q19
-        - sams70q20
-        - sams70q21
-
-      - samv70 family
-        - samv70j19
-        - samv70j20
-        - samv70n19
-        - samv70n20
-        - samv70q19
-        - samv70q20
-
-      - samv71 family
-        - samv71j19
-        - samv71j20
-        - samv71j21
-        - samv71n19
-        - samv71n20
-        - samv71n21
-        - samv71q19
-        - samv71q20
-        - samv71q21
-
-        + Datasheet
-          http://ww1.microchip.com/downloads/en/DeviceDoc/60001527A.pdf
-
-
-Linux kernel information
-------------------------
-Linux kernel mach directory: arch/arm/mach-at91
-MAINTAINERS entry is: "ARM/Microchip (AT91) SoC support"
-
-
-Device Tree for AT91 SoCs and boards
-------------------------------------
-All AT91 SoCs are converted to Device Tree. Since Linux 3.19, these products
-must use this method to boot the Linux kernel.
-
-Work In Progress statement:
-Device Tree files and Device Tree bindings that apply to AT91 SoCs and boards are
-considered as "Unstable". To be completely clear, any at91 binding can change at
-any time. So, be sure to use a Device Tree Binary and a Kernel Image generated from
-the same source tree.
-Please refer to the Documentation/devicetree/bindings/ABI.txt file for a
-definition of a "Stable" binding/ABI.
-This statement will be removed by AT91 MAINTAINERS when appropriate.
-
-Naming conventions and best practice:
-- SoCs Device Tree Source Include files are named after the official name of
-  the product (at91sam9g20.dtsi or sama5d33.dtsi for instance).
-- Device Tree Source Include files (.dtsi) are used to collect common nodes that can be
-  shared across SoCs or boards (sama5d3.dtsi or at91sam9x5cm.dtsi for instance).
-  When collecting nodes for a particular peripheral or topic, the identifier have to
-  be placed at the end of the file name, separated with a "_" (at91sam9x5_can.dtsi
-  or sama5d3_gmac.dtsi for example).
-- board Device Tree Source files (.dts) are prefixed by the string "at91-" so
-  that they can be identified easily. Note that some files are historical exceptions
-  to this rule (sama5d3[13456]ek.dts, usb_a9g20.dts or animeo_ip.dts for example).
diff --git a/Documentation/arm/Netwinder b/Documentation/arm/Netwinder
deleted file mode 100644
index f1b457fbd3de..000000000000
--- a/Documentation/arm/Netwinder
+++ /dev/null
@@ -1,78 +0,0 @@
-NetWinder specific documentation
-================================
-
-The NetWinder is a small low-power computer, primarily designed
-to run Linux.  It is based around the StrongARM RISC processor,
-DC21285 PCI bridge, with PC-type hardware glued around it.
-
-Port usage
-==========
-
-Min    - Max	Description
----------------------------
-0x0000 - 0x000f	DMA1
-0x0020 - 0x0021	PIC1
-0x0060 - 0x006f	Keyboard
-0x0070 - 0x007f	RTC
-0x0080 - 0x0087	DMA1
-0x0088 - 0x008f	DMA2
-0x00a0 - 0x00a3	PIC2
-0x00c0 - 0x00df	DMA2
-0x0180 - 0x0187	IRDA
-0x01f0 - 0x01f6	ide0
-0x0201		Game port
-0x0203		RWA010 configuration read
-0x0220 - ?	SoundBlaster
-0x0250 - ?	WaveArtist
-0x0279		RWA010 configuration index
-0x02f8 - 0x02ff	Serial ttyS1
-0x0300 - 0x031f	Ether10
-0x0338		GPIO1
-0x033a		GPIO2
-0x0370 - 0x0371	W83977F configuration registers
-0x0388 - ?	AdLib
-0x03c0 - 0x03df	VGA
-0x03f6		ide0
-0x03f8 - 0x03ff	Serial ttyS0
-0x0400 - 0x0408	DC21143
-0x0480 - 0x0487	DMA1
-0x0488 - 0x048f	DMA2
-0x0a79		RWA010 configuration write
-0xe800 - 0xe80f	ide0/ide1 BM DMA
-
-
-Interrupt usage
-===============
-
-IRQ	type	Description
----------------------------
- 0	ISA	100Hz timer
- 1	ISA	Keyboard
- 2	ISA	cascade
- 3	ISA	Serial ttyS1
- 4	ISA	Serial ttyS0
- 5	ISA	PS/2 mouse
- 6	ISA	IRDA
- 7	ISA	Printer
- 8	ISA	RTC alarm
- 9	ISA
-10	ISA	GP10 (Orange reset button)
-11	ISA
-12	ISA	WaveArtist
-13	ISA
-14	ISA	hda1
-15	ISA
-
-DMA usage
-=========
-
-DMA	type	Description
----------------------------
- 0	ISA	IRDA
- 1	ISA
- 2	ISA	cascade
- 3	ISA	WaveArtist
- 4	ISA
- 5	ISA
- 6	ISA
- 7	ISA	WaveArtist
diff --git a/Documentation/arm/OMAP/DSS b/Documentation/arm/OMAP/DSS
deleted file mode 100644
index 4484e021290e..000000000000
--- a/Documentation/arm/OMAP/DSS
+++ /dev/null
@@ -1,362 +0,0 @@
-OMAP2/3 Display Subsystem
--------------------------
-
-This is an almost total rewrite of the OMAP FB driver in drivers/video/omap
-(let's call it DSS1). The main differences between DSS1 and DSS2 are DSI,
-TV-out and multiple display support, but there are lots of small improvements
-also.
-
-The DSS2 driver (omapdss module) is in arch/arm/plat-omap/dss/, and the FB,
-panel and controller drivers are in drivers/video/omap2/. DSS1 and DSS2 live
-currently side by side, you can choose which one to use.
-
-Features
---------
-
-Working and tested features include:
-
-- MIPI DPI (parallel) output
-- MIPI DSI output in command mode
-- MIPI DBI (RFBI) output
-- SDI output
-- TV output
-- All pieces can be compiled as a module or inside kernel
-- Use DISPC to update any of the outputs
-- Use CPU to update RFBI or DSI output
-- OMAP DISPC planes
-- RGB16, RGB24 packed, RGB24 unpacked
-- YUV2, UYVY
-- Scaling
-- Adjusting DSS FCK to find a good pixel clock
-- Use DSI DPLL to create DSS FCK
-
-Tested boards include:
-- OMAP3 SDP board
-- Beagle board
-- N810
-
-omapdss driver
---------------
-
-The DSS driver does not itself have any support for Linux framebuffer, V4L or
-such like the current ones, but it has an internal kernel API that upper level
-drivers can use.
-
-The DSS driver models OMAP's overlays, overlay managers and displays in a
-flexible way to enable non-common multi-display configuration. In addition to
-modelling the hardware overlays, omapdss supports virtual overlays and overlay
-managers. These can be used when updating a display with CPU or system DMA.
-
-omapdss driver support for audio
---------------------------------
-There exist several display technologies and standards that support audio as
-well. Hence, it is relevant to update the DSS device driver to provide an audio
-interface that may be used by an audio driver or any other driver interested in
-the functionality.
-
-The audio_enable function is intended to prepare the relevant
-IP for playback (e.g., enabling an audio FIFO, taking in/out of reset
-some IP, enabling companion chips, etc). It is intended to be called before
-audio_start. The audio_disable function performs the reverse operation and is
-intended to be called after audio_stop.
-
-While a given DSS device driver may support audio, it is possible that for
-certain configurations audio is not supported (e.g., an HDMI display using a
-VESA video timing). The audio_supported function is intended to query whether
-the current configuration of the display supports audio.
-
-The audio_config function is intended to configure all the relevant audio
-parameters of the display. In order to make the function independent of any
-specific DSS device driver, a struct omap_dss_audio is defined. Its purpose
-is to contain all the required parameters for audio configuration. At the
-moment, such structure contains pointers to IEC-60958 channel status word
-and CEA-861 audio infoframe structures. This should be enough to support
-HDMI and DisplayPort, as both are based on CEA-861 and IEC-60958.
-
-The audio_enable/disable, audio_config and audio_supported functions could be
-implemented as functions that may sleep. Hence, they should not be called
-while holding a spinlock or a readlock.
-
-The audio_start/audio_stop function is intended to effectively start/stop audio
-playback after the configuration has taken place. These functions are designed
-to be used in an atomic context. Hence, audio_start should return quickly and be
-called only after all the needed resources for audio playback (audio FIFOs,
-DMA channels, companion chips, etc) have been enabled to begin data transfers.
-audio_stop is designed to only stop the audio transfers. The resources used
-for playback are released using audio_disable.
-
-The enum omap_dss_audio_state may be used to help the implementations of
-the interface to keep track of the audio state. The initial state is _DISABLED;
-then, the state transitions to _CONFIGURED, and then, when it is ready to
-play audio, to _ENABLED. The state _PLAYING is used when the audio is being
-rendered.
-
-
-Panel and controller drivers
-----------------------------
-
-The drivers implement panel or controller specific functionality and are not
-usually visible to users except through omapfb driver.  They register
-themselves to the DSS driver.
-
-omapfb driver
--------------
-
-The omapfb driver implements arbitrary number of standard linux framebuffers.
-These framebuffers can be routed flexibly to any overlays, thus allowing very
-dynamic display architecture.
-
-The driver exports some omapfb specific ioctls, which are compatible with the
-ioctls in the old driver.
-
-The rest of the non standard features are exported via sysfs. Whether the final
-implementation will use sysfs, or ioctls, is still open.
-
-V4L2 drivers
-------------
-
-V4L2 is being implemented in TI.
-
-From omapdss point of view the V4L2 drivers should be similar to framebuffer
-driver.
-
-Architecture
---------------------
-
-Some clarification what the different components do:
-
-    - Framebuffer is a memory area inside OMAP's SRAM/SDRAM that contains the
-      pixel data for the image. Framebuffer has width and height and color
-      depth.
-    - Overlay defines where the pixels are read from and where they go on the
-      screen. The overlay may be smaller than framebuffer, thus displaying only
-      part of the framebuffer. The position of the overlay may be changed if
-      the overlay is smaller than the display.
-    - Overlay manager combines the overlays in to one image and feeds them to
-      display.
-    - Display is the actual physical display device.
-
-A framebuffer can be connected to multiple overlays to show the same pixel data
-on all of the overlays. Note that in this case the overlay input sizes must be
-the same, but, in case of video overlays, the output size can be different. Any
-framebuffer can be connected to any overlay.
-
-An overlay can be connected to one overlay manager. Also DISPC overlays can be
-connected only to DISPC overlay managers, and virtual overlays can be only
-connected to virtual overlays.
-
-An overlay manager can be connected to one display. There are certain
-restrictions which kinds of displays an overlay manager can be connected:
-
-    - DISPC TV overlay manager can be only connected to TV display.
-    - Virtual overlay managers can only be connected to DBI or DSI displays.
-    - DISPC LCD overlay manager can be connected to all displays, except TV
-      display.
-
-Sysfs
------
-The sysfs interface is mainly used for testing. I don't think sysfs
-interface is the best for this in the final version, but I don't quite know
-what would be the best interfaces for these things.
-
-The sysfs interface is divided to two parts: DSS and FB.
-
-/sys/class/graphics/fb? directory:
-mirror		0=off, 1=on
-rotate		Rotation 0-3 for 0, 90, 180, 270 degrees
-rotate_type	0 = DMA rotation, 1 = VRFB rotation
-overlays	List of overlay numbers to which framebuffer pixels go
-phys_addr	Physical address of the framebuffer
-virt_addr	Virtual address of the framebuffer
-size		Size of the framebuffer
-
-/sys/devices/platform/omapdss/overlay? directory:
-enabled		0=off, 1=on
-input_size	width,height (ie. the framebuffer size)
-manager		Destination overlay manager name
-name
-output_size	width,height
-position	x,y
-screen_width	width
-global_alpha   	global alpha 0-255 0=transparent 255=opaque
-
-/sys/devices/platform/omapdss/manager? directory:
-display				Destination display
-name
-alpha_blending_enabled		0=off, 1=on
-trans_key_enabled		0=off, 1=on
-trans_key_type			gfx-destination, video-source
-trans_key_value			transparency color key (RGB24)
-default_color			default background color (RGB24)
-
-/sys/devices/platform/omapdss/display? directory:
-ctrl_name	Controller name
-mirror		0=off, 1=on
-update_mode	0=off, 1=auto, 2=manual
-enabled		0=off, 1=on
-name
-rotate		Rotation 0-3 for 0, 90, 180, 270 degrees
-timings		Display timings (pixclock,xres/hfp/hbp/hsw,yres/vfp/vbp/vsw)
-		When writing, two special timings are accepted for tv-out:
-		"pal" and "ntsc"
-panel_name
-tear_elim	Tearing elimination 0=off, 1=on
-output_type	Output type (video encoder only): "composite" or "svideo"
-
-There are also some debugfs files at <debugfs>/omapdss/ which show information
-about clocks and registers.
-
-Examples
---------
-
-The following definitions have been made for the examples below:
-
-ovl0=/sys/devices/platform/omapdss/overlay0
-ovl1=/sys/devices/platform/omapdss/overlay1
-ovl2=/sys/devices/platform/omapdss/overlay2
-
-mgr0=/sys/devices/platform/omapdss/manager0
-mgr1=/sys/devices/platform/omapdss/manager1
-
-lcd=/sys/devices/platform/omapdss/display0
-dvi=/sys/devices/platform/omapdss/display1
-tv=/sys/devices/platform/omapdss/display2
-
-fb0=/sys/class/graphics/fb0
-fb1=/sys/class/graphics/fb1
-fb2=/sys/class/graphics/fb2
-
-Default setup on OMAP3 SDP
---------------------------
-
-Here's the default setup on OMAP3 SDP board. All planes go to LCD. DVI
-and TV-out are not in use. The columns from left to right are:
-framebuffers, overlays, overlay managers, displays. Framebuffers are
-handled by omapfb, and the rest by the DSS.
-
-FB0 --- GFX  -\            DVI
-FB1 --- VID1 --+- LCD ---- LCD
-FB2 --- VID2 -/   TV ----- TV
-
-Example: Switch from LCD to DVI
-----------------------
-
-w=`cat $dvi/timings | cut -d "," -f 2 | cut -d "/" -f 1`
-h=`cat $dvi/timings | cut -d "," -f 3 | cut -d "/" -f 1`
-
-echo "0" > $lcd/enabled
-echo "" > $mgr0/display
-fbset -fb /dev/fb0 -xres $w -yres $h -vxres $w -vyres $h
-# at this point you have to switch the dvi/lcd dip-switch from the omap board
-echo "dvi" > $mgr0/display
-echo "1" > $dvi/enabled
-
-After this the configuration looks like:
-
-FB0 --- GFX  -\         -- DVI
-FB1 --- VID1 --+- LCD -/   LCD
-FB2 --- VID2 -/   TV ----- TV
-
-Example: Clone GFX overlay to LCD and TV
--------------------------------
-
-w=`cat $tv/timings | cut -d "," -f 2 | cut -d "/" -f 1`
-h=`cat $tv/timings | cut -d "," -f 3 | cut -d "/" -f 1`
-
-echo "0" > $ovl0/enabled
-echo "0" > $ovl1/enabled
-
-echo "" > $fb1/overlays
-echo "0,1" > $fb0/overlays
-
-echo "$w,$h" > $ovl1/output_size
-echo "tv" > $ovl1/manager
-
-echo "1" > $ovl0/enabled
-echo "1" > $ovl1/enabled
-
-echo "1" > $tv/enabled
-
-After this the configuration looks like (only relevant parts shown):
-
-FB0 +-- GFX  ---- LCD ---- LCD
-     \- VID1 ---- TV  ---- TV
-
-Misc notes
-----------
-
-OMAP FB allocates the framebuffer memory using the standard dma allocator. You
-can enable Contiguous Memory Allocator (CONFIG_CMA) to improve the dma
-allocator, and if CMA is enabled, you use "cma=" kernel parameter to increase
-the global memory area for CMA.
-
-Using DSI DPLL to generate pixel clock it is possible produce the pixel clock
-of 86.5MHz (max possible), and with that you get 1280x1024@57 output from DVI.
-
-Rotation and mirroring currently only supports RGB565 and RGB8888 modes. VRFB
-does not support mirroring.
-
-VRFB rotation requires much more memory than non-rotated framebuffer, so you
-probably need to increase your vram setting before using VRFB rotation. Also,
-many applications may not work with VRFB if they do not pay attention to all
-framebuffer parameters.
-
-Kernel boot arguments
----------------------
-
-omapfb.mode=<display>:<mode>[,...]
-	- Default video mode for specified displays. For example,
-	  "dvi:800x400MR-24@60".  See drivers/video/modedb.c.
-	  There are also two special modes: "pal" and "ntsc" that
-	  can be used to tv out.
-
-omapfb.vram=<fbnum>:<size>[@<physaddr>][,...]
-	- VRAM allocated for a framebuffer. Normally omapfb allocates vram
-	  depending on the display size. With this you can manually allocate
-	  more or define the physical address of each framebuffer. For example,
-	  "1:4M" to allocate 4M for fb1.
-
-omapfb.debug=<y|n>
-	- Enable debug printing. You have to have OMAPFB debug support enabled
-	  in kernel config.
-
-omapfb.test=<y|n>
-	- Draw test pattern to framebuffer whenever framebuffer settings change.
-	  You need to have OMAPFB debug support enabled in kernel config.
-
-omapfb.vrfb=<y|n>
-	- Use VRFB rotation for all framebuffers.
-
-omapfb.rotate=<angle>
-	- Default rotation applied to all framebuffers.
-	  0 - 0 degree rotation
-	  1 - 90 degree rotation
-	  2 - 180 degree rotation
-	  3 - 270 degree rotation
-
-omapfb.mirror=<y|n>
-	- Default mirror for all framebuffers. Only works with DMA rotation.
-
-omapdss.def_disp=<display>
-	- Name of default display, to which all overlays will be connected.
-	  Common examples are "lcd" or "tv".
-
-omapdss.debug=<y|n>
-	- Enable debug printing. You have to have DSS debug support enabled in
-	  kernel config.
-
-TODO
-----
-
-DSS locking
-
-Error checking
-- Lots of checks are missing or implemented just as BUG()
-
-System DMA update for DSI
-- Can be used for RGB16 and RGB24P modes. Probably not for RGB24U (how
-  to skip the empty byte?)
-
-OMAP1 support
-- Not sure if needed
-
diff --git a/Documentation/arm/OMAP/README b/Documentation/arm/OMAP/README
deleted file mode 100644
index 90c6c57d61e8..000000000000
--- a/Documentation/arm/OMAP/README
+++ /dev/null
@@ -1,11 +0,0 @@
-This file contains documentation for running mainline
-kernel on omaps.
-
-KERNEL		NEW DEPENDENCIES
-v4.3+		Update is needed for custom .config files to make sure
-		CONFIG_REGULATOR_PBIAS is enabled for MMC1 to work
-		properly.
-
-v4.18+		Update is needed for custom .config files to make sure
-		CONFIG_MMC_SDHCI_OMAP is enabled for all MMC instances
-		to work in DRA7 and K2G based boards.
diff --git a/Documentation/arm/OMAP/omap_pm b/Documentation/arm/OMAP/omap_pm
deleted file mode 100644
index 4ae915a9f899..000000000000
--- a/Documentation/arm/OMAP/omap_pm
+++ /dev/null
@@ -1,154 +0,0 @@
-
-The OMAP PM interface
-=====================
-
-This document describes the temporary OMAP PM interface.  Driver
-authors use these functions to communicate minimum latency or
-throughput constraints to the kernel power management code.
-Over time, the intention is to merge features from the OMAP PM
-interface into the Linux PM QoS code.
-
-Drivers need to express PM parameters which:
-
-- support the range of power management parameters present in the TI SRF;
-
-- separate the drivers from the underlying PM parameter
-  implementation, whether it is the TI SRF or Linux PM QoS or Linux
-  latency framework or something else;
-
-- specify PM parameters in terms of fundamental units, such as
-  latency and throughput, rather than units which are specific to OMAP
-  or to particular OMAP variants;
-
-- allow drivers which are shared with other architectures (e.g.,
-  DaVinci) to add these constraints in a way which won't affect non-OMAP
-  systems,
-
-- can be implemented immediately with minimal disruption of other
-  architectures.
-
-
-This document proposes the OMAP PM interface, including the following
-five power management functions for driver code:
-
-1. Set the maximum MPU wakeup latency:
-   (*pdata->set_max_mpu_wakeup_lat)(struct device *dev, unsigned long t)
-
-2. Set the maximum device wakeup latency:
-   (*pdata->set_max_dev_wakeup_lat)(struct device *dev, unsigned long t)
-
-3. Set the maximum system DMA transfer start latency (CORE pwrdm):
-   (*pdata->set_max_sdma_lat)(struct device *dev, long t)
-
-4. Set the minimum bus throughput needed by a device:
-   (*pdata->set_min_bus_tput)(struct device *dev, u8 agent_id, unsigned long r)
-
-5. Return the number of times the device has lost context
-   (*pdata->get_dev_context_loss_count)(struct device *dev)
-
-
-Further documentation for all OMAP PM interface functions can be
-found in arch/arm/plat-omap/include/mach/omap-pm.h.
-
-
-The OMAP PM layer is intended to be temporary
----------------------------------------------
-
-The intention is that eventually the Linux PM QoS layer should support
-the range of power management features present in OMAP3.  As this
-happens, existing drivers using the OMAP PM interface can be modified
-to use the Linux PM QoS code; and the OMAP PM interface can disappear.
-
-
-Driver usage of the OMAP PM functions
--------------------------------------
-
-As the 'pdata' in the above examples indicates, these functions are
-exposed to drivers through function pointers in driver .platform_data
-structures.  The function pointers are initialized by the board-*.c
-files to point to the corresponding OMAP PM functions:
-.set_max_dev_wakeup_lat will point to
-omap_pm_set_max_dev_wakeup_lat(), etc.  Other architectures which do
-not support these functions should leave these function pointers set
-to NULL.  Drivers should use the following idiom:
-
-        if (pdata->set_max_dev_wakeup_lat)
-            (*pdata->set_max_dev_wakeup_lat)(dev, t);
-
-The most common usage of these functions will probably be to specify
-the maximum time from when an interrupt occurs, to when the device
-becomes accessible.  To accomplish this, driver writers should use the
-set_max_mpu_wakeup_lat() function to constrain the MPU wakeup
-latency, and the set_max_dev_wakeup_lat() function to constrain the
-device wakeup latency (from clk_enable() to accessibility).  For
-example,
-
-        /* Limit MPU wakeup latency */
-        if (pdata->set_max_mpu_wakeup_lat)
-            (*pdata->set_max_mpu_wakeup_lat)(dev, tc);
-
-        /* Limit device powerdomain wakeup latency */
-        if (pdata->set_max_dev_wakeup_lat)
-            (*pdata->set_max_dev_wakeup_lat)(dev, td);
-
-        /* total wakeup latency in this example: (tc + td) */
-
-The PM parameters can be overwritten by calling the function again
-with the new value.  The settings can be removed by calling the
-function with a t argument of -1 (except in the case of
-set_max_bus_tput(), which should be called with an r argument of 0).
-
-The fifth function above, omap_pm_get_dev_context_loss_count(),
-is intended as an optimization to allow drivers to determine whether the
-device has lost its internal context.  If context has been lost, the
-driver must restore its internal context before proceeding.
-
-
-Other specialized interface functions
--------------------------------------
-
-The five functions listed above are intended to be usable by any
-device driver.  DSPBridge and CPUFreq have a few special requirements.
-DSPBridge expresses target DSP performance levels in terms of OPP IDs.
-CPUFreq expresses target MPU performance levels in terms of MPU
-frequency.  The OMAP PM interface contains functions for these
-specialized cases to convert that input information (OPPs/MPU
-frequency) into the form that the underlying power management
-implementation needs:
-
-6. (*pdata->dsp_get_opp_table)(void)
-
-7. (*pdata->dsp_set_min_opp)(u8 opp_id)
-
-8. (*pdata->dsp_get_opp)(void)
-
-9. (*pdata->cpu_get_freq_table)(void)
-
-10. (*pdata->cpu_set_freq)(unsigned long f)
-
-11. (*pdata->cpu_get_freq)(void)
-
-Customizing OPP for platform
-============================
-Defining CONFIG_PM should enable OPP layer for the silicon
-and the registration of OPP table should take place automatically.
-However, in special cases, the default OPP table may need to be
-tweaked, for e.g.:
- * enable default OPPs which are disabled by default, but which
-   could be enabled on a platform
- * Disable an unsupported OPP on the platform
- * Define and add a custom opp table entry
-in these cases, the board file needs to do additional steps as follows:
-arch/arm/mach-omapx/board-xyz.c
-	#include "pm.h"
-	....
-	static void __init omap_xyz_init_irq(void)
-	{
-		....
-		/* Initialize the default table */
-		omapx_opp_init();
-		/* Do customization to the defaults */
-		....
-	}
-NOTE: omapx_opp_init will be omap3_opp_init or as required
-based on the omap family.
diff --git a/Documentation/arm/Porting b/Documentation/arm/Porting
deleted file mode 100644
index a492233931b9..000000000000
--- a/Documentation/arm/Porting
+++ /dev/null
@@ -1,135 +0,0 @@
-Taken from list archive at http://lists.arm.linux.org.uk/pipermail/linux-arm-kernel/2001-July/004064.html
-
-Initial definitions
--------------------
-
-The following symbol definitions rely on you knowing the translation that
-__virt_to_phys() does for your machine.  This macro converts the passed
-virtual address to a physical address.  Normally, it is simply:
-
-		phys = virt - PAGE_OFFSET + PHYS_OFFSET
-
-
-Decompressor Symbols
---------------------
-
-ZTEXTADDR
-	Start address of decompressor.  There's no point in talking about
-	virtual or physical addresses here, since the MMU will be off at
-	the time when you call the decompressor code.  You normally call
-	the kernel at this address to start it booting.  This doesn't have
-	to be located in RAM, it can be in flash or other read-only or
-	read-write addressable medium.
-
-ZBSSADDR
-	Start address of zero-initialised work area for the decompressor.
-	This must be pointing at RAM.  The decompressor will zero initialise
-	this for you.  Again, the MMU will be off.
-
-ZRELADDR
-	This is the address where the decompressed kernel will be written,
-	and eventually executed.  The following constraint must be valid:
-
-		__virt_to_phys(TEXTADDR) == ZRELADDR
-
-	The initial part of the kernel is carefully coded to be position
-	independent.
-
-INITRD_PHYS
-	Physical address to place the initial RAM disk.  Only relevant if
-	you are using the bootpImage stuff (which only works on the old
-	struct param_struct).
-
-INITRD_VIRT
-	Virtual address of the initial RAM disk.  The following  constraint
-	must be valid:
-
-		__virt_to_phys(INITRD_VIRT) == INITRD_PHYS
-
-PARAMS_PHYS
-	Physical address of the struct param_struct or tag list, giving the
-	kernel various parameters about its execution environment.
-
-
-Kernel Symbols
---------------
-
-PHYS_OFFSET
-	Physical start address of the first bank of RAM.
-
-PAGE_OFFSET
-	Virtual start address of the first bank of RAM.  During the kernel
-	boot phase, virtual address PAGE_OFFSET will be mapped to physical
-	address PHYS_OFFSET, along with any other mappings you supply.
-	This should be the same value as TASK_SIZE.
-
-TASK_SIZE
-	The maximum size of a user process in bytes.  Since user space
-	always starts at zero, this is the maximum address that a user
-	process can access+1.  The user space stack grows down from this
-	address.
-
-	Any virtual address below TASK_SIZE is deemed to be user process
-	area, and therefore managed dynamically on a process by process
-	basis by the kernel.  I'll call this the user segment.
-
-	Anything above TASK_SIZE is common to all processes.  I'll call
-	this the kernel segment.
-
-	(In other words, you can't put IO mappings below TASK_SIZE, and
-	hence PAGE_OFFSET).
-
-TEXTADDR
-	Virtual start address of kernel, normally PAGE_OFFSET + 0x8000.
-	This is where the kernel image ends up.  With the latest kernels,
-	it must be located at 32768 bytes into a 128MB region.  Previous
-	kernels placed a restriction of 256MB here.
-
-DATAADDR
-	Virtual address for the kernel data segment.  Must not be defined
-	when using the decompressor.
-
-VMALLOC_START
-VMALLOC_END
-	Virtual addresses bounding the vmalloc() area.  There must not be
-	any static mappings in this area; vmalloc will overwrite them.
-	The addresses must also be in the kernel segment (see above).
-	Normally, the vmalloc() area starts VMALLOC_OFFSET bytes above the
-	last virtual RAM address (found using variable high_memory).
-
-VMALLOC_OFFSET
-	Offset normally incorporated into VMALLOC_START to provide a hole
-	between virtual RAM and the vmalloc area.  We do this to allow
-	out of bounds memory accesses (eg, something writing off the end
-	of the mapped memory map) to be caught.  Normally set to 8MB.
-
-Architecture Specific Macros
-----------------------------
-
-BOOT_MEM(pram,pio,vio)
-	`pram' specifies the physical start address of RAM.  Must always
-	be present, and should be the same as PHYS_OFFSET.
-
-	`pio' is the physical address of an 8MB region containing IO for
-	use with the debugging macros in arch/arm/kernel/debug-armv.S.
-
-	`vio' is the virtual address of the 8MB debugging region.
-
-	It is expected that the debugging region will be re-initialised
-	by the architecture specific code later in the code (via the
-	MAPIO function).
-
-BOOT_PARAMS
-	Same as, and see PARAMS_PHYS.
-
-FIXUP(func)
-	Machine specific fixups, run before memory subsystems have been
-	initialised.
-
-MAPIO(func)
-	Machine specific function to map IO areas (including the debug
-	region above).
-
-INITIRQ(func)
-	Machine specific function to initialise interrupts.
-
diff --git a/Documentation/arm/README b/Documentation/arm/README
deleted file mode 100644
index 9d1e5b2c92e6..000000000000
--- a/Documentation/arm/README
+++ /dev/null
@@ -1,204 +0,0 @@
-			   ARM Linux 2.6
-			   =============
-
-    Please check <ftp://ftp.arm.linux.org.uk/pub/armlinux> for
-    updates.
-
-Compilation of kernel
----------------------
-
-  In order to compile ARM Linux, you will need a compiler capable of
-  generating ARM ELF code with GNU extensions.  GCC 3.3 is known to be
-  a good compiler.  Fortunately, you needn't guess.  The kernel will report
-  an error if your compiler is a recognized offender.
-
-  To build ARM Linux natively, you shouldn't have to alter the ARCH = line
-  in the top level Makefile.  However, if you don't have the ARM Linux ELF
-  tools installed as default, then you should change the CROSS_COMPILE
-  line as detailed below.
-
-  If you wish to cross-compile, then alter the following lines in the top
-  level make file:
-
-    ARCH = <whatever>
-	with
-    ARCH = arm
-
-	and
-
-    CROSS_COMPILE=
-	to
-    CROSS_COMPILE=<your-path-to-your-compiler-without-gcc>
-	eg.
-    CROSS_COMPILE=arm-linux-
-
-  Do a 'make config', followed by 'make Image' to build the kernel 
-  (arch/arm/boot/Image).  A compressed image can be built by doing a 
-  'make zImage' instead of 'make Image'.
-
-
-Bug reports etc
----------------
-
-  Please send patches to the patch system.  For more information, see
-  http://www.arm.linux.org.uk/developer/patches/info.php Always include some
-  explanation as to what the patch does and why it is needed.
-
-  Bug reports should be sent to linux-arm-kernel@lists.arm.linux.org.uk,
-  or submitted through the web form at
-  http://www.arm.linux.org.uk/developer/ 
-
-  When sending bug reports, please ensure that they contain all relevant
-  information, eg. the kernel messages that were printed before/during
-  the problem, what you were doing, etc.
-
-
-Include files
--------------
-
-  Several new include directories have been created under include/asm-arm,
-  which are there to reduce the clutter in the top-level directory.  These
-  directories, and their purpose is listed below:
-
-   arch-*	machine/platform specific header files
-   hardware	driver-internal ARM specific data structures/definitions
-   mach		descriptions of generic ARM to specific machine interfaces
-   proc-*	processor dependent header files (currently only two
-		categories)
-
-
-Machine/Platform support
-------------------------
-
-  The ARM tree contains support for a lot of different machine types.  To
-  continue supporting these differences, it has become necessary to split
-  machine-specific parts by directory.  For this, the machine category is
-  used to select which directories and files get included (we will use
-  $(MACHINE) to refer to the category)
-
-  To this end, we now have arch/arm/mach-$(MACHINE) directories which are
-  designed to house the non-driver files for a particular machine (eg, PCI,
-  memory management, architecture definitions etc).  For all future
-  machines, there should be a corresponding arch/arm/mach-$(MACHINE)/include/mach
-  directory.
-
-
-Modules
--------
-
-  Although modularisation is supported (and required for the FP emulator),
-  each module on an ARM2/ARM250/ARM3 machine when is loaded will take
-  memory up to the next 32k boundary due to the size of the pages.
-  Therefore, is modularisation on these machines really worth it?
-
-  However, ARM6 and up machines allow modules to take multiples of 4k, and
-  as such Acorn RiscPCs and other architectures using these processors can
-  make good use of modularisation.
-
-
-ADFS Image files
-----------------
-
-  You can access image files on your ADFS partitions by mounting the ADFS
-  partition, and then using the loopback device driver.  You must have
-  losetup installed.
-
-  Please note that the PCEmulator DOS partitions have a partition table at
-  the start, and as such, you will have to give '-o offset' to losetup.
-
-
-Request to developers
----------------------
-
-  When writing device drivers which include a separate assembler file, please
-  include it in with the C file, and not the arch/arm/lib directory.  This
-  allows the driver to be compiled as a loadable module without requiring
-  half the code to be compiled into the kernel image.
-
-  In general, try to avoid using assembler unless it is really necessary.  It
-  makes drivers far less easy to port to other hardware.
-
-
-ST506 hard drives
------------------
-
-  The ST506 hard drive controllers seem to be working fine (if a little
-  slowly).  At the moment they will only work off the controllers on an
-  A4x0's motherboard, but for it to work off a Podule just requires
-  someone with a podule to add the addresses for the IRQ mask and the
-  HDC base to the source.
-
-  As of 31/3/96 it works with two drives (you should get the ADFS
-  *configure harddrive set to 2). I've got an internal 20MB and a great
-  big external 5.25" FH 64MB drive (who could ever want more :-) ).
-
-  I've just got 240K/s off it (a dd with bs=128k); thats about half of what
-  RiscOS gets; but it's a heck of a lot better than the 50K/s I was getting
-  last week :-)
-
-  Known bug: Drive data errors can cause a hang; including cases where
-  the controller has fixed the error using ECC. (Possibly ONLY
-  in that case...hmm).
-
-
-1772 Floppy
------------
-  This also seems to work OK, but hasn't been stressed much lately.  It
-  hasn't got any code for disc change detection in there at the moment which
-  could be a bit of a problem!  Suggestions on the correct way to do this
-  are welcome.
-
-
-CONFIG_MACH_ and CONFIG_ARCH_
------------------------------
-  A change was made in 2003 to the macro names for new machines.
-  Historically, CONFIG_ARCH_ was used for the bonafide architecture,
-  e.g. SA1100, as well as implementations of the architecture,
-  e.g. Assabet.  It was decided to change the implementation macros
-  to read CONFIG_MACH_ for clarity.  Moreover, a retroactive fixup has
-  not been made because it would complicate patching.
-
-  Previous registrations may be found online.
-
-    <http://www.arm.linux.org.uk/developer/machines/>
-
-Kernel entry (head.S)
---------------------------
-  The initial entry into the kernel is via head.S, which uses machine
-  independent code.  The machine is selected by the value of 'r1' on
-  entry, which must be kept unique.
-
-  Due to the large number of machines which the ARM port of Linux provides
-  for, we have a method to manage this which ensures that we don't end up
-  duplicating large amounts of code.
-
-  We group machine (or platform) support code into machine classes.  A
-  class typically based around one or more system on a chip devices, and
-  acts as a natural container around the actual implementations.  These
-  classes are given directories - arch/arm/mach-<class> and
-  arch/arm/mach-<class> - which contain the source files to/include/mach
-  support the machine class.  This directories also contain any machine
-  specific supporting code.
-
-  For example, the SA1100 class is based upon the SA1100 and SA1110 SoC
-  devices, and contains the code to support the way the on-board and off-
-  board devices are used, or the device is setup, and provides that
-  machine specific "personality."
-
-  For platforms that support device tree (DT), the machine selection is
-  controlled at runtime by passing the device tree blob to the kernel.  At
-  compile-time, support for the machine type must be selected.  This allows for
-  a single multiplatform kernel build to be used for several machine types.
-
-  For platforms that do not use device tree, this machine selection is
-  controlled by the machine type ID, which acts both as a run-time and a
-  compile-time code selection method.  You can register a new machine via the
-  web site at:
-
-    <http://www.arm.linux.org.uk/developer/machines/>
-
-  Note: Please do not register a machine type for DT-only platforms.  If your
-  platform is DT-only, you do not need a registered machine type.
-
----
-Russell King (15/03/2004)
diff --git a/Documentation/arm/SA1100/ADSBitsy b/Documentation/arm/SA1100/ADSBitsy
deleted file mode 100644
index f9f62e8c0719..000000000000
--- a/Documentation/arm/SA1100/ADSBitsy
+++ /dev/null
@@ -1,43 +0,0 @@
-ADS Bitsy Single Board Computer
-(It is different from Bitsy(iPAQ) of Compaq)
-
-For more details, contact Applied Data Systems or see
-http://www.applieddata.net/products.html
-
-The Linux support for this product has been provided by
-Woojung Huh <whuh@applieddata.net>
-
-Use 'make adsbitsy_config' before any 'make config'.
-This will set up defaults for ADS Bitsy support.
-
-The kernel zImage is linked to be loaded and executed at 0xc0400000.
-
-Linux can  be used with the ADS BootLoader that ships with the
-newer rev boards. See their documentation on how to load Linux.
-
-Supported peripherals:
-- SA1100 LCD frame buffer (8/16bpp...sort of)
-- SA1111 USB Master
-- SA1100 serial port
-- pcmcia, compact flash
-- touchscreen(ucb1200)
-- console on LCD screen
-- serial ports (ttyS[0-2])
-  - ttyS0 is default for serial console
-
-To do:
-- everything else!  :-)
-
-Notes:
-
-- The flash on board is divided into 3 partitions.
-  You should be careful to use flash on board.
-  Its partition is different from GraphicsClient Plus and GraphicsMaster
-
-- 16bpp mode requires a different cable than what ships with the board.
-  Contact ADS or look through the manual to wire your own. Currently,
-  if you compile with 16bit mode support and switch into a lower bpp
-  mode, the timing is off so the image is corrupted.  This will be
-  fixed soon.
-
-Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/SA1100/Assabet b/Documentation/arm/SA1100/Assabet
deleted file mode 100644
index e08a6739e72c..000000000000
--- a/Documentation/arm/SA1100/Assabet
+++ /dev/null
@@ -1,300 +0,0 @@
-The Intel Assabet (SA-1110 evaluation) board
-============================================
-
-Please see:
-http://developer.intel.com
-
-Also some notes from John G Dorsey <jd5q@andrew.cmu.edu>:
-http://www.cs.cmu.edu/~wearable/software/assabet.html
-
-
-Building the kernel
--------------------
-
-To build the kernel with current defaults:
-
-	make assabet_config
-	make oldconfig
-	make zImage
-
-The resulting kernel image should be available in linux/arch/arm/boot/zImage.
-
-
-Installing a bootloader
------------------------
-
-A couple of bootloaders able to boot Linux on Assabet are available:
-
-BLOB (http://www.lartmaker.nl/lartware/blob/)
-
-   BLOB is a bootloader used within the LART project.  Some contributed
-   patches were merged into BLOB to add support for Assabet.
-
-Compaq's Bootldr + John Dorsey's patch for Assabet support
-(http://www.handhelds.org/Compaq/bootldr.html)
-(http://www.wearablegroup.org/software/bootldr/)
-
-   Bootldr is the bootloader developed by Compaq for the iPAQ Pocket PC.
-   John Dorsey has produced add-on patches to add support for Assabet and
-   the JFFS filesystem.
-
-RedBoot (http://sources.redhat.com/redboot/)
-
-   RedBoot is a bootloader developed by Red Hat based on the eCos RTOS
-   hardware abstraction layer.  It supports Assabet amongst many other
-   hardware platforms.
-
-RedBoot is currently the recommended choice since it's the only one to have
-networking support, and is the most actively maintained.
-
-Brief examples on how to boot Linux with RedBoot are shown below.  But first
-you need to have RedBoot installed in your flash memory.  A known to work
-precompiled RedBoot binary is available from the following location:
-
-ftp://ftp.netwinder.org/users/n/nico/
-ftp://ftp.arm.linux.org.uk/pub/linux/arm/people/nico/
-ftp://ftp.handhelds.org/pub/linux/arm/sa-1100-patches/
-
-Look for redboot-assabet*.tgz.  Some installation infos are provided in
-redboot-assabet*.txt.
-
-
-Initial RedBoot configuration
------------------------------
-
-The commands used here are explained in The RedBoot User's Guide available
-on-line at http://sources.redhat.com/ecos/docs.html.
-Please refer to it for explanations.
-
-If you have a CF network card (my Assabet kit contained a CF+ LP-E from
-Socket Communications Inc.), you should strongly consider using it for TFTP
-file transfers.  You must insert it before RedBoot runs since it can't detect
-it dynamically.
-
-To initialize the flash directory:
-
-	fis init -f
-
-To initialize the non-volatile settings, like whether you want to use BOOTP or
-a static IP address, etc, use this command:
-
-	fconfig -i
-
-
-Writing a kernel image into flash
----------------------------------
-
-First, the kernel image must be loaded into RAM.  If you have the zImage file
-available on a TFTP server:
-
-	load zImage -r -b 0x100000
-
-If you rather want to use Y-Modem upload over the serial port:
-
-	load -m ymodem -r -b 0x100000
-
-To write it to flash:
-
-	fis create "Linux kernel" -b 0x100000 -l 0xc0000
-
-
-Booting the kernel
-------------------
-
-The kernel still requires a filesystem to boot.  A ramdisk image can be loaded
-as follows:
-
-	load ramdisk_image.gz -r -b 0x800000
-
-Again, Y-Modem upload can be used instead of TFTP by replacing the file name
-by '-y ymodem'.
-
-Now the kernel can be retrieved from flash like this:
-
-	fis load "Linux kernel"
-
-or loaded as described previously.  To boot the kernel:
-
-	exec -b 0x100000 -l 0xc0000
-
-The ramdisk image could be stored into flash as well, but there are better
-solutions for on-flash filesystems as mentioned below.
-
-
-Using JFFS2
------------
-
-Using JFFS2 (the Second Journalling Flash File System) is probably the most
-convenient way to store a writable filesystem into flash.  JFFS2 is used in
-conjunction with the MTD layer which is responsible for low-level flash
-management.  More information on the Linux MTD can be found on-line at:
-http://www.linux-mtd.infradead.org/.  A JFFS howto with some infos about
-creating JFFS/JFFS2 images is available from the same site.
-
-For instance, a sample JFFS2 image can be retrieved from the same FTP sites
-mentioned below for the precompiled RedBoot image.
-
-To load this file:
-
-	load sample_img.jffs2 -r -b 0x100000
-
-The result should look like:
-
-RedBoot> load sample_img.jffs2 -r -b 0x100000
-Raw file loaded 0x00100000-0x00377424
-
-Now we must know the size of the unallocated flash:
-
-	fis free
-
-Result:
-
-RedBoot> fis free
-  0x500E0000 .. 0x503C0000
-
-The values above may be different depending on the size of the filesystem and
-the type of flash.  See their usage below as an example and take care of
-substituting yours appropriately.
-
-We must determine some values:
-
-size of unallocated flash:	0x503c0000 - 0x500e0000 = 0x2e0000
-size of the filesystem image:	0x00377424 - 0x00100000 = 0x277424
-
-We want to fit the filesystem image of course, but we also want to give it all
-the remaining flash space as well.  To write it:
-
-	fis unlock -f 0x500E0000 -l 0x2e0000
-	fis erase -f 0x500E0000 -l 0x2e0000
-	fis write -b 0x100000 -l 0x277424 -f 0x500E0000
-	fis create "JFFS2" -n -f 0x500E0000 -l 0x2e0000
-
-Now the filesystem is associated to a MTD "partition" once Linux has discovered
-what they are in the boot process.  From Redboot, the 'fis list' command
-displays them:
-
-RedBoot> fis list
-Name              FLASH addr  Mem addr    Length      Entry point
-RedBoot           0x50000000  0x50000000  0x00020000  0x00000000
-RedBoot config    0x503C0000  0x503C0000  0x00020000  0x00000000
-FIS directory     0x503E0000  0x503E0000  0x00020000  0x00000000
-Linux kernel      0x50020000  0x00100000  0x000C0000  0x00000000
-JFFS2             0x500E0000  0x500E0000  0x002E0000  0x00000000
-
-However Linux should display something like:
-
-SA1100 flash: probing 32-bit flash bus
-SA1100 flash: Found 2 x16 devices at 0x0 in 32-bit mode
-Using RedBoot partition definition
-Creating 5 MTD partitions on "SA1100 flash":
-0x00000000-0x00020000 : "RedBoot"
-0x00020000-0x000e0000 : "Linux kernel"
-0x000e0000-0x003c0000 : "JFFS2"
-0x003c0000-0x003e0000 : "RedBoot config"
-0x003e0000-0x00400000 : "FIS directory"
-
-What's important here is the position of the partition we are interested in,
-which is the third one.  Within Linux, this correspond to /dev/mtdblock2.
-Therefore to boot Linux with the kernel and its root filesystem in flash, we
-need this RedBoot command:
-
-	fis load "Linux kernel"
-	exec -b 0x100000 -l 0xc0000 -c "root=/dev/mtdblock2"
-
-Of course other filesystems than JFFS might be used, like cramfs for example.
-You might want to boot with a root filesystem over NFS, etc.  It is also
-possible, and sometimes more convenient, to flash a filesystem directly from
-within Linux while booted from a ramdisk or NFS.  The Linux MTD repository has
-many tools to deal with flash memory as well, to erase it for example.  JFFS2
-can then be mounted directly on a freshly erased partition and files can be
-copied over directly.  Etc...
-
-
-RedBoot scripting
------------------
-
-All the commands above aren't so useful if they have to be typed in every
-time the Assabet is rebooted.  Therefore it's possible to automate the boot
-process using RedBoot's scripting capability.
-
-For example, I use this to boot Linux with both the kernel and the ramdisk
-images retrieved from a TFTP server on the network:
-
-RedBoot> fconfig
-Run script at boot: false true
-Boot script:
-Enter script, terminate with empty line
->> load zImage -r -b 0x100000
->> load ramdisk_ks.gz -r -b 0x800000
->> exec -b 0x100000 -l 0xc0000
->>
-Boot script timeout (1000ms resolution): 3
-Use BOOTP for network configuration: true
-GDB connection port: 9000
-Network debug at boot time: false
-Update RedBoot non-volatile configuration - are you sure (y/n)? y
-
-Then, rebooting the Assabet is just a matter of waiting for the login prompt.
-
-
-
-Nicolas Pitre
-nico@fluxnic.net
-June 12, 2001
-
-
-Status of peripherals in -rmk tree (updated 14/10/2001)
--------------------------------------------------------
-
-Assabet:
- Serial ports:
-  Radio:		TX, RX, CTS, DSR, DCD, RI
-   PM:			Not tested.
-  COM:			TX, RX, CTS, DSR, DCD, RTS, DTR, PM
-   PM:			Not tested.
-  I2C:			Implemented, not fully tested.
-  L3:			Fully tested, pass.
-   PM:			Not tested.
-
- Video:
-  LCD:			Fully tested.  PM
-			(LCD doesn't like being blanked with
-			 neponset connected)
-  Video out:		Not fully
-
- Audio:
-  UDA1341:
-   Playback:		Fully tested, pass.
-   Record:		Implemented, not tested.
-   PM:			Not tested.
-
-  UCB1200:
-   Audio play:		Implemented, not heavily tested.
-   Audio rec:		Implemented, not heavily tested.
-   Telco audio play:	Implemented, not heavily tested.
-   Telco audio rec:	Implemented, not heavily tested.
-   POTS control:	No
-   Touchscreen:		Yes
-   PM:			Not tested.
-
- Other:
-  PCMCIA:
-   LPE:			Fully tested, pass.
-  USB:			No
-  IRDA:
-   SIR:			Fully tested, pass.
-   FIR:			Fully tested, pass.
-   PM:			Not tested.
-
-Neponset:
- Serial ports:
-  COM1,2:	TX, RX, CTS, DSR, DCD, RTS, DTR
-   PM:			Not tested.
-  USB:			Implemented, not heavily tested.
-  PCMCIA:		Implemented, not heavily tested.
-   PM:			Not tested.
-  CF:			Implemented, not heavily tested.
-   PM:			Not tested.
-
-More stuff can be found in the -np (Nicolas Pitre's) tree.
-
diff --git a/Documentation/arm/SA1100/Brutus b/Documentation/arm/SA1100/Brutus
deleted file mode 100644
index 6a3aa95e9bfd..000000000000
--- a/Documentation/arm/SA1100/Brutus
+++ /dev/null
@@ -1,66 +0,0 @@
-Brutus is an evaluation platform for the SA1100 manufactured by Intel.  
-For more details, see:
-
-http://developer.intel.com
-
-To compile for Brutus, you must issue the following commands:
-
-	make brutus_config
-	make config
-	[accept all the defaults]
-	make zImage
-
-The resulting kernel will end up in linux/arch/arm/boot/zImage.  This file
-must be loaded at 0xc0008000 in Brutus's memory and execution started at
-0xc0008000 as well with the value of registers r0 = 0 and r1 = 16 upon
-entry.
-
-But prior to execute the kernel, a ramdisk image must also be loaded in
-memory.  Use memory address 0xd8000000 for this.  Note that the file 
-containing the (compressed) ramdisk image must not exceed 4 MB.
-
-Typically, you'll need angelboot to load the kernel.
-The following angelboot.opt file should be used:
-
------ begin angelboot.opt -----
-base 0xc0008000
-entry 0xc0008000
-r0 0x00000000
-r1 0x00000010
-device /dev/ttyS0
-options "9600 8N1"
-baud 115200
-otherfile ramdisk_img.gz
-otherbase 0xd8000000
------ end angelboot.opt -----
-
-Then load the kernel and ramdisk with:
-
-	angelboot -f angelboot.opt zImage
-
-The first Brutus serial port (assumed to be linked to /dev/ttyS0 on your
-host PC) is used by angel to load the kernel and ramdisk image. The serial
-console is provided through the second Brutus serial port. To access it,
-you may use minicom configured with /dev/ttyS1, 9600 baud, 8N1, no flow
-control.
-
-Currently supported:
-	- RS232 serial ports
-	- audio output
-	- LCD screen
-	- keyboard
-	
-The actual Brutus support may not be complete without extra patches. 
-If such patches exist, they should be found from 
-ftp.netwinder.org/users/n/nico.
-
-A full PCMCIA support is still missing, although it's possible to hack
-some drivers in order to drive already inserted cards at boot time with
-little modifications.
-
-Any contribution is welcome.
-
-Please send patches to nico@fluxnic.net
-
-Have Fun !
-
diff --git a/Documentation/arm/SA1100/CERF b/Documentation/arm/SA1100/CERF
deleted file mode 100644
index b3d845301ef1..000000000000
--- a/Documentation/arm/SA1100/CERF
+++ /dev/null
@@ -1,29 +0,0 @@
-*** The StrongARM version of the CerfBoard/Cube has been discontinued ***
-
-The Intrinsyc CerfBoard is a StrongARM 1110-based computer on a board
-that measures approximately 2" square. It includes an Ethernet
-controller, an RS232-compatible serial port, a USB function port, and
-one CompactFlash+ slot on the back. Pictures can be found at the
-Intrinsyc website, http://www.intrinsyc.com.
-
-This document describes the support in the Linux kernel for the
-Intrinsyc CerfBoard.
-
-Supported in this version:
-   - CompactFlash+ slot (select PCMCIA in General Setup and any options
-     that may be required)
-   - Onboard Crystal CS8900 Ethernet controller (Cerf CS8900A support in
-     Network Devices)
-   - Serial ports with a serial console (hardcoded to 38400 8N1)
-
-In order to get this kernel onto your Cerf, you need a server that runs
-both BOOTP and TFTP. Detailed instructions should have come with your
-evaluation kit on how to use the bootloader. This series of commands
-will suffice:
-
-   make ARCH=arm CROSS_COMPILE=arm-linux- cerfcube_defconfig
-   make ARCH=arm CROSS_COMPILE=arm-linux- zImage
-   make ARCH=arm CROSS_COMPILE=arm-linux- modules
-   cp arch/arm/boot/zImage <TFTP directory>
-
-support@intrinsyc.com
diff --git a/Documentation/arm/SA1100/FreeBird b/Documentation/arm/SA1100/FreeBird
deleted file mode 100644
index ab9193663b2b..000000000000
--- a/Documentation/arm/SA1100/FreeBird
+++ /dev/null
@@ -1,21 +0,0 @@
-Freebird-1.1 is produced by Legend(C), Inc.
-http://web.archive.org/web/*/http://www.legend.com.cn
-and software/linux maintained by Coventive(C), Inc.
-(http://www.coventive.com)
-
-Based on the Nicolas's strongarm kernel tree.
-
-===============================================================
-Maintainer:
-
-Chester Kuo <chester@coventive.com>
-	    <chester@linux.org.tw>
-
-Author :
-Tim wu <timwu@coventive.com>
-CIH <cih@coventive.com>
-Eric Peng <ericpeng@coventive.com>
-Jeff Lee <jeff_lee@coventive.com>
-Allen Cheng
-Tony Liu <tonyliu@coventive.com>
-
diff --git a/Documentation/arm/SA1100/GraphicsClient b/Documentation/arm/SA1100/GraphicsClient
deleted file mode 100644
index 867bb35943af..000000000000
--- a/Documentation/arm/SA1100/GraphicsClient
+++ /dev/null
@@ -1,98 +0,0 @@
-ADS GraphicsClient Plus Single Board Computer
-
-For more details, contact Applied Data Systems or see
-http://www.applieddata.net/products.html
-
-The original Linux support for this product has been provided by 
-Nicolas Pitre <nico@fluxnic.net>. Continued development work by
-Woojung Huh <whuh@applieddata.net>
-
-It's currently possible to mount a root filesystem via NFS providing a
-complete Linux environment.  Otherwise a ramdisk image may be used.  The
-board supports MTD/JFFS, so you could also mount something on there.
-
-Use 'make graphicsclient_config' before any 'make config'.  This will set up
-defaults for GraphicsClient Plus support.
-
-The kernel zImage is linked to be loaded and executed at 0xc0200000.  
-Also the following registers should have the specified values upon entry:
-
-	r0 = 0
-	r1 = 29	(this is the GraphicsClient architecture number)
-
-Linux can  be used with the ADS BootLoader that ships with the
-newer rev boards. See their documentation on how to load Linux.
-Angel is not available for the GraphicsClient Plus AFAIK.
-
-There is a  board known as just the GraphicsClient that ADS used to
-produce but has end of lifed. This code will not work on the older
-board with the ADS bootloader, but should still work with Angel,
-as outlined below.  In any case, if you're planning on deploying
-something en masse, you should probably get the newer board.
-
-If using Angel on the older boards, here is a typical angel.opt option file
-if the kernel is loaded through the Angel Debug Monitor:
-
------ begin angelboot.opt -----
-base 0xc0200000
-entry 0xc0200000
-r0 0x00000000
-r1 0x0000001d
-device /dev/ttyS1
-options "38400 8N1"
-baud 115200
-#otherfile ramdisk.gz
-#otherbase 0xc0800000
-exec minicom
------ end angelboot.opt -----
-
-Then the kernel (and ramdisk if otherfile/otherbase lines above are
-uncommented) would be loaded with:
-
-	angelboot -f angelboot.opt zImage
-
-Here it is assumed that the board is connected to ttyS1 on your PC
-and that minicom is preconfigured with /dev/ttyS1, 38400 baud, 8N1, no flow
-control by default.
-
-If any other bootloader is used, ensure it accomplish the same, especially
-for r0/r1 register values before jumping into the kernel.
-
-
-Supported peripherals:
-- SA1100 LCD frame buffer (8/16bpp...sort of)
-- on-board SMC 92C96 ethernet NIC
-- SA1100 serial port
-- flash memory access (MTD/JFFS)
-- pcmcia
-- touchscreen(ucb1200)
-- ps/2 keyboard
-- console on LCD screen
-- serial ports (ttyS[0-2])
-  - ttyS0 is default for serial console
-- Smart I/O (ADC, keypad, digital inputs, etc)
-  See http://www.eurotech-inc.com/linux-sbc.asp for IOCTL documentation
-  and example user space code. ps/2 keybd is multiplexed through this driver
-
-To do:
-- UCB1200 audio with new ucb_generic layer
-- everything else!  :-)
-
-Notes:
-
-- The flash on board is divided into 3 partitions.  mtd0 is where
-  the ADS boot ROM and zImage is stored.  It's been marked as
-  read-only to keep you from blasting over the bootloader. :)  mtd1 is
-  for the ramdisk.gz image.  mtd2 is user flash space and can be
-  utilized for either JFFS or if you're feeling crazy, running ext2
-  on top of it. If you're not using the ADS bootloader, you're
-  welcome to blast over the mtd1 partition also.
-
-- 16bpp mode requires a different cable than what ships with the board.
-  Contact ADS or look through the manual to wire your own. Currently,
-  if you compile with 16bit mode support and switch into a lower bpp
-  mode, the timing is off so the image is corrupted.  This will be
-  fixed soon.
-
-Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
-
diff --git a/Documentation/arm/SA1100/GraphicsMaster b/Documentation/arm/SA1100/GraphicsMaster
deleted file mode 100644
index 9145088a0ba2..000000000000
--- a/Documentation/arm/SA1100/GraphicsMaster
+++ /dev/null
@@ -1,53 +0,0 @@
-ADS GraphicsMaster Single Board Computer
-
-For more details, contact Applied Data Systems or see
-http://www.applieddata.net/products.html
-
-The original Linux support for this product has been provided by
-Nicolas Pitre <nico@fluxnic.net>. Continued development work by
-Woojung Huh <whuh@applieddata.net>
-
-Use 'make graphicsmaster_config' before any 'make config'.
-This will set up defaults for GraphicsMaster support.
-
-The kernel zImage is linked to be loaded and executed at 0xc0400000.
-
-Linux can  be used with the ADS BootLoader that ships with the
-newer rev boards. See their documentation on how to load Linux.
-
-Supported peripherals:
-- SA1100 LCD frame buffer (8/16bpp...sort of)
-- SA1111 USB Master
-- on-board SMC 92C96 ethernet NIC
-- SA1100 serial port
-- flash memory access (MTD/JFFS)
-- pcmcia, compact flash
-- touchscreen(ucb1200)
-- ps/2 keyboard
-- console on LCD screen
-- serial ports (ttyS[0-2])
-  - ttyS0 is default for serial console
-- Smart I/O (ADC, keypad, digital inputs, etc)
-  See http://www.eurotech-inc.com/linux-sbc.asp for IOCTL documentation
-  and example user space code. ps/2 keybd is multiplexed through this driver
-
-To do:
-- everything else!  :-)
-
-Notes:
-
-- The flash on board is divided into 3 partitions.  mtd0 is where
-  the zImage is stored.  It's been marked as read-only to keep you
-  from blasting over the bootloader. :)  mtd1 is
-  for the ramdisk.gz image.  mtd2 is user flash space and can be
-  utilized for either JFFS or if you're feeling crazy, running ext2
-  on top of it. If you're not using the ADS bootloader, you're
-  welcome to blast over the mtd1 partition also.
-
-- 16bpp mode requires a different cable than what ships with the board.
-  Contact ADS or look through the manual to wire your own. Currently,
-  if you compile with 16bit mode support and switch into a lower bpp
-  mode, the timing is off so the image is corrupted.  This will be
-  fixed soon.
-
-Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/SA1100/HUW_WEBPANEL b/Documentation/arm/SA1100/HUW_WEBPANEL
deleted file mode 100644
index fd56b48d4833..000000000000
--- a/Documentation/arm/SA1100/HUW_WEBPANEL
+++ /dev/null
@@ -1,17 +0,0 @@
-The HUW_WEBPANEL is a product of the german company Hoeft & Wessel AG
-
-If you want more information, please visit
-http://www.hoeft-wessel.de
-
-To build the kernel:
-	make huw_webpanel_config
-	make oldconfig
-	[accept all defaults]
-	make zImage
-
-Mostly of the work is done by:
-Roman Jordan         jor@hoeft-wessel.de
-Christoph Schulz    schu@hoeft-wessel.de
-
-2000/12/18/
-
diff --git a/Documentation/arm/SA1100/Itsy b/Documentation/arm/SA1100/Itsy
deleted file mode 100644
index 44b94997fa0d..000000000000
--- a/Documentation/arm/SA1100/Itsy
+++ /dev/null
@@ -1,39 +0,0 @@
-Itsy is a research project done by the Western Research Lab, and Systems
-Research Center in Palo Alto, CA. The Itsy project is one of several
-research projects at Compaq that are related to pocket computing.
-
-For more information, see:
-
-	http://www.hpl.hp.com/downloads/crl/itsy/
-
-Notes on initial 2.4 Itsy support (8/27/2000) :
-The port was done on an Itsy version 1.5 machine with a daughtercard with
-64 Meg of DRAM and 32 Meg of Flash. The initial work includes support for
-serial console (to see what you're doing).  No other devices have been
-enabled.
-
-To build, do a "make menuconfig" (or xmenuconfig) and select Itsy support.
-Disable Flash and LCD support. and then do a make zImage.
-Finally, you will need to cd to arch/arm/boot/tools and execute a make there
-to build the params-itsy program used to boot the kernel.
-
-In order to install the port of 2.4 to the itsy, You will need to set the
-configuration parameters in the monitor as follows:
-Arg 1:0x08340000, Arg2: 0xC0000000, Arg3:18 (0x12), Arg4:0
-Make sure the start-routine address is set to 0x00060000.
-
-Next, flash the params-itsy program to 0x00060000 ("p 1 0x00060000" in the
-flash menu)  Flash the kernel in arch/arm/boot/zImage into 0x08340000
-("p 1 0x00340000").  Finally flash an initial ramdisk into 0xC8000000
-("p 2 0x0")  We used ramdisk-2-30.gz from the 0.11 version directory on
-handhelds.org.
-
-The serial connection we established was at:
- 8-bit data, no parity, 1 stop bit(s), 115200.00 b/s. in the monitor, in the
-params-itsy program, and in the kernel itself.  This can be changed, but
-not easily. The monitor parameters are easily changed, the params program
-setup is assembly outl's, and the kernel is a configuration item specific to
-the itsy. (i.e. grep for CONFIG_SA1100_ITSY and you'll find where it is.)
-
-
-This should get you a properly booting 2.4 kernel on the itsy.
diff --git a/Documentation/arm/SA1100/LART b/Documentation/arm/SA1100/LART
deleted file mode 100644
index 6d412b685598..000000000000
--- a/Documentation/arm/SA1100/LART
+++ /dev/null
@@ -1,14 +0,0 @@
-Linux Advanced Radio Terminal (LART)
-------------------------------------
-
-The LART is a small (7.5 x 10cm) SA-1100 board, designed for embedded
-applications. It has 32 MB DRAM, 4MB Flash ROM, double RS232 and all
-other StrongARM-gadgets. Almost all SA signals are directly accessible
-through a number of connectors. The powersupply accepts voltages
-between 3.5V and 16V and is overdimensioned to support a range of
-daughterboards. A quad Ethernet / IDE / PS2 / sound daughterboard
-is under development, with plenty of others in different stages of
-planning.
-
-The hardware designs for this board have been released under an open license;
-see the LART page at http://www.lartmaker.nl/ for more information.
diff --git a/Documentation/arm/SA1100/PLEB b/Documentation/arm/SA1100/PLEB
deleted file mode 100644
index b9c8a631a351..000000000000
--- a/Documentation/arm/SA1100/PLEB
+++ /dev/null
@@ -1,11 +0,0 @@
-The PLEB project was started as a student initiative at the School of
-Computer Science and Engineering, University of New South Wales to make a
-pocket computer capable of running the Linux Kernel.
-
-PLEB support has yet to be fully integrated.
-
-For more information, see:
-
-	http://www.cse.unsw.edu.au
-
-
diff --git a/Documentation/arm/SA1100/Pangolin b/Documentation/arm/SA1100/Pangolin
deleted file mode 100644
index 077a6120e129..000000000000
--- a/Documentation/arm/SA1100/Pangolin
+++ /dev/null
@@ -1,23 +0,0 @@
-Pangolin is a StrongARM 1110-based evaluation platform produced
-by Dialogue Technology (http://www.dialogue.com.tw/).
-It has EISA slots for ease of configuration with SDRAM/Flash
-memory card, USB/Serial/Audio card, Compact Flash card,
-PCMCIA/IDE card and TFT-LCD card.
-
-To compile for Pangolin, you must issue the following commands:
-
-	make pangolin_config
-	make oldconfig
-	make zImage
-
-Supported peripherals:
-- SA1110 serial port (UART1/UART2/UART3)
-- flash memory access
-- compact flash driver
-- UDA1341 sound driver
-- SA1100 LCD controller for 800x600 16bpp TFT-LCD
-- MQ-200 driver for 800x600 16bpp TFT-LCD
-- Penmount(touch panel) driver
-- PCMCIA driver
-- SMC91C94 LAN driver
-- IDE driver (experimental)
diff --git a/Documentation/arm/SA1100/Tifon b/Documentation/arm/SA1100/Tifon
deleted file mode 100644
index dd1934d9c851..000000000000
--- a/Documentation/arm/SA1100/Tifon
+++ /dev/null
@@ -1,7 +0,0 @@
-Tifon
------
-
-More info has to come...
-
-Contact: Peter Danielsson <peter.danielsson@era-t.ericsson.se>
-
diff --git a/Documentation/arm/SA1100/Yopy b/Documentation/arm/SA1100/Yopy
deleted file mode 100644
index e14f16d836ac..000000000000
--- a/Documentation/arm/SA1100/Yopy
+++ /dev/null
@@ -1,2 +0,0 @@
-See http://www.yopydeveloper.org for more.
-
diff --git a/Documentation/arm/SA1100/empeg b/Documentation/arm/SA1100/empeg
deleted file mode 100644
index 4ece4849a42c..000000000000
--- a/Documentation/arm/SA1100/empeg
+++ /dev/null
@@ -1,2 +0,0 @@
-See ../empeg/README
-
diff --git a/Documentation/arm/SA1100/nanoEngine b/Documentation/arm/SA1100/nanoEngine
deleted file mode 100644
index 48a7934f95f6..000000000000
--- a/Documentation/arm/SA1100/nanoEngine
+++ /dev/null
@@ -1,11 +0,0 @@
-nanoEngine
-----------
-
-"nanoEngine" is a SA1110 based single board computer from 
-Bright Star Engineering Inc.  See www.brightstareng.com/arm
-for more info.
-(Ref: Stuart Adams <sja@brightstareng.com>)
-
-Also visit Larry Doolittle's "Linux for the nanoEngine" site:
-http://www.brightstareng.com/arm/nanoeng.htm
-
diff --git a/Documentation/arm/SA1100/serial_UART b/Documentation/arm/SA1100/serial_UART
deleted file mode 100644
index a63966f1d083..000000000000
--- a/Documentation/arm/SA1100/serial_UART
+++ /dev/null
@@ -1,47 +0,0 @@
-The SA1100 serial port had its major/minor numbers officially assigned:
-
-> Date: Sun, 24 Sep 2000 21:40:27 -0700
-> From: H. Peter Anvin <hpa@transmeta.com>
-> To: Nicolas Pitre <nico@CAM.ORG>
-> Cc: Device List Maintainer <device@lanana.org>
-> Subject: Re: device
-> 
-> Okay.  Note that device numbers 204 and 205 are used for "low density
-> serial devices", so you will have a range of minors on those majors (the
-> tty device layer handles this just fine, so you don't have to worry about
-> doing anything special.)
-> 
-> So your assignments are:
-> 
-> 204 char        Low-density serial ports
->                   5 = /dev/ttySA0               SA1100 builtin serial port 0
->                   6 = /dev/ttySA1               SA1100 builtin serial port 1
->                   7 = /dev/ttySA2               SA1100 builtin serial port 2
-> 
-> 205 char        Low-density serial ports (alternate device)
->                   5 = /dev/cusa0                Callout device for ttySA0
->                   6 = /dev/cusa1                Callout device for ttySA1
->                   7 = /dev/cusa2                Callout device for ttySA2
->
-
-You must create those inodes in /dev on the root filesystem used
-by your SA1100-based device:
-
-	mknod ttySA0 c 204 5
-	mknod ttySA1 c 204 6
-	mknod ttySA2 c 204 7
-	mknod cusa0 c 205 5
-	mknod cusa1 c 205 6
-	mknod cusa2 c 205 7
-
-In addition to the creation of the appropriate device nodes above, you
-must ensure your user space applications make use of the correct device
-name. The classic example is the content of the /etc/inittab file where
-you might have a getty process started on ttyS0.  In this case:
-
-- replace occurrences of ttyS0 with ttySA0, ttyS1 with ttySA1, etc.
-
-- don't forget to add 'ttySA0', 'console', or the appropriate tty name
-  in /etc/securetty for root to be allowed to login as well.
-
-
diff --git a/Documentation/arm/SPEAr/overview.txt b/Documentation/arm/SPEAr/overview.txt
deleted file mode 100644
index 1b049be6c84f..000000000000
--- a/Documentation/arm/SPEAr/overview.txt
+++ /dev/null
@@ -1,63 +0,0 @@
-			SPEAr ARM Linux Overview
-			==========================
-
-Introduction
-------------
-
-  SPEAr (Structured Processor Enhanced Architecture).
-  weblink : http://www.st.com/spear
-
-  The ST Microelectronics SPEAr range of ARM9/CortexA9 System-on-Chip CPUs are
-  supported by the 'spear' platform of ARM Linux. Currently SPEAr1310,
-  SPEAr1340, SPEAr300, SPEAr310, SPEAr320 and SPEAr600 SOCs are supported.
-
-  Hierarchy in SPEAr is as follows:
-
-  SPEAr (Platform)
-	- SPEAr3XX (3XX SOC series, based on ARM9)
-		- SPEAr300 (SOC)
-			- SPEAr300 Evaluation Board
-		- SPEAr310 (SOC)
-			- SPEAr310 Evaluation Board
-		- SPEAr320 (SOC)
-			- SPEAr320 Evaluation Board
-	- SPEAr6XX (6XX SOC series, based on ARM9)
-		- SPEAr600 (SOC)
-			- SPEAr600 Evaluation Board
-	- SPEAr13XX (13XX SOC series, based on ARM CORTEXA9)
-		- SPEAr1310 (SOC)
-			- SPEAr1310 Evaluation Board
-		- SPEAr1340 (SOC)
-			- SPEAr1340 Evaluation Board
-
-  Configuration
-  -------------
-
-  A generic configuration is provided for each machine, and can be used as the
-  default by
-	make spear13xx_defconfig
-	make spear3xx_defconfig
-	make spear6xx_defconfig
-
-  Layout
-  ------
-
-  The common files for multiple machine families (SPEAr3xx, SPEAr6xx and
-  SPEAr13xx) are located in the platform code contained in arch/arm/plat-spear
-  with headers in plat/.
-
-  Each machine series have a directory with name arch/arm/mach-spear followed by
-  series name. Like mach-spear3xx, mach-spear6xx and mach-spear13xx.
-
-  Common file for machines of spear3xx family is mach-spear3xx/spear3xx.c, for
-  spear6xx is mach-spear6xx/spear6xx.c and for spear13xx family is
-  mach-spear13xx/spear13xx.c. mach-spear* also contain soc/machine specific
-  files, like spear1310.c, spear1340.c spear300.c, spear310.c, spear320.c and
-  spear600.c.  mach-spear* doesn't contains board specific files as they fully
-  support Flattened Device Tree.
-
-
-  Document Author
-  ---------------
-
-  Viresh Kumar <vireshk@kernel.org>, (c) 2010-2012 ST Microelectronics
diff --git a/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt b/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt
deleted file mode 100644
index fa968aa99d67..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-		S3C24XX CPUfreq support
-		=======================
-
-Introduction
-------------
-
- The S3C24XX series support a number of power saving systems, such as
- the ability to change the core, memory and peripheral operating
- frequencies. The core control is exported via the CPUFreq driver
- which has a number of different manual or automatic controls over the
- rate the core is running at.
-
- There are two forms of the driver depending on the specific CPU and
- how the clocks are arranged. The first implementation used as single
- PLL to feed the ARM, memory and peripherals via a series of dividers
- and muxes and this is the implementation that is documented here. A
- newer version where there is a separate PLL and clock divider for the
- ARM core is available as a separate driver.
-
-
-Layout
-------
-
- The code core manages the CPU specific drivers, any data that they
- need to register and the interface to the generic drivers/cpufreq
- system. Each CPU registers a driver to control the PLL, clock dividers
- and anything else associated with it. Any board that wants to use this
- framework needs to supply at least basic details of what is required.
-
- The core registers with drivers/cpufreq at init time if all the data
- necessary has been supplied.
-
-
-CPU support
------------
-
- The support for each CPU depends on the facilities provided by the
- SoC and the driver as each device has different PLL and clock chains
- associated with it.
-
-
-Slow Mode
----------
-
- The SLOW mode where the PLL is turned off altogether and the
- system is fed by the external crystal input is currently not
- supported.
-
-
-sysfs
------
-
- The core code exports extra information via sysfs in the directory
- devices/system/cpu/cpu0/arch-freq.
-
-
-Board Support
--------------
-
- Each board that wants to use the cpufreq code must register some basic
- information with the core driver to provide information about what the
- board requires and any restrictions being placed on it.
-
- The board needs to supply information about whether it needs the IO bank
- timings changing, any maximum frequency limits and information about the
- SDRAM refresh rate.
-
-
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2009 Simtec Electronics
-Licensed under GPLv2
diff --git a/Documentation/arm/Samsung-S3C24XX/EB2410ITX.txt b/Documentation/arm/Samsung-S3C24XX/EB2410ITX.txt
deleted file mode 100644
index b87292e05f2f..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/EB2410ITX.txt
+++ /dev/null
@@ -1,58 +0,0 @@
-		Simtec Electronics EB2410ITX (BAST)
-		===================================
-
-	http://www.simtec.co.uk/products/EB2410ITX/
-
-Introduction
-------------
-
-  The EB2410ITX is a S3C2410 based development board with a variety of
-  peripherals and expansion connectors. This board is also known by
-  the shortened name of Bast.
-
-
-Configuration
--------------
-
-  To set the default configuration, use `make bast_defconfig` which
-  supports the commonly used features of this board.
-
-
-Support
--------
-
-  Official support information can be found on the Simtec Electronics
-  website, at the product page http://www.simtec.co.uk/products/EB2410ITX/
-
-  Useful links:
-
-    - Resources Page http://www.simtec.co.uk/products/EB2410ITX/resources.html
-
-    - Board FAQ at http://www.simtec.co.uk/products/EB2410ITX/faq.html
-
-    - Bootloader info http://www.simtec.co.uk/products/SWABLE/resources.html
-      and FAQ http://www.simtec.co.uk/products/SWABLE/faq.html
-
-
-MTD
----
-
-  The NAND and NOR support has been merged from the linux-mtd project.
-  Any problems, see http://www.linux-mtd.infradead.org/ for more
-  information or up-to-date versions of linux-mtd.
-
-
-IDE
----
-
-  Both onboard IDE ports are supported, however there is no support for
-  changing speed of devices, PIO Mode 4 capable drives should be used.
-
-
-Maintainers
------------
-
-  This board is maintained by Simtec Electronics.
-
-
-Copyright 2004 Ben Dooks, Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/GPIO.txt b/Documentation/arm/Samsung-S3C24XX/GPIO.txt
deleted file mode 100644
index e8f918b96123..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/GPIO.txt
+++ /dev/null
@@ -1,171 +0,0 @@
-			S3C24XX GPIO Control
-			====================
-
-Introduction
-------------
-
-  The s3c2410 kernel provides an interface to configure and
-  manipulate the state of the GPIO pins, and find out other
-  information about them.
-
-  There are a number of conditions attached to the configuration
-  of the s3c2410 GPIO system, please read the Samsung provided
-  data-sheet/users manual to find out the complete list.
-
-  See Documentation/arm/Samsung/GPIO.txt for the core implementation.
-
-
-GPIOLIB
--------
-
-  With the event of the GPIOLIB in drivers/gpio, support for some
-  of the GPIO functions such as reading and writing a pin will
-  be removed in favour of this common access method.
-
-  Once all the extant drivers have been converted, the functions
-  listed below will be removed (they may be marked as __deprecated
-  in the near future).
-
-  The following functions now either have a s3c_ specific variant
-  or are merged into gpiolib. See the definitions in
-  arch/arm/plat-samsung/include/plat/gpio-cfg.h:
-
-  s3c2410_gpio_setpin()		gpio_set_value() or gpio_direction_output()
-  s3c2410_gpio_getpin()		gpio_get_value() or gpio_direction_input()
-  s3c2410_gpio_getirq()		gpio_to_irq()
-  s3c2410_gpio_cfgpin()		s3c_gpio_cfgpin()
-  s3c2410_gpio_getcfg()		s3c_gpio_getcfg()
-  s3c2410_gpio_pullup()		s3c_gpio_setpull()
-
-
-GPIOLIB conversion
-------------------
-
-If you need to convert your board or driver to use gpiolib from the phased
-out s3c2410 API, then here are some notes on the process.
-
-1) If your board is exclusively using an GPIO, say to control peripheral
-   power, then it will require to claim the gpio with gpio_request() before
-   it can use it.
-
-   It is recommended to check the return value, with at least WARN_ON()
-   during initialisation.
-
-2) The s3c2410_gpio_cfgpin() can be directly replaced with s3c_gpio_cfgpin()
-   as they have the same arguments, and can either take the pin specific
-   values, or the more generic special-function-number arguments.
-
-3) s3c2410_gpio_pullup() changes have the problem that while the
-   s3c2410_gpio_pullup(x, 1) can be easily translated to the
-   s3c_gpio_setpull(x, S3C_GPIO_PULL_NONE), the s3c2410_gpio_pullup(x, 0)
-   are not so easy.
-
-   The s3c2410_gpio_pullup(x, 0) case enables the pull-up (or in the case
-   of some of the devices, a pull-down) and as such the new API distinguishes
-   between the UP and DOWN case. There is currently no 'just turn on' setting
-   which may be required if this becomes a problem.
-
-4) s3c2410_gpio_setpin() can be replaced by gpio_set_value(), the old call
-   does not implicitly configure the relevant gpio to output. The gpio
-   direction should be changed before using gpio_set_value().
-
-5) s3c2410_gpio_getpin() is replaceable by gpio_get_value() if the pin
-   has been set to input. It is currently unknown what the behaviour is
-   when using gpio_get_value() on an output pin (s3c2410_gpio_getpin
-   would return the value the pin is supposed to be outputting).
-
-6) s3c2410_gpio_getirq() should be directly replaceable with the
-   gpio_to_irq() call.
-
-The s3c2410_gpio and gpio_ calls have always operated on the same gpio
-numberspace, so there is no problem with converting the gpio numbering
-between the calls.
-
-
-Headers
--------
-
-  See arch/arm/mach-s3c24xx/include/mach/regs-gpio.h for the list
-  of GPIO pins, and the configuration values for them. This
-  is included by using #include <mach/regs-gpio.h>
-
-
-PIN Numbers
------------
-
-  Each pin has an unique number associated with it in regs-gpio.h,
-  e.g. S3C2410_GPA(0) or S3C2410_GPF(1). These defines are used to tell
-  the GPIO functions which pin is to be used.
-
-  With the conversion to gpiolib, there is no longer a direct conversion
-  from gpio pin number to register base address as in earlier kernels. This
-  is due to the number space required for newer SoCs where the later
-  GPIOs are not contiguous.
-
-
-Configuring a pin
------------------
-
-  The following function allows the configuration of a given pin to
-  be changed.
-
-    void s3c_gpio_cfgpin(unsigned int pin, unsigned int function);
-
-  e.g.:
-
-     s3c_gpio_cfgpin(S3C2410_GPA(0), S3C_GPIO_SFN(1));
-     s3c_gpio_cfgpin(S3C2410_GPE(8), S3C_GPIO_SFN(2));
-
-   which would turn GPA(0) into the lowest Address line A0, and set
-   GPE(8) to be connected to the SDIO/MMC controller's SDDAT1 line.
-
-
-Reading the current configuration
----------------------------------
-
-  The current configuration of a pin can be read by using standard
-  gpiolib function:
-
-  s3c_gpio_getcfg(unsigned int pin);
-
-  The return value will be from the same set of values which can be
-  passed to s3c_gpio_cfgpin().
-
-
-Configuring a pull-up resistor
-------------------------------
-
-  A large proportion of the GPIO pins on the S3C2410 can have weak
-  pull-up resistors enabled. This can be configured by the following
-  function:
-
-    void s3c_gpio_setpull(unsigned int pin, unsigned int to);
-
-  Where the to value is S3C_GPIO_PULL_NONE to set the pull-up off,
-  and S3C_GPIO_PULL_UP to enable the specified pull-up. Any other
-  values are currently undefined.
-
-
-Getting and setting the state of a PIN
---------------------------------------
-
-  These calls are now implemented by the relevant gpiolib calls, convert
-  your board or driver to use gpiolib.
-
-
-Getting the IRQ number associated with a PIN
---------------------------------------------
-
-  A standard gpiolib function can map the given pin number to an IRQ
-  number to pass to the IRQ system.
-
-   int gpio_to_irq(unsigned int pin);
-
-  Note, not all pins have an IRQ.
-
-
-Author
--------
-
-Ben Dooks, 03 October 2004
-Copyright 2004 Ben Dooks, Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/H1940.txt b/Documentation/arm/Samsung-S3C24XX/H1940.txt
deleted file mode 100644
index b738859b1fc0..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/H1940.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-		HP IPAQ H1940
-		=============
-
-http://www.handhelds.org/projects/h1940.html
-
-Introduction
-------------
-
-  The HP H1940 is a S3C2410 based handheld device, with
-  bluetooth connectivity.
-
-
-Support
--------
-
-  A variety of information is available
-
-  handhelds.org project page:
-
-    http://www.handhelds.org/projects/h1940.html
-
-  handhelds.org wiki page:
-
-    http://handhelds.org/moin/moin.cgi/HpIpaqH1940
-
-  Herbert Pötzl pages:
-
-    http://vserver.13thfloor.at/H1940/
-
-
-Maintainers
------------
-
-  This project is being maintained and developed by a variety
-  of people, including Ben Dooks, Arnaud Patard, and Herbert Pötzl.
-
-  Thanks to the many others who have also provided support.
-
-
-(c) 2005 Ben Dooks
diff --git a/Documentation/arm/Samsung-S3C24XX/NAND.txt b/Documentation/arm/Samsung-S3C24XX/NAND.txt
deleted file mode 100644
index bc478a3409b8..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/NAND.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-			S3C24XX NAND Support
-			====================
-
-Introduction
-------------
-
-Small Page NAND
----------------
-
-The driver uses a 512 byte (1 page) ECC code for this setup. The
-ECC code is not directly compatible with the default kernel ECC
-code, so the driver enforces its own OOB layout and ECC parameters
-
-Large Page NAND
----------------
-
-The driver is capable of handling NAND flash with a 2KiB page
-size, with support for hardware ECC generation and correction.
-
-Unlike the 512byte page mode, the driver generates ECC data for
-each 256 byte block in an 2KiB page. This means that more than
-one error in a page can be rectified. It also means that the
-OOB layout remains the default kernel layout for these flashes.
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2007 Simtec Electronics
-
diff --git a/Documentation/arm/Samsung-S3C24XX/Overview.txt b/Documentation/arm/Samsung-S3C24XX/Overview.txt
deleted file mode 100644
index 00d3c3141e21..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/Overview.txt
+++ /dev/null
@@ -1,318 +0,0 @@
-			S3C24XX ARM Linux Overview
-			==========================
-
-
-
-Introduction
-------------
-
-  The Samsung S3C24XX range of ARM9 System-on-Chip CPUs are supported
-  by the 's3c2410' architecture of ARM Linux. Currently the S3C2410,
-  S3C2412, S3C2413, S3C2416, S3C2440, S3C2442, S3C2443 and S3C2450 devices
-  are supported.
-
-  Support for the S3C2400 and S3C24A0 series was never completed and the
-  corresponding code has been removed after a while.  If someone wishes to
-  revive this effort, partial support can be retrieved from earlier Linux
-  versions.
-
-  The S3C2416 and S3C2450 devices are very similar and S3C2450 support is
-  included under the arch/arm/mach-s3c2416 directory. Note, while core
-  support for these SoCs is in, work on some of the extra peripherals
-  and extra interrupts is still ongoing.
-
-
-Configuration
--------------
-
-  A generic S3C2410 configuration is provided, and can be used as the
-  default by `make s3c2410_defconfig`. This configuration has support
-  for all the machines, and the commonly used features on them.
-
-  Certain machines may have their own default configurations as well,
-  please check the machine specific documentation.
-
-
-Layout
-------
-
-  The core support files are located in the platform code contained in
-  arch/arm/plat-s3c24xx with headers in include/asm-arm/plat-s3c24xx.
-  This directory should be kept to items shared between the platform
-  code (arch/arm/plat-s3c24xx) and the arch/arm/mach-s3c24* code.
-
-  Each cpu has a directory with the support files for it, and the
-  machines that carry the device. For example S3C2410 is contained
-  in arch/arm/mach-s3c2410 and S3C2440 in arch/arm/mach-s3c2440
-
-  Register, kernel and platform data definitions are held in the
-  arch/arm/mach-s3c2410 directory./include/mach
-
-arch/arm/plat-s3c24xx:
-
-  Files in here are either common to all the s3c24xx family,
-  or are common to only some of them with names to indicate this
-  status. The files that are not common to all are generally named
-  with the initial cpu they support in the series to ensure a short
-  name without any possibility of confusion with newer devices.
-
-  As an example, initially s3c244x would cover s3c2440 and s3c2442, but
-  with the s3c2443 which does not share many of the same drivers in
-  this directory, the name becomes invalid. We stick to s3c2440-<x>
-  to indicate a driver that is s3c2440 and s3c2442 compatible.
-
-  This does mean that to find the status of any given SoC, a number
-  of directories may need to be searched.
-
-
-Machines
---------
-
-  The currently supported machines are as follows:
-
-  Simtec Electronics EB2410ITX (BAST)
-
-    A general purpose development board, see EB2410ITX.txt for further
-    details
-
-  Simtec Electronics IM2440D20 (Osiris)
-
-    CPU Module from Simtec Electronics, with a S3C2440A CPU, nand flash
-    and a PCMCIA controller.
-
-  Samsung SMDK2410
-
-    Samsung's own development board, geared for PDA work.
-
-  Samsung/Aiji SMDK2412
-
-    The S3C2412 version of the SMDK2440.
-
-  Samsung/Aiji SMDK2413
-
-    The S3C2412 version of the SMDK2440.
-
-  Samsung/Meritech SMDK2440
-
-    The S3C2440 compatible version of the SMDK2440, which has the
-    option of an S3C2440 or S3C2442 CPU module.
-
-  Thorcom VR1000
-
-    Custom embedded board
-
-  HP IPAQ 1940
-
-    Handheld (IPAQ), available in several varieties
-
-  HP iPAQ rx3715
-
-    S3C2440 based IPAQ, with a number of variations depending on
-    features shipped.
-
-  Acer N30
-
-    A S3C2410 based PDA from Acer.  There is a Wiki page at
-    http://handhelds.org/moin/moin.cgi/AcerN30Documentation .
-
-  AML M5900
-
-    American Microsystems' M5900
-
-  Nex Vision Nexcoder
-  Nex Vision Otom
-
-    Two machines by Nex Vision
-
-
-Adding New Machines
--------------------
-
-  The architecture has been designed to support as many machines as can
-  be configured for it in one kernel build, and any future additions
-  should keep this in mind before altering items outside of their own
-  machine files.
-
-  Machine definitions should be kept in linux/arch/arm/mach-s3c2410,
-  and there are a number of examples that can be looked at.
-
-  Read the kernel patch submission policies as well as the
-  Documentation/arm directory before submitting patches. The
-  ARM kernel series is managed by Russell King, and has a patch system
-  located at http://www.arm.linux.org.uk/developer/patches/
-  as well as mailing lists that can be found from the same site.
-
-  As a courtesy, please notify <ben-linux@fluff.org> of any new
-  machines or other modifications.
-
-  Any large scale modifications, or new drivers should be discussed
-  on the ARM kernel mailing list (linux-arm-kernel) before being
-  attempted. See http://www.arm.linux.org.uk/mailinglists/ for the
-  mailing list information.
-
-
-I2C
----
-
-  The hardware I2C core in the CPU is supported in single master
-  mode, and can be configured via platform data.
-
-
-RTC
----
-
-  Support for the onboard RTC unit, including alarm function.
-
-  This has recently been upgraded to use the new RTC core,
-  and the module has been renamed to rtc-s3c to fit in with
-  the new rtc naming scheme.
-
-
-Watchdog
---------
-
-  The onchip watchdog is available via the standard watchdog
-  interface.
-
-
-NAND
-----
-
-  The current kernels now have support for the s3c2410 NAND
-  controller. If there are any problems the latest linux-mtd
-  code can be found from http://www.linux-mtd.infradead.org/
-
-  For more information see Documentation/arm/Samsung-S3C24XX/NAND.txt
-
-
-SD/MMC
-------
-
-  The SD/MMC hardware pre S3C2443 is supported in the current
-  kernel, the driver is drivers/mmc/host/s3cmci.c and supports
-  1 and 4 bit SD or MMC cards.
-
-  The SDIO behaviour of this driver has not been fully tested. There is no
-  current support for hardware SDIO interrupts.
-
-
-Serial
-------
-
-  The s3c2410 serial driver provides support for the internal
-  serial ports. These devices appear as /dev/ttySAC0 through 3.
-
-  To create device nodes for these, use the following commands
-
-    mknod ttySAC0 c 204 64
-    mknod ttySAC1 c 204 65
-    mknod ttySAC2 c 204 66
-
-
-GPIO
-----
-
-  The core contains support for manipulating the GPIO, see the
-  documentation in GPIO.txt in the same directory as this file.
-
-  Newer kernels carry GPIOLIB, and support is being moved towards
-  this with some of the older support in line to be removed.
-
-  As of v2.6.34, the move towards using gpiolib support is almost
-  complete, and very little of the old calls are left.
-
-  See Documentation/arm/Samsung-S3C24XX/GPIO.txt for the S3C24XX specific
-  support and Documentation/arm/Samsung/GPIO.txt for the core Samsung
-  implementation.
-
-
-Clock Management
-----------------
-
-  The core provides the interface defined in the header file
-  include/asm-arm/hardware/clock.h, to allow control over the
-  various clock units
-
-
-Suspend to RAM
---------------
-
-  For boards that provide support for suspend to RAM, the
-  system can be placed into low power suspend.
-
-  See Suspend.txt for more information.
-
-
-SPI
----
-
-  SPI drivers are available for both the in-built hardware
-  (although there is no DMA support yet) and a generic
-  GPIO based solution.
-
-
-LEDs
-----
-
-  There is support for GPIO based LEDs via a platform driver
-  in the LED subsystem.
-
-
-Platform Data
--------------
-
-  Whenever a device has platform specific data that is specified
-  on a per-machine basis, care should be taken to ensure the
-  following:
-
-    1) that default data is not left in the device to confuse the
-       driver if a machine does not set it at startup
-
-    2) the data should (if possible) be marked as __initdata,
-       to ensure that the data is thrown away if the machine is
-       not the one currently in use.
-
-       The best way of doing this is to make a function that
-       kmalloc()s an area of memory, and copies the __initdata
-       and then sets the relevant device's platform data. Making
-       the function `__init` takes care of ensuring it is discarded
-       with the rest of the initialisation code
-
-       static __init void s3c24xx_xxx_set_platdata(struct xxx_data *pd)
-       {
-           struct s3c2410_xxx_mach_info *npd;
-
-	   npd = kmalloc(sizeof(struct s3c2410_xxx_mach_info), GFP_KERNEL);
-	   if (npd) {
-	      memcpy(npd, pd, sizeof(struct s3c2410_xxx_mach_info));
-	      s3c_device_xxx.dev.platform_data = npd;
-	   } else {
-              printk(KERN_ERR "no memory for xxx platform data\n");
-	   }
-	}
-
-	Note, since the code is marked as __init, it should not be
-	exported outside arch/arm/mach-s3c2410/, or exported to
-	modules via EXPORT_SYMBOL() and related functions.
-
-
-Port Contributors
------------------
-
-  Ben Dooks (BJD)
-  Vincent Sanders
-  Herbert Potzl
-  Arnaud Patard (RTP)
-  Roc Wu
-  Klaus Fetscher
-  Dimitry Andric
-  Shannon Holland
-  Guillaume Gourat (NexVision)
-  Christer Weinigel (wingel) (Acer N30)
-  Lucas Correia Villa Real (S3C2400 port)
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2004-2006 Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/S3C2412.txt b/Documentation/arm/Samsung-S3C24XX/S3C2412.txt
deleted file mode 100644
index dc1fd362d3c1..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/S3C2412.txt
+++ /dev/null
@@ -1,120 +0,0 @@
-		S3C2412 ARM Linux Overview
-		==========================
-
-Introduction
-------------
-
-  The S3C2412 is part of the S3C24XX range of ARM9 System-on-Chip CPUs
-  from Samsung. This part has an ARM926-EJS core, capable of running up
-  to 266MHz (see data-sheet for more information)
-
-
-Clock
------
-
-  The core clock code provides a set of clocks to the drivers, and allows
-  for source selection and a number of other features.
-
-
-Power
------
-
-  No support for suspend/resume to RAM in the current system.
-
-
-DMA
----
-
-  No current support for DMA.
-
-
-GPIO
-----
-
-  There is support for setting the GPIO to input/output/special function
-  and reading or writing to them.
-
-
-UART
-----
-
-  The UART hardware is similar to the S3C2440, and is supported by the
-  s3c2410 driver in the drivers/serial directory.
-
-
-NAND
-----
-
-  The NAND hardware is similar to the S3C2440, and is supported by the
-  s3c2410 driver in the drivers/mtd/nand/raw directory.
-
-
-USB Host
---------
-
-  The USB hardware is similar to the S3C2410, with extended clock source
-  control. The OHCI portion is supported by the ohci-s3c2410 driver, and
-  the clock control selection is supported by the core clock code.
-
-
-USB Device
-----------
-
-  No current support in the kernel
-
-
-IRQs
-----
-
-  All the standard, and external interrupt sources are supported. The
-  extra sub-sources are not yet supported.
-
-
-RTC
----
-
-  The RTC hardware is similar to the S3C2410, and is supported by the
-  s3c2410-rtc driver.
-
-
-Watchdog
---------
-
-  The watchdog hardware is the same as the S3C2410, and is supported by
-  the s3c2410_wdt driver.
-
-
-MMC/SD/SDIO
------------
-
-  No current support for the MMC/SD/SDIO block.
-
-IIC
----
-
-  The IIC hardware is the same as the S3C2410, and is supported by the
-  i2c-s3c24xx driver.
-
-
-IIS
----
-
-  No current support for the IIS interface.
-
-
-SPI
----
-
-  No current support for the SPI interfaces.
-
-
-ATA
----
-
-  No current support for the on-board ATA block.
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2006 Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/S3C2413.txt b/Documentation/arm/Samsung-S3C24XX/S3C2413.txt
deleted file mode 100644
index 909bdc7dd7b5..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/S3C2413.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-		S3C2413 ARM Linux Overview
-		==========================
-
-Introduction
-------------
-
-  The S3C2413 is an extended version of the S3C2412, with an camera
-  interface and mobile DDR memory support. See the S3C2412 support
-  documentation for more information.
-
-
-Camera Interface
----------------
-
-  This block is currently not supported.
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2006 Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/SMDK2440.txt b/Documentation/arm/Samsung-S3C24XX/SMDK2440.txt
deleted file mode 100644
index 429390bd4684..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/SMDK2440.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-		Samsung/Meritech SMDK2440
-		=========================
-
-Introduction
-------------
-
-  The SMDK2440 is a two part evaluation board for the Samsung S3C2440
-  processor. It includes support for LCD, SmartMedia, Audio, SD and
-  10MBit Ethernet, and expansion headers for various signals, including
-  the camera and unused GPIO.
-
-
-Configuration
--------------
-
-  To set the default configuration, use `make smdk2440_defconfig` which
-  will configure the common features of this board, or use
-  `make s3c2410_config` to include support for all s3c2410/s3c2440 machines
-
-
-Support
--------
-
-  Ben Dooks' SMDK2440 site at http://www.fluff.org/ben/smdk2440/ which
-  includes linux based USB download tools.
-
-  Some of the h1940 patches that can be found from the H1940 project
-  site at http://www.handhelds.org/projects/h1940.html can also be
-  applied to this board.
-
-
-Peripherals
------------
-
-  There is no current support for any of the extra peripherals on the
-  base-board itself.
-
-
-MTD
----
-
-  The NAND flash should be supported by the in kernel MTD NAND support,
-  NOR flash will be added later.
-
-
-Maintainers
------------
-
-  This board is being maintained by Ben Dooks, for more info, see
-  http://www.fluff.org/ben/smdk2440/
-
-  Many thanks to Dimitry Andric of TomTom for the loan of the SMDK2440,
-  and to Simtec Electronics for allowing me time to work on this.
-
-
-(c) 2004 Ben Dooks
diff --git a/Documentation/arm/Samsung-S3C24XX/Suspend.txt b/Documentation/arm/Samsung-S3C24XX/Suspend.txt
deleted file mode 100644
index cb4f0c0cdf9d..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/Suspend.txt
+++ /dev/null
@@ -1,137 +0,0 @@
-			S3C24XX Suspend Support
-			=======================
-
-
-Introduction
-------------
-
-  The S3C24XX supports a low-power suspend mode, where the SDRAM is kept
-  in Self-Refresh mode, and all but the essential peripheral blocks are
-  powered down. For more information on how this works, please look
-  at the relevant CPU datasheet from Samsung.
-
-
-Requirements
-------------
-
-  1) A bootloader that can support the necessary resume operation
-
-  2) Support for at least 1 source for resume
-
-  3) CONFIG_PM enabled in the kernel
-
-  4) Any peripherals that are going to be powered down at the same
-     time require suspend/resume support.
-
-
-Resuming
---------
-
-  The S3C2410 user manual defines the process of sending the CPU to
-  sleep and how it resumes. The default behaviour of the Linux code
-  is to set the GSTATUS3 register to the physical address of the
-  code to resume Linux operation.
-
-  GSTATUS4 is currently left alone by the sleep code, and is free to
-  use for any other purposes (for example, the EB2410ITX uses this to
-  save memory configuration in).
-
-
-Machine Support
----------------
-
-  The machine specific functions must call the s3c_pm_init() function
-  to say that its bootloader is capable of resuming. This can be as
-  simple as adding the following to the machine's definition:
-
-  INITMACHINE(s3c_pm_init)
-
-  A board can do its own setup before calling s3c_pm_init, if it
-  needs to setup anything else for power management support.
-
-  There is currently no support for over-riding the default method of
-  saving the resume address, if your board requires it, then contact
-  the maintainer and discuss what is required.
-
-  Note, the original method of adding an late_initcall() is wrong,
-  and will end up initialising all compiled machines' pm init!
-
-  The following is an example of code used for testing wakeup from
-  an falling edge on IRQ_EINT0:
-
-
-static irqreturn_t button_irq(int irq, void *pw)
-{
-	return IRQ_HANDLED;
-}
-
-statuc void __init machine_init(void)
-{
-	...
-
-	request_irq(IRQ_EINT0, button_irq, IRQF_TRIGGER_FALLING,
-		   "button-irq-eint0", NULL);
-
-	enable_irq_wake(IRQ_EINT0);
-
-	s3c_pm_init();
-}
-
-
-Debugging
----------
-
-  There are several important things to remember when using PM suspend:
-
-  1) The uart drivers will disable the clocks to the UART blocks when
-     suspending, which means that use of printascii() or similar direct
-     access to the UARTs will cause the debug to stop.
-
-  2) While the pm code itself will attempt to re-enable the UART clocks,
-     care should be taken that any external clock sources that the UARTs
-     rely on are still enabled at that point.
-
-  3) If any debugging is placed in the resume path, then it must have the
-     relevant clocks and peripherals setup before use (ie, bootloader).
-
-     For example, if you transmit a character from the UART, the baud
-     rate and uart controls must be setup beforehand.
-
-
-Configuration
--------------
-
-  The S3C2410 specific configuration in `System Type` defines various
-  aspects of how the S3C2410 suspend and resume support is configured
-
-  `S3C2410 PM Suspend debug`
-
-    This option prints messages to the serial console before and after
-    the actual suspend, giving detailed information on what is
-    happening
-
-
-  `S3C2410 PM Suspend Memory CRC`
-
-    Allows the entire memory to be checksummed before and after the
-    suspend to see if there has been any corruption of the contents.
-
-    Note, the time to calculate the CRC is dependent on the CPU speed
-    and the size of memory. For an 64Mbyte RAM area on an 200MHz
-    S3C2410, this can take approximately 4 seconds to complete.
-
-    This support requires the CRC32 function to be enabled.
-
-
-  `S3C2410 PM Suspend CRC Chunksize (KiB)`
-
-    Defines the size of memory each CRC chunk covers. A smaller value
-    will mean that the CRC data block will take more memory, but will
-    identify any faults with better precision
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2004 Simtec Electronics
-
diff --git a/Documentation/arm/Samsung-S3C24XX/USB-Host.txt b/Documentation/arm/Samsung-S3C24XX/USB-Host.txt
deleted file mode 100644
index f82b1faefad5..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/USB-Host.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-			S3C24XX USB Host support
-			========================
-
-
-
-Introduction
-------------
-
-  This document details the S3C2410/S3C2440 in-built OHCI USB host support.
-
-Configuration
--------------
-
-  Enable at least the following kernel options:
-
-  menuconfig:
-
-   Device Drivers  --->
-     USB support  --->
-       <*> Support for Host-side USB
-       <*>   OHCI HCD support
-
-
-  .config:
-    CONFIG_USB
-    CONFIG_USB_OHCI_HCD
-
-
-  Once these options are configured, the standard set of USB device
-  drivers can be configured and used.
-
-
-Board Support
--------------
-
-  The driver attaches to a platform device, which will need to be
-  added by the board specific support file in linux/arch/arm/mach-s3c2410,
-  such as mach-bast.c or mach-smdk2410.c
-
-  The platform device's platform_data field is only needed if the
-  board implements extra power control or over-current monitoring.
-
-  The OHCI driver does not ensure the state of the S3C2410's MISCCTRL
-  register, so if both ports are to be used for the host, then it is
-  the board support file's responsibility to ensure that the second
-  port is configured to be connected to the OHCI core.
-
-
-Platform Data
--------------
-
-  See arch/arm/mach-s3c2410/include/mach/usb-control.h for the
-  descriptions of the platform device data. An implementation
-  can be found in linux/arch/arm/mach-s3c2410/usb-simtec.c .
-
-  The `struct s3c2410_hcd_info` contains a pair of functions
-  that get called to enable over-current detection, and to
-  control the port power status.
-
-  The ports are numbered 0 and 1.
-
-  power_control:
-
-    Called to enable or disable the power on the port.
-
-  enable_oc:
-
-    Called to enable or disable the over-current monitoring.
-    This should claim or release the resources being used to
-    check the power condition on the port, such as an IRQ.
-
-  report_oc:
-
-    The OHCI driver fills this field in for the over-current code
-    to call when there is a change to the over-current state on
-    an port. The ports argument is a bitmask of 1 bit per port,
-    with bit X being 1 for an over-current on port X.
-
-    The function s3c2410_usb_report_oc() has been provided to
-    ensure this is called correctly.
-
-  port[x]:
-
-    This is struct describes each port, 0 or 1. The platform driver
-    should set the flags field of each port to S3C_HCDFLG_USED if
-    the port is enabled.
-
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2005 Simtec Electronics
diff --git a/Documentation/arm/Samsung/Bootloader-interface.txt b/Documentation/arm/Samsung/Bootloader-interface.txt
deleted file mode 100644
index d17ed518a7ea..000000000000
--- a/Documentation/arm/Samsung/Bootloader-interface.txt
+++ /dev/null
@@ -1,68 +0,0 @@
-      Interface between kernel and boot loaders on Exynos boards
-      ==========================================================
-
-Author: Krzysztof Kozlowski
-Date  : 6 June 2015
-
-The document tries to describe currently used interface between Linux kernel
-and boot loaders on Samsung Exynos based boards. This is not a definition
-of interface but rather a description of existing state, a reference
-for information purpose only.
-
-In the document "boot loader" means any of following: U-boot, proprietary
-SBOOT or any other firmware for ARMv7 and ARMv8 initializing the board before
-executing kernel.
-
-
-1. Non-Secure mode
-
-Address:      sysram_ns_base_addr
-Offset        Value                                        Purpose
-=============================================================================
-0x08          exynos_cpu_resume_ns, mcpm_entry_point       System suspend
-0x0c          0x00000bad (Magic cookie)                    System suspend
-0x1c          exynos4_secondary_startup                    Secondary CPU boot
-0x1c + 4*cpu  exynos4_secondary_startup (Exynos4412)       Secondary CPU boot
-0x20          0xfcba0d10 (Magic cookie)                    AFTR
-0x24          exynos_cpu_resume_ns                         AFTR
-0x28 + 4*cpu  0x8 (Magic cookie, Exynos3250)               AFTR
-0x28          0x0 or last value during resume (Exynos542x) System suspend
-
-
-2. Secure mode
-
-Address:      sysram_base_addr
-Offset        Value                                        Purpose
-=============================================================================
-0x00          exynos4_secondary_startup                    Secondary CPU boot
-0x04          exynos4_secondary_startup (Exynos542x)       Secondary CPU boot
-4*cpu         exynos4_secondary_startup (Exynos4412)       Secondary CPU boot
-0x20          exynos_cpu_resume (Exynos4210 r1.0)          AFTR
-0x24          0xfcba0d10 (Magic cookie, Exynos4210 r1.0)   AFTR
-
-Address:      pmu_base_addr
-Offset        Value                                        Purpose
-=============================================================================
-0x0800        exynos_cpu_resume                            AFTR, suspend
-0x0800        mcpm_entry_point (Exynos542x with MCPM)      AFTR, suspend
-0x0804        0xfcba0d10 (Magic cookie)                    AFTR
-0x0804        0x00000bad (Magic cookie)                    System suspend
-0x0814        exynos4_secondary_startup (Exynos4210 r1.1)  Secondary CPU boot
-0x0818        0xfcba0d10 (Magic cookie, Exynos4210 r1.1)   AFTR
-0x081C        exynos_cpu_resume (Exynos4210 r1.1)          AFTR
-
-
-3. Other (regardless of secure/non-secure mode)
-
-Address:      pmu_base_addr
-Offset        Value                           Purpose
-=============================================================================
-0x0908        Non-zero                        Secondary CPU boot up indicator
-                                              on Exynos3250 and Exynos542x
-
-
-4. Glossary
-
-AFTR - ARM Off Top Running, a low power mode, Cortex cores and many other
-modules are power gated, except the TOP modules
-MCPM - Multi-Cluster Power Management
diff --git a/Documentation/arm/Samsung/GPIO.txt b/Documentation/arm/Samsung/GPIO.txt
deleted file mode 100644
index 795adfd88081..000000000000
--- a/Documentation/arm/Samsung/GPIO.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-		Samsung GPIO implementation
-		===========================
-
-Introduction
-------------
-
-This outlines the Samsung GPIO implementation and the architecture
-specific calls provided alongside the drivers/gpio core.
-
-
-S3C24XX (Legacy)
-----------------
-
-See Documentation/arm/Samsung-S3C24XX/GPIO.txt for more information
-about these devices. Their implementation has been brought into line
-with the core samsung implementation described in this document.
-
-
-GPIOLIB integration
--------------------
-
-The gpio implementation uses gpiolib as much as possible, only providing
-specific calls for the items that require Samsung specific handling, such
-as pin special-function or pull resistor control.
-
-GPIO numbering is synchronised between the Samsung and gpiolib system.
-
-
-PIN configuration
------------------
-
-Pin configuration is specific to the Samsung architecture, with each SoC
-registering the necessary information for the core gpio configuration
-implementation to configure pins as necessary.
-
-The s3c_gpio_cfgpin() and s3c_gpio_setpull() provide the means for a
-driver or machine to change gpio configuration.
-
-See arch/arm/plat-samsung/include/plat/gpio-cfg.h for more information
-on these functions.
diff --git a/Documentation/arm/Samsung/Overview.txt b/Documentation/arm/Samsung/Overview.txt
deleted file mode 100644
index 8f7309bad460..000000000000
--- a/Documentation/arm/Samsung/Overview.txt
+++ /dev/null
@@ -1,86 +0,0 @@
-		Samsung ARM Linux Overview
-		==========================
-
-Introduction
-------------
-
-  The Samsung range of ARM SoCs spans many similar devices, from the initial
-  ARM9 through to the newest ARM cores. This document shows an overview of
-  the current kernel support, how to use it and where to find the code
-  that supports this.
-
-  The currently supported SoCs are:
-
-  - S3C24XX: See Documentation/arm/Samsung-S3C24XX/Overview.txt for full list
-  - S3C64XX: S3C6400 and S3C6410
-  - S5PC110 / S5PV210
-
-
-S3C24XX Systems
----------------
-
-  There is still documentation in Documnetation/arm/Samsung-S3C24XX/ which
-  deals with the architecture and drivers specific to these devices.
-
-  See Documentation/arm/Samsung-S3C24XX/Overview.txt for more information
-  on the implementation details and specific support.
-
-
-Configuration
--------------
-
-  A number of configurations are supplied, as there is no current way of
-  unifying all the SoCs into one kernel.
-
-  s5pc110_defconfig - S5PC110 specific default configuration
-  s5pv210_defconfig - S5PV210 specific default configuration
-
-
-Layout
-------
-
-  The directory layout is currently being restructured, and consists of
-  several platform directories and then the machine specific directories
-  of the CPUs being built for.
-
-  plat-samsung provides the base for all the implementations, and is the
-  last in the line of include directories that are processed for the build
-  specific information. It contains the base clock, GPIO and device definitions
-  to get the system running.
-
-  plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs.
-
-  plat-s5p is for s5p specific builds, and contains common support for the
-  S5P specific systems. Not all S5Ps use all the features in this directory
-  due to differences in the hardware.
-
-
-Layout changes
---------------
-
-  The old plat-s3c and plat-s5pc1xx directories have been removed, with
-  support moved to either plat-samsung or plat-s5p as necessary. These moves
-  where to simplify the include and dependency issues involved with having
-  so many different platform directories.
-
-
-Port Contributors
------------------
-
-  Ben Dooks (BJD)
-  Vincent Sanders
-  Herbert Potzl
-  Arnaud Patard (RTP)
-  Roc Wu
-  Klaus Fetscher
-  Dimitry Andric
-  Shannon Holland
-  Guillaume Gourat (NexVision)
-  Christer Weinigel (wingel) (Acer N30)
-  Lucas Correia Villa Real (S3C2400 port)
-
-
-Document Author
----------------
-
-Copyright 2009-2010 Ben Dooks <ben-linux@fluff.org>
diff --git a/Documentation/arm/Setup b/Documentation/arm/Setup
deleted file mode 100644
index 0cb1e64bde80..000000000000
--- a/Documentation/arm/Setup
+++ /dev/null
@@ -1,129 +0,0 @@
-Kernel initialisation parameters on ARM Linux
----------------------------------------------
-
-The following document describes the kernel initialisation parameter
-structure, otherwise known as 'struct param_struct' which is used
-for most ARM Linux architectures.
-
-This structure is used to pass initialisation parameters from the
-kernel loader to the Linux kernel proper, and may be short lived
-through the kernel initialisation process.  As a general rule, it
-should not be referenced outside of arch/arm/kernel/setup.c:setup_arch().
-
-There are a lot of parameters listed in there, and they are described
-below:
-
- page_size
-
-   This parameter must be set to the page size of the machine, and
-   will be checked by the kernel.
-
- nr_pages
-
-   This is the total number of pages of memory in the system.  If
-   the memory is banked, then this should contain the total number
-   of pages in the system.
-
-   If the system contains separate VRAM, this value should not
-   include this information.
-
- ramdisk_size
-
-   This is now obsolete, and should not be used.
-
- flags
-
-   Various kernel flags, including:
-    bit 0 - 1 = mount root read only
-    bit 1 - unused
-    bit 2 - 0 = load ramdisk
-    bit 3 - 0 = prompt for ramdisk
-
- rootdev
-
-   major/minor number pair of device to mount as the root filesystem.
-
- video_num_cols
- video_num_rows
-
-   These two together describe the character size of the dummy console,
-   or VGA console character size.  They should not be used for any other
-   purpose.
-
-   It's generally a good idea to set these to be either standard VGA, or
-   the equivalent character size of your fbcon display.  This then allows
-   all the bootup messages to be displayed correctly.
-
- video_x
- video_y
-
-   This describes the character position of cursor on VGA console, and
-   is otherwise unused. (should not be used for other console types, and
-   should not be used for other purposes).
-
- memc_control_reg
-
-   MEMC chip control register for Acorn Archimedes and Acorn A5000
-   based machines.  May be used differently by different architectures.
-
- sounddefault
-
-   Default sound setting on Acorn machines.  May be used differently by
-   different architectures.
-
- adfsdrives
-
-   Number of ADFS/MFM disks.  May be used differently by different
-   architectures.
-
- bytes_per_char_h
- bytes_per_char_v
-
-   These are now obsolete, and should not be used.
-
- pages_in_bank[4]
-
-   Number of pages in each bank of the systems memory (used for RiscPC).
-   This is intended to be used on systems where the physical memory
-   is non-contiguous from the processors point of view.
-
- pages_in_vram
-
-   Number of pages in VRAM (used on Acorn RiscPC).  This value may also
-   be used by loaders if the size of the video RAM can't be obtained
-   from the hardware.
-
- initrd_start
- initrd_size
-
-   This describes the kernel virtual start address and size of the
-   initial ramdisk.
-
- rd_start
-
-   Start address in sectors of the ramdisk image on a floppy disk.
-
- system_rev
-
-   system revision number.
-
- system_serial_low
- system_serial_high
-
-   system 64-bit serial number
-
- mem_fclk_21285
-
-   The speed of the external oscillator to the 21285 (footbridge),
-   which control's the speed of the memory bus, timer & serial port.
-   Depending upon the speed of the cpu its value can be between
-   0-66 MHz. If no params are passed or a value of zero is passed,
-   then a value of 50 Mhz is the default on 21285 architectures.
-
- paths[8][128]
-
-   These are now obsolete, and should not be used.
-
- commandline
-
-   Kernel command line parameters.  Details can be found elsewhere.
diff --git a/Documentation/arm/VFP/release-notes.txt b/Documentation/arm/VFP/release-notes.txt
deleted file mode 100644
index 28a2795705ca..000000000000
--- a/Documentation/arm/VFP/release-notes.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-Release notes for Linux Kernel VFP support code
------------------------------------------------
-
-Date: 	20 May 2004
-Author:	Russell King
-
-This is the first release of the Linux Kernel VFP support code.  It
-provides support for the exceptions bounced from VFP hardware found
-on ARM926EJ-S.
-
-This release has been validated against the SoftFloat-2b library by
-John R. Hauser using the TestFloat-2a test suite.  Details of this
-library and test suite can be found at:
-
-   http://www.jhauser.us/arithmetic/SoftFloat.html
-
-The operations which have been tested with this package are:
-
- - fdiv
- - fsub
- - fadd
- - fmul
- - fcmp
- - fcmpe
- - fcvtd
- - fcvts
- - fsito
- - ftosi
- - fsqrt
-
-All the above pass softfloat tests with the following exceptions:
-
-- fadd/fsub shows some differences in the handling of +0 / -0 results
-  when input operands differ in signs.
-- the handling of underflow exceptions is slightly different.  If a
-  result underflows before rounding, but becomes a normalised number
-  after rounding, we do not signal an underflow exception.
-
-Other operations which have been tested by basic assembly-only tests
-are:
-
- - fcpy
- - fabs
- - fneg
- - ftoui
- - ftosiz
- - ftouiz
-
-The combination operations have not been tested:
-
- - fmac
- - fnmac
- - fmsc
- - fnmsc
- - fnmul
diff --git a/Documentation/arm/arm.rst b/Documentation/arm/arm.rst
new file mode 100644
index 000000000000..2edc509df92a
--- /dev/null
+++ b/Documentation/arm/arm.rst
@@ -0,0 +1,214 @@
+=======================
+ARM Linux 2.6 and upper
+=======================
+
+    Please check <ftp://ftp.arm.linux.org.uk/pub/armlinux> for
+    updates.
+
+Compilation of kernel
+---------------------
+
+  In order to compile ARM Linux, you will need a compiler capable of
+  generating ARM ELF code with GNU extensions.  GCC 3.3 is known to be
+  a good compiler.  Fortunately, you needn't guess.  The kernel will report
+  an error if your compiler is a recognized offender.
+
+  To build ARM Linux natively, you shouldn't have to alter the ARCH = line
+  in the top level Makefile.  However, if you don't have the ARM Linux ELF
+  tools installed as default, then you should change the CROSS_COMPILE
+  line as detailed below.
+
+  If you wish to cross-compile, then alter the following lines in the top
+  level make file::
+
+    ARCH = <whatever>
+
+  with::
+
+    ARCH = arm
+
+  and::
+
+    CROSS_COMPILE=
+
+  to::
+
+    CROSS_COMPILE=<your-path-to-your-compiler-without-gcc>
+
+  eg.::
+
+    CROSS_COMPILE=arm-linux-
+
+  Do a 'make config', followed by 'make Image' to build the kernel
+  (arch/arm/boot/Image).  A compressed image can be built by doing a
+  'make zImage' instead of 'make Image'.
+
+
+Bug reports etc
+---------------
+
+  Please send patches to the patch system.  For more information, see
+  http://www.arm.linux.org.uk/developer/patches/info.php Always include some
+  explanation as to what the patch does and why it is needed.
+
+  Bug reports should be sent to linux-arm-kernel@lists.arm.linux.org.uk,
+  or submitted through the web form at
+  http://www.arm.linux.org.uk/developer/
+
+  When sending bug reports, please ensure that they contain all relevant
+  information, eg. the kernel messages that were printed before/during
+  the problem, what you were doing, etc.
+
+
+Include files
+-------------
+
+  Several new include directories have been created under include/asm-arm,
+  which are there to reduce the clutter in the top-level directory.  These
+  directories, and their purpose is listed below:
+
+  ============= ==========================================================
+   `arch-*`	machine/platform specific header files
+   `hardware`	driver-internal ARM specific data structures/definitions
+   `mach`	descriptions of generic ARM to specific machine interfaces
+   `proc-*`	processor dependent header files (currently only two
+		categories)
+  ============= ==========================================================
+
+
+Machine/Platform support
+------------------------
+
+  The ARM tree contains support for a lot of different machine types.  To
+  continue supporting these differences, it has become necessary to split
+  machine-specific parts by directory.  For this, the machine category is
+  used to select which directories and files get included (we will use
+  $(MACHINE) to refer to the category)
+
+  To this end, we now have arch/arm/mach-$(MACHINE) directories which are
+  designed to house the non-driver files for a particular machine (eg, PCI,
+  memory management, architecture definitions etc).  For all future
+  machines, there should be a corresponding arch/arm/mach-$(MACHINE)/include/mach
+  directory.
+
+
+Modules
+-------
+
+  Although modularisation is supported (and required for the FP emulator),
+  each module on an ARM2/ARM250/ARM3 machine when is loaded will take
+  memory up to the next 32k boundary due to the size of the pages.
+  Therefore, is modularisation on these machines really worth it?
+
+  However, ARM6 and up machines allow modules to take multiples of 4k, and
+  as such Acorn RiscPCs and other architectures using these processors can
+  make good use of modularisation.
+
+
+ADFS Image files
+----------------
+
+  You can access image files on your ADFS partitions by mounting the ADFS
+  partition, and then using the loopback device driver.  You must have
+  losetup installed.
+
+  Please note that the PCEmulator DOS partitions have a partition table at
+  the start, and as such, you will have to give '-o offset' to losetup.
+
+
+Request to developers
+---------------------
+
+  When writing device drivers which include a separate assembler file, please
+  include it in with the C file, and not the arch/arm/lib directory.  This
+  allows the driver to be compiled as a loadable module without requiring
+  half the code to be compiled into the kernel image.
+
+  In general, try to avoid using assembler unless it is really necessary.  It
+  makes drivers far less easy to port to other hardware.
+
+
+ST506 hard drives
+-----------------
+
+  The ST506 hard drive controllers seem to be working fine (if a little
+  slowly).  At the moment they will only work off the controllers on an
+  A4x0's motherboard, but for it to work off a Podule just requires
+  someone with a podule to add the addresses for the IRQ mask and the
+  HDC base to the source.
+
+  As of 31/3/96 it works with two drives (you should get the ADFS
+  `*configure` harddrive set to 2). I've got an internal 20MB and a great
+  big external 5.25" FH 64MB drive (who could ever want more :-) ).
+
+  I've just got 240K/s off it (a dd with bs=128k); thats about half of what
+  RiscOS gets; but it's a heck of a lot better than the 50K/s I was getting
+  last week :-)
+
+  Known bug: Drive data errors can cause a hang; including cases where
+  the controller has fixed the error using ECC. (Possibly ONLY
+  in that case...hmm).
+
+
+1772 Floppy
+-----------
+  This also seems to work OK, but hasn't been stressed much lately.  It
+  hasn't got any code for disc change detection in there at the moment which
+  could be a bit of a problem!  Suggestions on the correct way to do this
+  are welcome.
+
+
+`CONFIG_MACH_` and `CONFIG_ARCH_`
+---------------------------------
+  A change was made in 2003 to the macro names for new machines.
+  Historically, `CONFIG_ARCH_` was used for the bonafide architecture,
+  e.g. SA1100, as well as implementations of the architecture,
+  e.g. Assabet.  It was decided to change the implementation macros
+  to read `CONFIG_MACH_` for clarity.  Moreover, a retroactive fixup has
+  not been made because it would complicate patching.
+
+  Previous registrations may be found online.
+
+    <http://www.arm.linux.org.uk/developer/machines/>
+
+Kernel entry (head.S)
+---------------------
+  The initial entry into the kernel is via head.S, which uses machine
+  independent code.  The machine is selected by the value of 'r1' on
+  entry, which must be kept unique.
+
+  Due to the large number of machines which the ARM port of Linux provides
+  for, we have a method to manage this which ensures that we don't end up
+  duplicating large amounts of code.
+
+  We group machine (or platform) support code into machine classes.  A
+  class typically based around one or more system on a chip devices, and
+  acts as a natural container around the actual implementations.  These
+  classes are given directories - arch/arm/mach-<class> and
+  arch/arm/mach-<class> - which contain the source files to/include/mach
+  support the machine class.  This directories also contain any machine
+  specific supporting code.
+
+  For example, the SA1100 class is based upon the SA1100 and SA1110 SoC
+  devices, and contains the code to support the way the on-board and off-
+  board devices are used, or the device is setup, and provides that
+  machine specific "personality."
+
+  For platforms that support device tree (DT), the machine selection is
+  controlled at runtime by passing the device tree blob to the kernel.  At
+  compile-time, support for the machine type must be selected.  This allows for
+  a single multiplatform kernel build to be used for several machine types.
+
+  For platforms that do not use device tree, this machine selection is
+  controlled by the machine type ID, which acts both as a run-time and a
+  compile-time code selection method.  You can register a new machine via the
+  web site at:
+
+    <http://www.arm.linux.org.uk/developer/machines/>
+
+  Note: Please do not register a machine type for DT-only platforms.  If your
+  platform is DT-only, you do not need a registered machine type.
+
+---
+
+Russell King (15/03/2004)
diff --git a/Documentation/arm/booting.rst b/Documentation/arm/booting.rst
new file mode 100644
index 000000000000..4babb6c6ae1e
--- /dev/null
+++ b/Documentation/arm/booting.rst
@@ -0,0 +1,237 @@
+=================
+Booting ARM Linux
+=================
+
+Author:	Russell King
+
+Date  : 18 May 2002
+
+The following documentation is relevant to 2.4.18-rmk6 and beyond.
+
+In order to boot ARM Linux, you require a boot loader, which is a small
+program that runs before the main kernel.  The boot loader is expected
+to initialise various devices, and eventually call the Linux kernel,
+passing information to the kernel.
+
+Essentially, the boot loader should provide (as a minimum) the
+following:
+
+1. Setup and initialise the RAM.
+2. Initialise one serial port.
+3. Detect the machine type.
+4. Setup the kernel tagged list.
+5. Load initramfs.
+6. Call the kernel image.
+
+
+1. Setup and initialise RAM
+---------------------------
+
+Existing boot loaders:
+	MANDATORY
+New boot loaders:
+	MANDATORY
+
+The boot loader is expected to find and initialise all RAM that the
+kernel will use for volatile data storage in the system.  It performs
+this in a machine dependent manner.  (It may use internal algorithms
+to automatically locate and size all RAM, or it may use knowledge of
+the RAM in the machine, or any other method the boot loader designer
+sees fit.)
+
+
+2. Initialise one serial port
+-----------------------------
+
+Existing boot loaders:
+	OPTIONAL, RECOMMENDED
+New boot loaders:
+	OPTIONAL, RECOMMENDED
+
+The boot loader should initialise and enable one serial port on the
+target.  This allows the kernel serial driver to automatically detect
+which serial port it should use for the kernel console (generally
+used for debugging purposes, or communication with the target.)
+
+As an alternative, the boot loader can pass the relevant 'console='
+option to the kernel via the tagged lists specifying the port, and
+serial format options as described in
+
+       Documentation/admin-guide/kernel-parameters.rst.
+
+
+3. Detect the machine type
+--------------------------
+
+Existing boot loaders:
+	OPTIONAL
+New boot loaders:
+	MANDATORY except for DT-only platforms
+
+The boot loader should detect the machine type its running on by some
+method.  Whether this is a hard coded value or some algorithm that
+looks at the connected hardware is beyond the scope of this document.
+The boot loader must ultimately be able to provide a MACH_TYPE_xxx
+value to the kernel. (see linux/arch/arm/tools/mach-types).  This
+should be passed to the kernel in register r1.
+
+For DT-only platforms, the machine type will be determined by device
+tree.  set the machine type to all ones (~0).  This is not strictly
+necessary, but assures that it will not match any existing types.
+
+4. Setup boot data
+------------------
+
+Existing boot loaders:
+	OPTIONAL, HIGHLY RECOMMENDED
+New boot loaders:
+	MANDATORY
+
+The boot loader must provide either a tagged list or a dtb image for
+passing configuration data to the kernel.  The physical address of the
+boot data is passed to the kernel in register r2.
+
+4a. Setup the kernel tagged list
+--------------------------------
+
+The boot loader must create and initialise the kernel tagged list.
+A valid tagged list starts with ATAG_CORE and ends with ATAG_NONE.
+The ATAG_CORE tag may or may not be empty.  An empty ATAG_CORE tag
+has the size field set to '2' (0x00000002).  The ATAG_NONE must set
+the size field to zero.
+
+Any number of tags can be placed in the list.  It is undefined
+whether a repeated tag appends to the information carried by the
+previous tag, or whether it replaces the information in its
+entirety; some tags behave as the former, others the latter.
+
+The boot loader must pass at a minimum the size and location of
+the system memory, and root filesystem location.  Therefore, the
+minimum tagged list should look::
+
+		+-----------+
+  base ->	| ATAG_CORE |  |
+		+-----------+  |
+		| ATAG_MEM  |  | increasing address
+		+-----------+  |
+		| ATAG_NONE |  |
+		+-----------+  v
+
+The tagged list should be stored in system RAM.
+
+The tagged list must be placed in a region of memory where neither
+the kernel decompressor nor initrd 'bootp' program will overwrite
+it.  The recommended placement is in the first 16KiB of RAM.
+
+4b. Setup the device tree
+-------------------------
+
+The boot loader must load a device tree image (dtb) into system ram
+at a 64bit aligned address and initialize it with the boot data.  The
+dtb format is documented in Documentation/devicetree/booting-without-of.txt.
+The kernel will look for the dtb magic value of 0xd00dfeed at the dtb
+physical address to determine if a dtb has been passed instead of a
+tagged list.
+
+The boot loader must pass at a minimum the size and location of the
+system memory, and the root filesystem location.  The dtb must be
+placed in a region of memory where the kernel decompressor will not
+overwrite it, while remaining within the region which will be covered
+by the kernel's low-memory mapping.
+
+A safe location is just above the 128MiB boundary from start of RAM.
+
+5. Load initramfs.
+------------------
+
+Existing boot loaders:
+	OPTIONAL
+New boot loaders:
+	OPTIONAL
+
+If an initramfs is in use then, as with the dtb, it must be placed in
+a region of memory where the kernel decompressor will not overwrite it
+while also with the region which will be covered by the kernel's
+low-memory mapping.
+
+A safe location is just above the device tree blob which itself will
+be loaded just above the 128MiB boundary from the start of RAM as
+recommended above.
+
+6. Calling the kernel image
+---------------------------
+
+Existing boot loaders:
+	MANDATORY
+New boot loaders:
+	MANDATORY
+
+There are two options for calling the kernel zImage.  If the zImage
+is stored in flash, and is linked correctly to be run from flash,
+then it is legal for the boot loader to call the zImage in flash
+directly.
+
+The zImage may also be placed in system RAM and called there.  The
+kernel should be placed in the first 128MiB of RAM.  It is recommended
+that it is loaded above 32MiB in order to avoid the need to relocate
+prior to decompression, which will make the boot process slightly
+faster.
+
+When booting a raw (non-zImage) kernel the constraints are tighter.
+In this case the kernel must be loaded at an offset into system equal
+to TEXT_OFFSET - PAGE_OFFSET.
+
+In any case, the following conditions must be met:
+
+- Quiesce all DMA capable devices so that memory does not get
+  corrupted by bogus network packets or disk data. This will save
+  you many hours of debug.
+
+- CPU register settings
+
+  - r0 = 0,
+  - r1 = machine type number discovered in (3) above.
+  - r2 = physical address of tagged list in system RAM, or
+    physical address of device tree block (dtb) in system RAM
+
+- CPU mode
+
+  All forms of interrupts must be disabled (IRQs and FIQs)
+
+  For CPUs which do not include the ARM virtualization extensions, the
+  CPU must be in SVC mode.  (A special exception exists for Angel)
+
+  CPUs which include support for the virtualization extensions can be
+  entered in HYP mode in order to enable the kernel to make full use of
+  these extensions.  This is the recommended boot method for such CPUs,
+  unless the virtualisations are already in use by a pre-installed
+  hypervisor.
+
+  If the kernel is not entered in HYP mode for any reason, it must be
+  entered in SVC mode.
+
+- Caches, MMUs
+
+  The MMU must be off.
+
+  Instruction cache may be on or off.
+
+  Data cache must be off.
+
+  If the kernel is entered in HYP mode, the above requirements apply to
+  the HYP mode configuration in addition to the ordinary PL1 (privileged
+  kernel modes) configuration.  In addition, all traps into the
+  hypervisor must be disabled, and PL1 access must be granted for all
+  peripherals and CPU resources for which this is architecturally
+  possible.  Except for entering in HYP mode, the system configuration
+  should be such that a kernel which does not include support for the
+  virtualization extensions can boot correctly without extra help.
+
+- The boot loader is expected to call the kernel image by jumping
+  directly to the first instruction of the kernel image.
+
+  On CPUs supporting the ARM instruction set, the entry must be
+  made in ARM state, even for a Thumb-2 kernel.
+
+  On CPUs supporting only the Thumb instruction set such as
+  Cortex-M class CPUs, the entry must be made in Thumb state.
diff --git a/Documentation/arm/cluster-pm-race-avoidance.rst b/Documentation/arm/cluster-pm-race-avoidance.rst
new file mode 100644
index 000000000000..aa58603d3f28
--- /dev/null
+++ b/Documentation/arm/cluster-pm-race-avoidance.rst
@@ -0,0 +1,533 @@
+=========================================================
+Cluster-wide Power-up/power-down race avoidance algorithm
+=========================================================
+
+This file documents the algorithm which is used to coordinate CPU and
+cluster setup and teardown operations and to manage hardware coherency
+controls safely.
+
+The section "Rationale" explains what the algorithm is for and why it is
+needed.  "Basic model" explains general concepts using a simplified view
+of the system.  The other sections explain the actual details of the
+algorithm in use.
+
+
+Rationale
+---------
+
+In a system containing multiple CPUs, it is desirable to have the
+ability to turn off individual CPUs when the system is idle, reducing
+power consumption and thermal dissipation.
+
+In a system containing multiple clusters of CPUs, it is also desirable
+to have the ability to turn off entire clusters.
+
+Turning entire clusters off and on is a risky business, because it
+involves performing potentially destructive operations affecting a group
+of independently running CPUs, while the OS continues to run.  This
+means that we need some coordination in order to ensure that critical
+cluster-level operations are only performed when it is truly safe to do
+so.
+
+Simple locking may not be sufficient to solve this problem, because
+mechanisms like Linux spinlocks may rely on coherency mechanisms which
+are not immediately enabled when a cluster powers up.  Since enabling or
+disabling those mechanisms may itself be a non-atomic operation (such as
+writing some hardware registers and invalidating large caches), other
+methods of coordination are required in order to guarantee safe
+power-down and power-up at the cluster level.
+
+The mechanism presented in this document describes a coherent memory
+based protocol for performing the needed coordination.  It aims to be as
+lightweight as possible, while providing the required safety properties.
+
+
+Basic model
+-----------
+
+Each cluster and CPU is assigned a state, as follows:
+
+	- DOWN
+	- COMING_UP
+	- UP
+	- GOING_DOWN
+
+::
+
+	    +---------> UP ----------+
+	    |                        v
+
+	COMING_UP                GOING_DOWN
+
+	    ^                        |
+	    +--------- DOWN <--------+
+
+
+DOWN:
+	The CPU or cluster is not coherent, and is either powered off or
+	suspended, or is ready to be powered off or suspended.
+
+COMING_UP:
+	The CPU or cluster has committed to moving to the UP state.
+	It may be part way through the process of initialisation and
+	enabling coherency.
+
+UP:
+	The CPU or cluster is active and coherent at the hardware
+	level.  A CPU in this state is not necessarily being used
+	actively by the kernel.
+
+GOING_DOWN:
+	The CPU or cluster has committed to moving to the DOWN
+	state.  It may be part way through the process of teardown and
+	coherency exit.
+
+
+Each CPU has one of these states assigned to it at any point in time.
+The CPU states are described in the "CPU state" section, below.
+
+Each cluster is also assigned a state, but it is necessary to split the
+state value into two parts (the "cluster" state and "inbound" state) and
+to introduce additional states in order to avoid races between different
+CPUs in the cluster simultaneously modifying the state.  The cluster-
+level states are described in the "Cluster state" section.
+
+To help distinguish the CPU states from cluster states in this
+discussion, the state names are given a `CPU_` prefix for the CPU states,
+and a `CLUSTER_` or `INBOUND_` prefix for the cluster states.
+
+
+CPU state
+---------
+
+In this algorithm, each individual core in a multi-core processor is
+referred to as a "CPU".  CPUs are assumed to be single-threaded:
+therefore, a CPU can only be doing one thing at a single point in time.
+
+This means that CPUs fit the basic model closely.
+
+The algorithm defines the following states for each CPU in the system:
+
+	- CPU_DOWN
+	- CPU_COMING_UP
+	- CPU_UP
+	- CPU_GOING_DOWN
+
+::
+
+	 cluster setup and
+	CPU setup complete          policy decision
+	      +-----------> CPU_UP ------------+
+	      |                                v
+
+	CPU_COMING_UP                   CPU_GOING_DOWN
+
+	      ^                                |
+	      +----------- CPU_DOWN <----------+
+	 policy decision           CPU teardown complete
+	or hardware event
+
+
+The definitions of the four states correspond closely to the states of
+the basic model.
+
+Transitions between states occur as follows.
+
+A trigger event (spontaneous) means that the CPU can transition to the
+next state as a result of making local progress only, with no
+requirement for any external event to happen.
+
+
+CPU_DOWN:
+	A CPU reaches the CPU_DOWN state when it is ready for
+	power-down.  On reaching this state, the CPU will typically
+	power itself down or suspend itself, via a WFI instruction or a
+	firmware call.
+
+	Next state:
+		CPU_COMING_UP
+	Conditions:
+		none
+
+	Trigger events:
+		a) an explicit hardware power-up operation, resulting
+		   from a policy decision on another CPU;
+
+		b) a hardware event, such as an interrupt.
+
+
+CPU_COMING_UP:
+	A CPU cannot start participating in hardware coherency until the
+	cluster is set up and coherent.  If the cluster is not ready,
+	then the CPU will wait in the CPU_COMING_UP state until the
+	cluster has been set up.
+
+	Next state:
+		CPU_UP
+	Conditions:
+		The CPU's parent cluster must be in CLUSTER_UP.
+	Trigger events:
+		Transition of the parent cluster to CLUSTER_UP.
+
+	Refer to the "Cluster state" section for a description of the
+	CLUSTER_UP state.
+
+
+CPU_UP:
+	When a CPU reaches the CPU_UP state, it is safe for the CPU to
+	start participating in local coherency.
+
+	This is done by jumping to the kernel's CPU resume code.
+
+	Note that the definition of this state is slightly different
+	from the basic model definition: CPU_UP does not mean that the
+	CPU is coherent yet, but it does mean that it is safe to resume
+	the kernel.  The kernel handles the rest of the resume
+	procedure, so the remaining steps are not visible as part of the
+	race avoidance algorithm.
+
+	The CPU remains in this state until an explicit policy decision
+	is made to shut down or suspend the CPU.
+
+	Next state:
+		CPU_GOING_DOWN
+	Conditions:
+		none
+	Trigger events:
+		explicit policy decision
+
+
+CPU_GOING_DOWN:
+	While in this state, the CPU exits coherency, including any
+	operations required to achieve this (such as cleaning data
+	caches).
+
+	Next state:
+		CPU_DOWN
+	Conditions:
+		local CPU teardown complete
+	Trigger events:
+		(spontaneous)
+
+
+Cluster state
+-------------
+
+A cluster is a group of connected CPUs with some common resources.
+Because a cluster contains multiple CPUs, it can be doing multiple
+things at the same time.  This has some implications.  In particular, a
+CPU can start up while another CPU is tearing the cluster down.
+
+In this discussion, the "outbound side" is the view of the cluster state
+as seen by a CPU tearing the cluster down.  The "inbound side" is the
+view of the cluster state as seen by a CPU setting the CPU up.
+
+In order to enable safe coordination in such situations, it is important
+that a CPU which is setting up the cluster can advertise its state
+independently of the CPU which is tearing down the cluster.  For this
+reason, the cluster state is split into two parts:
+
+	"cluster" state: The global state of the cluster; or the state
+	on the outbound side:
+
+		- CLUSTER_DOWN
+		- CLUSTER_UP
+		- CLUSTER_GOING_DOWN
+
+	"inbound" state: The state of the cluster on the inbound side.
+
+		- INBOUND_NOT_COMING_UP
+		- INBOUND_COMING_UP
+
+
+	The different pairings of these states results in six possible
+	states for the cluster as a whole::
+
+	                            CLUSTER_UP
+	          +==========> INBOUND_NOT_COMING_UP -------------+
+	          #                                               |
+	                                                          |
+	     CLUSTER_UP     <----+                                |
+	  INBOUND_COMING_UP      |                                v
+
+	          ^             CLUSTER_GOING_DOWN       CLUSTER_GOING_DOWN
+	          #              INBOUND_COMING_UP <=== INBOUND_NOT_COMING_UP
+
+	    CLUSTER_DOWN         |                                |
+	  INBOUND_COMING_UP <----+                                |
+	                                                          |
+	          ^                                               |
+	          +===========     CLUSTER_DOWN      <------------+
+	                       INBOUND_NOT_COMING_UP
+
+	Transitions -----> can only be made by the outbound CPU, and
+	only involve changes to the "cluster" state.
+
+	Transitions ===##> can only be made by the inbound CPU, and only
+	involve changes to the "inbound" state, except where there is no
+	further transition possible on the outbound side (i.e., the
+	outbound CPU has put the cluster into the CLUSTER_DOWN state).
+
+	The race avoidance algorithm does not provide a way to determine
+	which exact CPUs within the cluster play these roles.  This must
+	be decided in advance by some other means.  Refer to the section
+	"Last man and first man selection" for more explanation.
+
+
+	CLUSTER_DOWN/INBOUND_NOT_COMING_UP is the only state where the
+	cluster can actually be powered down.
+
+	The parallelism of the inbound and outbound CPUs is observed by
+	the existence of two different paths from CLUSTER_GOING_DOWN/
+	INBOUND_NOT_COMING_UP (corresponding to GOING_DOWN in the basic
+	model) to CLUSTER_DOWN/INBOUND_COMING_UP (corresponding to
+	COMING_UP in the basic model).  The second path avoids cluster
+	teardown completely.
+
+	CLUSTER_UP/INBOUND_COMING_UP is equivalent to UP in the basic
+	model.  The final transition to CLUSTER_UP/INBOUND_NOT_COMING_UP
+	is trivial and merely resets the state machine ready for the
+	next cycle.
+
+	Details of the allowable transitions follow.
+
+	The next state in each case is notated
+
+		<cluster state>/<inbound state> (<transitioner>)
+
+	where the <transitioner> is the side on which the transition
+	can occur; either the inbound or the outbound side.
+
+
+CLUSTER_DOWN/INBOUND_NOT_COMING_UP:
+	Next state:
+		CLUSTER_DOWN/INBOUND_COMING_UP (inbound)
+	Conditions:
+		none
+
+	Trigger events:
+		a) an explicit hardware power-up operation, resulting
+		   from a policy decision on another CPU;
+
+		b) a hardware event, such as an interrupt.
+
+
+CLUSTER_DOWN/INBOUND_COMING_UP:
+
+	In this state, an inbound CPU sets up the cluster, including
+	enabling of hardware coherency at the cluster level and any
+	other operations (such as cache invalidation) which are required
+	in order to achieve this.
+
+	The purpose of this state is to do sufficient cluster-level
+	setup to enable other CPUs in the cluster to enter coherency
+	safely.
+
+	Next state:
+		CLUSTER_UP/INBOUND_COMING_UP (inbound)
+	Conditions:
+		cluster-level setup and hardware coherency complete
+	Trigger events:
+		(spontaneous)
+
+
+CLUSTER_UP/INBOUND_COMING_UP:
+
+	Cluster-level setup is complete and hardware coherency is
+	enabled for the cluster.  Other CPUs in the cluster can safely
+	enter coherency.
+
+	This is a transient state, leading immediately to
+	CLUSTER_UP/INBOUND_NOT_COMING_UP.  All other CPUs on the cluster
+	should consider treat these two states as equivalent.
+
+	Next state:
+		CLUSTER_UP/INBOUND_NOT_COMING_UP (inbound)
+	Conditions:
+		none
+	Trigger events:
+		(spontaneous)
+
+
+CLUSTER_UP/INBOUND_NOT_COMING_UP:
+
+	Cluster-level setup is complete and hardware coherency is
+	enabled for the cluster.  Other CPUs in the cluster can safely
+	enter coherency.
+
+	The cluster will remain in this state until a policy decision is
+	made to power the cluster down.
+
+	Next state:
+		CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP (outbound)
+	Conditions:
+		none
+	Trigger events:
+		policy decision to power down the cluster
+
+
+CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP:
+
+	An outbound CPU is tearing the cluster down.  The selected CPU
+	must wait in this state until all CPUs in the cluster are in the
+	CPU_DOWN state.
+
+	When all CPUs are in the CPU_DOWN state, the cluster can be torn
+	down, for example by cleaning data caches and exiting
+	cluster-level coherency.
+
+	To avoid wasteful unnecessary teardown operations, the outbound
+	should check the inbound cluster state for asynchronous
+	transitions to INBOUND_COMING_UP.  Alternatively, individual
+	CPUs can be checked for entry into CPU_COMING_UP or CPU_UP.
+
+
+	Next states:
+
+	CLUSTER_DOWN/INBOUND_NOT_COMING_UP (outbound)
+		Conditions:
+			cluster torn down and ready to power off
+		Trigger events:
+			(spontaneous)
+
+	CLUSTER_GOING_DOWN/INBOUND_COMING_UP (inbound)
+		Conditions:
+			none
+
+		Trigger events:
+			a) an explicit hardware power-up operation,
+			   resulting from a policy decision on another
+			   CPU;
+
+			b) a hardware event, such as an interrupt.
+
+
+CLUSTER_GOING_DOWN/INBOUND_COMING_UP:
+
+	The cluster is (or was) being torn down, but another CPU has
+	come online in the meantime and is trying to set up the cluster
+	again.
+
+	If the outbound CPU observes this state, it has two choices:
+
+		a) back out of teardown, restoring the cluster to the
+		   CLUSTER_UP state;
+
+		b) finish tearing the cluster down and put the cluster
+		   in the CLUSTER_DOWN state; the inbound CPU will
+		   set up the cluster again from there.
+
+	Choice (a) permits the removal of some latency by avoiding
+	unnecessary teardown and setup operations in situations where
+	the cluster is not really going to be powered down.
+
+
+	Next states:
+
+	CLUSTER_UP/INBOUND_COMING_UP (outbound)
+		Conditions:
+				cluster-level setup and hardware
+				coherency complete
+
+		Trigger events:
+				(spontaneous)
+
+	CLUSTER_DOWN/INBOUND_COMING_UP (outbound)
+		Conditions:
+			cluster torn down and ready to power off
+
+		Trigger events:
+			(spontaneous)
+
+
+Last man and First man selection
+--------------------------------
+
+The CPU which performs cluster tear-down operations on the outbound side
+is commonly referred to as the "last man".
+
+The CPU which performs cluster setup on the inbound side is commonly
+referred to as the "first man".
+
+The race avoidance algorithm documented above does not provide a
+mechanism to choose which CPUs should play these roles.
+
+
+Last man:
+
+When shutting down the cluster, all the CPUs involved are initially
+executing Linux and hence coherent.  Therefore, ordinary spinlocks can
+be used to select a last man safely, before the CPUs become
+non-coherent.
+
+
+First man:
+
+Because CPUs may power up asynchronously in response to external wake-up
+events, a dynamic mechanism is needed to make sure that only one CPU
+attempts to play the first man role and do the cluster-level
+initialisation: any other CPUs must wait for this to complete before
+proceeding.
+
+Cluster-level initialisation may involve actions such as configuring
+coherency controls in the bus fabric.
+
+The current implementation in mcpm_head.S uses a separate mutual exclusion
+mechanism to do this arbitration.  This mechanism is documented in
+detail in vlocks.txt.
+
+
+Features and Limitations
+------------------------
+
+Implementation:
+
+	The current ARM-based implementation is split between
+	arch/arm/common/mcpm_head.S (low-level inbound CPU operations) and
+	arch/arm/common/mcpm_entry.c (everything else):
+
+	__mcpm_cpu_going_down() signals the transition of a CPU to the
+	CPU_GOING_DOWN state.
+
+	__mcpm_cpu_down() signals the transition of a CPU to the CPU_DOWN
+	state.
+
+	A CPU transitions to CPU_COMING_UP and then to CPU_UP via the
+	low-level power-up code in mcpm_head.S.  This could
+	involve CPU-specific setup code, but in the current
+	implementation it does not.
+
+	__mcpm_outbound_enter_critical() and __mcpm_outbound_leave_critical()
+	handle transitions from CLUSTER_UP to CLUSTER_GOING_DOWN
+	and from there to CLUSTER_DOWN or back to CLUSTER_UP (in
+	the case of an aborted cluster power-down).
+
+	These functions are more complex than the __mcpm_cpu_*()
+	functions due to the extra inter-CPU coordination which
+	is needed for safe transitions at the cluster level.
+
+	A cluster transitions from CLUSTER_DOWN back to CLUSTER_UP via
+	the low-level power-up code in mcpm_head.S.  This
+	typically involves platform-specific setup code,
+	provided by the platform-specific power_up_setup
+	function registered via mcpm_sync_init.
+
+Deep topologies:
+
+	As currently described and implemented, the algorithm does not
+	support CPU topologies involving more than two levels (i.e.,
+	clusters of clusters are not supported).  The algorithm could be
+	extended by replicating the cluster-level states for the
+	additional topological levels, and modifying the transition
+	rules for the intermediate (non-outermost) cluster levels.
+
+
+Colophon
+--------
+
+Originally created and documented by Dave Martin for Linaro Limited, in
+collaboration with Nicolas Pitre and Achin Gupta.
+
+Copyright (C) 2012-2013  Linaro Limited
+Distributed under the terms of Version 2 of the GNU General Public
+License, as defined in linux/COPYING.
diff --git a/Documentation/arm/cluster-pm-race-avoidance.txt b/Documentation/arm/cluster-pm-race-avoidance.txt
deleted file mode 100644
index 750b6fc24af9..000000000000
--- a/Documentation/arm/cluster-pm-race-avoidance.txt
+++ /dev/null
@@ -1,498 +0,0 @@
-Cluster-wide Power-up/power-down race avoidance algorithm
-=========================================================
-
-This file documents the algorithm which is used to coordinate CPU and
-cluster setup and teardown operations and to manage hardware coherency
-controls safely.
-
-The section "Rationale" explains what the algorithm is for and why it is
-needed.  "Basic model" explains general concepts using a simplified view
-of the system.  The other sections explain the actual details of the
-algorithm in use.
-
-
-Rationale
----------
-
-In a system containing multiple CPUs, it is desirable to have the
-ability to turn off individual CPUs when the system is idle, reducing
-power consumption and thermal dissipation.
-
-In a system containing multiple clusters of CPUs, it is also desirable
-to have the ability to turn off entire clusters.
-
-Turning entire clusters off and on is a risky business, because it
-involves performing potentially destructive operations affecting a group
-of independently running CPUs, while the OS continues to run.  This
-means that we need some coordination in order to ensure that critical
-cluster-level operations are only performed when it is truly safe to do
-so.
-
-Simple locking may not be sufficient to solve this problem, because
-mechanisms like Linux spinlocks may rely on coherency mechanisms which
-are not immediately enabled when a cluster powers up.  Since enabling or
-disabling those mechanisms may itself be a non-atomic operation (such as
-writing some hardware registers and invalidating large caches), other
-methods of coordination are required in order to guarantee safe
-power-down and power-up at the cluster level.
-
-The mechanism presented in this document describes a coherent memory
-based protocol for performing the needed coordination.  It aims to be as
-lightweight as possible, while providing the required safety properties.
-
-
-Basic model
------------
-
-Each cluster and CPU is assigned a state, as follows:
-
-	DOWN
-	COMING_UP
-	UP
-	GOING_DOWN
-
-	    +---------> UP ----------+
-	    |                        v
-
-	COMING_UP                GOING_DOWN
-
-	    ^                        |
-	    +--------- DOWN <--------+
-
-
-DOWN:	The CPU or cluster is not coherent, and is either powered off or
-	suspended, or is ready to be powered off or suspended.
-
-COMING_UP: The CPU or cluster has committed to moving to the UP state.
-	It may be part way through the process of initialisation and
-	enabling coherency.
-
-UP:	The CPU or cluster is active and coherent at the hardware
-	level.  A CPU in this state is not necessarily being used
-	actively by the kernel.
-
-GOING_DOWN: The CPU or cluster has committed to moving to the DOWN
-	state.  It may be part way through the process of teardown and
-	coherency exit.
-
-
-Each CPU has one of these states assigned to it at any point in time.
-The CPU states are described in the "CPU state" section, below.
-
-Each cluster is also assigned a state, but it is necessary to split the
-state value into two parts (the "cluster" state and "inbound" state) and
-to introduce additional states in order to avoid races between different
-CPUs in the cluster simultaneously modifying the state.  The cluster-
-level states are described in the "Cluster state" section.
-
-To help distinguish the CPU states from cluster states in this
-discussion, the state names are given a CPU_ prefix for the CPU states,
-and a CLUSTER_ or INBOUND_ prefix for the cluster states.
-
-
-CPU state
----------
-
-In this algorithm, each individual core in a multi-core processor is
-referred to as a "CPU".  CPUs are assumed to be single-threaded:
-therefore, a CPU can only be doing one thing at a single point in time.
-
-This means that CPUs fit the basic model closely.
-
-The algorithm defines the following states for each CPU in the system:
-
-	CPU_DOWN
-	CPU_COMING_UP
-	CPU_UP
-	CPU_GOING_DOWN
-
-	 cluster setup and
-	CPU setup complete          policy decision
-	      +-----------> CPU_UP ------------+
-	      |                                v
-
-	CPU_COMING_UP                   CPU_GOING_DOWN
-
-	      ^                                |
-	      +----------- CPU_DOWN <----------+
-	 policy decision           CPU teardown complete
-	or hardware event
-
-
-The definitions of the four states correspond closely to the states of
-the basic model.
-
-Transitions between states occur as follows.
-
-A trigger event (spontaneous) means that the CPU can transition to the
-next state as a result of making local progress only, with no
-requirement for any external event to happen.
-
-
-CPU_DOWN:
-
-	A CPU reaches the CPU_DOWN state when it is ready for
-	power-down.  On reaching this state, the CPU will typically
-	power itself down or suspend itself, via a WFI instruction or a
-	firmware call.
-
-	Next state:	CPU_COMING_UP
-	Conditions:	none
-
-	Trigger events:
-
-		a) an explicit hardware power-up operation, resulting
-		   from a policy decision on another CPU;
-
-		b) a hardware event, such as an interrupt.
-
-
-CPU_COMING_UP:
-
-	A CPU cannot start participating in hardware coherency until the
-	cluster is set up and coherent.  If the cluster is not ready,
-	then the CPU will wait in the CPU_COMING_UP state until the
-	cluster has been set up.
-
-	Next state:	CPU_UP
-	Conditions:	The CPU's parent cluster must be in CLUSTER_UP.
-	Trigger events:	Transition of the parent cluster to CLUSTER_UP.
-
-	Refer to the "Cluster state" section for a description of the
-	CLUSTER_UP state.
-
-
-CPU_UP:
-	When a CPU reaches the CPU_UP state, it is safe for the CPU to
-	start participating in local coherency.
-
-	This is done by jumping to the kernel's CPU resume code.
-
-	Note that the definition of this state is slightly different
-	from the basic model definition: CPU_UP does not mean that the
-	CPU is coherent yet, but it does mean that it is safe to resume
-	the kernel.  The kernel handles the rest of the resume
-	procedure, so the remaining steps are not visible as part of the
-	race avoidance algorithm.
-
-	The CPU remains in this state until an explicit policy decision
-	is made to shut down or suspend the CPU.
-
-	Next state:	CPU_GOING_DOWN
-	Conditions:	none
-	Trigger events:	explicit policy decision
-
-
-CPU_GOING_DOWN:
-
-	While in this state, the CPU exits coherency, including any
-	operations required to achieve this (such as cleaning data
-	caches).
-
-	Next state:	CPU_DOWN
-	Conditions:	local CPU teardown complete
-	Trigger events:	(spontaneous)
-
-
-Cluster state
--------------
-
-A cluster is a group of connected CPUs with some common resources.
-Because a cluster contains multiple CPUs, it can be doing multiple
-things at the same time.  This has some implications.  In particular, a
-CPU can start up while another CPU is tearing the cluster down.
-
-In this discussion, the "outbound side" is the view of the cluster state
-as seen by a CPU tearing the cluster down.  The "inbound side" is the
-view of the cluster state as seen by a CPU setting the CPU up.
-
-In order to enable safe coordination in such situations, it is important
-that a CPU which is setting up the cluster can advertise its state
-independently of the CPU which is tearing down the cluster.  For this
-reason, the cluster state is split into two parts:
-
-	"cluster" state: The global state of the cluster; or the state
-		on the outbound side:
-
-		CLUSTER_DOWN
-		CLUSTER_UP
-		CLUSTER_GOING_DOWN
-
-	"inbound" state: The state of the cluster on the inbound side.
-
-		INBOUND_NOT_COMING_UP
-		INBOUND_COMING_UP
-
-
-	The different pairings of these states results in six possible
-	states for the cluster as a whole:
-
-	                            CLUSTER_UP
-	          +==========> INBOUND_NOT_COMING_UP -------------+
-	          #                                               |
-	                                                          |
-	     CLUSTER_UP     <----+                                |
-	  INBOUND_COMING_UP      |                                v
-
-	          ^             CLUSTER_GOING_DOWN       CLUSTER_GOING_DOWN
-	          #              INBOUND_COMING_UP <=== INBOUND_NOT_COMING_UP
-
-	    CLUSTER_DOWN         |                                |
-	  INBOUND_COMING_UP <----+                                |
-	                                                          |
-	          ^                                               |
-	          +===========     CLUSTER_DOWN      <------------+
-	                       INBOUND_NOT_COMING_UP
-
-	Transitions -----> can only be made by the outbound CPU, and
-	only involve changes to the "cluster" state.
-
-	Transitions ===##> can only be made by the inbound CPU, and only
-	involve changes to the "inbound" state, except where there is no
-	further transition possible on the outbound side (i.e., the
-	outbound CPU has put the cluster into the CLUSTER_DOWN state).
-
-	The race avoidance algorithm does not provide a way to determine
-	which exact CPUs within the cluster play these roles.  This must
-	be decided in advance by some other means.  Refer to the section
-	"Last man and first man selection" for more explanation.
-
-
-	CLUSTER_DOWN/INBOUND_NOT_COMING_UP is the only state where the
-	cluster can actually be powered down.
-
-	The parallelism of the inbound and outbound CPUs is observed by
-	the existence of two different paths from CLUSTER_GOING_DOWN/
-	INBOUND_NOT_COMING_UP (corresponding to GOING_DOWN in the basic
-	model) to CLUSTER_DOWN/INBOUND_COMING_UP (corresponding to
-	COMING_UP in the basic model).  The second path avoids cluster
-	teardown completely.
-
-	CLUSTER_UP/INBOUND_COMING_UP is equivalent to UP in the basic
-	model.  The final transition to CLUSTER_UP/INBOUND_NOT_COMING_UP
-	is trivial and merely resets the state machine ready for the
-	next cycle.
-
-	Details of the allowable transitions follow.
-
-	The next state in each case is notated
-
-		<cluster state>/<inbound state> (<transitioner>)
-
-	where the <transitioner> is the side on which the transition
-	can occur; either the inbound or the outbound side.
-
-
-CLUSTER_DOWN/INBOUND_NOT_COMING_UP:
-
-	Next state:	CLUSTER_DOWN/INBOUND_COMING_UP (inbound)
-	Conditions:	none
-	Trigger events:
-
-		a) an explicit hardware power-up operation, resulting
-		   from a policy decision on another CPU;
-
-		b) a hardware event, such as an interrupt.
-
-
-CLUSTER_DOWN/INBOUND_COMING_UP:
-
-	In this state, an inbound CPU sets up the cluster, including
-	enabling of hardware coherency at the cluster level and any
-	other operations (such as cache invalidation) which are required
-	in order to achieve this.
-
-	The purpose of this state is to do sufficient cluster-level
-	setup to enable other CPUs in the cluster to enter coherency
-	safely.
-
-	Next state:	CLUSTER_UP/INBOUND_COMING_UP (inbound)
-	Conditions:	cluster-level setup and hardware coherency complete
-	Trigger events:	(spontaneous)
-
-
-CLUSTER_UP/INBOUND_COMING_UP:
-
-	Cluster-level setup is complete and hardware coherency is
-	enabled for the cluster.  Other CPUs in the cluster can safely
-	enter coherency.
-
-	This is a transient state, leading immediately to
-	CLUSTER_UP/INBOUND_NOT_COMING_UP.  All other CPUs on the cluster
-	should consider treat these two states as equivalent.
-
-	Next state:	CLUSTER_UP/INBOUND_NOT_COMING_UP (inbound)
-	Conditions:	none
-	Trigger events:	(spontaneous)
-
-
-CLUSTER_UP/INBOUND_NOT_COMING_UP:
-
-	Cluster-level setup is complete and hardware coherency is
-	enabled for the cluster.  Other CPUs in the cluster can safely
-	enter coherency.
-
-	The cluster will remain in this state until a policy decision is
-	made to power the cluster down.
-
-	Next state:	CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP (outbound)
-	Conditions:	none
-	Trigger events:	policy decision to power down the cluster
-
-
-CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP:
-
-	An outbound CPU is tearing the cluster down.  The selected CPU
-	must wait in this state until all CPUs in the cluster are in the
-	CPU_DOWN state.
-
-	When all CPUs are in the CPU_DOWN state, the cluster can be torn
-	down, for example by cleaning data caches and exiting
-	cluster-level coherency.
-
-	To avoid wasteful unnecessary teardown operations, the outbound
-	should check the inbound cluster state for asynchronous
-	transitions to INBOUND_COMING_UP.  Alternatively, individual
-	CPUs can be checked for entry into CPU_COMING_UP or CPU_UP.
-
-
-	Next states:
-
-	CLUSTER_DOWN/INBOUND_NOT_COMING_UP (outbound)
-		Conditions:	cluster torn down and ready to power off
-		Trigger events:	(spontaneous)
-
-	CLUSTER_GOING_DOWN/INBOUND_COMING_UP (inbound)
-		Conditions:	none
-		Trigger events:
-
-			a) an explicit hardware power-up operation,
-			   resulting from a policy decision on another
-			   CPU;
-
-			b) a hardware event, such as an interrupt.
-
-
-CLUSTER_GOING_DOWN/INBOUND_COMING_UP:
-
-	The cluster is (or was) being torn down, but another CPU has
-	come online in the meantime and is trying to set up the cluster
-	again.
-
-	If the outbound CPU observes this state, it has two choices:
-
-		a) back out of teardown, restoring the cluster to the
-		   CLUSTER_UP state;
-
-		b) finish tearing the cluster down and put the cluster
-		   in the CLUSTER_DOWN state; the inbound CPU will
-		   set up the cluster again from there.
-
-	Choice (a) permits the removal of some latency by avoiding
-	unnecessary teardown and setup operations in situations where
-	the cluster is not really going to be powered down.
-
-
-	Next states:
-
-	CLUSTER_UP/INBOUND_COMING_UP (outbound)
-		Conditions:	cluster-level setup and hardware
-				coherency complete
-		Trigger events:	(spontaneous)
-
-	CLUSTER_DOWN/INBOUND_COMING_UP (outbound)
-		Conditions:	cluster torn down and ready to power off
-		Trigger events:	(spontaneous)
-
-
-Last man and First man selection
---------------------------------
-
-The CPU which performs cluster tear-down operations on the outbound side
-is commonly referred to as the "last man".
-
-The CPU which performs cluster setup on the inbound side is commonly
-referred to as the "first man".
-
-The race avoidance algorithm documented above does not provide a
-mechanism to choose which CPUs should play these roles.
-
-
-Last man:
-
-When shutting down the cluster, all the CPUs involved are initially
-executing Linux and hence coherent.  Therefore, ordinary spinlocks can
-be used to select a last man safely, before the CPUs become
-non-coherent.
-
-
-First man:
-
-Because CPUs may power up asynchronously in response to external wake-up
-events, a dynamic mechanism is needed to make sure that only one CPU
-attempts to play the first man role and do the cluster-level
-initialisation: any other CPUs must wait for this to complete before
-proceeding.
-
-Cluster-level initialisation may involve actions such as configuring
-coherency controls in the bus fabric.
-
-The current implementation in mcpm_head.S uses a separate mutual exclusion
-mechanism to do this arbitration.  This mechanism is documented in
-detail in vlocks.txt.
-
-
-Features and Limitations
-------------------------
-
-Implementation:
-
-	The current ARM-based implementation is split between
-	arch/arm/common/mcpm_head.S (low-level inbound CPU operations) and
-	arch/arm/common/mcpm_entry.c (everything else):
-
-	__mcpm_cpu_going_down() signals the transition of a CPU to the
-		CPU_GOING_DOWN state.
-
-	__mcpm_cpu_down() signals the transition of a CPU to the CPU_DOWN
-		state.
-
-	A CPU transitions to CPU_COMING_UP and then to CPU_UP via the
-		low-level power-up code in mcpm_head.S.  This could
-		involve CPU-specific setup code, but in the current
-		implementation it does not.
-
-	__mcpm_outbound_enter_critical() and __mcpm_outbound_leave_critical()
-		handle transitions from CLUSTER_UP to CLUSTER_GOING_DOWN
-		and from there to CLUSTER_DOWN or back to CLUSTER_UP (in
-		the case of an aborted cluster power-down).
-
-		These functions are more complex than the __mcpm_cpu_*()
-		functions due to the extra inter-CPU coordination which
-		is needed for safe transitions at the cluster level.
-
-	A cluster transitions from CLUSTER_DOWN back to CLUSTER_UP via
-		the low-level power-up code in mcpm_head.S.  This
-		typically involves platform-specific setup code,
-		provided by the platform-specific power_up_setup
-		function registered via mcpm_sync_init.
-
-Deep topologies:
-
-	As currently described and implemented, the algorithm does not
-	support CPU topologies involving more than two levels (i.e.,
-	clusters of clusters are not supported).  The algorithm could be
-	extended by replicating the cluster-level states for the
-	additional topological levels, and modifying the transition
-	rules for the intermediate (non-outermost) cluster levels.
-
-
-Colophon
---------
-
-Originally created and documented by Dave Martin for Linaro Limited, in
-collaboration with Nicolas Pitre and Achin Gupta.
-
-Copyright (C) 2012-2013  Linaro Limited
-Distributed under the terms of Version 2 of the GNU General Public
-License, as defined in linux/COPYING.
diff --git a/Documentation/arm/firmware.rst b/Documentation/arm/firmware.rst
new file mode 100644
index 000000000000..efd844baec1d
--- /dev/null
+++ b/Documentation/arm/firmware.rst
@@ -0,0 +1,72 @@
+==========================================================================
+Interface for registering and calling firmware-specific operations for ARM
+==========================================================================
+
+Written by Tomasz Figa <t.figa@samsung.com>
+
+Some boards are running with secure firmware running in TrustZone secure
+world, which changes the way some things have to be initialized. This makes
+a need to provide an interface for such platforms to specify available firmware
+operations and call them when needed.
+
+Firmware operations can be specified by filling in a struct firmware_ops
+with appropriate callbacks and then registering it with register_firmware_ops()
+function::
+
+	void register_firmware_ops(const struct firmware_ops *ops)
+
+The ops pointer must be non-NULL. More information about struct firmware_ops
+and its members can be found in arch/arm/include/asm/firmware.h header.
+
+There is a default, empty set of operations provided, so there is no need to
+set anything if platform does not require firmware operations.
+
+To call a firmware operation, a helper macro is provided::
+
+	#define call_firmware_op(op, ...)				\
+		((firmware_ops->op) ? firmware_ops->op(__VA_ARGS__) : (-ENOSYS))
+
+the macro checks if the operation is provided and calls it or otherwise returns
+-ENOSYS to signal that given operation is not available (for example, to allow
+fallback to legacy operation).
+
+Example of registering firmware operations::
+
+	/* board file */
+
+	static int platformX_do_idle(void)
+	{
+		/* tell platformX firmware to enter idle */
+		return 0;
+	}
+
+	static int platformX_cpu_boot(int i)
+	{
+		/* tell platformX firmware to boot CPU i */
+		return 0;
+	}
+
+	static const struct firmware_ops platformX_firmware_ops = {
+		.do_idle        = exynos_do_idle,
+		.cpu_boot       = exynos_cpu_boot,
+		/* other operations not available on platformX */
+	};
+
+	/* init_early callback of machine descriptor */
+	static void __init board_init_early(void)
+	{
+		register_firmware_ops(&platformX_firmware_ops);
+	}
+
+Example of using a firmware operation::
+
+	/* some platform code, e.g. SMP initialization */
+
+	__raw_writel(__pa_symbol(exynos4_secondary_startup),
+		CPU1_BOOT_REG);
+
+	/* Call Exynos specific smc call */
+	if (call_firmware_op(cpu_boot, cpu) == -ENOSYS)
+		cpu_boot_legacy(...); /* Try legacy way */
+
+	gic_raise_softirq(cpumask_of(cpu), 1);
diff --git a/Documentation/arm/firmware.txt b/Documentation/arm/firmware.txt
deleted file mode 100644
index 7f175dbb427e..000000000000
--- a/Documentation/arm/firmware.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-Interface for registering and calling firmware-specific operations for ARM.
-----
-Written by Tomasz Figa <t.figa@samsung.com>
-
-Some boards are running with secure firmware running in TrustZone secure
-world, which changes the way some things have to be initialized. This makes
-a need to provide an interface for such platforms to specify available firmware
-operations and call them when needed.
-
-Firmware operations can be specified by filling in a struct firmware_ops
-with appropriate callbacks and then registering it with register_firmware_ops()
-function.
-
-	void register_firmware_ops(const struct firmware_ops *ops)
-
-The ops pointer must be non-NULL. More information about struct firmware_ops
-and its members can be found in arch/arm/include/asm/firmware.h header.
-
-There is a default, empty set of operations provided, so there is no need to
-set anything if platform does not require firmware operations.
-
-To call a firmware operation, a helper macro is provided
-
-	#define call_firmware_op(op, ...)				\
-		((firmware_ops->op) ? firmware_ops->op(__VA_ARGS__) : (-ENOSYS))
-
-the macro checks if the operation is provided and calls it or otherwise returns
--ENOSYS to signal that given operation is not available (for example, to allow
-fallback to legacy operation).
-
-Example of registering firmware operations:
-
-	/* board file */
-
-	static int platformX_do_idle(void)
-	{
-		/* tell platformX firmware to enter idle */
-		return 0;
-	}
-
-	static int platformX_cpu_boot(int i)
-	{
-		/* tell platformX firmware to boot CPU i */
-		return 0;
-	}
-
-	static const struct firmware_ops platformX_firmware_ops = {
-		.do_idle        = exynos_do_idle,
-		.cpu_boot       = exynos_cpu_boot,
-		/* other operations not available on platformX */
-	};
-
-	/* init_early callback of machine descriptor */
-	static void __init board_init_early(void)
-	{
-		register_firmware_ops(&platformX_firmware_ops);
-	}
-
-Example of using a firmware operation:
-
-	/* some platform code, e.g. SMP initialization */
-
-	__raw_writel(__pa_symbol(exynos4_secondary_startup),
-		CPU1_BOOT_REG);
-
-	/* Call Exynos specific smc call */
-	if (call_firmware_op(cpu_boot, cpu) == -ENOSYS)
-		cpu_boot_legacy(...); /* Try legacy way */
-
-	gic_raise_softirq(cpumask_of(cpu), 1);
diff --git a/Documentation/arm/index.rst b/Documentation/arm/index.rst
new file mode 100644
index 000000000000..5fc072dd0c5e
--- /dev/null
+++ b/Documentation/arm/index.rst
@@ -0,0 +1,80 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+================
+ARM Architecture
+================
+
+.. toctree::
+   :maxdepth: 1
+
+   arm
+   booting
+   cluster-pm-race-avoidance
+   firmware
+   interrupts
+   kernel_mode_neon
+   kernel_user_helpers
+   memory
+   mem_alignment
+   tcm
+   setup
+   swp_emulation
+   uefi
+   vlocks
+   porting
+
+SoC-specific documents
+======================
+
+.. toctree::
+   :maxdepth: 1
+
+   ixp4xx
+
+   marvel
+   microchip
+
+   netwinder
+   nwfpe/index
+
+   keystone/overview
+   keystone/knav-qmss
+
+   omap/index
+
+   pxa/mfp
+
+
+   sa1100/index
+
+   stm32/stm32f746-overview
+   stm32/overview
+   stm32/stm32h743-overview
+   stm32/stm32f769-overview
+   stm32/stm32f429-overview
+   stm32/stm32mp157-overview
+
+   sunxi
+
+   samsung/index
+   samsung-s3c24xx/index
+
+   sunxi/clocks
+
+   spear/overview
+
+   sti/stih416-overview
+   sti/stih407-overview
+   sti/stih418-overview
+   sti/overview
+   sti/stih415-overview
+
+   vfp/release-notes
+
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/arm/interrupts.rst b/Documentation/arm/interrupts.rst
new file mode 100644
index 000000000000..2ae70e0e9732
--- /dev/null
+++ b/Documentation/arm/interrupts.rst
@@ -0,0 +1,169 @@
+==========
+Interrupts
+==========
+
+2.5.2-rmk5:
+  This is the first kernel that contains a major shake up of some of the
+  major architecture-specific subsystems.
+
+Firstly, it contains some pretty major changes to the way we handle the
+MMU TLB.  Each MMU TLB variant is now handled completely separately -
+we have TLB v3, TLB v4 (without write buffer), TLB v4 (with write buffer),
+and finally TLB v4 (with write buffer, with I TLB invalidate entry).
+There is more assembly code inside each of these functions, mainly to
+allow more flexible TLB handling for the future.
+
+Secondly, the IRQ subsystem.
+
+The 2.5 kernels will be having major changes to the way IRQs are handled.
+Unfortunately, this means that machine types that touch the irq_desc[]
+array (basically all machine types) will break, and this means every
+machine type that we currently have.
+
+Lets take an example.  On the Assabet with Neponset, we have::
+
+                  GPIO25                 IRR:2
+        SA1100 ------------> Neponset -----------> SA1111
+                                         IIR:1
+                                      -----------> USAR
+                                         IIR:0
+                                      -----------> SMC9196
+
+The way stuff currently works, all SA1111 interrupts are mutually
+exclusive of each other - if you're processing one interrupt from the
+SA1111 and another comes in, you have to wait for that interrupt to
+finish processing before you can service the new interrupt.  Eg, an
+IDE PIO-based interrupt on the SA1111 excludes all other SA1111 and
+SMC9196 interrupts until it has finished transferring its multi-sector
+data, which can be a long time.  Note also that since we loop in the
+SA1111 IRQ handler, SA1111 IRQs can hold off SMC9196 IRQs indefinitely.
+
+
+The new approach brings several new ideas...
+
+We introduce the concept of a "parent" and a "child".  For example,
+to the Neponset handler, the "parent" is GPIO25, and the "children"d
+are SA1111, SMC9196 and USAR.
+
+We also bring the idea of an IRQ "chip" (mainly to reduce the size of
+the irqdesc array).  This doesn't have to be a real "IC"; indeed the
+SA11x0 IRQs are handled by two separate "chip" structures, one for
+GPIO0-10, and another for all the rest.  It is just a container for
+the various operations (maybe this'll change to a better name).
+This structure has the following operations::
+
+  struct irqchip {
+          /*
+           * Acknowledge the IRQ.
+           * If this is a level-based IRQ, then it is expected to mask the IRQ
+           * as well.
+           */
+          void (*ack)(unsigned int irq);
+          /*
+           * Mask the IRQ in hardware.
+           */
+          void (*mask)(unsigned int irq);
+          /*
+           * Unmask the IRQ in hardware.
+           */
+          void (*unmask)(unsigned int irq);
+          /*
+           * Re-run the IRQ
+           */
+          void (*rerun)(unsigned int irq);
+          /*
+           * Set the type of the IRQ.
+           */
+          int (*type)(unsigned int irq, unsigned int, type);
+  };
+
+ack
+       - required.  May be the same function as mask for IRQs
+         handled by do_level_IRQ.
+mask
+       - required.
+unmask
+       - required.
+rerun
+       - optional.  Not required if you're using do_level_IRQ for all
+         IRQs that use this 'irqchip'.  Generally expected to re-trigger
+         the hardware IRQ if possible.  If not, may call the handler
+	 directly.
+type
+       - optional.  If you don't support changing the type of an IRQ,
+         it should be null so people can detect if they are unable to
+         set the IRQ type.
+
+For each IRQ, we keep the following information:
+
+        - "disable" depth (number of disable_irq()s without enable_irq()s)
+        - flags indicating what we can do with this IRQ (valid, probe,
+          noautounmask) as before
+        - status of the IRQ (probing, enable, etc)
+        - chip
+        - per-IRQ handler
+        - irqaction structure list
+
+The handler can be one of the 3 standard handlers - "level", "edge" and
+"simple", or your own specific handler if you need to do something special.
+
+The "level" handler is what we currently have - its pretty simple.
+"edge" knows about the brokenness of such IRQ implementations - that you
+need to leave the hardware IRQ enabled while processing it, and queueing
+further IRQ events should the IRQ happen again while processing.  The
+"simple" handler is very basic, and does not perform any hardware
+manipulation, nor state tracking.  This is useful for things like the
+SMC9196 and USAR above.
+
+So, what's changed?
+===================
+
+1. Machine implementations must not write to the irqdesc array.
+
+2. New functions to manipulate the irqdesc array.  The first 4 are expected
+   to be useful only to machine specific code.  The last is recommended to
+   only be used by machine specific code, but may be used in drivers if
+   absolutely necessary.
+
+        set_irq_chip(irq,chip)
+                Set the mask/unmask methods for handling this IRQ
+
+        set_irq_handler(irq,handler)
+                Set the handler for this IRQ (level, edge, simple)
+
+        set_irq_chained_handler(irq,handler)
+                Set a "chained" handler for this IRQ - automatically
+                enables this IRQ (eg, Neponset and SA1111 handlers).
+
+        set_irq_flags(irq,flags)
+                Set the valid/probe/noautoenable flags.
+
+        set_irq_type(irq,type)
+                Set active the IRQ edge(s)/level.  This replaces the
+                SA1111 INTPOL manipulation, and the set_GPIO_IRQ_edge()
+                function.  Type should be one of IRQ_TYPE_xxx defined in
+		<linux/irq.h>
+
+3. set_GPIO_IRQ_edge() is obsolete, and should be replaced by set_irq_type.
+
+4. Direct access to SA1111 INTPOL is deprecated.  Use set_irq_type instead.
+
+5. A handler is expected to perform any necessary acknowledgement of the
+   parent IRQ via the correct chip specific function.  For instance, if
+   the SA1111 is directly connected to a SA1110 GPIO, then you should
+   acknowledge the SA1110 IRQ each time you re-read the SA1111 IRQ status.
+
+6. For any child which doesn't have its own IRQ enable/disable controls
+   (eg, SMC9196), the handler must mask or acknowledge the parent IRQ
+   while the child handler is called, and the child handler should be the
+   "simple" handler (not "edge" nor "level").  After the handler completes,
+   the parent IRQ should be unmasked, and the status of all children must
+   be re-checked for pending events.  (see the Neponset IRQ handler for
+   details).
+
+7. fixup_irq() is gone, as is `arch/arm/mach-*/include/mach/irq.h`
+
+Please note that this will not solve all problems - some of them are
+hardware based.  Mixing level-based and edge-based IRQs on the same
+parent signal (eg neponset) is one such area where a software based
+solution can't provide the full answer to low IRQ latency.
diff --git a/Documentation/arm/ixp4xx.rst b/Documentation/arm/ixp4xx.rst
new file mode 100644
index 000000000000..a57235616294
--- /dev/null
+++ b/Documentation/arm/ixp4xx.rst
@@ -0,0 +1,173 @@
+===========================================================
+Release Notes for Linux on Intel's IXP4xx Network Processor
+===========================================================
+
+Maintained by Deepak Saxena <dsaxena@plexity.net>
+-------------------------------------------------------------------------
+
+1. Overview
+
+Intel's IXP4xx network processor is a highly integrated SOC that
+is targeted for network applications, though it has become popular
+in industrial control and other areas due to low cost and power
+consumption. The IXP4xx family currently consists of several processors
+that support different network offload functions such as encryption,
+routing, firewalling, etc. The IXP46x family is an updated version which
+supports faster speeds, new memory and flash configurations, and more
+integration such as an on-chip I2C controller.
+
+For more information on the various versions of the CPU, see:
+
+   http://developer.intel.com/design/network/products/npfamily/ixp4xx.htm
+
+Intel also made the IXCP1100 CPU for sometime which is an IXP4xx
+stripped of much of the network intelligence.
+
+2. Linux Support
+
+Linux currently supports the following features on the IXP4xx chips:
+
+- Dual serial ports
+- PCI interface
+- Flash access (MTD/JFFS)
+- I2C through GPIO on IXP42x
+- GPIO for input/output/interrupts
+  See arch/arm/mach-ixp4xx/include/mach/platform.h for access functions.
+- Timers (watchdog, OS)
+
+The following components of the chips are not supported by Linux and
+require the use of Intel's proprietary CSR software:
+
+- USB device interface
+- Network interfaces (HSS, Utopia, NPEs, etc)
+- Network offload functionality
+
+If you need to use any of the above, you need to download Intel's
+software from:
+
+   http://developer.intel.com/design/network/products/npfamily/ixp425.htm
+
+DO NOT POST QUESTIONS TO THE LINUX MAILING LISTS REGARDING THE PROPRIETARY
+SOFTWARE.
+
+There are several websites that provide directions/pointers on using
+Intel's software:
+
+   - http://sourceforge.net/projects/ixp4xx-osdg/
+     Open Source Developer's Guide for using uClinux and the Intel libraries
+
+   - http://gatewaymaker.sourceforge.net/
+     Simple one page summary of building a gateway using an IXP425 and Linux
+
+   - http://ixp425.sourceforge.net/
+     ATM device driver for IXP425 that relies on Intel's libraries
+
+3. Known Issues/Limitations
+
+3a. Limited inbound PCI window
+
+The IXP4xx family allows for up to 256MB of memory but the PCI interface
+can only expose 64MB of that memory to the PCI bus. This means that if
+you are running with > 64MB, all PCI buffers outside of the accessible
+range will be bounced using the routines in arch/arm/common/dmabounce.c.
+
+3b. Limited outbound PCI window
+
+IXP4xx provides two methods of accessing PCI memory space:
+
+1) A direct mapped window from 0x48000000 to 0x4bffffff (64MB).
+   To access PCI via this space, we simply ioremap() the BAR
+   into the kernel and we can use the standard read[bwl]/write[bwl]
+   macros. This is the preffered method due to speed but it
+   limits the system to just 64MB of PCI memory. This can be
+   problamatic if using video cards and other memory-heavy devices.
+
+2) If > 64MB of memory space is required, the IXP4xx can be
+   configured to use indirect registers to access PCI This allows
+   for up to 128MB (0x48000000 to 0x4fffffff) of memory on the bus.
+   The disadvantage of this is that every PCI access requires
+   three local register accesses plus a spinlock, but in some
+   cases the performance hit is acceptable. In addition, you cannot
+   mmap() PCI devices in this case due to the indirect nature
+   of the PCI window.
+
+By default, the direct method is used for performance reasons. If
+you need more PCI memory, enable the IXP4XX_INDIRECT_PCI config option.
+
+3c. GPIO as Interrupts
+
+Currently the code only handles level-sensitive GPIO interrupts
+
+4. Supported platforms
+
+ADI Engineering Coyote Gateway Reference Platform
+http://www.adiengineering.com/productsCoyote.html
+
+   The ADI Coyote platform is reference design for those building
+   small residential/office gateways. One NPE is connected to a 10/100
+   interface, one to 4-port 10/100 switch, and the third to and ADSL
+   interface. In addition, it also supports to POTs interfaces connected
+   via SLICs. Note that those are not supported by Linux ATM. Finally,
+   the platform has two mini-PCI slots used for 802.11[bga] cards.
+   Finally, there is an IDE port hanging off the expansion bus.
+
+Gateworks Avila Network Platform
+http://www.gateworks.com/support/overview.php
+
+   The Avila platform is basically and IXDP425 with the 4 PCI slots
+   replaced with mini-PCI slots and a CF IDE interface hanging off
+   the expansion bus.
+
+Intel IXDP425 Development Platform
+http://www.intel.com/design/network/products/npfamily/ixdpg425.htm
+
+   This is Intel's standard reference platform for the IXDP425 and is
+   also known as the Richfield board. It contains 4 PCI slots, 16MB
+   of flash, two 10/100 ports and one ADSL port.
+
+Intel IXDP465 Development Platform
+http://www.intel.com/design/network/products/npfamily/ixdp465.htm
+
+   This is basically an IXDP425 with an IXP465 and 32M of flash instead
+   of just 16.
+
+Intel IXDPG425 Development Platform
+
+   This is basically and ADI Coyote board with a NEC EHCI controller
+   added. One issue with this board is that the mini-PCI slots only
+   have the 3.3v line connected, so you can't use a PCI to mini-PCI
+   adapter with an E100 card. So to NFS root you need to use either
+   the CSR or a WiFi card and a ramdisk that BOOTPs and then does
+   a pivot_root to NFS.
+
+Motorola PrPMC1100 Processor Mezanine Card
+http://www.fountainsys.com
+
+   The PrPMC1100 is based on the IXCP1100 and is meant to plug into
+   and IXP2400/2800 system to act as the system controller. It simply
+   contains a CPU and 16MB of flash on the board and needs to be
+   plugged into a carrier board to function. Currently Linux only
+   supports the Motorola PrPMC carrier board for this platform.
+
+5. TODO LIST
+
+- Add support for Coyote IDE
+- Add support for edge-based GPIO interrupts
+- Add support for CF IDE on expansion bus
+
+6. Thanks
+
+The IXP4xx work has been funded by Intel Corp. and MontaVista Software, Inc.
+
+The following people have contributed patches/comments/etc:
+
+- Lennerty Buytenhek
+- Lutz Jaenicke
+- Justin Mayfield
+- Robert E. Ranslam
+
+[I know I've forgotten others, please email me to be added]
+
+-------------------------------------------------------------------------
+
+Last Update: 01/04/2005
diff --git a/Documentation/arm/kernel_mode_neon.rst b/Documentation/arm/kernel_mode_neon.rst
new file mode 100644
index 000000000000..9bfb71a2a9b9
--- /dev/null
+++ b/Documentation/arm/kernel_mode_neon.rst
@@ -0,0 +1,124 @@
+================
+Kernel mode NEON
+================
+
+TL;DR summary
+-------------
+* Use only NEON instructions, or VFP instructions that don't rely on support
+  code
+* Isolate your NEON code in a separate compilation unit, and compile it with
+  '-march=armv7-a -mfpu=neon -mfloat-abi=softfp'
+* Put kernel_neon_begin() and kernel_neon_end() calls around the calls into your
+  NEON code
+* Don't sleep in your NEON code, and be aware that it will be executed with
+  preemption disabled
+
+
+Introduction
+------------
+It is possible to use NEON instructions (and in some cases, VFP instructions) in
+code that runs in kernel mode. However, for performance reasons, the NEON/VFP
+register file is not preserved and restored at every context switch or taken
+exception like the normal register file is, so some manual intervention is
+required. Furthermore, special care is required for code that may sleep [i.e.,
+may call schedule()], as NEON or VFP instructions will be executed in a
+non-preemptible section for reasons outlined below.
+
+
+Lazy preserve and restore
+-------------------------
+The NEON/VFP register file is managed using lazy preserve (on UP systems) and
+lazy restore (on both SMP and UP systems). This means that the register file is
+kept 'live', and is only preserved and restored when multiple tasks are
+contending for the NEON/VFP unit (or, in the SMP case, when a task migrates to
+another core). Lazy restore is implemented by disabling the NEON/VFP unit after
+every context switch, resulting in a trap when subsequently a NEON/VFP
+instruction is issued, allowing the kernel to step in and perform the restore if
+necessary.
+
+Any use of the NEON/VFP unit in kernel mode should not interfere with this, so
+it is required to do an 'eager' preserve of the NEON/VFP register file, and
+enable the NEON/VFP unit explicitly so no exceptions are generated on first
+subsequent use. This is handled by the function kernel_neon_begin(), which
+should be called before any kernel mode NEON or VFP instructions are issued.
+Likewise, the NEON/VFP unit should be disabled again after use to make sure user
+mode will hit the lazy restore trap upon next use. This is handled by the
+function kernel_neon_end().
+
+
+Interruptions in kernel mode
+----------------------------
+For reasons of performance and simplicity, it was decided that there shall be no
+preserve/restore mechanism for the kernel mode NEON/VFP register contents. This
+implies that interruptions of a kernel mode NEON section can only be allowed if
+they are guaranteed not to touch the NEON/VFP registers. For this reason, the
+following rules and restrictions apply in the kernel:
+* NEON/VFP code is not allowed in interrupt context;
+* NEON/VFP code is not allowed to sleep;
+* NEON/VFP code is executed with preemption disabled.
+
+If latency is a concern, it is possible to put back to back calls to
+kernel_neon_end() and kernel_neon_begin() in places in your code where none of
+the NEON registers are live. (Additional calls to kernel_neon_begin() should be
+reasonably cheap if no context switch occurred in the meantime)
+
+
+VFP and support code
+--------------------
+Earlier versions of VFP (prior to version 3) rely on software support for things
+like IEEE-754 compliant underflow handling etc. When the VFP unit needs such
+software assistance, it signals the kernel by raising an undefined instruction
+exception. The kernel responds by inspecting the VFP control registers and the
+current instruction and arguments, and emulates the instruction in software.
+
+Such software assistance is currently not implemented for VFP instructions
+executed in kernel mode. If such a condition is encountered, the kernel will
+fail and generate an OOPS.
+
+
+Separating NEON code from ordinary code
+---------------------------------------
+The compiler is not aware of the special significance of kernel_neon_begin() and
+kernel_neon_end(), i.e., that it is only allowed to issue NEON/VFP instructions
+between calls to these respective functions. Furthermore, GCC may generate NEON
+instructions of its own at -O3 level if -mfpu=neon is selected, and even if the
+kernel is currently compiled at -O2, future changes may result in NEON/VFP
+instructions appearing in unexpected places if no special care is taken.
+
+Therefore, the recommended and only supported way of using NEON/VFP in the
+kernel is by adhering to the following rules:
+
+* isolate the NEON code in a separate compilation unit and compile it with
+  '-march=armv7-a -mfpu=neon -mfloat-abi=softfp';
+* issue the calls to kernel_neon_begin(), kernel_neon_end() as well as the calls
+  into the unit containing the NEON code from a compilation unit which is *not*
+  built with the GCC flag '-mfpu=neon' set.
+
+As the kernel is compiled with '-msoft-float', the above will guarantee that
+both NEON and VFP instructions will only ever appear in designated compilation
+units at any optimization level.
+
+
+NEON assembler
+--------------
+NEON assembler is supported with no additional caveats as long as the rules
+above are followed.
+
+
+NEON code generated by GCC
+--------------------------
+The GCC option -ftree-vectorize (implied by -O3) tries to exploit implicit
+parallelism, and generates NEON code from ordinary C source code. This is fully
+supported as long as the rules above are followed.
+
+
+NEON intrinsics
+---------------
+NEON intrinsics are also supported. However, as code using NEON intrinsics
+relies on the GCC header <arm_neon.h>, (which #includes <stdint.h>), you should
+observe the following in addition to the rules above:
+
+* Compile the unit containing the NEON intrinsics with '-ffreestanding' so GCC
+  uses its builtin version of <stdint.h> (this is a C99 header which the kernel
+  does not supply);
+* Include <arm_neon.h> last, or at least after <linux/types.h>
diff --git a/Documentation/arm/kernel_mode_neon.txt b/Documentation/arm/kernel_mode_neon.txt
deleted file mode 100644
index b9e060c5b61e..000000000000
--- a/Documentation/arm/kernel_mode_neon.txt
+++ /dev/null
@@ -1,121 +0,0 @@
-Kernel mode NEON
-================
-
-TL;DR summary
--------------
-* Use only NEON instructions, or VFP instructions that don't rely on support
-  code
-* Isolate your NEON code in a separate compilation unit, and compile it with
-  '-march=armv7-a -mfpu=neon -mfloat-abi=softfp'
-* Put kernel_neon_begin() and kernel_neon_end() calls around the calls into your
-  NEON code
-* Don't sleep in your NEON code, and be aware that it will be executed with
-  preemption disabled
-
-
-Introduction
-------------
-It is possible to use NEON instructions (and in some cases, VFP instructions) in
-code that runs in kernel mode. However, for performance reasons, the NEON/VFP
-register file is not preserved and restored at every context switch or taken
-exception like the normal register file is, so some manual intervention is
-required. Furthermore, special care is required for code that may sleep [i.e.,
-may call schedule()], as NEON or VFP instructions will be executed in a
-non-preemptible section for reasons outlined below.
-
-
-Lazy preserve and restore
--------------------------
-The NEON/VFP register file is managed using lazy preserve (on UP systems) and
-lazy restore (on both SMP and UP systems). This means that the register file is
-kept 'live', and is only preserved and restored when multiple tasks are
-contending for the NEON/VFP unit (or, in the SMP case, when a task migrates to
-another core). Lazy restore is implemented by disabling the NEON/VFP unit after
-every context switch, resulting in a trap when subsequently a NEON/VFP
-instruction is issued, allowing the kernel to step in and perform the restore if
-necessary.
-
-Any use of the NEON/VFP unit in kernel mode should not interfere with this, so
-it is required to do an 'eager' preserve of the NEON/VFP register file, and
-enable the NEON/VFP unit explicitly so no exceptions are generated on first
-subsequent use. This is handled by the function kernel_neon_begin(), which
-should be called before any kernel mode NEON or VFP instructions are issued.
-Likewise, the NEON/VFP unit should be disabled again after use to make sure user
-mode will hit the lazy restore trap upon next use. This is handled by the
-function kernel_neon_end().
-
-
-Interruptions in kernel mode
-----------------------------
-For reasons of performance and simplicity, it was decided that there shall be no
-preserve/restore mechanism for the kernel mode NEON/VFP register contents. This
-implies that interruptions of a kernel mode NEON section can only be allowed if
-they are guaranteed not to touch the NEON/VFP registers. For this reason, the
-following rules and restrictions apply in the kernel:
-* NEON/VFP code is not allowed in interrupt context;
-* NEON/VFP code is not allowed to sleep;
-* NEON/VFP code is executed with preemption disabled.
-
-If latency is a concern, it is possible to put back to back calls to
-kernel_neon_end() and kernel_neon_begin() in places in your code where none of
-the NEON registers are live. (Additional calls to kernel_neon_begin() should be
-reasonably cheap if no context switch occurred in the meantime)
-
-
-VFP and support code
---------------------
-Earlier versions of VFP (prior to version 3) rely on software support for things
-like IEEE-754 compliant underflow handling etc. When the VFP unit needs such
-software assistance, it signals the kernel by raising an undefined instruction
-exception. The kernel responds by inspecting the VFP control registers and the
-current instruction and arguments, and emulates the instruction in software.
-
-Such software assistance is currently not implemented for VFP instructions
-executed in kernel mode. If such a condition is encountered, the kernel will
-fail and generate an OOPS.
-
-
-Separating NEON code from ordinary code
----------------------------------------
-The compiler is not aware of the special significance of kernel_neon_begin() and
-kernel_neon_end(), i.e., that it is only allowed to issue NEON/VFP instructions
-between calls to these respective functions. Furthermore, GCC may generate NEON
-instructions of its own at -O3 level if -mfpu=neon is selected, and even if the
-kernel is currently compiled at -O2, future changes may result in NEON/VFP
-instructions appearing in unexpected places if no special care is taken.
-
-Therefore, the recommended and only supported way of using NEON/VFP in the
-kernel is by adhering to the following rules:
-* isolate the NEON code in a separate compilation unit and compile it with
-  '-march=armv7-a -mfpu=neon -mfloat-abi=softfp';
-* issue the calls to kernel_neon_begin(), kernel_neon_end() as well as the calls
-  into the unit containing the NEON code from a compilation unit which is *not*
-  built with the GCC flag '-mfpu=neon' set.
-
-As the kernel is compiled with '-msoft-float', the above will guarantee that
-both NEON and VFP instructions will only ever appear in designated compilation
-units at any optimization level.
-
-
-NEON assembler
---------------
-NEON assembler is supported with no additional caveats as long as the rules
-above are followed.
-
-
-NEON code generated by GCC
---------------------------
-The GCC option -ftree-vectorize (implied by -O3) tries to exploit implicit
-parallelism, and generates NEON code from ordinary C source code. This is fully
-supported as long as the rules above are followed.
-
-
-NEON intrinsics
----------------
-NEON intrinsics are also supported. However, as code using NEON intrinsics
-relies on the GCC header <arm_neon.h>, (which #includes <stdint.h>), you should
-observe the following in addition to the rules above:
-* Compile the unit containing the NEON intrinsics with '-ffreestanding' so GCC
-  uses its builtin version of <stdint.h> (this is a C99 header which the kernel
-  does not supply);
-* Include <arm_neon.h> last, or at least after <linux/types.h>
diff --git a/Documentation/arm/kernel_user_helpers.rst b/Documentation/arm/kernel_user_helpers.rst
new file mode 100644
index 000000000000..eb6f3d916622
--- /dev/null
+++ b/Documentation/arm/kernel_user_helpers.rst
@@ -0,0 +1,268 @@
+============================
+Kernel-provided User Helpers
+============================
+
+These are segment of kernel provided user code reachable from user space
+at a fixed address in kernel memory.  This is used to provide user space
+with some operations which require kernel help because of unimplemented
+native feature and/or instructions in many ARM CPUs. The idea is for this
+code to be executed directly in user mode for best efficiency but which is
+too intimate with the kernel counter part to be left to user libraries.
+In fact this code might even differ from one CPU to another depending on
+the available instruction set, or whether it is a SMP systems. In other
+words, the kernel reserves the right to change this code as needed without
+warning. Only the entry points and their results as documented here are
+guaranteed to be stable.
+
+This is different from (but doesn't preclude) a full blown VDSO
+implementation, however a VDSO would prevent some assembly tricks with
+constants that allows for efficient branching to those code segments. And
+since those code segments only use a few cycles before returning to user
+code, the overhead of a VDSO indirect far call would add a measurable
+overhead to such minimalistic operations.
+
+User space is expected to bypass those helpers and implement those things
+inline (either in the code emitted directly by the compiler, or part of
+the implementation of a library call) when optimizing for a recent enough
+processor that has the necessary native support, but only if resulting
+binaries are already to be incompatible with earlier ARM processors due to
+usage of similar native instructions for other things.  In other words
+don't make binaries unable to run on earlier processors just for the sake
+of not using these kernel helpers if your compiled code is not going to
+use new instructions for other purpose.
+
+New helpers may be added over time, so an older kernel may be missing some
+helpers present in a newer kernel.  For this reason, programs must check
+the value of __kuser_helper_version (see below) before assuming that it is
+safe to call any particular helper.  This check should ideally be
+performed only once at process startup time, and execution aborted early
+if the required helpers are not provided by the kernel version that
+process is running on.
+
+kuser_helper_version
+--------------------
+
+Location:	0xffff0ffc
+
+Reference declaration::
+
+  extern int32_t __kuser_helper_version;
+
+Definition:
+
+  This field contains the number of helpers being implemented by the
+  running kernel.  User space may read this to determine the availability
+  of a particular helper.
+
+Usage example::
+
+  #define __kuser_helper_version (*(int32_t *)0xffff0ffc)
+
+  void check_kuser_version(void)
+  {
+	if (__kuser_helper_version < 2) {
+		fprintf(stderr, "can't do atomic operations, kernel too old\n");
+		abort();
+	}
+  }
+
+Notes:
+
+  User space may assume that the value of this field never changes
+  during the lifetime of any single process.  This means that this
+  field can be read once during the initialisation of a library or
+  startup phase of a program.
+
+kuser_get_tls
+-------------
+
+Location:	0xffff0fe0
+
+Reference prototype::
+
+  void * __kuser_get_tls(void);
+
+Input:
+
+  lr = return address
+
+Output:
+
+  r0 = TLS value
+
+Clobbered registers:
+
+  none
+
+Definition:
+
+  Get the TLS value as previously set via the __ARM_NR_set_tls syscall.
+
+Usage example::
+
+  typedef void * (__kuser_get_tls_t)(void);
+  #define __kuser_get_tls (*(__kuser_get_tls_t *)0xffff0fe0)
+
+  void foo()
+  {
+	void *tls = __kuser_get_tls();
+	printf("TLS = %p\n", tls);
+  }
+
+Notes:
+
+  - Valid only if __kuser_helper_version >= 1 (from kernel version 2.6.12).
+
+kuser_cmpxchg
+-------------
+
+Location:	0xffff0fc0
+
+Reference prototype::
+
+  int __kuser_cmpxchg(int32_t oldval, int32_t newval, volatile int32_t *ptr);
+
+Input:
+
+  r0 = oldval
+  r1 = newval
+  r2 = ptr
+  lr = return address
+
+Output:
+
+  r0 = success code (zero or non-zero)
+  C flag = set if r0 == 0, clear if r0 != 0
+
+Clobbered registers:
+
+  r3, ip, flags
+
+Definition:
+
+  Atomically store newval in `*ptr` only if `*ptr` is equal to oldval.
+  Return zero if `*ptr` was changed or non-zero if no exchange happened.
+  The C flag is also set if `*ptr` was changed to allow for assembly
+  optimization in the calling code.
+
+Usage example::
+
+  typedef int (__kuser_cmpxchg_t)(int oldval, int newval, volatile int *ptr);
+  #define __kuser_cmpxchg (*(__kuser_cmpxchg_t *)0xffff0fc0)
+
+  int atomic_add(volatile int *ptr, int val)
+  {
+	int old, new;
+
+	do {
+		old = *ptr;
+		new = old + val;
+	} while(__kuser_cmpxchg(old, new, ptr));
+
+	return new;
+  }
+
+Notes:
+
+  - This routine already includes memory barriers as needed.
+
+  - Valid only if __kuser_helper_version >= 2 (from kernel version 2.6.12).
+
+kuser_memory_barrier
+--------------------
+
+Location:	0xffff0fa0
+
+Reference prototype::
+
+  void __kuser_memory_barrier(void);
+
+Input:
+
+  lr = return address
+
+Output:
+
+  none
+
+Clobbered registers:
+
+  none
+
+Definition:
+
+  Apply any needed memory barrier to preserve consistency with data modified
+  manually and __kuser_cmpxchg usage.
+
+Usage example::
+
+  typedef void (__kuser_dmb_t)(void);
+  #define __kuser_dmb (*(__kuser_dmb_t *)0xffff0fa0)
+
+Notes:
+
+  - Valid only if __kuser_helper_version >= 3 (from kernel version 2.6.15).
+
+kuser_cmpxchg64
+---------------
+
+Location:	0xffff0f60
+
+Reference prototype::
+
+  int __kuser_cmpxchg64(const int64_t *oldval,
+                        const int64_t *newval,
+                        volatile int64_t *ptr);
+
+Input:
+
+  r0 = pointer to oldval
+  r1 = pointer to newval
+  r2 = pointer to target value
+  lr = return address
+
+Output:
+
+  r0 = success code (zero or non-zero)
+  C flag = set if r0 == 0, clear if r0 != 0
+
+Clobbered registers:
+
+  r3, lr, flags
+
+Definition:
+
+  Atomically store the 64-bit value pointed by `*newval` in `*ptr` only if `*ptr`
+  is equal to the 64-bit value pointed by `*oldval`.  Return zero if `*ptr` was
+  changed or non-zero if no exchange happened.
+
+  The C flag is also set if `*ptr` was changed to allow for assembly
+  optimization in the calling code.
+
+Usage example::
+
+  typedef int (__kuser_cmpxchg64_t)(const int64_t *oldval,
+                                    const int64_t *newval,
+                                    volatile int64_t *ptr);
+  #define __kuser_cmpxchg64 (*(__kuser_cmpxchg64_t *)0xffff0f60)
+
+  int64_t atomic_add64(volatile int64_t *ptr, int64_t val)
+  {
+	int64_t old, new;
+
+	do {
+		old = *ptr;
+		new = old + val;
+	} while(__kuser_cmpxchg64(&old, &new, ptr));
+
+	return new;
+  }
+
+Notes:
+
+  - This routine already includes memory barriers as needed.
+
+  - Due to the length of this sequence, this spans 2 conventional kuser
+    "slots", therefore 0xffff0f80 is not used as a valid entry point.
+
+  - Valid only if __kuser_helper_version >= 5 (from kernel version 3.1).
diff --git a/Documentation/arm/kernel_user_helpers.txt b/Documentation/arm/kernel_user_helpers.txt
deleted file mode 100644
index 5673594717cf..000000000000
--- a/Documentation/arm/kernel_user_helpers.txt
+++ /dev/null
@@ -1,267 +0,0 @@
-Kernel-provided User Helpers
-============================
-
-These are segment of kernel provided user code reachable from user space
-at a fixed address in kernel memory.  This is used to provide user space
-with some operations which require kernel help because of unimplemented
-native feature and/or instructions in many ARM CPUs. The idea is for this
-code to be executed directly in user mode for best efficiency but which is
-too intimate with the kernel counter part to be left to user libraries.
-In fact this code might even differ from one CPU to another depending on
-the available instruction set, or whether it is a SMP systems. In other
-words, the kernel reserves the right to change this code as needed without
-warning. Only the entry points and their results as documented here are
-guaranteed to be stable.
-
-This is different from (but doesn't preclude) a full blown VDSO
-implementation, however a VDSO would prevent some assembly tricks with
-constants that allows for efficient branching to those code segments. And
-since those code segments only use a few cycles before returning to user
-code, the overhead of a VDSO indirect far call would add a measurable
-overhead to such minimalistic operations.
-
-User space is expected to bypass those helpers and implement those things
-inline (either in the code emitted directly by the compiler, or part of
-the implementation of a library call) when optimizing for a recent enough
-processor that has the necessary native support, but only if resulting
-binaries are already to be incompatible with earlier ARM processors due to
-usage of similar native instructions for other things.  In other words
-don't make binaries unable to run on earlier processors just for the sake
-of not using these kernel helpers if your compiled code is not going to
-use new instructions for other purpose.
-
-New helpers may be added over time, so an older kernel may be missing some
-helpers present in a newer kernel.  For this reason, programs must check
-the value of __kuser_helper_version (see below) before assuming that it is
-safe to call any particular helper.  This check should ideally be
-performed only once at process startup time, and execution aborted early
-if the required helpers are not provided by the kernel version that
-process is running on.
-
-kuser_helper_version
---------------------
-
-Location:	0xffff0ffc
-
-Reference declaration:
-
-  extern int32_t __kuser_helper_version;
-
-Definition:
-
-  This field contains the number of helpers being implemented by the
-  running kernel.  User space may read this to determine the availability
-  of a particular helper.
-
-Usage example:
-
-#define __kuser_helper_version (*(int32_t *)0xffff0ffc)
-
-void check_kuser_version(void)
-{
-	if (__kuser_helper_version < 2) {
-		fprintf(stderr, "can't do atomic operations, kernel too old\n");
-		abort();
-	}
-}
-
-Notes:
-
-  User space may assume that the value of this field never changes
-  during the lifetime of any single process.  This means that this
-  field can be read once during the initialisation of a library or
-  startup phase of a program.
-
-kuser_get_tls
--------------
-
-Location:	0xffff0fe0
-
-Reference prototype:
-
-  void * __kuser_get_tls(void);
-
-Input:
-
-  lr = return address
-
-Output:
-
-  r0 = TLS value
-
-Clobbered registers:
-
-  none
-
-Definition:
-
-  Get the TLS value as previously set via the __ARM_NR_set_tls syscall.
-
-Usage example:
-
-typedef void * (__kuser_get_tls_t)(void);
-#define __kuser_get_tls (*(__kuser_get_tls_t *)0xffff0fe0)
-
-void foo()
-{
-	void *tls = __kuser_get_tls();
-	printf("TLS = %p\n", tls);
-}
-
-Notes:
-
-  - Valid only if __kuser_helper_version >= 1 (from kernel version 2.6.12).
-
-kuser_cmpxchg
--------------
-
-Location:	0xffff0fc0
-
-Reference prototype:
-
-  int __kuser_cmpxchg(int32_t oldval, int32_t newval, volatile int32_t *ptr);
-
-Input:
-
-  r0 = oldval
-  r1 = newval
-  r2 = ptr
-  lr = return address
-
-Output:
-
-  r0 = success code (zero or non-zero)
-  C flag = set if r0 == 0, clear if r0 != 0
-
-Clobbered registers:
-
-  r3, ip, flags
-
-Definition:
-
-  Atomically store newval in *ptr only if *ptr is equal to oldval.
-  Return zero if *ptr was changed or non-zero if no exchange happened.
-  The C flag is also set if *ptr was changed to allow for assembly
-  optimization in the calling code.
-
-Usage example:
-
-typedef int (__kuser_cmpxchg_t)(int oldval, int newval, volatile int *ptr);
-#define __kuser_cmpxchg (*(__kuser_cmpxchg_t *)0xffff0fc0)
-
-int atomic_add(volatile int *ptr, int val)
-{
-	int old, new;
-
-	do {
-		old = *ptr;
-		new = old + val;
-	} while(__kuser_cmpxchg(old, new, ptr));
-
-	return new;
-}
-
-Notes:
-
-  - This routine already includes memory barriers as needed.
-
-  - Valid only if __kuser_helper_version >= 2 (from kernel version 2.6.12).
-
-kuser_memory_barrier
---------------------
-
-Location:	0xffff0fa0
-
-Reference prototype:
-
-  void __kuser_memory_barrier(void);
-
-Input:
-
-  lr = return address
-
-Output:
-
-  none
-
-Clobbered registers:
-
-  none
-
-Definition:
-
-  Apply any needed memory barrier to preserve consistency with data modified
-  manually and __kuser_cmpxchg usage.
-
-Usage example:
-
-typedef void (__kuser_dmb_t)(void);
-#define __kuser_dmb (*(__kuser_dmb_t *)0xffff0fa0)
-
-Notes:
-
-  - Valid only if __kuser_helper_version >= 3 (from kernel version 2.6.15).
-
-kuser_cmpxchg64
----------------
-
-Location:	0xffff0f60
-
-Reference prototype:
-
-  int __kuser_cmpxchg64(const int64_t *oldval,
-                        const int64_t *newval,
-                        volatile int64_t *ptr);
-
-Input:
-
-  r0 = pointer to oldval
-  r1 = pointer to newval
-  r2 = pointer to target value
-  lr = return address
-
-Output:
-
-  r0 = success code (zero or non-zero)
-  C flag = set if r0 == 0, clear if r0 != 0
-
-Clobbered registers:
-
-  r3, lr, flags
-
-Definition:
-
-  Atomically store the 64-bit value pointed by *newval in *ptr only if *ptr
-  is equal to the 64-bit value pointed by *oldval.  Return zero if *ptr was
-  changed or non-zero if no exchange happened.
-
-  The C flag is also set if *ptr was changed to allow for assembly
-  optimization in the calling code.
-
-Usage example:
-
-typedef int (__kuser_cmpxchg64_t)(const int64_t *oldval,
-                                  const int64_t *newval,
-                                  volatile int64_t *ptr);
-#define __kuser_cmpxchg64 (*(__kuser_cmpxchg64_t *)0xffff0f60)
-
-int64_t atomic_add64(volatile int64_t *ptr, int64_t val)
-{
-	int64_t old, new;
-
-	do {
-		old = *ptr;
-		new = old + val;
-	} while(__kuser_cmpxchg64(&old, &new, ptr));
-
-	return new;
-}
-
-Notes:
-
-  - This routine already includes memory barriers as needed.
-
-  - Due to the length of this sequence, this spans 2 conventional kuser
-    "slots", therefore 0xffff0f80 is not used as a valid entry point.
-
-  - Valid only if __kuser_helper_version >= 5 (from kernel version 3.1).
diff --git a/Documentation/arm/keystone/Overview.txt b/Documentation/arm/keystone/Overview.txt
deleted file mode 100644
index 400c0c270d2e..000000000000
--- a/Documentation/arm/keystone/Overview.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-		TI Keystone Linux Overview
-		--------------------------
-
-Introduction
-------------
-Keystone range of SoCs are based on ARM Cortex-A15 MPCore Processors
-and c66x DSP cores. This document describes essential information required
-for users to run Linux on Keystone based EVMs from Texas Instruments.
-
-Following SoCs  & EVMs are currently supported:-
-
------------- K2HK SoC and EVM --------------------------------------------------
-
-a.k.a Keystone 2 Hawking/Kepler SoC
-TCI6636K2H & TCI6636K2K: See documentation at
-	http://www.ti.com/product/tci6638k2k
-	http://www.ti.com/product/tci6638k2h
-
-EVM:
-http://www.advantech.com/Support/TI-EVM/EVMK2HX_sd.aspx
-
------------- K2E SoC and EVM ---------------------------------------------------
-
-a.k.a Keystone 2 Edison SoC
-K2E  -  66AK2E05: See documentation at
-	http://www.ti.com/product/66AK2E05/technicaldocuments
-
-EVM:
-https://www.einfochips.com/index.php/partnerships/texas-instruments/k2e-evm.html
-
------------- K2L SoC and EVM ---------------------------------------------------
-
-a.k.a Keystone 2 Lamarr SoC
-K2L  -  TCI6630K2L: See documentation at
-	http://www.ti.com/product/TCI6630K2L/technicaldocuments
-EVM:
-https://www.einfochips.com/index.php/partnerships/texas-instruments/k2l-evm.html
-
-Configuration
--------------
-
-All of the K2 SoCs/EVMs share a common defconfig, keystone_defconfig and same
-image is used to boot on individual EVMs. The platform configuration is
-specified through DTS. Following are the DTS used:-
-	K2HK EVM : k2hk-evm.dts
-	K2E EVM  : k2e-evm.dts
-	K2L EVM  : k2l-evm.dts
-
-The device tree documentation for the keystone machines are located at
-        Documentation/devicetree/bindings/arm/keystone/keystone.txt
-
-Document Author
----------------
-Murali Karicheri <m-karicheri2@ti.com>
-Copyright 2015 Texas Instruments
diff --git a/Documentation/arm/keystone/knav-qmss.rst b/Documentation/arm/keystone/knav-qmss.rst
new file mode 100644
index 000000000000..7f7638d80b42
--- /dev/null
+++ b/Documentation/arm/keystone/knav-qmss.rst
@@ -0,0 +1,60 @@
+======================================================================
+Texas Instruments Keystone Navigator Queue Management SubSystem driver
+======================================================================
+
+Driver source code path
+  drivers/soc/ti/knav_qmss.c
+  drivers/soc/ti/knav_qmss_acc.c
+
+The QMSS (Queue Manager Sub System) found on Keystone SOCs is one of
+the main hardware sub system which forms the backbone of the Keystone
+multi-core Navigator. QMSS consist of queue managers, packed-data structure
+processors(PDSP), linking RAM, descriptor pools and infrastructure
+Packet DMA.
+The Queue Manager is a hardware module that is responsible for accelerating
+management of the packet queues. Packets are queued/de-queued by writing or
+reading descriptor address to a particular memory mapped location. The PDSPs
+perform QMSS related functions like accumulation, QoS, or event management.
+Linking RAM registers are used to link the descriptors which are stored in
+descriptor RAM. Descriptor RAM is configurable as internal or external memory.
+The QMSS driver manages the PDSP setups, linking RAM regions,
+queue pool management (allocation, push, pop and notify) and descriptor
+pool management.
+
+knav qmss driver provides a set of APIs to drivers to open/close qmss queues,
+allocate descriptor pools, map the descriptors, push/pop to queues etc. For
+details of the available APIs, please refers to include/linux/soc/ti/knav_qmss.h
+
+DT documentation is available at
+Documentation/devicetree/bindings/soc/ti/keystone-navigator-qmss.txt
+
+Accumulator QMSS queues using PDSP firmware
+============================================
+The QMSS PDSP firmware support accumulator channel that can monitor a single
+queue or multiple contiguous queues. drivers/soc/ti/knav_qmss_acc.c is the
+driver that interface with the accumulator PDSP. This configures
+accumulator channels defined in DTS (example in DT documentation) to monitor
+1 or 32 queues per channel. More description on the firmware is available in
+CPPI/QMSS Low Level Driver document (docs/CPPI_QMSS_LLD_SDS.pdf) at
+
+	git://git.ti.com/keystone-rtos/qmss-lld.git
+
+k2_qmss_pdsp_acc48_k2_le_1_0_0_9.bin firmware supports upto 48 accumulator
+channels. This firmware is available under ti-keystone folder of
+firmware.git at
+
+   git://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git
+
+To use copy the firmware image to lib/firmware folder of the initramfs or
+ubifs file system and provide a sym link to k2_qmss_pdsp_acc48_k2_le_1_0_0_9.bin
+in the file system and boot up the kernel. User would see
+
+ "firmware file ks2_qmss_pdsp_acc48.bin downloaded for PDSP"
+
+in the boot up log if loading of firmware to PDSP is successful.
+
+Use of accumulated queues requires the firmware image to be present in the
+file system. The driver doesn't acc queues to the supported queue range if
+PDSP is not running in the SoC. The API call fails if there is a queue open
+request to an acc queue and PDSP is not running. So make sure to copy firmware
+to file system before using these queue types.
diff --git a/Documentation/arm/keystone/knav-qmss.txt b/Documentation/arm/keystone/knav-qmss.txt
deleted file mode 100644
index fcdb9fd5f53a..000000000000
--- a/Documentation/arm/keystone/knav-qmss.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-* Texas Instruments Keystone Navigator Queue Management SubSystem driver
-
-Driver source code path
-  drivers/soc/ti/knav_qmss.c
-  drivers/soc/ti/knav_qmss_acc.c
-
-The QMSS (Queue Manager Sub System) found on Keystone SOCs is one of
-the main hardware sub system which forms the backbone of the Keystone
-multi-core Navigator. QMSS consist of queue managers, packed-data structure
-processors(PDSP), linking RAM, descriptor pools and infrastructure
-Packet DMA.
-The Queue Manager is a hardware module that is responsible for accelerating
-management of the packet queues. Packets are queued/de-queued by writing or
-reading descriptor address to a particular memory mapped location. The PDSPs
-perform QMSS related functions like accumulation, QoS, or event management.
-Linking RAM registers are used to link the descriptors which are stored in
-descriptor RAM. Descriptor RAM is configurable as internal or external memory.
-The QMSS driver manages the PDSP setups, linking RAM regions,
-queue pool management (allocation, push, pop and notify) and descriptor
-pool management.
-
-knav qmss driver provides a set of APIs to drivers to open/close qmss queues,
-allocate descriptor pools, map the descriptors, push/pop to queues etc. For
-details of the available APIs, please refers to include/linux/soc/ti/knav_qmss.h
-
-DT documentation is available at
-Documentation/devicetree/bindings/soc/ti/keystone-navigator-qmss.txt
-
-Accumulator QMSS queues using PDSP firmware
-============================================
-The QMSS PDSP firmware support accumulator channel that can monitor a single
-queue or multiple contiguous queues. drivers/soc/ti/knav_qmss_acc.c is the
-driver that interface with the accumulator PDSP. This configures
-accumulator channels defined in DTS (example in DT documentation) to monitor
-1 or 32 queues per channel. More description on the firmware is available in
-CPPI/QMSS Low Level Driver document (docs/CPPI_QMSS_LLD_SDS.pdf) at
-	git://git.ti.com/keystone-rtos/qmss-lld.git
-
-k2_qmss_pdsp_acc48_k2_le_1_0_0_9.bin firmware supports upto 48 accumulator
-channels. This firmware is available under ti-keystone folder of
-firmware.git at
-   git://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git
-
-To use copy the firmware image to lib/firmware folder of the initramfs or
-ubifs file system and provide a sym link to k2_qmss_pdsp_acc48_k2_le_1_0_0_9.bin
-in the file system and boot up the kernel. User would see
-
- "firmware file ks2_qmss_pdsp_acc48.bin downloaded for PDSP"
-
-in the boot up log if loading of firmware to PDSP is successful.
-
-Use of accumulated queues requires the firmware image to be present in the
-file system. The driver doesn't acc queues to the supported queue range if
-PDSP is not running in the SoC. The API call fails if there is a queue open
-request to an acc queue and PDSP is not running. So make sure to copy firmware
-to file system before using these queue types.
diff --git a/Documentation/arm/keystone/overview.rst b/Documentation/arm/keystone/overview.rst
new file mode 100644
index 000000000000..cd90298c493c
--- /dev/null
+++ b/Documentation/arm/keystone/overview.rst
@@ -0,0 +1,74 @@
+==========================
+TI Keystone Linux Overview
+==========================
+
+Introduction
+------------
+Keystone range of SoCs are based on ARM Cortex-A15 MPCore Processors
+and c66x DSP cores. This document describes essential information required
+for users to run Linux on Keystone based EVMs from Texas Instruments.
+
+Following SoCs  & EVMs are currently supported:-
+
+K2HK SoC and EVM
+=================
+
+a.k.a Keystone 2 Hawking/Kepler SoC
+TCI6636K2H & TCI6636K2K: See documentation at
+
+	http://www.ti.com/product/tci6638k2k
+	http://www.ti.com/product/tci6638k2h
+
+EVM:
+  http://www.advantech.com/Support/TI-EVM/EVMK2HX_sd.aspx
+
+K2E SoC and EVM
+===============
+
+a.k.a Keystone 2 Edison SoC
+
+K2E  -  66AK2E05:
+
+See documentation at
+
+	http://www.ti.com/product/66AK2E05/technicaldocuments
+
+EVM:
+   https://www.einfochips.com/index.php/partnerships/texas-instruments/k2e-evm.html
+
+K2L SoC and EVM
+===============
+
+a.k.a Keystone 2 Lamarr SoC
+
+K2L  -  TCI6630K2L:
+
+See documentation at
+	http://www.ti.com/product/TCI6630K2L/technicaldocuments
+
+EVM:
+  https://www.einfochips.com/index.php/partnerships/texas-instruments/k2l-evm.html
+
+Configuration
+-------------
+
+All of the K2 SoCs/EVMs share a common defconfig, keystone_defconfig and same
+image is used to boot on individual EVMs. The platform configuration is
+specified through DTS. Following are the DTS used:
+
+	K2HK EVM:
+		k2hk-evm.dts
+	K2E EVM:
+		k2e-evm.dts
+	K2L EVM:
+		k2l-evm.dts
+
+The device tree documentation for the keystone machines are located at
+
+        Documentation/devicetree/bindings/arm/keystone/keystone.txt
+
+Document Author
+---------------
+Murali Karicheri <m-karicheri2@ti.com>
+
+Copyright 2015 Texas Instruments
diff --git a/Documentation/arm/marvel.rst b/Documentation/arm/marvel.rst
new file mode 100644
index 000000000000..16ab2eb085b8
--- /dev/null
+++ b/Documentation/arm/marvel.rst
@@ -0,0 +1,488 @@
+================
+ARM Marvell SoCs
+================
+
+This document lists all the ARM Marvell SoCs that are currently
+supported in mainline by the Linux kernel. As the Marvell families of
+SoCs are large and complex, it is hard to understand where the support
+for a particular SoC is available in the Linux kernel. This document
+tries to help in understanding where those SoCs are supported, and to
+match them with their corresponding public datasheet, when available.
+
+Orion family
+------------
+
+  Flavors:
+        - 88F5082
+        - 88F5181
+        - 88F5181L
+        - 88F5182
+
+               - Datasheet: http://www.embeddedarm.com/documentation/third-party/MV88F5182-datasheet.pdf
+               - Programmer's User Guide: http://www.embeddedarm.com/documentation/third-party/MV88F5182-opensource-manual.pdf
+               - User Manual: http://www.embeddedarm.com/documentation/third-party/MV88F5182-usermanual.pdf
+        - 88F5281
+
+               - Datasheet: http://www.ocmodshop.com/images/reviews/networking/qnap_ts409u/marvel_88f5281_data_sheet.pdf
+        - 88F6183
+  Core:
+	Feroceon 88fr331 (88f51xx) or 88fr531-vd (88f52xx) ARMv5 compatible
+  Linux kernel mach directory:
+	arch/arm/mach-orion5x
+  Linux kernel plat directory:
+	arch/arm/plat-orion
+
+Kirkwood family
+---------------
+
+  Flavors:
+        - 88F6282 a.k.a Armada 300
+
+                - Product Brief  : http://www.marvell.com/embedded-processors/armada-300/assets/armada_310.pdf
+        - 88F6283 a.k.a Armada 310
+
+                - Product Brief  : http://www.marvell.com/embedded-processors/armada-300/assets/armada_310.pdf
+        - 88F6190
+
+                - Product Brief  : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6190-003_WEB.pdf
+                - Hardware Spec  : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F619x_OpenSource.pdf
+                - Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
+        - 88F6192
+
+                - Product Brief  : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6192-003_ver1.pdf
+                - Hardware Spec  : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F619x_OpenSource.pdf
+                - Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
+        - 88F6182
+        - 88F6180
+
+                - Product Brief  : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6180-003_ver1.pdf
+                - Hardware Spec  : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F6180_OpenSource.pdf
+                - Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
+        - 88F6281
+
+                - Product Brief  : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6281-004_ver1.pdf
+                - Hardware Spec  : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F6281_OpenSource.pdf
+                - Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
+  Homepage:
+	http://www.marvell.com/embedded-processors/kirkwood/
+  Core:
+	Feroceon 88fr131 ARMv5 compatible
+  Linux kernel mach directory:
+	arch/arm/mach-mvebu
+  Linux kernel plat directory:
+	none
+
+Discovery family
+----------------
+
+  Flavors:
+        - MV78100
+
+                - Product Brief  : http://www.marvell.com/embedded-processors/discovery-innovation/assets/MV78100-003_WEB.pdf
+                - Hardware Spec  : http://www.marvell.com/embedded-processors/discovery-innovation/assets/HW_MV78100_OpenSource.pdf
+                - Functional Spec: http://www.marvell.com/embedded-processors/discovery-innovation/assets/FS_MV76100_78100_78200_OpenSource.pdf
+        - MV78200
+
+                - Product Brief  : http://www.marvell.com/embedded-processors/discovery-innovation/assets/MV78200-002_WEB.pdf
+                - Hardware Spec  : http://www.marvell.com/embedded-processors/discovery-innovation/assets/HW_MV78200_OpenSource.pdf
+                - Functional Spec: http://www.marvell.com/embedded-processors/discovery-innovation/assets/FS_MV76100_78100_78200_OpenSource.pdf
+        - MV76100
+
+                Not supported by the Linux kernel.
+
+  Core:
+	Feroceon 88fr571-vd ARMv5 compatible
+
+  Linux kernel mach directory:
+	arch/arm/mach-mv78xx0
+  Linux kernel plat directory:
+	arch/arm/plat-orion
+
+EBU Armada family
+-----------------
+
+  Armada 370 Flavors:
+        - 88F6710
+        - 88F6707
+        - 88F6W11
+
+    - Product Brief:   http://www.marvell.com/embedded-processors/armada-300/assets/Marvell_ARMADA_370_SoC.pdf
+    - Hardware Spec:   http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-datasheet.pdf
+    - Functional Spec: http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-FunctionalSpec-datasheet.pdf
+
+  Core:
+	Sheeva ARMv7 compatible PJ4B
+
+  Armada 375 Flavors:
+	- 88F6720
+
+    - Product Brief: http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA_375_SoC-01_product_brief.pdf
+
+  Core:
+	ARM Cortex-A9
+
+  Armada 38x Flavors:
+	- 88F6810	Armada 380
+	- 88F6820 Armada 385
+	- 88F6828 Armada 388
+
+    - Product infos:   http://www.marvell.com/embedded-processors/armada-38x/
+    - Functional Spec: https://marvellcorp.wufoo.com/forms/marvell-armada-38x-functional-specifications/
+
+  Core:
+	ARM Cortex-A9
+
+  Armada 39x Flavors:
+	- 88F6920 Armada 390
+	- 88F6928 Armada 398
+
+    - Product infos: http://www.marvell.com/embedded-processors/armada-39x/
+
+  Core:
+	ARM Cortex-A9
+
+  Armada XP Flavors:
+        - MV78230
+        - MV78260
+        - MV78460
+
+    NOTE:
+	not to be confused with the non-SMP 78xx0 SoCs
+
+    Product Brief:
+	http://www.marvell.com/embedded-processors/armada-xp/assets/Marvell-ArmadaXP-SoC-product%20brief.pdf
+
+    Functional Spec:
+	http://www.marvell.com/embedded-processors/armada-xp/assets/ARMADA-XP-Functional-SpecDatasheet.pdf
+
+    - Hardware Specs:
+
+        - http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78230_OS.PDF
+        - http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78260_OS.PDF
+        - http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78460_OS.PDF
+
+  Core:
+	Sheeva ARMv7 compatible Dual-core or Quad-core PJ4B-MP
+
+  Linux kernel mach directory:
+	arch/arm/mach-mvebu
+  Linux kernel plat directory:
+	none
+
+EBU Armada family ARMv8
+-----------------------
+
+  Armada 3710/3720 Flavors:
+	- 88F3710
+	- 88F3720
+
+  Core:
+	ARM Cortex A53 (ARMv8)
+
+  Homepage:
+	http://www.marvell.com/embedded-processors/armada-3700/
+
+  Product Brief:
+	http://www.marvell.com/embedded-processors/assets/PB-88F3700-FNL.pdf
+
+  Device tree files:
+	arch/arm64/boot/dts/marvell/armada-37*
+
+  Armada 7K Flavors:
+	  - 88F7020 (AP806 Dual + one CP110)
+	  - 88F7040 (AP806 Quad + one CP110)
+
+  Core: ARM Cortex A72
+
+  Homepage:
+	http://www.marvell.com/embedded-processors/armada-70xx/
+
+  Product Brief:
+	  - http://www.marvell.com/embedded-processors/assets/Armada7020PB-Jan2016.pdf
+	  - http://www.marvell.com/embedded-processors/assets/Armada7040PB-Jan2016.pdf
+
+  Device tree files:
+	arch/arm64/boot/dts/marvell/armada-70*
+
+  Armada 8K Flavors:
+	- 88F8020 (AP806 Dual + two CP110)
+	- 88F8040 (AP806 Quad + two CP110)
+  Core:
+	ARM Cortex A72
+
+  Homepage:
+	http://www.marvell.com/embedded-processors/armada-80xx/
+
+  Product Brief:
+	  - http://www.marvell.com/embedded-processors/assets/Armada8020PB-Jan2016.pdf
+	  - http://www.marvell.com/embedded-processors/assets/Armada8040PB-Jan2016.pdf
+
+  Device tree files:
+	arch/arm64/boot/dts/marvell/armada-80*
+
+Avanta family
+-------------
+
+  Flavors:
+       - 88F6510
+       - 88F6530P
+       - 88F6550
+       - 88F6560
+
+  Homepage:
+	http://www.marvell.com/broadband/
+
+  Product Brief:
+	http://www.marvell.com/broadband/assets/Marvell_Avanta_88F6510_305_060-001_product_brief.pdf
+
+  No public datasheet available.
+
+  Core:
+	ARMv5 compatible
+
+  Linux kernel mach directory:
+	no code in mainline yet, planned for the future
+  Linux kernel plat directory:
+	no code in mainline yet, planned for the future
+
+Storage family
+--------------
+
+  Armada SP:
+	- 88RC1580
+
+  Product infos:
+	http://www.marvell.com/storage/armada-sp/
+
+  Core:
+	Sheeva ARMv7 comatible Quad-core PJ4C
+
+  (not supported in upstream Linux kernel)
+
+Dove family (application processor)
+-----------------------------------
+
+  Flavors:
+        - 88AP510 a.k.a Armada 510
+
+   Product Brief:
+	http://www.marvell.com/application-processors/armada-500/assets/Marvell_Armada510_SoC.pdf
+
+   Hardware Spec:
+	http://www.marvell.com/application-processors/armada-500/assets/Armada-510-Hardware-Spec.pdf
+
+  Functional Spec:
+	http://www.marvell.com/application-processors/armada-500/assets/Armada-510-Functional-Spec.pdf
+
+  Homepage:
+	http://www.marvell.com/application-processors/armada-500/
+
+  Core:
+	ARMv7 compatible
+
+  Directory:
+	- arch/arm/mach-mvebu (DT enabled platforms)
+        - arch/arm/mach-dove (non-DT enabled platforms)
+
+PXA 2xx/3xx/93x/95x family
+--------------------------
+
+  Flavors:
+        - PXA21x, PXA25x, PXA26x
+             - Application processor only
+             - Core: ARMv5 XScale1 core
+        - PXA270, PXA271, PXA272
+             - Product Brief         : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_pb.pdf
+             - Design guide          : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_design_guide.pdf
+             - Developers manual     : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_dev_man.pdf
+             - Specification         : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_emts.pdf
+             - Specification update  : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_spec_update.pdf
+             - Application processor only
+             - Core: ARMv5 XScale2 core
+        - PXA300, PXA310, PXA320
+             - PXA 300 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA300_PB_R4.pdf
+             - PXA 310 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA310_PB_R4.pdf
+             - PXA 320 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA320_PB_R4.pdf
+             - Design guide          : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Design_Guide.pdf
+             - Developers manual     : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Developers_Manual.zip
+             - Specifications        : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_EMTS.pdf
+             - Specification Update  : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Spec_Update.zip
+             - Reference Manual      : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_TavorP_BootROM_Ref_Manual.pdf
+             - Application processor only
+             - Core: ARMv5 XScale3 core
+        - PXA930, PXA935
+             - Application processor with Communication processor
+             - Core: ARMv5 XScale3 core
+        - PXA955
+             - Application processor with Communication processor
+             - Core: ARMv7 compatible Sheeva PJ4 core
+
+   Comments:
+
+    * This line of SoCs originates from the XScale family developed by
+      Intel and acquired by Marvell in ~2006. The PXA21x, PXA25x,
+      PXA26x, PXA27x, PXA3xx and PXA93x were developed by Intel, while
+      the later PXA95x were developed by Marvell.
+
+    * Due to their XScale origin, these SoCs have virtually nothing in
+      common with the other (Kirkwood, Dove, etc.) families of Marvell
+      SoCs, except with the MMP/MMP2 family of SoCs.
+
+   Linux kernel mach directory:
+	arch/arm/mach-pxa
+   Linux kernel plat directory:
+	arch/arm/plat-pxa
+
+MMP/MMP2/MMP3 family (communication processor)
+----------------------------------------------
+
+   Flavors:
+        - PXA168, a.k.a Armada 168
+             - Homepage             : http://www.marvell.com/application-processors/armada-100/armada-168.jsp
+             - Product brief        : http://www.marvell.com/application-processors/armada-100/assets/pxa_168_pb.pdf
+             - Hardware manual      : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_datasheet.pdf
+             - Software manual      : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_software_manual.pdf
+             - Specification update : http://www.marvell.com/application-processors/armada-100/assets/ARMADA16x_Spec_update.pdf
+             - Boot ROM manual      : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_ref_manual.pdf
+             - App node package     : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_app_note_package.pdf
+             - Application processor only
+             - Core: ARMv5 compatible Marvell PJ1 88sv331 (Mohawk)
+        - PXA910/PXA920
+             - Homepage             : http://www.marvell.com/communication-processors/pxa910/
+             - Product Brief        : http://www.marvell.com/communication-processors/pxa910/assets/Marvell_PXA910_Platform-001_PB_final.pdf
+             - Application processor with Communication processor
+             - Core: ARMv5 compatible Marvell PJ1 88sv331 (Mohawk)
+        - PXA688, a.k.a. MMP2, a.k.a Armada 610
+             - Product Brief        : http://www.marvell.com/application-processors/armada-600/assets/armada610_pb.pdf
+             - Application processor only
+             - Core: ARMv7 compatible Sheeva PJ4 88sv581x core
+	- PXA2128, a.k.a. MMP3 (OLPC XO4, Linux support not upstream)
+	     - Product Brief	  : http://www.marvell.com/application-processors/armada/pxa2128/assets/Marvell-ARMADA-PXA2128-SoC-PB.pdf
+	     - Application processor only
+	     - Core: Dual-core ARMv7 compatible Sheeva PJ4C core
+	- PXA960/PXA968/PXA978 (Linux support not upstream)
+	     - Application processor with Communication Processor
+	     - Core: ARMv7 compatible Sheeva PJ4 core
+	- PXA986/PXA988 (Linux support not upstream)
+	     - Application processor with Communication Processor
+	     - Core: Dual-core ARMv7 compatible Sheeva PJ4B-MP core
+	- PXA1088/PXA1920 (Linux support not upstream)
+	     - Application processor with Communication Processor
+	     - Core: quad-core ARMv7 Cortex-A7
+	- PXA1908/PXA1928/PXA1936
+	     - Application processor with Communication Processor
+	     - Core: multi-core ARMv8 Cortex-A53
+
+   Comments:
+
+    * This line of SoCs originates from the XScale family developed by
+      Intel and acquired by Marvell in ~2006. All the processors of
+      this MMP/MMP2 family were developed by Marvell.
+
+    * Due to their XScale origin, these SoCs have virtually nothing in
+      common with the other (Kirkwood, Dove, etc.) families of Marvell
+      SoCs, except with the PXA family of SoCs listed above.
+
+   Linux kernel mach directory:
+	arch/arm/mach-mmp
+   Linux kernel plat directory:
+	arch/arm/plat-pxa
+
+Berlin family (Multimedia Solutions)
+-------------------------------------
+
+  - Flavors:
+	- 88DE3010, Armada 1000 (no Linux support)
+		- Core:		Marvell PJ1 (ARMv5TE), Dual-core
+		- Product Brief:	http://www.marvell.com.cn/digital-entertainment/assets/armada_1000_pb.pdf
+	- 88DE3005, Armada 1500 Mini
+		- Design name:	BG2CD
+		- Core:		ARM Cortex-A9, PL310 L2CC
+	- 88DE3006, Armada 1500 Mini Plus
+		- Design name:	BG2CDP
+		- Core:		Dual Core ARM Cortex-A7
+	- 88DE3100, Armada 1500
+		- Design name:	BG2
+		- Core:		Marvell PJ4B-MP (ARMv7), Tauros3 L2CC
+	- 88DE3114, Armada 1500 Pro
+		- Design name:	BG2Q
+		- Core:		Quad Core ARM Cortex-A9, PL310 L2CC
+	- 88DE3214, Armada 1500 Pro 4K
+		- Design name:	BG3
+		- Core:		ARM Cortex-A15, CA15 integrated L2CC
+	- 88DE3218, ARMADA 1500 Ultra
+		- Core:		ARM Cortex-A53
+
+  Homepage: https://www.synaptics.com/products/multimedia-solutions
+  Directory: arch/arm/mach-berlin
+
+  Comments:
+
+   * This line of SoCs is based on Marvell Sheeva or ARM Cortex CPUs
+     with Synopsys DesignWare (IRQ, GPIO, Timers, ...) and PXA IP (SDHCI, USB, ETH, ...).
+
+   * The Berlin family was acquired by Synaptics from Marvell in 2017.
+
+CPU Cores
+---------
+
+The XScale cores were designed by Intel, and shipped by Marvell in the older
+PXA processors. Feroceon is a Marvell designed core that developed in-house,
+and that evolved into Sheeva. The XScale and Feroceon cores were phased out
+over time and replaced with Sheeva cores in later products, which subsequently
+got replaced with licensed ARM Cortex-A cores.
+
+  XScale 1
+	CPUID 0x69052xxx
+	ARMv5, iWMMXt
+  XScale 2
+	CPUID 0x69054xxx
+	ARMv5, iWMMXt
+  XScale 3
+	CPUID 0x69056xxx or 0x69056xxx
+	ARMv5, iWMMXt
+  Feroceon-1850 88fr331 "Mohawk"
+	CPUID 0x5615331x or 0x41xx926x
+	ARMv5TE, single issue
+  Feroceon-2850 88fr531-vd "Jolteon"
+	CPUID 0x5605531x or 0x41xx926x
+	ARMv5TE, VFP, dual-issue
+  Feroceon 88fr571-vd "Jolteon"
+	CPUID 0x5615571x
+	ARMv5TE, VFP, dual-issue
+  Feroceon 88fr131 "Mohawk-D"
+	CPUID 0x5625131x
+	ARMv5TE, single-issue in-order
+  Sheeva PJ1 88sv331 "Mohawk"
+	CPUID 0x561584xx
+	ARMv5, single-issue iWMMXt v2
+  Sheeva PJ4 88sv581x "Flareon"
+	CPUID 0x560f581x
+	ARMv7, idivt, optional iWMMXt v2
+  Sheeva PJ4B 88sv581x
+	CPUID 0x561f581x
+	ARMv7, idivt, optional iWMMXt v2
+  Sheeva PJ4B-MP / PJ4C
+	CPUID 0x562f584x
+	ARMv7, idivt/idiva, LPAE, optional iWMMXt v2 and/or NEON
+
+Long-term plans
+---------------
+
+ * Unify the mach-dove/, mach-mv78xx0/, mach-orion5x/ into the
+   mach-mvebu/ to support all SoCs from the Marvell EBU (Engineering
+   Business Unit) in a single mach-<foo> directory. The plat-orion/
+   would therefore disappear.
+
+ * Unify the mach-mmp/ and mach-pxa/ into the same mach-pxa
+   directory. The plat-pxa/ would therefore disappear.
+
+Credits
+-------
+
+- Maen Suleiman <maen@marvell.com>
+- Lior Amsalem <alior@marvell.com>
+- Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
+- Andrew Lunn <andrew@lunn.ch>
+- Nicolas Pitre <nico@fluxnic.net>
+- Eric Miao <eric.y.miao@gmail.com>
diff --git a/Documentation/arm/mem_alignment b/Documentation/arm/mem_alignment
deleted file mode 100644
index 6335fcacbba9..000000000000
--- a/Documentation/arm/mem_alignment
+++ /dev/null
@@ -1,58 +0,0 @@
-Too many problems poped up because of unnoticed misaligned memory access in
-kernel code lately.  Therefore the alignment fixup is now unconditionally
-configured in for SA11x0 based targets.  According to Alan Cox, this is a
-bad idea to configure it out, but Russell King has some good reasons for
-doing so on some f***ed up ARM architectures like the EBSA110.  However
-this is not the case on many design I'm aware of, like all SA11x0 based
-ones.
-
-Of course this is a bad idea to rely on the alignment trap to perform
-unaligned memory access in general.  If those access are predictable, you
-are better to use the macros provided by include/asm/unaligned.h.  The
-alignment trap can fixup misaligned access for the exception cases, but at
-a high performance cost.  It better be rare.
-
-Now for user space applications, it is possible to configure the alignment
-trap to SIGBUS any code performing unaligned access (good for debugging bad
-code), or even fixup the access by software like for kernel code.  The later
-mode isn't recommended for performance reasons (just think about the
-floating point emulation that works about the same way).  Fix your code
-instead!
-
-Please note that randomly changing the behaviour without good thought is
-real bad - it changes the behaviour of all unaligned instructions in user
-space, and might cause programs to fail unexpectedly.
-
-To change the alignment trap behavior, simply echo a number into
-/proc/cpu/alignment.  The number is made up from various bits:
-
-bit		behavior when set
----		-----------------
-
-0		A user process performing an unaligned memory access
-		will cause the kernel to print a message indicating
-		process name, pid, pc, instruction, address, and the
-		fault code.
-
-1		The kernel will attempt to fix up the user process
-		performing the unaligned access.  This is of course
-		slow (think about the floating point emulator) and
-		not recommended for production use.
-
-2		The kernel will send a SIGBUS signal to the user process
-		performing the unaligned access.
-
-Note that not all combinations are supported - only values 0 through 5.
-(6 and 7 don't make sense).
-
-For example, the following will turn on the warnings, but without
-fixing up or sending SIGBUS signals:
-
-	echo 1 > /proc/cpu/alignment
-
-You can also read the content of the same file to get statistical
-information on unaligned access occurrences plus the current mode of
-operation for user space code.
-
-
-Nicolas Pitre, Mar 13, 2001.  Modified Russell King, Nov 30, 2001.
diff --git a/Documentation/arm/mem_alignment.rst b/Documentation/arm/mem_alignment.rst
new file mode 100644
index 000000000000..aa22893b62bc
--- /dev/null
+++ b/Documentation/arm/mem_alignment.rst
@@ -0,0 +1,63 @@
+================
+Memory alignment
+================
+
+Too many problems popped up because of unnoticed misaligned memory access in
+kernel code lately.  Therefore the alignment fixup is now unconditionally
+configured in for SA11x0 based targets.  According to Alan Cox, this is a
+bad idea to configure it out, but Russell King has some good reasons for
+doing so on some f***ed up ARM architectures like the EBSA110.  However
+this is not the case on many design I'm aware of, like all SA11x0 based
+ones.
+
+Of course this is a bad idea to rely on the alignment trap to perform
+unaligned memory access in general.  If those access are predictable, you
+are better to use the macros provided by include/asm/unaligned.h.  The
+alignment trap can fixup misaligned access for the exception cases, but at
+a high performance cost.  It better be rare.
+
+Now for user space applications, it is possible to configure the alignment
+trap to SIGBUS any code performing unaligned access (good for debugging bad
+code), or even fixup the access by software like for kernel code.  The later
+mode isn't recommended for performance reasons (just think about the
+floating point emulation that works about the same way).  Fix your code
+instead!
+
+Please note that randomly changing the behaviour without good thought is
+real bad - it changes the behaviour of all unaligned instructions in user
+space, and might cause programs to fail unexpectedly.
+
+To change the alignment trap behavior, simply echo a number into
+/proc/cpu/alignment.  The number is made up from various bits:
+
+===		========================================================
+bit		behavior when set
+===		========================================================
+0		A user process performing an unaligned memory access
+		will cause the kernel to print a message indicating
+		process name, pid, pc, instruction, address, and the
+		fault code.
+
+1		The kernel will attempt to fix up the user process
+		performing the unaligned access.  This is of course
+		slow (think about the floating point emulator) and
+		not recommended for production use.
+
+2		The kernel will send a SIGBUS signal to the user process
+		performing the unaligned access.
+===		========================================================
+
+Note that not all combinations are supported - only values 0 through 5.
+(6 and 7 don't make sense).
+
+For example, the following will turn on the warnings, but without
+fixing up or sending SIGBUS signals::
+
+	echo 1 > /proc/cpu/alignment
+
+You can also read the content of the same file to get statistical
+information on unaligned access occurrences plus the current mode of
+operation for user space code.
+
+
+Nicolas Pitre, Mar 13, 2001.  Modified Russell King, Nov 30, 2001.
diff --git a/Documentation/arm/memory.rst b/Documentation/arm/memory.rst
new file mode 100644
index 000000000000..0521b4ce5c96
--- /dev/null
+++ b/Documentation/arm/memory.rst
@@ -0,0 +1,93 @@
+=================================
+Kernel Memory Layout on ARM Linux
+=================================
+
+		Russell King <rmk@arm.linux.org.uk>
+
+		     November 17, 2005 (2.6.15)
+
+This document describes the virtual memory layout which the Linux
+kernel uses for ARM processors.  It indicates which regions are
+free for platforms to use, and which are used by generic code.
+
+The ARM CPU is capable of addressing a maximum of 4GB virtual memory
+space, and this must be shared between user space processes, the
+kernel, and hardware devices.
+
+As the ARM architecture matures, it becomes necessary to reserve
+certain regions of VM space for use for new facilities; therefore
+this document may reserve more VM space over time.
+
+=============== =============== ===============================================
+Start		End		Use
+=============== =============== ===============================================
+ffff8000	ffffffff	copy_user_page / clear_user_page use.
+				For SA11xx and Xscale, this is used to
+				setup a minicache mapping.
+
+ffff4000	ffffffff	cache aliasing on ARMv6 and later CPUs.
+
+ffff1000	ffff7fff	Reserved.
+				Platforms must not use this address range.
+
+ffff0000	ffff0fff	CPU vector page.
+				The CPU vectors are mapped here if the
+				CPU supports vector relocation (control
+				register V bit.)
+
+fffe0000	fffeffff	XScale cache flush area.  This is used
+				in proc-xscale.S to flush the whole data
+				cache. (XScale does not have TCM.)
+
+fffe8000	fffeffff	DTCM mapping area for platforms with
+				DTCM mounted inside the CPU.
+
+fffe0000	fffe7fff	ITCM mapping area for platforms with
+				ITCM mounted inside the CPU.
+
+ffc00000	ffefffff	Fixmap mapping region.  Addresses provided
+				by fix_to_virt() will be located here.
+
+fee00000	feffffff	Mapping of PCI I/O space. This is a static
+				mapping within the vmalloc space.
+
+VMALLOC_START	VMALLOC_END-1	vmalloc() / ioremap() space.
+				Memory returned by vmalloc/ioremap will
+				be dynamically placed in this region.
+				Machine specific static mappings are also
+				located here through iotable_init().
+				VMALLOC_START is based upon the value
+				of the high_memory variable, and VMALLOC_END
+				is equal to 0xff800000.
+
+PAGE_OFFSET	high_memory-1	Kernel direct-mapped RAM region.
+				This maps the platforms RAM, and typically
+				maps all platform RAM in a 1:1 relationship.
+
+PKMAP_BASE	PAGE_OFFSET-1	Permanent kernel mappings
+				One way of mapping HIGHMEM pages into kernel
+				space.
+
+MODULES_VADDR	MODULES_END-1	Kernel module space
+				Kernel modules inserted via insmod are
+				placed here using dynamic mappings.
+
+00001000	TASK_SIZE-1	User space mappings
+				Per-thread mappings are placed here via
+				the mmap() system call.
+
+00000000	00000fff	CPU vector page / null pointer trap
+				CPUs which do not support vector remapping
+				place their vector page here.  NULL pointer
+				dereferences by both the kernel and user
+				space are also caught via this mapping.
+=============== =============== ===============================================
+
+Please note that mappings which collide with the above areas may result
+in a non-bootable kernel, or may cause the kernel to (eventually) panic
+at run time.
+
+Since future CPUs may impact the kernel mapping layout, user programs
+must not access any memory which is not mapped inside their 0x0001000
+to TASK_SIZE address range.  If they wish to access these areas, they
+must set up their own mappings using open() and mmap().
diff --git a/Documentation/arm/memory.txt b/Documentation/arm/memory.txt
deleted file mode 100644
index 546a39048eb0..000000000000
--- a/Documentation/arm/memory.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-		Kernel Memory Layout on ARM Linux
-
-		Russell King <rmk@arm.linux.org.uk>
-		     November 17, 2005 (2.6.15)
-
-This document describes the virtual memory layout which the Linux
-kernel uses for ARM processors.  It indicates which regions are
-free for platforms to use, and which are used by generic code.
-
-The ARM CPU is capable of addressing a maximum of 4GB virtual memory
-space, and this must be shared between user space processes, the
-kernel, and hardware devices.
-
-As the ARM architecture matures, it becomes necessary to reserve
-certain regions of VM space for use for new facilities; therefore
-this document may reserve more VM space over time.
-
-Start		End		Use
---------------------------------------------------------------------------
-ffff8000	ffffffff	copy_user_page / clear_user_page use.
-				For SA11xx and Xscale, this is used to
-				setup a minicache mapping.
-
-ffff4000	ffffffff	cache aliasing on ARMv6 and later CPUs.
-
-ffff1000	ffff7fff	Reserved.
-				Platforms must not use this address range.
-
-ffff0000	ffff0fff	CPU vector page.
-				The CPU vectors are mapped here if the
-				CPU supports vector relocation (control
-				register V bit.)
-
-fffe0000	fffeffff	XScale cache flush area.  This is used
-				in proc-xscale.S to flush the whole data
-				cache. (XScale does not have TCM.)
-
-fffe8000	fffeffff	DTCM mapping area for platforms with
-				DTCM mounted inside the CPU.
-
-fffe0000	fffe7fff	ITCM mapping area for platforms with
-				ITCM mounted inside the CPU.
-
-ffc00000	ffefffff	Fixmap mapping region.  Addresses provided
-				by fix_to_virt() will be located here.
-
-fee00000	feffffff	Mapping of PCI I/O space. This is a static
-				mapping within the vmalloc space.
-
-VMALLOC_START	VMALLOC_END-1	vmalloc() / ioremap() space.
-				Memory returned by vmalloc/ioremap will
-				be dynamically placed in this region.
-				Machine specific static mappings are also
-				located here through iotable_init().
-				VMALLOC_START is based upon the value
-				of the high_memory variable, and VMALLOC_END
-				is equal to 0xff800000.
-
-PAGE_OFFSET	high_memory-1	Kernel direct-mapped RAM region.
-				This maps the platforms RAM, and typically
-				maps all platform RAM in a 1:1 relationship.
-
-PKMAP_BASE	PAGE_OFFSET-1	Permanent kernel mappings
-				One way of mapping HIGHMEM pages into kernel
-				space.
-
-MODULES_VADDR	MODULES_END-1	Kernel module space
-				Kernel modules inserted via insmod are
-				placed here using dynamic mappings.
-
-00001000	TASK_SIZE-1	User space mappings
-				Per-thread mappings are placed here via
-				the mmap() system call.
-
-00000000	00000fff	CPU vector page / null pointer trap
-				CPUs which do not support vector remapping
-				place their vector page here.  NULL pointer
-				dereferences by both the kernel and user
-				space are also caught via this mapping.
-
-Please note that mappings which collide with the above areas may result
-in a non-bootable kernel, or may cause the kernel to (eventually) panic
-at run time.
-
-Since future CPUs may impact the kernel mapping layout, user programs
-must not access any memory which is not mapped inside their 0x0001000
-to TASK_SIZE address range.  If they wish to access these areas, they
-must set up their own mappings using open() and mmap().
diff --git a/Documentation/arm/microchip.rst b/Documentation/arm/microchip.rst
new file mode 100644
index 000000000000..c9a44c98e868
--- /dev/null
+++ b/Documentation/arm/microchip.rst
@@ -0,0 +1,204 @@
+=============================
+ARM Microchip SoCs (aka AT91)
+=============================
+
+
+Introduction
+------------
+This document gives useful information about the ARM Microchip SoCs that are
+currently supported in Linux Mainline (you know, the one on kernel.org).
+
+It is important to note that the Microchip (previously Atmel) ARM-based MPU
+product line is historically named "AT91" or "at91" throughout the Linux kernel
+development process even if this product prefix has completely disappeared from
+the official Microchip product name. Anyway, files, directories, git trees,
+git branches/tags and email subject always contain this "at91" sub-string.
+
+
+AT91 SoCs
+---------
+Documentation and detailed datasheet for each product are available on
+the Microchip website: http://www.microchip.com.
+
+  Flavors:
+    * ARM 920 based SoC
+      - at91rm9200
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-1768-32-bit-ARM920T-Embedded-Microprocessor-AT91RM9200_Datasheet.pdf
+
+    * ARM 926 based SoCs
+      - at91sam9260
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6221-32-bit-ARM926EJ-S-Embedded-Microprocessor-SAM9260_Datasheet.pdf
+
+      - at91sam9xe
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6254-32-bit-ARM926EJ-S-Embedded-Microprocessor-SAM9XE_Datasheet.pdf
+
+      - at91sam9261
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6062-ARM926EJ-S-Microprocessor-SAM9261_Datasheet.pdf
+
+      - at91sam9263
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6249-32-bit-ARM926EJ-S-Embedded-Microprocessor-SAM9263_Datasheet.pdf
+
+      - at91sam9rl
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/doc6289.pdf
+
+      - at91sam9g20
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001516A.pdf
+
+      - at91sam9g45 family
+        - at91sam9g45
+        - at91sam9g46
+        - at91sam9m10
+        - at91sam9m11 (device superset)
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6437-32-bit-ARM926-Embedded-Microprocessor-SAM9M11_Datasheet.pdf
+
+      - at91sam9x5 family (aka "The 5 series")
+        - at91sam9g15
+        - at91sam9g25
+        - at91sam9g35
+        - at91sam9x25
+        - at91sam9x35
+
+          * Datasheet (can be considered as covering the whole family)
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-11055-32-bit-ARM926EJ-S-Microcontroller-SAM9X35_Datasheet.pdf
+
+      - at91sam9n12
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001517A.pdf
+
+    * ARM Cortex-A5 based SoCs
+      - sama5d3 family
+
+        - sama5d31
+        - sama5d33
+        - sama5d34
+        - sama5d35
+        - sama5d36 (device superset)
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-11121-32-bit-Cortex-A5-Microcontroller-SAMA5D3_Datasheet.pdf
+
+    * ARM Cortex-A5 + NEON based SoCs
+      - sama5d4 family
+
+        - sama5d41
+        - sama5d42
+        - sama5d43
+        - sama5d44 (device superset)
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/60001525A.pdf
+
+      - sama5d2 family
+
+        - sama5d21
+        - sama5d22
+        - sama5d23
+        - sama5d24
+        - sama5d26
+        - sama5d27 (device superset)
+        - sama5d28 (device superset + environmental monitors)
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001476B.pdf
+
+    * ARM Cortex-M7 MCUs
+      - sams70 family
+
+        - sams70j19
+        - sams70j20
+        - sams70j21
+        - sams70n19
+        - sams70n20
+        - sams70n21
+        - sams70q19
+        - sams70q20
+        - sams70q21
+
+      - samv70 family
+
+        - samv70j19
+        - samv70j20
+        - samv70n19
+        - samv70n20
+        - samv70q19
+        - samv70q20
+
+      - samv71 family
+
+        - samv71j19
+        - samv71j20
+        - samv71j21
+        - samv71n19
+        - samv71n20
+        - samv71n21
+        - samv71q19
+        - samv71q20
+        - samv71q21
+
+          * Datasheet
+
+          http://ww1.microchip.com/downloads/en/DeviceDoc/60001527A.pdf
+
+
+Linux kernel information
+------------------------
+Linux kernel mach directory: arch/arm/mach-at91
+MAINTAINERS entry is: "ARM/Microchip (AT91) SoC support"
+
+
+Device Tree for AT91 SoCs and boards
+------------------------------------
+All AT91 SoCs are converted to Device Tree. Since Linux 3.19, these products
+must use this method to boot the Linux kernel.
+
+Work In Progress statement:
+Device Tree files and Device Tree bindings that apply to AT91 SoCs and boards are
+considered as "Unstable". To be completely clear, any at91 binding can change at
+any time. So, be sure to use a Device Tree Binary and a Kernel Image generated from
+the same source tree.
+Please refer to the Documentation/devicetree/bindings/ABI.txt file for a
+definition of a "Stable" binding/ABI.
+This statement will be removed by AT91 MAINTAINERS when appropriate.
+
+Naming conventions and best practice:
+
+- SoCs Device Tree Source Include files are named after the official name of
+  the product (at91sam9g20.dtsi or sama5d33.dtsi for instance).
+- Device Tree Source Include files (.dtsi) are used to collect common nodes that can be
+  shared across SoCs or boards (sama5d3.dtsi or at91sam9x5cm.dtsi for instance).
+  When collecting nodes for a particular peripheral or topic, the identifier have to
+  be placed at the end of the file name, separated with a "_" (at91sam9x5_can.dtsi
+  or sama5d3_gmac.dtsi for example).
+- board Device Tree Source files (.dts) are prefixed by the string "at91-" so
+  that they can be identified easily. Note that some files are historical exceptions
+  to this rule (sama5d3[13456]ek.dts, usb_a9g20.dts or animeo_ip.dts for example).
diff --git a/Documentation/arm/netwinder.rst b/Documentation/arm/netwinder.rst
new file mode 100644
index 000000000000..8eab66caa2ac
--- /dev/null
+++ b/Documentation/arm/netwinder.rst
@@ -0,0 +1,85 @@
+================================
+NetWinder specific documentation
+================================
+
+The NetWinder is a small low-power computer, primarily designed
+to run Linux.  It is based around the StrongARM RISC processor,
+DC21285 PCI bridge, with PC-type hardware glued around it.
+
+Port usage
+==========
+
+=======  ====== ===============================
+Min      Max	Description
+=======  ====== ===============================
+0x0000   0x000f	DMA1
+0x0020   0x0021	PIC1
+0x0060   0x006f	Keyboard
+0x0070   0x007f	RTC
+0x0080   0x0087	DMA1
+0x0088   0x008f	DMA2
+0x00a0   0x00a3	PIC2
+0x00c0   0x00df	DMA2
+0x0180   0x0187	IRDA
+0x01f0   0x01f6	ide0
+0x0201		Game port
+0x0203		RWA010 configuration read
+0x0220   ?	SoundBlaster
+0x0250   ?	WaveArtist
+0x0279		RWA010 configuration index
+0x02f8   0x02ff	Serial ttyS1
+0x0300   0x031f	Ether10
+0x0338		GPIO1
+0x033a		GPIO2
+0x0370   0x0371	W83977F configuration registers
+0x0388   ?	AdLib
+0x03c0   0x03df	VGA
+0x03f6		ide0
+0x03f8   0x03ff	Serial ttyS0
+0x0400   0x0408	DC21143
+0x0480   0x0487	DMA1
+0x0488   0x048f	DMA2
+0x0a79		RWA010 configuration write
+0xe800   0xe80f	ide0/ide1 BM DMA
+=======  ====== ===============================
+
+
+Interrupt usage
+===============
+
+======= ======= ========================
+IRQ	type	Description
+======= ======= ========================
+ 0	ISA	100Hz timer
+ 1	ISA	Keyboard
+ 2	ISA	cascade
+ 3	ISA	Serial ttyS1
+ 4	ISA	Serial ttyS0
+ 5	ISA	PS/2 mouse
+ 6	ISA	IRDA
+ 7	ISA	Printer
+ 8	ISA	RTC alarm
+ 9	ISA
+10	ISA	GP10 (Orange reset button)
+11	ISA
+12	ISA	WaveArtist
+13	ISA
+14	ISA	hda1
+15	ISA
+======= ======= ========================
+
+DMA usage
+=========
+
+======= ======= ===========
+DMA	type	Description
+======= ======= ===========
+ 0	ISA	IRDA
+ 1	ISA
+ 2	ISA	cascade
+ 3	ISA	WaveArtist
+ 4	ISA
+ 5	ISA
+ 6	ISA
+ 7	ISA	WaveArtist
+======= ======= ===========
diff --git a/Documentation/arm/nwfpe/NOTES b/Documentation/arm/nwfpe/NOTES
deleted file mode 100644
index 40577b5a49d3..000000000000
--- a/Documentation/arm/nwfpe/NOTES
+++ /dev/null
@@ -1,29 +0,0 @@
-There seems to be a problem with exp(double) and our emulator.  I haven't
-been able to track it down yet.  This does not occur with the emulator
-supplied by Russell King.
-
-I also found one oddity in the emulator.  I don't think it is serious but
-will point it out.  The ARM calling conventions require floating point
-registers f4-f7 to be preserved over a function call.  The compiler quite
-often uses an stfe instruction to save f4 on the stack upon entry to a
-function, and an ldfe instruction to restore it before returning.
-
-I was looking at some code, that calculated a double result, stored it in f4
-then made a function call. Upon return from the function call the number in
-f4 had been converted to an extended value in the emulator.
-
-This is a side effect of the stfe instruction.  The double in f4 had to be
-converted to extended, then stored.  If an lfm/sfm combination had been used,
-then no conversion would occur.  This has performance considerations.  The
-result from the function call and f4 were used in a multiplication.  If the
-emulator sees a multiply of a double and extended, it promotes the double to
-extended, then does the multiply in extended precision.
-
-This code will cause this problem:
-
-double x, y, z;
-z = log(x)/log(y);
-
-The result of log(x) (a double) will be calculated, returned in f0, then
-moved to f4 to preserve it over the log(y) call.  The division will be done
-in extended precision, due to the stfe instruction used to save f4 in log(y).
diff --git a/Documentation/arm/nwfpe/README b/Documentation/arm/nwfpe/README
deleted file mode 100644
index 771871de0c8b..000000000000
--- a/Documentation/arm/nwfpe/README
+++ /dev/null
@@ -1,70 +0,0 @@
-This directory contains the version 0.92 test release of the NetWinder 
-Floating Point Emulator.
-
-The majority of the code was written by me, Scott Bambrough It is
-written in C, with a small number of routines in inline assembler
-where required.  It was written quickly, with a goal of implementing a
-working version of all the floating point instructions the compiler
-emits as the first target.  I have attempted to be as optimal as
-possible, but there remains much room for improvement.
-
-I have attempted to make the emulator as portable as possible.  One of
-the problems is with leading underscores on kernel symbols.  Elf
-kernels have no leading underscores, a.out compiled kernels do.  I
-have attempted to use the C_SYMBOL_NAME macro wherever this may be
-important.
-
-Another choice I made was in the file structure.  I have attempted to
-contain all operating system specific code in one module (fpmodule.*).
-All the other files contain emulator specific code.  This should allow
-others to port the emulator to NetBSD for instance relatively easily.
-
-The floating point operations are based on SoftFloat Release 2, by
-John Hauser.  SoftFloat is a software implementation of floating-point
-that conforms to the IEC/IEEE Standard for Binary Floating-point
-Arithmetic.  As many as four formats are supported: single precision,
-double precision, extended double precision, and quadruple precision.
-All operations required by the standard are implemented, except for
-conversions to and from decimal.  We use only the single precision,
-double precision and extended double precision formats.  The port of
-SoftFloat to the ARM was done by Phil Blundell, based on an earlier
-port of SoftFloat version 1 by Neil Carson for NetBSD/arm32.
-
-The file README.FPE contains a description of what has been implemented
-so far in the emulator.  The file TODO contains a information on what 
-remains to be done, and other ideas for the emulator.
-
-Bug reports, comments, suggestions should be directed to me at
-<scottb@netwinder.org>.  General reports of "this program doesn't
-work correctly when your emulator is installed" are useful for
-determining that bugs still exist; but are virtually useless when
-attempting to isolate the problem.  Please report them, but don't
-expect quick action.  Bugs still exist.  The problem remains in isolating
-which instruction contains the bug.  Small programs illustrating a specific
-problem are a godsend.
-
-Legal Notices
--------------
-
-The NetWinder Floating Point Emulator is free software.  Everything Rebel.com
-has written is provided under the GNU GPL.  See the file COPYING for copying
-conditions.  Excluded from the above is the SoftFloat code.  John Hauser's 
-legal notice for SoftFloat is included below.
-
--------------------------------------------------------------------------------
-SoftFloat Legal Notice
-
-SoftFloat was written by John R. Hauser.  This work was made possible in
-part by the International Computer Science Institute, located at Suite 600,
-1947 Center Street, Berkeley, California 94704.  Funding was partially
-provided by the National Science Foundation under grant MIP-9311980.  The
-original version of this code was written as part of a project to build
-a fixed-point vector processor in collaboration with the University of
-California at Berkeley, overseen by Profs. Nelson Morgan and John Wawrzynek.
-
-THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
-has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
-TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
-PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
-AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
--------------------------------------------------------------------------------
diff --git a/Documentation/arm/nwfpe/README.FPE b/Documentation/arm/nwfpe/README.FPE
deleted file mode 100644
index 26f5d7bb9a41..000000000000
--- a/Documentation/arm/nwfpe/README.FPE
+++ /dev/null
@@ -1,156 +0,0 @@
-The following describes the current state of the NetWinder's floating point
-emulator.
-
-In the following nomenclature is used to describe the floating point
-instructions.  It follows the conventions in the ARM manual.
-
-<S|D|E> = <single|double|extended>, no default
-{P|M|Z} = {round to +infinity,round to -infinity,round to zero},
-          default = round to nearest
-
-Note: items enclosed in {} are optional.
-
-Floating Point Coprocessor Data Transfer Instructions (CPDT)
-------------------------------------------------------------
-
-LDF/STF - load and store floating
-
-<LDF|STF>{cond}<S|D|E> Fd, Rn
-<LDF|STF>{cond}<S|D|E> Fd, [Rn, #<expression>]{!}
-<LDF|STF>{cond}<S|D|E> Fd, [Rn], #<expression>
-
-These instructions are fully implemented.
-
-LFM/SFM - load and store multiple floating
-
-Form 1 syntax:
-<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn]
-<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn, #<expression>]{!}
-<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn], #<expression>
-
-Form 2 syntax:
-<LFM|SFM>{cond}<FD,EA> Fd, <count>, [Rn]{!}
-
-These instructions are fully implemented.  They store/load three words
-for each floating point register into the memory location given in the 
-instruction.  The format in memory is unlikely to be compatible with
-other implementations, in particular the actual hardware.  Specific
-mention of this is made in the ARM manuals.  
-
-Floating Point Coprocessor Register Transfer Instructions (CPRT)
-----------------------------------------------------------------
-
-Conversions, read/write status/control register instructions
-
-FLT{cond}<S,D,E>{P,M,Z} Fn, Rd          Convert integer to floating point
-FIX{cond}{P,M,Z} Rd, Fn                 Convert floating point to integer
-WFS{cond} Rd                            Write floating point status register
-RFS{cond} Rd                            Read floating point status register
-WFC{cond} Rd                            Write floating point control register
-RFC{cond} Rd                            Read floating point control register
-
-FLT/FIX are fully implemented.
-
-RFS/WFS are fully implemented.
-
-RFC/WFC are fully implemented.  RFC/WFC are supervisor only instructions, and
-presently check the CPU mode, and do an invalid instruction trap if not called
-from supervisor mode.
-
-Compare instructions
-
-CMF{cond} Fn, Fm        Compare floating
-CMFE{cond} Fn, Fm       Compare floating with exception
-CNF{cond} Fn, Fm        Compare negated floating
-CNFE{cond} Fn, Fm       Compare negated floating with exception
-
-These are fully implemented.
-
-Floating Point Coprocessor Data Instructions (CPDT)
----------------------------------------------------
-
-Dyadic operations:
-
-ADF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - add
-SUF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - subtract
-RSF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse subtract
-MUF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - multiply
-DVF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - divide
-RDV{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse divide
-
-These are fully implemented.
-
-FML{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast multiply
-FDV{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast divide
-FRD{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast reverse divide
-
-These are fully implemented as well.  They use the same algorithm as the
-non-fast versions.  Hence, in this implementation their performance is
-equivalent to the MUF/DVF/RDV instructions.  This is acceptable according
-to the ARM manual.  The manual notes these are defined only for single
-operands, on the actual FPA11 hardware they do not work for double or
-extended precision operands.  The emulator currently does not check
-the requested permissions conditions, and performs the requested operation.
-
-RMF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - IEEE remainder
-
-This is fully implemented.
-
-Monadic operations:
-
-MVF{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - move
-MNF{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - move negated
-
-These are fully implemented.
-
-ABS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - absolute value
-SQT{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - square root
-RND{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - round
-
-These are fully implemented.
-
-URD{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - unnormalized round
-NRM{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - normalize
-
-These are implemented.  URD is implemented using the same code as the RND
-instruction.  Since URD cannot return a unnormalized number, NRM becomes
-a NOP.
-
-Library calls:
-
-POW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - power
-RPW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse power
-POL{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - polar angle (arctan2)
-
-LOG{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base 10
-LGN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base e 
-EXP{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - exponent
-SIN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - sine
-COS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - cosine
-TAN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - tangent
-ASN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arcsine
-ACS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arccosine
-ATN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arctangent
-
-These are not implemented.  They are not currently issued by the compiler,
-and are handled by routines in libc.  These are not implemented by the FPA11
-hardware, but are handled by the floating point support code.  They should 
-be implemented in future versions.
-
-Signalling:
-
-Signals are implemented.  However current ELF kernels produced by Rebel.com
-have a bug in them that prevents the module from generating a SIGFPE.  This
-is caused by a failure to alias fp_current to the kernel variable
-current_set[0] correctly.
-
-The kernel provided with this distribution (vmlinux-nwfpe-0.93) contains
-a fix for this problem and also incorporates the current version of the
-emulator directly.  It is possible to run with no floating point module
-loaded with this kernel.  It is provided as a demonstration of the 
-technology and for those who want to do floating point work that depends
-on signals.  It is not strictly necessary to use the module.
-
-A module (either the one provided by Russell King, or the one in this 
-distribution) can be loaded to replace the functionality of the emulator
-built into the kernel.
diff --git a/Documentation/arm/nwfpe/TODO b/Documentation/arm/nwfpe/TODO
deleted file mode 100644
index 8027061b60eb..000000000000
--- a/Documentation/arm/nwfpe/TODO
+++ /dev/null
@@ -1,67 +0,0 @@
-TODO LIST
----------
-
-POW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - power
-RPW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse power
-POL{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - polar angle (arctan2)
-
-LOG{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base 10
-LGN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base e 
-EXP{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - exponent
-SIN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - sine
-COS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - cosine
-TAN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - tangent
-ASN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arcsine
-ACS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arccosine
-ATN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arctangent
-
-These are not implemented.  They are not currently issued by the compiler,
-and are handled by routines in libc.  These are not implemented by the FPA11
-hardware, but are handled by the floating point support code.  They should 
-be implemented in future versions.
-
-There are a couple of ways to approach the implementation of these.  One
-method would be to use accurate table methods for these routines.  I have 
-a couple of papers by S. Gal from IBM's research labs in Haifa, Israel that
-seem to promise extreme accuracy (in the order of 99.8%) and reasonable speed.
-These methods are used in GLIBC for some of the transcendental functions.
-
-Another approach, which I know little about is CORDIC.  This stands for
-Coordinate Rotation Digital Computer, and is a method of computing 
-transcendental functions using mostly shifts and adds and a few
-multiplications and divisions.  The ARM excels at shifts and adds,
-so such a method could be promising, but requires more research to 
-determine if it is feasible.
-
-Rounding Methods
-
-The IEEE standard defines 4 rounding modes.  Round to nearest is the
-default, but rounding to + or - infinity or round to zero are also allowed.
-Many architectures allow the rounding mode to be specified by modifying bits
-in a control register.  Not so with the ARM FPA11 architecture.  To change
-the rounding mode one must specify it with each instruction.
-
-This has made porting some benchmarks difficult.  It is possible to
-introduce such a capability into the emulator.  The FPCR contains 
-bits describing the rounding mode.  The emulator could be altered to 
-examine a flag, which if set forced it to ignore the rounding mode in
-the instruction, and use the mode specified in the bits in the FPCR.
-
-This would require a method of getting/setting the flag, and the bits
-in the FPCR.  This requires a kernel call in ArmLinux, as WFC/RFC are
-supervisor only instructions.  If anyone has any ideas or comments I
-would like to hear them.
-
-[NOTE: pulled out from some docs on ARM floating point, specifically
- for the Acorn FPE, but not limited to it:
-
- The floating point control register (FPCR) may only be present in some
- implementations: it is there to control the hardware in an implementation-
- specific manner, for example to disable the floating point system.  The user
- mode of the ARM is not permitted to use this register (since the right is
- reserved to alter it between implementations) and the WFC and RFC
- instructions will trap if tried in user mode.
-
- Hence, the answer is yes, you could do this, but then you will run a high
- risk of becoming isolated if and when hardware FP emulation comes out
-		-- Russell].
diff --git a/Documentation/arm/nwfpe/index.rst b/Documentation/arm/nwfpe/index.rst
new file mode 100644
index 000000000000..3c4d2f9aa10e
--- /dev/null
+++ b/Documentation/arm/nwfpe/index.rst
@@ -0,0 +1,13 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================
+NetWinder's floating point emulator
+===================================
+
+.. toctree::
+   :maxdepth: 1
+
+   nwfpe
+   netwinder-fpe
+   notes
+   todo
diff --git a/Documentation/arm/nwfpe/netwinder-fpe.rst b/Documentation/arm/nwfpe/netwinder-fpe.rst
new file mode 100644
index 000000000000..cbb320960fc4
--- /dev/null
+++ b/Documentation/arm/nwfpe/netwinder-fpe.rst
@@ -0,0 +1,162 @@
+=============
+Current State
+=============
+
+The following describes the current state of the NetWinder's floating point
+emulator.
+
+In the following nomenclature is used to describe the floating point
+instructions.  It follows the conventions in the ARM manual.
+
+::
+
+  <S|D|E> = <single|double|extended>, no default
+  {P|M|Z} = {round to +infinity,round to -infinity,round to zero},
+            default = round to nearest
+
+Note: items enclosed in {} are optional.
+
+Floating Point Coprocessor Data Transfer Instructions (CPDT)
+------------------------------------------------------------
+
+LDF/STF - load and store floating
+
+<LDF|STF>{cond}<S|D|E> Fd, Rn
+<LDF|STF>{cond}<S|D|E> Fd, [Rn, #<expression>]{!}
+<LDF|STF>{cond}<S|D|E> Fd, [Rn], #<expression>
+
+These instructions are fully implemented.
+
+LFM/SFM - load and store multiple floating
+
+Form 1 syntax:
+<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn]
+<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn, #<expression>]{!}
+<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn], #<expression>
+
+Form 2 syntax:
+<LFM|SFM>{cond}<FD,EA> Fd, <count>, [Rn]{!}
+
+These instructions are fully implemented.  They store/load three words
+for each floating point register into the memory location given in the
+instruction.  The format in memory is unlikely to be compatible with
+other implementations, in particular the actual hardware.  Specific
+mention of this is made in the ARM manuals.
+
+Floating Point Coprocessor Register Transfer Instructions (CPRT)
+----------------------------------------------------------------
+
+Conversions, read/write status/control register instructions
+
+FLT{cond}<S,D,E>{P,M,Z} Fn, Rd          Convert integer to floating point
+FIX{cond}{P,M,Z} Rd, Fn                 Convert floating point to integer
+WFS{cond} Rd                            Write floating point status register
+RFS{cond} Rd                            Read floating point status register
+WFC{cond} Rd                            Write floating point control register
+RFC{cond} Rd                            Read floating point control register
+
+FLT/FIX are fully implemented.
+
+RFS/WFS are fully implemented.
+
+RFC/WFC are fully implemented.  RFC/WFC are supervisor only instructions, and
+presently check the CPU mode, and do an invalid instruction trap if not called
+from supervisor mode.
+
+Compare instructions
+
+CMF{cond} Fn, Fm        Compare floating
+CMFE{cond} Fn, Fm       Compare floating with exception
+CNF{cond} Fn, Fm        Compare negated floating
+CNFE{cond} Fn, Fm       Compare negated floating with exception
+
+These are fully implemented.
+
+Floating Point Coprocessor Data Instructions (CPDT)
+---------------------------------------------------
+
+Dyadic operations:
+
+ADF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - add
+SUF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - subtract
+RSF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse subtract
+MUF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - multiply
+DVF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - divide
+RDV{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse divide
+
+These are fully implemented.
+
+FML{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast multiply
+FDV{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast divide
+FRD{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast reverse divide
+
+These are fully implemented as well.  They use the same algorithm as the
+non-fast versions.  Hence, in this implementation their performance is
+equivalent to the MUF/DVF/RDV instructions.  This is acceptable according
+to the ARM manual.  The manual notes these are defined only for single
+operands, on the actual FPA11 hardware they do not work for double or
+extended precision operands.  The emulator currently does not check
+the requested permissions conditions, and performs the requested operation.
+
+RMF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - IEEE remainder
+
+This is fully implemented.
+
+Monadic operations:
+
+MVF{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - move
+MNF{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - move negated
+
+These are fully implemented.
+
+ABS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - absolute value
+SQT{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - square root
+RND{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - round
+
+These are fully implemented.
+
+URD{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - unnormalized round
+NRM{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - normalize
+
+These are implemented.  URD is implemented using the same code as the RND
+instruction.  Since URD cannot return a unnormalized number, NRM becomes
+a NOP.
+
+Library calls:
+
+POW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - power
+RPW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse power
+POL{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - polar angle (arctan2)
+
+LOG{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base 10
+LGN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base e
+EXP{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - exponent
+SIN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - sine
+COS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - cosine
+TAN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - tangent
+ASN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arcsine
+ACS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arccosine
+ATN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arctangent
+
+These are not implemented.  They are not currently issued by the compiler,
+and are handled by routines in libc.  These are not implemented by the FPA11
+hardware, but are handled by the floating point support code.  They should
+be implemented in future versions.
+
+Signalling:
+
+Signals are implemented.  However current ELF kernels produced by Rebel.com
+have a bug in them that prevents the module from generating a SIGFPE.  This
+is caused by a failure to alias fp_current to the kernel variable
+current_set[0] correctly.
+
+The kernel provided with this distribution (vmlinux-nwfpe-0.93) contains
+a fix for this problem and also incorporates the current version of the
+emulator directly.  It is possible to run with no floating point module
+loaded with this kernel.  It is provided as a demonstration of the
+technology and for those who want to do floating point work that depends
+on signals.  It is not strictly necessary to use the module.
+
+A module (either the one provided by Russell King, or the one in this
+distribution) can be loaded to replace the functionality of the emulator
+built into the kernel.
diff --git a/Documentation/arm/nwfpe/notes.rst b/Documentation/arm/nwfpe/notes.rst
new file mode 100644
index 000000000000..102e55af8439
--- /dev/null
+++ b/Documentation/arm/nwfpe/notes.rst
@@ -0,0 +1,32 @@
+Notes
+=====
+
+There seems to be a problem with exp(double) and our emulator.  I haven't
+been able to track it down yet.  This does not occur with the emulator
+supplied by Russell King.
+
+I also found one oddity in the emulator.  I don't think it is serious but
+will point it out.  The ARM calling conventions require floating point
+registers f4-f7 to be preserved over a function call.  The compiler quite
+often uses an stfe instruction to save f4 on the stack upon entry to a
+function, and an ldfe instruction to restore it before returning.
+
+I was looking at some code, that calculated a double result, stored it in f4
+then made a function call. Upon return from the function call the number in
+f4 had been converted to an extended value in the emulator.
+
+This is a side effect of the stfe instruction.  The double in f4 had to be
+converted to extended, then stored.  If an lfm/sfm combination had been used,
+then no conversion would occur.  This has performance considerations.  The
+result from the function call and f4 were used in a multiplication.  If the
+emulator sees a multiply of a double and extended, it promotes the double to
+extended, then does the multiply in extended precision.
+
+This code will cause this problem:
+
+double x, y, z;
+z = log(x)/log(y);
+
+The result of log(x) (a double) will be calculated, returned in f0, then
+moved to f4 to preserve it over the log(y) call.  The division will be done
+in extended precision, due to the stfe instruction used to save f4 in log(y).
diff --git a/Documentation/arm/nwfpe/nwfpe.rst b/Documentation/arm/nwfpe/nwfpe.rst
new file mode 100644
index 000000000000..35cd90dacbff
--- /dev/null
+++ b/Documentation/arm/nwfpe/nwfpe.rst
@@ -0,0 +1,74 @@
+Introduction
+============
+
+This directory contains the version 0.92 test release of the NetWinder
+Floating Point Emulator.
+
+The majority of the code was written by me, Scott Bambrough It is
+written in C, with a small number of routines in inline assembler
+where required.  It was written quickly, with a goal of implementing a
+working version of all the floating point instructions the compiler
+emits as the first target.  I have attempted to be as optimal as
+possible, but there remains much room for improvement.
+
+I have attempted to make the emulator as portable as possible.  One of
+the problems is with leading underscores on kernel symbols.  Elf
+kernels have no leading underscores, a.out compiled kernels do.  I
+have attempted to use the C_SYMBOL_NAME macro wherever this may be
+important.
+
+Another choice I made was in the file structure.  I have attempted to
+contain all operating system specific code in one module (fpmodule.*).
+All the other files contain emulator specific code.  This should allow
+others to port the emulator to NetBSD for instance relatively easily.
+
+The floating point operations are based on SoftFloat Release 2, by
+John Hauser.  SoftFloat is a software implementation of floating-point
+that conforms to the IEC/IEEE Standard for Binary Floating-point
+Arithmetic.  As many as four formats are supported: single precision,
+double precision, extended double precision, and quadruple precision.
+All operations required by the standard are implemented, except for
+conversions to and from decimal.  We use only the single precision,
+double precision and extended double precision formats.  The port of
+SoftFloat to the ARM was done by Phil Blundell, based on an earlier
+port of SoftFloat version 1 by Neil Carson for NetBSD/arm32.
+
+The file README.FPE contains a description of what has been implemented
+so far in the emulator.  The file TODO contains a information on what
+remains to be done, and other ideas for the emulator.
+
+Bug reports, comments, suggestions should be directed to me at
+<scottb@netwinder.org>.  General reports of "this program doesn't
+work correctly when your emulator is installed" are useful for
+determining that bugs still exist; but are virtually useless when
+attempting to isolate the problem.  Please report them, but don't
+expect quick action.  Bugs still exist.  The problem remains in isolating
+which instruction contains the bug.  Small programs illustrating a specific
+problem are a godsend.
+
+Legal Notices
+-------------
+
+The NetWinder Floating Point Emulator is free software.  Everything Rebel.com
+has written is provided under the GNU GPL.  See the file COPYING for copying
+conditions.  Excluded from the above is the SoftFloat code.  John Hauser's
+legal notice for SoftFloat is included below.
+
+-------------------------------------------------------------------------------
+
+SoftFloat Legal Notice
+
+SoftFloat was written by John R. Hauser.  This work was made possible in
+part by the International Computer Science Institute, located at Suite 600,
+1947 Center Street, Berkeley, California 94704.  Funding was partially
+provided by the National Science Foundation under grant MIP-9311980.  The
+original version of this code was written as part of a project to build
+a fixed-point vector processor in collaboration with the University of
+California at Berkeley, overseen by Profs. Nelson Morgan and John Wawrzynek.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
+has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
+TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
+PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
+AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
+-------------------------------------------------------------------------------
diff --git a/Documentation/arm/nwfpe/todo.rst b/Documentation/arm/nwfpe/todo.rst
new file mode 100644
index 000000000000..393f11b14540
--- /dev/null
+++ b/Documentation/arm/nwfpe/todo.rst
@@ -0,0 +1,72 @@
+TODO LIST
+=========
+
+::
+
+  POW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - power
+  RPW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse power
+  POL{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - polar angle (arctan2)
+
+  LOG{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base 10
+  LGN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base e
+  EXP{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - exponent
+  SIN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - sine
+  COS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - cosine
+  TAN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - tangent
+  ASN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arcsine
+  ACS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arccosine
+  ATN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arctangent
+
+These are not implemented.  They are not currently issued by the compiler,
+and are handled by routines in libc.  These are not implemented by the FPA11
+hardware, but are handled by the floating point support code.  They should
+be implemented in future versions.
+
+There are a couple of ways to approach the implementation of these.  One
+method would be to use accurate table methods for these routines.  I have
+a couple of papers by S. Gal from IBM's research labs in Haifa, Israel that
+seem to promise extreme accuracy (in the order of 99.8%) and reasonable speed.
+These methods are used in GLIBC for some of the transcendental functions.
+
+Another approach, which I know little about is CORDIC.  This stands for
+Coordinate Rotation Digital Computer, and is a method of computing
+transcendental functions using mostly shifts and adds and a few
+multiplications and divisions.  The ARM excels at shifts and adds,
+so such a method could be promising, but requires more research to
+determine if it is feasible.
+
+Rounding Methods
+----------------
+
+The IEEE standard defines 4 rounding modes.  Round to nearest is the
+default, but rounding to + or - infinity or round to zero are also allowed.
+Many architectures allow the rounding mode to be specified by modifying bits
+in a control register.  Not so with the ARM FPA11 architecture.  To change
+the rounding mode one must specify it with each instruction.
+
+This has made porting some benchmarks difficult.  It is possible to
+introduce such a capability into the emulator.  The FPCR contains
+bits describing the rounding mode.  The emulator could be altered to
+examine a flag, which if set forced it to ignore the rounding mode in
+the instruction, and use the mode specified in the bits in the FPCR.
+
+This would require a method of getting/setting the flag, and the bits
+in the FPCR.  This requires a kernel call in ArmLinux, as WFC/RFC are
+supervisor only instructions.  If anyone has any ideas or comments I
+would like to hear them.
+
+NOTE:
+ pulled out from some docs on ARM floating point, specifically
+ for the Acorn FPE, but not limited to it:
+
+ The floating point control register (FPCR) may only be present in some
+ implementations: it is there to control the hardware in an implementation-
+ specific manner, for example to disable the floating point system.  The user
+ mode of the ARM is not permitted to use this register (since the right is
+ reserved to alter it between implementations) and the WFC and RFC
+ instructions will trap if tried in user mode.
+
+ Hence, the answer is yes, you could do this, but then you will run a high
+ risk of becoming isolated if and when hardware FP emulation comes out
+
+		-- Russell.
diff --git a/Documentation/arm/omap/dss.rst b/Documentation/arm/omap/dss.rst
new file mode 100644
index 000000000000..a40c4d9c717a
--- /dev/null
+++ b/Documentation/arm/omap/dss.rst
@@ -0,0 +1,372 @@
+=========================
+OMAP2/3 Display Subsystem
+=========================
+
+This is an almost total rewrite of the OMAP FB driver in drivers/video/omap
+(let's call it DSS1). The main differences between DSS1 and DSS2 are DSI,
+TV-out and multiple display support, but there are lots of small improvements
+also.
+
+The DSS2 driver (omapdss module) is in arch/arm/plat-omap/dss/, and the FB,
+panel and controller drivers are in drivers/video/omap2/. DSS1 and DSS2 live
+currently side by side, you can choose which one to use.
+
+Features
+--------
+
+Working and tested features include:
+
+- MIPI DPI (parallel) output
+- MIPI DSI output in command mode
+- MIPI DBI (RFBI) output
+- SDI output
+- TV output
+- All pieces can be compiled as a module or inside kernel
+- Use DISPC to update any of the outputs
+- Use CPU to update RFBI or DSI output
+- OMAP DISPC planes
+- RGB16, RGB24 packed, RGB24 unpacked
+- YUV2, UYVY
+- Scaling
+- Adjusting DSS FCK to find a good pixel clock
+- Use DSI DPLL to create DSS FCK
+
+Tested boards include:
+- OMAP3 SDP board
+- Beagle board
+- N810
+
+omapdss driver
+--------------
+
+The DSS driver does not itself have any support for Linux framebuffer, V4L or
+such like the current ones, but it has an internal kernel API that upper level
+drivers can use.
+
+The DSS driver models OMAP's overlays, overlay managers and displays in a
+flexible way to enable non-common multi-display configuration. In addition to
+modelling the hardware overlays, omapdss supports virtual overlays and overlay
+managers. These can be used when updating a display with CPU or system DMA.
+
+omapdss driver support for audio
+--------------------------------
+There exist several display technologies and standards that support audio as
+well. Hence, it is relevant to update the DSS device driver to provide an audio
+interface that may be used by an audio driver or any other driver interested in
+the functionality.
+
+The audio_enable function is intended to prepare the relevant
+IP for playback (e.g., enabling an audio FIFO, taking in/out of reset
+some IP, enabling companion chips, etc). It is intended to be called before
+audio_start. The audio_disable function performs the reverse operation and is
+intended to be called after audio_stop.
+
+While a given DSS device driver may support audio, it is possible that for
+certain configurations audio is not supported (e.g., an HDMI display using a
+VESA video timing). The audio_supported function is intended to query whether
+the current configuration of the display supports audio.
+
+The audio_config function is intended to configure all the relevant audio
+parameters of the display. In order to make the function independent of any
+specific DSS device driver, a struct omap_dss_audio is defined. Its purpose
+is to contain all the required parameters for audio configuration. At the
+moment, such structure contains pointers to IEC-60958 channel status word
+and CEA-861 audio infoframe structures. This should be enough to support
+HDMI and DisplayPort, as both are based on CEA-861 and IEC-60958.
+
+The audio_enable/disable, audio_config and audio_supported functions could be
+implemented as functions that may sleep. Hence, they should not be called
+while holding a spinlock or a readlock.
+
+The audio_start/audio_stop function is intended to effectively start/stop audio
+playback after the configuration has taken place. These functions are designed
+to be used in an atomic context. Hence, audio_start should return quickly and be
+called only after all the needed resources for audio playback (audio FIFOs,
+DMA channels, companion chips, etc) have been enabled to begin data transfers.
+audio_stop is designed to only stop the audio transfers. The resources used
+for playback are released using audio_disable.
+
+The enum omap_dss_audio_state may be used to help the implementations of
+the interface to keep track of the audio state. The initial state is _DISABLED;
+then, the state transitions to _CONFIGURED, and then, when it is ready to
+play audio, to _ENABLED. The state _PLAYING is used when the audio is being
+rendered.
+
+
+Panel and controller drivers
+----------------------------
+
+The drivers implement panel or controller specific functionality and are not
+usually visible to users except through omapfb driver.  They register
+themselves to the DSS driver.
+
+omapfb driver
+-------------
+
+The omapfb driver implements arbitrary number of standard linux framebuffers.
+These framebuffers can be routed flexibly to any overlays, thus allowing very
+dynamic display architecture.
+
+The driver exports some omapfb specific ioctls, which are compatible with the
+ioctls in the old driver.
+
+The rest of the non standard features are exported via sysfs. Whether the final
+implementation will use sysfs, or ioctls, is still open.
+
+V4L2 drivers
+------------
+
+V4L2 is being implemented in TI.
+
+From omapdss point of view the V4L2 drivers should be similar to framebuffer
+driver.
+
+Architecture
+--------------------
+
+Some clarification what the different components do:
+
+    - Framebuffer is a memory area inside OMAP's SRAM/SDRAM that contains the
+      pixel data for the image. Framebuffer has width and height and color
+      depth.
+    - Overlay defines where the pixels are read from and where they go on the
+      screen. The overlay may be smaller than framebuffer, thus displaying only
+      part of the framebuffer. The position of the overlay may be changed if
+      the overlay is smaller than the display.
+    - Overlay manager combines the overlays in to one image and feeds them to
+      display.
+    - Display is the actual physical display device.
+
+A framebuffer can be connected to multiple overlays to show the same pixel data
+on all of the overlays. Note that in this case the overlay input sizes must be
+the same, but, in case of video overlays, the output size can be different. Any
+framebuffer can be connected to any overlay.
+
+An overlay can be connected to one overlay manager. Also DISPC overlays can be
+connected only to DISPC overlay managers, and virtual overlays can be only
+connected to virtual overlays.
+
+An overlay manager can be connected to one display. There are certain
+restrictions which kinds of displays an overlay manager can be connected:
+
+    - DISPC TV overlay manager can be only connected to TV display.
+    - Virtual overlay managers can only be connected to DBI or DSI displays.
+    - DISPC LCD overlay manager can be connected to all displays, except TV
+      display.
+
+Sysfs
+-----
+The sysfs interface is mainly used for testing. I don't think sysfs
+interface is the best for this in the final version, but I don't quite know
+what would be the best interfaces for these things.
+
+The sysfs interface is divided to two parts: DSS and FB.
+
+/sys/class/graphics/fb? directory:
+mirror		0=off, 1=on
+rotate		Rotation 0-3 for 0, 90, 180, 270 degrees
+rotate_type	0 = DMA rotation, 1 = VRFB rotation
+overlays	List of overlay numbers to which framebuffer pixels go
+phys_addr	Physical address of the framebuffer
+virt_addr	Virtual address of the framebuffer
+size		Size of the framebuffer
+
+/sys/devices/platform/omapdss/overlay? directory:
+enabled		0=off, 1=on
+input_size	width,height (ie. the framebuffer size)
+manager		Destination overlay manager name
+name
+output_size	width,height
+position	x,y
+screen_width	width
+global_alpha   	global alpha 0-255 0=transparent 255=opaque
+
+/sys/devices/platform/omapdss/manager? directory:
+display				Destination display
+name
+alpha_blending_enabled		0=off, 1=on
+trans_key_enabled		0=off, 1=on
+trans_key_type			gfx-destination, video-source
+trans_key_value			transparency color key (RGB24)
+default_color			default background color (RGB24)
+
+/sys/devices/platform/omapdss/display? directory:
+
+=============== =============================================================
+ctrl_name	Controller name
+mirror		0=off, 1=on
+update_mode	0=off, 1=auto, 2=manual
+enabled		0=off, 1=on
+name
+rotate		Rotation 0-3 for 0, 90, 180, 270 degrees
+timings		Display timings (pixclock,xres/hfp/hbp/hsw,yres/vfp/vbp/vsw)
+		When writing, two special timings are accepted for tv-out:
+		"pal" and "ntsc"
+panel_name
+tear_elim	Tearing elimination 0=off, 1=on
+output_type	Output type (video encoder only): "composite" or "svideo"
+=============== =============================================================
+
+There are also some debugfs files at <debugfs>/omapdss/ which show information
+about clocks and registers.
+
+Examples
+--------
+
+The following definitions have been made for the examples below::
+
+	ovl0=/sys/devices/platform/omapdss/overlay0
+	ovl1=/sys/devices/platform/omapdss/overlay1
+	ovl2=/sys/devices/platform/omapdss/overlay2
+
+	mgr0=/sys/devices/platform/omapdss/manager0
+	mgr1=/sys/devices/platform/omapdss/manager1
+
+	lcd=/sys/devices/platform/omapdss/display0
+	dvi=/sys/devices/platform/omapdss/display1
+	tv=/sys/devices/platform/omapdss/display2
+
+	fb0=/sys/class/graphics/fb0
+	fb1=/sys/class/graphics/fb1
+	fb2=/sys/class/graphics/fb2
+
+Default setup on OMAP3 SDP
+--------------------------
+
+Here's the default setup on OMAP3 SDP board. All planes go to LCD. DVI
+and TV-out are not in use. The columns from left to right are:
+framebuffers, overlays, overlay managers, displays. Framebuffers are
+handled by omapfb, and the rest by the DSS::
+
+	FB0 --- GFX  -\            DVI
+	FB1 --- VID1 --+- LCD ---- LCD
+	FB2 --- VID2 -/   TV ----- TV
+
+Example: Switch from LCD to DVI
+-------------------------------
+
+::
+
+	w=`cat $dvi/timings | cut -d "," -f 2 | cut -d "/" -f 1`
+	h=`cat $dvi/timings | cut -d "," -f 3 | cut -d "/" -f 1`
+
+	echo "0" > $lcd/enabled
+	echo "" > $mgr0/display
+	fbset -fb /dev/fb0 -xres $w -yres $h -vxres $w -vyres $h
+	# at this point you have to switch the dvi/lcd dip-switch from the omap board
+	echo "dvi" > $mgr0/display
+	echo "1" > $dvi/enabled
+
+After this the configuration looks like:::
+
+	FB0 --- GFX  -\         -- DVI
+	FB1 --- VID1 --+- LCD -/   LCD
+	FB2 --- VID2 -/   TV ----- TV
+
+Example: Clone GFX overlay to LCD and TV
+----------------------------------------
+
+::
+
+	w=`cat $tv/timings | cut -d "," -f 2 | cut -d "/" -f 1`
+	h=`cat $tv/timings | cut -d "," -f 3 | cut -d "/" -f 1`
+
+	echo "0" > $ovl0/enabled
+	echo "0" > $ovl1/enabled
+
+	echo "" > $fb1/overlays
+	echo "0,1" > $fb0/overlays
+
+	echo "$w,$h" > $ovl1/output_size
+	echo "tv" > $ovl1/manager
+
+	echo "1" > $ovl0/enabled
+	echo "1" > $ovl1/enabled
+
+	echo "1" > $tv/enabled
+
+After this the configuration looks like (only relevant parts shown)::
+
+	FB0 +-- GFX  ---- LCD ---- LCD
+	\- VID1 ---- TV  ---- TV
+
+Misc notes
+----------
+
+OMAP FB allocates the framebuffer memory using the standard dma allocator. You
+can enable Contiguous Memory Allocator (CONFIG_CMA) to improve the dma
+allocator, and if CMA is enabled, you use "cma=" kernel parameter to increase
+the global memory area for CMA.
+
+Using DSI DPLL to generate pixel clock it is possible produce the pixel clock
+of 86.5MHz (max possible), and with that you get 1280x1024@57 output from DVI.
+
+Rotation and mirroring currently only supports RGB565 and RGB8888 modes. VRFB
+does not support mirroring.
+
+VRFB rotation requires much more memory than non-rotated framebuffer, so you
+probably need to increase your vram setting before using VRFB rotation. Also,
+many applications may not work with VRFB if they do not pay attention to all
+framebuffer parameters.
+
+Kernel boot arguments
+---------------------
+
+omapfb.mode=<display>:<mode>[,...]
+	- Default video mode for specified displays. For example,
+	  "dvi:800x400MR-24@60".  See drivers/video/modedb.c.
+	  There are also two special modes: "pal" and "ntsc" that
+	  can be used to tv out.
+
+omapfb.vram=<fbnum>:<size>[@<physaddr>][,...]
+	- VRAM allocated for a framebuffer. Normally omapfb allocates vram
+	  depending on the display size. With this you can manually allocate
+	  more or define the physical address of each framebuffer. For example,
+	  "1:4M" to allocate 4M for fb1.
+
+omapfb.debug=<y|n>
+	- Enable debug printing. You have to have OMAPFB debug support enabled
+	  in kernel config.
+
+omapfb.test=<y|n>
+	- Draw test pattern to framebuffer whenever framebuffer settings change.
+	  You need to have OMAPFB debug support enabled in kernel config.
+
+omapfb.vrfb=<y|n>
+	- Use VRFB rotation for all framebuffers.
+
+omapfb.rotate=<angle>
+	- Default rotation applied to all framebuffers.
+	  0 - 0 degree rotation
+	  1 - 90 degree rotation
+	  2 - 180 degree rotation
+	  3 - 270 degree rotation
+
+omapfb.mirror=<y|n>
+	- Default mirror for all framebuffers. Only works with DMA rotation.
+
+omapdss.def_disp=<display>
+	- Name of default display, to which all overlays will be connected.
+	  Common examples are "lcd" or "tv".
+
+omapdss.debug=<y|n>
+	- Enable debug printing. You have to have DSS debug support enabled in
+	  kernel config.
+
+TODO
+----
+
+DSS locking
+
+Error checking
+
+- Lots of checks are missing or implemented just as BUG()
+
+System DMA update for DSI
+
+- Can be used for RGB16 and RGB24P modes. Probably not for RGB24U (how
+  to skip the empty byte?)
+
+OMAP1 support
+
+- Not sure if needed
diff --git a/Documentation/arm/omap/index.rst b/Documentation/arm/omap/index.rst
new file mode 100644
index 000000000000..8b365b212e49
--- /dev/null
+++ b/Documentation/arm/omap/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======
+TI OMAP
+=======
+
+.. toctree::
+   :maxdepth: 1
+
+   omap
+   omap_pm
+   dss
diff --git a/Documentation/arm/omap/omap.rst b/Documentation/arm/omap/omap.rst
new file mode 100644
index 000000000000..f440c0f4613f
--- /dev/null
+++ b/Documentation/arm/omap/omap.rst
@@ -0,0 +1,18 @@
+============
+OMAP history
+============
+
+This file contains documentation for running mainline
+kernel on omaps.
+
+======		======================================================
+KERNEL		NEW DEPENDENCIES
+======		======================================================
+v4.3+		Update is needed for custom .config files to make sure
+		CONFIG_REGULATOR_PBIAS is enabled for MMC1 to work
+		properly.
+
+v4.18+		Update is needed for custom .config files to make sure
+		CONFIG_MMC_SDHCI_OMAP is enabled for all MMC instances
+		to work in DRA7 and K2G based boards.
+======		======================================================
diff --git a/Documentation/arm/omap/omap_pm.rst b/Documentation/arm/omap/omap_pm.rst
new file mode 100644
index 000000000000..a335e4c8ce2c
--- /dev/null
+++ b/Documentation/arm/omap/omap_pm.rst
@@ -0,0 +1,165 @@
+=====================
+The OMAP PM interface
+=====================
+
+This document describes the temporary OMAP PM interface.  Driver
+authors use these functions to communicate minimum latency or
+throughput constraints to the kernel power management code.
+Over time, the intention is to merge features from the OMAP PM
+interface into the Linux PM QoS code.
+
+Drivers need to express PM parameters which:
+
+- support the range of power management parameters present in the TI SRF;
+
+- separate the drivers from the underlying PM parameter
+  implementation, whether it is the TI SRF or Linux PM QoS or Linux
+  latency framework or something else;
+
+- specify PM parameters in terms of fundamental units, such as
+  latency and throughput, rather than units which are specific to OMAP
+  or to particular OMAP variants;
+
+- allow drivers which are shared with other architectures (e.g.,
+  DaVinci) to add these constraints in a way which won't affect non-OMAP
+  systems,
+
+- can be implemented immediately with minimal disruption of other
+  architectures.
+
+
+This document proposes the OMAP PM interface, including the following
+five power management functions for driver code:
+
+1. Set the maximum MPU wakeup latency::
+
+   (*pdata->set_max_mpu_wakeup_lat)(struct device *dev, unsigned long t)
+
+2. Set the maximum device wakeup latency::
+
+   (*pdata->set_max_dev_wakeup_lat)(struct device *dev, unsigned long t)
+
+3. Set the maximum system DMA transfer start latency (CORE pwrdm)::
+
+   (*pdata->set_max_sdma_lat)(struct device *dev, long t)
+
+4. Set the minimum bus throughput needed by a device::
+
+   (*pdata->set_min_bus_tput)(struct device *dev, u8 agent_id, unsigned long r)
+
+5. Return the number of times the device has lost context::
+
+   (*pdata->get_dev_context_loss_count)(struct device *dev)
+
+
+Further documentation for all OMAP PM interface functions can be
+found in arch/arm/plat-omap/include/mach/omap-pm.h.
+
+
+The OMAP PM layer is intended to be temporary
+---------------------------------------------
+
+The intention is that eventually the Linux PM QoS layer should support
+the range of power management features present in OMAP3.  As this
+happens, existing drivers using the OMAP PM interface can be modified
+to use the Linux PM QoS code; and the OMAP PM interface can disappear.
+
+
+Driver usage of the OMAP PM functions
+-------------------------------------
+
+As the 'pdata' in the above examples indicates, these functions are
+exposed to drivers through function pointers in driver .platform_data
+structures.  The function pointers are initialized by the `board-*.c`
+files to point to the corresponding OMAP PM functions:
+
+- set_max_dev_wakeup_lat will point to
+  omap_pm_set_max_dev_wakeup_lat(), etc.  Other architectures which do
+  not support these functions should leave these function pointers set
+  to NULL.  Drivers should use the following idiom::
+
+        if (pdata->set_max_dev_wakeup_lat)
+            (*pdata->set_max_dev_wakeup_lat)(dev, t);
+
+The most common usage of these functions will probably be to specify
+the maximum time from when an interrupt occurs, to when the device
+becomes accessible.  To accomplish this, driver writers should use the
+set_max_mpu_wakeup_lat() function to constrain the MPU wakeup
+latency, and the set_max_dev_wakeup_lat() function to constrain the
+device wakeup latency (from clk_enable() to accessibility).  For
+example::
+
+        /* Limit MPU wakeup latency */
+        if (pdata->set_max_mpu_wakeup_lat)
+            (*pdata->set_max_mpu_wakeup_lat)(dev, tc);
+
+        /* Limit device powerdomain wakeup latency */
+        if (pdata->set_max_dev_wakeup_lat)
+            (*pdata->set_max_dev_wakeup_lat)(dev, td);
+
+        /* total wakeup latency in this example: (tc + td) */
+
+The PM parameters can be overwritten by calling the function again
+with the new value.  The settings can be removed by calling the
+function with a t argument of -1 (except in the case of
+set_max_bus_tput(), which should be called with an r argument of 0).
+
+The fifth function above, omap_pm_get_dev_context_loss_count(),
+is intended as an optimization to allow drivers to determine whether the
+device has lost its internal context.  If context has been lost, the
+driver must restore its internal context before proceeding.
+
+
+Other specialized interface functions
+-------------------------------------
+
+The five functions listed above are intended to be usable by any
+device driver.  DSPBridge and CPUFreq have a few special requirements.
+DSPBridge expresses target DSP performance levels in terms of OPP IDs.
+CPUFreq expresses target MPU performance levels in terms of MPU
+frequency.  The OMAP PM interface contains functions for these
+specialized cases to convert that input information (OPPs/MPU
+frequency) into the form that the underlying power management
+implementation needs:
+
+6. `(*pdata->dsp_get_opp_table)(void)`
+
+7. `(*pdata->dsp_set_min_opp)(u8 opp_id)`
+
+8. `(*pdata->dsp_get_opp)(void)`
+
+9. `(*pdata->cpu_get_freq_table)(void)`
+
+10. `(*pdata->cpu_set_freq)(unsigned long f)`
+
+11. `(*pdata->cpu_get_freq)(void)`
+
+Customizing OPP for platform
+============================
+Defining CONFIG_PM should enable OPP layer for the silicon
+and the registration of OPP table should take place automatically.
+However, in special cases, the default OPP table may need to be
+tweaked, for e.g.:
+
+ * enable default OPPs which are disabled by default, but which
+   could be enabled on a platform
+ * Disable an unsupported OPP on the platform
+ * Define and add a custom opp table entry
+   in these cases, the board file needs to do additional steps as follows:
+
+arch/arm/mach-omapx/board-xyz.c::
+
+	#include "pm.h"
+	....
+	static void __init omap_xyz_init_irq(void)
+	{
+		....
+		/* Initialize the default table */
+		omapx_opp_init();
+		/* Do customization to the defaults */
+		....
+	}
+
+NOTE:
+  omapx_opp_init will be omap3_opp_init or as required
+  based on the omap family.
diff --git a/Documentation/arm/porting.rst b/Documentation/arm/porting.rst
new file mode 100644
index 000000000000..bd21958bdb2d
--- /dev/null
+++ b/Documentation/arm/porting.rst
@@ -0,0 +1,137 @@
+=======
+Porting
+=======
+
+Taken from list archive at http://lists.arm.linux.org.uk/pipermail/linux-arm-kernel/2001-July/004064.html
+
+Initial definitions
+-------------------
+
+The following symbol definitions rely on you knowing the translation that
+__virt_to_phys() does for your machine.  This macro converts the passed
+virtual address to a physical address.  Normally, it is simply:
+
+		phys = virt - PAGE_OFFSET + PHYS_OFFSET
+
+
+Decompressor Symbols
+--------------------
+
+ZTEXTADDR
+	Start address of decompressor.  There's no point in talking about
+	virtual or physical addresses here, since the MMU will be off at
+	the time when you call the decompressor code.  You normally call
+	the kernel at this address to start it booting.  This doesn't have
+	to be located in RAM, it can be in flash or other read-only or
+	read-write addressable medium.
+
+ZBSSADDR
+	Start address of zero-initialised work area for the decompressor.
+	This must be pointing at RAM.  The decompressor will zero initialise
+	this for you.  Again, the MMU will be off.
+
+ZRELADDR
+	This is the address where the decompressed kernel will be written,
+	and eventually executed.  The following constraint must be valid:
+
+		__virt_to_phys(TEXTADDR) == ZRELADDR
+
+	The initial part of the kernel is carefully coded to be position
+	independent.
+
+INITRD_PHYS
+	Physical address to place the initial RAM disk.  Only relevant if
+	you are using the bootpImage stuff (which only works on the old
+	struct param_struct).
+
+INITRD_VIRT
+	Virtual address of the initial RAM disk.  The following  constraint
+	must be valid:
+
+		__virt_to_phys(INITRD_VIRT) == INITRD_PHYS
+
+PARAMS_PHYS
+	Physical address of the struct param_struct or tag list, giving the
+	kernel various parameters about its execution environment.
+
+
+Kernel Symbols
+--------------
+
+PHYS_OFFSET
+	Physical start address of the first bank of RAM.
+
+PAGE_OFFSET
+	Virtual start address of the first bank of RAM.  During the kernel
+	boot phase, virtual address PAGE_OFFSET will be mapped to physical
+	address PHYS_OFFSET, along with any other mappings you supply.
+	This should be the same value as TASK_SIZE.
+
+TASK_SIZE
+	The maximum size of a user process in bytes.  Since user space
+	always starts at zero, this is the maximum address that a user
+	process can access+1.  The user space stack grows down from this
+	address.
+
+	Any virtual address below TASK_SIZE is deemed to be user process
+	area, and therefore managed dynamically on a process by process
+	basis by the kernel.  I'll call this the user segment.
+
+	Anything above TASK_SIZE is common to all processes.  I'll call
+	this the kernel segment.
+
+	(In other words, you can't put IO mappings below TASK_SIZE, and
+	hence PAGE_OFFSET).
+
+TEXTADDR
+	Virtual start address of kernel, normally PAGE_OFFSET + 0x8000.
+	This is where the kernel image ends up.  With the latest kernels,
+	it must be located at 32768 bytes into a 128MB region.  Previous
+	kernels placed a restriction of 256MB here.
+
+DATAADDR
+	Virtual address for the kernel data segment.  Must not be defined
+	when using the decompressor.
+
+VMALLOC_START / VMALLOC_END
+	Virtual addresses bounding the vmalloc() area.  There must not be
+	any static mappings in this area; vmalloc will overwrite them.
+	The addresses must also be in the kernel segment (see above).
+	Normally, the vmalloc() area starts VMALLOC_OFFSET bytes above the
+	last virtual RAM address (found using variable high_memory).
+
+VMALLOC_OFFSET
+	Offset normally incorporated into VMALLOC_START to provide a hole
+	between virtual RAM and the vmalloc area.  We do this to allow
+	out of bounds memory accesses (eg, something writing off the end
+	of the mapped memory map) to be caught.  Normally set to 8MB.
+
+Architecture Specific Macros
+----------------------------
+
+BOOT_MEM(pram,pio,vio)
+	`pram` specifies the physical start address of RAM.  Must always
+	be present, and should be the same as PHYS_OFFSET.
+
+	`pio` is the physical address of an 8MB region containing IO for
+	use with the debugging macros in arch/arm/kernel/debug-armv.S.
+
+	`vio` is the virtual address of the 8MB debugging region.
+
+	It is expected that the debugging region will be re-initialised
+	by the architecture specific code later in the code (via the
+	MAPIO function).
+
+BOOT_PARAMS
+	Same as, and see PARAMS_PHYS.
+
+FIXUP(func)
+	Machine specific fixups, run before memory subsystems have been
+	initialised.
+
+MAPIO(func)
+	Machine specific function to map IO areas (including the debug
+	region above).
+
+INITIRQ(func)
+	Machine specific function to initialise interrupts.
diff --git a/Documentation/arm/pxa/mfp.rst b/Documentation/arm/pxa/mfp.rst
new file mode 100644
index 000000000000..ac34e5d7ee44
--- /dev/null
+++ b/Documentation/arm/pxa/mfp.rst
@@ -0,0 +1,288 @@
+==============================================
+MFP Configuration for PXA2xx/PXA3xx Processors
+==============================================
+
+			Eric Miao <eric.miao@marvell.com>
+
+MFP stands for Multi-Function Pin, which is the pin-mux logic on PXA3xx and
+later PXA series processors.  This document describes the existing MFP API,
+and how board/platform driver authors could make use of it.
+
+Basic Concept
+=============
+
+Unlike the GPIO alternate function settings on PXA25x and PXA27x, a new MFP
+mechanism is introduced from PXA3xx to completely move the pin-mux functions
+out of the GPIO controller. In addition to pin-mux configurations, the MFP
+also controls the low power state, driving strength, pull-up/down and event
+detection of each pin.  Below is a diagram of internal connections between
+the MFP logic and the remaining SoC peripherals::
+
+ +--------+
+ |        |--(GPIO19)--+
+ |  GPIO  |            |
+ |        |--(GPIO...) |
+ +--------+            |
+                       |       +---------+
+ +--------+            +------>|         |
+ |  PWM2  |--(PWM_OUT)-------->|   MFP   |
+ +--------+            +------>|         |-------> to external PAD
+                       | +---->|         |
+ +--------+            | | +-->|         |
+ |  SSP2  |---(TXD)----+ | |   +---------+
+ +--------+              | |
+                         | |
+ +--------+              | |
+ | Keypad |--(MKOUT4)----+ |
+ +--------+                |
+                           |
+ +--------+                |
+ |  UART2 |---(TXD)--------+
+ +--------+
+
+NOTE: the external pad is named as MFP_PIN_GPIO19, it doesn't necessarily
+mean it's dedicated for GPIO19, only as a hint that internally this pin
+can be routed from GPIO19 of the GPIO controller.
+
+To better understand the change from PXA25x/PXA27x GPIO alternate function
+to this new MFP mechanism, here are several key points:
+
+  1. GPIO controller on PXA3xx is now a dedicated controller, same as other
+     internal controllers like PWM, SSP and UART, with 128 internal signals
+     which can be routed to external through one or more MFPs (e.g. GPIO<0>
+     can be routed through either MFP_PIN_GPIO0 as well as MFP_PIN_GPIO0_2,
+     see arch/arm/mach-pxa/mfp-pxa300.h)
+
+  2. Alternate function configuration is removed from this GPIO controller,
+     the remaining functions are pure GPIO-specific, i.e.
+
+       - GPIO signal level control
+       - GPIO direction control
+       - GPIO level change detection
+
+  3. Low power state for each pin is now controlled by MFP, this means the
+     PGSRx registers on PXA2xx are now useless on PXA3xx
+
+  4. Wakeup detection is now controlled by MFP, PWER does not control the
+     wakeup from GPIO(s) any more, depending on the sleeping state, ADxER
+     (as defined in pxa3xx-regs.h) controls the wakeup from MFP
+
+NOTE: with such a clear separation of MFP and GPIO, by GPIO<xx> we normally
+mean it is a GPIO signal, and by MFP<xxx> or pin xxx, we mean a physical
+pad (or ball).
+
+MFP API Usage
+=============
+
+For board code writers, here are some guidelines:
+
+1. include ONE of the following header files in your <board>.c:
+
+   - #include "mfp-pxa25x.h"
+   - #include "mfp-pxa27x.h"
+   - #include "mfp-pxa300.h"
+   - #include "mfp-pxa320.h"
+   - #include "mfp-pxa930.h"
+
+   NOTE: only one file in your <board>.c, depending on the processors used,
+   because pin configuration definitions may conflict in these file (i.e.
+   same name, different meaning and settings on different processors). E.g.
+   for zylonite platform, which support both PXA300/PXA310 and PXA320, two
+   separate files are introduced: zylonite_pxa300.c and zylonite_pxa320.c
+   (in addition to handle MFP configuration differences, they also handle
+   the other differences between the two combinations).
+
+   NOTE: PXA300 and PXA310 are almost identical in pin configurations (with
+   PXA310 supporting some additional ones), thus the difference is actually
+   covered in a single mfp-pxa300.h.
+
+2. prepare an array for the initial pin configurations, e.g.::
+
+     static unsigned long mainstone_pin_config[] __initdata = {
+	/* Chip Select */
+	GPIO15_nCS_1,
+
+	/* LCD - 16bpp Active TFT */
+	GPIOxx_TFT_LCD_16BPP,
+	GPIO16_PWM0_OUT,	/* Backlight */
+
+	/* MMC */
+	GPIO32_MMC_CLK,
+	GPIO112_MMC_CMD,
+	GPIO92_MMC_DAT_0,
+	GPIO109_MMC_DAT_1,
+	GPIO110_MMC_DAT_2,
+	GPIO111_MMC_DAT_3,
+
+	...
+
+	/* GPIO */
+	GPIO1_GPIO | WAKEUP_ON_EDGE_BOTH,
+     };
+
+   a) once the pin configurations are passed to pxa{2xx,3xx}_mfp_config(),
+   and written to the actual registers, they are useless and may discard,
+   adding '__initdata' will help save some additional bytes here.
+
+   b) when there is only one possible pin configurations for a component,
+   some simplified definitions can be used, e.g. GPIOxx_TFT_LCD_16BPP on
+   PXA25x and PXA27x processors
+
+   c) if by board design, a pin can be configured to wake up the system
+   from low power state, it can be 'OR'ed with any of:
+
+      WAKEUP_ON_EDGE_BOTH
+      WAKEUP_ON_EDGE_RISE
+      WAKEUP_ON_EDGE_FALL
+      WAKEUP_ON_LEVEL_HIGH - specifically for enabling of keypad GPIOs,
+
+   to indicate that this pin has the capability of wake-up the system,
+   and on which edge(s). This, however, doesn't necessarily mean the
+   pin _will_ wakeup the system, it will only when set_irq_wake() is
+   invoked with the corresponding GPIO IRQ (GPIO_IRQ(xx) or gpio_to_irq())
+   and eventually calls gpio_set_wake() for the actual register setting.
+
+   d) although PXA3xx MFP supports edge detection on each pin, the
+   internal logic will only wakeup the system when those specific bits
+   in ADxER registers are set, which can be well mapped to the
+   corresponding peripheral, thus set_irq_wake() can be called with
+   the peripheral IRQ to enable the wakeup.
+
+
+MFP on PXA3xx
+=============
+
+Every external I/O pad on PXA3xx (excluding those for special purpose) has
+one MFP logic associated, and is controlled by one MFP register (MFPR).
+
+The MFPR has the following bit definitions (for PXA300/PXA310/PXA320)::
+
+ 31                        16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0
+  +-------------------------+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+  |         RESERVED        |PS|PU|PD|  DRIVE |SS|SD|SO|EC|EF|ER|--| AF_SEL |
+  +-------------------------+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+
+  Bit 3:   RESERVED
+  Bit 4:   EDGE_RISE_EN - enable detection of rising edge on this pin
+  Bit 5:   EDGE_FALL_EN - enable detection of falling edge on this pin
+  Bit 6:   EDGE_CLEAR   - disable edge detection on this pin
+  Bit 7:   SLEEP_OE_N   - enable outputs during low power modes
+  Bit 8:   SLEEP_DATA   - output data on the pin during low power modes
+  Bit 9:   SLEEP_SEL    - selection control for low power modes signals
+  Bit 13:  PULLDOWN_EN  - enable the internal pull-down resistor on this pin
+  Bit 14:  PULLUP_EN    - enable the internal pull-up resistor on this pin
+  Bit 15:  PULL_SEL     - pull state controlled by selected alternate function
+                          (0) or by PULL{UP,DOWN}_EN bits (1)
+
+  Bit 0 - 2: AF_SEL - alternate function selection, 8 possibilities, from 0-7
+  Bit 10-12: DRIVE  - drive strength and slew rate
+			0b000 - fast 1mA
+			0b001 - fast 2mA
+			0b002 - fast 3mA
+			0b003 - fast 4mA
+			0b004 - slow 6mA
+			0b005 - fast 6mA
+			0b006 - slow 10mA
+			0b007 - fast 10mA
+
+MFP Design for PXA2xx/PXA3xx
+============================
+
+Due to the difference of pin-mux handling between PXA2xx and PXA3xx, a unified
+MFP API is introduced to cover both series of processors.
+
+The basic idea of this design is to introduce definitions for all possible pin
+configurations, these definitions are processor and platform independent, and
+the actual API invoked to convert these definitions into register settings and
+make them effective there-after.
+
+Files Involved
+--------------
+
+  - arch/arm/mach-pxa/include/mach/mfp.h
+
+  for
+    1. Unified pin definitions - enum constants for all configurable pins
+    2. processor-neutral bit definitions for a possible MFP configuration
+
+  - arch/arm/mach-pxa/mfp-pxa3xx.h
+
+  for PXA3xx specific MFPR register bit definitions and PXA3xx common pin
+  configurations
+
+  - arch/arm/mach-pxa/mfp-pxa2xx.h
+
+  for PXA2xx specific definitions and PXA25x/PXA27x common pin configurations
+
+  - arch/arm/mach-pxa/mfp-pxa25x.h
+    arch/arm/mach-pxa/mfp-pxa27x.h
+    arch/arm/mach-pxa/mfp-pxa300.h
+    arch/arm/mach-pxa/mfp-pxa320.h
+    arch/arm/mach-pxa/mfp-pxa930.h
+
+  for processor specific definitions
+
+  - arch/arm/mach-pxa/mfp-pxa3xx.c
+  - arch/arm/mach-pxa/mfp-pxa2xx.c
+
+  for implementation of the pin configuration to take effect for the actual
+  processor.
+
+Pin Configuration
+-----------------
+
+  The following comments are copied from mfp.h (see the actual source code
+  for most updated info)::
+
+    /*
+     * a possible MFP configuration is represented by a 32-bit integer
+     *
+     * bit  0.. 9 - MFP Pin Number (1024 Pins Maximum)
+     * bit 10..12 - Alternate Function Selection
+     * bit 13..15 - Drive Strength
+     * bit 16..18 - Low Power Mode State
+     * bit 19..20 - Low Power Mode Edge Detection
+     * bit 21..22 - Run Mode Pull State
+     *
+     * to facilitate the definition, the following macros are provided
+     *
+     * MFP_CFG_DEFAULT - default MFP configuration value, with
+     * 		  alternate function = 0,
+     * 		  drive strength = fast 3mA (MFP_DS03X)
+     * 		  low power mode = default
+     * 		  edge detection = none
+     *
+     * MFP_CFG	- default MFPR value with alternate function
+     * MFP_CFG_DRV	- default MFPR value with alternate function and
+     * 		  pin drive strength
+     * MFP_CFG_LPM	- default MFPR value with alternate function and
+     * 		  low power mode
+     * MFP_CFG_X	- default MFPR value with alternate function,
+     * 		  pin drive strength and low power mode
+     */
+
+   Examples of pin configurations are::
+
+     #define GPIO94_SSP3_RXD		MFP_CFG_X(GPIO94, AF1, DS08X, FLOAT)
+
+   which reads GPIO94 can be configured as SSP3_RXD, with alternate function
+   selection of 1, driving strength of 0b101, and a float state in low power
+   modes.
+
+   NOTE: this is the default setting of this pin being configured as SSP3_RXD
+   which can be modified a bit in board code, though it is not recommended to
+   do so, simply because this default setting is usually carefully encoded,
+   and is supposed to work in most cases.
+
+Register Settings
+-----------------
+
+   Register settings on PXA3xx for a pin configuration is actually very
+   straight-forward, most bits can be converted directly into MFPR value
+   in a easier way. Two sets of MFPR values are calculated: the run-time
+   ones and the low power mode ones, to allow different settings.
+
+   The conversion from a generic pin configuration to the actual register
+   settings on PXA2xx is a bit complicated: many registers are involved,
+   including GAFRx, GPDRx, PGSRx, PWER, PKWR, PFER and PRER. Please see
+   mfp-pxa2xx.c for how the conversion is made.
diff --git a/Documentation/arm/pxa/mfp.txt b/Documentation/arm/pxa/mfp.txt
deleted file mode 100644
index 0b7cab978c02..000000000000
--- a/Documentation/arm/pxa/mfp.txt
+++ /dev/null
@@ -1,286 +0,0 @@
-                 MFP Configuration for PXA2xx/PXA3xx Processors
-
-			Eric Miao <eric.miao@marvell.com>
-
-MFP stands for Multi-Function Pin, which is the pin-mux logic on PXA3xx and
-later PXA series processors.  This document describes the existing MFP API,
-and how board/platform driver authors could make use of it.
-
- Basic Concept
-===============
-
-Unlike the GPIO alternate function settings on PXA25x and PXA27x, a new MFP
-mechanism is introduced from PXA3xx to completely move the pin-mux functions
-out of the GPIO controller. In addition to pin-mux configurations, the MFP
-also controls the low power state, driving strength, pull-up/down and event
-detection of each pin.  Below is a diagram of internal connections between
-the MFP logic and the remaining SoC peripherals:
-
- +--------+
- |        |--(GPIO19)--+
- |  GPIO  |            |
- |        |--(GPIO...) |
- +--------+            |
-                       |       +---------+
- +--------+            +------>|         |
- |  PWM2  |--(PWM_OUT)-------->|   MFP   |
- +--------+            +------>|         |-------> to external PAD
-                       | +---->|         |
- +--------+            | | +-->|         |
- |  SSP2  |---(TXD)----+ | |   +---------+
- +--------+              | |
-                         | |
- +--------+              | |
- | Keypad |--(MKOUT4)----+ |
- +--------+                |
-                           |
- +--------+                |
- |  UART2 |---(TXD)--------+
- +--------+
-
-NOTE: the external pad is named as MFP_PIN_GPIO19, it doesn't necessarily
-mean it's dedicated for GPIO19, only as a hint that internally this pin
-can be routed from GPIO19 of the GPIO controller.
-
-To better understand the change from PXA25x/PXA27x GPIO alternate function
-to this new MFP mechanism, here are several key points:
-
-  1. GPIO controller on PXA3xx is now a dedicated controller, same as other
-     internal controllers like PWM, SSP and UART, with 128 internal signals
-     which can be routed to external through one or more MFPs (e.g. GPIO<0>
-     can be routed through either MFP_PIN_GPIO0 as well as MFP_PIN_GPIO0_2,
-     see arch/arm/mach-pxa/mfp-pxa300.h)
-
-  2. Alternate function configuration is removed from this GPIO controller,
-     the remaining functions are pure GPIO-specific, i.e.
-
-       - GPIO signal level control
-       - GPIO direction control
-       - GPIO level change detection
-
-  3. Low power state for each pin is now controlled by MFP, this means the
-     PGSRx registers on PXA2xx are now useless on PXA3xx
-
-  4. Wakeup detection is now controlled by MFP, PWER does not control the
-     wakeup from GPIO(s) any more, depending on the sleeping state, ADxER
-     (as defined in pxa3xx-regs.h) controls the wakeup from MFP
-
-NOTE: with such a clear separation of MFP and GPIO, by GPIO<xx> we normally
-mean it is a GPIO signal, and by MFP<xxx> or pin xxx, we mean a physical
-pad (or ball).
-
- MFP API Usage
-===============
-
-For board code writers, here are some guidelines:
-
-1. include ONE of the following header files in your <board>.c:
-
-   - #include "mfp-pxa25x.h"
-   - #include "mfp-pxa27x.h"
-   - #include "mfp-pxa300.h"
-   - #include "mfp-pxa320.h"
-   - #include "mfp-pxa930.h"
-
-   NOTE: only one file in your <board>.c, depending on the processors used,
-   because pin configuration definitions may conflict in these file (i.e.
-   same name, different meaning and settings on different processors). E.g.
-   for zylonite platform, which support both PXA300/PXA310 and PXA320, two
-   separate files are introduced: zylonite_pxa300.c and zylonite_pxa320.c
-   (in addition to handle MFP configuration differences, they also handle
-   the other differences between the two combinations).
-
-   NOTE: PXA300 and PXA310 are almost identical in pin configurations (with
-   PXA310 supporting some additional ones), thus the difference is actually
-   covered in a single mfp-pxa300.h.
-
-2. prepare an array for the initial pin configurations, e.g.:
-
-   static unsigned long mainstone_pin_config[] __initdata = {
-	/* Chip Select */
-	GPIO15_nCS_1,
-
-	/* LCD - 16bpp Active TFT */
-	GPIOxx_TFT_LCD_16BPP,
-	GPIO16_PWM0_OUT,	/* Backlight */
-
-	/* MMC */
-	GPIO32_MMC_CLK,
-	GPIO112_MMC_CMD,
-	GPIO92_MMC_DAT_0,
-	GPIO109_MMC_DAT_1,
-	GPIO110_MMC_DAT_2,
-	GPIO111_MMC_DAT_3,
-
-	...
-
-	/* GPIO */
-	GPIO1_GPIO | WAKEUP_ON_EDGE_BOTH,
-   };
-
-   a) once the pin configurations are passed to pxa{2xx,3xx}_mfp_config(),
-   and written to the actual registers, they are useless and may discard,
-   adding '__initdata' will help save some additional bytes here.
-
-   b) when there is only one possible pin configurations for a component,
-   some simplified definitions can be used, e.g. GPIOxx_TFT_LCD_16BPP on
-   PXA25x and PXA27x processors
-
-   c) if by board design, a pin can be configured to wake up the system
-   from low power state, it can be 'OR'ed with any of:
-
-      WAKEUP_ON_EDGE_BOTH
-      WAKEUP_ON_EDGE_RISE
-      WAKEUP_ON_EDGE_FALL
-      WAKEUP_ON_LEVEL_HIGH - specifically for enabling of keypad GPIOs,
-
-   to indicate that this pin has the capability of wake-up the system,
-   and on which edge(s). This, however, doesn't necessarily mean the
-   pin _will_ wakeup the system, it will only when set_irq_wake() is
-   invoked with the corresponding GPIO IRQ (GPIO_IRQ(xx) or gpio_to_irq())
-   and eventually calls gpio_set_wake() for the actual register setting.
-
-   d) although PXA3xx MFP supports edge detection on each pin, the
-   internal logic will only wakeup the system when those specific bits
-   in ADxER registers are set, which can be well mapped to the
-   corresponding peripheral, thus set_irq_wake() can be called with 
-   the peripheral IRQ to enable the wakeup.
-
-
- MFP on PXA3xx
-===============
-
-Every external I/O pad on PXA3xx (excluding those for special purpose) has
-one MFP logic associated, and is controlled by one MFP register (MFPR).
-
-The MFPR has the following bit definitions (for PXA300/PXA310/PXA320):
-
- 31                        16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0
-  +-------------------------+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-  |         RESERVED        |PS|PU|PD|  DRIVE |SS|SD|SO|EC|EF|ER|--| AF_SEL |
-  +-------------------------+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-
-  Bit 3:   RESERVED
-  Bit 4:   EDGE_RISE_EN - enable detection of rising edge on this pin
-  Bit 5:   EDGE_FALL_EN - enable detection of falling edge on this pin
-  Bit 6:   EDGE_CLEAR   - disable edge detection on this pin
-  Bit 7:   SLEEP_OE_N   - enable outputs during low power modes
-  Bit 8:   SLEEP_DATA   - output data on the pin during low power modes
-  Bit 9:   SLEEP_SEL    - selection control for low power modes signals
-  Bit 13:  PULLDOWN_EN  - enable the internal pull-down resistor on this pin
-  Bit 14:  PULLUP_EN    - enable the internal pull-up resistor on this pin
-  Bit 15:  PULL_SEL     - pull state controlled by selected alternate function
-                          (0) or by PULL{UP,DOWN}_EN bits (1)
-
-  Bit 0 - 2: AF_SEL - alternate function selection, 8 possibilities, from 0-7
-  Bit 10-12: DRIVE  - drive strength and slew rate
-			0b000 - fast 1mA
-			0b001 - fast 2mA
-			0b002 - fast 3mA
-			0b003 - fast 4mA
-			0b004 - slow 6mA
-			0b005 - fast 6mA
-			0b006 - slow 10mA
-			0b007 - fast 10mA
-
- MFP Design for PXA2xx/PXA3xx
-==============================
-
-Due to the difference of pin-mux handling between PXA2xx and PXA3xx, a unified
-MFP API is introduced to cover both series of processors.
-
-The basic idea of this design is to introduce definitions for all possible pin
-configurations, these definitions are processor and platform independent, and
-the actual API invoked to convert these definitions into register settings and
-make them effective there-after.
-
-  Files Involved
-  --------------
-
-  - arch/arm/mach-pxa/include/mach/mfp.h
-  
-  for
-    1. Unified pin definitions - enum constants for all configurable pins
-    2. processor-neutral bit definitions for a possible MFP configuration
-
-  - arch/arm/mach-pxa/mfp-pxa3xx.h
-
-  for PXA3xx specific MFPR register bit definitions and PXA3xx common pin
-  configurations
-
-  - arch/arm/mach-pxa/mfp-pxa2xx.h
-
-  for PXA2xx specific definitions and PXA25x/PXA27x common pin configurations
-
-  - arch/arm/mach-pxa/mfp-pxa25x.h
-    arch/arm/mach-pxa/mfp-pxa27x.h
-    arch/arm/mach-pxa/mfp-pxa300.h
-    arch/arm/mach-pxa/mfp-pxa320.h
-    arch/arm/mach-pxa/mfp-pxa930.h
-
-  for processor specific definitions
-
-  - arch/arm/mach-pxa/mfp-pxa3xx.c
-  - arch/arm/mach-pxa/mfp-pxa2xx.c
-
-  for implementation of the pin configuration to take effect for the actual
-  processor.
-
-  Pin Configuration
-  -----------------
-
-  The following comments are copied from mfp.h (see the actual source code
-  for most updated info)
-  
-  /*
-   * a possible MFP configuration is represented by a 32-bit integer
-   *
-   * bit  0.. 9 - MFP Pin Number (1024 Pins Maximum)
-   * bit 10..12 - Alternate Function Selection
-   * bit 13..15 - Drive Strength
-   * bit 16..18 - Low Power Mode State
-   * bit 19..20 - Low Power Mode Edge Detection
-   * bit 21..22 - Run Mode Pull State
-   *
-   * to facilitate the definition, the following macros are provided
-   *
-   * MFP_CFG_DEFAULT - default MFP configuration value, with
-   * 		  alternate function = 0,
-   * 		  drive strength = fast 3mA (MFP_DS03X)
-   * 		  low power mode = default
-   * 		  edge detection = none
-   *
-   * MFP_CFG	- default MFPR value with alternate function
-   * MFP_CFG_DRV	- default MFPR value with alternate function and
-   * 		  pin drive strength
-   * MFP_CFG_LPM	- default MFPR value with alternate function and
-   * 		  low power mode
-   * MFP_CFG_X	- default MFPR value with alternate function,
-   * 		  pin drive strength and low power mode
-   */
-
-   Examples of pin configurations are:
-
-   #define GPIO94_SSP3_RXD		MFP_CFG_X(GPIO94, AF1, DS08X, FLOAT)
-
-   which reads GPIO94 can be configured as SSP3_RXD, with alternate function
-   selection of 1, driving strength of 0b101, and a float state in low power
-   modes.
-
-   NOTE: this is the default setting of this pin being configured as SSP3_RXD
-   which can be modified a bit in board code, though it is not recommended to
-   do so, simply because this default setting is usually carefully encoded,
-   and is supposed to work in most cases.
-
-  Register Settings
-  -----------------
-
-   Register settings on PXA3xx for a pin configuration is actually very
-   straight-forward, most bits can be converted directly into MFPR value
-   in a easier way. Two sets of MFPR values are calculated: the run-time
-   ones and the low power mode ones, to allow different settings.
-
-   The conversion from a generic pin configuration to the actual register
-   settings on PXA2xx is a bit complicated: many registers are involved,
-   including GAFRx, GPDRx, PGSRx, PWER, PKWR, PFER and PRER. Please see
-   mfp-pxa2xx.c for how the conversion is made.
diff --git a/Documentation/arm/sa1100/adsbitsy.rst b/Documentation/arm/sa1100/adsbitsy.rst
new file mode 100644
index 000000000000..c179cb26b682
--- /dev/null
+++ b/Documentation/arm/sa1100/adsbitsy.rst
@@ -0,0 +1,51 @@
+===============================
+ADS Bitsy Single Board Computer
+===============================
+
+(It is different from Bitsy(iPAQ) of Compaq)
+
+For more details, contact Applied Data Systems or see
+http://www.applieddata.net/products.html
+
+The Linux support for this product has been provided by
+Woojung Huh <whuh@applieddata.net>
+
+Use 'make adsbitsy_config' before any 'make config'.
+This will set up defaults for ADS Bitsy support.
+
+The kernel zImage is linked to be loaded and executed at 0xc0400000.
+
+Linux can  be used with the ADS BootLoader that ships with the
+newer rev boards. See their documentation on how to load Linux.
+
+Supported peripherals
+=====================
+
+- SA1100 LCD frame buffer (8/16bpp...sort of)
+- SA1111 USB Master
+- SA1100 serial port
+- pcmcia, compact flash
+- touchscreen(ucb1200)
+- console on LCD screen
+- serial ports (ttyS[0-2])
+  - ttyS0 is default for serial console
+
+To do
+=====
+
+- everything else!  :-)
+
+Notes
+=====
+
+- The flash on board is divided into 3 partitions.
+  You should be careful to use flash on board.
+  Its partition is different from GraphicsClient Plus and GraphicsMaster
+
+- 16bpp mode requires a different cable than what ships with the board.
+  Contact ADS or look through the manual to wire your own. Currently,
+  if you compile with 16bit mode support and switch into a lower bpp
+  mode, the timing is off so the image is corrupted.  This will be
+  fixed soon.
+
+Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/sa1100/assabet.rst b/Documentation/arm/sa1100/assabet.rst
new file mode 100644
index 000000000000..3e704831c311
--- /dev/null
+++ b/Documentation/arm/sa1100/assabet.rst
@@ -0,0 +1,301 @@
+============================================
+The Intel Assabet (SA-1110 evaluation) board
+============================================
+
+Please see:
+http://developer.intel.com
+
+Also some notes from John G Dorsey <jd5q@andrew.cmu.edu>:
+http://www.cs.cmu.edu/~wearable/software/assabet.html
+
+
+Building the kernel
+-------------------
+
+To build the kernel with current defaults::
+
+	make assabet_config
+	make oldconfig
+	make zImage
+
+The resulting kernel image should be available in linux/arch/arm/boot/zImage.
+
+
+Installing a bootloader
+-----------------------
+
+A couple of bootloaders able to boot Linux on Assabet are available:
+
+BLOB (http://www.lartmaker.nl/lartware/blob/)
+
+   BLOB is a bootloader used within the LART project.  Some contributed
+   patches were merged into BLOB to add support for Assabet.
+
+Compaq's Bootldr + John Dorsey's patch for Assabet support
+(http://www.handhelds.org/Compaq/bootldr.html)
+(http://www.wearablegroup.org/software/bootldr/)
+
+   Bootldr is the bootloader developed by Compaq for the iPAQ Pocket PC.
+   John Dorsey has produced add-on patches to add support for Assabet and
+   the JFFS filesystem.
+
+RedBoot (http://sources.redhat.com/redboot/)
+
+   RedBoot is a bootloader developed by Red Hat based on the eCos RTOS
+   hardware abstraction layer.  It supports Assabet amongst many other
+   hardware platforms.
+
+RedBoot is currently the recommended choice since it's the only one to have
+networking support, and is the most actively maintained.
+
+Brief examples on how to boot Linux with RedBoot are shown below.  But first
+you need to have RedBoot installed in your flash memory.  A known to work
+precompiled RedBoot binary is available from the following location:
+
+- ftp://ftp.netwinder.org/users/n/nico/
+- ftp://ftp.arm.linux.org.uk/pub/linux/arm/people/nico/
+- ftp://ftp.handhelds.org/pub/linux/arm/sa-1100-patches/
+
+Look for redboot-assabet*.tgz.  Some installation infos are provided in
+redboot-assabet*.txt.
+
+
+Initial RedBoot configuration
+-----------------------------
+
+The commands used here are explained in The RedBoot User's Guide available
+on-line at http://sources.redhat.com/ecos/docs.html.
+Please refer to it for explanations.
+
+If you have a CF network card (my Assabet kit contained a CF+ LP-E from
+Socket Communications Inc.), you should strongly consider using it for TFTP
+file transfers.  You must insert it before RedBoot runs since it can't detect
+it dynamically.
+
+To initialize the flash directory::
+
+	fis init -f
+
+To initialize the non-volatile settings, like whether you want to use BOOTP or
+a static IP address, etc, use this command::
+
+	fconfig -i
+
+
+Writing a kernel image into flash
+---------------------------------
+
+First, the kernel image must be loaded into RAM.  If you have the zImage file
+available on a TFTP server::
+
+	load zImage -r -b 0x100000
+
+If you rather want to use Y-Modem upload over the serial port::
+
+	load -m ymodem -r -b 0x100000
+
+To write it to flash::
+
+	fis create "Linux kernel" -b 0x100000 -l 0xc0000
+
+
+Booting the kernel
+------------------
+
+The kernel still requires a filesystem to boot.  A ramdisk image can be loaded
+as follows::
+
+	load ramdisk_image.gz -r -b 0x800000
+
+Again, Y-Modem upload can be used instead of TFTP by replacing the file name
+by '-y ymodem'.
+
+Now the kernel can be retrieved from flash like this::
+
+	fis load "Linux kernel"
+
+or loaded as described previously.  To boot the kernel::
+
+	exec -b 0x100000 -l 0xc0000
+
+The ramdisk image could be stored into flash as well, but there are better
+solutions for on-flash filesystems as mentioned below.
+
+
+Using JFFS2
+-----------
+
+Using JFFS2 (the Second Journalling Flash File System) is probably the most
+convenient way to store a writable filesystem into flash.  JFFS2 is used in
+conjunction with the MTD layer which is responsible for low-level flash
+management.  More information on the Linux MTD can be found on-line at:
+http://www.linux-mtd.infradead.org/.  A JFFS howto with some infos about
+creating JFFS/JFFS2 images is available from the same site.
+
+For instance, a sample JFFS2 image can be retrieved from the same FTP sites
+mentioned below for the precompiled RedBoot image.
+
+To load this file::
+
+	load sample_img.jffs2 -r -b 0x100000
+
+The result should look like::
+
+	RedBoot> load sample_img.jffs2 -r -b 0x100000
+	Raw file loaded 0x00100000-0x00377424
+
+Now we must know the size of the unallocated flash::
+
+	fis free
+
+Result::
+
+	RedBoot> fis free
+	  0x500E0000 .. 0x503C0000
+
+The values above may be different depending on the size of the filesystem and
+the type of flash.  See their usage below as an example and take care of
+substituting yours appropriately.
+
+We must determine some values::
+
+	size of unallocated flash:	0x503c0000 - 0x500e0000 = 0x2e0000
+	size of the filesystem image:	0x00377424 - 0x00100000 = 0x277424
+
+We want to fit the filesystem image of course, but we also want to give it all
+the remaining flash space as well.  To write it::
+
+	fis unlock -f 0x500E0000 -l 0x2e0000
+	fis erase -f 0x500E0000 -l 0x2e0000
+	fis write -b 0x100000 -l 0x277424 -f 0x500E0000
+	fis create "JFFS2" -n -f 0x500E0000 -l 0x2e0000
+
+Now the filesystem is associated to a MTD "partition" once Linux has discovered
+what they are in the boot process.  From Redboot, the 'fis list' command
+displays them::
+
+	RedBoot> fis list
+	Name              FLASH addr  Mem addr    Length      Entry point
+	RedBoot           0x50000000  0x50000000  0x00020000  0x00000000
+	RedBoot config    0x503C0000  0x503C0000  0x00020000  0x00000000
+	FIS directory     0x503E0000  0x503E0000  0x00020000  0x00000000
+	Linux kernel      0x50020000  0x00100000  0x000C0000  0x00000000
+	JFFS2             0x500E0000  0x500E0000  0x002E0000  0x00000000
+
+However Linux should display something like::
+
+	SA1100 flash: probing 32-bit flash bus
+	SA1100 flash: Found 2 x16 devices at 0x0 in 32-bit mode
+	Using RedBoot partition definition
+	Creating 5 MTD partitions on "SA1100 flash":
+	0x00000000-0x00020000 : "RedBoot"
+	0x00020000-0x000e0000 : "Linux kernel"
+	0x000e0000-0x003c0000 : "JFFS2"
+	0x003c0000-0x003e0000 : "RedBoot config"
+	0x003e0000-0x00400000 : "FIS directory"
+
+What's important here is the position of the partition we are interested in,
+which is the third one.  Within Linux, this correspond to /dev/mtdblock2.
+Therefore to boot Linux with the kernel and its root filesystem in flash, we
+need this RedBoot command::
+
+	fis load "Linux kernel"
+	exec -b 0x100000 -l 0xc0000 -c "root=/dev/mtdblock2"
+
+Of course other filesystems than JFFS might be used, like cramfs for example.
+You might want to boot with a root filesystem over NFS, etc.  It is also
+possible, and sometimes more convenient, to flash a filesystem directly from
+within Linux while booted from a ramdisk or NFS.  The Linux MTD repository has
+many tools to deal with flash memory as well, to erase it for example.  JFFS2
+can then be mounted directly on a freshly erased partition and files can be
+copied over directly.  Etc...
+
+
+RedBoot scripting
+-----------------
+
+All the commands above aren't so useful if they have to be typed in every
+time the Assabet is rebooted.  Therefore it's possible to automate the boot
+process using RedBoot's scripting capability.
+
+For example, I use this to boot Linux with both the kernel and the ramdisk
+images retrieved from a TFTP server on the network::
+
+	RedBoot> fconfig
+	Run script at boot: false true
+	Boot script:
+	Enter script, terminate with empty line
+	>> load zImage -r -b 0x100000
+	>> load ramdisk_ks.gz -r -b 0x800000
+	>> exec -b 0x100000 -l 0xc0000
+	>>
+	Boot script timeout (1000ms resolution): 3
+	Use BOOTP for network configuration: true
+	GDB connection port: 9000
+	Network debug at boot time: false
+	Update RedBoot non-volatile configuration - are you sure (y/n)? y
+
+Then, rebooting the Assabet is just a matter of waiting for the login prompt.
+
+
+
+Nicolas Pitre
+nico@fluxnic.net
+
+June 12, 2001
+
+
+Status of peripherals in -rmk tree (updated 14/10/2001)
+-------------------------------------------------------
+
+Assabet:
+ Serial ports:
+  Radio:		TX, RX, CTS, DSR, DCD, RI
+   - PM:		Not tested.
+   - COM:		TX, RX, CTS, DSR, DCD, RTS, DTR, PM
+   - PM:		Not tested.
+   - I2C:		Implemented, not fully tested.
+   - L3:		Fully tested, pass.
+   - PM:		Not tested.
+
+ Video:
+  - LCD:		Fully tested.  PM
+
+   (LCD doesn't like being blanked with neponset connected)
+
+  - Video out:		Not fully
+
+ Audio:
+  UDA1341:
+  -  Playback:		Fully tested, pass.
+  -  Record:		Implemented, not tested.
+  -  PM:			Not tested.
+
+  UCB1200:
+  -  Audio play:	Implemented, not heavily tested.
+  -  Audio rec:		Implemented, not heavily tested.
+  -  Telco audio play:	Implemented, not heavily tested.
+  -  Telco audio rec:	Implemented, not heavily tested.
+  -  POTS control:	No
+  -  Touchscreen:	Yes
+  -  PM:		Not tested.
+
+ Other:
+  - PCMCIA:
+  - LPE:		Fully tested, pass.
+  - USB:		No
+  - IRDA:
+  - SIR:		Fully tested, pass.
+  - FIR:		Fully tested, pass.
+  - PM:			Not tested.
+
+Neponset:
+ Serial ports:
+  - COM1,2:		TX, RX, CTS, DSR, DCD, RTS, DTR
+  - PM:			Not tested.
+  - USB:		Implemented, not heavily tested.
+  - PCMCIA:		Implemented, not heavily tested.
+  - CF:			Implemented, not heavily tested.
+  - PM:			Not tested.
+
+More stuff can be found in the -np (Nicolas Pitre's) tree.
diff --git a/Documentation/arm/sa1100/brutus.rst b/Documentation/arm/sa1100/brutus.rst
new file mode 100644
index 000000000000..e1a23bee6d44
--- /dev/null
+++ b/Documentation/arm/sa1100/brutus.rst
@@ -0,0 +1,69 @@
+======
+Brutus
+======
+
+Brutus is an evaluation platform for the SA1100 manufactured by Intel.
+For more details, see:
+
+http://developer.intel.com
+
+To compile for Brutus, you must issue the following commands::
+
+	make brutus_config
+	make config
+	[accept all the defaults]
+	make zImage
+
+The resulting kernel will end up in linux/arch/arm/boot/zImage.  This file
+must be loaded at 0xc0008000 in Brutus's memory and execution started at
+0xc0008000 as well with the value of registers r0 = 0 and r1 = 16 upon
+entry.
+
+But prior to execute the kernel, a ramdisk image must also be loaded in
+memory.  Use memory address 0xd8000000 for this.  Note that the file
+containing the (compressed) ramdisk image must not exceed 4 MB.
+
+Typically, you'll need angelboot to load the kernel.
+The following angelboot.opt file should be used::
+
+	base 0xc0008000
+	entry 0xc0008000
+	r0 0x00000000
+	r1 0x00000010
+	device /dev/ttyS0
+	options "9600 8N1"
+	baud 115200
+	otherfile ramdisk_img.gz
+	otherbase 0xd8000000
+
+Then load the kernel and ramdisk with::
+
+	angelboot -f angelboot.opt zImage
+
+The first Brutus serial port (assumed to be linked to /dev/ttyS0 on your
+host PC) is used by angel to load the kernel and ramdisk image. The serial
+console is provided through the second Brutus serial port. To access it,
+you may use minicom configured with /dev/ttyS1, 9600 baud, 8N1, no flow
+control.
+
+Currently supported
+===================
+
+	- RS232 serial ports
+	- audio output
+	- LCD screen
+	- keyboard
+
+The actual Brutus support may not be complete without extra patches.
+If such patches exist, they should be found from
+ftp.netwinder.org/users/n/nico.
+
+A full PCMCIA support is still missing, although it's possible to hack
+some drivers in order to drive already inserted cards at boot time with
+little modifications.
+
+Any contribution is welcome.
+
+Please send patches to nico@fluxnic.net
+
+Have Fun !
diff --git a/Documentation/arm/sa1100/cerf.rst b/Documentation/arm/sa1100/cerf.rst
new file mode 100644
index 000000000000..7fa71b609bf9
--- /dev/null
+++ b/Documentation/arm/sa1100/cerf.rst
@@ -0,0 +1,35 @@
+==============
+CerfBoard/Cube
+==============
+
+*** The StrongARM version of the CerfBoard/Cube has been discontinued ***
+
+The Intrinsyc CerfBoard is a StrongARM 1110-based computer on a board
+that measures approximately 2" square. It includes an Ethernet
+controller, an RS232-compatible serial port, a USB function port, and
+one CompactFlash+ slot on the back. Pictures can be found at the
+Intrinsyc website, http://www.intrinsyc.com.
+
+This document describes the support in the Linux kernel for the
+Intrinsyc CerfBoard.
+
+Supported in this version
+=========================
+
+   - CompactFlash+ slot (select PCMCIA in General Setup and any options
+     that may be required)
+   - Onboard Crystal CS8900 Ethernet controller (Cerf CS8900A support in
+     Network Devices)
+   - Serial ports with a serial console (hardcoded to 38400 8N1)
+
+In order to get this kernel onto your Cerf, you need a server that runs
+both BOOTP and TFTP. Detailed instructions should have come with your
+evaluation kit on how to use the bootloader. This series of commands
+will suffice::
+
+   make ARCH=arm CROSS_COMPILE=arm-linux- cerfcube_defconfig
+   make ARCH=arm CROSS_COMPILE=arm-linux- zImage
+   make ARCH=arm CROSS_COMPILE=arm-linux- modules
+   cp arch/arm/boot/zImage <TFTP directory>
+
+support@intrinsyc.com
diff --git a/Documentation/arm/sa1100/freebird.rst b/Documentation/arm/sa1100/freebird.rst
new file mode 100644
index 000000000000..81043d0c6d64
--- /dev/null
+++ b/Documentation/arm/sa1100/freebird.rst
@@ -0,0 +1,25 @@
+========
+Freebird
+========
+
+Freebird-1.1 is produced by Legend(C), Inc.
+`http://web.archive.org/web/*/http://www.legend.com.cn`
+and software/linux maintained by Coventive(C), Inc.
+(http://www.coventive.com)
+
+Based on the Nicolas's strongarm kernel tree.
+
+Maintainer:
+
+Chester Kuo
+	- <chester@coventive.com>
+	- <chester@linux.org.tw>
+
+Author:
+
+- Tim wu <timwu@coventive.com>
+- CIH <cih@coventive.com>
+- Eric Peng <ericpeng@coventive.com>
+- Jeff Lee <jeff_lee@coventive.com>
+- Allen Cheng
+- Tony Liu <tonyliu@coventive.com>
diff --git a/Documentation/arm/sa1100/graphicsclient.rst b/Documentation/arm/sa1100/graphicsclient.rst
new file mode 100644
index 000000000000..a73d61c3ce91
--- /dev/null
+++ b/Documentation/arm/sa1100/graphicsclient.rst
@@ -0,0 +1,102 @@
+=============================================
+ADS GraphicsClient Plus Single Board Computer
+=============================================
+
+For more details, contact Applied Data Systems or see
+http://www.applieddata.net/products.html
+
+The original Linux support for this product has been provided by
+Nicolas Pitre <nico@fluxnic.net>. Continued development work by
+Woojung Huh <whuh@applieddata.net>
+
+It's currently possible to mount a root filesystem via NFS providing a
+complete Linux environment.  Otherwise a ramdisk image may be used.  The
+board supports MTD/JFFS, so you could also mount something on there.
+
+Use 'make graphicsclient_config' before any 'make config'.  This will set up
+defaults for GraphicsClient Plus support.
+
+The kernel zImage is linked to be loaded and executed at 0xc0200000.
+Also the following registers should have the specified values upon entry::
+
+	r0 = 0
+	r1 = 29	(this is the GraphicsClient architecture number)
+
+Linux can  be used with the ADS BootLoader that ships with the
+newer rev boards. See their documentation on how to load Linux.
+Angel is not available for the GraphicsClient Plus AFAIK.
+
+There is a  board known as just the GraphicsClient that ADS used to
+produce but has end of lifed. This code will not work on the older
+board with the ADS bootloader, but should still work with Angel,
+as outlined below.  In any case, if you're planning on deploying
+something en masse, you should probably get the newer board.
+
+If using Angel on the older boards, here is a typical angel.opt option file
+if the kernel is loaded through the Angel Debug Monitor::
+
+	base 0xc0200000
+	entry 0xc0200000
+	r0 0x00000000
+	r1 0x0000001d
+	device /dev/ttyS1
+	options "38400 8N1"
+	baud 115200
+	#otherfile ramdisk.gz
+	#otherbase 0xc0800000
+	exec minicom
+
+Then the kernel (and ramdisk if otherfile/otherbase lines above are
+uncommented) would be loaded with::
+
+	angelboot -f angelboot.opt zImage
+
+Here it is assumed that the board is connected to ttyS1 on your PC
+and that minicom is preconfigured with /dev/ttyS1, 38400 baud, 8N1, no flow
+control by default.
+
+If any other bootloader is used, ensure it accomplish the same, especially
+for r0/r1 register values before jumping into the kernel.
+
+
+Supported peripherals
+=====================
+
+- SA1100 LCD frame buffer (8/16bpp...sort of)
+- on-board SMC 92C96 ethernet NIC
+- SA1100 serial port
+- flash memory access (MTD/JFFS)
+- pcmcia
+- touchscreen(ucb1200)
+- ps/2 keyboard
+- console on LCD screen
+- serial ports (ttyS[0-2])
+  - ttyS0 is default for serial console
+- Smart I/O (ADC, keypad, digital inputs, etc)
+  See http://www.eurotech-inc.com/linux-sbc.asp for IOCTL documentation
+  and example user space code. ps/2 keybd is multiplexed through this driver
+
+To do
+=====
+
+- UCB1200 audio with new ucb_generic layer
+- everything else!  :-)
+
+Notes
+=====
+
+- The flash on board is divided into 3 partitions.  mtd0 is where
+  the ADS boot ROM and zImage is stored.  It's been marked as
+  read-only to keep you from blasting over the bootloader. :)  mtd1 is
+  for the ramdisk.gz image.  mtd2 is user flash space and can be
+  utilized for either JFFS or if you're feeling crazy, running ext2
+  on top of it. If you're not using the ADS bootloader, you're
+  welcome to blast over the mtd1 partition also.
+
+- 16bpp mode requires a different cable than what ships with the board.
+  Contact ADS or look through the manual to wire your own. Currently,
+  if you compile with 16bit mode support and switch into a lower bpp
+  mode, the timing is off so the image is corrupted.  This will be
+  fixed soon.
+
+Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/sa1100/graphicsmaster.rst b/Documentation/arm/sa1100/graphicsmaster.rst
new file mode 100644
index 000000000000..e39892514f0c
--- /dev/null
+++ b/Documentation/arm/sa1100/graphicsmaster.rst
@@ -0,0 +1,60 @@
+========================================
+ADS GraphicsMaster Single Board Computer
+========================================
+
+For more details, contact Applied Data Systems or see
+http://www.applieddata.net/products.html
+
+The original Linux support for this product has been provided by
+Nicolas Pitre <nico@fluxnic.net>. Continued development work by
+Woojung Huh <whuh@applieddata.net>
+
+Use 'make graphicsmaster_config' before any 'make config'.
+This will set up defaults for GraphicsMaster support.
+
+The kernel zImage is linked to be loaded and executed at 0xc0400000.
+
+Linux can  be used with the ADS BootLoader that ships with the
+newer rev boards. See their documentation on how to load Linux.
+
+Supported peripherals
+=====================
+
+- SA1100 LCD frame buffer (8/16bpp...sort of)
+- SA1111 USB Master
+- on-board SMC 92C96 ethernet NIC
+- SA1100 serial port
+- flash memory access (MTD/JFFS)
+- pcmcia, compact flash
+- touchscreen(ucb1200)
+- ps/2 keyboard
+- console on LCD screen
+- serial ports (ttyS[0-2])
+  - ttyS0 is default for serial console
+- Smart I/O (ADC, keypad, digital inputs, etc)
+  See http://www.eurotech-inc.com/linux-sbc.asp for IOCTL documentation
+  and example user space code. ps/2 keybd is multiplexed through this driver
+
+To do
+=====
+
+- everything else!  :-)
+
+Notes
+=====
+
+- The flash on board is divided into 3 partitions.  mtd0 is where
+  the zImage is stored.  It's been marked as read-only to keep you
+  from blasting over the bootloader. :)  mtd1 is
+  for the ramdisk.gz image.  mtd2 is user flash space and can be
+  utilized for either JFFS or if you're feeling crazy, running ext2
+  on top of it. If you're not using the ADS bootloader, you're
+  welcome to blast over the mtd1 partition also.
+
+- 16bpp mode requires a different cable than what ships with the board.
+  Contact ADS or look through the manual to wire your own. Currently,
+  if you compile with 16bit mode support and switch into a lower bpp
+  mode, the timing is off so the image is corrupted.  This will be
+  fixed soon.
+
+Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/sa1100/huw_webpanel.rst b/Documentation/arm/sa1100/huw_webpanel.rst
new file mode 100644
index 000000000000..1dc7ccb165f0
--- /dev/null
+++ b/Documentation/arm/sa1100/huw_webpanel.rst
@@ -0,0 +1,21 @@
+=======================
+Hoeft & Wessel Webpanel
+=======================
+
+The HUW_WEBPANEL is a product of the german company Hoeft & Wessel AG
+
+If you want more information, please visit
+http://www.hoeft-wessel.de
+
+To build the kernel::
+
+	make huw_webpanel_config
+	make oldconfig
+	[accept all defaults]
+	make zImage
+
+Mostly of the work is done by:
+Roman Jordan         jor@hoeft-wessel.de
+Christoph Schulz    schu@hoeft-wessel.de
+
+2000/12/18/
diff --git a/Documentation/arm/sa1100/index.rst b/Documentation/arm/sa1100/index.rst
new file mode 100644
index 000000000000..68c2a280a745
--- /dev/null
+++ b/Documentation/arm/sa1100/index.rst
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+Intel StrongARM 1100
+====================
+
+.. toctree::
+    :maxdepth: 1
+
+    adsbitsy
+    assabet
+    brutus
+    cerf
+    freebird
+    graphicsclient
+    graphicsmaster
+    huw_webpanel
+    itsy
+    lart
+    nanoengine
+    pangolin
+    pleb
+    serial_uart
+    tifon
+    yopy
diff --git a/Documentation/arm/sa1100/itsy.rst b/Documentation/arm/sa1100/itsy.rst
new file mode 100644
index 000000000000..f49896ba3ef1
--- /dev/null
+++ b/Documentation/arm/sa1100/itsy.rst
@@ -0,0 +1,47 @@
+====
+Itsy
+====
+
+Itsy is a research project done by the Western Research Lab, and Systems
+Research Center in Palo Alto, CA. The Itsy project is one of several
+research projects at Compaq that are related to pocket computing.
+
+For more information, see:
+
+	http://www.hpl.hp.com/downloads/crl/itsy/
+
+Notes on initial 2.4 Itsy support (8/27/2000) :
+
+The port was done on an Itsy version 1.5 machine with a daughtercard with
+64 Meg of DRAM and 32 Meg of Flash. The initial work includes support for
+serial console (to see what you're doing).  No other devices have been
+enabled.
+
+To build, do a "make menuconfig" (or xmenuconfig) and select Itsy support.
+Disable Flash and LCD support. and then do a make zImage.
+Finally, you will need to cd to arch/arm/boot/tools and execute a make there
+to build the params-itsy program used to boot the kernel.
+
+In order to install the port of 2.4 to the itsy, You will need to set the
+configuration parameters in the monitor as follows::
+
+	Arg 1:0x08340000, Arg2: 0xC0000000, Arg3:18 (0x12), Arg4:0
+
+Make sure the start-routine address is set to 0x00060000.
+
+Next, flash the params-itsy program to 0x00060000 ("p 1 0x00060000" in the
+flash menu)  Flash the kernel in arch/arm/boot/zImage into 0x08340000
+("p 1 0x00340000").  Finally flash an initial ramdisk into 0xC8000000
+("p 2 0x0")  We used ramdisk-2-30.gz from the 0.11 version directory on
+handhelds.org.
+
+The serial connection we established was at:
+
+8-bit data, no parity, 1 stop bit(s), 115200.00 b/s. in the monitor, in the
+params-itsy program, and in the kernel itself.  This can be changed, but
+not easily. The monitor parameters are easily changed, the params program
+setup is assembly outl's, and the kernel is a configuration item specific to
+the itsy. (i.e. grep for CONFIG_SA1100_ITSY and you'll find where it is.)
+
+
+This should get you a properly booting 2.4 kernel on the itsy.
diff --git a/Documentation/arm/sa1100/lart.rst b/Documentation/arm/sa1100/lart.rst
new file mode 100644
index 000000000000..94c0568d1095
--- /dev/null
+++ b/Documentation/arm/sa1100/lart.rst
@@ -0,0 +1,15 @@
+====================================
+Linux Advanced Radio Terminal (LART)
+====================================
+
+The LART is a small (7.5 x 10cm) SA-1100 board, designed for embedded
+applications. It has 32 MB DRAM, 4MB Flash ROM, double RS232 and all
+other StrongARM-gadgets. Almost all SA signals are directly accessible
+through a number of connectors. The powersupply accepts voltages
+between 3.5V and 16V and is overdimensioned to support a range of
+daughterboards. A quad Ethernet / IDE / PS2 / sound daughterboard
+is under development, with plenty of others in different stages of
+planning.
+
+The hardware designs for this board have been released under an open license;
+see the LART page at http://www.lartmaker.nl/ for more information.
diff --git a/Documentation/arm/sa1100/nanoengine.rst b/Documentation/arm/sa1100/nanoengine.rst
new file mode 100644
index 000000000000..47f1a14cf98a
--- /dev/null
+++ b/Documentation/arm/sa1100/nanoengine.rst
@@ -0,0 +1,11 @@
+==========
+nanoEngine
+==========
+
+"nanoEngine" is a SA1110 based single board computer from
+Bright Star Engineering Inc.  See www.brightstareng.com/arm
+for more info.
+(Ref: Stuart Adams <sja@brightstareng.com>)
+
+Also visit Larry Doolittle's "Linux for the nanoEngine" site:
+http://www.brightstareng.com/arm/nanoeng.htm
diff --git a/Documentation/arm/sa1100/pangolin.rst b/Documentation/arm/sa1100/pangolin.rst
new file mode 100644
index 000000000000..f0c5c1618553
--- /dev/null
+++ b/Documentation/arm/sa1100/pangolin.rst
@@ -0,0 +1,29 @@
+========
+Pangolin
+========
+
+Pangolin is a StrongARM 1110-based evaluation platform produced
+by Dialogue Technology (http://www.dialogue.com.tw/).
+It has EISA slots for ease of configuration with SDRAM/Flash
+memory card, USB/Serial/Audio card, Compact Flash card,
+PCMCIA/IDE card and TFT-LCD card.
+
+To compile for Pangolin, you must issue the following commands::
+
+	make pangolin_config
+	make oldconfig
+	make zImage
+
+Supported peripherals
+=====================
+
+- SA1110 serial port (UART1/UART2/UART3)
+- flash memory access
+- compact flash driver
+- UDA1341 sound driver
+- SA1100 LCD controller for 800x600 16bpp TFT-LCD
+- MQ-200 driver for 800x600 16bpp TFT-LCD
+- Penmount(touch panel) driver
+- PCMCIA driver
+- SMC91C94 LAN driver
+- IDE driver (experimental)
diff --git a/Documentation/arm/sa1100/pleb.rst b/Documentation/arm/sa1100/pleb.rst
new file mode 100644
index 000000000000..d5b732967aa3
--- /dev/null
+++ b/Documentation/arm/sa1100/pleb.rst
@@ -0,0 +1,13 @@
+====
+PLEB
+====
+
+The PLEB project was started as a student initiative at the School of
+Computer Science and Engineering, University of New South Wales to make a
+pocket computer capable of running the Linux Kernel.
+
+PLEB support has yet to be fully integrated.
+
+For more information, see:
+
+	http://www.cse.unsw.edu.au
diff --git a/Documentation/arm/sa1100/serial_uart.rst b/Documentation/arm/sa1100/serial_uart.rst
new file mode 100644
index 000000000000..ea983642b9be
--- /dev/null
+++ b/Documentation/arm/sa1100/serial_uart.rst
@@ -0,0 +1,51 @@
+==================
+SA1100 serial port
+==================
+
+The SA1100 serial port had its major/minor numbers officially assigned::
+
+  > Date: Sun, 24 Sep 2000 21:40:27 -0700
+  > From: H. Peter Anvin <hpa@transmeta.com>
+  > To: Nicolas Pitre <nico@CAM.ORG>
+  > Cc: Device List Maintainer <device@lanana.org>
+  > Subject: Re: device
+  >
+  > Okay.  Note that device numbers 204 and 205 are used for "low density
+  > serial devices", so you will have a range of minors on those majors (the
+  > tty device layer handles this just fine, so you don't have to worry about
+  > doing anything special.)
+  >
+  > So your assignments are:
+  >
+  > 204 char        Low-density serial ports
+  >                   5 = /dev/ttySA0               SA1100 builtin serial port 0
+  >                   6 = /dev/ttySA1               SA1100 builtin serial port 1
+  >                   7 = /dev/ttySA2               SA1100 builtin serial port 2
+  >
+  > 205 char        Low-density serial ports (alternate device)
+  >                   5 = /dev/cusa0                Callout device for ttySA0
+  >                   6 = /dev/cusa1                Callout device for ttySA1
+  >                   7 = /dev/cusa2                Callout device for ttySA2
+  >
+
+You must create those inodes in /dev on the root filesystem used
+by your SA1100-based device::
+
+	mknod ttySA0 c 204 5
+	mknod ttySA1 c 204 6
+	mknod ttySA2 c 204 7
+	mknod cusa0 c 205 5
+	mknod cusa1 c 205 6
+	mknod cusa2 c 205 7
+
+In addition to the creation of the appropriate device nodes above, you
+must ensure your user space applications make use of the correct device
+name. The classic example is the content of the /etc/inittab file where
+you might have a getty process started on ttyS0.
+
+In this case:
+
+- replace occurrences of ttyS0 with ttySA0, ttyS1 with ttySA1, etc.
+
+- don't forget to add 'ttySA0', 'console', or the appropriate tty name
+  in /etc/securetty for root to be allowed to login as well.
diff --git a/Documentation/arm/sa1100/tifon.rst b/Documentation/arm/sa1100/tifon.rst
new file mode 100644
index 000000000000..c26e910b9ea7
--- /dev/null
+++ b/Documentation/arm/sa1100/tifon.rst
@@ -0,0 +1,7 @@
+=====
+Tifon
+=====
+
+More info has to come...
+
+Contact: Peter Danielsson <peter.danielsson@era-t.ericsson.se>
diff --git a/Documentation/arm/sa1100/yopy.rst b/Documentation/arm/sa1100/yopy.rst
new file mode 100644
index 000000000000..5b35a5f61a44
--- /dev/null
+++ b/Documentation/arm/sa1100/yopy.rst
@@ -0,0 +1,5 @@
+====
+Yopy
+====
+
+See http://www.yopydeveloper.org for more.
diff --git a/Documentation/arm/samsung-s3c24xx/cpufreq.rst b/Documentation/arm/samsung-s3c24xx/cpufreq.rst
new file mode 100644
index 000000000000..2ddc26c03b1f
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/cpufreq.rst
@@ -0,0 +1,76 @@
+=======================
+S3C24XX CPUfreq support
+=======================
+
+Introduction
+------------
+
+ The S3C24XX series support a number of power saving systems, such as
+ the ability to change the core, memory and peripheral operating
+ frequencies. The core control is exported via the CPUFreq driver
+ which has a number of different manual or automatic controls over the
+ rate the core is running at.
+
+ There are two forms of the driver depending on the specific CPU and
+ how the clocks are arranged. The first implementation used as single
+ PLL to feed the ARM, memory and peripherals via a series of dividers
+ and muxes and this is the implementation that is documented here. A
+ newer version where there is a separate PLL and clock divider for the
+ ARM core is available as a separate driver.
+
+
+Layout
+------
+
+ The code core manages the CPU specific drivers, any data that they
+ need to register and the interface to the generic drivers/cpufreq
+ system. Each CPU registers a driver to control the PLL, clock dividers
+ and anything else associated with it. Any board that wants to use this
+ framework needs to supply at least basic details of what is required.
+
+ The core registers with drivers/cpufreq at init time if all the data
+ necessary has been supplied.
+
+
+CPU support
+-----------
+
+ The support for each CPU depends on the facilities provided by the
+ SoC and the driver as each device has different PLL and clock chains
+ associated with it.
+
+
+Slow Mode
+---------
+
+ The SLOW mode where the PLL is turned off altogether and the
+ system is fed by the external crystal input is currently not
+ supported.
+
+
+sysfs
+-----
+
+ The core code exports extra information via sysfs in the directory
+ devices/system/cpu/cpu0/arch-freq.
+
+
+Board Support
+-------------
+
+ Each board that wants to use the cpufreq code must register some basic
+ information with the core driver to provide information about what the
+ board requires and any restrictions being placed on it.
+
+ The board needs to supply information about whether it needs the IO bank
+ timings changing, any maximum frequency limits and information about the
+ SDRAM refresh rate.
+
+
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2009 Simtec Electronics
+Licensed under GPLv2
diff --git a/Documentation/arm/samsung-s3c24xx/eb2410itx.rst b/Documentation/arm/samsung-s3c24xx/eb2410itx.rst
new file mode 100644
index 000000000000..7863c93652f8
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/eb2410itx.rst
@@ -0,0 +1,59 @@
+===================================
+Simtec Electronics EB2410ITX (BAST)
+===================================
+
+	http://www.simtec.co.uk/products/EB2410ITX/
+
+Introduction
+------------
+
+  The EB2410ITX is a S3C2410 based development board with a variety of
+  peripherals and expansion connectors. This board is also known by
+  the shortened name of Bast.
+
+
+Configuration
+-------------
+
+  To set the default configuration, use `make bast_defconfig` which
+  supports the commonly used features of this board.
+
+
+Support
+-------
+
+  Official support information can be found on the Simtec Electronics
+  website, at the product page http://www.simtec.co.uk/products/EB2410ITX/
+
+  Useful links:
+
+    - Resources Page http://www.simtec.co.uk/products/EB2410ITX/resources.html
+
+    - Board FAQ at http://www.simtec.co.uk/products/EB2410ITX/faq.html
+
+    - Bootloader info http://www.simtec.co.uk/products/SWABLE/resources.html
+      and FAQ http://www.simtec.co.uk/products/SWABLE/faq.html
+
+
+MTD
+---
+
+  The NAND and NOR support has been merged from the linux-mtd project.
+  Any problems, see http://www.linux-mtd.infradead.org/ for more
+  information or up-to-date versions of linux-mtd.
+
+
+IDE
+---
+
+  Both onboard IDE ports are supported, however there is no support for
+  changing speed of devices, PIO Mode 4 capable drives should be used.
+
+
+Maintainers
+-----------
+
+  This board is maintained by Simtec Electronics.
+
+
+Copyright 2004 Ben Dooks, Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/gpio.rst b/Documentation/arm/samsung-s3c24xx/gpio.rst
new file mode 100644
index 000000000000..f7c3d7d011a2
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/gpio.rst
@@ -0,0 +1,172 @@
+====================
+S3C24XX GPIO Control
+====================
+
+Introduction
+------------
+
+  The s3c2410 kernel provides an interface to configure and
+  manipulate the state of the GPIO pins, and find out other
+  information about them.
+
+  There are a number of conditions attached to the configuration
+  of the s3c2410 GPIO system, please read the Samsung provided
+  data-sheet/users manual to find out the complete list.
+
+  See Documentation/arm/samsung/gpio.rst for the core implementation.
+
+
+GPIOLIB
+-------
+
+  With the event of the GPIOLIB in drivers/gpio, support for some
+  of the GPIO functions such as reading and writing a pin will
+  be removed in favour of this common access method.
+
+  Once all the extant drivers have been converted, the functions
+  listed below will be removed (they may be marked as __deprecated
+  in the near future).
+
+  The following functions now either have a `s3c_` specific variant
+  or are merged into gpiolib. See the definitions in
+  arch/arm/plat-samsung/include/plat/gpio-cfg.h:
+
+  - s3c2410_gpio_setpin()	gpio_set_value() or gpio_direction_output()
+  - s3c2410_gpio_getpin()	gpio_get_value() or gpio_direction_input()
+  - s3c2410_gpio_getirq()	gpio_to_irq()
+  - s3c2410_gpio_cfgpin()	s3c_gpio_cfgpin()
+  - s3c2410_gpio_getcfg()	s3c_gpio_getcfg()
+  - s3c2410_gpio_pullup()	s3c_gpio_setpull()
+
+
+GPIOLIB conversion
+------------------
+
+If you need to convert your board or driver to use gpiolib from the phased
+out s3c2410 API, then here are some notes on the process.
+
+1) If your board is exclusively using an GPIO, say to control peripheral
+   power, then it will require to claim the gpio with gpio_request() before
+   it can use it.
+
+   It is recommended to check the return value, with at least WARN_ON()
+   during initialisation.
+
+2) The s3c2410_gpio_cfgpin() can be directly replaced with s3c_gpio_cfgpin()
+   as they have the same arguments, and can either take the pin specific
+   values, or the more generic special-function-number arguments.
+
+3) s3c2410_gpio_pullup() changes have the problem that while the
+   s3c2410_gpio_pullup(x, 1) can be easily translated to the
+   s3c_gpio_setpull(x, S3C_GPIO_PULL_NONE), the s3c2410_gpio_pullup(x, 0)
+   are not so easy.
+
+   The s3c2410_gpio_pullup(x, 0) case enables the pull-up (or in the case
+   of some of the devices, a pull-down) and as such the new API distinguishes
+   between the UP and DOWN case. There is currently no 'just turn on' setting
+   which may be required if this becomes a problem.
+
+4) s3c2410_gpio_setpin() can be replaced by gpio_set_value(), the old call
+   does not implicitly configure the relevant gpio to output. The gpio
+   direction should be changed before using gpio_set_value().
+
+5) s3c2410_gpio_getpin() is replaceable by gpio_get_value() if the pin
+   has been set to input. It is currently unknown what the behaviour is
+   when using gpio_get_value() on an output pin (s3c2410_gpio_getpin
+   would return the value the pin is supposed to be outputting).
+
+6) s3c2410_gpio_getirq() should be directly replaceable with the
+   gpio_to_irq() call.
+
+The s3c2410_gpio and `gpio_` calls have always operated on the same gpio
+numberspace, so there is no problem with converting the gpio numbering
+between the calls.
+
+
+Headers
+-------
+
+  See arch/arm/mach-s3c24xx/include/mach/regs-gpio.h for the list
+  of GPIO pins, and the configuration values for them. This
+  is included by using #include <mach/regs-gpio.h>
+
+
+PIN Numbers
+-----------
+
+  Each pin has an unique number associated with it in regs-gpio.h,
+  e.g. S3C2410_GPA(0) or S3C2410_GPF(1). These defines are used to tell
+  the GPIO functions which pin is to be used.
+
+  With the conversion to gpiolib, there is no longer a direct conversion
+  from gpio pin number to register base address as in earlier kernels. This
+  is due to the number space required for newer SoCs where the later
+  GPIOs are not contiguous.
+
+
+Configuring a pin
+-----------------
+
+  The following function allows the configuration of a given pin to
+  be changed.
+
+    void s3c_gpio_cfgpin(unsigned int pin, unsigned int function);
+
+  e.g.:
+
+     s3c_gpio_cfgpin(S3C2410_GPA(0), S3C_GPIO_SFN(1));
+     s3c_gpio_cfgpin(S3C2410_GPE(8), S3C_GPIO_SFN(2));
+
+   which would turn GPA(0) into the lowest Address line A0, and set
+   GPE(8) to be connected to the SDIO/MMC controller's SDDAT1 line.
+
+
+Reading the current configuration
+---------------------------------
+
+  The current configuration of a pin can be read by using standard
+  gpiolib function:
+
+  s3c_gpio_getcfg(unsigned int pin);
+
+  The return value will be from the same set of values which can be
+  passed to s3c_gpio_cfgpin().
+
+
+Configuring a pull-up resistor
+------------------------------
+
+  A large proportion of the GPIO pins on the S3C2410 can have weak
+  pull-up resistors enabled. This can be configured by the following
+  function:
+
+    void s3c_gpio_setpull(unsigned int pin, unsigned int to);
+
+  Where the to value is S3C_GPIO_PULL_NONE to set the pull-up off,
+  and S3C_GPIO_PULL_UP to enable the specified pull-up. Any other
+  values are currently undefined.
+
+
+Getting and setting the state of a PIN
+--------------------------------------
+
+  These calls are now implemented by the relevant gpiolib calls, convert
+  your board or driver to use gpiolib.
+
+
+Getting the IRQ number associated with a PIN
+--------------------------------------------
+
+  A standard gpiolib function can map the given pin number to an IRQ
+  number to pass to the IRQ system.
+
+   int gpio_to_irq(unsigned int pin);
+
+  Note, not all pins have an IRQ.
+
+
+Author
+-------
+
+Ben Dooks, 03 October 2004
+Copyright 2004 Ben Dooks, Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/h1940.rst b/Documentation/arm/samsung-s3c24xx/h1940.rst
new file mode 100644
index 000000000000..62a562c178e3
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/h1940.rst
@@ -0,0 +1,41 @@
+=============
+HP IPAQ H1940
+=============
+
+http://www.handhelds.org/projects/h1940.html
+
+Introduction
+------------
+
+  The HP H1940 is a S3C2410 based handheld device, with
+  bluetooth connectivity.
+
+
+Support
+-------
+
+  A variety of information is available
+
+  handhelds.org project page:
+
+    http://www.handhelds.org/projects/h1940.html
+
+  handhelds.org wiki page:
+
+    http://handhelds.org/moin/moin.cgi/HpIpaqH1940
+
+  Herbert Pötzl pages:
+
+    http://vserver.13thfloor.at/H1940/
+
+
+Maintainers
+-----------
+
+  This project is being maintained and developed by a variety
+  of people, including Ben Dooks, Arnaud Patard, and Herbert Pötzl.
+
+  Thanks to the many others who have also provided support.
+
+
+(c) 2005 Ben Dooks
diff --git a/Documentation/arm/samsung-s3c24xx/index.rst b/Documentation/arm/samsung-s3c24xx/index.rst
new file mode 100644
index 000000000000..5b8a7f9398d8
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/index.rst
@@ -0,0 +1,20 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+Samsung S3C24XX SoC Family
+==========================
+
+.. toctree::
+   :maxdepth: 1
+
+   h1940
+   gpio
+   cpufreq
+   suspend
+   usb-host
+   s3c2412
+   eb2410itx
+   nand
+   smdk2440
+   s3c2413
+   overview
diff --git a/Documentation/arm/samsung-s3c24xx/nand.rst b/Documentation/arm/samsung-s3c24xx/nand.rst
new file mode 100644
index 000000000000..938995694ee7
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/nand.rst
@@ -0,0 +1,30 @@
+====================
+S3C24XX NAND Support
+====================
+
+Introduction
+------------
+
+Small Page NAND
+---------------
+
+The driver uses a 512 byte (1 page) ECC code for this setup. The
+ECC code is not directly compatible with the default kernel ECC
+code, so the driver enforces its own OOB layout and ECC parameters
+
+Large Page NAND
+---------------
+
+The driver is capable of handling NAND flash with a 2KiB page
+size, with support for hardware ECC generation and correction.
+
+Unlike the 512byte page mode, the driver generates ECC data for
+each 256 byte block in an 2KiB page. This means that more than
+one error in a page can be rectified. It also means that the
+OOB layout remains the default kernel layout for these flashes.
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2007 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/overview.rst b/Documentation/arm/samsung-s3c24xx/overview.rst
new file mode 100644
index 000000000000..e9a1dc7276b5
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/overview.rst
@@ -0,0 +1,319 @@
+==========================
+S3C24XX ARM Linux Overview
+==========================
+
+
+
+Introduction
+------------
+
+  The Samsung S3C24XX range of ARM9 System-on-Chip CPUs are supported
+  by the 's3c2410' architecture of ARM Linux. Currently the S3C2410,
+  S3C2412, S3C2413, S3C2416, S3C2440, S3C2442, S3C2443 and S3C2450 devices
+  are supported.
+
+  Support for the S3C2400 and S3C24A0 series was never completed and the
+  corresponding code has been removed after a while.  If someone wishes to
+  revive this effort, partial support can be retrieved from earlier Linux
+  versions.
+
+  The S3C2416 and S3C2450 devices are very similar and S3C2450 support is
+  included under the arch/arm/mach-s3c2416 directory. Note, while core
+  support for these SoCs is in, work on some of the extra peripherals
+  and extra interrupts is still ongoing.
+
+
+Configuration
+-------------
+
+  A generic S3C2410 configuration is provided, and can be used as the
+  default by `make s3c2410_defconfig`. This configuration has support
+  for all the machines, and the commonly used features on them.
+
+  Certain machines may have their own default configurations as well,
+  please check the machine specific documentation.
+
+
+Layout
+------
+
+  The core support files are located in the platform code contained in
+  arch/arm/plat-s3c24xx with headers in include/asm-arm/plat-s3c24xx.
+  This directory should be kept to items shared between the platform
+  code (arch/arm/plat-s3c24xx) and the arch/arm/mach-s3c24* code.
+
+  Each cpu has a directory with the support files for it, and the
+  machines that carry the device. For example S3C2410 is contained
+  in arch/arm/mach-s3c2410 and S3C2440 in arch/arm/mach-s3c2440
+
+  Register, kernel and platform data definitions are held in the
+  arch/arm/mach-s3c2410 directory./include/mach
+
+arch/arm/plat-s3c24xx:
+
+  Files in here are either common to all the s3c24xx family,
+  or are common to only some of them with names to indicate this
+  status. The files that are not common to all are generally named
+  with the initial cpu they support in the series to ensure a short
+  name without any possibility of confusion with newer devices.
+
+  As an example, initially s3c244x would cover s3c2440 and s3c2442, but
+  with the s3c2443 which does not share many of the same drivers in
+  this directory, the name becomes invalid. We stick to s3c2440-<x>
+  to indicate a driver that is s3c2440 and s3c2442 compatible.
+
+  This does mean that to find the status of any given SoC, a number
+  of directories may need to be searched.
+
+
+Machines
+--------
+
+  The currently supported machines are as follows:
+
+  Simtec Electronics EB2410ITX (BAST)
+
+    A general purpose development board, see EB2410ITX.txt for further
+    details
+
+  Simtec Electronics IM2440D20 (Osiris)
+
+    CPU Module from Simtec Electronics, with a S3C2440A CPU, nand flash
+    and a PCMCIA controller.
+
+  Samsung SMDK2410
+
+    Samsung's own development board, geared for PDA work.
+
+  Samsung/Aiji SMDK2412
+
+    The S3C2412 version of the SMDK2440.
+
+  Samsung/Aiji SMDK2413
+
+    The S3C2412 version of the SMDK2440.
+
+  Samsung/Meritech SMDK2440
+
+    The S3C2440 compatible version of the SMDK2440, which has the
+    option of an S3C2440 or S3C2442 CPU module.
+
+  Thorcom VR1000
+
+    Custom embedded board
+
+  HP IPAQ 1940
+
+    Handheld (IPAQ), available in several varieties
+
+  HP iPAQ rx3715
+
+    S3C2440 based IPAQ, with a number of variations depending on
+    features shipped.
+
+  Acer N30
+
+    A S3C2410 based PDA from Acer.  There is a Wiki page at
+    http://handhelds.org/moin/moin.cgi/AcerN30Documentation .
+
+  AML M5900
+
+    American Microsystems' M5900
+
+  Nex Vision Nexcoder
+  Nex Vision Otom
+
+    Two machines by Nex Vision
+
+
+Adding New Machines
+-------------------
+
+  The architecture has been designed to support as many machines as can
+  be configured for it in one kernel build, and any future additions
+  should keep this in mind before altering items outside of their own
+  machine files.
+
+  Machine definitions should be kept in linux/arch/arm/mach-s3c2410,
+  and there are a number of examples that can be looked at.
+
+  Read the kernel patch submission policies as well as the
+  Documentation/arm directory before submitting patches. The
+  ARM kernel series is managed by Russell King, and has a patch system
+  located at http://www.arm.linux.org.uk/developer/patches/
+  as well as mailing lists that can be found from the same site.
+
+  As a courtesy, please notify <ben-linux@fluff.org> of any new
+  machines or other modifications.
+
+  Any large scale modifications, or new drivers should be discussed
+  on the ARM kernel mailing list (linux-arm-kernel) before being
+  attempted. See http://www.arm.linux.org.uk/mailinglists/ for the
+  mailing list information.
+
+
+I2C
+---
+
+  The hardware I2C core in the CPU is supported in single master
+  mode, and can be configured via platform data.
+
+
+RTC
+---
+
+  Support for the onboard RTC unit, including alarm function.
+
+  This has recently been upgraded to use the new RTC core,
+  and the module has been renamed to rtc-s3c to fit in with
+  the new rtc naming scheme.
+
+
+Watchdog
+--------
+
+  The onchip watchdog is available via the standard watchdog
+  interface.
+
+
+NAND
+----
+
+  The current kernels now have support for the s3c2410 NAND
+  controller. If there are any problems the latest linux-mtd
+  code can be found from http://www.linux-mtd.infradead.org/
+
+  For more information see Documentation/arm/samsung-s3c24xx/nand.rst
+
+
+SD/MMC
+------
+
+  The SD/MMC hardware pre S3C2443 is supported in the current
+  kernel, the driver is drivers/mmc/host/s3cmci.c and supports
+  1 and 4 bit SD or MMC cards.
+
+  The SDIO behaviour of this driver has not been fully tested. There is no
+  current support for hardware SDIO interrupts.
+
+
+Serial
+------
+
+  The s3c2410 serial driver provides support for the internal
+  serial ports. These devices appear as /dev/ttySAC0 through 3.
+
+  To create device nodes for these, use the following commands
+
+    mknod ttySAC0 c 204 64
+    mknod ttySAC1 c 204 65
+    mknod ttySAC2 c 204 66
+
+
+GPIO
+----
+
+  The core contains support for manipulating the GPIO, see the
+  documentation in GPIO.txt in the same directory as this file.
+
+  Newer kernels carry GPIOLIB, and support is being moved towards
+  this with some of the older support in line to be removed.
+
+  As of v2.6.34, the move towards using gpiolib support is almost
+  complete, and very little of the old calls are left.
+
+  See Documentation/arm/samsung-s3c24xx/gpio.rst for the S3C24XX specific
+  support and Documentation/arm/samsung/gpio.rst for the core Samsung
+  implementation.
+
+
+Clock Management
+----------------
+
+  The core provides the interface defined in the header file
+  include/asm-arm/hardware/clock.h, to allow control over the
+  various clock units
+
+
+Suspend to RAM
+--------------
+
+  For boards that provide support for suspend to RAM, the
+  system can be placed into low power suspend.
+
+  See Suspend.txt for more information.
+
+
+SPI
+---
+
+  SPI drivers are available for both the in-built hardware
+  (although there is no DMA support yet) and a generic
+  GPIO based solution.
+
+
+LEDs
+----
+
+  There is support for GPIO based LEDs via a platform driver
+  in the LED subsystem.
+
+
+Platform Data
+-------------
+
+  Whenever a device has platform specific data that is specified
+  on a per-machine basis, care should be taken to ensure the
+  following:
+
+    1) that default data is not left in the device to confuse the
+       driver if a machine does not set it at startup
+
+    2) the data should (if possible) be marked as __initdata,
+       to ensure that the data is thrown away if the machine is
+       not the one currently in use.
+
+       The best way of doing this is to make a function that
+       kmalloc()s an area of memory, and copies the __initdata
+       and then sets the relevant device's platform data. Making
+       the function `__init` takes care of ensuring it is discarded
+       with the rest of the initialisation code::
+
+         static __init void s3c24xx_xxx_set_platdata(struct xxx_data *pd)
+         {
+             struct s3c2410_xxx_mach_info *npd;
+
+	   npd = kmalloc(sizeof(struct s3c2410_xxx_mach_info), GFP_KERNEL);
+	   if (npd) {
+	      memcpy(npd, pd, sizeof(struct s3c2410_xxx_mach_info));
+	      s3c_device_xxx.dev.platform_data = npd;
+	   } else {
+                printk(KERN_ERR "no memory for xxx platform data\n");
+	   }
+	}
+
+	Note, since the code is marked as __init, it should not be
+	exported outside arch/arm/mach-s3c2410/, or exported to
+	modules via EXPORT_SYMBOL() and related functions.
+
+
+Port Contributors
+-----------------
+
+  Ben Dooks (BJD)
+  Vincent Sanders
+  Herbert Potzl
+  Arnaud Patard (RTP)
+  Roc Wu
+  Klaus Fetscher
+  Dimitry Andric
+  Shannon Holland
+  Guillaume Gourat (NexVision)
+  Christer Weinigel (wingel) (Acer N30)
+  Lucas Correia Villa Real (S3C2400 port)
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2004-2006 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/s3c2412.rst b/Documentation/arm/samsung-s3c24xx/s3c2412.rst
new file mode 100644
index 000000000000..68b985fc6bf4
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/s3c2412.rst
@@ -0,0 +1,121 @@
+==========================
+S3C2412 ARM Linux Overview
+==========================
+
+Introduction
+------------
+
+  The S3C2412 is part of the S3C24XX range of ARM9 System-on-Chip CPUs
+  from Samsung. This part has an ARM926-EJS core, capable of running up
+  to 266MHz (see data-sheet for more information)
+
+
+Clock
+-----
+
+  The core clock code provides a set of clocks to the drivers, and allows
+  for source selection and a number of other features.
+
+
+Power
+-----
+
+  No support for suspend/resume to RAM in the current system.
+
+
+DMA
+---
+
+  No current support for DMA.
+
+
+GPIO
+----
+
+  There is support for setting the GPIO to input/output/special function
+  and reading or writing to them.
+
+
+UART
+----
+
+  The UART hardware is similar to the S3C2440, and is supported by the
+  s3c2410 driver in the drivers/serial directory.
+
+
+NAND
+----
+
+  The NAND hardware is similar to the S3C2440, and is supported by the
+  s3c2410 driver in the drivers/mtd/nand/raw directory.
+
+
+USB Host
+--------
+
+  The USB hardware is similar to the S3C2410, with extended clock source
+  control. The OHCI portion is supported by the ohci-s3c2410 driver, and
+  the clock control selection is supported by the core clock code.
+
+
+USB Device
+----------
+
+  No current support in the kernel
+
+
+IRQs
+----
+
+  All the standard, and external interrupt sources are supported. The
+  extra sub-sources are not yet supported.
+
+
+RTC
+---
+
+  The RTC hardware is similar to the S3C2410, and is supported by the
+  s3c2410-rtc driver.
+
+
+Watchdog
+--------
+
+  The watchdog hardware is the same as the S3C2410, and is supported by
+  the s3c2410_wdt driver.
+
+
+MMC/SD/SDIO
+-----------
+
+  No current support for the MMC/SD/SDIO block.
+
+IIC
+---
+
+  The IIC hardware is the same as the S3C2410, and is supported by the
+  i2c-s3c24xx driver.
+
+
+IIS
+---
+
+  No current support for the IIS interface.
+
+
+SPI
+---
+
+  No current support for the SPI interfaces.
+
+
+ATA
+---
+
+  No current support for the on-board ATA block.
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2006 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/s3c2413.rst b/Documentation/arm/samsung-s3c24xx/s3c2413.rst
new file mode 100644
index 000000000000..1f51e207fc46
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/s3c2413.rst
@@ -0,0 +1,22 @@
+==========================
+S3C2413 ARM Linux Overview
+==========================
+
+Introduction
+------------
+
+  The S3C2413 is an extended version of the S3C2412, with an camera
+  interface and mobile DDR memory support. See the S3C2412 support
+  documentation for more information.
+
+
+Camera Interface
+----------------
+
+  This block is currently not supported.
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2006 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/smdk2440.rst b/Documentation/arm/samsung-s3c24xx/smdk2440.rst
new file mode 100644
index 000000000000..524fd0b4afaf
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/smdk2440.rst
@@ -0,0 +1,57 @@
+=========================
+Samsung/Meritech SMDK2440
+=========================
+
+Introduction
+------------
+
+  The SMDK2440 is a two part evaluation board for the Samsung S3C2440
+  processor. It includes support for LCD, SmartMedia, Audio, SD and
+  10MBit Ethernet, and expansion headers for various signals, including
+  the camera and unused GPIO.
+
+
+Configuration
+-------------
+
+  To set the default configuration, use `make smdk2440_defconfig` which
+  will configure the common features of this board, or use
+  `make s3c2410_config` to include support for all s3c2410/s3c2440 machines
+
+
+Support
+-------
+
+  Ben Dooks' SMDK2440 site at http://www.fluff.org/ben/smdk2440/ which
+  includes linux based USB download tools.
+
+  Some of the h1940 patches that can be found from the H1940 project
+  site at http://www.handhelds.org/projects/h1940.html can also be
+  applied to this board.
+
+
+Peripherals
+-----------
+
+  There is no current support for any of the extra peripherals on the
+  base-board itself.
+
+
+MTD
+---
+
+  The NAND flash should be supported by the in kernel MTD NAND support,
+  NOR flash will be added later.
+
+
+Maintainers
+-----------
+
+  This board is being maintained by Ben Dooks, for more info, see
+  http://www.fluff.org/ben/smdk2440/
+
+  Many thanks to Dimitry Andric of TomTom for the loan of the SMDK2440,
+  and to Simtec Electronics for allowing me time to work on this.
+
+
+(c) 2004 Ben Dooks
diff --git a/Documentation/arm/samsung-s3c24xx/suspend.rst b/Documentation/arm/samsung-s3c24xx/suspend.rst
new file mode 100644
index 000000000000..b4f3ae9fe76e
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/suspend.rst
@@ -0,0 +1,137 @@
+=======================
+S3C24XX Suspend Support
+=======================
+
+
+Introduction
+------------
+
+  The S3C24XX supports a low-power suspend mode, where the SDRAM is kept
+  in Self-Refresh mode, and all but the essential peripheral blocks are
+  powered down. For more information on how this works, please look
+  at the relevant CPU datasheet from Samsung.
+
+
+Requirements
+------------
+
+  1) A bootloader that can support the necessary resume operation
+
+  2) Support for at least 1 source for resume
+
+  3) CONFIG_PM enabled in the kernel
+
+  4) Any peripherals that are going to be powered down at the same
+     time require suspend/resume support.
+
+
+Resuming
+--------
+
+  The S3C2410 user manual defines the process of sending the CPU to
+  sleep and how it resumes. The default behaviour of the Linux code
+  is to set the GSTATUS3 register to the physical address of the
+  code to resume Linux operation.
+
+  GSTATUS4 is currently left alone by the sleep code, and is free to
+  use for any other purposes (for example, the EB2410ITX uses this to
+  save memory configuration in).
+
+
+Machine Support
+---------------
+
+  The machine specific functions must call the s3c_pm_init() function
+  to say that its bootloader is capable of resuming. This can be as
+  simple as adding the following to the machine's definition:
+
+  INITMACHINE(s3c_pm_init)
+
+  A board can do its own setup before calling s3c_pm_init, if it
+  needs to setup anything else for power management support.
+
+  There is currently no support for over-riding the default method of
+  saving the resume address, if your board requires it, then contact
+  the maintainer and discuss what is required.
+
+  Note, the original method of adding an late_initcall() is wrong,
+  and will end up initialising all compiled machines' pm init!
+
+  The following is an example of code used for testing wakeup from
+  an falling edge on IRQ_EINT0::
+
+
+    static irqreturn_t button_irq(int irq, void *pw)
+    {
+	return IRQ_HANDLED;
+    }
+
+    statuc void __init machine_init(void)
+    {
+	...
+
+	request_irq(IRQ_EINT0, button_irq, IRQF_TRIGGER_FALLING,
+		   "button-irq-eint0", NULL);
+
+	enable_irq_wake(IRQ_EINT0);
+
+	s3c_pm_init();
+    }
+
+
+Debugging
+---------
+
+  There are several important things to remember when using PM suspend:
+
+  1) The uart drivers will disable the clocks to the UART blocks when
+     suspending, which means that use of printascii() or similar direct
+     access to the UARTs will cause the debug to stop.
+
+  2) While the pm code itself will attempt to re-enable the UART clocks,
+     care should be taken that any external clock sources that the UARTs
+     rely on are still enabled at that point.
+
+  3) If any debugging is placed in the resume path, then it must have the
+     relevant clocks and peripherals setup before use (ie, bootloader).
+
+     For example, if you transmit a character from the UART, the baud
+     rate and uart controls must be setup beforehand.
+
+
+Configuration
+-------------
+
+  The S3C2410 specific configuration in `System Type` defines various
+  aspects of how the S3C2410 suspend and resume support is configured
+
+  `S3C2410 PM Suspend debug`
+
+    This option prints messages to the serial console before and after
+    the actual suspend, giving detailed information on what is
+    happening
+
+
+  `S3C2410 PM Suspend Memory CRC`
+
+    Allows the entire memory to be checksummed before and after the
+    suspend to see if there has been any corruption of the contents.
+
+    Note, the time to calculate the CRC is dependent on the CPU speed
+    and the size of memory. For an 64Mbyte RAM area on an 200MHz
+    S3C2410, this can take approximately 4 seconds to complete.
+
+    This support requires the CRC32 function to be enabled.
+
+
+  `S3C2410 PM Suspend CRC Chunksize (KiB)`
+
+    Defines the size of memory each CRC chunk covers. A smaller value
+    will mean that the CRC data block will take more memory, but will
+    identify any faults with better precision
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2004 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/usb-host.rst b/Documentation/arm/samsung-s3c24xx/usb-host.rst
new file mode 100644
index 000000000000..c84268bd1884
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/usb-host.rst
@@ -0,0 +1,91 @@
+========================
+S3C24XX USB Host support
+========================
+
+
+
+Introduction
+------------
+
+  This document details the S3C2410/S3C2440 in-built OHCI USB host support.
+
+Configuration
+-------------
+
+  Enable at least the following kernel options:
+
+  menuconfig::
+
+   Device Drivers  --->
+     USB support  --->
+       <*> Support for Host-side USB
+       <*>   OHCI HCD support
+
+
+  .config:
+
+    - CONFIG_USB
+    - CONFIG_USB_OHCI_HCD
+
+
+  Once these options are configured, the standard set of USB device
+  drivers can be configured and used.
+
+
+Board Support
+-------------
+
+  The driver attaches to a platform device, which will need to be
+  added by the board specific support file in linux/arch/arm/mach-s3c2410,
+  such as mach-bast.c or mach-smdk2410.c
+
+  The platform device's platform_data field is only needed if the
+  board implements extra power control or over-current monitoring.
+
+  The OHCI driver does not ensure the state of the S3C2410's MISCCTRL
+  register, so if both ports are to be used for the host, then it is
+  the board support file's responsibility to ensure that the second
+  port is configured to be connected to the OHCI core.
+
+
+Platform Data
+-------------
+
+  See arch/arm/mach-s3c2410/include/mach/usb-control.h for the
+  descriptions of the platform device data. An implementation
+  can be found in linux/arch/arm/mach-s3c2410/usb-simtec.c .
+
+  The `struct s3c2410_hcd_info` contains a pair of functions
+  that get called to enable over-current detection, and to
+  control the port power status.
+
+  The ports are numbered 0 and 1.
+
+  power_control:
+    Called to enable or disable the power on the port.
+
+  enable_oc:
+    Called to enable or disable the over-current monitoring.
+    This should claim or release the resources being used to
+    check the power condition on the port, such as an IRQ.
+
+  report_oc:
+    The OHCI driver fills this field in for the over-current code
+    to call when there is a change to the over-current state on
+    an port. The ports argument is a bitmask of 1 bit per port,
+    with bit X being 1 for an over-current on port X.
+
+    The function s3c2410_usb_report_oc() has been provided to
+    ensure this is called correctly.
+
+  port[x]:
+    This is struct describes each port, 0 or 1. The platform driver
+    should set the flags field of each port to S3C_HCDFLG_USED if
+    the port is enabled.
+
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2005 Simtec Electronics
diff --git a/Documentation/arm/samsung/bootloader-interface.rst b/Documentation/arm/samsung/bootloader-interface.rst
new file mode 100644
index 000000000000..a56f325dae78
--- /dev/null
+++ b/Documentation/arm/samsung/bootloader-interface.rst
@@ -0,0 +1,81 @@
+==========================================================
+Interface between kernel and boot loaders on Exynos boards
+==========================================================
+
+Author: Krzysztof Kozlowski
+
+Date  : 6 June 2015
+
+The document tries to describe currently used interface between Linux kernel
+and boot loaders on Samsung Exynos based boards. This is not a definition
+of interface but rather a description of existing state, a reference
+for information purpose only.
+
+In the document "boot loader" means any of following: U-boot, proprietary
+SBOOT or any other firmware for ARMv7 and ARMv8 initializing the board before
+executing kernel.
+
+
+1. Non-Secure mode
+
+Address:      sysram_ns_base_addr
+
+============= ============================================ ==================
+Offset        Value                                        Purpose
+============= ============================================ ==================
+0x08          exynos_cpu_resume_ns, mcpm_entry_point       System suspend
+0x0c          0x00000bad (Magic cookie)                    System suspend
+0x1c          exynos4_secondary_startup                    Secondary CPU boot
+0x1c + 4*cpu  exynos4_secondary_startup (Exynos4412)       Secondary CPU boot
+0x20          0xfcba0d10 (Magic cookie)                    AFTR
+0x24          exynos_cpu_resume_ns                         AFTR
+0x28 + 4*cpu  0x8 (Magic cookie, Exynos3250)               AFTR
+0x28          0x0 or last value during resume (Exynos542x) System suspend
+============= ============================================ ==================
+
+
+2. Secure mode
+
+Address:      sysram_base_addr
+
+============= ============================================ ==================
+Offset        Value                                        Purpose
+============= ============================================ ==================
+0x00          exynos4_secondary_startup                    Secondary CPU boot
+0x04          exynos4_secondary_startup (Exynos542x)       Secondary CPU boot
+4*cpu         exynos4_secondary_startup (Exynos4412)       Secondary CPU boot
+0x20          exynos_cpu_resume (Exynos4210 r1.0)          AFTR
+0x24          0xfcba0d10 (Magic cookie, Exynos4210 r1.0)   AFTR
+============= ============================================ ==================
+
+Address:      pmu_base_addr
+
+============= ============================================ ==================
+Offset        Value                                        Purpose
+============= ============================================ ==================
+0x0800        exynos_cpu_resume                            AFTR, suspend
+0x0800        mcpm_entry_point (Exynos542x with MCPM)      AFTR, suspend
+0x0804        0xfcba0d10 (Magic cookie)                    AFTR
+0x0804        0x00000bad (Magic cookie)                    System suspend
+0x0814        exynos4_secondary_startup (Exynos4210 r1.1)  Secondary CPU boot
+0x0818        0xfcba0d10 (Magic cookie, Exynos4210 r1.1)   AFTR
+0x081C        exynos_cpu_resume (Exynos4210 r1.1)          AFTR
+============= ============================================ ==================
+
+3. Other (regardless of secure/non-secure mode)
+
+Address:      pmu_base_addr
+
+============= =============================== ===============================
+Offset        Value                           Purpose
+============= =============================== ===============================
+0x0908        Non-zero                        Secondary CPU boot up indicator
+                                              on Exynos3250 and Exynos542x
+============= =============================== ===============================
+
+
+4. Glossary
+
+AFTR - ARM Off Top Running, a low power mode, Cortex cores and many other
+modules are power gated, except the TOP modules
+MCPM - Multi-Cluster Power Management
diff --git a/Documentation/arm/Samsung/clksrc-change-registers.awk b/Documentation/arm/samsung/clksrc-change-registers.awk
index 7be1b8aa7cd9..7be1b8aa7cd9 100755
--- a/Documentation/arm/Samsung/clksrc-change-registers.awk
+++ b/Documentation/arm/samsung/clksrc-change-registers.awk
diff --git a/Documentation/arm/samsung/gpio.rst b/Documentation/arm/samsung/gpio.rst
new file mode 100644
index 000000000000..5f7cadd7159e
--- /dev/null
+++ b/Documentation/arm/samsung/gpio.rst
@@ -0,0 +1,41 @@
+===========================
+Samsung GPIO implementation
+===========================
+
+Introduction
+------------
+
+This outlines the Samsung GPIO implementation and the architecture
+specific calls provided alongside the drivers/gpio core.
+
+
+S3C24XX (Legacy)
+----------------
+
+See Documentation/arm/samsung-s3c24xx/gpio.rst for more information
+about these devices. Their implementation has been brought into line
+with the core samsung implementation described in this document.
+
+
+GPIOLIB integration
+-------------------
+
+The gpio implementation uses gpiolib as much as possible, only providing
+specific calls for the items that require Samsung specific handling, such
+as pin special-function or pull resistor control.
+
+GPIO numbering is synchronised between the Samsung and gpiolib system.
+
+
+PIN configuration
+-----------------
+
+Pin configuration is specific to the Samsung architecture, with each SoC
+registering the necessary information for the core gpio configuration
+implementation to configure pins as necessary.
+
+The s3c_gpio_cfgpin() and s3c_gpio_setpull() provide the means for a
+driver or machine to change gpio configuration.
+
+See arch/arm/plat-samsung/include/plat/gpio-cfg.h for more information
+on these functions.
diff --git a/Documentation/arm/samsung/index.rst b/Documentation/arm/samsung/index.rst
new file mode 100644
index 000000000000..8142cce3d23e
--- /dev/null
+++ b/Documentation/arm/samsung/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========
+Samsung SoC
+===========
+
+.. toctree::
+   :maxdepth: 1
+
+   gpio
+   bootloader-interface
+   overview
diff --git a/Documentation/arm/samsung/overview.rst b/Documentation/arm/samsung/overview.rst
new file mode 100644
index 000000000000..e74307897416
--- /dev/null
+++ b/Documentation/arm/samsung/overview.rst
@@ -0,0 +1,89 @@
+==========================
+Samsung ARM Linux Overview
+==========================
+
+Introduction
+------------
+
+  The Samsung range of ARM SoCs spans many similar devices, from the initial
+  ARM9 through to the newest ARM cores. This document shows an overview of
+  the current kernel support, how to use it and where to find the code
+  that supports this.
+
+  The currently supported SoCs are:
+
+  - S3C24XX: See Documentation/arm/samsung-s3c24xx/overview.rst for full list
+  - S3C64XX: S3C6400 and S3C6410
+  - S5PC110 / S5PV210
+
+
+S3C24XX Systems
+---------------
+
+  There is still documentation in Documnetation/arm/Samsung-S3C24XX/ which
+  deals with the architecture and drivers specific to these devices.
+
+  See Documentation/arm/samsung-s3c24xx/overview.rst for more information
+  on the implementation details and specific support.
+
+
+Configuration
+-------------
+
+  A number of configurations are supplied, as there is no current way of
+  unifying all the SoCs into one kernel.
+
+  s5pc110_defconfig
+	- S5PC110 specific default configuration
+  s5pv210_defconfig
+	- S5PV210 specific default configuration
+
+
+Layout
+------
+
+  The directory layout is currently being restructured, and consists of
+  several platform directories and then the machine specific directories
+  of the CPUs being built for.
+
+  plat-samsung provides the base for all the implementations, and is the
+  last in the line of include directories that are processed for the build
+  specific information. It contains the base clock, GPIO and device definitions
+  to get the system running.
+
+  plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs.
+
+  plat-s5p is for s5p specific builds, and contains common support for the
+  S5P specific systems. Not all S5Ps use all the features in this directory
+  due to differences in the hardware.
+
+
+Layout changes
+--------------
+
+  The old plat-s3c and plat-s5pc1xx directories have been removed, with
+  support moved to either plat-samsung or plat-s5p as necessary. These moves
+  where to simplify the include and dependency issues involved with having
+  so many different platform directories.
+
+
+Port Contributors
+-----------------
+
+  Ben Dooks (BJD)
+  Vincent Sanders
+  Herbert Potzl
+  Arnaud Patard (RTP)
+  Roc Wu
+  Klaus Fetscher
+  Dimitry Andric
+  Shannon Holland
+  Guillaume Gourat (NexVision)
+  Christer Weinigel (wingel) (Acer N30)
+  Lucas Correia Villa Real (S3C2400 port)
+
+
+Document Author
+---------------
+
+Copyright 2009-2010 Ben Dooks <ben-linux@fluff.org>
diff --git a/Documentation/arm/setup.rst b/Documentation/arm/setup.rst
new file mode 100644
index 000000000000..8e12ef3fb9a7
--- /dev/null
+++ b/Documentation/arm/setup.rst
@@ -0,0 +1,108 @@
+=============================================
+Kernel initialisation parameters on ARM Linux
+=============================================
+
+The following document describes the kernel initialisation parameter
+structure, otherwise known as 'struct param_struct' which is used
+for most ARM Linux architectures.
+
+This structure is used to pass initialisation parameters from the
+kernel loader to the Linux kernel proper, and may be short lived
+through the kernel initialisation process.  As a general rule, it
+should not be referenced outside of arch/arm/kernel/setup.c:setup_arch().
+
+There are a lot of parameters listed in there, and they are described
+below:
+
+ page_size
+   This parameter must be set to the page size of the machine, and
+   will be checked by the kernel.
+
+ nr_pages
+   This is the total number of pages of memory in the system.  If
+   the memory is banked, then this should contain the total number
+   of pages in the system.
+
+   If the system contains separate VRAM, this value should not
+   include this information.
+
+ ramdisk_size
+   This is now obsolete, and should not be used.
+
+ flags
+   Various kernel flags, including:
+
+    =====   ========================
+    bit 0   1 = mount root read only
+    bit 1   unused
+    bit 2   0 = load ramdisk
+    bit 3   0 = prompt for ramdisk
+    =====   ========================
+
+ rootdev
+   major/minor number pair of device to mount as the root filesystem.
+
+ video_num_cols / video_num_rows
+   These two together describe the character size of the dummy console,
+   or VGA console character size.  They should not be used for any other
+   purpose.
+
+   It's generally a good idea to set these to be either standard VGA, or
+   the equivalent character size of your fbcon display.  This then allows
+   all the bootup messages to be displayed correctly.
+
+ video_x / video_y
+   This describes the character position of cursor on VGA console, and
+   is otherwise unused. (should not be used for other console types, and
+   should not be used for other purposes).
+
+ memc_control_reg
+   MEMC chip control register for Acorn Archimedes and Acorn A5000
+   based machines.  May be used differently by different architectures.
+
+ sounddefault
+   Default sound setting on Acorn machines.  May be used differently by
+   different architectures.
+
+ adfsdrives
+   Number of ADFS/MFM disks.  May be used differently by different
+   architectures.
+
+ bytes_per_char_h / bytes_per_char_v
+   These are now obsolete, and should not be used.
+
+ pages_in_bank[4]
+   Number of pages in each bank of the systems memory (used for RiscPC).
+   This is intended to be used on systems where the physical memory
+   is non-contiguous from the processors point of view.
+
+ pages_in_vram
+   Number of pages in VRAM (used on Acorn RiscPC).  This value may also
+   be used by loaders if the size of the video RAM can't be obtained
+   from the hardware.
+
+ initrd_start / initrd_size
+   This describes the kernel virtual start address and size of the
+   initial ramdisk.
+
+ rd_start
+   Start address in sectors of the ramdisk image on a floppy disk.
+
+ system_rev
+   system revision number.
+
+ system_serial_low / system_serial_high
+   system 64-bit serial number
+
+ mem_fclk_21285
+   The speed of the external oscillator to the 21285 (footbridge),
+   which control's the speed of the memory bus, timer & serial port.
+   Depending upon the speed of the cpu its value can be between
+   0-66 MHz. If no params are passed or a value of zero is passed,
+   then a value of 50 Mhz is the default on 21285 architectures.
+
+ paths[8][128]
+   These are now obsolete, and should not be used.
+
+ commandline
+   Kernel command line parameters.  Details can be found elsewhere.
diff --git a/Documentation/arm/SH-Mobile/.gitignore b/Documentation/arm/sh-mobile/.gitignore
index c928dbf3cc88..c928dbf3cc88 100644
--- a/Documentation/arm/SH-Mobile/.gitignore
+++ b/Documentation/arm/sh-mobile/.gitignore
diff --git a/Documentation/arm/spear/overview.rst b/Documentation/arm/spear/overview.rst
new file mode 100644
index 000000000000..1a77f6b213b6
--- /dev/null
+++ b/Documentation/arm/spear/overview.rst
@@ -0,0 +1,66 @@
+========================
+SPEAr ARM Linux Overview
+========================
+
+Introduction
+------------
+
+  SPEAr (Structured Processor Enhanced Architecture).
+  weblink : http://www.st.com/spear
+
+  The ST Microelectronics SPEAr range of ARM9/CortexA9 System-on-Chip CPUs are
+  supported by the 'spear' platform of ARM Linux. Currently SPEAr1310,
+  SPEAr1340, SPEAr300, SPEAr310, SPEAr320 and SPEAr600 SOCs are supported.
+
+  Hierarchy in SPEAr is as follows:
+
+  SPEAr (Platform)
+
+	- SPEAr3XX (3XX SOC series, based on ARM9)
+		- SPEAr300 (SOC)
+			- SPEAr300 Evaluation Board
+		- SPEAr310 (SOC)
+			- SPEAr310 Evaluation Board
+		- SPEAr320 (SOC)
+			- SPEAr320 Evaluation Board
+	- SPEAr6XX (6XX SOC series, based on ARM9)
+		- SPEAr600 (SOC)
+			- SPEAr600 Evaluation Board
+	- SPEAr13XX (13XX SOC series, based on ARM CORTEXA9)
+		- SPEAr1310 (SOC)
+			- SPEAr1310 Evaluation Board
+		- SPEAr1340 (SOC)
+			- SPEAr1340 Evaluation Board
+
+Configuration
+-------------
+
+  A generic configuration is provided for each machine, and can be used as the
+  default by::
+
+	make spear13xx_defconfig
+	make spear3xx_defconfig
+	make spear6xx_defconfig
+
+Layout
+------
+
+  The common files for multiple machine families (SPEAr3xx, SPEAr6xx and
+  SPEAr13xx) are located in the platform code contained in arch/arm/plat-spear
+  with headers in plat/.
+
+  Each machine series have a directory with name arch/arm/mach-spear followed by
+  series name. Like mach-spear3xx, mach-spear6xx and mach-spear13xx.
+
+  Common file for machines of spear3xx family is mach-spear3xx/spear3xx.c, for
+  spear6xx is mach-spear6xx/spear6xx.c and for spear13xx family is
+  mach-spear13xx/spear13xx.c. mach-spear* also contain soc/machine specific
+  files, like spear1310.c, spear1340.c spear300.c, spear310.c, spear320.c and
+  spear600.c.  mach-spear* doesn't contains board specific files as they fully
+  support Flattened Device Tree.
+
+
+Document Author
+---------------
+
+  Viresh Kumar <vireshk@kernel.org>, (c) 2010-2012 ST Microelectronics
diff --git a/Documentation/arm/sti/overview.rst b/Documentation/arm/sti/overview.rst
new file mode 100644
index 000000000000..70743617a74f
--- /dev/null
+++ b/Documentation/arm/sti/overview.rst
@@ -0,0 +1,36 @@
+======================
+STi ARM Linux Overview
+======================
+
+Introduction
+------------
+
+  The ST Microelectronics Multimedia and Application Processors range of
+  CortexA9 System-on-Chip are supported by the 'STi' platform of
+  ARM Linux. Currently STiH415, STiH416 SOCs are supported with both
+  B2000 and B2020 Reference boards.
+
+
+configuration
+-------------
+
+  A generic configuration is provided for both STiH415/416, and can be used as the
+  default by::
+
+	make stih41x_defconfig
+
+Layout
+------
+
+  All the files for multiple machine families (STiH415, STiH416, and STiG125)
+  are located in the platform code contained in arch/arm/mach-sti
+
+  There is a generic board board-dt.c in the mach folder which support
+  Flattened Device Tree, which means, It works with any compatible board with
+  Device Trees.
+
+
+Document Author
+---------------
+
+  Srinivas Kandagatla <srinivas.kandagatla@st.com>, (c) 2013 ST Microelectronics
diff --git a/Documentation/arm/sti/overview.txt b/Documentation/arm/sti/overview.txt
deleted file mode 100644
index 1a4e93d6027f..000000000000
--- a/Documentation/arm/sti/overview.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-			STi ARM Linux Overview
-			==========================
-
-Introduction
-------------
-
-  The ST Microelectronics Multimedia and Application Processors range of
-  CortexA9 System-on-Chip are supported by the 'STi' platform of
-  ARM Linux. Currently STiH415, STiH416 SOCs are supported with both
-  B2000 and B2020 Reference boards.
-
-
-  configuration
-  -------------
-
-  A generic configuration is provided for both STiH415/416, and can be used as the
-  default by
-	make stih41x_defconfig
-
-  Layout
-  ------
-  All the files for multiple machine families (STiH415, STiH416, and STiG125)
-  are located in the platform code contained in arch/arm/mach-sti
-
-  There is a generic board board-dt.c in the mach folder which support
-  Flattened Device Tree, which means, It works with any compatible board with
-  Device Trees.
-
-
-  Document Author
-  ---------------
-
-  Srinivas Kandagatla <srinivas.kandagatla@st.com>, (c) 2013 ST Microelectronics
diff --git a/Documentation/arm/sti/stih407-overview.rst b/Documentation/arm/sti/stih407-overview.rst
new file mode 100644
index 000000000000..027e75bc7b7c
--- /dev/null
+++ b/Documentation/arm/sti/stih407-overview.rst
@@ -0,0 +1,19 @@
+================
+STiH407 Overview
+================
+
+Introduction
+------------
+
+    The STiH407 is the new generation of SoC for Multi-HD, AVC set-top boxes
+    and server/connected client application for satellite, cable, terrestrial
+    and IP-STB markets.
+
+    Features
+    - ARM Cortex-A9 1.5 GHz dual core CPU (28nm)
+    - SATA2, USB 3.0, PCIe, Gbit Ethernet
+
+Document Author
+---------------
+
+  Maxime Coquelin <maxime.coquelin@st.com>, (c) 2014 ST Microelectronics
diff --git a/Documentation/arm/sti/stih407-overview.txt b/Documentation/arm/sti/stih407-overview.txt
deleted file mode 100644
index 3343f32f58bc..000000000000
--- a/Documentation/arm/sti/stih407-overview.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-			STiH407 Overview
-			================
-
-Introduction
-------------
-
-    The STiH407 is the new generation of SoC for Multi-HD, AVC set-top boxes
-    and server/connected client application for satellite, cable, terrestrial
-    and IP-STB markets.
-
-    Features
-    - ARM Cortex-A9 1.5 GHz dual core CPU (28nm)
-    - SATA2, USB 3.0, PCIe, Gbit Ethernet
-
-  Document Author
-  ---------------
-
-  Maxime Coquelin <maxime.coquelin@st.com>, (c) 2014 ST Microelectronics
diff --git a/Documentation/arm/sti/stih415-overview.rst b/Documentation/arm/sti/stih415-overview.rst
new file mode 100644
index 000000000000..b67452d610c4
--- /dev/null
+++ b/Documentation/arm/sti/stih415-overview.rst
@@ -0,0 +1,14 @@
+================
+STiH415 Overview
+================
+
+Introduction
+------------
+
+    The STiH415 is the next generation of HD, AVC set-top box processors
+    for satellite, cable, terrestrial and IP-STB markets.
+
+    Features:
+
+    - ARM Cortex-A9 1.0 GHz, dual-core CPU
+    - SATA2x2,USB 2.0x3, PCIe, Gbit Ethernet MACx2
diff --git a/Documentation/arm/sti/stih415-overview.txt b/Documentation/arm/sti/stih415-overview.txt
deleted file mode 100644
index 1383e33f265d..000000000000
--- a/Documentation/arm/sti/stih415-overview.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-			STiH415 Overview
-			================
-
-Introduction
-------------
-
-    The STiH415 is the next generation of HD, AVC set-top box processors
-    for satellite, cable, terrestrial and IP-STB markets.
-
-    Features
-    - ARM Cortex-A9 1.0 GHz, dual-core CPU
-    - SATA2x2,USB 2.0x3, PCIe, Gbit Ethernet MACx2
diff --git a/Documentation/arm/sti/stih416-overview.rst b/Documentation/arm/sti/stih416-overview.rst
new file mode 100644
index 000000000000..93f17d74d8db
--- /dev/null
+++ b/Documentation/arm/sti/stih416-overview.rst
@@ -0,0 +1,13 @@
+================
+STiH416 Overview
+================
+
+Introduction
+------------
+
+    The STiH416 is the next generation of HD, AVC set-top box processors
+    for satellite, cable, terrestrial and IP-STB markets.
+
+    Features
+    - ARM Cortex-A9 1.2 GHz dual core CPU
+    - SATA2x2,USB 2.0x3, PCIe, Gbit Ethernet MACx2
diff --git a/Documentation/arm/sti/stih416-overview.txt b/Documentation/arm/sti/stih416-overview.txt
deleted file mode 100644
index 558444c201c6..000000000000
--- a/Documentation/arm/sti/stih416-overview.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-			STiH416 Overview
-			================
-
-Introduction
-------------
-
-    The STiH416 is the next generation of HD, AVC set-top box processors
-    for satellite, cable, terrestrial and IP-STB markets.
-
-    Features
-    - ARM Cortex-A9 1.2 GHz dual core CPU
-    - SATA2x2,USB 2.0x3, PCIe, Gbit Ethernet MACx2
diff --git a/Documentation/arm/sti/stih418-overview.rst b/Documentation/arm/sti/stih418-overview.rst
new file mode 100644
index 000000000000..b563c1f4fe5a
--- /dev/null
+++ b/Documentation/arm/sti/stih418-overview.rst
@@ -0,0 +1,21 @@
+================
+STiH418 Overview
+================
+
+Introduction
+------------
+
+    The STiH418 is the new generation of SoC for UHDp60 set-top boxes
+    and server/connected client application for satellite, cable, terrestrial
+    and IP-STB markets.
+
+    Features
+    - ARM Cortex-A9 1.5 GHz quad core CPU (28nm)
+    - SATA2, USB 3.0, PCIe, Gbit Ethernet
+    - HEVC L5.1 Main 10
+    - VP9
+
+Document Author
+---------------
+
+  Maxime Coquelin <maxime.coquelin@st.com>, (c) 2015 ST Microelectronics
diff --git a/Documentation/arm/sti/stih418-overview.txt b/Documentation/arm/sti/stih418-overview.txt
deleted file mode 100644
index 1cd8fc80646d..000000000000
--- a/Documentation/arm/sti/stih418-overview.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-			STiH418 Overview
-			================
-
-Introduction
-------------
-
-    The STiH418 is the new generation of SoC for UHDp60 set-top boxes
-    and server/connected client application for satellite, cable, terrestrial
-    and IP-STB markets.
-
-    Features
-    - ARM Cortex-A9 1.5 GHz quad core CPU (28nm)
-    - SATA2, USB 3.0, PCIe, Gbit Ethernet
-    - HEVC L5.1 Main 10
-    - VP9
-
-  Document Author
-  ---------------
-
-  Maxime Coquelin <maxime.coquelin@st.com>, (c) 2015 ST Microelectronics
diff --git a/Documentation/arm/stm32/stm32f429-overview.rst b/Documentation/arm/stm32/stm32f429-overview.rst
index 18feda97f483..a7ebe8ea6697 100644
--- a/Documentation/arm/stm32/stm32f429-overview.rst
+++ b/Documentation/arm/stm32/stm32f429-overview.rst
@@ -1,3 +1,4 @@
+==================
 STM32F429 Overview
 ==================
 
@@ -21,6 +22,4 @@ Datasheet and reference manual are publicly available on ST website (STM32F429_)
 
 .. _STM32F429: http://www.st.com/web/en/catalog/mmc/FM141/SC1169/SS1577/LN1806?ecmp=stm32f429-439_pron_pr-ces2014_nov2013
 
-:Authors:
-
-Maxime Coquelin <mcoquelin.stm32@gmail.com>
+:Authors: Maxime Coquelin <mcoquelin.stm32@gmail.com>
diff --git a/Documentation/arm/stm32/stm32f746-overview.rst b/Documentation/arm/stm32/stm32f746-overview.rst
index b5f4b6ce7656..78befddc7740 100644
--- a/Documentation/arm/stm32/stm32f746-overview.rst
+++ b/Documentation/arm/stm32/stm32f746-overview.rst
@@ -1,3 +1,4 @@
+==================
 STM32F746 Overview
 ==================
 
@@ -28,6 +29,4 @@ Datasheet and reference manual are publicly available on ST website (STM32F746_)
 
 .. _STM32F746: http://www.st.com/content/st_com/en/products/microcontrollers/stm32-32-bit-arm-cortex-mcus/stm32f7-series/stm32f7x6/stm32f746ng.html
 
-:Authors:
-
-Alexandre Torgue <alexandre.torgue@st.com>
+:Authors: Alexandre Torgue <alexandre.torgue@st.com>
diff --git a/Documentation/arm/stm32/stm32f769-overview.rst b/Documentation/arm/stm32/stm32f769-overview.rst
index 228656ced2fe..e482980ddf21 100644
--- a/Documentation/arm/stm32/stm32f769-overview.rst
+++ b/Documentation/arm/stm32/stm32f769-overview.rst
@@ -1,3 +1,4 @@
+==================
 STM32F769 Overview
 ==================
 
@@ -30,6 +31,4 @@ Datasheet and reference manual are publicly available on ST website (STM32F769_)
 
 .. _STM32F769: http://www.st.com/content/st_com/en/products/microcontrollers/stm32-32-bit-arm-cortex-mcus/stm32-high-performance-mcus/stm32f7-series/stm32f7x9/stm32f769ni.html
 
-:Authors:
-
-Alexandre Torgue <alexandre.torgue@st.com>
+:Authors: Alexandre Torgue <alexandre.torgue@st.com>
diff --git a/Documentation/arm/stm32/stm32h743-overview.rst b/Documentation/arm/stm32/stm32h743-overview.rst
index 3458dc00095d..4e15f1a42730 100644
--- a/Documentation/arm/stm32/stm32h743-overview.rst
+++ b/Documentation/arm/stm32/stm32h743-overview.rst
@@ -1,3 +1,4 @@
+==================
 STM32H743 Overview
 ==================
 
@@ -29,6 +30,4 @@ Datasheet and reference manual are publicly available on ST website (STM32H743_)
 
 .. _STM32H743: http://www.st.com/en/microcontrollers/stm32h7x3.html?querycriteria=productId=LN2033
 
-:Authors:
-
-Alexandre Torgue <alexandre.torgue@st.com>
+:Authors: Alexandre Torgue <alexandre.torgue@st.com>
diff --git a/Documentation/arm/stm32/stm32mp157-overview.rst b/Documentation/arm/stm32/stm32mp157-overview.rst
index 62e176d47ca7..f62fdc8e7d8d 100644
--- a/Documentation/arm/stm32/stm32mp157-overview.rst
+++ b/Documentation/arm/stm32/stm32mp157-overview.rst
@@ -1,3 +1,4 @@
+===================
 STM32MP157 Overview
 ===================
 
diff --git a/Documentation/arm/sunxi.rst b/Documentation/arm/sunxi.rst
new file mode 100644
index 000000000000..b037428aee98
--- /dev/null
+++ b/Documentation/arm/sunxi.rst
@@ -0,0 +1,150 @@
+==================
+ARM Allwinner SoCs
+==================
+
+This document lists all the ARM Allwinner SoCs that are currently
+supported in mainline by the Linux kernel. This document will also
+provide links to documentation and/or datasheet for these SoCs.
+
+SunXi family
+------------
+  Linux kernel mach directory: arch/arm/mach-sunxi
+
+  Flavors:
+
+    * ARM926 based SoCs
+      - Allwinner F20 (sun3i)
+
+        * Not Supported
+
+    * ARM Cortex-A8 based SoCs
+      - Allwinner A10 (sun4i)
+
+        * Datasheet
+
+	  http://dl.linux-sunxi.org/A10/A10%20Datasheet%20-%20v1.21%20%282012-04-06%29.pdf
+	* User Manual
+
+	  http://dl.linux-sunxi.org/A10/A10%20User%20Manual%20-%20v1.20%20%282012-04-09%2c%20DECRYPTED%29.pdf
+
+      - Allwinner A10s (sun5i)
+
+        * Datasheet
+
+          http://dl.linux-sunxi.org/A10s/A10s%20Datasheet%20-%20v1.20%20%282012-03-27%29.pdf
+
+      - Allwinner A13 / R8 (sun5i)
+
+        * Datasheet
+
+	  http://dl.linux-sunxi.org/A13/A13%20Datasheet%20-%20v1.12%20%282012-03-29%29.pdf
+        * User Manual
+
+          http://dl.linux-sunxi.org/A13/A13%20User%20Manual%20-%20v1.2%20%282013-01-08%29.pdf
+
+      - Next Thing Co GR8 (sun5i)
+
+    * Single ARM Cortex-A7 based SoCs
+      - Allwinner V3s (sun8i)
+
+        * Datasheet
+
+          http://linux-sunxi.org/File:Allwinner_V3s_Datasheet_V1.0.pdf
+
+    * Dual ARM Cortex-A7 based SoCs
+      - Allwinner A20 (sun7i)
+
+        * User Manual
+
+          http://dl.linux-sunxi.org/A20/A20%20User%20Manual%202013-03-22.pdf
+
+      - Allwinner A23 (sun8i)
+
+        * Datasheet
+
+          http://dl.linux-sunxi.org/A23/A23%20Datasheet%20V1.0%2020130830.pdf
+
+        * User Manual
+
+          http://dl.linux-sunxi.org/A23/A23%20User%20Manual%20V1.0%2020130830.pdf
+
+    * Quad ARM Cortex-A7 based SoCs
+      - Allwinner A31 (sun6i)
+
+        * Datasheet
+
+          http://dl.linux-sunxi.org/A31/A3x_release_document/A31/IC/A31%20datasheet%20V1.3%2020131106.pdf
+
+        * User Manual
+
+          http://dl.linux-sunxi.org/A31/A3x_release_document/A31/IC/A31%20user%20manual%20V1.1%2020130630.pdf
+
+      - Allwinner A31s (sun6i)
+
+        * Datasheet
+
+          http://dl.linux-sunxi.org/A31/A3x_release_document/A31s/IC/A31s%20datasheet%20V1.3%2020131106.pdf
+
+        * User Manual
+
+          http://dl.linux-sunxi.org/A31/A3x_release_document/A31s/IC/A31s%20User%20Manual%20%20V1.0%2020130322.pdf
+
+      - Allwinner A33 (sun8i)
+
+        * Datasheet
+
+          http://dl.linux-sunxi.org/A33/A33%20Datasheet%20release%201.1.pdf
+
+        * User Manual
+
+          http://dl.linux-sunxi.org/A33/A33%20user%20manual%20release%201.1.pdf
+
+      - Allwinner H2+ (sun8i)
+
+        * No document available now, but is known to be working properly with
+          H3 drivers and memory map.
+
+      - Allwinner H3 (sun8i)
+
+        * Datasheet
+
+          http://dl.linux-sunxi.org/H3/Allwinner_H3_Datasheet_V1.0.pdf
+
+      - Allwinner R40 (sun8i)
+
+        * Datasheet
+
+          https://github.com/tinalinux/docs/raw/r40-v1.y/R40_Datasheet_V1.0.pdf
+
+        * User Manual
+
+          https://github.com/tinalinux/docs/raw/r40-v1.y/Allwinner_R40_User_Manual_V1.0.pdf
+
+    * Quad ARM Cortex-A15, Quad ARM Cortex-A7 based SoCs
+      - Allwinner A80
+
+        * Datasheet
+
+	  http://dl.linux-sunxi.org/A80/A80_Datasheet_Revision_1.0_0404.pdf
+
+    * Octa ARM Cortex-A7 based SoCs
+      - Allwinner A83T
+
+        * Datasheet
+
+          https://github.com/allwinner-zh/documents/raw/master/A83T/A83T_Datasheet_v1.3_20150510.pdf
+
+        * User Manual
+
+          https://github.com/allwinner-zh/documents/raw/master/A83T/A83T_User_Manual_v1.5.1_20150513.pdf
+
+    * Quad ARM Cortex-A53 based SoCs
+      - Allwinner A64
+
+        * Datasheet
+
+          http://dl.linux-sunxi.org/A64/A64_Datasheet_V1.1.pdf
+
+        * User Manual
+
+          http://dl.linux-sunxi.org/A64/Allwinner%20A64%20User%20Manual%20v1.0.pdf
diff --git a/Documentation/arm/sunxi/README b/Documentation/arm/sunxi/README
deleted file mode 100644
index f8efc21998bf..000000000000
--- a/Documentation/arm/sunxi/README
+++ /dev/null
@@ -1,102 +0,0 @@
-ARM Allwinner SoCs
-==================
-
-This document lists all the ARM Allwinner SoCs that are currently
-supported in mainline by the Linux kernel. This document will also
-provide links to documentation and/or datasheet for these SoCs.
-
-SunXi family
-------------
-  Linux kernel mach directory: arch/arm/mach-sunxi
-
-  Flavors:
-    * ARM926 based SoCs
-      - Allwinner F20 (sun3i)
-        + Not Supported
-
-    * ARM Cortex-A8 based SoCs
-      - Allwinner A10 (sun4i)
-        + Datasheet
-	  http://dl.linux-sunxi.org/A10/A10%20Datasheet%20-%20v1.21%20%282012-04-06%29.pdf
-	+ User Manual
-	  http://dl.linux-sunxi.org/A10/A10%20User%20Manual%20-%20v1.20%20%282012-04-09%2c%20DECRYPTED%29.pdf
-
-      - Allwinner A10s (sun5i)
-        + Datasheet
-          http://dl.linux-sunxi.org/A10s/A10s%20Datasheet%20-%20v1.20%20%282012-03-27%29.pdf
-
-      - Allwinner A13 / R8 (sun5i)
-        + Datasheet
-	  http://dl.linux-sunxi.org/A13/A13%20Datasheet%20-%20v1.12%20%282012-03-29%29.pdf
-        + User Manual
-          http://dl.linux-sunxi.org/A13/A13%20User%20Manual%20-%20v1.2%20%282013-01-08%29.pdf
-
-      - Next Thing Co GR8 (sun5i)
-
-    * Single ARM Cortex-A7 based SoCs
-      - Allwinner V3s (sun8i)
-        + Datasheet
-          http://linux-sunxi.org/File:Allwinner_V3s_Datasheet_V1.0.pdf
-
-    * Dual ARM Cortex-A7 based SoCs
-      - Allwinner A20 (sun7i)
-        + User Manual
-          http://dl.linux-sunxi.org/A20/A20%20User%20Manual%202013-03-22.pdf
-
-      - Allwinner A23 (sun8i)
-        + Datasheet
-          http://dl.linux-sunxi.org/A23/A23%20Datasheet%20V1.0%2020130830.pdf
-        + User Manual
-          http://dl.linux-sunxi.org/A23/A23%20User%20Manual%20V1.0%2020130830.pdf
-
-    * Quad ARM Cortex-A7 based SoCs
-      - Allwinner A31 (sun6i)
-        + Datasheet
-          http://dl.linux-sunxi.org/A31/A3x_release_document/A31/IC/A31%20datasheet%20V1.3%2020131106.pdf
-        + User Manual
-          http://dl.linux-sunxi.org/A31/A3x_release_document/A31/IC/A31%20user%20manual%20V1.1%2020130630.pdf
-
-      - Allwinner A31s (sun6i)
-        + Datasheet
-          http://dl.linux-sunxi.org/A31/A3x_release_document/A31s/IC/A31s%20datasheet%20V1.3%2020131106.pdf
-        + User Manual
-          http://dl.linux-sunxi.org/A31/A3x_release_document/A31s/IC/A31s%20User%20Manual%20%20V1.0%2020130322.pdf
-
-      - Allwinner A33 (sun8i)
-        + Datasheet
-          http://dl.linux-sunxi.org/A33/A33%20Datasheet%20release%201.1.pdf
-        + User Manual
-          http://dl.linux-sunxi.org/A33/A33%20user%20manual%20release%201.1.pdf
-
-      - Allwinner H2+ (sun8i)
-        + No document available now, but is known to be working properly with
-          H3 drivers and memory map.
-
-      - Allwinner H3 (sun8i)
-        + Datasheet
-          http://dl.linux-sunxi.org/H3/Allwinner_H3_Datasheet_V1.0.pdf
-
-      - Allwinner R40 (sun8i)
-        + Datasheet
-          https://github.com/tinalinux/docs/raw/r40-v1.y/R40_Datasheet_V1.0.pdf
-        + User Manual
-          https://github.com/tinalinux/docs/raw/r40-v1.y/Allwinner_R40_User_Manual_V1.0.pdf
-
-    * Quad ARM Cortex-A15, Quad ARM Cortex-A7 based SoCs
-      - Allwinner A80
-        + Datasheet
-	  http://dl.linux-sunxi.org/A80/A80_Datasheet_Revision_1.0_0404.pdf
-
-    * Octa ARM Cortex-A7 based SoCs
-      - Allwinner A83T
-        + Datasheet
-          https://github.com/allwinner-zh/documents/raw/master/A83T/A83T_Datasheet_v1.3_20150510.pdf
-        + User Manual
-          https://github.com/allwinner-zh/documents/raw/master/A83T/A83T_User_Manual_v1.5.1_20150513.pdf
-
-    * Quad ARM Cortex-A53 based SoCs
-      - Allwinner A64
-        + Datasheet
-          http://dl.linux-sunxi.org/A64/A64_Datasheet_V1.1.pdf
-        + User Manual
-          http://dl.linux-sunxi.org/A64/Allwinner%20A64%20User%20Manual%20v1.0.pdf
diff --git a/Documentation/arm/sunxi/clocks.rst b/Documentation/arm/sunxi/clocks.rst
new file mode 100644
index 000000000000..23bd03f3e21f
--- /dev/null
+++ b/Documentation/arm/sunxi/clocks.rst
@@ -0,0 +1,57 @@
+=======================================================
+Frequently asked questions about the sunxi clock system
+=======================================================
+
+This document contains useful bits of information that people tend to ask
+about the sunxi clock system, as well as accompanying ASCII art when adequate.
+
+Q: Why is the main 24MHz oscillator gatable? Wouldn't that break the
+   system?
+
+A: The 24MHz oscillator allows gating to save power. Indeed, if gated
+   carelessly the system would stop functioning, but with the right
+   steps, one can gate it and keep the system running. Consider this
+   simplified suspend example:
+
+   While the system is operational, you would see something like::
+
+      24MHz         32kHz
+       |
+      PLL1
+       \
+        \_ CPU Mux
+             |
+           [CPU]
+
+   When you are about to suspend, you switch the CPU Mux to the 32kHz
+   oscillator::
+
+      24Mhz         32kHz
+       |              |
+      PLL1            |
+                     /
+           CPU Mux _/
+             |
+           [CPU]
+
+    Finally you can gate the main oscillator::
+
+                    32kHz
+                      |
+                      |
+                     /
+           CPU Mux _/
+             |
+           [CPU]
+
+Q: Were can I learn more about the sunxi clocks?
+
+A: The linux-sunxi wiki contains a page documenting the clock registers,
+   you can find it at
+
+        http://linux-sunxi.org/A10/CCM
+
+   The authoritative source for information at this time is the ccmu driver
+   released by Allwinner, you can find it at
+
+        https://github.com/linux-sunxi/linux-sunxi/tree/sunxi-3.0/arch/arm/mach-sun4i/clock/ccmu
diff --git a/Documentation/arm/sunxi/clocks.txt b/Documentation/arm/sunxi/clocks.txt
deleted file mode 100644
index e09a88aa3136..000000000000
--- a/Documentation/arm/sunxi/clocks.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-Frequently asked questions about the sunxi clock system
-=======================================================
-
-This document contains useful bits of information that people tend to ask
-about the sunxi clock system, as well as accompanying ASCII art when adequate.
-
-Q: Why is the main 24MHz oscillator gatable? Wouldn't that break the
-   system?
-
-A: The 24MHz oscillator allows gating to save power. Indeed, if gated
-   carelessly the system would stop functioning, but with the right
-   steps, one can gate it and keep the system running. Consider this
-   simplified suspend example:
-
-   While the system is operational, you would see something like
-
-      24MHz         32kHz
-       |
-      PLL1
-       \
-        \_ CPU Mux
-             |
-           [CPU]
-
-   When you are about to suspend, you switch the CPU Mux to the 32kHz
-   oscillator:
-
-      24Mhz         32kHz
-       |              |
-      PLL1            |
-                     /
-           CPU Mux _/
-             |
-           [CPU]
-
-    Finally you can gate the main oscillator
-
-                    32kHz
-                      |
-                      |
-                     /
-           CPU Mux _/
-             |
-           [CPU]
-
-Q: Were can I learn more about the sunxi clocks?
-
-A: The linux-sunxi wiki contains a page documenting the clock registers,
-   you can find it at
-
-        http://linux-sunxi.org/A10/CCM
-
-   The authoritative source for information at this time is the ccmu driver
-   released by Allwinner, you can find it at
-
-        https://github.com/linux-sunxi/linux-sunxi/tree/sunxi-3.0/arch/arm/mach-sun4i/clock/ccmu
diff --git a/Documentation/arm/swp_emulation b/Documentation/arm/swp_emulation
deleted file mode 100644
index af903d22fd93..000000000000
--- a/Documentation/arm/swp_emulation
+++ /dev/null
@@ -1,27 +0,0 @@
-Software emulation of deprecated SWP instruction (CONFIG_SWP_EMULATE)
----------------------------------------------------------------------
-
-ARMv6 architecture deprecates use of the SWP/SWPB instructions, and recommeds
-moving to the load-locked/store-conditional instructions LDREX and STREX.
-
-ARMv7 multiprocessing extensions introduce the ability to disable these
-instructions, triggering an undefined instruction exception when executed.
-Trapped instructions are emulated using an LDREX/STREX or LDREXB/STREXB
-sequence. If a memory access fault (an abort) occurs, a segmentation fault is
-signalled to the triggering process.
-
-/proc/cpu/swp_emulation holds some statistics/information, including the PID of
-the last process to trigger the emulation to be invocated. For example:
----
-Emulated SWP:		12
-Emulated SWPB:		0
-Aborted SWP{B}:		1
-Last process:		314
----
-
-NOTE: when accessing uncached shared regions, LDREX/STREX rely on an external
-transaction monitoring block called a global monitor to maintain update
-atomicity. If your system does not implement a global monitor, this option can
-cause programs that perform SWP operations to uncached memory to deadlock, as
-the STREX operation will always fail.
-
diff --git a/Documentation/arm/swp_emulation.rst b/Documentation/arm/swp_emulation.rst
new file mode 100644
index 000000000000..6a608a9c3715
--- /dev/null
+++ b/Documentation/arm/swp_emulation.rst
@@ -0,0 +1,27 @@
+Software emulation of deprecated SWP instruction (CONFIG_SWP_EMULATE)
+---------------------------------------------------------------------
+
+ARMv6 architecture deprecates use of the SWP/SWPB instructions, and recommeds
+moving to the load-locked/store-conditional instructions LDREX and STREX.
+
+ARMv7 multiprocessing extensions introduce the ability to disable these
+instructions, triggering an undefined instruction exception when executed.
+Trapped instructions are emulated using an LDREX/STREX or LDREXB/STREXB
+sequence. If a memory access fault (an abort) occurs, a segmentation fault is
+signalled to the triggering process.
+
+/proc/cpu/swp_emulation holds some statistics/information, including the PID of
+the last process to trigger the emulation to be invocated. For example::
+
+  Emulated SWP:		12
+  Emulated SWPB:		0
+  Aborted SWP{B}:		1
+  Last process:		314
+
+
+NOTE:
+  when accessing uncached shared regions, LDREX/STREX rely on an external
+  transaction monitoring block called a global monitor to maintain update
+  atomicity. If your system does not implement a global monitor, this option can
+  cause programs that perform SWP operations to uncached memory to deadlock, as
+  the STREX operation will always fail.
diff --git a/Documentation/arm/tcm.rst b/Documentation/arm/tcm.rst
new file mode 100644
index 000000000000..effd9c7bc968
--- /dev/null
+++ b/Documentation/arm/tcm.rst
@@ -0,0 +1,161 @@
+==================================================
+ARM TCM (Tightly-Coupled Memory) handling in Linux
+==================================================
+
+Written by Linus Walleij <linus.walleij@stericsson.com>
+
+Some ARM SoC:s have a so-called TCM (Tightly-Coupled Memory).
+This is usually just a few (4-64) KiB of RAM inside the ARM
+processor.
+
+Due to being embedded inside the CPU The TCM has a
+Harvard-architecture, so there is an ITCM (instruction TCM)
+and a DTCM (data TCM). The DTCM can not contain any
+instructions, but the ITCM can actually contain data.
+The size of DTCM or ITCM is minimum 4KiB so the typical
+minimum configuration is 4KiB ITCM and 4KiB DTCM.
+
+ARM CPU:s have special registers to read out status, physical
+location and size of TCM memories. arch/arm/include/asm/cputype.h
+defines a CPUID_TCM register that you can read out from the
+system control coprocessor. Documentation from ARM can be found
+at http://infocenter.arm.com, search for "TCM Status Register"
+to see documents for all CPUs. Reading this register you can
+determine if ITCM (bits 1-0) and/or DTCM (bit 17-16) is present
+in the machine.
+
+There is further a TCM region register (search for "TCM Region
+Registers" at the ARM site) that can report and modify the location
+size of TCM memories at runtime. This is used to read out and modify
+TCM location and size. Notice that this is not a MMU table: you
+actually move the physical location of the TCM around. At the
+place you put it, it will mask any underlying RAM from the
+CPU so it is usually wise not to overlap any physical RAM with
+the TCM.
+
+The TCM memory can then be remapped to another address again using
+the MMU, but notice that the TCM if often used in situations where
+the MMU is turned off. To avoid confusion the current Linux
+implementation will map the TCM 1 to 1 from physical to virtual
+memory in the location specified by the kernel. Currently Linux
+will map ITCM to 0xfffe0000 and on, and DTCM to 0xfffe8000 and
+on, supporting a maximum of 32KiB of ITCM and 32KiB of DTCM.
+
+Newer versions of the region registers also support dividing these
+TCMs in two separate banks, so for example an 8KiB ITCM is divided
+into two 4KiB banks with its own control registers. The idea is to
+be able to lock and hide one of the banks for use by the secure
+world (TrustZone).
+
+TCM is used for a few things:
+
+- FIQ and other interrupt handlers that need deterministic
+  timing and cannot wait for cache misses.
+
+- Idle loops where all external RAM is set to self-refresh
+  retention mode, so only on-chip RAM is accessible by
+  the CPU and then we hang inside ITCM waiting for an
+  interrupt.
+
+- Other operations which implies shutting off or reconfiguring
+  the external RAM controller.
+
+There is an interface for using TCM on the ARM architecture
+in <asm/tcm.h>. Using this interface it is possible to:
+
+- Define the physical address and size of ITCM and DTCM.
+
+- Tag functions to be compiled into ITCM.
+
+- Tag data and constants to be allocated to DTCM and ITCM.
+
+- Have the remaining TCM RAM added to a special
+  allocation pool with gen_pool_create() and gen_pool_add()
+  and provice tcm_alloc() and tcm_free() for this
+  memory. Such a heap is great for things like saving
+  device state when shutting off device power domains.
+
+A machine that has TCM memory shall select HAVE_TCM from
+arch/arm/Kconfig for itself. Code that needs to use TCM shall
+#include <asm/tcm.h>
+
+Functions to go into itcm can be tagged like this:
+int __tcmfunc foo(int bar);
+
+Since these are marked to become long_calls and you may want
+to have functions called locally inside the TCM without
+wasting space, there is also the __tcmlocalfunc prefix that
+will make the call relative.
+
+Variables to go into dtcm can be tagged like this::
+
+  int __tcmdata foo;
+
+Constants can be tagged like this::
+
+  int __tcmconst foo;
+
+To put assembler into TCM just use::
+
+  .section ".tcm.text" or .section ".tcm.data"
+
+respectively.
+
+Example code::
+
+  #include <asm/tcm.h>
+
+  /* Uninitialized data */
+  static u32 __tcmdata tcmvar;
+  /* Initialized data */
+  static u32 __tcmdata tcmassigned = 0x2BADBABEU;
+  /* Constant */
+  static const u32 __tcmconst tcmconst = 0xCAFEBABEU;
+
+  static void __tcmlocalfunc tcm_to_tcm(void)
+  {
+	int i;
+	for (i = 0; i < 100; i++)
+		tcmvar ++;
+  }
+
+  static void __tcmfunc hello_tcm(void)
+  {
+	/* Some abstract code that runs in ITCM */
+	int i;
+	for (i = 0; i < 100; i++) {
+		tcmvar ++;
+	}
+	tcm_to_tcm();
+  }
+
+  static void __init test_tcm(void)
+  {
+	u32 *tcmem;
+	int i;
+
+	hello_tcm();
+	printk("Hello TCM executed from ITCM RAM\n");
+
+	printk("TCM variable from testrun: %u @ %p\n", tcmvar, &tcmvar);
+	tcmvar = 0xDEADBEEFU;
+	printk("TCM variable: 0x%x @ %p\n", tcmvar, &tcmvar);
+
+	printk("TCM assigned variable: 0x%x @ %p\n", tcmassigned, &tcmassigned);
+
+	printk("TCM constant: 0x%x @ %p\n", tcmconst, &tcmconst);
+
+	/* Allocate some TCM memory from the pool */
+	tcmem = tcm_alloc(20);
+	if (tcmem) {
+		printk("TCM Allocated 20 bytes of TCM @ %p\n", tcmem);
+		tcmem[0] = 0xDEADBEEFU;
+		tcmem[1] = 0x2BADBABEU;
+		tcmem[2] = 0xCAFEBABEU;
+		tcmem[3] = 0xDEADBEEFU;
+		tcmem[4] = 0x2BADBABEU;
+		for (i = 0; i < 5; i++)
+			printk("TCM tcmem[%d] = %08x\n", i, tcmem[i]);
+		tcm_free(tcmem, 20);
+	}
+  }
diff --git a/Documentation/arm/tcm.txt b/Documentation/arm/tcm.txt
deleted file mode 100644
index 7c15871c1885..000000000000
--- a/Documentation/arm/tcm.txt
+++ /dev/null
@@ -1,155 +0,0 @@
-ARM TCM (Tightly-Coupled Memory) handling in Linux
-----
-Written by Linus Walleij <linus.walleij@stericsson.com>
-
-Some ARM SoC:s have a so-called TCM (Tightly-Coupled Memory).
-This is usually just a few (4-64) KiB of RAM inside the ARM
-processor.
-
-Due to being embedded inside the CPU The TCM has a
-Harvard-architecture, so there is an ITCM (instruction TCM)
-and a DTCM (data TCM). The DTCM can not contain any
-instructions, but the ITCM can actually contain data.
-The size of DTCM or ITCM is minimum 4KiB so the typical
-minimum configuration is 4KiB ITCM and 4KiB DTCM.
-
-ARM CPU:s have special registers to read out status, physical
-location and size of TCM memories. arch/arm/include/asm/cputype.h
-defines a CPUID_TCM register that you can read out from the
-system control coprocessor. Documentation from ARM can be found
-at http://infocenter.arm.com, search for "TCM Status Register"
-to see documents for all CPUs. Reading this register you can
-determine if ITCM (bits 1-0) and/or DTCM (bit 17-16) is present
-in the machine.
-
-There is further a TCM region register (search for "TCM Region
-Registers" at the ARM site) that can report and modify the location
-size of TCM memories at runtime. This is used to read out and modify
-TCM location and size. Notice that this is not a MMU table: you
-actually move the physical location of the TCM around. At the
-place you put it, it will mask any underlying RAM from the
-CPU so it is usually wise not to overlap any physical RAM with
-the TCM.
-
-The TCM memory can then be remapped to another address again using
-the MMU, but notice that the TCM if often used in situations where
-the MMU is turned off. To avoid confusion the current Linux
-implementation will map the TCM 1 to 1 from physical to virtual
-memory in the location specified by the kernel. Currently Linux
-will map ITCM to 0xfffe0000 and on, and DTCM to 0xfffe8000 and
-on, supporting a maximum of 32KiB of ITCM and 32KiB of DTCM.
-
-Newer versions of the region registers also support dividing these
-TCMs in two separate banks, so for example an 8KiB ITCM is divided
-into two 4KiB banks with its own control registers. The idea is to
-be able to lock and hide one of the banks for use by the secure
-world (TrustZone).
-
-TCM is used for a few things:
-
-- FIQ and other interrupt handlers that need deterministic
-  timing and cannot wait for cache misses.
-
-- Idle loops where all external RAM is set to self-refresh
-  retention mode, so only on-chip RAM is accessible by
-  the CPU and then we hang inside ITCM waiting for an
-  interrupt.
-
-- Other operations which implies shutting off or reconfiguring
-  the external RAM controller.
-
-There is an interface for using TCM on the ARM architecture
-in <asm/tcm.h>. Using this interface it is possible to:
-
-- Define the physical address and size of ITCM and DTCM.
-
-- Tag functions to be compiled into ITCM.
-
-- Tag data and constants to be allocated to DTCM and ITCM.
-
-- Have the remaining TCM RAM added to a special
-  allocation pool with gen_pool_create() and gen_pool_add()
-  and provice tcm_alloc() and tcm_free() for this
-  memory. Such a heap is great for things like saving
-  device state when shutting off device power domains.
-
-A machine that has TCM memory shall select HAVE_TCM from
-arch/arm/Kconfig for itself. Code that needs to use TCM shall
-#include <asm/tcm.h>
-
-Functions to go into itcm can be tagged like this:
-int __tcmfunc foo(int bar);
-
-Since these are marked to become long_calls and you may want
-to have functions called locally inside the TCM without
-wasting space, there is also the __tcmlocalfunc prefix that
-will make the call relative.
-
-Variables to go into dtcm can be tagged like this:
-int __tcmdata foo;
-
-Constants can be tagged like this:
-int __tcmconst foo;
-
-To put assembler into TCM just use
-.section ".tcm.text" or .section ".tcm.data"
-respectively.
-
-Example code:
-
-#include <asm/tcm.h>
-
-/* Uninitialized data */
-static u32 __tcmdata tcmvar;
-/* Initialized data */
-static u32 __tcmdata tcmassigned = 0x2BADBABEU;
-/* Constant */
-static const u32 __tcmconst tcmconst = 0xCAFEBABEU;
-
-static void __tcmlocalfunc tcm_to_tcm(void)
-{
-	int i;
-	for (i = 0; i < 100; i++)
-		tcmvar ++;
-}
-
-static void __tcmfunc hello_tcm(void)
-{
-	/* Some abstract code that runs in ITCM */
-	int i;
-	for (i = 0; i < 100; i++) {
-		tcmvar ++;
-	}
-	tcm_to_tcm();
-}
-
-static void __init test_tcm(void)
-{
-	u32 *tcmem;
-	int i;
-
-	hello_tcm();
-	printk("Hello TCM executed from ITCM RAM\n");
-
-	printk("TCM variable from testrun: %u @ %p\n", tcmvar, &tcmvar);
-	tcmvar = 0xDEADBEEFU;
-	printk("TCM variable: 0x%x @ %p\n", tcmvar, &tcmvar);
-
-	printk("TCM assigned variable: 0x%x @ %p\n", tcmassigned, &tcmassigned);
-
-	printk("TCM constant: 0x%x @ %p\n", tcmconst, &tcmconst);
-
-	/* Allocate some TCM memory from the pool */
-	tcmem = tcm_alloc(20);
-	if (tcmem) {
-		printk("TCM Allocated 20 bytes of TCM @ %p\n", tcmem);
-		tcmem[0] = 0xDEADBEEFU;
-		tcmem[1] = 0x2BADBABEU;
-		tcmem[2] = 0xCAFEBABEU;
-		tcmem[3] = 0xDEADBEEFU;
-		tcmem[4] = 0x2BADBABEU;
-		for (i = 0; i < 5; i++)
-			printk("TCM tcmem[%d] = %08x\n", i, tcmem[i]);
-		tcm_free(tcmem, 20);
-	}
-}
diff --git a/Documentation/arm/uefi.rst b/Documentation/arm/uefi.rst
new file mode 100644
index 000000000000..f868330df6be
--- /dev/null
+++ b/Documentation/arm/uefi.rst
@@ -0,0 +1,67 @@
+================================================
+The Unified Extensible Firmware Interface (UEFI)
+================================================
+
+UEFI, the Unified Extensible Firmware Interface, is a specification
+governing the behaviours of compatible firmware interfaces. It is
+maintained by the UEFI Forum - http://www.uefi.org/.
+
+UEFI is an evolution of its predecessor 'EFI', so the terms EFI and
+UEFI are used somewhat interchangeably in this document and associated
+source code. As a rule, anything new uses 'UEFI', whereas 'EFI' refers
+to legacy code or specifications.
+
+UEFI support in Linux
+=====================
+Booting on a platform with firmware compliant with the UEFI specification
+makes it possible for the kernel to support additional features:
+
+- UEFI Runtime Services
+- Retrieving various configuration information through the standardised
+  interface of UEFI configuration tables. (ACPI, SMBIOS, ...)
+
+For actually enabling [U]EFI support, enable:
+
+- CONFIG_EFI=y
+- CONFIG_EFI_VARS=y or m
+
+The implementation depends on receiving information about the UEFI environment
+in a Flattened Device Tree (FDT) - so is only available with CONFIG_OF.
+
+UEFI stub
+=========
+The "stub" is a feature that extends the Image/zImage into a valid UEFI
+PE/COFF executable, including a loader application that makes it possible to
+load the kernel directly from the UEFI shell, boot menu, or one of the
+lightweight bootloaders like Gummiboot or rEFInd.
+
+The kernel image built with stub support remains a valid kernel image for
+booting in non-UEFI environments.
+
+UEFI kernel support on ARM
+==========================
+UEFI kernel support on the ARM architectures (arm and arm64) is only available
+when boot is performed through the stub.
+
+When booting in UEFI mode, the stub deletes any memory nodes from a provided DT.
+Instead, the kernel reads the UEFI memory map.
+
+The stub populates the FDT /chosen node with (and the kernel scans for) the
+following parameters:
+
+==========================  ======   ===========================================
+Name                        Size     Description
+==========================  ======   ===========================================
+linux,uefi-system-table     64-bit   Physical address of the UEFI System Table.
+
+linux,uefi-mmap-start       64-bit   Physical address of the UEFI memory map,
+                                     populated by the UEFI GetMemoryMap() call.
+
+linux,uefi-mmap-size        32-bit   Size in bytes of the UEFI memory map
+                                     pointed to in previous entry.
+
+linux,uefi-mmap-desc-size   32-bit   Size in bytes of each entry in the UEFI
+                                     memory map.
+
+linux,uefi-mmap-desc-ver    32-bit   Version of the mmap descriptor format.
+==========================  ======   ===========================================
diff --git a/Documentation/arm/uefi.txt b/Documentation/arm/uefi.txt
deleted file mode 100644
index 6543a0adea8a..000000000000
--- a/Documentation/arm/uefi.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-UEFI, the Unified Extensible Firmware Interface, is a specification
-governing the behaviours of compatible firmware interfaces. It is
-maintained by the UEFI Forum - http://www.uefi.org/.
-
-UEFI is an evolution of its predecessor 'EFI', so the terms EFI and
-UEFI are used somewhat interchangeably in this document and associated
-source code. As a rule, anything new uses 'UEFI', whereas 'EFI' refers
-to legacy code or specifications.
-
-UEFI support in Linux
-=====================
-Booting on a platform with firmware compliant with the UEFI specification
-makes it possible for the kernel to support additional features:
-- UEFI Runtime Services
-- Retrieving various configuration information through the standardised
-  interface of UEFI configuration tables. (ACPI, SMBIOS, ...)
-
-For actually enabling [U]EFI support, enable:
-- CONFIG_EFI=y
-- CONFIG_EFI_VARS=y or m
-
-The implementation depends on receiving information about the UEFI environment
-in a Flattened Device Tree (FDT) - so is only available with CONFIG_OF.
-
-UEFI stub
-=========
-The "stub" is a feature that extends the Image/zImage into a valid UEFI
-PE/COFF executable, including a loader application that makes it possible to
-load the kernel directly from the UEFI shell, boot menu, or one of the
-lightweight bootloaders like Gummiboot or rEFInd.
-
-The kernel image built with stub support remains a valid kernel image for
-booting in non-UEFI environments.
-
-UEFI kernel support on ARM
-==========================
-UEFI kernel support on the ARM architectures (arm and arm64) is only available
-when boot is performed through the stub.
-
-When booting in UEFI mode, the stub deletes any memory nodes from a provided DT.
-Instead, the kernel reads the UEFI memory map.
-
-The stub populates the FDT /chosen node with (and the kernel scans for) the
-following parameters:
-________________________________________________________________________________
-Name                      | Size   | Description
-================================================================================
-linux,uefi-system-table   | 64-bit | Physical address of the UEFI System Table.
---------------------------------------------------------------------------------
-linux,uefi-mmap-start     | 64-bit | Physical address of the UEFI memory map,
-                          |        | populated by the UEFI GetMemoryMap() call.
---------------------------------------------------------------------------------
-linux,uefi-mmap-size      | 32-bit | Size in bytes of the UEFI memory map
-                          |        | pointed to in previous entry.
---------------------------------------------------------------------------------
-linux,uefi-mmap-desc-size | 32-bit | Size in bytes of each entry in the UEFI
-                          |        | memory map.
---------------------------------------------------------------------------------
-linux,uefi-mmap-desc-ver  | 32-bit | Version of the mmap descriptor format.
---------------------------------------------------------------------------------
diff --git a/Documentation/arm/vfp/release-notes.rst b/Documentation/arm/vfp/release-notes.rst
new file mode 100644
index 000000000000..c6b04937cee3
--- /dev/null
+++ b/Documentation/arm/vfp/release-notes.rst
@@ -0,0 +1,57 @@
+===============================================
+Release notes for Linux Kernel VFP support code
+===============================================
+
+Date: 	20 May 2004
+
+Author:	Russell King
+
+This is the first release of the Linux Kernel VFP support code.  It
+provides support for the exceptions bounced from VFP hardware found
+on ARM926EJ-S.
+
+This release has been validated against the SoftFloat-2b library by
+John R. Hauser using the TestFloat-2a test suite.  Details of this
+library and test suite can be found at:
+
+   http://www.jhauser.us/arithmetic/SoftFloat.html
+
+The operations which have been tested with this package are:
+
+ - fdiv
+ - fsub
+ - fadd
+ - fmul
+ - fcmp
+ - fcmpe
+ - fcvtd
+ - fcvts
+ - fsito
+ - ftosi
+ - fsqrt
+
+All the above pass softfloat tests with the following exceptions:
+
+- fadd/fsub shows some differences in the handling of +0 / -0 results
+  when input operands differ in signs.
+- the handling of underflow exceptions is slightly different.  If a
+  result underflows before rounding, but becomes a normalised number
+  after rounding, we do not signal an underflow exception.
+
+Other operations which have been tested by basic assembly-only tests
+are:
+
+ - fcpy
+ - fabs
+ - fneg
+ - ftoui
+ - ftosiz
+ - ftouiz
+
+The combination operations have not been tested:
+
+ - fmac
+ - fnmac
+ - fmsc
+ - fnmsc
+ - fnmul
diff --git a/Documentation/arm/vlocks.rst b/Documentation/arm/vlocks.rst
new file mode 100644
index 000000000000..a40a1742110b
--- /dev/null
+++ b/Documentation/arm/vlocks.rst
@@ -0,0 +1,212 @@
+======================================
+vlocks for Bare-Metal Mutual Exclusion
+======================================
+
+Voting Locks, or "vlocks" provide a simple low-level mutual exclusion
+mechanism, with reasonable but minimal requirements on the memory
+system.
+
+These are intended to be used to coordinate critical activity among CPUs
+which are otherwise non-coherent, in situations where the hardware
+provides no other mechanism to support this and ordinary spinlocks
+cannot be used.
+
+
+vlocks make use of the atomicity provided by the memory system for
+writes to a single memory location.  To arbitrate, every CPU "votes for
+itself", by storing a unique number to a common memory location.  The
+final value seen in that memory location when all the votes have been
+cast identifies the winner.
+
+In order to make sure that the election produces an unambiguous result
+in finite time, a CPU will only enter the election in the first place if
+no winner has been chosen and the election does not appear to have
+started yet.
+
+
+Algorithm
+---------
+
+The easiest way to explain the vlocks algorithm is with some pseudo-code::
+
+
+	int currently_voting[NR_CPUS] = { 0, };
+	int last_vote = -1; /* no votes yet */
+
+	bool vlock_trylock(int this_cpu)
+	{
+		/* signal our desire to vote */
+		currently_voting[this_cpu] = 1;
+		if (last_vote != -1) {
+			/* someone already volunteered himself */
+			currently_voting[this_cpu] = 0;
+			return false; /* not ourself */
+		}
+
+		/* let's suggest ourself */
+		last_vote = this_cpu;
+		currently_voting[this_cpu] = 0;
+
+		/* then wait until everyone else is done voting */
+		for_each_cpu(i) {
+			while (currently_voting[i] != 0)
+				/* wait */;
+		}
+
+		/* result */
+		if (last_vote == this_cpu)
+			return true; /* we won */
+		return false;
+	}
+
+	bool vlock_unlock(void)
+	{
+		last_vote = -1;
+	}
+
+
+The currently_voting[] array provides a way for the CPUs to determine
+whether an election is in progress, and plays a role analogous to the
+"entering" array in Lamport's bakery algorithm [1].
+
+However, once the election has started, the underlying memory system
+atomicity is used to pick the winner.  This avoids the need for a static
+priority rule to act as a tie-breaker, or any counters which could
+overflow.
+
+As long as the last_vote variable is globally visible to all CPUs, it
+will contain only one value that won't change once every CPU has cleared
+its currently_voting flag.
+
+
+Features and limitations
+------------------------
+
+ * vlocks are not intended to be fair.  In the contended case, it is the
+   _last_ CPU which attempts to get the lock which will be most likely
+   to win.
+
+   vlocks are therefore best suited to situations where it is necessary
+   to pick a unique winner, but it does not matter which CPU actually
+   wins.
+
+ * Like other similar mechanisms, vlocks will not scale well to a large
+   number of CPUs.
+
+   vlocks can be cascaded in a voting hierarchy to permit better scaling
+   if necessary, as in the following hypothetical example for 4096 CPUs::
+
+	/* first level: local election */
+	my_town = towns[(this_cpu >> 4) & 0xf];
+	I_won = vlock_trylock(my_town, this_cpu & 0xf);
+	if (I_won) {
+		/* we won the town election, let's go for the state */
+		my_state = states[(this_cpu >> 8) & 0xf];
+		I_won = vlock_lock(my_state, this_cpu & 0xf));
+		if (I_won) {
+			/* and so on */
+			I_won = vlock_lock(the_whole_country, this_cpu & 0xf];
+			if (I_won) {
+				/* ... */
+			}
+			vlock_unlock(the_whole_country);
+		}
+		vlock_unlock(my_state);
+	}
+	vlock_unlock(my_town);
+
+
+ARM implementation
+------------------
+
+The current ARM implementation [2] contains some optimisations beyond
+the basic algorithm:
+
+ * By packing the members of the currently_voting array close together,
+   we can read the whole array in one transaction (providing the number
+   of CPUs potentially contending the lock is small enough).  This
+   reduces the number of round-trips required to external memory.
+
+   In the ARM implementation, this means that we can use a single load
+   and comparison::
+
+	LDR	Rt, [Rn]
+	CMP	Rt, #0
+
+   ...in place of code equivalent to::
+
+	LDRB	Rt, [Rn]
+	CMP	Rt, #0
+	LDRBEQ	Rt, [Rn, #1]
+	CMPEQ	Rt, #0
+	LDRBEQ	Rt, [Rn, #2]
+	CMPEQ	Rt, #0
+	LDRBEQ	Rt, [Rn, #3]
+	CMPEQ	Rt, #0
+
+   This cuts down on the fast-path latency, as well as potentially
+   reducing bus contention in contended cases.
+
+   The optimisation relies on the fact that the ARM memory system
+   guarantees coherency between overlapping memory accesses of
+   different sizes, similarly to many other architectures.  Note that
+   we do not care which element of currently_voting appears in which
+   bits of Rt, so there is no need to worry about endianness in this
+   optimisation.
+
+   If there are too many CPUs to read the currently_voting array in
+   one transaction then multiple transations are still required.  The
+   implementation uses a simple loop of word-sized loads for this
+   case.  The number of transactions is still fewer than would be
+   required if bytes were loaded individually.
+
+
+   In principle, we could aggregate further by using LDRD or LDM, but
+   to keep the code simple this was not attempted in the initial
+   implementation.
+
+
+ * vlocks are currently only used to coordinate between CPUs which are
+   unable to enable their caches yet.  This means that the
+   implementation removes many of the barriers which would be required
+   when executing the algorithm in cached memory.
+
+   packing of the currently_voting array does not work with cached
+   memory unless all CPUs contending the lock are cache-coherent, due
+   to cache writebacks from one CPU clobbering values written by other
+   CPUs.  (Though if all the CPUs are cache-coherent, you should be
+   probably be using proper spinlocks instead anyway).
+
+
+ * The "no votes yet" value used for the last_vote variable is 0 (not
+   -1 as in the pseudocode).  This allows statically-allocated vlocks
+   to be implicitly initialised to an unlocked state simply by putting
+   them in .bss.
+
+   An offset is added to each CPU's ID for the purpose of setting this
+   variable, so that no CPU uses the value 0 for its ID.
+
+
+Colophon
+--------
+
+Originally created and documented by Dave Martin for Linaro Limited, for
+use in ARM-based big.LITTLE platforms, with review and input gratefully
+received from Nicolas Pitre and Achin Gupta.  Thanks to Nicolas for
+grabbing most of this text out of the relevant mail thread and writing
+up the pseudocode.
+
+Copyright (C) 2012-2013  Linaro Limited
+Distributed under the terms of Version 2 of the GNU General Public
+License, as defined in linux/COPYING.
+
+
+References
+----------
+
+[1] Lamport, L. "A New Solution of Dijkstra's Concurrent Programming
+    Problem", Communications of the ACM 17, 8 (August 1974), 453-455.
+
+    https://en.wikipedia.org/wiki/Lamport%27s_bakery_algorithm
+
+[2] linux/arch/arm/common/vlock.S, www.kernel.org.
diff --git a/Documentation/arm/vlocks.txt b/Documentation/arm/vlocks.txt
deleted file mode 100644
index 45731672c564..000000000000
--- a/Documentation/arm/vlocks.txt
+++ /dev/null
@@ -1,211 +0,0 @@
-vlocks for Bare-Metal Mutual Exclusion
-======================================
-
-Voting Locks, or "vlocks" provide a simple low-level mutual exclusion
-mechanism, with reasonable but minimal requirements on the memory
-system.
-
-These are intended to be used to coordinate critical activity among CPUs
-which are otherwise non-coherent, in situations where the hardware
-provides no other mechanism to support this and ordinary spinlocks
-cannot be used.
-
-
-vlocks make use of the atomicity provided by the memory system for
-writes to a single memory location.  To arbitrate, every CPU "votes for
-itself", by storing a unique number to a common memory location.  The
-final value seen in that memory location when all the votes have been
-cast identifies the winner.
-
-In order to make sure that the election produces an unambiguous result
-in finite time, a CPU will only enter the election in the first place if
-no winner has been chosen and the election does not appear to have
-started yet.
-
-
-Algorithm
----------
-
-The easiest way to explain the vlocks algorithm is with some pseudo-code:
-
-
-	int currently_voting[NR_CPUS] = { 0, };
-	int last_vote = -1; /* no votes yet */
-
-	bool vlock_trylock(int this_cpu)
-	{
-		/* signal our desire to vote */
-		currently_voting[this_cpu] = 1;
-		if (last_vote != -1) {
-			/* someone already volunteered himself */
-			currently_voting[this_cpu] = 0;
-			return false; /* not ourself */
-		}
-
-		/* let's suggest ourself */
-		last_vote = this_cpu;
-		currently_voting[this_cpu] = 0;
-
-		/* then wait until everyone else is done voting */
-		for_each_cpu(i) {
-			while (currently_voting[i] != 0)
-				/* wait */;
-		}
-
-		/* result */
-		if (last_vote == this_cpu)
-			return true; /* we won */
-		return false;
-	}
-
-	bool vlock_unlock(void)
-	{
-		last_vote = -1;
-	}
-
-
-The currently_voting[] array provides a way for the CPUs to determine
-whether an election is in progress, and plays a role analogous to the
-"entering" array in Lamport's bakery algorithm [1].
-
-However, once the election has started, the underlying memory system
-atomicity is used to pick the winner.  This avoids the need for a static
-priority rule to act as a tie-breaker, or any counters which could
-overflow.
-
-As long as the last_vote variable is globally visible to all CPUs, it
-will contain only one value that won't change once every CPU has cleared
-its currently_voting flag.
-
-
-Features and limitations
-------------------------
-
- * vlocks are not intended to be fair.  In the contended case, it is the
-   _last_ CPU which attempts to get the lock which will be most likely
-   to win.
-
-   vlocks are therefore best suited to situations where it is necessary
-   to pick a unique winner, but it does not matter which CPU actually
-   wins.
-
- * Like other similar mechanisms, vlocks will not scale well to a large
-   number of CPUs.
-
-   vlocks can be cascaded in a voting hierarchy to permit better scaling
-   if necessary, as in the following hypothetical example for 4096 CPUs:
-
-	/* first level: local election */
-	my_town = towns[(this_cpu >> 4) & 0xf];
-	I_won = vlock_trylock(my_town, this_cpu & 0xf);
-	if (I_won) {
-		/* we won the town election, let's go for the state */
-		my_state = states[(this_cpu >> 8) & 0xf];
-		I_won = vlock_lock(my_state, this_cpu & 0xf));
-		if (I_won) {
-			/* and so on */
-			I_won = vlock_lock(the_whole_country, this_cpu & 0xf];
-			if (I_won) {
-				/* ... */
-			}
-			vlock_unlock(the_whole_country);
-		}
-		vlock_unlock(my_state);
-	}
-	vlock_unlock(my_town);
-
-
-ARM implementation
-------------------
-
-The current ARM implementation [2] contains some optimisations beyond
-the basic algorithm:
-
- * By packing the members of the currently_voting array close together,
-   we can read the whole array in one transaction (providing the number
-   of CPUs potentially contending the lock is small enough).  This
-   reduces the number of round-trips required to external memory.
-
-   In the ARM implementation, this means that we can use a single load
-   and comparison:
-
-	LDR	Rt, [Rn]
-	CMP	Rt, #0
-
-   ...in place of code equivalent to:
-
-	LDRB	Rt, [Rn]
-	CMP	Rt, #0
-	LDRBEQ	Rt, [Rn, #1]
-	CMPEQ	Rt, #0
-	LDRBEQ	Rt, [Rn, #2]
-	CMPEQ	Rt, #0
-	LDRBEQ	Rt, [Rn, #3]
-	CMPEQ	Rt, #0
-
-   This cuts down on the fast-path latency, as well as potentially
-   reducing bus contention in contended cases.
-
-   The optimisation relies on the fact that the ARM memory system
-   guarantees coherency between overlapping memory accesses of
-   different sizes, similarly to many other architectures.  Note that
-   we do not care which element of currently_voting appears in which
-   bits of Rt, so there is no need to worry about endianness in this
-   optimisation.
-
-   If there are too many CPUs to read the currently_voting array in
-   one transaction then multiple transations are still required.  The
-   implementation uses a simple loop of word-sized loads for this
-   case.  The number of transactions is still fewer than would be
-   required if bytes were loaded individually.
-
-
-   In principle, we could aggregate further by using LDRD or LDM, but
-   to keep the code simple this was not attempted in the initial
-   implementation.
-
-
- * vlocks are currently only used to coordinate between CPUs which are
-   unable to enable their caches yet.  This means that the
-   implementation removes many of the barriers which would be required
-   when executing the algorithm in cached memory.
-
-   packing of the currently_voting array does not work with cached
-   memory unless all CPUs contending the lock are cache-coherent, due
-   to cache writebacks from one CPU clobbering values written by other
-   CPUs.  (Though if all the CPUs are cache-coherent, you should be
-   probably be using proper spinlocks instead anyway).
-
-
- * The "no votes yet" value used for the last_vote variable is 0 (not
-   -1 as in the pseudocode).  This allows statically-allocated vlocks
-   to be implicitly initialised to an unlocked state simply by putting
-   them in .bss.
-
-   An offset is added to each CPU's ID for the purpose of setting this
-   variable, so that no CPU uses the value 0 for its ID.
-
-
-Colophon
---------
-
-Originally created and documented by Dave Martin for Linaro Limited, for
-use in ARM-based big.LITTLE platforms, with review and input gratefully
-received from Nicolas Pitre and Achin Gupta.  Thanks to Nicolas for
-grabbing most of this text out of the relevant mail thread and writing
-up the pseudocode.
-
-Copyright (C) 2012-2013  Linaro Limited
-Distributed under the terms of Version 2 of the GNU General Public
-License, as defined in linux/COPYING.
-
-
-References
-----------
-
-[1] Lamport, L. "A New Solution of Dijkstra's Concurrent Programming
-    Problem", Communications of the ACM 17, 8 (August 1974), 453-455.
-
-    https://en.wikipedia.org/wiki/Lamport%27s_bakery_algorithm
-
-[2] linux/arch/arm/common/vlock.S, www.kernel.org.
diff --git a/Documentation/arm64/acpi_object_usage.rst b/Documentation/arm64/acpi_object_usage.rst
new file mode 100644
index 000000000000..d51b69dc624d
--- /dev/null
+++ b/Documentation/arm64/acpi_object_usage.rst
@@ -0,0 +1,738 @@
+===========
+ACPI Tables
+===========
+
+The expectations of individual ACPI tables are discussed in the list that
+follows.
+
+If a section number is used, it refers to a section number in the ACPI
+specification where the object is defined.  If "Signature Reserved" is used,
+the table signature (the first four bytes of the table) is the only portion
+of the table recognized by the specification, and the actual table is defined
+outside of the UEFI Forum (see Section 5.2.6 of the specification).
+
+For ACPI on arm64, tables also fall into the following categories:
+
+       -  Required: DSDT, FADT, GTDT, MADT, MCFG, RSDP, SPCR, XSDT
+
+       -  Recommended: BERT, EINJ, ERST, HEST, PCCT, SSDT
+
+       -  Optional: BGRT, CPEP, CSRT, DBG2, DRTM, ECDT, FACS, FPDT, IORT,
+          MCHI, MPST, MSCT, NFIT, PMTT, RASF, SBST, SLIT, SPMI, SRAT, STAO,
+	  TCPA, TPM2, UEFI, XENV
+
+       -  Not supported: BOOT, DBGP, DMAR, ETDT, HPET, IBFT, IVRS, LPIT,
+          MSDM, OEMx, PSDT, RSDT, SLIC, WAET, WDAT, WDRT, WPBT
+
+====== ========================================================================
+Table  Usage for ARMv8 Linux
+====== ========================================================================
+BERT   Section 18.3 (signature == "BERT")
+
+       **Boot Error Record Table**
+
+       Must be supplied if RAS support is provided by the platform.  It
+       is recommended this table be supplied.
+
+BOOT   Signature Reserved (signature == "BOOT")
+
+       **simple BOOT flag table**
+
+       Microsoft only table, will not be supported.
+
+BGRT   Section 5.2.22 (signature == "BGRT")
+
+       **Boot Graphics Resource Table**
+
+       Optional, not currently supported, with no real use-case for an
+       ARM server.
+
+CPEP   Section 5.2.18 (signature == "CPEP")
+
+       **Corrected Platform Error Polling table**
+
+       Optional, not currently supported, and not recommended until such
+       time as ARM-compatible hardware is available, and the specification
+       suitably modified.
+
+CSRT   Signature Reserved (signature == "CSRT")
+
+       **Core System Resources Table**
+
+       Optional, not currently supported.
+
+DBG2   Signature Reserved (signature == "DBG2")
+
+       **DeBuG port table 2**
+
+       License has changed and should be usable.  Optional if used instead
+       of earlycon=<device> on the command line.
+
+DBGP   Signature Reserved (signature == "DBGP")
+
+       **DeBuG Port table**
+
+       Microsoft only table, will not be supported.
+
+DSDT   Section 5.2.11.1 (signature == "DSDT")
+
+       **Differentiated System Description Table**
+
+       A DSDT is required; see also SSDT.
+
+       ACPI tables contain only one DSDT but can contain one or more SSDTs,
+       which are optional.  Each SSDT can only add to the ACPI namespace,
+       but cannot modify or replace anything in the DSDT.
+
+DMAR   Signature Reserved (signature == "DMAR")
+
+       **DMA Remapping table**
+
+       x86 only table, will not be supported.
+
+DRTM   Signature Reserved (signature == "DRTM")
+
+       **Dynamic Root of Trust for Measurement table**
+
+       Optional, not currently supported.
+
+ECDT   Section 5.2.16 (signature == "ECDT")
+
+       **Embedded Controller Description Table**
+
+       Optional, not currently supported, but could be used on ARM if and
+       only if one uses the GPE_BIT field to represent an IRQ number, since
+       there are no GPE blocks defined in hardware reduced mode.  This would
+       need to be modified in the ACPI specification.
+
+EINJ   Section 18.6 (signature == "EINJ")
+
+       **Error Injection table**
+
+       This table is very useful for testing platform response to error
+       conditions; it allows one to inject an error into the system as
+       if it had actually occurred.  However, this table should not be
+       shipped with a production system; it should be dynamically loaded
+       and executed with the ACPICA tools only during testing.
+
+ERST   Section 18.5 (signature == "ERST")
+
+       **Error Record Serialization Table**
+
+       On a platform supports RAS, this table must be supplied if it is not
+       UEFI-based; if it is UEFI-based, this table may be supplied. When this
+       table is not present, UEFI run time service will be utilized to save
+       and retrieve hardware error information to and from a persistent store.
+
+ETDT   Signature Reserved (signature == "ETDT")
+
+       **Event Timer Description Table**
+
+       Obsolete table, will not be supported.
+
+FACS   Section 5.2.10 (signature == "FACS")
+
+       **Firmware ACPI Control Structure**
+
+       It is unlikely that this table will be terribly useful.  If it is
+       provided, the Global Lock will NOT be used since it is not part of
+       the hardware reduced profile, and only 64-bit address fields will
+       be considered valid.
+
+FADT   Section 5.2.9 (signature == "FACP")
+
+       **Fixed ACPI Description Table**
+       Required for arm64.
+
+
+       The HW_REDUCED_ACPI flag must be set.  All of the fields that are
+       to be ignored when HW_REDUCED_ACPI is set are expected to be set to
+       zero.
+
+       If an FACS table is provided, the X_FIRMWARE_CTRL field is to be
+       used, not FIRMWARE_CTRL.
+
+       If PSCI is used (as is recommended), make sure that ARM_BOOT_ARCH is
+       filled in properly - that the PSCI_COMPLIANT flag is set and that
+       PSCI_USE_HVC is set or unset as needed (see table 5-37).
+
+       For the DSDT that is also required, the X_DSDT field is to be used,
+       not the DSDT field.
+
+FPDT   Section 5.2.23 (signature == "FPDT")
+
+       **Firmware Performance Data Table**
+
+       Optional, not currently supported.
+
+GTDT   Section 5.2.24 (signature == "GTDT")
+
+       **Generic Timer Description Table**
+
+       Required for arm64.
+
+HEST   Section 18.3.2 (signature == "HEST")
+
+       **Hardware Error Source Table**
+
+       ARM-specific error sources have been defined; please use those or the
+       PCI types such as type 6 (AER Root Port), 7 (AER Endpoint), or 8 (AER
+       Bridge), or use type 9 (Generic Hardware Error Source).  Firmware first
+       error handling is possible if and only if Trusted Firmware is being
+       used on arm64.
+
+       Must be supplied if RAS support is provided by the platform.  It
+       is recommended this table be supplied.
+
+HPET   Signature Reserved (signature == "HPET")
+
+       **High Precision Event timer Table**
+
+       x86 only table, will not be supported.
+
+IBFT   Signature Reserved (signature == "IBFT")
+
+       **iSCSI Boot Firmware Table**
+
+       Microsoft defined table, support TBD.
+
+IORT   Signature Reserved (signature == "IORT")
+
+       **Input Output Remapping Table**
+
+       arm64 only table, required in order to describe IO topology, SMMUs,
+       and GIC ITSs, and how those various components are connected together,
+       such as identifying which components are behind which SMMUs/ITSs.
+       This table will only be required on certain SBSA platforms (e.g.,
+       when using GICv3-ITS and an SMMU); on SBSA Level 0 platforms, it
+       remains optional.
+
+IVRS   Signature Reserved (signature == "IVRS")
+
+       **I/O Virtualization Reporting Structure**
+
+       x86_64 (AMD) only table, will not be supported.
+
+LPIT   Signature Reserved (signature == "LPIT")
+
+       **Low Power Idle Table**
+
+       x86 only table as of ACPI 5.1; starting with ACPI 6.0, processor
+       descriptions and power states on ARM platforms should use the DSDT
+       and define processor container devices (_HID ACPI0010, Section 8.4,
+       and more specifically 8.4.3 and and 8.4.4).
+
+MADT   Section 5.2.12 (signature == "APIC")
+
+       **Multiple APIC Description Table**
+
+       Required for arm64.  Only the GIC interrupt controller structures
+       should be used (types 0xA - 0xF).
+
+MCFG   Signature Reserved (signature == "MCFG")
+
+       **Memory-mapped ConFiGuration space**
+
+       If the platform supports PCI/PCIe, an MCFG table is required.
+
+MCHI   Signature Reserved (signature == "MCHI")
+
+       **Management Controller Host Interface table**
+
+       Optional, not currently supported.
+
+MPST   Section 5.2.21 (signature == "MPST")
+
+       **Memory Power State Table**
+
+       Optional, not currently supported.
+
+MSCT   Section 5.2.19 (signature == "MSCT")
+
+       **Maximum System Characteristic Table**
+
+       Optional, not currently supported.
+
+MSDM   Signature Reserved (signature == "MSDM")
+
+       **Microsoft Data Management table**
+
+       Microsoft only table, will not be supported.
+
+NFIT   Section 5.2.25 (signature == "NFIT")
+
+       **NVDIMM Firmware Interface Table**
+
+       Optional, not currently supported.
+
+OEMx   Signature of "OEMx" only
+
+       **OEM Specific Tables**
+
+       All tables starting with a signature of "OEM" are reserved for OEM
+       use.  Since these are not meant to be of general use but are limited
+       to very specific end users, they are not recommended for use and are
+       not supported by the kernel for arm64.
+
+PCCT   Section 14.1 (signature == "PCCT)
+
+       **Platform Communications Channel Table**
+
+       Recommend for use on arm64; use of PCC is recommended when using CPPC
+       to control performance and power for platform processors.
+
+PMTT   Section 5.2.21.12 (signature == "PMTT")
+
+       **Platform Memory Topology Table**
+
+       Optional, not currently supported.
+
+PSDT   Section 5.2.11.3 (signature == "PSDT")
+
+       **Persistent System Description Table**
+
+       Obsolete table, will not be supported.
+
+RASF   Section 5.2.20 (signature == "RASF")
+
+       **RAS Feature table**
+
+       Optional, not currently supported.
+
+RSDP   Section 5.2.5 (signature == "RSD PTR")
+
+       **Root System Description PoinTeR**
+
+       Required for arm64.
+
+RSDT   Section 5.2.7 (signature == "RSDT")
+
+       **Root System Description Table**
+
+       Since this table can only provide 32-bit addresses, it is deprecated
+       on arm64, and will not be used.  If provided, it will be ignored.
+
+SBST   Section 5.2.14 (signature == "SBST")
+
+       **Smart Battery Subsystem Table**
+
+       Optional, not currently supported.
+
+SLIC   Signature Reserved (signature == "SLIC")
+
+       **Software LIcensing table**
+
+       Microsoft only table, will not be supported.
+
+SLIT   Section 5.2.17 (signature == "SLIT")
+
+       **System Locality distance Information Table**
+
+       Optional in general, but required for NUMA systems.
+
+SPCR   Signature Reserved (signature == "SPCR")
+
+       **Serial Port Console Redirection table**
+
+       Required for arm64.
+
+SPMI   Signature Reserved (signature == "SPMI")
+
+       **Server Platform Management Interface table**
+
+       Optional, not currently supported.
+
+SRAT   Section 5.2.16 (signature == "SRAT")
+
+       **System Resource Affinity Table**
+
+       Optional, but if used, only the GICC Affinity structures are read.
+       To support arm64 NUMA, this table is required.
+
+SSDT   Section 5.2.11.2 (signature == "SSDT")
+
+       **Secondary System Description Table**
+
+       These tables are a continuation of the DSDT; these are recommended
+       for use with devices that can be added to a running system, but can
+       also serve the purpose of dividing up device descriptions into more
+       manageable pieces.
+
+       An SSDT can only ADD to the ACPI namespace.  It cannot modify or
+       replace existing device descriptions already in the namespace.
+
+       These tables are optional, however.  ACPI tables should contain only
+       one DSDT but can contain many SSDTs.
+
+STAO   Signature Reserved (signature == "STAO")
+
+       **_STA Override table**
+
+       Optional, but only necessary in virtualized environments in order to
+       hide devices from guest OSs.
+
+TCPA   Signature Reserved (signature == "TCPA")
+
+       **Trusted Computing Platform Alliance table**
+
+       Optional, not currently supported, and may need changes to fully
+       interoperate with arm64.
+
+TPM2   Signature Reserved (signature == "TPM2")
+
+       **Trusted Platform Module 2 table**
+
+       Optional, not currently supported, and may need changes to fully
+       interoperate with arm64.
+
+UEFI   Signature Reserved (signature == "UEFI")
+
+       **UEFI ACPI data table**
+
+       Optional, not currently supported.  No known use case for arm64,
+       at present.
+
+WAET   Signature Reserved (signature == "WAET")
+
+       **Windows ACPI Emulated devices Table**
+
+       Microsoft only table, will not be supported.
+
+WDAT   Signature Reserved (signature == "WDAT")
+
+       **Watch Dog Action Table**
+
+       Microsoft only table, will not be supported.
+
+WDRT   Signature Reserved (signature == "WDRT")
+
+       **Watch Dog Resource Table**
+
+       Microsoft only table, will not be supported.
+
+WPBT   Signature Reserved (signature == "WPBT")
+
+       **Windows Platform Binary Table**
+
+       Microsoft only table, will not be supported.
+
+XENV   Signature Reserved (signature == "XENV")
+
+       **Xen project table**
+
+       Optional, used only by Xen at present.
+
+XSDT   Section 5.2.8 (signature == "XSDT")
+
+       **eXtended System Description Table**
+
+       Required for arm64.
+====== ========================================================================
+
+ACPI Objects
+------------
+The expectations on individual ACPI objects that are likely to be used are
+shown in the list that follows; any object not explicitly mentioned below
+should be used as needed for a particular platform or particular subsystem,
+such as power management or PCI.
+
+===== ================ ========================================================
+Name   Section         Usage for ARMv8 Linux
+===== ================ ========================================================
+_CCA   6.2.17          This method must be defined for all bus masters
+                       on arm64 - there are no assumptions made about
+                       whether such devices are cache coherent or not.
+                       The _CCA value is inherited by all descendants of
+                       these devices so it does not need to be repeated.
+                       Without _CCA on arm64, the kernel does not know what
+                       to do about setting up DMA for the device.
+
+                       NB: this method provides default cache coherency
+                       attributes; the presence of an SMMU can be used to
+                       modify that, however.  For example, a master could
+                       default to non-coherent, but be made coherent with
+                       the appropriate SMMU configuration (see Table 17 of
+                       the IORT specification, ARM Document DEN 0049B).
+
+_CID   6.1.2           Use as needed, see also _HID.
+
+_CLS   6.1.3           Use as needed, see also _HID.
+
+_CPC   8.4.7.1         Use as needed, power management specific.  CPPC is
+                       recommended on arm64.
+
+_CRS   6.2.2           Required on arm64.
+
+_CSD   8.4.2.2         Use as needed, used only in conjunction with _CST.
+
+_CST   8.4.2.1         Low power idle states (8.4.4) are recommended instead
+                       of C-states.
+
+_DDN   6.1.4           This field can be used for a device name.  However,
+                       it is meant for DOS device names (e.g., COM1), so be
+                       careful of its use across OSes.
+
+_DSD   6.2.5           To be used with caution.  If this object is used, try
+                       to use it within the constraints already defined by the
+                       Device Properties UUID.  Only in rare circumstances
+                       should it be necessary to create a new _DSD UUID.
+
+                       In either case, submit the _DSD definition along with
+                       any driver patches for discussion, especially when
+                       device properties are used.  A driver will not be
+                       considered complete without a corresponding _DSD
+                       description.  Once approved by kernel maintainers,
+                       the UUID or device properties must then be registered
+                       with the UEFI Forum; this may cause some iteration as
+                       more than one OS will be registering entries.
+
+_DSM   9.1.1           Do not use this method.  It is not standardized, the
+                       return values are not well documented, and it is
+                       currently a frequent source of error.
+
+\_GL   5.7.1           This object is not to be used in hardware reduced
+                       mode, and therefore should not be used on arm64.
+
+_GLK   6.5.7           This object requires a global lock be defined; there
+                       is no global lock on arm64 since it runs in hardware
+                       reduced mode.  Hence, do not use this object on arm64.
+
+\_GPE  5.3.1           This namespace is for x86 use only.  Do not use it
+                       on arm64.
+
+_HID   6.1.5           This is the primary object to use in device probing,
+		       though _CID and _CLS may also be used.
+
+_INI   6.5.1           Not required, but can be useful in setting up devices
+                       when UEFI leaves them in a state that may not be what
+                       the driver expects before it starts probing.
+
+_LPI   8.4.4.3         Recommended for use with processor definitions (_HID
+		       ACPI0010) on arm64.  See also _RDI.
+
+_MLS   6.1.7           Highly recommended for use in internationalization.
+
+_OFF   7.2.2           It is recommended to define this method for any device
+                       that can be turned on or off.
+
+_ON    7.2.3           It is recommended to define this method for any device
+                       that can be turned on or off.
+
+\_OS   5.7.3           This method will return "Linux" by default (this is
+                       the value of the macro ACPI_OS_NAME on Linux).  The
+                       command line parameter acpi_os=<string> can be used
+                       to set it to some other value.
+
+_OSC   6.2.11          This method can be a global method in ACPI (i.e.,
+                       \_SB._OSC), or it may be associated with a specific
+                       device (e.g., \_SB.DEV0._OSC), or both.  When used
+                       as a global method, only capabilities published in
+                       the ACPI specification are allowed.  When used as
+                       a device-specific method, the process described for
+                       using _DSD MUST be used to create an _OSC definition;
+                       out-of-process use of _OSC is not allowed.  That is,
+                       submit the device-specific _OSC usage description as
+                       part of the kernel driver submission, get it approved
+                       by the kernel community, then register it with the
+                       UEFI Forum.
+
+\_OSI  5.7.2           Deprecated on ARM64.  As far as ACPI firmware is
+		       concerned, _OSI is not to be used to determine what
+		       sort of system is being used or what functionality
+		       is provided.  The _OSC method is to be used instead.
+
+_PDC   8.4.1           Deprecated, do not use on arm64.
+
+\_PIC  5.8.1           The method should not be used.  On arm64, the only
+                       interrupt model available is GIC.
+
+\_PR   5.3.1           This namespace is for x86 use only on legacy systems.
+                       Do not use it on arm64.
+
+_PRT   6.2.13          Required as part of the definition of all PCI root
+                       devices.
+
+_PRx   7.3.8-11        Use as needed; power management specific.  If _PR0 is
+                       defined, _PR3 must also be defined.
+
+_PSx   7.3.2-5         Use as needed; power management specific.  If _PS0 is
+                       defined, _PS3 must also be defined.  If clocks or
+                       regulators need adjusting to be consistent with power
+                       usage, change them in these methods.
+
+_RDI   8.4.4.4         Recommended for use with processor definitions (_HID
+		       ACPI0010) on arm64.  This should only be used in
+		       conjunction with _LPI.
+
+\_REV  5.7.4           Always returns the latest version of ACPI supported.
+
+\_SB   5.3.1           Required on arm64; all devices must be defined in this
+                       namespace.
+
+_SLI   6.2.15          Use is recommended when SLIT table is in use.
+
+_STA   6.3.7,          It is recommended to define this method for any device
+       7.2.4           that can be turned on or off.  See also the STAO table
+                       that provides overrides to hide devices in virtualized
+                       environments.
+
+_SRS   6.2.16          Use as needed; see also _PRS.
+
+_STR   6.1.10          Recommended for conveying device names to end users;
+                       this is preferred over using _DDN.
+
+_SUB   6.1.9           Use as needed; _HID or _CID are preferred.
+
+_SUN   6.1.11          Use as needed, but recommended.
+
+_SWS   7.4.3           Use as needed; power management specific; this may
+                       require specification changes for use on arm64.
+
+_UID   6.1.12          Recommended for distinguishing devices of the same
+                       class; define it if at all possible.
+===== ================ ========================================================
+
+
+
+
+ACPI Event Model
+----------------
+Do not use GPE block devices; these are not supported in the hardware reduced
+profile used by arm64.  Since there are no GPE blocks defined for use on ARM
+platforms, ACPI events must be signaled differently.
+
+There are two options: GPIO-signaled interrupts (Section 5.6.5), and
+interrupt-signaled events (Section 5.6.9).  Interrupt-signaled events are a
+new feature in the ACPI 6.1 specification.  Either - or both - can be used
+on a given platform, and which to use may be dependent of limitations in any
+given SoC.  If possible, interrupt-signaled events are recommended.
+
+
+ACPI Processor Control
+----------------------
+Section 8 of the ACPI specification changed significantly in version 6.0.
+Processors should now be defined as Device objects with _HID ACPI0007; do
+not use the deprecated Processor statement in ASL.  All multiprocessor systems
+should also define a hierarchy of processors, done with Processor Container
+Devices (see Section 8.4.3.1, _HID ACPI0010); do not use processor aggregator
+devices (Section 8.5) to describe processor topology.  Section 8.4 of the
+specification describes the semantics of these object definitions and how
+they interrelate.
+
+Most importantly, the processor hierarchy defined also defines the low power
+idle states that are available to the platform, along with the rules for
+determining which processors can be turned on or off and the circumstances
+that control that.  Without this information, the processors will run in
+whatever power state they were left in by UEFI.
+
+Note too, that the processor Device objects defined and the entries in the
+MADT for GICs are expected to be in synchronization.  The _UID of the Device
+object must correspond to processor IDs used in the MADT.
+
+It is recommended that CPPC (8.4.5) be used as the primary model for processor
+performance control on arm64.  C-states and P-states may become available at
+some point in the future, but most current design work appears to favor CPPC.
+
+Further, it is essential that the ARMv8 SoC provide a fully functional
+implementation of PSCI; this will be the only mechanism supported by ACPI
+to control CPU power state.  Booting of secondary CPUs using the ACPI
+parking protocol is possible, but discouraged, since only PSCI is supported
+for ARM servers.
+
+
+ACPI System Address Map Interfaces
+----------------------------------
+In Section 15 of the ACPI specification, several methods are mentioned as
+possible mechanisms for conveying memory resource information to the kernel.
+For arm64, we will only support UEFI for booting with ACPI, hence the UEFI
+GetMemoryMap() boot service is the only mechanism that will be used.
+
+
+ACPI Platform Error Interfaces (APEI)
+-------------------------------------
+The APEI tables supported are described above.
+
+APEI requires the equivalent of an SCI and an NMI on ARMv8.  The SCI is used
+to notify the OSPM of errors that have occurred but can be corrected and the
+system can continue correct operation, even if possibly degraded.  The NMI is
+used to indicate fatal errors that cannot be corrected, and require immediate
+attention.
+
+Since there is no direct equivalent of the x86 SCI or NMI, arm64 handles
+these slightly differently.  The SCI is handled as a high priority interrupt;
+given that these are corrected (or correctable) errors being reported, this
+is sufficient.  The NMI is emulated as the highest priority interrupt
+possible.  This implies some caution must be used since there could be
+interrupts at higher privilege levels or even interrupts at the same priority
+as the emulated NMI.  In Linux, this should not be the case but one should
+be aware it could happen.
+
+
+ACPI Objects Not Supported on ARM64
+-----------------------------------
+While this may change in the future, there are several classes of objects
+that can be defined, but are not currently of general interest to ARM servers.
+Some of these objects have x86 equivalents, and may actually make sense in ARM
+servers.  However, there is either no hardware available at present, or there
+may not even be a non-ARM implementation yet.  Hence, they are not currently
+supported.
+
+The following classes of objects are not supported:
+
+       -  Section 9.2: ambient light sensor devices
+
+       -  Section 9.3: battery devices
+
+       -  Section 9.4: lids (e.g., laptop lids)
+
+       -  Section 9.8.2: IDE controllers
+
+       -  Section 9.9: floppy controllers
+
+       -  Section 9.10: GPE block devices
+
+       -  Section 9.15: PC/AT RTC/CMOS devices
+
+       -  Section 9.16: user presence detection devices
+
+       -  Section 9.17: I/O APIC devices; all GICs must be enumerable via MADT
+
+       -  Section 9.18: time and alarm devices (see 9.15)
+
+       -  Section 10: power source and power meter devices
+
+       -  Section 11: thermal management
+
+       -  Section 12: embedded controllers interface
+
+       -  Section 13: SMBus interfaces
+
+
+This also means that there is no support for the following objects:
+
+====   =========================== ====   ==========
+Name   Section                     Name   Section
+====   =========================== ====   ==========
+_ALC   9.3.4                       _FDM   9.10.3
+_ALI   9.3.2                       _FIX   6.2.7
+_ALP   9.3.6                       _GAI   10.4.5
+_ALR   9.3.5                       _GHL   10.4.7
+_ALT   9.3.3                       _GTM   9.9.2.1.1
+_BCT   10.2.2.10                   _LID   9.5.1
+_BDN   6.5.3                       _PAI   10.4.4
+_BIF   10.2.2.1                    _PCL   10.3.2
+_BIX   10.2.2.1                    _PIF   10.3.3
+_BLT   9.2.3                       _PMC   10.4.1
+_BMA   10.2.2.4                    _PMD   10.4.8
+_BMC   10.2.2.12                   _PMM   10.4.3
+_BMD   10.2.2.11                   _PRL   10.3.4
+_BMS   10.2.2.5                    _PSR   10.3.1
+_BST   10.2.2.6                    _PTP   10.4.2
+_BTH   10.2.2.7                    _SBS   10.1.3
+_BTM   10.2.2.9                    _SHL   10.4.6
+_BTP   10.2.2.8                    _STM   9.9.2.1.1
+_DCK   6.5.2                       _UPD   9.16.1
+_EC    12.12                       _UPP   9.16.2
+_FDE   9.10.1                      _WPC   10.5.2
+_FDI   9.10.2                      _WPP   10.5.3
+====   =========================== ====   ==========
diff --git a/Documentation/arm64/acpi_object_usage.txt b/Documentation/arm64/acpi_object_usage.txt
deleted file mode 100644
index c77010c5c1f0..000000000000
--- a/Documentation/arm64/acpi_object_usage.txt
+++ /dev/null
@@ -1,622 +0,0 @@
-ACPI Tables
------------
-The expectations of individual ACPI tables are discussed in the list that
-follows.
-
-If a section number is used, it refers to a section number in the ACPI
-specification where the object is defined.  If "Signature Reserved" is used,
-the table signature (the first four bytes of the table) is the only portion
-of the table recognized by the specification, and the actual table is defined
-outside of the UEFI Forum (see Section 5.2.6 of the specification).
-
-For ACPI on arm64, tables also fall into the following categories:
-
-       -- Required: DSDT, FADT, GTDT, MADT, MCFG, RSDP, SPCR, XSDT
-
-       -- Recommended: BERT, EINJ, ERST, HEST, PCCT, SSDT
-
-       -- Optional: BGRT, CPEP, CSRT, DBG2, DRTM, ECDT, FACS, FPDT, IORT,
-          MCHI, MPST, MSCT, NFIT, PMTT, RASF, SBST, SLIT, SPMI, SRAT, STAO,
-	  TCPA, TPM2, UEFI, XENV
-
-       -- Not supported: BOOT, DBGP, DMAR, ETDT, HPET, IBFT, IVRS, LPIT,
-          MSDM, OEMx, PSDT, RSDT, SLIC, WAET, WDAT, WDRT, WPBT
-
-Table  Usage for ARMv8 Linux
------  ----------------------------------------------------------------
-BERT   Section 18.3 (signature == "BERT")
-       == Boot Error Record Table ==
-       Must be supplied if RAS support is provided by the platform.  It
-       is recommended this table be supplied.
-
-BOOT   Signature Reserved (signature == "BOOT")
-       == simple BOOT flag table ==
-       Microsoft only table, will not be supported.
-
-BGRT   Section 5.2.22 (signature == "BGRT")
-       == Boot Graphics Resource Table ==
-       Optional, not currently supported, with no real use-case for an
-       ARM server.
-
-CPEP   Section 5.2.18 (signature == "CPEP")
-       == Corrected Platform Error Polling table ==
-       Optional, not currently supported, and not recommended until such
-       time as ARM-compatible hardware is available, and the specification
-       suitably modified.
-
-CSRT   Signature Reserved (signature == "CSRT")
-       == Core System Resources Table ==
-       Optional, not currently supported.
-
-DBG2   Signature Reserved (signature == "DBG2")
-       == DeBuG port table 2 ==
-       License has changed and should be usable.  Optional if used instead
-       of earlycon=<device> on the command line.
-
-DBGP   Signature Reserved (signature == "DBGP")
-       == DeBuG Port table ==
-       Microsoft only table, will not be supported.
-
-DSDT   Section 5.2.11.1 (signature == "DSDT")
-       == Differentiated System Description Table ==
-       A DSDT is required; see also SSDT.
-
-       ACPI tables contain only one DSDT but can contain one or more SSDTs,
-       which are optional.  Each SSDT can only add to the ACPI namespace,
-       but cannot modify or replace anything in the DSDT.
-
-DMAR   Signature Reserved (signature == "DMAR")
-       == DMA Remapping table ==
-       x86 only table, will not be supported.
-
-DRTM   Signature Reserved (signature == "DRTM")
-       == Dynamic Root of Trust for Measurement table ==
-       Optional, not currently supported.
-
-ECDT   Section 5.2.16 (signature == "ECDT")
-       == Embedded Controller Description Table ==
-       Optional, not currently supported, but could be used on ARM if and
-       only if one uses the GPE_BIT field to represent an IRQ number, since
-       there are no GPE blocks defined in hardware reduced mode.  This would
-       need to be modified in the ACPI specification.
-
-EINJ   Section 18.6 (signature == "EINJ")
-       == Error Injection table ==
-       This table is very useful for testing platform response to error
-       conditions; it allows one to inject an error into the system as
-       if it had actually occurred.  However, this table should not be
-       shipped with a production system; it should be dynamically loaded
-       and executed with the ACPICA tools only during testing.
-
-ERST   Section 18.5 (signature == "ERST")
-       == Error Record Serialization Table ==
-       On a platform supports RAS, this table must be supplied if it is not
-       UEFI-based; if it is UEFI-based, this table may be supplied. When this
-       table is not present, UEFI run time service will be utilized to save
-       and retrieve hardware error information to and from a persistent store.
-
-ETDT   Signature Reserved (signature == "ETDT")
-       == Event Timer Description Table ==
-       Obsolete table, will not be supported.
-
-FACS   Section 5.2.10 (signature == "FACS")
-       == Firmware ACPI Control Structure ==
-       It is unlikely that this table will be terribly useful.  If it is
-       provided, the Global Lock will NOT be used since it is not part of
-       the hardware reduced profile, and only 64-bit address fields will
-       be considered valid.
-
-FADT   Section 5.2.9 (signature == "FACP")
-       == Fixed ACPI Description Table ==
-       Required for arm64.
-
-       The HW_REDUCED_ACPI flag must be set.  All of the fields that are
-       to be ignored when HW_REDUCED_ACPI is set are expected to be set to
-       zero.
-
-       If an FACS table is provided, the X_FIRMWARE_CTRL field is to be
-       used, not FIRMWARE_CTRL.
-
-       If PSCI is used (as is recommended), make sure that ARM_BOOT_ARCH is
-       filled in properly -- that the PSCI_COMPLIANT flag is set and that
-       PSCI_USE_HVC is set or unset as needed (see table 5-37).
-
-       For the DSDT that is also required, the X_DSDT field is to be used,
-       not the DSDT field.
-
-FPDT   Section 5.2.23 (signature == "FPDT")
-       == Firmware Performance Data Table ==
-       Optional, not currently supported.
-
-GTDT   Section 5.2.24 (signature == "GTDT")
-       == Generic Timer Description Table ==
-       Required for arm64.
-
-HEST   Section 18.3.2 (signature == "HEST")
-       == Hardware Error Source Table ==
-       ARM-specific error sources have been defined; please use those or the
-       PCI types such as type 6 (AER Root Port), 7 (AER Endpoint), or 8 (AER
-       Bridge), or use type 9 (Generic Hardware Error Source).  Firmware first
-       error handling is possible if and only if Trusted Firmware is being
-       used on arm64.
-
-       Must be supplied if RAS support is provided by the platform.  It
-       is recommended this table be supplied.
-
-HPET   Signature Reserved (signature == "HPET")
-       == High Precision Event timer Table ==
-       x86 only table, will not be supported.
-
-IBFT   Signature Reserved (signature == "IBFT")
-       == iSCSI Boot Firmware Table ==
-       Microsoft defined table, support TBD.
-
-IORT   Signature Reserved (signature == "IORT")
-       == Input Output Remapping Table ==
-       arm64 only table, required in order to describe IO topology, SMMUs,
-       and GIC ITSs, and how those various components are connected together,
-       such as identifying which components are behind which SMMUs/ITSs.
-       This table will only be required on certain SBSA platforms (e.g.,
-       when using GICv3-ITS and an SMMU); on SBSA Level 0 platforms, it 
-       remains optional.
-
-IVRS   Signature Reserved (signature == "IVRS")
-       == I/O Virtualization Reporting Structure ==
-       x86_64 (AMD) only table, will not be supported.
-
-LPIT   Signature Reserved (signature == "LPIT")
-       == Low Power Idle Table ==
-       x86 only table as of ACPI 5.1; starting with ACPI 6.0, processor
-       descriptions and power states on ARM platforms should use the DSDT
-       and define processor container devices (_HID ACPI0010, Section 8.4,
-       and more specifically 8.4.3 and and 8.4.4).
-
-MADT   Section 5.2.12 (signature == "APIC")
-       == Multiple APIC Description Table ==
-       Required for arm64.  Only the GIC interrupt controller structures
-       should be used (types 0xA - 0xF).
-
-MCFG   Signature Reserved (signature == "MCFG")
-       == Memory-mapped ConFiGuration space ==
-       If the platform supports PCI/PCIe, an MCFG table is required.
-
-MCHI   Signature Reserved (signature == "MCHI")
-       == Management Controller Host Interface table ==
-       Optional, not currently supported.
-
-MPST   Section 5.2.21 (signature == "MPST")
-       == Memory Power State Table ==
-       Optional, not currently supported.
-
-MSCT   Section 5.2.19 (signature == "MSCT")
-       == Maximum System Characteristic Table ==
-       Optional, not currently supported.
-
-MSDM   Signature Reserved (signature == "MSDM")
-       == Microsoft Data Management table ==
-       Microsoft only table, will not be supported.
-
-NFIT   Section 5.2.25 (signature == "NFIT")
-       == NVDIMM Firmware Interface Table ==
-       Optional, not currently supported.
-
-OEMx   Signature of "OEMx" only
-       == OEM Specific Tables ==
-       All tables starting with a signature of "OEM" are reserved for OEM
-       use.  Since these are not meant to be of general use but are limited
-       to very specific end users, they are not recommended for use and are
-       not supported by the kernel for arm64.
-
-PCCT   Section 14.1 (signature == "PCCT)
-       == Platform Communications Channel Table ==
-       Recommend for use on arm64; use of PCC is recommended when using CPPC
-       to control performance and power for platform processors.
-
-PMTT   Section 5.2.21.12 (signature == "PMTT")
-       == Platform Memory Topology Table ==
-       Optional, not currently supported.
-
-PSDT   Section 5.2.11.3 (signature == "PSDT")
-       == Persistent System Description Table ==
-       Obsolete table, will not be supported.
-
-RASF   Section 5.2.20 (signature == "RASF")
-       == RAS Feature table ==
-       Optional, not currently supported.
-
-RSDP   Section 5.2.5 (signature == "RSD PTR")
-       == Root System Description PoinTeR ==
-       Required for arm64.
-
-RSDT   Section 5.2.7 (signature == "RSDT")
-       == Root System Description Table ==
-       Since this table can only provide 32-bit addresses, it is deprecated
-       on arm64, and will not be used.  If provided, it will be ignored.
-
-SBST   Section 5.2.14 (signature == "SBST")
-       == Smart Battery Subsystem Table ==
-       Optional, not currently supported.
-
-SLIC   Signature Reserved (signature == "SLIC")
-       == Software LIcensing table ==
-       Microsoft only table, will not be supported.
-
-SLIT   Section 5.2.17 (signature == "SLIT")
-       == System Locality distance Information Table ==
-       Optional in general, but required for NUMA systems.
-
-SPCR   Signature Reserved (signature == "SPCR")
-       == Serial Port Console Redirection table ==
-       Required for arm64.
-
-SPMI   Signature Reserved (signature == "SPMI")
-       == Server Platform Management Interface table ==
-       Optional, not currently supported.
-
-SRAT   Section 5.2.16 (signature == "SRAT")
-       == System Resource Affinity Table ==
-       Optional, but if used, only the GICC Affinity structures are read.
-       To support arm64 NUMA, this table is required.
-
-SSDT   Section 5.2.11.2 (signature == "SSDT")
-       == Secondary System Description Table ==
-       These tables are a continuation of the DSDT; these are recommended
-       for use with devices that can be added to a running system, but can
-       also serve the purpose of dividing up device descriptions into more
-       manageable pieces.
-
-       An SSDT can only ADD to the ACPI namespace.  It cannot modify or
-       replace existing device descriptions already in the namespace.
-
-       These tables are optional, however.  ACPI tables should contain only
-       one DSDT but can contain many SSDTs.
-
-STAO   Signature Reserved (signature == "STAO")
-       == _STA Override table ==
-       Optional, but only necessary in virtualized environments in order to
-       hide devices from guest OSs.
-
-TCPA   Signature Reserved (signature == "TCPA")
-       == Trusted Computing Platform Alliance table ==
-       Optional, not currently supported, and may need changes to fully
-       interoperate with arm64.
-
-TPM2   Signature Reserved (signature == "TPM2")
-       == Trusted Platform Module 2 table ==
-       Optional, not currently supported, and may need changes to fully
-       interoperate with arm64.
-
-UEFI   Signature Reserved (signature == "UEFI")
-       == UEFI ACPI data table ==
-       Optional, not currently supported.  No known use case for arm64,
-       at present.
-
-WAET   Signature Reserved (signature == "WAET")
-       == Windows ACPI Emulated devices Table ==
-       Microsoft only table, will not be supported.
-
-WDAT   Signature Reserved (signature == "WDAT")
-       == Watch Dog Action Table ==
-       Microsoft only table, will not be supported.
-
-WDRT   Signature Reserved (signature == "WDRT")
-       == Watch Dog Resource Table ==
-       Microsoft only table, will not be supported.
-
-WPBT   Signature Reserved (signature == "WPBT")
-       == Windows Platform Binary Table ==
-       Microsoft only table, will not be supported.
-
-XENV   Signature Reserved (signature == "XENV")
-       == Xen project table ==
-       Optional, used only by Xen at present.
-
-XSDT   Section 5.2.8 (signature == "XSDT")
-       == eXtended System Description Table ==
-       Required for arm64.
-
-
-ACPI Objects
-------------
-The expectations on individual ACPI objects that are likely to be used are
-shown in the list that follows; any object not explicitly mentioned below
-should be used as needed for a particular platform or particular subsystem,
-such as power management or PCI.
-
-Name   Section         Usage for ARMv8 Linux
-----   ------------    -------------------------------------------------
-_CCA   6.2.17          This method must be defined for all bus masters
-                       on arm64 -- there are no assumptions made about
-                       whether such devices are cache coherent or not.
-                       The _CCA value is inherited by all descendants of
-                       these devices so it does not need to be repeated.
-                       Without _CCA on arm64, the kernel does not know what
-                       to do about setting up DMA for the device.
-
-                       NB: this method provides default cache coherency
-                       attributes; the presence of an SMMU can be used to
-                       modify that, however.  For example, a master could
-                       default to non-coherent, but be made coherent with
-                       the appropriate SMMU configuration (see Table 17 of
-                       the IORT specification, ARM Document DEN 0049B).
-
-_CID   6.1.2           Use as needed, see also _HID.
-
-_CLS   6.1.3           Use as needed, see also _HID.
-
-_CPC   8.4.7.1         Use as needed, power management specific.  CPPC is
-                       recommended on arm64.
-
-_CRS   6.2.2           Required on arm64.
-
-_CSD   8.4.2.2         Use as needed, used only in conjunction with _CST.
-
-_CST   8.4.2.1         Low power idle states (8.4.4) are recommended instead
-                       of C-states.
-
-_DDN   6.1.4           This field can be used for a device name.  However,
-                       it is meant for DOS device names (e.g., COM1), so be
-                       careful of its use across OSes.
-
-_DSD   6.2.5           To be used with caution.  If this object is used, try
-                       to use it within the constraints already defined by the
-                       Device Properties UUID.  Only in rare circumstances
-                       should it be necessary to create a new _DSD UUID.
-
-                       In either case, submit the _DSD definition along with
-                       any driver patches for discussion, especially when
-                       device properties are used.  A driver will not be
-                       considered complete without a corresponding _DSD
-                       description.  Once approved by kernel maintainers,
-                       the UUID or device properties must then be registered
-                       with the UEFI Forum; this may cause some iteration as
-                       more than one OS will be registering entries.
-
-_DSM   9.1.1           Do not use this method.  It is not standardized, the
-                       return values are not well documented, and it is
-                       currently a frequent source of error.
-
-\_GL   5.7.1           This object is not to be used in hardware reduced
-                       mode, and therefore should not be used on arm64.
-
-_GLK   6.5.7           This object requires a global lock be defined; there
-                       is no global lock on arm64 since it runs in hardware
-                       reduced mode.  Hence, do not use this object on arm64.
-
-\_GPE  5.3.1           This namespace is for x86 use only.  Do not use it
-                       on arm64.
-
-_HID   6.1.5           This is the primary object to use in device probing,
-		       though _CID and _CLS may also be used.
-
-_INI   6.5.1           Not required, but can be useful in setting up devices
-                       when UEFI leaves them in a state that may not be what
-                       the driver expects before it starts probing.
-
-_LPI   8.4.4.3         Recommended for use with processor definitions (_HID
-		       ACPI0010) on arm64.  See also _RDI.
-
-_MLS   6.1.7           Highly recommended for use in internationalization.
-
-_OFF   7.2.2           It is recommended to define this method for any device
-                       that can be turned on or off.
-
-_ON    7.2.3           It is recommended to define this method for any device
-                       that can be turned on or off.
-
-\_OS   5.7.3           This method will return "Linux" by default (this is
-                       the value of the macro ACPI_OS_NAME on Linux).  The
-                       command line parameter acpi_os=<string> can be used
-                       to set it to some other value.
-
-_OSC   6.2.11          This method can be a global method in ACPI (i.e.,
-                       \_SB._OSC), or it may be associated with a specific
-                       device (e.g., \_SB.DEV0._OSC), or both.  When used
-                       as a global method, only capabilities published in
-                       the ACPI specification are allowed.  When used as
-                       a device-specific method, the process described for
-                       using _DSD MUST be used to create an _OSC definition;
-                       out-of-process use of _OSC is not allowed.  That is,
-                       submit the device-specific _OSC usage description as
-                       part of the kernel driver submission, get it approved
-                       by the kernel community, then register it with the
-                       UEFI Forum.
-
-\_OSI  5.7.2           Deprecated on ARM64.  As far as ACPI firmware is 
-		       concerned, _OSI is not to be used to determine what 
-		       sort of system is being used or what functionality
-		       is provided.  The _OSC method is to be used instead.
-
-_PDC   8.4.1           Deprecated, do not use on arm64.
-
-\_PIC  5.8.1           The method should not be used.  On arm64, the only
-                       interrupt model available is GIC.
-
-\_PR   5.3.1           This namespace is for x86 use only on legacy systems.
-                       Do not use it on arm64.
-
-_PRT   6.2.13          Required as part of the definition of all PCI root
-                       devices.
-
-_PRx   7.3.8-11        Use as needed; power management specific.  If _PR0 is
-                       defined, _PR3 must also be defined.
-
-_PSx   7.3.2-5         Use as needed; power management specific.  If _PS0 is
-                       defined, _PS3 must also be defined.  If clocks or
-                       regulators need adjusting to be consistent with power
-                       usage, change them in these methods.
-
-_RDI   8.4.4.4         Recommended for use with processor definitions (_HID
-		       ACPI0010) on arm64.  This should only be used in 
-		       conjunction with _LPI.
-
-\_REV  5.7.4           Always returns the latest version of ACPI supported.
-
-\_SB   5.3.1           Required on arm64; all devices must be defined in this
-                       namespace.
-
-_SLI   6.2.15          Use is recommended when SLIT table is in use.
-
-_STA   6.3.7,          It is recommended to define this method for any device
-       7.2.4           that can be turned on or off.  See also the STAO table
-                       that provides overrides to hide devices in virtualized
-                       environments.
-
-_SRS   6.2.16          Use as needed; see also _PRS.
-
-_STR   6.1.10          Recommended for conveying device names to end users;
-                       this is preferred over using _DDN.
-
-_SUB   6.1.9           Use as needed; _HID or _CID are preferred.
-
-_SUN   6.1.11          Use as needed, but recommended.
-
-_SWS   7.4.3           Use as needed; power management specific; this may
-                       require specification changes for use on arm64.
-
-_UID   6.1.12          Recommended for distinguishing devices of the same
-                       class; define it if at all possible.
-
-
-
-
-ACPI Event Model
-----------------
-Do not use GPE block devices; these are not supported in the hardware reduced
-profile used by arm64.  Since there are no GPE blocks defined for use on ARM
-platforms, ACPI events must be signaled differently.
-
-There are two options: GPIO-signaled interrupts (Section 5.6.5), and
-interrupt-signaled events (Section 5.6.9).  Interrupt-signaled events are a
-new feature in the ACPI 6.1 specification.  Either -- or both -- can be used
-on a given platform, and which to use may be dependent of limitations in any
-given SoC.  If possible, interrupt-signaled events are recommended.
-
-
-ACPI Processor Control
-----------------------
-Section 8 of the ACPI specification changed significantly in version 6.0.
-Processors should now be defined as Device objects with _HID ACPI0007; do
-not use the deprecated Processor statement in ASL.  All multiprocessor systems
-should also define a hierarchy of processors, done with Processor Container
-Devices (see Section 8.4.3.1, _HID ACPI0010); do not use processor aggregator
-devices (Section 8.5) to describe processor topology.  Section 8.4 of the
-specification describes the semantics of these object definitions and how
-they interrelate.
-
-Most importantly, the processor hierarchy defined also defines the low power
-idle states that are available to the platform, along with the rules for
-determining which processors can be turned on or off and the circumstances
-that control that.  Without this information, the processors will run in
-whatever power state they were left in by UEFI.
-
-Note too, that the processor Device objects defined and the entries in the
-MADT for GICs are expected to be in synchronization.  The _UID of the Device
-object must correspond to processor IDs used in the MADT.
-
-It is recommended that CPPC (8.4.5) be used as the primary model for processor
-performance control on arm64.  C-states and P-states may become available at
-some point in the future, but most current design work appears to favor CPPC.
-
-Further, it is essential that the ARMv8 SoC provide a fully functional
-implementation of PSCI; this will be the only mechanism supported by ACPI
-to control CPU power state.  Booting of secondary CPUs using the ACPI
-parking protocol is possible, but discouraged, since only PSCI is supported
-for ARM servers.
-
-
-ACPI System Address Map Interfaces
-----------------------------------
-In Section 15 of the ACPI specification, several methods are mentioned as
-possible mechanisms for conveying memory resource information to the kernel.
-For arm64, we will only support UEFI for booting with ACPI, hence the UEFI
-GetMemoryMap() boot service is the only mechanism that will be used.
-
-
-ACPI Platform Error Interfaces (APEI)
--------------------------------------
-The APEI tables supported are described above.
-
-APEI requires the equivalent of an SCI and an NMI on ARMv8.  The SCI is used
-to notify the OSPM of errors that have occurred but can be corrected and the
-system can continue correct operation, even if possibly degraded.  The NMI is
-used to indicate fatal errors that cannot be corrected, and require immediate
-attention.
-
-Since there is no direct equivalent of the x86 SCI or NMI, arm64 handles
-these slightly differently.  The SCI is handled as a high priority interrupt;
-given that these are corrected (or correctable) errors being reported, this
-is sufficient.  The NMI is emulated as the highest priority interrupt
-possible.  This implies some caution must be used since there could be
-interrupts at higher privilege levels or even interrupts at the same priority
-as the emulated NMI.  In Linux, this should not be the case but one should
-be aware it could happen.
-
-
-ACPI Objects Not Supported on ARM64
------------------------------------
-While this may change in the future, there are several classes of objects
-that can be defined, but are not currently of general interest to ARM servers.
-Some of these objects have x86 equivalents, and may actually make sense in ARM
-servers.  However, there is either no hardware available at present, or there
-may not even be a non-ARM implementation yet.  Hence, they are not currently
-supported.
-
-The following classes of objects are not supported:
-
-       -- Section 9.2: ambient light sensor devices
-
-       -- Section 9.3: battery devices
-
-       -- Section 9.4: lids (e.g., laptop lids)
-
-       -- Section 9.8.2: IDE controllers
-
-       -- Section 9.9: floppy controllers
-
-       -- Section 9.10: GPE block devices
-
-       -- Section 9.15: PC/AT RTC/CMOS devices
-
-       -- Section 9.16: user presence detection devices
-
-       -- Section 9.17: I/O APIC devices; all GICs must be enumerable via MADT
-
-       -- Section 9.18: time and alarm devices (see 9.15)
-
-       -- Section 10: power source and power meter devices
-
-       -- Section 11: thermal management
-
-       -- Section 12: embedded controllers interface
-
-       -- Section 13: SMBus interfaces
-
-
-This also means that there is no support for the following objects:
-
-Name   Section                     Name   Section
-----   ------------                ----   ------------
-_ALC   9.3.4                       _FDM   9.10.3
-_ALI   9.3.2                       _FIX   6.2.7
-_ALP   9.3.6                       _GAI   10.4.5
-_ALR   9.3.5                       _GHL   10.4.7
-_ALT   9.3.3                       _GTM   9.9.2.1.1
-_BCT   10.2.2.10                   _LID   9.5.1
-_BDN   6.5.3                       _PAI   10.4.4
-_BIF   10.2.2.1                    _PCL   10.3.2
-_BIX   10.2.2.1                    _PIF   10.3.3
-_BLT   9.2.3                       _PMC   10.4.1
-_BMA   10.2.2.4                    _PMD   10.4.8
-_BMC   10.2.2.12                   _PMM   10.4.3
-_BMD   10.2.2.11                   _PRL   10.3.4
-_BMS   10.2.2.5                    _PSR   10.3.1
-_BST   10.2.2.6                    _PTP   10.4.2
-_BTH   10.2.2.7                    _SBS   10.1.3
-_BTM   10.2.2.9                    _SHL   10.4.6
-_BTP   10.2.2.8                    _STM   9.9.2.1.1
-_DCK   6.5.2                       _UPD   9.16.1
-_EC    12.12                       _UPP   9.16.2
-_FDE   9.10.1                      _WPC   10.5.2
-_FDI   9.10.2                      _WPP   10.5.3
-
diff --git a/Documentation/arm64/arm-acpi.rst b/Documentation/arm64/arm-acpi.rst
new file mode 100644
index 000000000000..872dbbc73d4a
--- /dev/null
+++ b/Documentation/arm64/arm-acpi.rst
@@ -0,0 +1,528 @@
+=====================
+ACPI on ARMv8 Servers
+=====================
+
+ACPI can be used for ARMv8 general purpose servers designed to follow
+the ARM SBSA (Server Base System Architecture) [0] and SBBR (Server
+Base Boot Requirements) [1] specifications.  Please note that the SBBR
+can be retrieved simply by visiting [1], but the SBSA is currently only
+available to those with an ARM login due to ARM IP licensing concerns.
+
+The ARMv8 kernel implements the reduced hardware model of ACPI version
+5.1 or later.  Links to the specification and all external documents
+it refers to are managed by the UEFI Forum.  The specification is
+available at http://www.uefi.org/specifications and documents referenced
+by the specification can be found via http://www.uefi.org/acpi.
+
+If an ARMv8 system does not meet the requirements of the SBSA and SBBR,
+or cannot be described using the mechanisms defined in the required ACPI
+specifications, then ACPI may not be a good fit for the hardware.
+
+While the documents mentioned above set out the requirements for building
+industry-standard ARMv8 servers, they also apply to more than one operating
+system.  The purpose of this document is to describe the interaction between
+ACPI and Linux only, on an ARMv8 system -- that is, what Linux expects of
+ACPI and what ACPI can expect of Linux.
+
+
+Why ACPI on ARM?
+----------------
+Before examining the details of the interface between ACPI and Linux, it is
+useful to understand why ACPI is being used.  Several technologies already
+exist in Linux for describing non-enumerable hardware, after all.  In this
+section we summarize a blog post [2] from Grant Likely that outlines the
+reasoning behind ACPI on ARMv8 servers.  Actually, we snitch a good portion
+of the summary text almost directly, to be honest.
+
+The short form of the rationale for ACPI on ARM is:
+
+-  ACPI’s byte code (AML) allows the platform to encode hardware behavior,
+   while DT explicitly does not support this.  For hardware vendors, being
+   able to encode behavior is a key tool used in supporting operating
+   system releases on new hardware.
+
+-  ACPI’s OSPM defines a power management model that constrains what the
+   platform is allowed to do into a specific model, while still providing
+   flexibility in hardware design.
+
+-  In the enterprise server environment, ACPI has established bindings (such
+   as for RAS) which are currently used in production systems.  DT does not.
+   Such bindings could be defined in DT at some point, but doing so means ARM
+   and x86 would end up using completely different code paths in both firmware
+   and the kernel.
+
+-  Choosing a single interface to describe the abstraction between a platform
+   and an OS is important.  Hardware vendors would not be required to implement
+   both DT and ACPI if they want to support multiple operating systems.  And,
+   agreeing on a single interface instead of being fragmented into per OS
+   interfaces makes for better interoperability overall.
+
+-  The new ACPI governance process works well and Linux is now at the same
+   table as hardware vendors and other OS vendors.  In fact, there is no
+   longer any reason to feel that ACPI only belongs to Windows or that
+   Linux is in any way secondary to Microsoft in this arena.  The move of
+   ACPI governance into the UEFI forum has significantly opened up the
+   specification development process, and currently, a large portion of the
+   changes being made to ACPI are being driven by Linux.
+
+Key to the use of ACPI is the support model.  For servers in general, the
+responsibility for hardware behaviour cannot solely be the domain of the
+kernel, but rather must be split between the platform and the kernel, in
+order to allow for orderly change over time.  ACPI frees the OS from needing
+to understand all the minute details of the hardware so that the OS doesn’t
+need to be ported to each and every device individually.  It allows the
+hardware vendors to take responsibility for power management behaviour without
+depending on an OS release cycle which is not under their control.
+
+ACPI is also important because hardware and OS vendors have already worked
+out the mechanisms for supporting a general purpose computing ecosystem.  The
+infrastructure is in place, the bindings are in place, and the processes are
+in place.  DT does exactly what Linux needs it to when working with vertically
+integrated devices, but there are no good processes for supporting what the
+server vendors need.  Linux could potentially get there with DT, but doing so
+really just duplicates something that already works.  ACPI already does what
+the hardware vendors need, Microsoft won’t collaborate on DT, and hardware
+vendors would still end up providing two completely separate firmware
+interfaces -- one for Linux and one for Windows.
+
+
+Kernel Compatibility
+--------------------
+One of the primary motivations for ACPI is standardization, and using that
+to provide backward compatibility for Linux kernels.  In the server market,
+software and hardware are often used for long periods.  ACPI allows the
+kernel and firmware to agree on a consistent abstraction that can be
+maintained over time, even as hardware or software change.  As long as the
+abstraction is supported, systems can be updated without necessarily having
+to replace the kernel.
+
+When a Linux driver or subsystem is first implemented using ACPI, it by
+definition ends up requiring a specific version of the ACPI specification
+-- it's baseline.  ACPI firmware must continue to work, even though it may
+not be optimal, with the earliest kernel version that first provides support
+for that baseline version of ACPI.  There may be a need for additional drivers,
+but adding new functionality (e.g., CPU power management) should not break
+older kernel versions.  Further, ACPI firmware must also work with the most
+recent version of the kernel.
+
+
+Relationship with Device Tree
+-----------------------------
+ACPI support in drivers and subsystems for ARMv8 should never be mutually
+exclusive with DT support at compile time.
+
+At boot time the kernel will only use one description method depending on
+parameters passed from the boot loader (including kernel bootargs).
+
+Regardless of whether DT or ACPI is used, the kernel must always be capable
+of booting with either scheme (in kernels with both schemes enabled at compile
+time).
+
+
+Booting using ACPI tables
+-------------------------
+The only defined method for passing ACPI tables to the kernel on ARMv8
+is via the UEFI system configuration table.  Just so it is explicit, this
+means that ACPI is only supported on platforms that boot via UEFI.
+
+When an ARMv8 system boots, it can either have DT information, ACPI tables,
+or in some very unusual cases, both.  If no command line parameters are used,
+the kernel will try to use DT for device enumeration; if there is no DT
+present, the kernel will try to use ACPI tables, but only if they are present.
+In neither is available, the kernel will not boot.  If acpi=force is used
+on the command line, the kernel will attempt to use ACPI tables first, but
+fall back to DT if there are no ACPI tables present.  The basic idea is that
+the kernel will not fail to boot unless it absolutely has no other choice.
+
+Processing of ACPI tables may be disabled by passing acpi=off on the kernel
+command line; this is the default behavior.
+
+In order for the kernel to load and use ACPI tables, the UEFI implementation
+MUST set the ACPI_20_TABLE_GUID to point to the RSDP table (the table with
+the ACPI signature "RSD PTR ").  If this pointer is incorrect and acpi=force
+is used, the kernel will disable ACPI and try to use DT to boot instead; the
+kernel has, in effect, determined that ACPI tables are not present at that
+point.
+
+If the pointer to the RSDP table is correct, the table will be mapped into
+the kernel by the ACPI core, using the address provided by UEFI.
+
+The ACPI core will then locate and map in all other ACPI tables provided by
+using the addresses in the RSDP table to find the XSDT (eXtended System
+Description Table).  The XSDT in turn provides the addresses to all other
+ACPI tables provided by the system firmware; the ACPI core will then traverse
+this table and map in the tables listed.
+
+The ACPI core will ignore any provided RSDT (Root System Description Table).
+RSDTs have been deprecated and are ignored on arm64 since they only allow
+for 32-bit addresses.
+
+Further, the ACPI core will only use the 64-bit address fields in the FADT
+(Fixed ACPI Description Table).  Any 32-bit address fields in the FADT will
+be ignored on arm64.
+
+Hardware reduced mode (see Section 4.1 of the ACPI 6.1 specification) will
+be enforced by the ACPI core on arm64.  Doing so allows the ACPI core to
+run less complex code since it no longer has to provide support for legacy
+hardware from other architectures.  Any fields that are not to be used for
+hardware reduced mode must be set to zero.
+
+For the ACPI core to operate properly, and in turn provide the information
+the kernel needs to configure devices, it expects to find the following
+tables (all section numbers refer to the ACPI 6.1 specification):
+
+    -  RSDP (Root System Description Pointer), section 5.2.5
+
+    -  XSDT (eXtended System Description Table), section 5.2.8
+
+    -  FADT (Fixed ACPI Description Table), section 5.2.9
+
+    -  DSDT (Differentiated System Description Table), section
+       5.2.11.1
+
+    -  MADT (Multiple APIC Description Table), section 5.2.12
+
+    -  GTDT (Generic Timer Description Table), section 5.2.24
+
+    -  If PCI is supported, the MCFG (Memory mapped ConFiGuration
+       Table), section 5.2.6, specifically Table 5-31.
+
+    -  If booting without a console=<device> kernel parameter is
+       supported, the SPCR (Serial Port Console Redirection table),
+       section 5.2.6, specifically Table 5-31.
+
+    -  If necessary to describe the I/O topology, SMMUs and GIC ITSs,
+       the IORT (Input Output Remapping Table, section 5.2.6, specifically
+       Table 5-31).
+
+    -  If NUMA is supported, the SRAT (System Resource Affinity Table)
+       and SLIT (System Locality distance Information Table), sections
+       5.2.16 and 5.2.17, respectively.
+
+If the above tables are not all present, the kernel may or may not be
+able to boot properly since it may not be able to configure all of the
+devices available.  This list of tables is not meant to be all inclusive;
+in some environments other tables may be needed (e.g., any of the APEI
+tables from section 18) to support specific functionality.
+
+
+ACPI Detection
+--------------
+Drivers should determine their probe() type by checking for a null
+value for ACPI_HANDLE, or checking .of_node, or other information in
+the device structure.  This is detailed further in the "Driver
+Recommendations" section.
+
+In non-driver code, if the presence of ACPI needs to be detected at
+run time, then check the value of acpi_disabled. If CONFIG_ACPI is not
+set, acpi_disabled will always be 1.
+
+
+Device Enumeration
+------------------
+Device descriptions in ACPI should use standard recognized ACPI interfaces.
+These may contain less information than is typically provided via a Device
+Tree description for the same device.  This is also one of the reasons that
+ACPI can be useful -- the driver takes into account that it may have less
+detailed information about the device and uses sensible defaults instead.
+If done properly in the driver, the hardware can change and improve over
+time without the driver having to change at all.
+
+Clocks provide an excellent example.  In DT, clocks need to be specified
+and the drivers need to take them into account.  In ACPI, the assumption
+is that UEFI will leave the device in a reasonable default state, including
+any clock settings.  If for some reason the driver needs to change a clock
+value, this can be done in an ACPI method; all the driver needs to do is
+invoke the method and not concern itself with what the method needs to do
+to change the clock.  Changing the hardware can then take place over time
+by changing what the ACPI method does, and not the driver.
+
+In DT, the parameters needed by the driver to set up clocks as in the example
+above are known as "bindings"; in ACPI, these are known as "Device Properties"
+and provided to a driver via the _DSD object.
+
+ACPI tables are described with a formal language called ASL, the ACPI
+Source Language (section 19 of the specification).  This means that there
+are always multiple ways to describe the same thing -- including device
+properties.  For example, device properties could use an ASL construct
+that looks like this: Name(KEY0, "value0").  An ACPI device driver would
+then retrieve the value of the property by evaluating the KEY0 object.
+However, using Name() this way has multiple problems: (1) ACPI limits
+names ("KEY0") to four characters unlike DT; (2) there is no industry
+wide registry that maintains a list of names, minimizing re-use; (3)
+there is also no registry for the definition of property values ("value0"),
+again making re-use difficult; and (4) how does one maintain backward
+compatibility as new hardware comes out?  The _DSD method was created
+to solve precisely these sorts of problems; Linux drivers should ALWAYS
+use the _DSD method for device properties and nothing else.
+
+The _DSM object (ACPI Section 9.14.1) could also be used for conveying
+device properties to a driver.  Linux drivers should only expect it to
+be used if _DSD cannot represent the data required, and there is no way
+to create a new UUID for the _DSD object.  Note that there is even less
+regulation of the use of _DSM than there is of _DSD.  Drivers that depend
+on the contents of _DSM objects will be more difficult to maintain over
+time because of this; as of this writing, the use of _DSM is the cause
+of quite a few firmware problems and is not recommended.
+
+Drivers should look for device properties in the _DSD object ONLY; the _DSD
+object is described in the ACPI specification section 6.2.5, but this only
+describes how to define the structure of an object returned via _DSD, and
+how specific data structures are defined by specific UUIDs.  Linux should
+only use the _DSD Device Properties UUID [5]:
+
+   - UUID: daffd814-6eba-4d8c-8a91-bc9bbf4aa301
+
+   - http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf
+
+The UEFI Forum provides a mechanism for registering device properties [4]
+so that they may be used across all operating systems supporting ACPI.
+Device properties that have not been registered with the UEFI Forum should
+not be used.
+
+Before creating new device properties, check to be sure that they have not
+been defined before and either registered in the Linux kernel documentation
+as DT bindings, or the UEFI Forum as device properties.  While we do not want
+to simply move all DT bindings into ACPI device properties, we can learn from
+what has been previously defined.
+
+If it is necessary to define a new device property, or if it makes sense to
+synthesize the definition of a binding so it can be used in any firmware,
+both DT bindings and ACPI device properties for device drivers have review
+processes.  Use them both.  When the driver itself is submitted for review
+to the Linux mailing lists, the device property definitions needed must be
+submitted at the same time.  A driver that supports ACPI and uses device
+properties will not be considered complete without their definitions.  Once
+the device property has been accepted by the Linux community, it must be
+registered with the UEFI Forum [4], which will review it again for consistency
+within the registry.  This may require iteration.  The UEFI Forum, though,
+will always be the canonical site for device property definitions.
+
+It may make sense to provide notice to the UEFI Forum that there is the
+intent to register a previously unused device property name as a means of
+reserving the name for later use.  Other operating system vendors will
+also be submitting registration requests and this may help smooth the
+process.
+
+Once registration and review have been completed, the kernel provides an
+interface for looking up device properties in a manner independent of
+whether DT or ACPI is being used.  This API should be used [6]; it can
+eliminate some duplication of code paths in driver probing functions and
+discourage divergence between DT bindings and ACPI device properties.
+
+
+Programmable Power Control Resources
+------------------------------------
+Programmable power control resources include such resources as voltage/current
+providers (regulators) and clock sources.
+
+With ACPI, the kernel clock and regulator framework is not expected to be used
+at all.
+
+The kernel assumes that power control of these resources is represented with
+Power Resource Objects (ACPI section 7.1).  The ACPI core will then handle
+correctly enabling and disabling resources as they are needed.  In order to
+get that to work, ACPI assumes each device has defined D-states and that these
+can be controlled through the optional ACPI methods _PS0, _PS1, _PS2, and _PS3;
+in ACPI, _PS0 is the method to invoke to turn a device full on, and _PS3 is for
+turning a device full off.
+
+There are two options for using those Power Resources.  They can:
+
+   -  be managed in a _PSx method which gets called on entry to power
+      state Dx.
+
+   -  be declared separately as power resources with their own _ON and _OFF
+      methods.  They are then tied back to D-states for a particular device
+      via _PRx which specifies which power resources a device needs to be on
+      while in Dx.  Kernel then tracks number of devices using a power resource
+      and calls _ON/_OFF as needed.
+
+The kernel ACPI code will also assume that the _PSx methods follow the normal
+ACPI rules for such methods:
+
+   -  If either _PS0 or _PS3 is implemented, then the other method must also
+      be implemented.
+
+   -  If a device requires usage or setup of a power resource when on, the ASL
+      should organize that it is allocated/enabled using the _PS0 method.
+
+   -  Resources allocated or enabled in the _PS0 method should be disabled
+      or de-allocated in the _PS3 method.
+
+   -  Firmware will leave the resources in a reasonable state before handing
+      over control to the kernel.
+
+Such code in _PSx methods will of course be very platform specific.  But,
+this allows the driver to abstract out the interface for operating the device
+and avoid having to read special non-standard values from ACPI tables. Further,
+abstracting the use of these resources allows the hardware to change over time
+without requiring updates to the driver.
+
+
+Clocks
+------
+ACPI makes the assumption that clocks are initialized by the firmware --
+UEFI, in this case -- to some working value before control is handed over
+to the kernel.  This has implications for devices such as UARTs, or SoC-driven
+LCD displays, for example.
+
+When the kernel boots, the clocks are assumed to be set to reasonable
+working values.  If for some reason the frequency needs to change -- e.g.,
+throttling for power management -- the device driver should expect that
+process to be abstracted out into some ACPI method that can be invoked
+(please see the ACPI specification for further recommendations on standard
+methods to be expected).  The only exceptions to this are CPU clocks where
+CPPC provides a much richer interface than ACPI methods.  If the clocks
+are not set, there is no direct way for Linux to control them.
+
+If an SoC vendor wants to provide fine-grained control of the system clocks,
+they could do so by providing ACPI methods that could be invoked by Linux
+drivers.  However, this is NOT recommended and Linux drivers should NOT use
+such methods, even if they are provided.  Such methods are not currently
+standardized in the ACPI specification, and using them could tie a kernel
+to a very specific SoC, or tie an SoC to a very specific version of the
+kernel, both of which we are trying to avoid.
+
+
+Driver Recommendations
+----------------------
+DO NOT remove any DT handling when adding ACPI support for a driver.  The
+same device may be used on many different systems.
+
+DO try to structure the driver so that it is data-driven.  That is, set up
+a struct containing internal per-device state based on defaults and whatever
+else must be discovered by the driver probe function.  Then, have the rest
+of the driver operate off of the contents of that struct.  Doing so should
+allow most divergence between ACPI and DT functionality to be kept local to
+the probe function instead of being scattered throughout the driver.  For
+example::
+
+  static int device_probe_dt(struct platform_device *pdev)
+  {
+         /* DT specific functionality */
+         ...
+  }
+
+  static int device_probe_acpi(struct platform_device *pdev)
+  {
+         /* ACPI specific functionality */
+         ...
+  }
+
+  static int device_probe(struct platform_device *pdev)
+  {
+         ...
+         struct device_node node = pdev->dev.of_node;
+         ...
+
+         if (node)
+                 ret = device_probe_dt(pdev);
+         else if (ACPI_HANDLE(&pdev->dev))
+                 ret = device_probe_acpi(pdev);
+         else
+                 /* other initialization */
+                 ...
+         /* Continue with any generic probe operations */
+         ...
+  }
+
+DO keep the MODULE_DEVICE_TABLE entries together in the driver to make it
+clear the different names the driver is probed for, both from DT and from
+ACPI::
+
+  static struct of_device_id virtio_mmio_match[] = {
+          { .compatible = "virtio,mmio", },
+          { }
+  };
+  MODULE_DEVICE_TABLE(of, virtio_mmio_match);
+
+  static const struct acpi_device_id virtio_mmio_acpi_match[] = {
+          { "LNRO0005", },
+          { }
+  };
+  MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match);
+
+
+ASWG
+----
+The ACPI specification changes regularly.  During the year 2014, for instance,
+version 5.1 was released and version 6.0 substantially completed, with most of
+the changes being driven by ARM-specific requirements.  Proposed changes are
+presented and discussed in the ASWG (ACPI Specification Working Group) which
+is a part of the UEFI Forum.  The current version of the ACPI specification
+is 6.1 release in January 2016.
+
+Participation in this group is open to all UEFI members.  Please see
+http://www.uefi.org/workinggroup for details on group membership.
+
+It is the intent of the ARMv8 ACPI kernel code to follow the ACPI specification
+as closely as possible, and to only implement functionality that complies with
+the released standards from UEFI ASWG.  As a practical matter, there will be
+vendors that provide bad ACPI tables or violate the standards in some way.
+If this is because of errors, quirks and fix-ups may be necessary, but will
+be avoided if possible.  If there are features missing from ACPI that preclude
+it from being used on a platform, ECRs (Engineering Change Requests) should be
+submitted to ASWG and go through the normal approval process; for those that
+are not UEFI members, many other members of the Linux community are and would
+likely be willing to assist in submitting ECRs.
+
+
+Linux Code
+----------
+Individual items specific to Linux on ARM, contained in the the Linux
+source code, are in the list that follows:
+
+ACPI_OS_NAME
+                       This macro defines the string to be returned when
+                       an ACPI method invokes the _OS method.  On ARM64
+                       systems, this macro will be "Linux" by default.
+                       The command line parameter acpi_os=<string>
+                       can be used to set it to some other value.  The
+                       default value for other architectures is "Microsoft
+                       Windows NT", for example.
+
+ACPI Objects
+------------
+Detailed expectations for ACPI tables and object are listed in the file
+Documentation/arm64/acpi_object_usage.rst.
+
+
+References
+----------
+[0] http://silver.arm.com
+    document ARM-DEN-0029, or newer:
+    "Server Base System Architecture", version 2.3, dated 27 Mar 2014
+
+[1] http://infocenter.arm.com/help/topic/com.arm.doc.den0044a/Server_Base_Boot_Requirements.pdf
+    Document ARM-DEN-0044A, or newer: "Server Base Boot Requirements, System
+    Software on ARM Platforms", dated 16 Aug 2014
+
+[2] http://www.secretlab.ca/archives/151,
+    10 Jan 2015, Copyright (c) 2015,
+    Linaro Ltd., written by Grant Likely.
+
+[3] AMD ACPI for Seattle platform documentation
+    http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/Seattle_ACPI_Guide.pdf
+
+
+[4] http://www.uefi.org/acpi
+    please see the link for the "ACPI _DSD Device
+    Property Registry Instructions"
+
+[5] http://www.uefi.org/acpi
+    please see the link for the "_DSD (Device
+    Specific Data) Implementation Guide"
+
+[6] Kernel code for the unified device
+    property interface can be found in
+    include/linux/property.h and drivers/base/property.c.
+
+
+Authors
+-------
+- Al Stone <al.stone@linaro.org>
+- Graeme Gregory <graeme.gregory@linaro.org>
+- Hanjun Guo <hanjun.guo@linaro.org>
+
+- Grant Likely <grant.likely@linaro.org>, for the "Why ACPI on ARM?" section
diff --git a/Documentation/arm64/arm-acpi.txt b/Documentation/arm64/arm-acpi.txt
deleted file mode 100644
index 1a74a041a443..000000000000
--- a/Documentation/arm64/arm-acpi.txt
+++ /dev/null
@@ -1,519 +0,0 @@
-ACPI on ARMv8 Servers
----------------------
-ACPI can be used for ARMv8 general purpose servers designed to follow
-the ARM SBSA (Server Base System Architecture) [0] and SBBR (Server
-Base Boot Requirements) [1] specifications.  Please note that the SBBR
-can be retrieved simply by visiting [1], but the SBSA is currently only
-available to those with an ARM login due to ARM IP licensing concerns.
-
-The ARMv8 kernel implements the reduced hardware model of ACPI version
-5.1 or later.  Links to the specification and all external documents
-it refers to are managed by the UEFI Forum.  The specification is
-available at http://www.uefi.org/specifications and documents referenced
-by the specification can be found via http://www.uefi.org/acpi.
-
-If an ARMv8 system does not meet the requirements of the SBSA and SBBR,
-or cannot be described using the mechanisms defined in the required ACPI
-specifications, then ACPI may not be a good fit for the hardware.
-
-While the documents mentioned above set out the requirements for building
-industry-standard ARMv8 servers, they also apply to more than one operating
-system.  The purpose of this document is to describe the interaction between
-ACPI and Linux only, on an ARMv8 system -- that is, what Linux expects of
-ACPI and what ACPI can expect of Linux.
-
-
-Why ACPI on ARM?
-----------------
-Before examining the details of the interface between ACPI and Linux, it is
-useful to understand why ACPI is being used.  Several technologies already
-exist in Linux for describing non-enumerable hardware, after all.  In this
-section we summarize a blog post [2] from Grant Likely that outlines the
-reasoning behind ACPI on ARMv8 servers.  Actually, we snitch a good portion
-of the summary text almost directly, to be honest.
-
-The short form of the rationale for ACPI on ARM is:
-
--- ACPI’s byte code (AML) allows the platform to encode hardware behavior,
-   while DT explicitly does not support this.  For hardware vendors, being
-   able to encode behavior is a key tool used in supporting operating
-   system releases on new hardware.
-
--- ACPI’s OSPM defines a power management model that constrains what the
-   platform is allowed to do into a specific model, while still providing
-   flexibility in hardware design.
-
--- In the enterprise server environment, ACPI has established bindings (such
-   as for RAS) which are currently used in production systems.  DT does not.
-   Such bindings could be defined in DT at some point, but doing so means ARM
-   and x86 would end up using completely different code paths in both firmware
-   and the kernel.
-
--- Choosing a single interface to describe the abstraction between a platform
-   and an OS is important.  Hardware vendors would not be required to implement
-   both DT and ACPI if they want to support multiple operating systems.  And,
-   agreeing on a single interface instead of being fragmented into per OS
-   interfaces makes for better interoperability overall.
-
--- The new ACPI governance process works well and Linux is now at the same
-   table as hardware vendors and other OS vendors.  In fact, there is no
-   longer any reason to feel that ACPI only belongs to Windows or that
-   Linux is in any way secondary to Microsoft in this arena.  The move of
-   ACPI governance into the UEFI forum has significantly opened up the
-   specification development process, and currently, a large portion of the
-   changes being made to ACPI are being driven by Linux.
-
-Key to the use of ACPI is the support model.  For servers in general, the
-responsibility for hardware behaviour cannot solely be the domain of the
-kernel, but rather must be split between the platform and the kernel, in
-order to allow for orderly change over time.  ACPI frees the OS from needing
-to understand all the minute details of the hardware so that the OS doesn’t
-need to be ported to each and every device individually.  It allows the
-hardware vendors to take responsibility for power management behaviour without
-depending on an OS release cycle which is not under their control.
-
-ACPI is also important because hardware and OS vendors have already worked
-out the mechanisms for supporting a general purpose computing ecosystem.  The
-infrastructure is in place, the bindings are in place, and the processes are
-in place.  DT does exactly what Linux needs it to when working with vertically
-integrated devices, but there are no good processes for supporting what the
-server vendors need.  Linux could potentially get there with DT, but doing so
-really just duplicates something that already works.  ACPI already does what
-the hardware vendors need, Microsoft won’t collaborate on DT, and hardware
-vendors would still end up providing two completely separate firmware
-interfaces -- one for Linux and one for Windows.
-
-
-Kernel Compatibility
---------------------
-One of the primary motivations for ACPI is standardization, and using that
-to provide backward compatibility for Linux kernels.  In the server market,
-software and hardware are often used for long periods.  ACPI allows the
-kernel and firmware to agree on a consistent abstraction that can be
-maintained over time, even as hardware or software change.  As long as the
-abstraction is supported, systems can be updated without necessarily having
-to replace the kernel.
-
-When a Linux driver or subsystem is first implemented using ACPI, it by
-definition ends up requiring a specific version of the ACPI specification
--- it's baseline.  ACPI firmware must continue to work, even though it may
-not be optimal, with the earliest kernel version that first provides support
-for that baseline version of ACPI.  There may be a need for additional drivers,
-but adding new functionality (e.g., CPU power management) should not break
-older kernel versions.  Further, ACPI firmware must also work with the most
-recent version of the kernel.
-
-
-Relationship with Device Tree
------------------------------
-ACPI support in drivers and subsystems for ARMv8 should never be mutually
-exclusive with DT support at compile time.
-
-At boot time the kernel will only use one description method depending on
-parameters passed from the boot loader (including kernel bootargs).
-
-Regardless of whether DT or ACPI is used, the kernel must always be capable
-of booting with either scheme (in kernels with both schemes enabled at compile
-time).
-
-
-Booting using ACPI tables
--------------------------
-The only defined method for passing ACPI tables to the kernel on ARMv8
-is via the UEFI system configuration table.  Just so it is explicit, this
-means that ACPI is only supported on platforms that boot via UEFI.
-
-When an ARMv8 system boots, it can either have DT information, ACPI tables,
-or in some very unusual cases, both.  If no command line parameters are used,
-the kernel will try to use DT for device enumeration; if there is no DT
-present, the kernel will try to use ACPI tables, but only if they are present.
-In neither is available, the kernel will not boot.  If acpi=force is used
-on the command line, the kernel will attempt to use ACPI tables first, but
-fall back to DT if there are no ACPI tables present.  The basic idea is that
-the kernel will not fail to boot unless it absolutely has no other choice.
-
-Processing of ACPI tables may be disabled by passing acpi=off on the kernel
-command line; this is the default behavior.
-
-In order for the kernel to load and use ACPI tables, the UEFI implementation
-MUST set the ACPI_20_TABLE_GUID to point to the RSDP table (the table with
-the ACPI signature "RSD PTR ").  If this pointer is incorrect and acpi=force
-is used, the kernel will disable ACPI and try to use DT to boot instead; the
-kernel has, in effect, determined that ACPI tables are not present at that
-point.
-
-If the pointer to the RSDP table is correct, the table will be mapped into
-the kernel by the ACPI core, using the address provided by UEFI.
-
-The ACPI core will then locate and map in all other ACPI tables provided by
-using the addresses in the RSDP table to find the XSDT (eXtended System
-Description Table).  The XSDT in turn provides the addresses to all other
-ACPI tables provided by the system firmware; the ACPI core will then traverse
-this table and map in the tables listed.
-
-The ACPI core will ignore any provided RSDT (Root System Description Table).
-RSDTs have been deprecated and are ignored on arm64 since they only allow
-for 32-bit addresses.
-
-Further, the ACPI core will only use the 64-bit address fields in the FADT
-(Fixed ACPI Description Table).  Any 32-bit address fields in the FADT will
-be ignored on arm64.
-
-Hardware reduced mode (see Section 4.1 of the ACPI 6.1 specification) will
-be enforced by the ACPI core on arm64.  Doing so allows the ACPI core to
-run less complex code since it no longer has to provide support for legacy
-hardware from other architectures.  Any fields that are not to be used for
-hardware reduced mode must be set to zero.
-
-For the ACPI core to operate properly, and in turn provide the information
-the kernel needs to configure devices, it expects to find the following
-tables (all section numbers refer to the ACPI 6.1 specification):
-
-    -- RSDP (Root System Description Pointer), section 5.2.5
-
-    -- XSDT (eXtended System Description Table), section 5.2.8
-
-    -- FADT (Fixed ACPI Description Table), section 5.2.9
-
-    -- DSDT (Differentiated System Description Table), section
-       5.2.11.1
-
-    -- MADT (Multiple APIC Description Table), section 5.2.12
-
-    -- GTDT (Generic Timer Description Table), section 5.2.24
-
-    -- If PCI is supported, the MCFG (Memory mapped ConFiGuration
-       Table), section 5.2.6, specifically Table 5-31.
-
-    -- If booting without a console=<device> kernel parameter is
-       supported, the SPCR (Serial Port Console Redirection table),
-       section 5.2.6, specifically Table 5-31.
-
-    -- If necessary to describe the I/O topology, SMMUs and GIC ITSs,
-       the IORT (Input Output Remapping Table, section 5.2.6, specifically
-       Table 5-31).
-
-    -- If NUMA is supported, the SRAT (System Resource Affinity Table)
-       and SLIT (System Locality distance Information Table), sections
-       5.2.16 and 5.2.17, respectively.
-
-If the above tables are not all present, the kernel may or may not be
-able to boot properly since it may not be able to configure all of the
-devices available.  This list of tables is not meant to be all inclusive;
-in some environments other tables may be needed (e.g., any of the APEI
-tables from section 18) to support specific functionality.
-
-
-ACPI Detection
---------------
-Drivers should determine their probe() type by checking for a null
-value for ACPI_HANDLE, or checking .of_node, or other information in
-the device structure.  This is detailed further in the "Driver
-Recommendations" section.
-
-In non-driver code, if the presence of ACPI needs to be detected at
-run time, then check the value of acpi_disabled. If CONFIG_ACPI is not
-set, acpi_disabled will always be 1.
-
-
-Device Enumeration
-------------------
-Device descriptions in ACPI should use standard recognized ACPI interfaces.
-These may contain less information than is typically provided via a Device
-Tree description for the same device.  This is also one of the reasons that
-ACPI can be useful -- the driver takes into account that it may have less
-detailed information about the device and uses sensible defaults instead.
-If done properly in the driver, the hardware can change and improve over
-time without the driver having to change at all.
-
-Clocks provide an excellent example.  In DT, clocks need to be specified
-and the drivers need to take them into account.  In ACPI, the assumption
-is that UEFI will leave the device in a reasonable default state, including
-any clock settings.  If for some reason the driver needs to change a clock
-value, this can be done in an ACPI method; all the driver needs to do is
-invoke the method and not concern itself with what the method needs to do
-to change the clock.  Changing the hardware can then take place over time
-by changing what the ACPI method does, and not the driver.
-
-In DT, the parameters needed by the driver to set up clocks as in the example
-above are known as "bindings"; in ACPI, these are known as "Device Properties"
-and provided to a driver via the _DSD object.
-
-ACPI tables are described with a formal language called ASL, the ACPI
-Source Language (section 19 of the specification).  This means that there
-are always multiple ways to describe the same thing -- including device
-properties.  For example, device properties could use an ASL construct
-that looks like this: Name(KEY0, "value0").  An ACPI device driver would
-then retrieve the value of the property by evaluating the KEY0 object.
-However, using Name() this way has multiple problems: (1) ACPI limits
-names ("KEY0") to four characters unlike DT; (2) there is no industry
-wide registry that maintains a list of names, minimizing re-use; (3)
-there is also no registry for the definition of property values ("value0"),
-again making re-use difficult; and (4) how does one maintain backward
-compatibility as new hardware comes out?  The _DSD method was created
-to solve precisely these sorts of problems; Linux drivers should ALWAYS
-use the _DSD method for device properties and nothing else.
-
-The _DSM object (ACPI Section 9.14.1) could also be used for conveying
-device properties to a driver.  Linux drivers should only expect it to
-be used if _DSD cannot represent the data required, and there is no way
-to create a new UUID for the _DSD object.  Note that there is even less
-regulation of the use of _DSM than there is of _DSD.  Drivers that depend
-on the contents of _DSM objects will be more difficult to maintain over
-time because of this; as of this writing, the use of _DSM is the cause
-of quite a few firmware problems and is not recommended.
-
-Drivers should look for device properties in the _DSD object ONLY; the _DSD
-object is described in the ACPI specification section 6.2.5, but this only
-describes how to define the structure of an object returned via _DSD, and
-how specific data structures are defined by specific UUIDs.  Linux should
-only use the _DSD Device Properties UUID [5]:
-
-   -- UUID: daffd814-6eba-4d8c-8a91-bc9bbf4aa301
-
-   -- http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf
-
-The UEFI Forum provides a mechanism for registering device properties [4]
-so that they may be used across all operating systems supporting ACPI.
-Device properties that have not been registered with the UEFI Forum should
-not be used.
-
-Before creating new device properties, check to be sure that they have not
-been defined before and either registered in the Linux kernel documentation
-as DT bindings, or the UEFI Forum as device properties.  While we do not want
-to simply move all DT bindings into ACPI device properties, we can learn from
-what has been previously defined.
-
-If it is necessary to define a new device property, or if it makes sense to
-synthesize the definition of a binding so it can be used in any firmware,
-both DT bindings and ACPI device properties for device drivers have review
-processes.  Use them both.  When the driver itself is submitted for review
-to the Linux mailing lists, the device property definitions needed must be
-submitted at the same time.  A driver that supports ACPI and uses device
-properties will not be considered complete without their definitions.  Once
-the device property has been accepted by the Linux community, it must be
-registered with the UEFI Forum [4], which will review it again for consistency
-within the registry.  This may require iteration.  The UEFI Forum, though,
-will always be the canonical site for device property definitions.
-
-It may make sense to provide notice to the UEFI Forum that there is the
-intent to register a previously unused device property name as a means of
-reserving the name for later use.  Other operating system vendors will
-also be submitting registration requests and this may help smooth the
-process.
-
-Once registration and review have been completed, the kernel provides an
-interface for looking up device properties in a manner independent of
-whether DT or ACPI is being used.  This API should be used [6]; it can
-eliminate some duplication of code paths in driver probing functions and
-discourage divergence between DT bindings and ACPI device properties.
-
-
-Programmable Power Control Resources
-------------------------------------
-Programmable power control resources include such resources as voltage/current
-providers (regulators) and clock sources.
-
-With ACPI, the kernel clock and regulator framework is not expected to be used
-at all.
-
-The kernel assumes that power control of these resources is represented with
-Power Resource Objects (ACPI section 7.1).  The ACPI core will then handle
-correctly enabling and disabling resources as they are needed.  In order to
-get that to work, ACPI assumes each device has defined D-states and that these
-can be controlled through the optional ACPI methods _PS0, _PS1, _PS2, and _PS3;
-in ACPI, _PS0 is the method to invoke to turn a device full on, and _PS3 is for
-turning a device full off.
-
-There are two options for using those Power Resources.  They can:
-
-   -- be managed in a _PSx method which gets called on entry to power
-      state Dx.
-
-   -- be declared separately as power resources with their own _ON and _OFF
-      methods.  They are then tied back to D-states for a particular device
-      via _PRx which specifies which power resources a device needs to be on
-      while in Dx.  Kernel then tracks number of devices using a power resource
-      and calls _ON/_OFF as needed.
-
-The kernel ACPI code will also assume that the _PSx methods follow the normal
-ACPI rules for such methods:
-
-   -- If either _PS0 or _PS3 is implemented, then the other method must also
-      be implemented.
-
-   -- If a device requires usage or setup of a power resource when on, the ASL
-      should organize that it is allocated/enabled using the _PS0 method.
-
-   -- Resources allocated or enabled in the _PS0 method should be disabled
-      or de-allocated in the _PS3 method.
-
-   -- Firmware will leave the resources in a reasonable state before handing
-      over control to the kernel.
-
-Such code in _PSx methods will of course be very platform specific.  But,
-this allows the driver to abstract out the interface for operating the device
-and avoid having to read special non-standard values from ACPI tables. Further,
-abstracting the use of these resources allows the hardware to change over time
-without requiring updates to the driver.
-
-
-Clocks
-------
-ACPI makes the assumption that clocks are initialized by the firmware --
-UEFI, in this case -- to some working value before control is handed over
-to the kernel.  This has implications for devices such as UARTs, or SoC-driven
-LCD displays, for example.
-
-When the kernel boots, the clocks are assumed to be set to reasonable
-working values.  If for some reason the frequency needs to change -- e.g.,
-throttling for power management -- the device driver should expect that
-process to be abstracted out into some ACPI method that can be invoked
-(please see the ACPI specification for further recommendations on standard
-methods to be expected).  The only exceptions to this are CPU clocks where
-CPPC provides a much richer interface than ACPI methods.  If the clocks
-are not set, there is no direct way for Linux to control them.
-
-If an SoC vendor wants to provide fine-grained control of the system clocks,
-they could do so by providing ACPI methods that could be invoked by Linux
-drivers.  However, this is NOT recommended and Linux drivers should NOT use
-such methods, even if they are provided.  Such methods are not currently
-standardized in the ACPI specification, and using them could tie a kernel
-to a very specific SoC, or tie an SoC to a very specific version of the
-kernel, both of which we are trying to avoid.
-
-
-Driver Recommendations
-----------------------
-DO NOT remove any DT handling when adding ACPI support for a driver.  The
-same device may be used on many different systems.
-
-DO try to structure the driver so that it is data-driven.  That is, set up
-a struct containing internal per-device state based on defaults and whatever
-else must be discovered by the driver probe function.  Then, have the rest
-of the driver operate off of the contents of that struct.  Doing so should
-allow most divergence between ACPI and DT functionality to be kept local to
-the probe function instead of being scattered throughout the driver.  For
-example:
-
-static int device_probe_dt(struct platform_device *pdev)
-{
-       /* DT specific functionality */
-       ...
-}
-
-static int device_probe_acpi(struct platform_device *pdev)
-{
-       /* ACPI specific functionality */
-       ...
-}
-
-static int device_probe(struct platform_device *pdev)
-{
-       ...
-       struct device_node node = pdev->dev.of_node;
-       ...
-
-       if (node)
-               ret = device_probe_dt(pdev);
-       else if (ACPI_HANDLE(&pdev->dev))
-               ret = device_probe_acpi(pdev);
-       else
-               /* other initialization */
-               ...
-       /* Continue with any generic probe operations */
-       ...
-}
-
-DO keep the MODULE_DEVICE_TABLE entries together in the driver to make it
-clear the different names the driver is probed for, both from DT and from
-ACPI:
-
-static struct of_device_id virtio_mmio_match[] = {
-        { .compatible = "virtio,mmio", },
-        { }
-};
-MODULE_DEVICE_TABLE(of, virtio_mmio_match);
-
-static const struct acpi_device_id virtio_mmio_acpi_match[] = {
-        { "LNRO0005", },
-        { }
-};
-MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match);
-
-
-ASWG
-----
-The ACPI specification changes regularly.  During the year 2014, for instance,
-version 5.1 was released and version 6.0 substantially completed, with most of
-the changes being driven by ARM-specific requirements.  Proposed changes are
-presented and discussed in the ASWG (ACPI Specification Working Group) which
-is a part of the UEFI Forum.  The current version of the ACPI specification
-is 6.1 release in January 2016.
-
-Participation in this group is open to all UEFI members.  Please see
-http://www.uefi.org/workinggroup for details on group membership.
-
-It is the intent of the ARMv8 ACPI kernel code to follow the ACPI specification
-as closely as possible, and to only implement functionality that complies with
-the released standards from UEFI ASWG.  As a practical matter, there will be
-vendors that provide bad ACPI tables or violate the standards in some way.
-If this is because of errors, quirks and fix-ups may be necessary, but will
-be avoided if possible.  If there are features missing from ACPI that preclude
-it from being used on a platform, ECRs (Engineering Change Requests) should be
-submitted to ASWG and go through the normal approval process; for those that
-are not UEFI members, many other members of the Linux community are and would
-likely be willing to assist in submitting ECRs.
-
-
-Linux Code
-----------
-Individual items specific to Linux on ARM, contained in the the Linux
-source code, are in the list that follows:
-
-ACPI_OS_NAME           This macro defines the string to be returned when
-                       an ACPI method invokes the _OS method.  On ARM64
-                       systems, this macro will be "Linux" by default.
-                       The command line parameter acpi_os=<string>
-                       can be used to set it to some other value.  The
-                       default value for other architectures is "Microsoft
-                       Windows NT", for example.
-
-ACPI Objects
-------------
-Detailed expectations for ACPI tables and object are listed in the file
-Documentation/arm64/acpi_object_usage.txt.
-
-
-References
-----------
-[0] http://silver.arm.com -- document ARM-DEN-0029, or newer
-    "Server Base System Architecture", version 2.3, dated 27 Mar 2014
-
-[1] http://infocenter.arm.com/help/topic/com.arm.doc.den0044a/Server_Base_Boot_Requirements.pdf
-    Document ARM-DEN-0044A, or newer: "Server Base Boot Requirements, System
-    Software on ARM Platforms", dated 16 Aug 2014
-
-[2] http://www.secretlab.ca/archives/151, 10 Jan 2015, Copyright (c) 2015,
-    Linaro Ltd., written by Grant Likely.
-
-[3] AMD ACPI for Seattle platform documentation:
-    http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/Seattle_ACPI_Guide.pdf
-
-[4] http://www.uefi.org/acpi -- please see the link for the "ACPI _DSD Device
-    Property Registry Instructions"
-
-[5] http://www.uefi.org/acpi -- please see the link for the "_DSD (Device
-    Specific Data) Implementation Guide"
-
-[6] Kernel code for the unified device property interface can be found in
-    include/linux/property.h and drivers/base/property.c.
-
-
-Authors
--------
-Al Stone <al.stone@linaro.org>
-Graeme Gregory <graeme.gregory@linaro.org>
-Hanjun Guo <hanjun.guo@linaro.org>
-
-Grant Likely <grant.likely@linaro.org>, for the "Why ACPI on ARM?" section
diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst
new file mode 100644
index 000000000000..d3f3a60fbf25
--- /dev/null
+++ b/Documentation/arm64/booting.rst
@@ -0,0 +1,293 @@
+=====================
+Booting AArch64 Linux
+=====================
+
+Author: Will Deacon <will.deacon@arm.com>
+
+Date  : 07 September 2012
+
+This document is based on the ARM booting document by Russell King and
+is relevant to all public releases of the AArch64 Linux kernel.
+
+The AArch64 exception model is made up of a number of exception levels
+(EL0 - EL3), with EL0 and EL1 having a secure and a non-secure
+counterpart.  EL2 is the hypervisor level and exists only in non-secure
+mode. EL3 is the highest priority level and exists only in secure mode.
+
+For the purposes of this document, we will use the term `boot loader`
+simply to define all software that executes on the CPU(s) before control
+is passed to the Linux kernel.  This may include secure monitor and
+hypervisor code, or it may just be a handful of instructions for
+preparing a minimal boot environment.
+
+Essentially, the boot loader should provide (as a minimum) the
+following:
+
+1. Setup and initialise the RAM
+2. Setup the device tree
+3. Decompress the kernel image
+4. Call the kernel image
+
+
+1. Setup and initialise RAM
+---------------------------
+
+Requirement: MANDATORY
+
+The boot loader is expected to find and initialise all RAM that the
+kernel will use for volatile data storage in the system.  It performs
+this in a machine dependent manner.  (It may use internal algorithms
+to automatically locate and size all RAM, or it may use knowledge of
+the RAM in the machine, or any other method the boot loader designer
+sees fit.)
+
+
+2. Setup the device tree
+-------------------------
+
+Requirement: MANDATORY
+
+The device tree blob (dtb) must be placed on an 8-byte boundary and must
+not exceed 2 megabytes in size. Since the dtb will be mapped cacheable
+using blocks of up to 2 megabytes in size, it must not be placed within
+any 2M region which must be mapped with any specific attributes.
+
+NOTE: versions prior to v4.2 also require that the DTB be placed within
+the 512 MB region starting at text_offset bytes below the kernel Image.
+
+3. Decompress the kernel image
+------------------------------
+
+Requirement: OPTIONAL
+
+The AArch64 kernel does not currently provide a decompressor and
+therefore requires decompression (gzip etc.) to be performed by the boot
+loader if a compressed Image target (e.g. Image.gz) is used.  For
+bootloaders that do not implement this requirement, the uncompressed
+Image target is available instead.
+
+
+4. Call the kernel image
+------------------------
+
+Requirement: MANDATORY
+
+The decompressed kernel image contains a 64-byte header as follows::
+
+  u32 code0;			/* Executable code */
+  u32 code1;			/* Executable code */
+  u64 text_offset;		/* Image load offset, little endian */
+  u64 image_size;		/* Effective Image size, little endian */
+  u64 flags;			/* kernel flags, little endian */
+  u64 res2	= 0;		/* reserved */
+  u64 res3	= 0;		/* reserved */
+  u64 res4	= 0;		/* reserved */
+  u32 magic	= 0x644d5241;	/* Magic number, little endian, "ARM\x64" */
+  u32 res5;			/* reserved (used for PE COFF offset) */
+
+
+Header notes:
+
+- As of v3.17, all fields are little endian unless stated otherwise.
+
+- code0/code1 are responsible for branching to stext.
+
+- when booting through EFI, code0/code1 are initially skipped.
+  res5 is an offset to the PE header and the PE header has the EFI
+  entry point (efi_stub_entry).  When the stub has done its work, it
+  jumps to code0 to resume the normal boot process.
+
+- Prior to v3.17, the endianness of text_offset was not specified.  In
+  these cases image_size is zero and text_offset is 0x80000 in the
+  endianness of the kernel.  Where image_size is non-zero image_size is
+  little-endian and must be respected.  Where image_size is zero,
+  text_offset can be assumed to be 0x80000.
+
+- The flags field (introduced in v3.17) is a little-endian 64-bit field
+  composed as follows:
+
+  ============= ===============================================================
+  Bit 0		Kernel endianness.  1 if BE, 0 if LE.
+  Bit 1-2	Kernel Page size.
+
+			* 0 - Unspecified.
+			* 1 - 4K
+			* 2 - 16K
+			* 3 - 64K
+  Bit 3		Kernel physical placement
+
+			0
+			  2MB aligned base should be as close as possible
+			  to the base of DRAM, since memory below it is not
+			  accessible via the linear mapping
+			1
+			  2MB aligned base may be anywhere in physical
+			  memory
+  Bits 4-63	Reserved.
+  ============= ===============================================================
+
+- When image_size is zero, a bootloader should attempt to keep as much
+  memory as possible free for use by the kernel immediately after the
+  end of the kernel image. The amount of space required will vary
+  depending on selected features, and is effectively unbound.
+
+The Image must be placed text_offset bytes from a 2MB aligned base
+address anywhere in usable system RAM and called there. The region
+between the 2 MB aligned base address and the start of the image has no
+special significance to the kernel, and may be used for other purposes.
+At least image_size bytes from the start of the image must be free for
+use by the kernel.
+NOTE: versions prior to v4.6 cannot make use of memory below the
+physical offset of the Image so it is recommended that the Image be
+placed as close as possible to the start of system RAM.
+
+If an initrd/initramfs is passed to the kernel at boot, it must reside
+entirely within a 1 GB aligned physical memory window of up to 32 GB in
+size that fully covers the kernel Image as well.
+
+Any memory described to the kernel (even that below the start of the
+image) which is not marked as reserved from the kernel (e.g., with a
+memreserve region in the device tree) will be considered as available to
+the kernel.
+
+Before jumping into the kernel, the following conditions must be met:
+
+- Quiesce all DMA capable devices so that memory does not get
+  corrupted by bogus network packets or disk data.  This will save
+  you many hours of debug.
+
+- Primary CPU general-purpose register settings:
+
+    - x0 = physical address of device tree blob (dtb) in system RAM.
+    - x1 = 0 (reserved for future use)
+    - x2 = 0 (reserved for future use)
+    - x3 = 0 (reserved for future use)
+
+- CPU mode
+
+  All forms of interrupts must be masked in PSTATE.DAIF (Debug, SError,
+  IRQ and FIQ).
+  The CPU must be in either EL2 (RECOMMENDED in order to have access to
+  the virtualisation extensions) or non-secure EL1.
+
+- Caches, MMUs
+
+  The MMU must be off.
+  Instruction cache may be on or off.
+  The address range corresponding to the loaded kernel image must be
+  cleaned to the PoC. In the presence of a system cache or other
+  coherent masters with caches enabled, this will typically require
+  cache maintenance by VA rather than set/way operations.
+  System caches which respect the architected cache maintenance by VA
+  operations must be configured and may be enabled.
+  System caches which do not respect architected cache maintenance by VA
+  operations (not recommended) must be configured and disabled.
+
+- Architected timers
+
+  CNTFRQ must be programmed with the timer frequency and CNTVOFF must
+  be programmed with a consistent value on all CPUs.  If entering the
+  kernel at EL1, CNTHCTL_EL2 must have EL1PCTEN (bit 0) set where
+  available.
+
+- Coherency
+
+  All CPUs to be booted by the kernel must be part of the same coherency
+  domain on entry to the kernel.  This may require IMPLEMENTATION DEFINED
+  initialisation to enable the receiving of maintenance operations on
+  each CPU.
+
+- System registers
+
+  All writable architected system registers at the exception level where
+  the kernel image will be entered must be initialised by software at a
+  higher exception level to prevent execution in an UNKNOWN state.
+
+  - SCR_EL3.FIQ must have the same value across all CPUs the kernel is
+    executing on.
+  - The value of SCR_EL3.FIQ must be the same as the one present at boot
+    time whenever the kernel is executing.
+
+  For systems with a GICv3 interrupt controller to be used in v3 mode:
+  - If EL3 is present:
+
+      - ICC_SRE_EL3.Enable (bit 3) must be initialiased to 0b1.
+      - ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b1.
+
+  - If the kernel is entered at EL1:
+
+      - ICC.SRE_EL2.Enable (bit 3) must be initialised to 0b1
+      - ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b1.
+
+  - The DT or ACPI tables must describe a GICv3 interrupt controller.
+
+  For systems with a GICv3 interrupt controller to be used in
+  compatibility (v2) mode:
+
+  - If EL3 is present:
+
+      ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b0.
+
+  - If the kernel is entered at EL1:
+
+      ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b0.
+
+  - The DT or ACPI tables must describe a GICv2 interrupt controller.
+
+  For CPUs with pointer authentication functionality:
+  - If EL3 is present:
+
+    - SCR_EL3.APK (bit 16) must be initialised to 0b1
+    - SCR_EL3.API (bit 17) must be initialised to 0b1
+
+  - If the kernel is entered at EL1:
+
+    - HCR_EL2.APK (bit 40) must be initialised to 0b1
+    - HCR_EL2.API (bit 41) must be initialised to 0b1
+
+The requirements described above for CPU mode, caches, MMUs, architected
+timers, coherency and system registers apply to all CPUs.  All CPUs must
+enter the kernel in the same exception level.
+
+The boot loader is expected to enter the kernel on each CPU in the
+following manner:
+
+- The primary CPU must jump directly to the first instruction of the
+  kernel image.  The device tree blob passed by this CPU must contain
+  an 'enable-method' property for each cpu node.  The supported
+  enable-methods are described below.
+
+  It is expected that the bootloader will generate these device tree
+  properties and insert them into the blob prior to kernel entry.
+
+- CPUs with a "spin-table" enable-method must have a 'cpu-release-addr'
+  property in their cpu node.  This property identifies a
+  naturally-aligned 64-bit zero-initalised memory location.
+
+  These CPUs should spin outside of the kernel in a reserved area of
+  memory (communicated to the kernel by a /memreserve/ region in the
+  device tree) polling their cpu-release-addr location, which must be
+  contained in the reserved region.  A wfe instruction may be inserted
+  to reduce the overhead of the busy-loop and a sev will be issued by
+  the primary CPU.  When a read of the location pointed to by the
+  cpu-release-addr returns a non-zero value, the CPU must jump to this
+  value.  The value will be written as a single 64-bit little-endian
+  value, so CPUs must convert the read value to their native endianness
+  before jumping to it.
+
+- CPUs with a "psci" enable method should remain outside of
+  the kernel (i.e. outside of the regions of memory described to the
+  kernel in the memory node, or in a reserved area of memory described
+  to the kernel by a /memreserve/ region in the device tree).  The
+  kernel will issue CPU_ON calls as described in ARM document number ARM
+  DEN 0022A ("Power State Coordination Interface System Software on ARM
+  processors") to bring CPUs into the kernel.
+
+  The device tree should contain a 'psci' node, as described in
+  Documentation/devicetree/bindings/arm/psci.yaml.
+
+- Secondary CPU general-purpose register settings
+  x0 = 0 (reserved for future use)
+  x1 = 0 (reserved for future use)
+  x2 = 0 (reserved for future use)
+  x3 = 0 (reserved for future use)
diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.txt
deleted file mode 100644
index fbab7e21d116..000000000000
--- a/Documentation/arm64/booting.txt
+++ /dev/null
@@ -1,266 +0,0 @@
-			Booting AArch64 Linux
-			=====================
-
-Author: Will Deacon <will.deacon@arm.com>
-Date  : 07 September 2012
-
-This document is based on the ARM booting document by Russell King and
-is relevant to all public releases of the AArch64 Linux kernel.
-
-The AArch64 exception model is made up of a number of exception levels
-(EL0 - EL3), with EL0 and EL1 having a secure and a non-secure
-counterpart.  EL2 is the hypervisor level and exists only in non-secure
-mode. EL3 is the highest priority level and exists only in secure mode.
-
-For the purposes of this document, we will use the term `boot loader'
-simply to define all software that executes on the CPU(s) before control
-is passed to the Linux kernel.  This may include secure monitor and
-hypervisor code, or it may just be a handful of instructions for
-preparing a minimal boot environment.
-
-Essentially, the boot loader should provide (as a minimum) the
-following:
-
-1. Setup and initialise the RAM
-2. Setup the device tree
-3. Decompress the kernel image
-4. Call the kernel image
-
-
-1. Setup and initialise RAM
----------------------------
-
-Requirement: MANDATORY
-
-The boot loader is expected to find and initialise all RAM that the
-kernel will use for volatile data storage in the system.  It performs
-this in a machine dependent manner.  (It may use internal algorithms
-to automatically locate and size all RAM, or it may use knowledge of
-the RAM in the machine, or any other method the boot loader designer
-sees fit.)
-
-
-2. Setup the device tree
--------------------------
-
-Requirement: MANDATORY
-
-The device tree blob (dtb) must be placed on an 8-byte boundary and must
-not exceed 2 megabytes in size. Since the dtb will be mapped cacheable
-using blocks of up to 2 megabytes in size, it must not be placed within
-any 2M region which must be mapped with any specific attributes.
-
-NOTE: versions prior to v4.2 also require that the DTB be placed within
-the 512 MB region starting at text_offset bytes below the kernel Image.
-
-3. Decompress the kernel image
-------------------------------
-
-Requirement: OPTIONAL
-
-The AArch64 kernel does not currently provide a decompressor and
-therefore requires decompression (gzip etc.) to be performed by the boot
-loader if a compressed Image target (e.g. Image.gz) is used.  For
-bootloaders that do not implement this requirement, the uncompressed
-Image target is available instead.
-
-
-4. Call the kernel image
-------------------------
-
-Requirement: MANDATORY
-
-The decompressed kernel image contains a 64-byte header as follows:
-
-  u32 code0;			/* Executable code */
-  u32 code1;			/* Executable code */
-  u64 text_offset;		/* Image load offset, little endian */
-  u64 image_size;		/* Effective Image size, little endian */
-  u64 flags;			/* kernel flags, little endian */
-  u64 res2	= 0;		/* reserved */
-  u64 res3	= 0;		/* reserved */
-  u64 res4	= 0;		/* reserved */
-  u32 magic	= 0x644d5241;	/* Magic number, little endian, "ARM\x64" */
-  u32 res5;			/* reserved (used for PE COFF offset) */
-
-
-Header notes:
-
-- As of v3.17, all fields are little endian unless stated otherwise.
-
-- code0/code1 are responsible for branching to stext.
-
-- when booting through EFI, code0/code1 are initially skipped.
-  res5 is an offset to the PE header and the PE header has the EFI
-  entry point (efi_stub_entry).  When the stub has done its work, it
-  jumps to code0 to resume the normal boot process.
-
-- Prior to v3.17, the endianness of text_offset was not specified.  In
-  these cases image_size is zero and text_offset is 0x80000 in the
-  endianness of the kernel.  Where image_size is non-zero image_size is
-  little-endian and must be respected.  Where image_size is zero,
-  text_offset can be assumed to be 0x80000.
-
-- The flags field (introduced in v3.17) is a little-endian 64-bit field
-  composed as follows:
-  Bit 0:	Kernel endianness.  1 if BE, 0 if LE.
-  Bit 1-2:	Kernel Page size.
-			0 - Unspecified.
-			1 - 4K
-			2 - 16K
-			3 - 64K
-  Bit 3:	Kernel physical placement
-			0 - 2MB aligned base should be as close as possible
-			    to the base of DRAM, since memory below it is not
-			    accessible via the linear mapping
-			1 - 2MB aligned base may be anywhere in physical
-			    memory
-  Bits 4-63:	Reserved.
-
-- When image_size is zero, a bootloader should attempt to keep as much
-  memory as possible free for use by the kernel immediately after the
-  end of the kernel image. The amount of space required will vary
-  depending on selected features, and is effectively unbound.
-
-The Image must be placed text_offset bytes from a 2MB aligned base
-address anywhere in usable system RAM and called there. The region
-between the 2 MB aligned base address and the start of the image has no
-special significance to the kernel, and may be used for other purposes.
-At least image_size bytes from the start of the image must be free for
-use by the kernel.
-NOTE: versions prior to v4.6 cannot make use of memory below the
-physical offset of the Image so it is recommended that the Image be
-placed as close as possible to the start of system RAM.
-
-If an initrd/initramfs is passed to the kernel at boot, it must reside
-entirely within a 1 GB aligned physical memory window of up to 32 GB in
-size that fully covers the kernel Image as well.
-
-Any memory described to the kernel (even that below the start of the
-image) which is not marked as reserved from the kernel (e.g., with a
-memreserve region in the device tree) will be considered as available to
-the kernel.
-
-Before jumping into the kernel, the following conditions must be met:
-
-- Quiesce all DMA capable devices so that memory does not get
-  corrupted by bogus network packets or disk data.  This will save
-  you many hours of debug.
-
-- Primary CPU general-purpose register settings
-  x0 = physical address of device tree blob (dtb) in system RAM.
-  x1 = 0 (reserved for future use)
-  x2 = 0 (reserved for future use)
-  x3 = 0 (reserved for future use)
-
-- CPU mode
-  All forms of interrupts must be masked in PSTATE.DAIF (Debug, SError,
-  IRQ and FIQ).
-  The CPU must be in either EL2 (RECOMMENDED in order to have access to
-  the virtualisation extensions) or non-secure EL1.
-
-- Caches, MMUs
-  The MMU must be off.
-  Instruction cache may be on or off.
-  The address range corresponding to the loaded kernel image must be
-  cleaned to the PoC. In the presence of a system cache or other
-  coherent masters with caches enabled, this will typically require
-  cache maintenance by VA rather than set/way operations.
-  System caches which respect the architected cache maintenance by VA
-  operations must be configured and may be enabled.
-  System caches which do not respect architected cache maintenance by VA
-  operations (not recommended) must be configured and disabled.
-
-- Architected timers
-  CNTFRQ must be programmed with the timer frequency and CNTVOFF must
-  be programmed with a consistent value on all CPUs.  If entering the
-  kernel at EL1, CNTHCTL_EL2 must have EL1PCTEN (bit 0) set where
-  available.
-
-- Coherency
-  All CPUs to be booted by the kernel must be part of the same coherency
-  domain on entry to the kernel.  This may require IMPLEMENTATION DEFINED
-  initialisation to enable the receiving of maintenance operations on
-  each CPU.
-
-- System registers
-  All writable architected system registers at the exception level where
-  the kernel image will be entered must be initialised by software at a
-  higher exception level to prevent execution in an UNKNOWN state.
-
-  - SCR_EL3.FIQ must have the same value across all CPUs the kernel is
-    executing on.
-  - The value of SCR_EL3.FIQ must be the same as the one present at boot
-    time whenever the kernel is executing.
-
-  For systems with a GICv3 interrupt controller to be used in v3 mode:
-  - If EL3 is present:
-    ICC_SRE_EL3.Enable (bit 3) must be initialiased to 0b1.
-    ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b1.
-  - If the kernel is entered at EL1:
-    ICC.SRE_EL2.Enable (bit 3) must be initialised to 0b1
-    ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b1.
-  - The DT or ACPI tables must describe a GICv3 interrupt controller.
-
-  For systems with a GICv3 interrupt controller to be used in
-  compatibility (v2) mode:
-  - If EL3 is present:
-    ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b0.
-  - If the kernel is entered at EL1:
-    ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b0.
-  - The DT or ACPI tables must describe a GICv2 interrupt controller.
-
-  For CPUs with pointer authentication functionality:
-  - If EL3 is present:
-    SCR_EL3.APK (bit 16) must be initialised to 0b1
-    SCR_EL3.API (bit 17) must be initialised to 0b1
-  - If the kernel is entered at EL1:
-    HCR_EL2.APK (bit 40) must be initialised to 0b1
-    HCR_EL2.API (bit 41) must be initialised to 0b1
-
-The requirements described above for CPU mode, caches, MMUs, architected
-timers, coherency and system registers apply to all CPUs.  All CPUs must
-enter the kernel in the same exception level.
-
-The boot loader is expected to enter the kernel on each CPU in the
-following manner:
-
-- The primary CPU must jump directly to the first instruction of the
-  kernel image.  The device tree blob passed by this CPU must contain
-  an 'enable-method' property for each cpu node.  The supported
-  enable-methods are described below.
-
-  It is expected that the bootloader will generate these device tree
-  properties and insert them into the blob prior to kernel entry.
-
-- CPUs with a "spin-table" enable-method must have a 'cpu-release-addr'
-  property in their cpu node.  This property identifies a
-  naturally-aligned 64-bit zero-initalised memory location.
-
-  These CPUs should spin outside of the kernel in a reserved area of
-  memory (communicated to the kernel by a /memreserve/ region in the
-  device tree) polling their cpu-release-addr location, which must be
-  contained in the reserved region.  A wfe instruction may be inserted
-  to reduce the overhead of the busy-loop and a sev will be issued by
-  the primary CPU.  When a read of the location pointed to by the
-  cpu-release-addr returns a non-zero value, the CPU must jump to this
-  value.  The value will be written as a single 64-bit little-endian
-  value, so CPUs must convert the read value to their native endianness
-  before jumping to it.
-
-- CPUs with a "psci" enable method should remain outside of
-  the kernel (i.e. outside of the regions of memory described to the
-  kernel in the memory node, or in a reserved area of memory described
-  to the kernel by a /memreserve/ region in the device tree).  The
-  kernel will issue CPU_ON calls as described in ARM document number ARM
-  DEN 0022A ("Power State Coordination Interface System Software on ARM
-  processors") to bring CPUs into the kernel.
-
-  The device tree should contain a 'psci' node, as described in
-  Documentation/devicetree/bindings/arm/psci.txt.
-
-- Secondary CPU general-purpose register settings
-  x0 = 0 (reserved for future use)
-  x1 = 0 (reserved for future use)
-  x2 = 0 (reserved for future use)
-  x3 = 0 (reserved for future use)
diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst
new file mode 100644
index 000000000000..2955287e9acc
--- /dev/null
+++ b/Documentation/arm64/cpu-feature-registers.rst
@@ -0,0 +1,304 @@
+===========================
+ARM64 CPU Feature Registers
+===========================
+
+Author: Suzuki K Poulose <suzuki.poulose@arm.com>
+
+
+This file describes the ABI for exporting the AArch64 CPU ID/feature
+registers to userspace. The availability of this ABI is advertised
+via the HWCAP_CPUID in HWCAPs.
+
+1. Motivation
+-------------
+
+The ARM architecture defines a set of feature registers, which describe
+the capabilities of the CPU/system. Access to these system registers is
+restricted from EL0 and there is no reliable way for an application to
+extract this information to make better decisions at runtime. There is
+limited information available to the application via HWCAPs, however
+there are some issues with their usage.
+
+ a) Any change to the HWCAPs requires an update to userspace (e.g libc)
+    to detect the new changes, which can take a long time to appear in
+    distributions. Exposing the registers allows applications to get the
+    information without requiring updates to the toolchains.
+
+ b) Access to HWCAPs is sometimes limited (e.g prior to libc, or
+    when ld is initialised at startup time).
+
+ c) HWCAPs cannot represent non-boolean information effectively. The
+    architecture defines a canonical format for representing features
+    in the ID registers; this is well defined and is capable of
+    representing all valid architecture variations.
+
+
+2. Requirements
+---------------
+
+ a) Safety:
+
+    Applications should be able to use the information provided by the
+    infrastructure to run safely across the system. This has greater
+    implications on a system with heterogeneous CPUs.
+    The infrastructure exports a value that is safe across all the
+    available CPU on the system.
+
+    e.g, If at least one CPU doesn't implement CRC32 instructions, while
+    others do, we should report that the CRC32 is not implemented.
+    Otherwise an application could crash when scheduled on the CPU
+    which doesn't support CRC32.
+
+ b) Security:
+
+    Applications should only be able to receive information that is
+    relevant to the normal operation in userspace. Hence, some of the
+    fields are masked out(i.e, made invisible) and their values are set to
+    indicate the feature is 'not supported'. See Section 4 for the list
+    of visible features. Also, the kernel may manipulate the fields
+    based on what it supports. e.g, If FP is not supported by the
+    kernel, the values could indicate that the FP is not available
+    (even when the CPU provides it).
+
+ c) Implementation Defined Features
+
+    The infrastructure doesn't expose any register which is
+    IMPLEMENTATION DEFINED as per ARMv8-A Architecture.
+
+ d) CPU Identification:
+
+    MIDR_EL1 is exposed to help identify the processor. On a
+    heterogeneous system, this could be racy (just like getcpu()). The
+    process could be migrated to another CPU by the time it uses the
+    register value, unless the CPU affinity is set. Hence, there is no
+    guarantee that the value reflects the processor that it is
+    currently executing on. The REVIDR is not exposed due to this
+    constraint, as REVIDR makes sense only in conjunction with the
+    MIDR. Alternately, MIDR_EL1 and REVIDR_EL1 are exposed via sysfs
+    at::
+
+	/sys/devices/system/cpu/cpu$ID/regs/identification/
+	                                              \- midr
+	                                              \- revidr
+
+3. Implementation
+--------------------
+
+The infrastructure is built on the emulation of the 'MRS' instruction.
+Accessing a restricted system register from an application generates an
+exception and ends up in SIGILL being delivered to the process.
+The infrastructure hooks into the exception handler and emulates the
+operation if the source belongs to the supported system register space.
+
+The infrastructure emulates only the following system register space::
+
+	Op0=3, Op1=0, CRn=0, CRm=0,4,5,6,7
+
+(See Table C5-6 'System instruction encodings for non-Debug System
+register accesses' in ARMv8 ARM DDI 0487A.h, for the list of
+registers).
+
+The following rules are applied to the value returned by the
+infrastructure:
+
+ a) The value of an 'IMPLEMENTATION DEFINED' field is set to 0.
+ b) The value of a reserved field is populated with the reserved
+    value as defined by the architecture.
+ c) The value of a 'visible' field holds the system wide safe value
+    for the particular feature (except for MIDR_EL1, see section 4).
+ d) All other fields (i.e, invisible fields) are set to indicate
+    the feature is missing (as defined by the architecture).
+
+4. List of registers with visible features
+-------------------------------------------
+
+  1) ID_AA64ISAR0_EL1 - Instruction Set Attribute Register 0
+
+     +------------------------------+---------+---------+
+     | Name                         |  bits   | visible |
+     +------------------------------+---------+---------+
+     | TS                           | [55-52] |    y    |
+     +------------------------------+---------+---------+
+     | FHM                          | [51-48] |    y    |
+     +------------------------------+---------+---------+
+     | DP                           | [47-44] |    y    |
+     +------------------------------+---------+---------+
+     | SM4                          | [43-40] |    y    |
+     +------------------------------+---------+---------+
+     | SM3                          | [39-36] |    y    |
+     +------------------------------+---------+---------+
+     | SHA3                         | [35-32] |    y    |
+     +------------------------------+---------+---------+
+     | RDM                          | [31-28] |    y    |
+     +------------------------------+---------+---------+
+     | ATOMICS                      | [23-20] |    y    |
+     +------------------------------+---------+---------+
+     | CRC32                        | [19-16] |    y    |
+     +------------------------------+---------+---------+
+     | SHA2                         | [15-12] |    y    |
+     +------------------------------+---------+---------+
+     | SHA1                         | [11-8]  |    y    |
+     +------------------------------+---------+---------+
+     | AES                          | [7-4]   |    y    |
+     +------------------------------+---------+---------+
+
+
+  2) ID_AA64PFR0_EL1 - Processor Feature Register 0
+
+     +------------------------------+---------+---------+
+     | Name                         |  bits   | visible |
+     +------------------------------+---------+---------+
+     | DIT                          | [51-48] |    y    |
+     +------------------------------+---------+---------+
+     | SVE                          | [35-32] |    y    |
+     +------------------------------+---------+---------+
+     | GIC                          | [27-24] |    n    |
+     +------------------------------+---------+---------+
+     | AdvSIMD                      | [23-20] |    y    |
+     +------------------------------+---------+---------+
+     | FP                           | [19-16] |    y    |
+     +------------------------------+---------+---------+
+     | EL3                          | [15-12] |    n    |
+     +------------------------------+---------+---------+
+     | EL2                          | [11-8]  |    n    |
+     +------------------------------+---------+---------+
+     | EL1                          | [7-4]   |    n    |
+     +------------------------------+---------+---------+
+     | EL0                          | [3-0]   |    n    |
+     +------------------------------+---------+---------+
+
+
+  3) MIDR_EL1 - Main ID Register
+
+     +------------------------------+---------+---------+
+     | Name                         |  bits   | visible |
+     +------------------------------+---------+---------+
+     | Implementer                  | [31-24] |    y    |
+     +------------------------------+---------+---------+
+     | Variant                      | [23-20] |    y    |
+     +------------------------------+---------+---------+
+     | Architecture                 | [19-16] |    y    |
+     +------------------------------+---------+---------+
+     | PartNum                      | [15-4]  |    y    |
+     +------------------------------+---------+---------+
+     | Revision                     | [3-0]   |    y    |
+     +------------------------------+---------+---------+
+
+   NOTE: The 'visible' fields of MIDR_EL1 will contain the value
+   as available on the CPU where it is fetched and is not a system
+   wide safe value.
+
+  4) ID_AA64ISAR1_EL1 - Instruction set attribute register 1
+
+     +------------------------------+---------+---------+
+     | Name                         |  bits   | visible |
+     +------------------------------+---------+---------+
+     | GPI                          | [31-28] |    y    |
+     +------------------------------+---------+---------+
+     | GPA                          | [27-24] |    y    |
+     +------------------------------+---------+---------+
+     | LRCPC                        | [23-20] |    y    |
+     +------------------------------+---------+---------+
+     | FCMA                         | [19-16] |    y    |
+     +------------------------------+---------+---------+
+     | JSCVT                        | [15-12] |    y    |
+     +------------------------------+---------+---------+
+     | API                          | [11-8]  |    y    |
+     +------------------------------+---------+---------+
+     | APA                          | [7-4]   |    y    |
+     +------------------------------+---------+---------+
+     | DPB                          | [3-0]   |    y    |
+     +------------------------------+---------+---------+
+
+  5) ID_AA64MMFR2_EL1 - Memory model feature register 2
+
+     +------------------------------+---------+---------+
+     | Name                         |  bits   | visible |
+     +------------------------------+---------+---------+
+     | AT                           | [35-32] |    y    |
+     +------------------------------+---------+---------+
+
+  6) ID_AA64ZFR0_EL1 - SVE feature ID register 0
+
+     +------------------------------+---------+---------+
+     | Name                         |  bits   | visible |
+     +------------------------------+---------+---------+
+     | SM4                          | [43-40] |    y    |
+     +------------------------------+---------+---------+
+     | SHA3                         | [35-32] |    y    |
+     +------------------------------+---------+---------+
+     | BitPerm                      | [19-16] |    y    |
+     +------------------------------+---------+---------+
+     | AES                          | [7-4]   |    y    |
+     +------------------------------+---------+---------+
+     | SVEVer                       | [3-0]   |    y    |
+     +------------------------------+---------+---------+
+
+Appendix I: Example
+-------------------
+
+::
+
+  /*
+   * Sample program to demonstrate the MRS emulation ABI.
+   *
+   * Copyright (C) 2015-2016, ARM Ltd
+   *
+   * Author: Suzuki K Poulose <suzuki.poulose@arm.com>
+   *
+   * This program is free software; you can redistribute it and/or modify
+   * it under the terms of the GNU General Public License version 2 as
+   * published by the Free Software Foundation.
+   *
+   * This program is distributed in the hope that it will be useful,
+   * but WITHOUT ANY WARRANTY; without even the implied warranty of
+   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   * GNU General Public License for more details.
+   * This program is free software; you can redistribute it and/or modify
+   * it under the terms of the GNU General Public License version 2 as
+   * published by the Free Software Foundation.
+   *
+   * This program is distributed in the hope that it will be useful,
+   * but WITHOUT ANY WARRANTY; without even the implied warranty of
+   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   * GNU General Public License for more details.
+   */
+
+  #include <asm/hwcap.h>
+  #include <stdio.h>
+  #include <sys/auxv.h>
+
+  #define get_cpu_ftr(id) ({					\
+		unsigned long __val;				\
+		asm("mrs %0, "#id : "=r" (__val));		\
+		printf("%-20s: 0x%016lx\n", #id, __val);	\
+	})
+
+  int main(void)
+  {
+
+	if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
+		fputs("CPUID registers unavailable\n", stderr);
+		return 1;
+	}
+
+	get_cpu_ftr(ID_AA64ISAR0_EL1);
+	get_cpu_ftr(ID_AA64ISAR1_EL1);
+	get_cpu_ftr(ID_AA64MMFR0_EL1);
+	get_cpu_ftr(ID_AA64MMFR1_EL1);
+	get_cpu_ftr(ID_AA64PFR0_EL1);
+	get_cpu_ftr(ID_AA64PFR1_EL1);
+	get_cpu_ftr(ID_AA64DFR0_EL1);
+	get_cpu_ftr(ID_AA64DFR1_EL1);
+
+	get_cpu_ftr(MIDR_EL1);
+	get_cpu_ftr(MPIDR_EL1);
+	get_cpu_ftr(REVIDR_EL1);
+
+  #if 0
+	/* Unexposed register access causes SIGILL */
+	get_cpu_ftr(ID_MMFR0_EL1);
+  #endif
+
+	return 0;
+  }
diff --git a/Documentation/arm64/cpu-feature-registers.txt b/Documentation/arm64/cpu-feature-registers.txt
deleted file mode 100644
index 684a0da39378..000000000000
--- a/Documentation/arm64/cpu-feature-registers.txt
+++ /dev/null
@@ -1,296 +0,0 @@
-		ARM64 CPU Feature Registers
-		===========================
-
-Author: Suzuki K Poulose <suzuki.poulose@arm.com>
-
-
-This file describes the ABI for exporting the AArch64 CPU ID/feature
-registers to userspace. The availability of this ABI is advertised
-via the HWCAP_CPUID in HWCAPs.
-
-1. Motivation
----------------
-
-The ARM architecture defines a set of feature registers, which describe
-the capabilities of the CPU/system. Access to these system registers is
-restricted from EL0 and there is no reliable way for an application to
-extract this information to make better decisions at runtime. There is
-limited information available to the application via HWCAPs, however
-there are some issues with their usage.
-
- a) Any change to the HWCAPs requires an update to userspace (e.g libc)
-    to detect the new changes, which can take a long time to appear in
-    distributions. Exposing the registers allows applications to get the
-    information without requiring updates to the toolchains.
-
- b) Access to HWCAPs is sometimes limited (e.g prior to libc, or
-    when ld is initialised at startup time).
-
- c) HWCAPs cannot represent non-boolean information effectively. The
-    architecture defines a canonical format for representing features
-    in the ID registers; this is well defined and is capable of
-    representing all valid architecture variations.
-
-
-2. Requirements
------------------
-
- a) Safety :
-    Applications should be able to use the information provided by the
-    infrastructure to run safely across the system. This has greater
-    implications on a system with heterogeneous CPUs.
-    The infrastructure exports a value that is safe across all the
-    available CPU on the system.
-
-    e.g, If at least one CPU doesn't implement CRC32 instructions, while
-    others do, we should report that the CRC32 is not implemented.
-    Otherwise an application could crash when scheduled on the CPU
-    which doesn't support CRC32.
-
- b) Security :
-    Applications should only be able to receive information that is
-    relevant to the normal operation in userspace. Hence, some of the
-    fields are masked out(i.e, made invisible) and their values are set to
-    indicate the feature is 'not supported'. See Section 4 for the list
-    of visible features. Also, the kernel may manipulate the fields
-    based on what it supports. e.g, If FP is not supported by the
-    kernel, the values could indicate that the FP is not available
-    (even when the CPU provides it).
-
- c) Implementation Defined Features
-    The infrastructure doesn't expose any register which is
-    IMPLEMENTATION DEFINED as per ARMv8-A Architecture.
-
- d) CPU Identification :
-    MIDR_EL1 is exposed to help identify the processor. On a
-    heterogeneous system, this could be racy (just like getcpu()). The
-    process could be migrated to another CPU by the time it uses the
-    register value, unless the CPU affinity is set. Hence, there is no
-    guarantee that the value reflects the processor that it is
-    currently executing on. The REVIDR is not exposed due to this
-    constraint, as REVIDR makes sense only in conjunction with the
-    MIDR. Alternately, MIDR_EL1 and REVIDR_EL1 are exposed via sysfs
-    at:
-
-	/sys/devices/system/cpu/cpu$ID/regs/identification/
-	                                              \- midr
-	                                              \- revidr
-
-3. Implementation
---------------------
-
-The infrastructure is built on the emulation of the 'MRS' instruction.
-Accessing a restricted system register from an application generates an
-exception and ends up in SIGILL being delivered to the process.
-The infrastructure hooks into the exception handler and emulates the
-operation if the source belongs to the supported system register space.
-
-The infrastructure emulates only the following system register space:
-	Op0=3, Op1=0, CRn=0, CRm=0,4,5,6,7
-
-(See Table C5-6 'System instruction encodings for non-Debug System
-register accesses' in ARMv8 ARM DDI 0487A.h, for the list of
-registers).
-
-The following rules are applied to the value returned by the
-infrastructure:
-
- a) The value of an 'IMPLEMENTATION DEFINED' field is set to 0.
- b) The value of a reserved field is populated with the reserved
-    value as defined by the architecture.
- c) The value of a 'visible' field holds the system wide safe value
-    for the particular feature (except for MIDR_EL1, see section 4).
- d) All other fields (i.e, invisible fields) are set to indicate
-    the feature is missing (as defined by the architecture).
-
-4. List of registers with visible features
--------------------------------------------
-
-  1) ID_AA64ISAR0_EL1 - Instruction Set Attribute Register 0
-     x--------------------------------------------------x
-     | Name                         |  bits   | visible |
-     |--------------------------------------------------|
-     | TS                           | [55-52] |    y    |
-     |--------------------------------------------------|
-     | FHM                          | [51-48] |    y    |
-     |--------------------------------------------------|
-     | DP                           | [47-44] |    y    |
-     |--------------------------------------------------|
-     | SM4                          | [43-40] |    y    |
-     |--------------------------------------------------|
-     | SM3                          | [39-36] |    y    |
-     |--------------------------------------------------|
-     | SHA3                         | [35-32] |    y    |
-     |--------------------------------------------------|
-     | RDM                          | [31-28] |    y    |
-     |--------------------------------------------------|
-     | ATOMICS                      | [23-20] |    y    |
-     |--------------------------------------------------|
-     | CRC32                        | [19-16] |    y    |
-     |--------------------------------------------------|
-     | SHA2                         | [15-12] |    y    |
-     |--------------------------------------------------|
-     | SHA1                         | [11-8]  |    y    |
-     |--------------------------------------------------|
-     | AES                          | [7-4]   |    y    |
-     x--------------------------------------------------x
-
-
-  2) ID_AA64PFR0_EL1 - Processor Feature Register 0
-     x--------------------------------------------------x
-     | Name                         |  bits   | visible |
-     |--------------------------------------------------|
-     | DIT                          | [51-48] |    y    |
-     |--------------------------------------------------|
-     | SVE                          | [35-32] |    y    |
-     |--------------------------------------------------|
-     | GIC                          | [27-24] |    n    |
-     |--------------------------------------------------|
-     | AdvSIMD                      | [23-20] |    y    |
-     |--------------------------------------------------|
-     | FP                           | [19-16] |    y    |
-     |--------------------------------------------------|
-     | EL3                          | [15-12] |    n    |
-     |--------------------------------------------------|
-     | EL2                          | [11-8]  |    n    |
-     |--------------------------------------------------|
-     | EL1                          | [7-4]   |    n    |
-     |--------------------------------------------------|
-     | EL0                          | [3-0]   |    n    |
-     x--------------------------------------------------x
-
-
-  3) MIDR_EL1 - Main ID Register
-     x--------------------------------------------------x
-     | Name                         |  bits   | visible |
-     |--------------------------------------------------|
-     | Implementer                  | [31-24] |    y    |
-     |--------------------------------------------------|
-     | Variant                      | [23-20] |    y    |
-     |--------------------------------------------------|
-     | Architecture                 | [19-16] |    y    |
-     |--------------------------------------------------|
-     | PartNum                      | [15-4]  |    y    |
-     |--------------------------------------------------|
-     | Revision                     | [3-0]   |    y    |
-     x--------------------------------------------------x
-
-   NOTE: The 'visible' fields of MIDR_EL1 will contain the value
-   as available on the CPU where it is fetched and is not a system
-   wide safe value.
-
-  4) ID_AA64ISAR1_EL1 - Instruction set attribute register 1
-
-     x--------------------------------------------------x
-     | Name                         |  bits   | visible |
-     |--------------------------------------------------|
-     | GPI                          | [31-28] |    y    |
-     |--------------------------------------------------|
-     | GPA                          | [27-24] |    y    |
-     |--------------------------------------------------|
-     | LRCPC                        | [23-20] |    y    |
-     |--------------------------------------------------|
-     | FCMA                         | [19-16] |    y    |
-     |--------------------------------------------------|
-     | JSCVT                        | [15-12] |    y    |
-     |--------------------------------------------------|
-     | API                          | [11-8]  |    y    |
-     |--------------------------------------------------|
-     | APA                          | [7-4]   |    y    |
-     |--------------------------------------------------|
-     | DPB                          | [3-0]   |    y    |
-     x--------------------------------------------------x
-
-  5) ID_AA64MMFR2_EL1 - Memory model feature register 2
-
-     x--------------------------------------------------x
-     | Name                         |  bits   | visible |
-     |--------------------------------------------------|
-     | AT                           | [35-32] |    y    |
-     x--------------------------------------------------x
-
-  6) ID_AA64ZFR0_EL1 - SVE feature ID register 0
-
-     x--------------------------------------------------x
-     | Name                         |  bits   | visible |
-     |--------------------------------------------------|
-     | SM4                          | [43-40] |    y    |
-     |--------------------------------------------------|
-     | SHA3                         | [35-32] |    y    |
-     |--------------------------------------------------|
-     | BitPerm                      | [19-16] |    y    |
-     |--------------------------------------------------|
-     | AES                          | [7-4]   |    y    |
-     |--------------------------------------------------|
-     | SVEVer                       | [3-0]   |    y    |
-     x--------------------------------------------------x
-
-Appendix I: Example
----------------------------
-
-/*
- * Sample program to demonstrate the MRS emulation ABI.
- *
- * Copyright (C) 2015-2016, ARM Ltd
- *
- * Author: Suzuki K Poulose <suzuki.poulose@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <asm/hwcap.h>
-#include <stdio.h>
-#include <sys/auxv.h>
-
-#define get_cpu_ftr(id) ({					\
-		unsigned long __val;				\
-		asm("mrs %0, "#id : "=r" (__val));		\
-		printf("%-20s: 0x%016lx\n", #id, __val);	\
-	})
-
-int main(void)
-{
-
-	if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
-		fputs("CPUID registers unavailable\n", stderr);
-		return 1;
-	}
-
-	get_cpu_ftr(ID_AA64ISAR0_EL1);
-	get_cpu_ftr(ID_AA64ISAR1_EL1);
-	get_cpu_ftr(ID_AA64MMFR0_EL1);
-	get_cpu_ftr(ID_AA64MMFR1_EL1);
-	get_cpu_ftr(ID_AA64PFR0_EL1);
-	get_cpu_ftr(ID_AA64PFR1_EL1);
-	get_cpu_ftr(ID_AA64DFR0_EL1);
-	get_cpu_ftr(ID_AA64DFR1_EL1);
-
-	get_cpu_ftr(MIDR_EL1);
-	get_cpu_ftr(MPIDR_EL1);
-	get_cpu_ftr(REVIDR_EL1);
-
-#if 0
-	/* Unexposed register access causes SIGILL */
-	get_cpu_ftr(ID_MMFR0_EL1);
-#endif
-
-	return 0;
-}
-
-
-
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
new file mode 100644
index 000000000000..91f79529c58c
--- /dev/null
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -0,0 +1,209 @@
+================
+ARM64 ELF hwcaps
+================
+
+This document describes the usage and semantics of the arm64 ELF hwcaps.
+
+
+1. Introduction
+---------------
+
+Some hardware or software features are only available on some CPU
+implementations, and/or with certain kernel configurations, but have no
+architected discovery mechanism available to userspace code at EL0. The
+kernel exposes the presence of these features to userspace through a set
+of flags called hwcaps, exposed in the auxilliary vector.
+
+Userspace software can test for features by acquiring the AT_HWCAP or
+AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant
+flags are set, e.g.::
+
+	bool floating_point_is_present(void)
+	{
+		unsigned long hwcaps = getauxval(AT_HWCAP);
+		if (hwcaps & HWCAP_FP)
+			return true;
+
+		return false;
+	}
+
+Where software relies on a feature described by a hwcap, it should check
+the relevant hwcap flag to verify that the feature is present before
+attempting to make use of the feature.
+
+Features cannot be probed reliably through other means. When a feature
+is not available, attempting to use it may result in unpredictable
+behaviour, and is not guaranteed to result in any reliable indication
+that the feature is unavailable, such as a SIGILL.
+
+
+2. Interpretation of hwcaps
+---------------------------
+
+The majority of hwcaps are intended to indicate the presence of features
+which are described by architected ID registers inaccessible to
+userspace code at EL0. These hwcaps are defined in terms of ID register
+fields, and should be interpreted with reference to the definition of
+these fields in the ARM Architecture Reference Manual (ARM ARM).
+
+Such hwcaps are described below in the form::
+
+    Functionality implied by idreg.field == val.
+
+Such hwcaps indicate the availability of functionality that the ARM ARM
+defines as being present when idreg.field has value val, but do not
+indicate that idreg.field is precisely equal to val, nor do they
+indicate the absence of functionality implied by other values of
+idreg.field.
+
+Other hwcaps may indicate the presence of features which cannot be
+described by ID registers alone. These may be described without
+reference to ID registers, and may refer to other documentation.
+
+
+3. The hwcaps exposed in AT_HWCAP
+---------------------------------
+
+HWCAP_FP
+    Functionality implied by ID_AA64PFR0_EL1.FP == 0b0000.
+
+HWCAP_ASIMD
+    Functionality implied by ID_AA64PFR0_EL1.AdvSIMD == 0b0000.
+
+HWCAP_EVTSTRM
+    The generic timer is configured to generate events at a frequency of
+    approximately 100KHz.
+
+HWCAP_AES
+    Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0001.
+
+HWCAP_PMULL
+    Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0010.
+
+HWCAP_SHA1
+    Functionality implied by ID_AA64ISAR0_EL1.SHA1 == 0b0001.
+
+HWCAP_SHA2
+    Functionality implied by ID_AA64ISAR0_EL1.SHA2 == 0b0001.
+
+HWCAP_CRC32
+    Functionality implied by ID_AA64ISAR0_EL1.CRC32 == 0b0001.
+
+HWCAP_ATOMICS
+    Functionality implied by ID_AA64ISAR0_EL1.Atomic == 0b0010.
+
+HWCAP_FPHP
+    Functionality implied by ID_AA64PFR0_EL1.FP == 0b0001.
+
+HWCAP_ASIMDHP
+    Functionality implied by ID_AA64PFR0_EL1.AdvSIMD == 0b0001.
+
+HWCAP_CPUID
+    EL0 access to certain ID registers is available, to the extent
+    described by Documentation/arm64/cpu-feature-registers.rst.
+
+    These ID registers may imply the availability of features.
+
+HWCAP_ASIMDRDM
+    Functionality implied by ID_AA64ISAR0_EL1.RDM == 0b0001.
+
+HWCAP_JSCVT
+    Functionality implied by ID_AA64ISAR1_EL1.JSCVT == 0b0001.
+
+HWCAP_FCMA
+    Functionality implied by ID_AA64ISAR1_EL1.FCMA == 0b0001.
+
+HWCAP_LRCPC
+    Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0001.
+
+HWCAP_DCPOP
+    Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001.
+
+HWCAP2_DCPODP
+
+    Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010.
+
+HWCAP_SHA3
+    Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001.
+
+HWCAP_SM3
+    Functionality implied by ID_AA64ISAR0_EL1.SM3 == 0b0001.
+
+HWCAP_SM4
+    Functionality implied by ID_AA64ISAR0_EL1.SM4 == 0b0001.
+
+HWCAP_ASIMDDP
+    Functionality implied by ID_AA64ISAR0_EL1.DP == 0b0001.
+
+HWCAP_SHA512
+    Functionality implied by ID_AA64ISAR0_EL1.SHA2 == 0b0010.
+
+HWCAP_SVE
+    Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001.
+
+HWCAP2_SVE2
+
+    Functionality implied by ID_AA64ZFR0_EL1.SVEVer == 0b0001.
+
+HWCAP2_SVEAES
+
+    Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0001.
+
+HWCAP2_SVEPMULL
+
+    Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0010.
+
+HWCAP2_SVEBITPERM
+
+    Functionality implied by ID_AA64ZFR0_EL1.BitPerm == 0b0001.
+
+HWCAP2_SVESHA3
+
+    Functionality implied by ID_AA64ZFR0_EL1.SHA3 == 0b0001.
+
+HWCAP2_SVESM4
+
+    Functionality implied by ID_AA64ZFR0_EL1.SM4 == 0b0001.
+
+HWCAP_ASIMDFHM
+   Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001.
+
+HWCAP_DIT
+    Functionality implied by ID_AA64PFR0_EL1.DIT == 0b0001.
+
+HWCAP_USCAT
+    Functionality implied by ID_AA64MMFR2_EL1.AT == 0b0001.
+
+HWCAP_ILRCPC
+    Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0010.
+
+HWCAP_FLAGM
+    Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0001.
+
+HWCAP2_FLAGM2
+
+    Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0010.
+
+HWCAP_SSBS
+    Functionality implied by ID_AA64PFR1_EL1.SSBS == 0b0010.
+
+HWCAP_PACA
+    Functionality implied by ID_AA64ISAR1_EL1.APA == 0b0001 or
+    ID_AA64ISAR1_EL1.API == 0b0001, as described by
+    Documentation/arm64/pointer-authentication.rst.
+
+HWCAP_PACG
+    Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or
+    ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
+    Documentation/arm64/pointer-authentication.rst.
+
+HWCAP2_FRINT
+
+    Functionality implied by ID_AA64ISAR1_EL1.FRINTTS == 0b0001.
+
+
+4. Unused AT_HWCAP bits
+-----------------------
+
+For interoperation with userspace, the kernel guarantees that bits 62
+and 63 of AT_HWCAP will always be returned as 0.
diff --git a/Documentation/arm64/elf_hwcaps.txt b/Documentation/arm64/elf_hwcaps.txt
deleted file mode 100644
index b73a2519ecf2..000000000000
--- a/Documentation/arm64/elf_hwcaps.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-ARM64 ELF hwcaps
-================
-
-This document describes the usage and semantics of the arm64 ELF hwcaps.
-
-
-1. Introduction
----------------
-
-Some hardware or software features are only available on some CPU
-implementations, and/or with certain kernel configurations, but have no
-architected discovery mechanism available to userspace code at EL0. The
-kernel exposes the presence of these features to userspace through a set
-of flags called hwcaps, exposed in the auxilliary vector.
-
-Userspace software can test for features by acquiring the AT_HWCAP or
-AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant
-flags are set, e.g.
-
-bool floating_point_is_present(void)
-{
-	unsigned long hwcaps = getauxval(AT_HWCAP);
-	if (hwcaps & HWCAP_FP)
-		return true;
-
-	return false;
-}
-
-Where software relies on a feature described by a hwcap, it should check
-the relevant hwcap flag to verify that the feature is present before
-attempting to make use of the feature.
-
-Features cannot be probed reliably through other means. When a feature
-is not available, attempting to use it may result in unpredictable
-behaviour, and is not guaranteed to result in any reliable indication
-that the feature is unavailable, such as a SIGILL.
-
-
-2. Interpretation of hwcaps
----------------------------
-
-The majority of hwcaps are intended to indicate the presence of features
-which are described by architected ID registers inaccessible to
-userspace code at EL0. These hwcaps are defined in terms of ID register
-fields, and should be interpreted with reference to the definition of
-these fields in the ARM Architecture Reference Manual (ARM ARM).
-
-Such hwcaps are described below in the form:
-
-    Functionality implied by idreg.field == val.
-
-Such hwcaps indicate the availability of functionality that the ARM ARM
-defines as being present when idreg.field has value val, but do not
-indicate that idreg.field is precisely equal to val, nor do they
-indicate the absence of functionality implied by other values of
-idreg.field.
-
-Other hwcaps may indicate the presence of features which cannot be
-described by ID registers alone. These may be described without
-reference to ID registers, and may refer to other documentation.
-
-
-3. The hwcaps exposed in AT_HWCAP
----------------------------------
-
-HWCAP_FP
-
-    Functionality implied by ID_AA64PFR0_EL1.FP == 0b0000.
-
-HWCAP_ASIMD
-
-    Functionality implied by ID_AA64PFR0_EL1.AdvSIMD == 0b0000.
-
-HWCAP_EVTSTRM
-
-    The generic timer is configured to generate events at a frequency of
-    approximately 100KHz.
-
-HWCAP_AES
-
-    Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0001.
-
-HWCAP_PMULL
-
-    Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0010.
-
-HWCAP_SHA1
-
-    Functionality implied by ID_AA64ISAR0_EL1.SHA1 == 0b0001.
-
-HWCAP_SHA2
-
-    Functionality implied by ID_AA64ISAR0_EL1.SHA2 == 0b0001.
-
-HWCAP_CRC32
-
-    Functionality implied by ID_AA64ISAR0_EL1.CRC32 == 0b0001.
-
-HWCAP_ATOMICS
-
-    Functionality implied by ID_AA64ISAR0_EL1.Atomic == 0b0010.
-
-HWCAP_FPHP
-
-    Functionality implied by ID_AA64PFR0_EL1.FP == 0b0001.
-
-HWCAP_ASIMDHP
-
-    Functionality implied by ID_AA64PFR0_EL1.AdvSIMD == 0b0001.
-
-HWCAP_CPUID
-
-    EL0 access to certain ID registers is available, to the extent
-    described by Documentation/arm64/cpu-feature-registers.txt.
-
-    These ID registers may imply the availability of features.
-
-HWCAP_ASIMDRDM
-
-    Functionality implied by ID_AA64ISAR0_EL1.RDM == 0b0001.
-
-HWCAP_JSCVT
-
-    Functionality implied by ID_AA64ISAR1_EL1.JSCVT == 0b0001.
-
-HWCAP_FCMA
-
-    Functionality implied by ID_AA64ISAR1_EL1.FCMA == 0b0001.
-
-HWCAP_LRCPC
-
-    Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0001.
-
-HWCAP_DCPOP
-
-    Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001.
-
-HWCAP2_DCPODP
-
-    Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010.
-
-HWCAP_SHA3
-
-    Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001.
-
-HWCAP_SM3
-
-    Functionality implied by ID_AA64ISAR0_EL1.SM3 == 0b0001.
-
-HWCAP_SM4
-
-    Functionality implied by ID_AA64ISAR0_EL1.SM4 == 0b0001.
-
-HWCAP_ASIMDDP
-
-    Functionality implied by ID_AA64ISAR0_EL1.DP == 0b0001.
-
-HWCAP_SHA512
-
-    Functionality implied by ID_AA64ISAR0_EL1.SHA2 == 0b0010.
-
-HWCAP_SVE
-
-    Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001.
-
-HWCAP2_SVE2
-
-    Functionality implied by ID_AA64ZFR0_EL1.SVEVer == 0b0001.
-
-HWCAP2_SVEAES
-
-    Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0001.
-
-HWCAP2_SVEPMULL
-
-    Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0010.
-
-HWCAP2_SVEBITPERM
-
-    Functionality implied by ID_AA64ZFR0_EL1.BitPerm == 0b0001.
-
-HWCAP2_SVESHA3
-
-    Functionality implied by ID_AA64ZFR0_EL1.SHA3 == 0b0001.
-
-HWCAP2_SVESM4
-
-    Functionality implied by ID_AA64ZFR0_EL1.SM4 == 0b0001.
-
-HWCAP_ASIMDFHM
-
-   Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001.
-
-HWCAP_DIT
-
-    Functionality implied by ID_AA64PFR0_EL1.DIT == 0b0001.
-
-HWCAP_USCAT
-
-    Functionality implied by ID_AA64MMFR2_EL1.AT == 0b0001.
-
-HWCAP_ILRCPC
-
-    Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0010.
-
-HWCAP_FLAGM
-
-    Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0001.
-
-HWCAP_SSBS
-
-    Functionality implied by ID_AA64PFR1_EL1.SSBS == 0b0010.
-
-HWCAP_PACA
-
-    Functionality implied by ID_AA64ISAR1_EL1.APA == 0b0001 or
-    ID_AA64ISAR1_EL1.API == 0b0001, as described by
-    Documentation/arm64/pointer-authentication.txt.
-
-HWCAP_PACG
-
-    Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or
-    ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
-    Documentation/arm64/pointer-authentication.txt.
-
-
-4. Unused AT_HWCAP bits
------------------------
-
-For interoperation with userspace, the kernel guarantees that bits 62
-and 63 of AT_HWCAP will always be returned as 0.
diff --git a/Documentation/arm64/hugetlbpage.rst b/Documentation/arm64/hugetlbpage.rst
new file mode 100644
index 000000000000..b44f939e5210
--- /dev/null
+++ b/Documentation/arm64/hugetlbpage.rst
@@ -0,0 +1,41 @@
+====================
+HugeTLBpage on ARM64
+====================
+
+Hugepage relies on making efficient use of TLBs to improve performance of
+address translations. The benefit depends on both -
+
+  - the size of hugepages
+  - size of entries supported by the TLBs
+
+The ARM64 port supports two flavours of hugepages.
+
+1) Block mappings at the pud/pmd level
+--------------------------------------
+
+These are regular hugepages where a pmd or a pud page table entry points to a
+block of memory. Regardless of the supported size of entries in TLB, block
+mappings reduce the depth of page table walk needed to translate hugepage
+addresses.
+
+2) Using the Contiguous bit
+---------------------------
+
+The architecture provides a contiguous bit in the translation table entries
+(D4.5.3, ARM DDI 0487C.a) that hints to the MMU to indicate that it is one of a
+contiguous set of entries that can be cached in a single TLB entry.
+
+The contiguous bit is used in Linux to increase the mapping size at the pmd and
+pte (last) level. The number of supported contiguous entries varies by page size
+and level of the page table.
+
+
+The following hugepage sizes are supported -
+
+  ====== ========   ====    ========    ===
+  -      CONT PTE    PMD    CONT PMD    PUD
+  ====== ========   ====    ========    ===
+  4K:         64K     2M         32M     1G
+  16K:         2M    32M          1G
+  64K:         2M   512M         16G
+  ====== ========   ====    ========    ===
diff --git a/Documentation/arm64/hugetlbpage.txt b/Documentation/arm64/hugetlbpage.txt
deleted file mode 100644
index cfae87dc653b..000000000000
--- a/Documentation/arm64/hugetlbpage.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-HugeTLBpage on ARM64
-====================
-
-Hugepage relies on making efficient use of TLBs to improve performance of
-address translations. The benefit depends on both -
-
-  - the size of hugepages
-  - size of entries supported by the TLBs
-
-The ARM64 port supports two flavours of hugepages.
-
-1) Block mappings at the pud/pmd level
---------------------------------------
-
-These are regular hugepages where a pmd or a pud page table entry points to a
-block of memory. Regardless of the supported size of entries in TLB, block
-mappings reduce the depth of page table walk needed to translate hugepage
-addresses.
-
-2) Using the Contiguous bit
----------------------------
-
-The architecture provides a contiguous bit in the translation table entries
-(D4.5.3, ARM DDI 0487C.a) that hints to the MMU to indicate that it is one of a
-contiguous set of entries that can be cached in a single TLB entry.
-
-The contiguous bit is used in Linux to increase the mapping size at the pmd and
-pte (last) level. The number of supported contiguous entries varies by page size
-and level of the page table.
-
-
-The following hugepage sizes are supported -
-
-         CONT PTE    PMD    CONT PMD    PUD
-         --------    ---    --------    ---
-  4K:         64K     2M         32M     1G
-  16K:         2M    32M          1G
-  64K:         2M   512M         16G
diff --git a/Documentation/arm64/index.rst b/Documentation/arm64/index.rst
new file mode 100644
index 000000000000..96b696ba4e6c
--- /dev/null
+++ b/Documentation/arm64/index.rst
@@ -0,0 +1,26 @@
+==================
+ARM64 Architecture
+==================
+
+.. toctree::
+    :maxdepth: 1
+
+    acpi_object_usage
+    arm-acpi
+    booting
+    cpu-feature-registers
+    elf_hwcaps
+    hugetlbpage
+    legacy_instructions
+    memory
+    pointer-authentication
+    silicon-errata
+    sve
+    tagged-pointers
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/arm64/legacy_instructions.rst b/Documentation/arm64/legacy_instructions.rst
new file mode 100644
index 000000000000..54401b22cb8f
--- /dev/null
+++ b/Documentation/arm64/legacy_instructions.rst
@@ -0,0 +1,68 @@
+===================
+Legacy instructions
+===================
+
+The arm64 port of the Linux kernel provides infrastructure to support
+emulation of instructions which have been deprecated, or obsoleted in
+the architecture. The infrastructure code uses undefined instruction
+hooks to support emulation. Where available it also allows turning on
+the instruction execution in hardware.
+
+The emulation mode can be controlled by writing to sysctl nodes
+(/proc/sys/abi). The following explains the different execution
+behaviours and the corresponding values of the sysctl nodes -
+
+* Undef
+    Value: 0
+
+  Generates undefined instruction abort. Default for instructions that
+  have been obsoleted in the architecture, e.g., SWP
+
+* Emulate
+    Value: 1
+
+  Uses software emulation. To aid migration of software, in this mode
+  usage of emulated instruction is traced as well as rate limited
+  warnings are issued. This is the default for deprecated
+  instructions, .e.g., CP15 barriers
+
+* Hardware Execution
+    Value: 2
+
+  Although marked as deprecated, some implementations may support the
+  enabling/disabling of hardware support for the execution of these
+  instructions. Using hardware execution generally provides better
+  performance, but at the loss of ability to gather runtime statistics
+  about the use of the deprecated instructions.
+
+The default mode depends on the status of the instruction in the
+architecture. Deprecated instructions should default to emulation
+while obsolete instructions must be undefined by default.
+
+Note: Instruction emulation may not be possible in all cases. See
+individual instruction notes for further information.
+
+Supported legacy instructions
+-----------------------------
+* SWP{B}
+
+:Node: /proc/sys/abi/swp
+:Status: Obsolete
+:Default: Undef (0)
+
+* CP15 Barriers
+
+:Node: /proc/sys/abi/cp15_barrier
+:Status: Deprecated
+:Default: Emulate (1)
+
+* SETEND
+
+:Node: /proc/sys/abi/setend
+:Status: Deprecated
+:Default: Emulate (1)*
+
+  Note: All the cpus on the system must have mixed endian support at EL0
+  for this feature to be enabled. If a new CPU - which doesn't support mixed
+  endian - is hotplugged in after this feature has been enabled, there could
+  be unexpected results in the application.
diff --git a/Documentation/arm64/legacy_instructions.txt b/Documentation/arm64/legacy_instructions.txt
deleted file mode 100644
index 01bf3d9fac85..000000000000
--- a/Documentation/arm64/legacy_instructions.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-The arm64 port of the Linux kernel provides infrastructure to support
-emulation of instructions which have been deprecated, or obsoleted in
-the architecture. The infrastructure code uses undefined instruction
-hooks to support emulation. Where available it also allows turning on
-the instruction execution in hardware.
-
-The emulation mode can be controlled by writing to sysctl nodes
-(/proc/sys/abi). The following explains the different execution
-behaviours and the corresponding values of the sysctl nodes -
-
-* Undef
-  Value: 0
-  Generates undefined instruction abort. Default for instructions that
-  have been obsoleted in the architecture, e.g., SWP
-
-* Emulate
-  Value: 1
-  Uses software emulation. To aid migration of software, in this mode
-  usage of emulated instruction is traced as well as rate limited
-  warnings are issued. This is the default for deprecated
-  instructions, .e.g., CP15 barriers
-
-* Hardware Execution
-  Value: 2
-  Although marked as deprecated, some implementations may support the
-  enabling/disabling of hardware support for the execution of these
-  instructions. Using hardware execution generally provides better
-  performance, but at the loss of ability to gather runtime statistics
-  about the use of the deprecated instructions.
-
-The default mode depends on the status of the instruction in the
-architecture. Deprecated instructions should default to emulation
-while obsolete instructions must be undefined by default.
-
-Note: Instruction emulation may not be possible in all cases. See
-individual instruction notes for further information.
-
-Supported legacy instructions
------------------------------
-* SWP{B}
-Node: /proc/sys/abi/swp
-Status: Obsolete
-Default: Undef (0)
-
-* CP15 Barriers
-Node: /proc/sys/abi/cp15_barrier
-Status: Deprecated
-Default: Emulate (1)
-
-* SETEND
-Node: /proc/sys/abi/setend
-Status: Deprecated
-Default: Emulate (1)*
-Note: All the cpus on the system must have mixed endian support at EL0
-for this feature to be enabled. If a new CPU - which doesn't support mixed
-endian - is hotplugged in after this feature has been enabled, there could
-be unexpected results in the application.
diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst
new file mode 100644
index 000000000000..464b880fc4b7
--- /dev/null
+++ b/Documentation/arm64/memory.rst
@@ -0,0 +1,98 @@
+==============================
+Memory Layout on AArch64 Linux
+==============================
+
+Author: Catalin Marinas <catalin.marinas@arm.com>
+
+This document describes the virtual memory layout used by the AArch64
+Linux kernel. The architecture allows up to 4 levels of translation
+tables with a 4KB page size and up to 3 levels with a 64KB page size.
+
+AArch64 Linux uses either 3 levels or 4 levels of translation tables
+with the 4KB page configuration, allowing 39-bit (512GB) or 48-bit
+(256TB) virtual addresses, respectively, for both user and kernel. With
+64KB pages, only 2 levels of translation tables, allowing 42-bit (4TB)
+virtual address, are used but the memory layout is the same.
+
+User addresses have bits 63:48 set to 0 while the kernel addresses have
+the same bits set to 1. TTBRx selection is given by bit 63 of the
+virtual address. The swapper_pg_dir contains only kernel (global)
+mappings while the user pgd contains only user (non-global) mappings.
+The swapper_pg_dir address is written to TTBR1 and never written to
+TTBR0.
+
+
+AArch64 Linux memory layout with 4KB pages + 3 levels::
+
+  Start			End			Size		Use
+  -----------------------------------------------------------------------
+  0000000000000000	0000007fffffffff	 512GB		user
+  ffffff8000000000	ffffffffffffffff	 512GB		kernel
+
+
+AArch64 Linux memory layout with 4KB pages + 4 levels::
+
+  Start			End			Size		Use
+  -----------------------------------------------------------------------
+  0000000000000000	0000ffffffffffff	 256TB		user
+  ffff000000000000	ffffffffffffffff	 256TB		kernel
+
+
+AArch64 Linux memory layout with 64KB pages + 2 levels::
+
+  Start			End			Size		Use
+  -----------------------------------------------------------------------
+  0000000000000000	000003ffffffffff	   4TB		user
+  fffffc0000000000	ffffffffffffffff	   4TB		kernel
+
+
+AArch64 Linux memory layout with 64KB pages + 3 levels::
+
+  Start			End			Size		Use
+  -----------------------------------------------------------------------
+  0000000000000000	0000ffffffffffff	 256TB		user
+  ffff000000000000	ffffffffffffffff	 256TB		kernel
+
+
+For details of the virtual kernel memory layout please see the kernel
+booting log.
+
+
+Translation table lookup with 4KB pages::
+
+  +--------+--------+--------+--------+--------+--------+--------+--------+
+  |63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
+  +--------+--------+--------+--------+--------+--------+--------+--------+
+   |                 |         |         |         |         |
+   |                 |         |         |         |         v
+   |                 |         |         |         |   [11:0]  in-page offset
+   |                 |         |         |         +-> [20:12] L3 index
+   |                 |         |         +-----------> [29:21] L2 index
+   |                 |         +---------------------> [38:30] L1 index
+   |                 +-------------------------------> [47:39] L0 index
+   +-------------------------------------------------> [63] TTBR0/1
+
+
+Translation table lookup with 64KB pages::
+
+  +--------+--------+--------+--------+--------+--------+--------+--------+
+  |63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
+  +--------+--------+--------+--------+--------+--------+--------+--------+
+   |                 |    |               |              |
+   |                 |    |               |              v
+   |                 |    |               |            [15:0]  in-page offset
+   |                 |    |               +----------> [28:16] L3 index
+   |                 |    +--------------------------> [41:29] L2 index
+   |                 +-------------------------------> [47:42] L1 index
+   +-------------------------------------------------> [63] TTBR0/1
+
+
+When using KVM without the Virtualization Host Extensions, the
+hypervisor maps kernel pages in EL2 at a fixed (and potentially
+random) offset from the linear mapping. See the kern_hyp_va macro and
+kvm_update_va_mask function for more details. MMIO devices such as
+GICv2 gets mapped next to the HYP idmap page, as do vectors when
+ARM64_HARDEN_EL2_VECTORS is selected for particular CPUs.
+
+When using KVM with the Virtualization Host Extensions, no additional
+mappings are created, since the host kernel runs directly in EL2.
diff --git a/Documentation/arm64/memory.txt b/Documentation/arm64/memory.txt
deleted file mode 100644
index c5dab30d3389..000000000000
--- a/Documentation/arm64/memory.txt
+++ /dev/null
@@ -1,97 +0,0 @@
-		     Memory Layout on AArch64 Linux
-		     ==============================
-
-Author: Catalin Marinas <catalin.marinas@arm.com>
-
-This document describes the virtual memory layout used by the AArch64
-Linux kernel. The architecture allows up to 4 levels of translation
-tables with a 4KB page size and up to 3 levels with a 64KB page size.
-
-AArch64 Linux uses either 3 levels or 4 levels of translation tables
-with the 4KB page configuration, allowing 39-bit (512GB) or 48-bit
-(256TB) virtual addresses, respectively, for both user and kernel. With
-64KB pages, only 2 levels of translation tables, allowing 42-bit (4TB)
-virtual address, are used but the memory layout is the same.
-
-User addresses have bits 63:48 set to 0 while the kernel addresses have
-the same bits set to 1. TTBRx selection is given by bit 63 of the
-virtual address. The swapper_pg_dir contains only kernel (global)
-mappings while the user pgd contains only user (non-global) mappings.
-The swapper_pg_dir address is written to TTBR1 and never written to
-TTBR0.
-
-
-AArch64 Linux memory layout with 4KB pages + 3 levels:
-
-Start			End			Size		Use
------------------------------------------------------------------------
-0000000000000000	0000007fffffffff	 512GB		user
-ffffff8000000000	ffffffffffffffff	 512GB		kernel
-
-
-AArch64 Linux memory layout with 4KB pages + 4 levels:
-
-Start			End			Size		Use
------------------------------------------------------------------------
-0000000000000000	0000ffffffffffff	 256TB		user
-ffff000000000000	ffffffffffffffff	 256TB		kernel
-
-
-AArch64 Linux memory layout with 64KB pages + 2 levels:
-
-Start			End			Size		Use
------------------------------------------------------------------------
-0000000000000000	000003ffffffffff	   4TB		user
-fffffc0000000000	ffffffffffffffff	   4TB		kernel
-
-
-AArch64 Linux memory layout with 64KB pages + 3 levels:
-
-Start			End			Size		Use
------------------------------------------------------------------------
-0000000000000000	0000ffffffffffff	 256TB		user
-ffff000000000000	ffffffffffffffff	 256TB		kernel
-
-
-For details of the virtual kernel memory layout please see the kernel
-booting log.
-
-
-Translation table lookup with 4KB pages:
-
-+--------+--------+--------+--------+--------+--------+--------+--------+
-|63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
-+--------+--------+--------+--------+--------+--------+--------+--------+
- |                 |         |         |         |         |
- |                 |         |         |         |         v
- |                 |         |         |         |   [11:0]  in-page offset
- |                 |         |         |         +-> [20:12] L3 index
- |                 |         |         +-----------> [29:21] L2 index
- |                 |         +---------------------> [38:30] L1 index
- |                 +-------------------------------> [47:39] L0 index
- +-------------------------------------------------> [63] TTBR0/1
-
-
-Translation table lookup with 64KB pages:
-
-+--------+--------+--------+--------+--------+--------+--------+--------+
-|63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
-+--------+--------+--------+--------+--------+--------+--------+--------+
- |                 |    |               |              |
- |                 |    |               |              v
- |                 |    |               |            [15:0]  in-page offset
- |                 |    |               +----------> [28:16] L3 index
- |                 |    +--------------------------> [41:29] L2 index
- |                 +-------------------------------> [47:42] L1 index
- +-------------------------------------------------> [63] TTBR0/1
-
-
-When using KVM without the Virtualization Host Extensions, the
-hypervisor maps kernel pages in EL2 at a fixed (and potentially
-random) offset from the linear mapping. See the kern_hyp_va macro and
-kvm_update_va_mask function for more details. MMIO devices such as
-GICv2 gets mapped next to the HYP idmap page, as do vectors when
-ARM64_HARDEN_EL2_VECTORS is selected for particular CPUs.
-
-When using KVM with the Virtualization Host Extensions, no additional
-mappings are created, since the host kernel runs directly in EL2.
diff --git a/Documentation/arm64/pointer-authentication.rst b/Documentation/arm64/pointer-authentication.rst
new file mode 100644
index 000000000000..30b2ab06526b
--- /dev/null
+++ b/Documentation/arm64/pointer-authentication.rst
@@ -0,0 +1,109 @@
+=======================================
+Pointer authentication in AArch64 Linux
+=======================================
+
+Author: Mark Rutland <mark.rutland@arm.com>
+
+Date: 2017-07-19
+
+This document briefly describes the provision of pointer authentication
+functionality in AArch64 Linux.
+
+
+Architecture overview
+---------------------
+
+The ARMv8.3 Pointer Authentication extension adds primitives that can be
+used to mitigate certain classes of attack where an attacker can corrupt
+the contents of some memory (e.g. the stack).
+
+The extension uses a Pointer Authentication Code (PAC) to determine
+whether pointers have been modified unexpectedly. A PAC is derived from
+a pointer, another value (such as the stack pointer), and a secret key
+held in system registers.
+
+The extension adds instructions to insert a valid PAC into a pointer,
+and to verify/remove the PAC from a pointer. The PAC occupies a number
+of high-order bits of the pointer, which varies dependent on the
+configured virtual address size and whether pointer tagging is in use.
+
+A subset of these instructions have been allocated from the HINT
+encoding space. In the absence of the extension (or when disabled),
+these instructions behave as NOPs. Applications and libraries using
+these instructions operate correctly regardless of the presence of the
+extension.
+
+The extension provides five separate keys to generate PACs - two for
+instruction addresses (APIAKey, APIBKey), two for data addresses
+(APDAKey, APDBKey), and one for generic authentication (APGAKey).
+
+
+Basic support
+-------------
+
+When CONFIG_ARM64_PTR_AUTH is selected, and relevant HW support is
+present, the kernel will assign random key values to each process at
+exec*() time. The keys are shared by all threads within the process, and
+are preserved across fork().
+
+Presence of address authentication functionality is advertised via
+HWCAP_PACA, and generic authentication functionality via HWCAP_PACG.
+
+The number of bits that the PAC occupies in a pointer is 55 minus the
+virtual address size configured by the kernel. For example, with a
+virtual address size of 48, the PAC is 7 bits wide.
+
+Recent versions of GCC can compile code with APIAKey-based return
+address protection when passed the -msign-return-address option. This
+uses instructions in the HINT space (unless -march=armv8.3-a or higher
+is also passed), and such code can run on systems without the pointer
+authentication extension.
+
+In addition to exec(), keys can also be reinitialized to random values
+using the PR_PAC_RESET_KEYS prctl. A bitmask of PR_PAC_APIAKEY,
+PR_PAC_APIBKEY, PR_PAC_APDAKEY, PR_PAC_APDBKEY and PR_PAC_APGAKEY
+specifies which keys are to be reinitialized; specifying 0 means "all
+keys".
+
+
+Debugging
+---------
+
+When CONFIG_ARM64_PTR_AUTH is selected, and HW support for address
+authentication is present, the kernel will expose the position of TTBR0
+PAC bits in the NT_ARM_PAC_MASK regset (struct user_pac_mask), which
+userspace can acquire via PTRACE_GETREGSET.
+
+The regset is exposed only when HWCAP_PACA is set. Separate masks are
+exposed for data pointers and instruction pointers, as the set of PAC
+bits can vary between the two. Note that the masks apply to TTBR0
+addresses, and are not valid to apply to TTBR1 addresses (e.g. kernel
+pointers).
+
+Additionally, when CONFIG_CHECKPOINT_RESTORE is also set, the kernel
+will expose the NT_ARM_PACA_KEYS and NT_ARM_PACG_KEYS regsets (struct
+user_pac_address_keys and struct user_pac_generic_keys). These can be
+used to get and set the keys for a thread.
+
+
+Virtualization
+--------------
+
+Pointer authentication is enabled in KVM guest when each virtual cpu is
+initialised by passing flags KVM_ARM_VCPU_PTRAUTH_[ADDRESS/GENERIC] and
+requesting these two separate cpu features to be enabled. The current KVM
+guest implementation works by enabling both features together, so both
+these userspace flags are checked before enabling pointer authentication.
+The separate userspace flag will allow to have no userspace ABI changes
+if support is added in the future to allow these two features to be
+enabled independently of one another.
+
+As Arm Architecture specifies that Pointer Authentication feature is
+implemented along with the VHE feature so KVM arm64 ptrauth code relies
+on VHE mode to be present.
+
+Additionally, when these vcpu feature flags are not set then KVM will
+filter out the Pointer Authentication system key registers from
+KVM_GET/SET_REG_* ioctls and mask those features from cpufeature ID
+register. Any attempt to use the Pointer Authentication instructions will
+result in an UNDEFINED exception being injected into the guest.
diff --git a/Documentation/arm64/pointer-authentication.txt b/Documentation/arm64/pointer-authentication.txt
deleted file mode 100644
index fc71b33de87e..000000000000
--- a/Documentation/arm64/pointer-authentication.txt
+++ /dev/null
@@ -1,107 +0,0 @@
-Pointer authentication in AArch64 Linux
-=======================================
-
-Author: Mark Rutland <mark.rutland@arm.com>
-Date: 2017-07-19
-
-This document briefly describes the provision of pointer authentication
-functionality in AArch64 Linux.
-
-
-Architecture overview
----------------------
-
-The ARMv8.3 Pointer Authentication extension adds primitives that can be
-used to mitigate certain classes of attack where an attacker can corrupt
-the contents of some memory (e.g. the stack).
-
-The extension uses a Pointer Authentication Code (PAC) to determine
-whether pointers have been modified unexpectedly. A PAC is derived from
-a pointer, another value (such as the stack pointer), and a secret key
-held in system registers.
-
-The extension adds instructions to insert a valid PAC into a pointer,
-and to verify/remove the PAC from a pointer. The PAC occupies a number
-of high-order bits of the pointer, which varies dependent on the
-configured virtual address size and whether pointer tagging is in use.
-
-A subset of these instructions have been allocated from the HINT
-encoding space. In the absence of the extension (or when disabled),
-these instructions behave as NOPs. Applications and libraries using
-these instructions operate correctly regardless of the presence of the
-extension.
-
-The extension provides five separate keys to generate PACs - two for
-instruction addresses (APIAKey, APIBKey), two for data addresses
-(APDAKey, APDBKey), and one for generic authentication (APGAKey).
-
-
-Basic support
--------------
-
-When CONFIG_ARM64_PTR_AUTH is selected, and relevant HW support is
-present, the kernel will assign random key values to each process at
-exec*() time. The keys are shared by all threads within the process, and
-are preserved across fork().
-
-Presence of address authentication functionality is advertised via
-HWCAP_PACA, and generic authentication functionality via HWCAP_PACG.
-
-The number of bits that the PAC occupies in a pointer is 55 minus the
-virtual address size configured by the kernel. For example, with a
-virtual address size of 48, the PAC is 7 bits wide.
-
-Recent versions of GCC can compile code with APIAKey-based return
-address protection when passed the -msign-return-address option. This
-uses instructions in the HINT space (unless -march=armv8.3-a or higher
-is also passed), and such code can run on systems without the pointer
-authentication extension.
-
-In addition to exec(), keys can also be reinitialized to random values
-using the PR_PAC_RESET_KEYS prctl. A bitmask of PR_PAC_APIAKEY,
-PR_PAC_APIBKEY, PR_PAC_APDAKEY, PR_PAC_APDBKEY and PR_PAC_APGAKEY
-specifies which keys are to be reinitialized; specifying 0 means "all
-keys".
-
-
-Debugging
----------
-
-When CONFIG_ARM64_PTR_AUTH is selected, and HW support for address
-authentication is present, the kernel will expose the position of TTBR0
-PAC bits in the NT_ARM_PAC_MASK regset (struct user_pac_mask), which
-userspace can acquire via PTRACE_GETREGSET.
-
-The regset is exposed only when HWCAP_PACA is set. Separate masks are
-exposed for data pointers and instruction pointers, as the set of PAC
-bits can vary between the two. Note that the masks apply to TTBR0
-addresses, and are not valid to apply to TTBR1 addresses (e.g. kernel
-pointers).
-
-Additionally, when CONFIG_CHECKPOINT_RESTORE is also set, the kernel
-will expose the NT_ARM_PACA_KEYS and NT_ARM_PACG_KEYS regsets (struct
-user_pac_address_keys and struct user_pac_generic_keys). These can be
-used to get and set the keys for a thread.
-
-
-Virtualization
---------------
-
-Pointer authentication is enabled in KVM guest when each virtual cpu is
-initialised by passing flags KVM_ARM_VCPU_PTRAUTH_[ADDRESS/GENERIC] and
-requesting these two separate cpu features to be enabled. The current KVM
-guest implementation works by enabling both features together, so both
-these userspace flags are checked before enabling pointer authentication.
-The separate userspace flag will allow to have no userspace ABI changes
-if support is added in the future to allow these two features to be
-enabled independently of one another.
-
-As Arm Architecture specifies that Pointer Authentication feature is
-implemented along with the VHE feature so KVM arm64 ptrauth code relies
-on VHE mode to be present.
-
-Additionally, when these vcpu feature flags are not set then KVM will
-filter out the Pointer Authentication system key registers from
-KVM_GET/SET_REG_* ioctls and mask those features from cpufeature ID
-register. Any attempt to use the Pointer Authentication instructions will
-result in an UNDEFINED exception being injected into the guest.
diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
new file mode 100644
index 000000000000..3e57d09246e6
--- /dev/null
+++ b/Documentation/arm64/silicon-errata.rst
@@ -0,0 +1,133 @@
+=======================================
+Silicon Errata and Software Workarounds
+=======================================
+
+Author: Will Deacon <will.deacon@arm.com>
+
+Date  : 27 November 2015
+
+It is an unfortunate fact of life that hardware is often produced with
+so-called "errata", which can cause it to deviate from the architecture
+under specific circumstances.  For hardware produced by ARM, these
+errata are broadly classified into the following categories:
+
+  ==========  ========================================================
+  Category A  A critical error without a viable workaround.
+  Category B  A significant or critical error with an acceptable
+              workaround.
+  Category C  A minor error that is not expected to occur under normal
+              operation.
+  ==========  ========================================================
+
+For more information, consult one of the "Software Developers Errata
+Notice" documents available on infocenter.arm.com (registration
+required).
+
+As far as Linux is concerned, Category B errata may require some special
+treatment in the operating system. For example, avoiding a particular
+sequence of code, or configuring the processor in a particular way. A
+less common situation may require similar actions in order to declassify
+a Category A erratum into a Category C erratum. These are collectively
+known as "software workarounds" and are only required in the minority of
+cases (e.g. those cases that both require a non-secure workaround *and*
+can be triggered by Linux).
+
+For software workarounds that may adversely impact systems unaffected by
+the erratum in question, a Kconfig entry is added under "Kernel
+Features" -> "ARM errata workarounds via the alternatives framework".
+These are enabled by default and patched in at runtime when an affected
+CPU is detected. For less-intrusive workarounds, a Kconfig option is not
+available and the code is structured (preferably with a comment) in such
+a way that the erratum will not be hit.
+
+This approach can make it slightly onerous to determine exactly which
+errata are worked around in an arbitrary kernel source tree, so this
+file acts as a registry of software workarounds in the Linux Kernel and
+will be updated when new workarounds are committed and backported to
+stable kernels.
+
++----------------+-----------------+-----------------+-----------------------------+
+| Implementor    | Component       | Erratum ID      | Kconfig                     |
++================+=================+=================+=============================+
+| Allwinner      | A64/R18         | UNKNOWN1        | SUN50I_ERRATUM_UNKNOWN1     |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A53      | #826319         | ARM64_ERRATUM_826319        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A53      | #827319         | ARM64_ERRATUM_827319        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A53      | #824069         | ARM64_ERRATUM_824069        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A53      | #819472         | ARM64_ERRATUM_819472        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A53      | #845719         | ARM64_ERRATUM_845719        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A53      | #843419         | ARM64_ERRATUM_843419        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A57      | #832075         | ARM64_ERRATUM_832075        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A57      | #852523         | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A57      | #834220         | ARM64_ERRATUM_834220        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A72      | #853709         | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A73      | #858921         | ARM64_ERRATUM_858921        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A55      | #1024718        | ARM64_ERRATUM_1024718       |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A76      | #1188873,1418040| ARM64_ERRATUM_1418040       |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A76      | #1165522        | ARM64_ERRATUM_1165522       |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A76      | #1286807        | ARM64_ERRATUM_1286807       |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A76      | #1463225        | ARM64_ERRATUM_1463225       |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Neoverse-N1     | #1188873,1418040| ARM64_ERRATUM_1418040       |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Neoverse-N1     | #1349291        | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | MMU-500         | #841119,826419  | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX ITS    | #22375,24313    | CAVIUM_ERRATUM_22375        |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX ITS    | #23144          | CAVIUM_ERRATUM_23144        |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX GICv3  | #23154          | CAVIUM_ERRATUM_23154        |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX Core   | #27456          | CAVIUM_ERRATUM_27456        |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX Core   | #30115          | CAVIUM_ERRATUM_30115        |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX SMMUv2 | #27704          | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX2 SMMUv3| #74             | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX2 SMMUv3| #126            | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Freescale/NXP  | LS2080A/LS1043A | A-008585        | FSL_ERRATUM_A008585         |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Hisilicon      | Hip0{5,6,7}     | #161010101      | HISILICON_ERRATUM_161010101 |
++----------------+-----------------+-----------------+-----------------------------+
+| Hisilicon      | Hip0{6,7}       | #161010701      | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| Hisilicon      | Hip07           | #161600802      | HISILICON_ERRATUM_161600802 |
++----------------+-----------------+-----------------+-----------------------------+
+| Hisilicon      | Hip08 SMMU PMCG | #162001800      | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Qualcomm Tech. | Kryo/Falkor v1  | E1003           | QCOM_FALKOR_ERRATUM_1003    |
++----------------+-----------------+-----------------+-----------------------------+
+| Qualcomm Tech. | Falkor v1       | E1009           | QCOM_FALKOR_ERRATUM_1009    |
++----------------+-----------------+-----------------+-----------------------------+
+| Qualcomm Tech. | QDF2400 ITS     | E0065           | QCOM_QDF2400_ERRATUM_0065   |
++----------------+-----------------+-----------------+-----------------------------+
+| Qualcomm Tech. | Falkor v{1,2}   | E1041           | QCOM_FALKOR_ERRATUM_1041    |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Fujitsu        | A64FX           | E#010001        | FUJITSU_ERRATUM_010001      |
++----------------+-----------------+-----------------+-----------------------------+
diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt
deleted file mode 100644
index 68d9b74fd751..000000000000
--- a/Documentation/arm64/silicon-errata.txt
+++ /dev/null
@@ -1,87 +0,0 @@
-                Silicon Errata and Software Workarounds
-                =======================================
-
-Author: Will Deacon <will.deacon@arm.com>
-Date  : 27 November 2015
-
-It is an unfortunate fact of life that hardware is often produced with
-so-called "errata", which can cause it to deviate from the architecture
-under specific circumstances.  For hardware produced by ARM, these
-errata are broadly classified into the following categories:
-
-  Category A: A critical error without a viable workaround.
-  Category B: A significant or critical error with an acceptable
-              workaround.
-  Category C: A minor error that is not expected to occur under normal
-              operation.
-
-For more information, consult one of the "Software Developers Errata
-Notice" documents available on infocenter.arm.com (registration
-required).
-
-As far as Linux is concerned, Category B errata may require some special
-treatment in the operating system. For example, avoiding a particular
-sequence of code, or configuring the processor in a particular way. A
-less common situation may require similar actions in order to declassify
-a Category A erratum into a Category C erratum. These are collectively
-known as "software workarounds" and are only required in the minority of
-cases (e.g. those cases that both require a non-secure workaround *and*
-can be triggered by Linux).
-
-For software workarounds that may adversely impact systems unaffected by
-the erratum in question, a Kconfig entry is added under "Kernel
-Features" -> "ARM errata workarounds via the alternatives framework".
-These are enabled by default and patched in at runtime when an affected
-CPU is detected. For less-intrusive workarounds, a Kconfig option is not
-available and the code is structured (preferably with a comment) in such
-a way that the erratum will not be hit.
-
-This approach can make it slightly onerous to determine exactly which
-errata are worked around in an arbitrary kernel source tree, so this
-file acts as a registry of software workarounds in the Linux Kernel and
-will be updated when new workarounds are committed and backported to
-stable kernels.
-
-| Implementor    | Component       | Erratum ID      | Kconfig                     |
-+----------------+-----------------+-----------------+-----------------------------+
-| Allwinner      | A64/R18         | UNKNOWN1        | SUN50I_ERRATUM_UNKNOWN1     |
-|                |                 |                 |                             |
-| ARM            | Cortex-A53      | #826319         | ARM64_ERRATUM_826319        |
-| ARM            | Cortex-A53      | #827319         | ARM64_ERRATUM_827319        |
-| ARM            | Cortex-A53      | #824069         | ARM64_ERRATUM_824069        |
-| ARM            | Cortex-A53      | #819472         | ARM64_ERRATUM_819472        |
-| ARM            | Cortex-A53      | #845719         | ARM64_ERRATUM_845719        |
-| ARM            | Cortex-A53      | #843419         | ARM64_ERRATUM_843419        |
-| ARM            | Cortex-A57      | #832075         | ARM64_ERRATUM_832075        |
-| ARM            | Cortex-A57      | #852523         | N/A                         |
-| ARM            | Cortex-A57      | #834220         | ARM64_ERRATUM_834220        |
-| ARM            | Cortex-A72      | #853709         | N/A                         |
-| ARM            | Cortex-A73      | #858921         | ARM64_ERRATUM_858921        |
-| ARM            | Cortex-A55      | #1024718        | ARM64_ERRATUM_1024718       |
-| ARM            | Cortex-A76      | #1188873        | ARM64_ERRATUM_1188873       |
-| ARM            | Cortex-A76      | #1165522        | ARM64_ERRATUM_1165522       |
-| ARM            | Cortex-A76      | #1286807        | ARM64_ERRATUM_1286807       |
-| ARM            | Neoverse-N1     | #1188873        | ARM64_ERRATUM_1188873       |
-| ARM            | MMU-500         | #841119,#826419 | N/A                         |
-|                |                 |                 |                             |
-| Cavium         | ThunderX ITS    | #22375, #24313  | CAVIUM_ERRATUM_22375        |
-| Cavium         | ThunderX ITS    | #23144          | CAVIUM_ERRATUM_23144        |
-| Cavium         | ThunderX GICv3  | #23154          | CAVIUM_ERRATUM_23154        |
-| Cavium         | ThunderX Core   | #27456          | CAVIUM_ERRATUM_27456        |
-| Cavium         | ThunderX Core   | #30115          | CAVIUM_ERRATUM_30115        |
-| Cavium         | ThunderX SMMUv2 | #27704          | N/A                         |
-| Cavium         | ThunderX2 SMMUv3| #74             | N/A                         |
-| Cavium         | ThunderX2 SMMUv3| #126            | N/A                         |
-|                |                 |                 |                             |
-| Freescale/NXP  | LS2080A/LS1043A | A-008585        | FSL_ERRATUM_A008585         |
-|                |                 |                 |                             |
-| Hisilicon      | Hip0{5,6,7}     | #161010101      | HISILICON_ERRATUM_161010101 |
-| Hisilicon      | Hip0{6,7}       | #161010701      | N/A                         |
-| Hisilicon      | Hip07           | #161600802      | HISILICON_ERRATUM_161600802 |
-| Hisilicon      | Hip08 SMMU PMCG | #162001800      | N/A                         |
-|                |                 |                 |                             |
-| Qualcomm Tech. | Kryo/Falkor v1  | E1003           | QCOM_FALKOR_ERRATUM_1003    |
-| Qualcomm Tech. | Falkor v1       | E1009           | QCOM_FALKOR_ERRATUM_1009    |
-| Qualcomm Tech. | QDF2400 ITS     | E0065           | QCOM_QDF2400_ERRATUM_0065   |
-| Qualcomm Tech. | Falkor v{1,2}   | E1041           | QCOM_FALKOR_ERRATUM_1041    |
-| Fujitsu        | A64FX           | E#010001        | FUJITSU_ERRATUM_010001      |
diff --git a/Documentation/arm64/sve.rst b/Documentation/arm64/sve.rst
new file mode 100644
index 000000000000..5689c74c8082
--- /dev/null
+++ b/Documentation/arm64/sve.rst
@@ -0,0 +1,545 @@
+===================================================
+Scalable Vector Extension support for AArch64 Linux
+===================================================
+
+Author: Dave Martin <Dave.Martin@arm.com>
+
+Date:   4 August 2017
+
+This document outlines briefly the interface provided to userspace by Linux in
+order to support use of the ARM Scalable Vector Extension (SVE).
+
+This is an outline of the most important features and issues only and not
+intended to be exhaustive.
+
+This document does not aim to describe the SVE architecture or programmer's
+model.  To aid understanding, a minimal description of relevant programmer's
+model features for SVE is included in Appendix A.
+
+
+1.  General
+-----------
+
+* SVE registers Z0..Z31, P0..P15 and FFR and the current vector length VL, are
+  tracked per-thread.
+
+* The presence of SVE is reported to userspace via HWCAP_SVE in the aux vector
+  AT_HWCAP entry.  Presence of this flag implies the presence of the SVE
+  instructions and registers, and the Linux-specific system interfaces
+  described in this document.  SVE is reported in /proc/cpuinfo as "sve".
+
+* Support for the execution of SVE instructions in userspace can also be
+  detected by reading the CPU ID register ID_AA64PFR0_EL1 using an MRS
+  instruction, and checking that the value of the SVE field is nonzero. [3]
+
+  It does not guarantee the presence of the system interfaces described in the
+  following sections: software that needs to verify that those interfaces are
+  present must check for HWCAP_SVE instead.
+
+* On hardware that supports the SVE2 extensions, HWCAP2_SVE2 will also
+  be reported in the AT_HWCAP2 aux vector entry.  In addition to this,
+  optional extensions to SVE2 may be reported by the presence of:
+
+	HWCAP2_SVE2
+	HWCAP2_SVEAES
+	HWCAP2_SVEPMULL
+	HWCAP2_SVEBITPERM
+	HWCAP2_SVESHA3
+	HWCAP2_SVESM4
+
+  This list may be extended over time as the SVE architecture evolves.
+
+  These extensions are also reported via the CPU ID register ID_AA64ZFR0_EL1,
+  which userspace can read using an MRS instruction.  See elf_hwcaps.txt and
+  cpu-feature-registers.txt for details.
+
+* Debuggers should restrict themselves to interacting with the target via the
+  NT_ARM_SVE regset.  The recommended way of detecting support for this regset
+  is to connect to a target process first and then attempt a
+  ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov).
+
+* Whenever SVE scalable register values (Zn, Pn, FFR) are exchanged in memory
+  between userspace and the kernel, the register value is encoded in memory in
+  an endianness-invariant layout, with bits [(8 * i + 7) : (8 * i)] encoded at
+  byte offset i from the start of the memory representation.  This affects for
+  example the signal frame (struct sve_context) and ptrace interface
+  (struct user_sve_header) and associated data.
+
+  Beware that on big-endian systems this results in a different byte order than
+  for the FPSIMD V-registers, which are stored as single host-endian 128-bit
+  values, with bits [(127 - 8 * i) : (120 - 8 * i)] of the register encoded at
+  byte offset i.  (struct fpsimd_context, struct user_fpsimd_state).
+
+
+2.  Vector length terminology
+-----------------------------
+
+The size of an SVE vector (Z) register is referred to as the "vector length".
+
+To avoid confusion about the units used to express vector length, the kernel
+adopts the following conventions:
+
+* Vector length (VL) = size of a Z-register in bytes
+
+* Vector quadwords (VQ) = size of a Z-register in units of 128 bits
+
+(So, VL = 16 * VQ.)
+
+The VQ convention is used where the underlying granularity is important, such
+as in data structure definitions.  In most other situations, the VL convention
+is used.  This is consistent with the meaning of the "VL" pseudo-register in
+the SVE instruction set architecture.
+
+
+3.  System call behaviour
+-------------------------
+
+* On syscall, V0..V31 are preserved (as without SVE).  Thus, bits [127:0] of
+  Z0..Z31 are preserved.  All other bits of Z0..Z31, and all of P0..P15 and FFR
+  become unspecified on return from a syscall.
+
+* The SVE registers are not used to pass arguments to or receive results from
+  any syscall.
+
+* In practice the affected registers/bits will be preserved or will be replaced
+  with zeros on return from a syscall, but userspace should not make
+  assumptions about this.  The kernel behaviour may vary on a case-by-case
+  basis.
+
+* All other SVE state of a thread, including the currently configured vector
+  length, the state of the PR_SVE_VL_INHERIT flag, and the deferred vector
+  length (if any), is preserved across all syscalls, subject to the specific
+  exceptions for execve() described in section 6.
+
+  In particular, on return from a fork() or clone(), the parent and new child
+  process or thread share identical SVE configuration, matching that of the
+  parent before the call.
+
+
+4.  Signal handling
+-------------------
+
+* A new signal frame record sve_context encodes the SVE registers on signal
+  delivery. [1]
+
+* This record is supplementary to fpsimd_context.  The FPSR and FPCR registers
+  are only present in fpsimd_context.  For convenience, the content of V0..V31
+  is duplicated between sve_context and fpsimd_context.
+
+* The signal frame record for SVE always contains basic metadata, in particular
+  the thread's vector length (in sve_context.vl).
+
+* The SVE registers may or may not be included in the record, depending on
+  whether the registers are live for the thread.  The registers are present if
+  and only if:
+  sve_context.head.size >= SVE_SIG_CONTEXT_SIZE(sve_vq_from_vl(sve_context.vl)).
+
+* If the registers are present, the remainder of the record has a vl-dependent
+  size and layout.  Macros SVE_SIG_* are defined [1] to facilitate access to
+  the members.
+
+* Each scalable register (Zn, Pn, FFR) is stored in an endianness-invariant
+  layout, with bits [(8 * i + 7) : (8 * i)] stored at byte offset i from the
+  start of the register's representation in memory.
+
+* If the SVE context is too big to fit in sigcontext.__reserved[], then extra
+  space is allocated on the stack, an extra_context record is written in
+  __reserved[] referencing this space.  sve_context is then written in the
+  extra space.  Refer to [1] for further details about this mechanism.
+
+
+5.  Signal return
+-----------------
+
+When returning from a signal handler:
+
+* If there is no sve_context record in the signal frame, or if the record is
+  present but contains no register data as desribed in the previous section,
+  then the SVE registers/bits become non-live and take unspecified values.
+
+* If sve_context is present in the signal frame and contains full register
+  data, the SVE registers become live and are populated with the specified
+  data.  However, for backward compatibility reasons, bits [127:0] of Z0..Z31
+  are always restored from the corresponding members of fpsimd_context.vregs[]
+  and not from sve_context.  The remaining bits are restored from sve_context.
+
+* Inclusion of fpsimd_context in the signal frame remains mandatory,
+  irrespective of whether sve_context is present or not.
+
+* The vector length cannot be changed via signal return.  If sve_context.vl in
+  the signal frame does not match the current vector length, the signal return
+  attempt is treated as illegal, resulting in a forced SIGSEGV.
+
+
+6.  prctl extensions
+--------------------
+
+Some new prctl() calls are added to allow programs to manage the SVE vector
+length:
+
+prctl(PR_SVE_SET_VL, unsigned long arg)
+
+    Sets the vector length of the calling thread and related flags, where
+    arg == vl | flags.  Other threads of the calling process are unaffected.
+
+    vl is the desired vector length, where sve_vl_valid(vl) must be true.
+
+    flags:
+
+	PR_SVE_SET_VL_INHERIT
+
+	    Inherit the current vector length across execve().  Otherwise, the
+	    vector length is reset to the system default at execve().  (See
+	    Section 9.)
+
+	PR_SVE_SET_VL_ONEXEC
+
+	    Defer the requested vector length change until the next execve()
+	    performed by this thread.
+
+	    The effect is equivalent to implicit exceution of the following
+	    call immediately after the next execve() (if any) by the thread:
+
+		prctl(PR_SVE_SET_VL, arg & ~PR_SVE_SET_VL_ONEXEC)
+
+	    This allows launching of a new program with a different vector
+	    length, while avoiding runtime side effects in the caller.
+
+
+	    Without PR_SVE_SET_VL_ONEXEC, the requested change takes effect
+	    immediately.
+
+
+    Return value: a nonnegative on success, or a negative value on error:
+	EINVAL: SVE not supported, invalid vector length requested, or
+	    invalid flags.
+
+
+    On success:
+
+    * Either the calling thread's vector length or the deferred vector length
+      to be applied at the next execve() by the thread (dependent on whether
+      PR_SVE_SET_VL_ONEXEC is present in arg), is set to the largest value
+      supported by the system that is less than or equal to vl.  If vl ==
+      SVE_VL_MAX, the value set will be the largest value supported by the
+      system.
+
+    * Any previously outstanding deferred vector length change in the calling
+      thread is cancelled.
+
+    * The returned value describes the resulting configuration, encoded as for
+      PR_SVE_GET_VL.  The vector length reported in this value is the new
+      current vector length for this thread if PR_SVE_SET_VL_ONEXEC was not
+      present in arg; otherwise, the reported vector length is the deferred
+      vector length that will be applied at the next execve() by the calling
+      thread.
+
+    * Changing the vector length causes all of P0..P15, FFR and all bits of
+      Z0..Z31 except for Z0 bits [127:0] .. Z31 bits [127:0] to become
+      unspecified.  Calling PR_SVE_SET_VL with vl equal to the thread's current
+      vector length, or calling PR_SVE_SET_VL with the PR_SVE_SET_VL_ONEXEC
+      flag, does not constitute a change to the vector length for this purpose.
+
+
+prctl(PR_SVE_GET_VL)
+
+    Gets the vector length of the calling thread.
+
+    The following flag may be OR-ed into the result:
+
+	PR_SVE_SET_VL_INHERIT
+
+	    Vector length will be inherited across execve().
+
+    There is no way to determine whether there is an outstanding deferred
+    vector length change (which would only normally be the case between a
+    fork() or vfork() and the corresponding execve() in typical use).
+
+    To extract the vector length from the result, and it with
+    PR_SVE_VL_LEN_MASK.
+
+    Return value: a nonnegative value on success, or a negative value on error:
+	EINVAL: SVE not supported.
+
+
+7.  ptrace extensions
+---------------------
+
+* A new regset NT_ARM_SVE is defined for use with PTRACE_GETREGSET and
+  PTRACE_SETREGSET.
+
+  Refer to [2] for definitions.
+
+The regset data starts with struct user_sve_header, containing:
+
+    size
+
+	Size of the complete regset, in bytes.
+	This depends on vl and possibly on other things in the future.
+
+	If a call to PTRACE_GETREGSET requests less data than the value of
+	size, the caller can allocate a larger buffer and retry in order to
+	read the complete regset.
+
+    max_size
+
+	Maximum size in bytes that the regset can grow to for the target
+	thread.  The regset won't grow bigger than this even if the target
+	thread changes its vector length etc.
+
+    vl
+
+	Target thread's current vector length, in bytes.
+
+    max_vl
+
+	Maximum possible vector length for the target thread.
+
+    flags
+
+	either
+
+	    SVE_PT_REGS_FPSIMD
+
+		SVE registers are not live (GETREGSET) or are to be made
+		non-live (SETREGSET).
+
+		The payload is of type struct user_fpsimd_state, with the same
+		meaning as for NT_PRFPREG, starting at offset
+		SVE_PT_FPSIMD_OFFSET from the start of user_sve_header.
+
+		Extra data might be appended in the future: the size of the
+		payload should be obtained using SVE_PT_FPSIMD_SIZE(vq, flags).
+
+		vq should be obtained using sve_vq_from_vl(vl).
+
+		or
+
+	    SVE_PT_REGS_SVE
+
+		SVE registers are live (GETREGSET) or are to be made live
+		(SETREGSET).
+
+		The payload contains the SVE register data, starting at offset
+		SVE_PT_SVE_OFFSET from the start of user_sve_header, and with
+		size SVE_PT_SVE_SIZE(vq, flags);
+
+	... OR-ed with zero or more of the following flags, which have the same
+	meaning and behaviour as the corresponding PR_SET_VL_* flags:
+
+	    SVE_PT_VL_INHERIT
+
+	    SVE_PT_VL_ONEXEC (SETREGSET only).
+
+* The effects of changing the vector length and/or flags are equivalent to
+  those documented for PR_SVE_SET_VL.
+
+  The caller must make a further GETREGSET call if it needs to know what VL is
+  actually set by SETREGSET, unless is it known in advance that the requested
+  VL is supported.
+
+* In the SVE_PT_REGS_SVE case, the size and layout of the payload depends on
+  the header fields.  The SVE_PT_SVE_*() macros are provided to facilitate
+  access to the members.
+
+* In either case, for SETREGSET it is permissible to omit the payload, in which
+  case only the vector length and flags are changed (along with any
+  consequences of those changes).
+
+* For SETREGSET, if an SVE_PT_REGS_SVE payload is present and the
+  requested VL is not supported, the effect will be the same as if the
+  payload were omitted, except that an EIO error is reported.  No
+  attempt is made to translate the payload data to the correct layout
+  for the vector length actually set.  The thread's FPSIMD state is
+  preserved, but the remaining bits of the SVE registers become
+  unspecified.  It is up to the caller to translate the payload layout
+  for the actual VL and retry.
+
+* The effect of writing a partial, incomplete payload is unspecified.
+
+
+8.  ELF coredump extensions
+---------------------------
+
+* A NT_ARM_SVE note will be added to each coredump for each thread of the
+  dumped process.  The contents will be equivalent to the data that would have
+  been read if a PTRACE_GETREGSET of NT_ARM_SVE were executed for each thread
+  when the coredump was generated.
+
+
+9.  System runtime configuration
+--------------------------------
+
+* To mitigate the ABI impact of expansion of the signal frame, a policy
+  mechanism is provided for administrators, distro maintainers and developers
+  to set the default vector length for userspace processes:
+
+/proc/sys/abi/sve_default_vector_length
+
+    Writing the text representation of an integer to this file sets the system
+    default vector length to the specified value, unless the value is greater
+    than the maximum vector length supported by the system in which case the
+    default vector length is set to that maximum.
+
+    The result can be determined by reopening the file and reading its
+    contents.
+
+    At boot, the default vector length is initially set to 64 or the maximum
+    supported vector length, whichever is smaller.  This determines the initial
+    vector length of the init process (PID 1).
+
+    Reading this file returns the current system default vector length.
+
+* At every execve() call, the new vector length of the new process is set to
+  the system default vector length, unless
+
+    * PR_SVE_SET_VL_INHERIT (or equivalently SVE_PT_VL_INHERIT) is set for the
+      calling thread, or
+
+    * a deferred vector length change is pending, established via the
+      PR_SVE_SET_VL_ONEXEC flag (or SVE_PT_VL_ONEXEC).
+
+* Modifying the system default vector length does not affect the vector length
+  of any existing process or thread that does not make an execve() call.
+
+
+Appendix A.  SVE programmer's model (informative)
+=================================================
+
+This section provides a minimal description of the additions made by SVE to the
+ARMv8-A programmer's model that are relevant to this document.
+
+Note: This section is for information only and not intended to be complete or
+to replace any architectural specification.
+
+A.1.  Registers
+---------------
+
+In A64 state, SVE adds the following:
+
+* 32 8VL-bit vector registers Z0..Z31
+  For each Zn, Zn bits [127:0] alias the ARMv8-A vector register Vn.
+
+  A register write using a Vn register name zeros all bits of the corresponding
+  Zn except for bits [127:0].
+
+* 16 VL-bit predicate registers P0..P15
+
+* 1 VL-bit special-purpose predicate register FFR (the "first-fault register")
+
+* a VL "pseudo-register" that determines the size of each vector register
+
+  The SVE instruction set architecture provides no way to write VL directly.
+  Instead, it can be modified only by EL1 and above, by writing appropriate
+  system registers.
+
+* The value of VL can be configured at runtime by EL1 and above:
+  16 <= VL <= VLmax, where VL must be a multiple of 16.
+
+* The maximum vector length is determined by the hardware:
+  16 <= VLmax <= 256.
+
+  (The SVE architecture specifies 256, but permits future architecture
+  revisions to raise this limit.)
+
+* FPSR and FPCR are retained from ARMv8-A, and interact with SVE floating-point
+  operations in a similar way to the way in which they interact with ARMv8
+  floating-point operations::
+
+         8VL-1                       128               0  bit index
+        +----          ////            -----------------+
+     Z0 |                               :       V0      |
+      :                                          :
+     Z7 |                               :       V7      |
+     Z8 |                               :     * V8      |
+      :                                       :  :
+    Z15 |                               :     *V15      |
+    Z16 |                               :      V16      |
+      :                                          :
+    Z31 |                               :      V31      |
+        +----          ////            -----------------+
+                                                 31    0
+         VL-1                  0                +-------+
+        +----       ////      --+          FPSR |       |
+     P0 |                       |               +-------+
+      : |                       |         *FPCR |       |
+    P15 |                       |               +-------+
+        +----       ////      --+
+    FFR |                       |               +-----+
+        +----       ////      --+            VL |     |
+                                                +-----+
+
+(*) callee-save:
+    This only applies to bits [63:0] of Z-/V-registers.
+    FPCR contains callee-save and caller-save bits.  See [4] for details.
+
+
+A.2.  Procedure call standard
+-----------------------------
+
+The ARMv8-A base procedure call standard is extended as follows with respect to
+the additional SVE register state:
+
+* All SVE register bits that are not shared with FP/SIMD are caller-save.
+
+* Z8 bits [63:0] .. Z15 bits [63:0] are callee-save.
+
+  This follows from the way these bits are mapped to V8..V15, which are caller-
+  save in the base procedure call standard.
+
+
+Appendix B.  ARMv8-A FP/SIMD programmer's model
+===============================================
+
+Note: This section is for information only and not intended to be complete or
+to replace any architectural specification.
+
+Refer to [4] for for more information.
+
+ARMv8-A defines the following floating-point / SIMD register state:
+
+* 32 128-bit vector registers V0..V31
+* 2 32-bit status/control registers FPSR, FPCR
+
+::
+
+         127           0  bit index
+        +---------------+
+     V0 |               |
+      : :               :
+     V7 |               |
+   * V8 |               |
+   :  : :               :
+   *V15 |               |
+    V16 |               |
+      : :               :
+    V31 |               |
+        +---------------+
+
+                 31    0
+                +-------+
+           FPSR |       |
+                +-------+
+          *FPCR |       |
+                +-------+
+
+(*) callee-save:
+    This only applies to bits [63:0] of V-registers.
+    FPCR contains a mixture of callee-save and caller-save bits.
+
+
+References
+==========
+
+[1] arch/arm64/include/uapi/asm/sigcontext.h
+    AArch64 Linux signal ABI definitions
+
+[2] arch/arm64/include/uapi/asm/ptrace.h
+    AArch64 Linux ptrace ABI definitions
+
+[3] Documentation/arm64/cpu-feature-registers.rst
+
+[4] ARM IHI0055C
+    http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055c/IHI0055C_beta_aapcs64.pdf
+    http://infocenter.arm.com/help/topic/com.arm.doc.subset.swdev.abi/index.html
+    Procedure Call Standard for the ARM 64-bit Architecture (AArch64)
diff --git a/Documentation/arm64/sve.txt b/Documentation/arm64/sve.txt
deleted file mode 100644
index 9940e924a47e..000000000000
--- a/Documentation/arm64/sve.txt
+++ /dev/null
@@ -1,525 +0,0 @@
-            Scalable Vector Extension support for AArch64 Linux
-            ===================================================
-
-Author: Dave Martin <Dave.Martin@arm.com>
-Date:   4 August 2017
-
-This document outlines briefly the interface provided to userspace by Linux in
-order to support use of the ARM Scalable Vector Extension (SVE).
-
-This is an outline of the most important features and issues only and not
-intended to be exhaustive.
-
-This document does not aim to describe the SVE architecture or programmer's
-model.  To aid understanding, a minimal description of relevant programmer's
-model features for SVE is included in Appendix A.
-
-
-1.  General
------------
-
-* SVE registers Z0..Z31, P0..P15 and FFR and the current vector length VL, are
-  tracked per-thread.
-
-* The presence of SVE is reported to userspace via HWCAP_SVE in the aux vector
-  AT_HWCAP entry.  Presence of this flag implies the presence of the SVE
-  instructions and registers, and the Linux-specific system interfaces
-  described in this document.  SVE is reported in /proc/cpuinfo as "sve".
-
-* Support for the execution of SVE instructions in userspace can also be
-  detected by reading the CPU ID register ID_AA64PFR0_EL1 using an MRS
-  instruction, and checking that the value of the SVE field is nonzero. [3]
-
-  It does not guarantee the presence of the system interfaces described in the
-  following sections: software that needs to verify that those interfaces are
-  present must check for HWCAP_SVE instead.
-
-* On hardware that supports the SVE2 extensions, HWCAP2_SVE2 will also
-  be reported in the AT_HWCAP2 aux vector entry.  In addition to this,
-  optional extensions to SVE2 may be reported by the presence of:
-
-	HWCAP2_SVE2
-	HWCAP2_SVEAES
-	HWCAP2_SVEPMULL
-	HWCAP2_SVEBITPERM
-	HWCAP2_SVESHA3
-	HWCAP2_SVESM4
-
-  This list may be extended over time as the SVE architecture evolves.
-
-  These extensions are also reported via the CPU ID register ID_AA64ZFR0_EL1,
-  which userspace can read using an MRS instruction.  See elf_hwcaps.txt and
-  cpu-feature-registers.txt for details.
-
-* Debuggers should restrict themselves to interacting with the target via the
-  NT_ARM_SVE regset.  The recommended way of detecting support for this regset
-  is to connect to a target process first and then attempt a
-  ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov).
-
-
-2.  Vector length terminology
------------------------------
-
-The size of an SVE vector (Z) register is referred to as the "vector length".
-
-To avoid confusion about the units used to express vector length, the kernel
-adopts the following conventions:
-
-* Vector length (VL) = size of a Z-register in bytes
-
-* Vector quadwords (VQ) = size of a Z-register in units of 128 bits
-
-(So, VL = 16 * VQ.)
-
-The VQ convention is used where the underlying granularity is important, such
-as in data structure definitions.  In most other situations, the VL convention
-is used.  This is consistent with the meaning of the "VL" pseudo-register in
-the SVE instruction set architecture.
-
-
-3.  System call behaviour
--------------------------
-
-* On syscall, V0..V31 are preserved (as without SVE).  Thus, bits [127:0] of
-  Z0..Z31 are preserved.  All other bits of Z0..Z31, and all of P0..P15 and FFR
-  become unspecified on return from a syscall.
-
-* The SVE registers are not used to pass arguments to or receive results from
-  any syscall.
-
-* In practice the affected registers/bits will be preserved or will be replaced
-  with zeros on return from a syscall, but userspace should not make
-  assumptions about this.  The kernel behaviour may vary on a case-by-case
-  basis.
-
-* All other SVE state of a thread, including the currently configured vector
-  length, the state of the PR_SVE_VL_INHERIT flag, and the deferred vector
-  length (if any), is preserved across all syscalls, subject to the specific
-  exceptions for execve() described in section 6.
-
-  In particular, on return from a fork() or clone(), the parent and new child
-  process or thread share identical SVE configuration, matching that of the
-  parent before the call.
-
-
-4.  Signal handling
--------------------
-
-* A new signal frame record sve_context encodes the SVE registers on signal
-  delivery. [1]
-
-* This record is supplementary to fpsimd_context.  The FPSR and FPCR registers
-  are only present in fpsimd_context.  For convenience, the content of V0..V31
-  is duplicated between sve_context and fpsimd_context.
-
-* The signal frame record for SVE always contains basic metadata, in particular
-  the thread's vector length (in sve_context.vl).
-
-* The SVE registers may or may not be included in the record, depending on
-  whether the registers are live for the thread.  The registers are present if
-  and only if:
-  sve_context.head.size >= SVE_SIG_CONTEXT_SIZE(sve_vq_from_vl(sve_context.vl)).
-
-* If the registers are present, the remainder of the record has a vl-dependent
-  size and layout.  Macros SVE_SIG_* are defined [1] to facilitate access to
-  the members.
-
-* If the SVE context is too big to fit in sigcontext.__reserved[], then extra
-  space is allocated on the stack, an extra_context record is written in
-  __reserved[] referencing this space.  sve_context is then written in the
-  extra space.  Refer to [1] for further details about this mechanism.
-
-
-5.  Signal return
------------------
-
-When returning from a signal handler:
-
-* If there is no sve_context record in the signal frame, or if the record is
-  present but contains no register data as desribed in the previous section,
-  then the SVE registers/bits become non-live and take unspecified values.
-
-* If sve_context is present in the signal frame and contains full register
-  data, the SVE registers become live and are populated with the specified
-  data.  However, for backward compatibility reasons, bits [127:0] of Z0..Z31
-  are always restored from the corresponding members of fpsimd_context.vregs[]
-  and not from sve_context.  The remaining bits are restored from sve_context.
-
-* Inclusion of fpsimd_context in the signal frame remains mandatory,
-  irrespective of whether sve_context is present or not.
-
-* The vector length cannot be changed via signal return.  If sve_context.vl in
-  the signal frame does not match the current vector length, the signal return
-  attempt is treated as illegal, resulting in a forced SIGSEGV.
-
-
-6.  prctl extensions
---------------------
-
-Some new prctl() calls are added to allow programs to manage the SVE vector
-length:
-
-prctl(PR_SVE_SET_VL, unsigned long arg)
-
-    Sets the vector length of the calling thread and related flags, where
-    arg == vl | flags.  Other threads of the calling process are unaffected.
-
-    vl is the desired vector length, where sve_vl_valid(vl) must be true.
-
-    flags:
-
-	PR_SVE_SET_VL_INHERIT
-
-	    Inherit the current vector length across execve().  Otherwise, the
-	    vector length is reset to the system default at execve().  (See
-	    Section 9.)
-
-	PR_SVE_SET_VL_ONEXEC
-
-	    Defer the requested vector length change until the next execve()
-	    performed by this thread.
-
-	    The effect is equivalent to implicit exceution of the following
-	    call immediately after the next execve() (if any) by the thread:
-
-		prctl(PR_SVE_SET_VL, arg & ~PR_SVE_SET_VL_ONEXEC)
-
-	    This allows launching of a new program with a different vector
-	    length, while avoiding runtime side effects in the caller.
-
-
-	    Without PR_SVE_SET_VL_ONEXEC, the requested change takes effect
-	    immediately.
-
-
-    Return value: a nonnegative on success, or a negative value on error:
-	EINVAL: SVE not supported, invalid vector length requested, or
-	    invalid flags.
-
-
-    On success:
-
-    * Either the calling thread's vector length or the deferred vector length
-      to be applied at the next execve() by the thread (dependent on whether
-      PR_SVE_SET_VL_ONEXEC is present in arg), is set to the largest value
-      supported by the system that is less than or equal to vl.  If vl ==
-      SVE_VL_MAX, the value set will be the largest value supported by the
-      system.
-
-    * Any previously outstanding deferred vector length change in the calling
-      thread is cancelled.
-
-    * The returned value describes the resulting configuration, encoded as for
-      PR_SVE_GET_VL.  The vector length reported in this value is the new
-      current vector length for this thread if PR_SVE_SET_VL_ONEXEC was not
-      present in arg; otherwise, the reported vector length is the deferred
-      vector length that will be applied at the next execve() by the calling
-      thread.
-
-    * Changing the vector length causes all of P0..P15, FFR and all bits of
-      Z0..Z31 except for Z0 bits [127:0] .. Z31 bits [127:0] to become
-      unspecified.  Calling PR_SVE_SET_VL with vl equal to the thread's current
-      vector length, or calling PR_SVE_SET_VL with the PR_SVE_SET_VL_ONEXEC
-      flag, does not constitute a change to the vector length for this purpose.
-
-
-prctl(PR_SVE_GET_VL)
-
-    Gets the vector length of the calling thread.
-
-    The following flag may be OR-ed into the result:
-
-	PR_SVE_SET_VL_INHERIT
-
-	    Vector length will be inherited across execve().
-
-    There is no way to determine whether there is an outstanding deferred
-    vector length change (which would only normally be the case between a
-    fork() or vfork() and the corresponding execve() in typical use).
-
-    To extract the vector length from the result, and it with
-    PR_SVE_VL_LEN_MASK.
-
-    Return value: a nonnegative value on success, or a negative value on error:
-	EINVAL: SVE not supported.
-
-
-7.  ptrace extensions
----------------------
-
-* A new regset NT_ARM_SVE is defined for use with PTRACE_GETREGSET and
-  PTRACE_SETREGSET.
-
-  Refer to [2] for definitions.
-
-The regset data starts with struct user_sve_header, containing:
-
-    size
-
-	Size of the complete regset, in bytes.
-	This depends on vl and possibly on other things in the future.
-
-	If a call to PTRACE_GETREGSET requests less data than the value of
-	size, the caller can allocate a larger buffer and retry in order to
-	read the complete regset.
-
-    max_size
-
-	Maximum size in bytes that the regset can grow to for the target
-	thread.  The regset won't grow bigger than this even if the target
-	thread changes its vector length etc.
-
-    vl
-
-	Target thread's current vector length, in bytes.
-
-    max_vl
-
-	Maximum possible vector length for the target thread.
-
-    flags
-
-	either
-
-	    SVE_PT_REGS_FPSIMD
-
-		SVE registers are not live (GETREGSET) or are to be made
-		non-live (SETREGSET).
-
-		The payload is of type struct user_fpsimd_state, with the same
-		meaning as for NT_PRFPREG, starting at offset
-		SVE_PT_FPSIMD_OFFSET from the start of user_sve_header.
-
-		Extra data might be appended in the future: the size of the
-		payload should be obtained using SVE_PT_FPSIMD_SIZE(vq, flags).
-
-		vq should be obtained using sve_vq_from_vl(vl).
-
-		or
-
-	    SVE_PT_REGS_SVE
-
-		SVE registers are live (GETREGSET) or are to be made live
-		(SETREGSET).
-
-		The payload contains the SVE register data, starting at offset
-		SVE_PT_SVE_OFFSET from the start of user_sve_header, and with
-		size SVE_PT_SVE_SIZE(vq, flags);
-
-	... OR-ed with zero or more of the following flags, which have the same
-	meaning and behaviour as the corresponding PR_SET_VL_* flags:
-
-	    SVE_PT_VL_INHERIT
-
-	    SVE_PT_VL_ONEXEC (SETREGSET only).
-
-* The effects of changing the vector length and/or flags are equivalent to
-  those documented for PR_SVE_SET_VL.
-
-  The caller must make a further GETREGSET call if it needs to know what VL is
-  actually set by SETREGSET, unless is it known in advance that the requested
-  VL is supported.
-
-* In the SVE_PT_REGS_SVE case, the size and layout of the payload depends on
-  the header fields.  The SVE_PT_SVE_*() macros are provided to facilitate
-  access to the members.
-
-* In either case, for SETREGSET it is permissible to omit the payload, in which
-  case only the vector length and flags are changed (along with any
-  consequences of those changes).
-
-* For SETREGSET, if an SVE_PT_REGS_SVE payload is present and the
-  requested VL is not supported, the effect will be the same as if the
-  payload were omitted, except that an EIO error is reported.  No
-  attempt is made to translate the payload data to the correct layout
-  for the vector length actually set.  The thread's FPSIMD state is
-  preserved, but the remaining bits of the SVE registers become
-  unspecified.  It is up to the caller to translate the payload layout
-  for the actual VL and retry.
-
-* The effect of writing a partial, incomplete payload is unspecified.
-
-
-8.  ELF coredump extensions
----------------------------
-
-* A NT_ARM_SVE note will be added to each coredump for each thread of the
-  dumped process.  The contents will be equivalent to the data that would have
-  been read if a PTRACE_GETREGSET of NT_ARM_SVE were executed for each thread
-  when the coredump was generated.
-
-
-9.  System runtime configuration
---------------------------------
-
-* To mitigate the ABI impact of expansion of the signal frame, a policy
-  mechanism is provided for administrators, distro maintainers and developers
-  to set the default vector length for userspace processes:
-
-/proc/sys/abi/sve_default_vector_length
-
-    Writing the text representation of an integer to this file sets the system
-    default vector length to the specified value, unless the value is greater
-    than the maximum vector length supported by the system in which case the
-    default vector length is set to that maximum.
-
-    The result can be determined by reopening the file and reading its
-    contents.
-
-    At boot, the default vector length is initially set to 64 or the maximum
-    supported vector length, whichever is smaller.  This determines the initial
-    vector length of the init process (PID 1).
-
-    Reading this file returns the current system default vector length.
-
-* At every execve() call, the new vector length of the new process is set to
-  the system default vector length, unless
-
-    * PR_SVE_SET_VL_INHERIT (or equivalently SVE_PT_VL_INHERIT) is set for the
-      calling thread, or
-
-    * a deferred vector length change is pending, established via the
-      PR_SVE_SET_VL_ONEXEC flag (or SVE_PT_VL_ONEXEC).
-
-* Modifying the system default vector length does not affect the vector length
-  of any existing process or thread that does not make an execve() call.
-
-
-Appendix A.  SVE programmer's model (informative)
-=================================================
-
-This section provides a minimal description of the additions made by SVE to the
-ARMv8-A programmer's model that are relevant to this document.
-
-Note: This section is for information only and not intended to be complete or
-to replace any architectural specification.
-
-A.1.  Registers
----------------
-
-In A64 state, SVE adds the following:
-
-* 32 8VL-bit vector registers Z0..Z31
-  For each Zn, Zn bits [127:0] alias the ARMv8-A vector register Vn.
-
-  A register write using a Vn register name zeros all bits of the corresponding
-  Zn except for bits [127:0].
-
-* 16 VL-bit predicate registers P0..P15
-
-* 1 VL-bit special-purpose predicate register FFR (the "first-fault register")
-
-* a VL "pseudo-register" that determines the size of each vector register
-
-  The SVE instruction set architecture provides no way to write VL directly.
-  Instead, it can be modified only by EL1 and above, by writing appropriate
-  system registers.
-
-* The value of VL can be configured at runtime by EL1 and above:
-  16 <= VL <= VLmax, where VL must be a multiple of 16.
-
-* The maximum vector length is determined by the hardware:
-  16 <= VLmax <= 256.
-
-  (The SVE architecture specifies 256, but permits future architecture
-  revisions to raise this limit.)
-
-* FPSR and FPCR are retained from ARMv8-A, and interact with SVE floating-point
-  operations in a similar way to the way in which they interact with ARMv8
-  floating-point operations.
-
-         8VL-1                       128               0  bit index
-        +----          ////            -----------------+
-     Z0 |                               :       V0      |
-      :                                          :
-     Z7 |                               :       V7      |
-     Z8 |                               :     * V8      |
-      :                                       :  :
-    Z15 |                               :     *V15      |
-    Z16 |                               :      V16      |
-      :                                          :
-    Z31 |                               :      V31      |
-        +----          ////            -----------------+
-                                                 31    0
-         VL-1                  0                +-------+
-        +----       ////      --+          FPSR |       |
-     P0 |                       |               +-------+
-      : |                       |         *FPCR |       |
-    P15 |                       |               +-------+
-        +----       ////      --+
-    FFR |                       |               +-----+
-        +----       ////      --+            VL |     |
-                                                +-----+
-
-(*) callee-save:
-    This only applies to bits [63:0] of Z-/V-registers.
-    FPCR contains callee-save and caller-save bits.  See [4] for details.
-
-
-A.2.  Procedure call standard
------------------------------
-
-The ARMv8-A base procedure call standard is extended as follows with respect to
-the additional SVE register state:
-
-* All SVE register bits that are not shared with FP/SIMD are caller-save.
-
-* Z8 bits [63:0] .. Z15 bits [63:0] are callee-save.
-
-  This follows from the way these bits are mapped to V8..V15, which are caller-
-  save in the base procedure call standard.
-
-
-Appendix B.  ARMv8-A FP/SIMD programmer's model
-===============================================
-
-Note: This section is for information only and not intended to be complete or
-to replace any architectural specification.
-
-Refer to [4] for for more information.
-
-ARMv8-A defines the following floating-point / SIMD register state:
-
-* 32 128-bit vector registers V0..V31
-* 2 32-bit status/control registers FPSR, FPCR
-
-         127           0  bit index
-        +---------------+
-     V0 |               |
-      : :               :
-     V7 |               |
-   * V8 |               |
-   :  : :               :
-   *V15 |               |
-    V16 |               |
-      : :               :
-    V31 |               |
-        +---------------+
-
-                 31    0
-                +-------+
-           FPSR |       |
-                +-------+
-          *FPCR |       |
-                +-------+
-
-(*) callee-save:
-    This only applies to bits [63:0] of V-registers.
-    FPCR contains a mixture of callee-save and caller-save bits.
-
-
-References
-==========
-
-[1] arch/arm64/include/uapi/asm/sigcontext.h
-    AArch64 Linux signal ABI definitions
-
-[2] arch/arm64/include/uapi/asm/ptrace.h
-    AArch64 Linux ptrace ABI definitions
-
-[3] Documentation/arm64/cpu-feature-registers.txt
-
-[4] ARM IHI0055C
-    http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055c/IHI0055C_beta_aapcs64.pdf
-    http://infocenter.arm.com/help/topic/com.arm.doc.subset.swdev.abi/index.html
-    Procedure Call Standard for the ARM 64-bit Architecture (AArch64)
diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
new file mode 100644
index 000000000000..2acdec3ebbeb
--- /dev/null
+++ b/Documentation/arm64/tagged-pointers.rst
@@ -0,0 +1,68 @@
+=========================================
+Tagged virtual addresses in AArch64 Linux
+=========================================
+
+Author: Will Deacon <will.deacon@arm.com>
+
+Date  : 12 June 2013
+
+This document briefly describes the provision of tagged virtual
+addresses in the AArch64 translation system and their potential uses
+in AArch64 Linux.
+
+The kernel configures the translation tables so that translations made
+via TTBR0 (i.e. userspace mappings) have the top byte (bits 63:56) of
+the virtual address ignored by the translation hardware. This frees up
+this byte for application use.
+
+
+Passing tagged addresses to the kernel
+--------------------------------------
+
+All interpretation of userspace memory addresses by the kernel assumes
+an address tag of 0x00.
+
+This includes, but is not limited to, addresses found in:
+
+ - pointer arguments to system calls, including pointers in structures
+   passed to system calls,
+
+ - the stack pointer (sp), e.g. when interpreting it to deliver a
+   signal,
+
+ - the frame pointer (x29) and frame records, e.g. when interpreting
+   them to generate a backtrace or call graph.
+
+Using non-zero address tags in any of these locations may result in an
+error code being returned, a (fatal) signal being raised, or other modes
+of failure.
+
+For these reasons, passing non-zero address tags to the kernel via
+system calls is forbidden, and using a non-zero address tag for sp is
+strongly discouraged.
+
+Programs maintaining a frame pointer and frame records that use non-zero
+address tags may suffer impaired or inaccurate debug and profiling
+visibility.
+
+
+Preserving tags
+---------------
+
+Non-zero tags are not preserved when delivering signals. This means that
+signal handlers in applications making use of tags cannot rely on the
+tag information for user virtual addresses being maintained for fields
+inside siginfo_t. One exception to this rule is for signals raised in
+response to watchpoint debug exceptions, where the tag information will
+be preserved.
+
+The architecture prevents the use of a tagged PC, so the upper byte will
+be set to a sign-extension of bit 55 on exception return.
+
+
+Other considerations
+--------------------
+
+Special care should be taken when using tagged pointers, since it is
+likely that C compilers will not hazard two virtual addresses differing
+only in the upper byte.
diff --git a/Documentation/arm64/tagged-pointers.txt b/Documentation/arm64/tagged-pointers.txt
deleted file mode 100644
index a25a99e82bb1..000000000000
--- a/Documentation/arm64/tagged-pointers.txt
+++ /dev/null
@@ -1,66 +0,0 @@
-		Tagged virtual addresses in AArch64 Linux
-		=========================================
-
-Author: Will Deacon <will.deacon@arm.com>
-Date  : 12 June 2013
-
-This document briefly describes the provision of tagged virtual
-addresses in the AArch64 translation system and their potential uses
-in AArch64 Linux.
-
-The kernel configures the translation tables so that translations made
-via TTBR0 (i.e. userspace mappings) have the top byte (bits 63:56) of
-the virtual address ignored by the translation hardware. This frees up
-this byte for application use.
-
-
-Passing tagged addresses to the kernel
---------------------------------------
-
-All interpretation of userspace memory addresses by the kernel assumes
-an address tag of 0x00.
-
-This includes, but is not limited to, addresses found in:
-
- - pointer arguments to system calls, including pointers in structures
-   passed to system calls,
-
- - the stack pointer (sp), e.g. when interpreting it to deliver a
-   signal,
-
- - the frame pointer (x29) and frame records, e.g. when interpreting
-   them to generate a backtrace or call graph.
-
-Using non-zero address tags in any of these locations may result in an
-error code being returned, a (fatal) signal being raised, or other modes
-of failure.
-
-For these reasons, passing non-zero address tags to the kernel via
-system calls is forbidden, and using a non-zero address tag for sp is
-strongly discouraged.
-
-Programs maintaining a frame pointer and frame records that use non-zero
-address tags may suffer impaired or inaccurate debug and profiling
-visibility.
-
-
-Preserving tags
----------------
-
-Non-zero tags are not preserved when delivering signals. This means that
-signal handlers in applications making use of tags cannot rely on the
-tag information for user virtual addresses being maintained for fields
-inside siginfo_t. One exception to this rule is for signals raised in
-response to watchpoint debug exceptions, where the tag information will
-be preserved.
-
-The architecture prevents the use of a tagged PC, so the upper byte will
-be set to a sign-extension of bit 55 on exception return.
-
-
-Other considerations
---------------------
-
-Special care should be taken when using tagged pointers, since it is
-likely that C compilers will not hazard two virtual addresses differing
-only in the upper byte.
diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt
index dca3fb0554db..0ab747e0d5ac 100644
--- a/Documentation/atomic_t.txt
+++ b/Documentation/atomic_t.txt
@@ -81,9 +81,11 @@ Non-RMW ops:
 
 The non-RMW ops are (typically) regular LOADs and STOREs and are canonically
 implemented using READ_ONCE(), WRITE_ONCE(), smp_load_acquire() and
-smp_store_release() respectively.
+smp_store_release() respectively. Therefore, if you find yourself only using
+the Non-RMW operations of atomic_t, you do not in fact need atomic_t at all
+and are doing it wrong.
 
-The one detail to this is that atomic_set{}() should be observable to the RMW
+A subtle detail of atomic_set{}() is that it should be observable to the RMW
 ops. That is:
 
   C atomic-set
@@ -187,13 +189,22 @@ The barriers:
 
   smp_mb__{before,after}_atomic()
 
-only apply to the RMW ops and can be used to augment/upgrade the ordering
-inherent to the used atomic op. These barriers provide a full smp_mb().
+only apply to the RMW atomic ops and can be used to augment/upgrade the
+ordering inherent to the op. These barriers act almost like a full smp_mb():
+smp_mb__before_atomic() orders all earlier accesses against the RMW op
+itself and all accesses following it, and smp_mb__after_atomic() orders all
+later accesses against the RMW op and all accesses preceding it. However,
+accesses between the smp_mb__{before,after}_atomic() and the RMW op are not
+ordered, so it is advisable to place the barrier right next to the RMW atomic
+op whenever possible.
 
 These helper barriers exist because architectures have varying implicit
 ordering on their SMP atomic primitives. For example our TSO architectures
 provide full ordered atomics and these barriers are no-ops.
 
+NOTE: when the atomic RmW ops are fully ordered, they should also imply a
+compiler barrier.
+
 Thus:
 
   atomic_fetch_add();
@@ -212,7 +223,9 @@ Further, while something like:
   atomic_dec(&X);
 
 is a 'typical' RELEASE pattern, the barrier is strictly stronger than
-a RELEASE. Similarly for something like:
+a RELEASE because it orders preceding instructions against both the read
+and write parts of the atomic_dec(), and against all following instructions
+as well. Similarly, something like:
 
   atomic_inc(&X);
   smp_mb__after_atomic();
@@ -244,7 +257,8 @@ strictly stronger than ACQUIRE. As illustrated:
 
 This should not happen; but a hypothetical atomic_inc_acquire() --
 (void)atomic_fetch_inc_acquire() for instance -- would allow the outcome,
-since then:
+because it would not order the W part of the RMW against the following
+WRITE_ONCE.  Thus:
 
   P1			P2
 
diff --git a/Documentation/auxdisplay/lcd-panel-cgram.txt b/Documentation/auxdisplay/lcd-panel-cgram.txt
deleted file mode 100644
index 7f82c905763d..000000000000
--- a/Documentation/auxdisplay/lcd-panel-cgram.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-Some LCDs allow you to define up to 8 characters, mapped to ASCII
-characters 0 to 7. The escape code to define a new character is
-'\e[LG' followed by one digit from 0 to 7, representing the character
-number, and up to 8 couples of hex digits terminated by a semi-colon
-(';'). Each couple of digits represents a line, with 1-bits for each
-illuminated pixel with LSB on the right. Lines are numbered from the
-top of the character to the bottom. On a 5x7 matrix, only the 5 lower
-bits of the 7 first bytes are used for each character. If the string
-is incomplete, only complete lines will be redefined. Here are some
-examples :
-
-  printf "\e[LG0010101050D1F0C04;"  => 0 = [enter]
-  printf "\e[LG1040E1F0000000000;"  => 1 = [up]
-  printf "\e[LG2000000001F0E0400;"  => 2 = [down]
-  printf "\e[LG3040E1F001F0E0400;"  => 3 = [up-down]
-  printf "\e[LG40002060E1E0E0602;"  => 4 = [left]
-  printf "\e[LG500080C0E0F0E0C08;"  => 5 = [right]
-  printf "\e[LG60016051516141400;"  => 6 = "IP"
-
-  printf "\e[LG00103071F1F070301;"  => big speaker
-  printf "\e[LG00002061E1E060200;"  => small speaker
-
-Willy
-
diff --git a/Documentation/backlight/lp855x-driver.txt b/Documentation/backlight/lp855x-driver.txt
deleted file mode 100644
index 01bce243d3d7..000000000000
--- a/Documentation/backlight/lp855x-driver.txt
+++ /dev/null
@@ -1,66 +0,0 @@
-Kernel driver lp855x
-====================
-
-Backlight driver for LP855x ICs
-
-Supported chips:
-	Texas Instruments LP8550, LP8551, LP8552, LP8553, LP8555, LP8556 and
-	LP8557
-
-Author: Milo(Woogyom) Kim <milo.kim@ti.com>
-
-Description
------------
-
-* Brightness control
-
-Brightness can be controlled by the pwm input or the i2c command.
-The lp855x driver supports both cases.
-
-* Device attributes
-
-1) bl_ctl_mode
-Backlight control mode.
-Value : pwm based or register based
-
-2) chip_id
-The lp855x chip id.
-Value : lp8550/lp8551/lp8552/lp8553/lp8555/lp8556/lp8557
-
-Platform data for lp855x
-------------------------
-
-For supporting platform specific data, the lp855x platform data can be used.
-
-* name : Backlight driver name. If it is not defined, default name is set.
-* device_control : Value of DEVICE CONTROL register.
-* initial_brightness : Initial value of backlight brightness.
-* period_ns : Platform specific PWM period value. unit is nano.
-	     Only valid when brightness is pwm input mode.
-* size_program : Total size of lp855x_rom_data.
-* rom_data : List of new eeprom/eprom registers.
-
-example 1) lp8552 platform data : i2c register mode with new eeprom data
-
-#define EEPROM_A5_ADDR	0xA5
-#define EEPROM_A5_VAL	0x4f	/* EN_VSYNC=0 */
-
-static struct lp855x_rom_data lp8552_eeprom_arr[] = {
-	{EEPROM_A5_ADDR, EEPROM_A5_VAL},
-};
-
-static struct lp855x_platform_data lp8552_pdata = {
-	.name = "lcd-bl",
-	.device_control = I2C_CONFIG(LP8552),
-	.initial_brightness = INITIAL_BRT,
-	.size_program = ARRAY_SIZE(lp8552_eeprom_arr),
-	.rom_data = lp8552_eeprom_arr,
-};
-
-example 2) lp8556 platform data : pwm input mode with default rom data
-
-static struct lp855x_platform_data lp8556_pdata = {
-	.device_control = PWM_CONFIG(LP8556),
-	.initial_brightness = INITIAL_BRT,
-	.period_ns = 1000000,
-};
diff --git a/Documentation/block/bfq-iosched.rst b/Documentation/block/bfq-iosched.rst
new file mode 100644
index 000000000000..0d237d402860
--- /dev/null
+++ b/Documentation/block/bfq-iosched.rst
@@ -0,0 +1,597 @@
+==========================
+BFQ (Budget Fair Queueing)
+==========================
+
+BFQ is a proportional-share I/O scheduler, with some extra
+low-latency capabilities. In addition to cgroups support (blkio or io
+controllers), BFQ's main features are:
+
+- BFQ guarantees a high system and application responsiveness, and a
+  low latency for time-sensitive applications, such as audio or video
+  players;
+- BFQ distributes bandwidth, and not just time, among processes or
+  groups (switching back to time distribution when needed to keep
+  throughput high).
+
+In its default configuration, BFQ privileges latency over
+throughput. So, when needed for achieving a lower latency, BFQ builds
+schedules that may lead to a lower throughput. If your main or only
+goal, for a given device, is to achieve the maximum-possible
+throughput at all times, then do switch off all low-latency heuristics
+for that device, by setting low_latency to 0. See Section 3 for
+details on how to configure BFQ for the desired tradeoff between
+latency and throughput, or on how to maximize throughput.
+
+As every I/O scheduler, BFQ adds some overhead to per-I/O-request
+processing. To give an idea of this overhead, the total,
+single-lock-protected, per-request processing time of BFQ---i.e., the
+sum of the execution times of the request insertion, dispatch and
+completion hooks---is, e.g., 1.9 us on an Intel Core i7-2760QM@2.40GHz
+(dated CPU for notebooks; time measured with simple code
+instrumentation, and using the throughput-sync.sh script of the S
+suite [1], in performance-profiling mode). To put this result into
+context, the total, single-lock-protected, per-request execution time
+of the lightest I/O scheduler available in blk-mq, mq-deadline, is 0.7
+us (mq-deadline is ~800 LOC, against ~10500 LOC for BFQ).
+
+Scheduling overhead further limits the maximum IOPS that a CPU can
+process (already limited by the execution of the rest of the I/O
+stack). To give an idea of the limits with BFQ, on slow or average
+CPUs, here are, first, the limits of BFQ for three different CPUs, on,
+respectively, an average laptop, an old desktop, and a cheap embedded
+system, in case full hierarchical support is enabled (i.e.,
+CONFIG_BFQ_GROUP_IOSCHED is set), but CONFIG_BFQ_CGROUP_DEBUG is not
+set (Section 4-2):
+- Intel i7-4850HQ: 400 KIOPS
+- AMD A8-3850: 250 KIOPS
+- ARM CortexTM-A53 Octa-core: 80 KIOPS
+
+If CONFIG_BFQ_CGROUP_DEBUG is set (and of course full hierarchical
+support is enabled), then the sustainable throughput with BFQ
+decreases, because all blkio.bfq* statistics are created and updated
+(Section 4-2). For BFQ, this leads to the following maximum
+sustainable throughputs, on the same systems as above:
+- Intel i7-4850HQ: 310 KIOPS
+- AMD A8-3850: 200 KIOPS
+- ARM CortexTM-A53 Octa-core: 56 KIOPS
+
+BFQ works for multi-queue devices too.
+
+.. The table of contents follow. Impatients can just jump to Section 3.
+
+.. CONTENTS
+
+   1. When may BFQ be useful?
+    1-1 Personal systems
+    1-2 Server systems
+   2. How does BFQ work?
+   3. What are BFQ's tunables and how to properly configure BFQ?
+   4. BFQ group scheduling
+    4-1 Service guarantees provided
+    4-2 Interface
+
+1. When may BFQ be useful?
+==========================
+
+BFQ provides the following benefits on personal and server systems.
+
+1-1 Personal systems
+--------------------
+
+Low latency for interactive applications
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Regardless of the actual background workload, BFQ guarantees that, for
+interactive tasks, the storage device is virtually as responsive as if
+it was idle. For example, even if one or more of the following
+background workloads are being executed:
+
+- one or more large files are being read, written or copied,
+- a tree of source files is being compiled,
+- one or more virtual machines are performing I/O,
+- a software update is in progress,
+- indexing daemons are scanning filesystems and updating their
+  databases,
+
+starting an application or loading a file from within an application
+takes about the same time as if the storage device was idle. As a
+comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
+applications experience high latencies, or even become unresponsive
+until the background workload terminates (also on SSDs).
+
+Low latency for soft real-time applications
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Also soft real-time applications, such as audio and video
+players/streamers, enjoy a low latency and a low drop rate, regardless
+of the background I/O workload. As a consequence, these applications
+do not suffer from almost any glitch due to the background workload.
+
+Higher speed for code-development tasks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If some additional workload happens to be executed in parallel, then
+BFQ executes the I/O-related components of typical code-development
+tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
+NOOP or DEADLINE.
+
+High throughput
+^^^^^^^^^^^^^^^
+
+On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
+up to 150% higher throughput than DEADLINE and NOOP, with all the
+sequential workloads considered in our tests. With random workloads,
+and with all the workloads on flash-based devices, BFQ achieves,
+instead, about the same throughput as the other schedulers.
+
+Strong fairness, bandwidth and delay guarantees
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+BFQ distributes the device throughput, and not just the device time,
+among I/O-bound applications in proportion their weights, with any
+workload and regardless of the device parameters. From these bandwidth
+guarantees, it is possible to compute tight per-I/O-request delay
+guarantees by a simple formula. If not configured for strict service
+guarantees, BFQ switches to time-based resource sharing (only) for
+applications that would otherwise cause a throughput loss.
+
+1-2 Server systems
+------------------
+
+Most benefits for server systems follow from the same service
+properties as above. In particular, regardless of whether additional,
+possibly heavy workloads are being served, BFQ guarantees:
+
+* audio and video-streaming with zero or very low jitter and drop
+  rate;
+
+* fast retrieval of WEB pages and embedded objects;
+
+* real-time recording of data in live-dumping applications (e.g.,
+  packet logging);
+
+* responsiveness in local and remote access to a server.
+
+
+2. How does BFQ work?
+=====================
+
+BFQ is a proportional-share I/O scheduler, whose general structure,
+plus a lot of code, are borrowed from CFQ.
+
+- Each process doing I/O on a device is associated with a weight and a
+  `(bfq_)queue`.
+
+- BFQ grants exclusive access to the device, for a while, to one queue
+  (process) at a time, and implements this service model by
+  associating every queue with a budget, measured in number of
+  sectors.
+
+  - After a queue is granted access to the device, the budget of the
+    queue is decremented, on each request dispatch, by the size of the
+    request.
+
+  - The in-service queue is expired, i.e., its service is suspended,
+    only if one of the following events occurs: 1) the queue finishes
+    its budget, 2) the queue empties, 3) a "budget timeout" fires.
+
+    - The budget timeout prevents processes doing random I/O from
+      holding the device for too long and dramatically reducing
+      throughput.
+
+    - Actually, as in CFQ, a queue associated with a process issuing
+      sync requests may not be expired immediately when it empties. In
+      contrast, BFQ may idle the device for a short time interval,
+      giving the process the chance to go on being served if it issues
+      a new request in time. Device idling typically boosts the
+      throughput on rotational devices and on non-queueing flash-based
+      devices, if processes do synchronous and sequential I/O. In
+      addition, under BFQ, device idling is also instrumental in
+      guaranteeing the desired throughput fraction to processes
+      issuing sync requests (see the description of the slice_idle
+      tunable in this document, or [1, 2], for more details).
+
+      - With respect to idling for service guarantees, if several
+	processes are competing for the device at the same time, but
+	all processes and groups have the same weight, then BFQ
+	guarantees the expected throughput distribution without ever
+	idling the device. Throughput is thus as high as possible in
+	this common scenario.
+
+     - On flash-based storage with internal queueing of commands
+       (typically NCQ), device idling happens to be always detrimental
+       for throughput. So, with these devices, BFQ performs idling
+       only when strictly needed for service guarantees, i.e., for
+       guaranteeing low latency or fairness. In these cases, overall
+       throughput may be sub-optimal. No solution currently exists to
+       provide both strong service guarantees and optimal throughput
+       on devices with internal queueing.
+
+  - If low-latency mode is enabled (default configuration), BFQ
+    executes some special heuristics to detect interactive and soft
+    real-time applications (e.g., video or audio players/streamers),
+    and to reduce their latency. The most important action taken to
+    achieve this goal is to give to the queues associated with these
+    applications more than their fair share of the device
+    throughput. For brevity, we call just "weight-raising" the whole
+    sets of actions taken by BFQ to privilege these queues. In
+    particular, BFQ provides a milder form of weight-raising for
+    interactive applications, and a stronger form for soft real-time
+    applications.
+
+  - BFQ automatically deactivates idling for queues born in a burst of
+    queue creations. In fact, these queues are usually associated with
+    the processes of applications and services that benefit mostly
+    from a high throughput. Examples are systemd during boot, or git
+    grep.
+
+  - As CFQ, BFQ merges queues performing interleaved I/O, i.e.,
+    performing random I/O that becomes mostly sequential if
+    merged. Differently from CFQ, BFQ achieves this goal with a more
+    reactive mechanism, called Early Queue Merge (EQM). EQM is so
+    responsive in detecting interleaved I/O (cooperating processes),
+    that it enables BFQ to achieve a high throughput, by queue
+    merging, even for queues for which CFQ needs a different
+    mechanism, preemption, to get a high throughput. As such EQM is a
+    unified mechanism to achieve a high throughput with interleaved
+    I/O.
+
+  - Queues are scheduled according to a variant of WF2Q+, named
+    B-WF2Q+, and implemented using an augmented rb-tree to preserve an
+    O(log N) overall complexity.  See [2] for more details. B-WF2Q+ is
+    also ready for hierarchical scheduling, details in Section 4.
+
+  - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
+    perfectly fair, and smooth service. In particular, B-WF2Q+
+    guarantees that each queue receives a fraction of the device
+    throughput proportional to its weight, even if the throughput
+    fluctuates, and regardless of: the device parameters, the current
+    workload and the budgets assigned to the queue.
+
+  - The last, budget-independence, property (although probably
+    counterintuitive in the first place) is definitely beneficial, for
+    the following reasons:
+
+    - First, with any proportional-share scheduler, the maximum
+      deviation with respect to an ideal service is proportional to
+      the maximum budget (slice) assigned to queues. As a consequence,
+      BFQ can keep this deviation tight not only because of the
+      accurate service of B-WF2Q+, but also because BFQ *does not*
+      need to assign a larger budget to a queue to let the queue
+      receive a higher fraction of the device throughput.
+
+    - Second, BFQ is free to choose, for every process (queue), the
+      budget that best fits the needs of the process, or best
+      leverages the I/O pattern of the process. In particular, BFQ
+      updates queue budgets with a simple feedback-loop algorithm that
+      allows a high throughput to be achieved, while still providing
+      tight latency guarantees to time-sensitive applications. When
+      the in-service queue expires, this algorithm computes the next
+      budget of the queue so as to:
+
+      - Let large budgets be eventually assigned to the queues
+	associated with I/O-bound applications performing sequential
+	I/O: in fact, the longer these applications are served once
+	got access to the device, the higher the throughput is.
+
+      - Let small budgets be eventually assigned to the queues
+	associated with time-sensitive applications (which typically
+	perform sporadic and short I/O), because, the smaller the
+	budget assigned to a queue waiting for service is, the sooner
+	B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
+
+- If several processes are competing for the device at the same time,
+  but all processes and groups have the same weight, then BFQ
+  guarantees the expected throughput distribution without ever idling
+  the device. It uses preemption instead. Throughput is then much
+  higher in this common scenario.
+
+- ioprio classes are served in strict priority order, i.e.,
+  lower-priority queues are not served as long as there are
+  higher-priority queues.  Among queues in the same class, the
+  bandwidth is distributed in proportion to the weight of each
+  queue. A very thin extra bandwidth is however guaranteed to
+  the Idle class, to prevent it from starving.
+
+
+3. What are BFQ's tunables and how to properly configure BFQ?
+=============================================================
+
+Most BFQ tunables affect service guarantees (basically latency and
+fairness) and throughput. For full details on how to choose the
+desired tradeoff between service guarantees and throughput, see the
+parameters slice_idle, strict_guarantees and low_latency. For details
+on how to maximise throughput, see slice_idle, timeout_sync and
+max_budget. The other performance-related parameters have been
+inherited from, and have been preserved mostly for compatibility with
+CFQ. So far, no performance improvement has been reported after
+changing the latter parameters in BFQ.
+
+In particular, the tunables back_seek-max, back_seek_penalty,
+fifo_expire_async and fifo_expire_sync below are the same as in
+CFQ. Their description is just copied from that for CFQ. Some
+considerations in the description of slice_idle are copied from CFQ
+too.
+
+per-process ioprio and weight
+-----------------------------
+
+Unless the cgroups interface is used (see "4. BFQ group scheduling"),
+weights can be assigned to processes only indirectly, through I/O
+priorities, and according to the relation:
+weight = (IOPRIO_BE_NR - ioprio) * 10.
+
+Beware that, if low-latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+slice_idle
+----------
+
+This parameter specifies how long BFQ should idle for next I/O
+request, when certain sync BFQ queues become empty. By default
+slice_idle is a non-zero value. Idling has a double purpose: boosting
+throughput and making sure that the desired throughput distribution is
+respected (see the description of how BFQ works, and, if needed, the
+papers referred there).
+
+As for throughput, idling can be very helpful on highly seeky media
+like single spindle SATA/SAS disks where we can cut down on overall
+number of seeks and see improved throughput.
+
+Setting slice_idle to 0 will remove all the idling on queues and one
+should see an overall improved throughput on faster storage devices
+like multiple SATA/SAS disks in hardware RAID configuration, as well
+as flash-based storage with internal command queueing (and
+parallelism).
+
+So depending on storage and workload, it might be useful to set
+slice_idle=0.  In general for SATA/SAS disks and software RAID of
+SATA/SAS disks keeping slice_idle enabled should be useful. For any
+configurations where there are multiple spindles behind single LUN
+(Host based hardware RAID controller or for storage arrays), or with
+flash-based fast storage, setting slice_idle=0 might end up in better
+throughput and acceptable latencies.
+
+Idling is however necessary to have service guarantees enforced in
+case of differentiated weights or differentiated I/O-request lengths.
+To see why, suppose that a given BFQ queue A must get several I/O
+requests served for each request served for another queue B. Idling
+ensures that, if A makes a new I/O request slightly after becoming
+empty, then no request of B is dispatched in the middle, and thus A
+does not lose the possibility to get more than one request dispatched
+before the next request of B is dispatched. Note that idling
+guarantees the desired differentiated treatment of queues only in
+terms of I/O-request dispatches. To guarantee that the actual service
+order then corresponds to the dispatch order, the strict_guarantees
+tunable must be set too.
+
+There is an important flipside for idling: apart from the above cases
+where it is beneficial also for throughput, idling can severely impact
+throughput. One important case is random workload. Because of this
+issue, BFQ tends to avoid idling as much as possible, when it is not
+beneficial also for throughput (as detailed in Section 2). As a
+consequence of this behavior, and of further issues described for the
+strict_guarantees tunable, short-term service guarantees may be
+occasionally violated. And, in some cases, these guarantees may be
+more important than guaranteeing maximum throughput. For example, in
+video playing/streaming, a very low drop rate may be more important
+than maximum throughput. In these cases, consider setting the
+strict_guarantees parameter.
+
+slice_idle_us
+-------------
+
+Controls the same tuning parameter as slice_idle, but in microseconds.
+Either tunable can be used to set idling behavior.  Afterwards, the
+other tunable will reflect the newly set value in sysfs.
+
+strict_guarantees
+-----------------
+
+If this parameter is set (default: unset), then BFQ
+
+- always performs idling when the in-service queue becomes empty;
+
+- forces the device to serve one I/O request at a time, by dispatching a
+  new request only if there is no outstanding request.
+
+In the presence of differentiated weights or I/O-request sizes, both
+the above conditions are needed to guarantee that every BFQ queue
+receives its allotted share of the bandwidth. The first condition is
+needed for the reasons explained in the description of the slice_idle
+tunable.  The second condition is needed because all modern storage
+devices reorder internally-queued requests, which may trivially break
+the service guarantees enforced by the I/O scheduler.
+
+Setting strict_guarantees may evidently affect throughput.
+
+back_seek_max
+-------------
+
+This specifies, given in Kbytes, the maximum "distance" for backward seeking.
+The distance is the amount of space from the current head location to the
+sectors that are backward in terms of distance.
+
+This parameter allows the scheduler to anticipate requests in the "backward"
+direction and consider them as being the "next" if they are within this
+distance from the current head location.
+
+back_seek_penalty
+-----------------
+
+This parameter is used to compute the cost of backward seeking. If the
+backward distance of request is just 1/back_seek_penalty from a "front"
+request, then the seeking cost of two requests is considered equivalent.
+
+So scheduler will not bias toward one or the other request (otherwise scheduler
+will bias toward front request). Default value of back_seek_penalty is 2.
+
+fifo_expire_async
+-----------------
+
+This parameter is used to set the timeout of asynchronous requests. Default
+value of this is 248ms.
+
+fifo_expire_sync
+----------------
+
+This parameter is used to set the timeout of synchronous requests. Default
+value of this is 124ms. In case to favor synchronous requests over asynchronous
+one, this value should be decreased relative to fifo_expire_async.
+
+low_latency
+-----------
+
+This parameter is used to enable/disable BFQ's low latency mode. By
+default, low latency mode is enabled. If enabled, interactive and soft
+real-time applications are privileged and experience a lower latency,
+as explained in more detail in the description of how BFQ works.
+
+DISABLE this mode if you need full control on bandwidth
+distribution. In fact, if it is enabled, then BFQ automatically
+increases the bandwidth share of privileged applications, as the main
+means to guarantee a lower latency to them.
+
+In addition, as already highlighted at the beginning of this document,
+DISABLE this mode if your only goal is to achieve a high throughput.
+In fact, privileging the I/O of some application over the rest may
+entail a lower throughput. To achieve the highest-possible throughput
+on a non-rotational device, setting slice_idle to 0 may be needed too
+(at the cost of giving up any strong guarantee on fairness and low
+latency).
+
+timeout_sync
+------------
+
+Maximum amount of device time that can be given to a task (queue) once
+it has been selected for service. On devices with costly seeks,
+increasing this time usually increases maximum throughput. On the
+opposite end, increasing this time coarsens the granularity of the
+short-term bandwidth and latency guarantees, especially if the
+following parameter is set to zero.
+
+max_budget
+----------
+
+Maximum amount of service, measured in sectors, that can be provided
+to a BFQ queue once it is set in service (of course within the limits
+of the above timeout). According to what said in the description of
+the algorithm, larger values increase the throughput in proportion to
+the percentage of sequential I/O requests issued. The price of larger
+values is that they coarsen the granularity of short-term bandwidth
+and latency guarantees.
+
+The default value is 0, which enables auto-tuning: BFQ sets max_budget
+to the maximum number of sectors that can be served during
+timeout_sync, according to the estimated peak rate.
+
+For specific devices, some users have occasionally reported to have
+reached a higher throughput by setting max_budget explicitly, i.e., by
+setting max_budget to a higher value than 0. In particular, they have
+set max_budget to higher values than those to which BFQ would have set
+it with auto-tuning. An alternative way to achieve this goal is to
+just increase the value of timeout_sync, leaving max_budget equal to 0.
+
+weights
+-------
+
+Read-only parameter, used to show the weights of the currently active
+BFQ queues.
+
+
+4. Group scheduling with BFQ
+============================
+
+BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
+blkio and io. In particular, BFQ supports weight-based proportional
+share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
+
+4-1 Service guarantees provided
+-------------------------------
+
+With BFQ, proportional share means true proportional share of the
+device bandwidth, according to group weights. For example, a group
+with weight 200 gets twice the bandwidth, and not just twice the time,
+of a group with weight 100.
+
+BFQ supports hierarchies (group trees) of any depth. Bandwidth is
+distributed among groups and processes in the expected way: for each
+group, the children of the group share the whole bandwidth of the
+group in proportion to their weights. In particular, this implies
+that, for each leaf group, every process of the group receives the
+same share of the whole group bandwidth, unless the ioprio of the
+process is modified.
+
+The resource-sharing guarantee for a group may partially or totally
+switch from bandwidth to time, if providing bandwidth guarantees to
+the group lowers the throughput too much. This switch occurs on a
+per-process basis: if a process of a leaf group causes throughput loss
+if served in such a way to receive its share of the bandwidth, then
+BFQ switches back to just time-based proportional share for that
+process.
+
+4-2 Interface
+-------------
+
+To get proportional sharing of bandwidth with BFQ for a given device,
+BFQ must of course be the active scheduler for that device.
+
+Within each group directory, the names of the files associated with
+BFQ-specific cgroup parameters and stats begin with the "bfq."
+prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
+BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
+parameter to set the weight of a group with BFQ is blkio.bfq.weight
+or io.bfq.weight.
+
+As for cgroups-v1 (blkio controller), the exact set of stat files
+created, and kept up-to-date by bfq, depends on whether
+CONFIG_BFQ_CGROUP_DEBUG is set. If it is set, then bfq creates all
+the stat files documented in
+Documentation/admin-guide/cgroup-v1/blkio-controller.rst. If, instead,
+CONFIG_BFQ_CGROUP_DEBUG is not set, then bfq creates only the files::
+
+  blkio.bfq.io_service_bytes
+  blkio.bfq.io_service_bytes_recursive
+  blkio.bfq.io_serviced
+  blkio.bfq.io_serviced_recursive
+
+The value of CONFIG_BFQ_CGROUP_DEBUG greatly influences the maximum
+throughput sustainable with bfq, because updating the blkio.bfq.*
+stats is rather costly, especially for some of the stats enabled by
+CONFIG_BFQ_CGROUP_DEBUG.
+
+Parameters to set
+-----------------
+
+For each group, there is only the following parameter to set.
+
+weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
+group inside its parent. Available values: 1..10000 (default 100). The
+linear mapping between ioprio and weights, described at the beginning
+of the tunable section, is still valid, but all weights higher than
+IOPRIO_BE_NR*10 are mapped to ioprio 0.
+
+Recall that, if low-latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+
+[1]
+    P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+    Scheduler", Proceedings of the First Workshop on Mobile System
+    Technologies (MST-2015), May 2015.
+
+    http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+
+[2]
+    P. Valente and M. Andreolini, "Improving Application
+    Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
+    the 5th Annual International Systems and Storage Conference
+    (SYSTOR '12), June 2012.
+
+    Slightly extended version:
+
+    http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-results.pdf
+
+[3]
+   https://github.com/Algodev-github/S
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
deleted file mode 100644
index 1a0f2ac02eb6..000000000000
--- a/Documentation/block/bfq-iosched.txt
+++ /dev/null
@@ -1,583 +0,0 @@
-BFQ (Budget Fair Queueing)
-==========================
-
-BFQ is a proportional-share I/O scheduler, with some extra
-low-latency capabilities. In addition to cgroups support (blkio or io
-controllers), BFQ's main features are:
-- BFQ guarantees a high system and application responsiveness, and a
-  low latency for time-sensitive applications, such as audio or video
-  players;
-- BFQ distributes bandwidth, and not just time, among processes or
-  groups (switching back to time distribution when needed to keep
-  throughput high).
-
-In its default configuration, BFQ privileges latency over
-throughput. So, when needed for achieving a lower latency, BFQ builds
-schedules that may lead to a lower throughput. If your main or only
-goal, for a given device, is to achieve the maximum-possible
-throughput at all times, then do switch off all low-latency heuristics
-for that device, by setting low_latency to 0. See Section 3 for
-details on how to configure BFQ for the desired tradeoff between
-latency and throughput, or on how to maximize throughput.
-
-As every I/O scheduler, BFQ adds some overhead to per-I/O-request
-processing. To give an idea of this overhead, the total,
-single-lock-protected, per-request processing time of BFQ---i.e., the
-sum of the execution times of the request insertion, dispatch and
-completion hooks---is, e.g., 1.9 us on an Intel Core i7-2760QM@2.40GHz
-(dated CPU for notebooks; time measured with simple code
-instrumentation, and using the throughput-sync.sh script of the S
-suite [1], in performance-profiling mode). To put this result into
-context, the total, single-lock-protected, per-request execution time
-of the lightest I/O scheduler available in blk-mq, mq-deadline, is 0.7
-us (mq-deadline is ~800 LOC, against ~10500 LOC for BFQ).
-
-Scheduling overhead further limits the maximum IOPS that a CPU can
-process (already limited by the execution of the rest of the I/O
-stack). To give an idea of the limits with BFQ, on slow or average
-CPUs, here are, first, the limits of BFQ for three different CPUs, on,
-respectively, an average laptop, an old desktop, and a cheap embedded
-system, in case full hierarchical support is enabled (i.e.,
-CONFIG_BFQ_GROUP_IOSCHED is set), but CONFIG_DEBUG_BLK_CGROUP is not
-set (Section 4-2):
-- Intel i7-4850HQ: 400 KIOPS
-- AMD A8-3850: 250 KIOPS
-- ARM CortexTM-A53 Octa-core: 80 KIOPS
-
-If CONFIG_DEBUG_BLK_CGROUP is set (and of course full hierarchical
-support is enabled), then the sustainable throughput with BFQ
-decreases, because all blkio.bfq* statistics are created and updated
-(Section 4-2). For BFQ, this leads to the following maximum
-sustainable throughputs, on the same systems as above:
-- Intel i7-4850HQ: 310 KIOPS
-- AMD A8-3850: 200 KIOPS
-- ARM CortexTM-A53 Octa-core: 56 KIOPS
-
-BFQ works for multi-queue devices too.
-
-The table of contents follow. Impatients can just jump to Section 3.
-
-CONTENTS
-
-1. When may BFQ be useful?
- 1-1 Personal systems
- 1-2 Server systems
-2. How does BFQ work?
-3. What are BFQ's tunables and how to properly configure BFQ?
-4. BFQ group scheduling
- 4-1 Service guarantees provided
- 4-2 Interface
-
-1. When may BFQ be useful?
-==========================
-
-BFQ provides the following benefits on personal and server systems.
-
-1-1 Personal systems
---------------------
-
-Low latency for interactive applications
-
-Regardless of the actual background workload, BFQ guarantees that, for
-interactive tasks, the storage device is virtually as responsive as if
-it was idle. For example, even if one or more of the following
-background workloads are being executed:
-- one or more large files are being read, written or copied,
-- a tree of source files is being compiled,
-- one or more virtual machines are performing I/O,
-- a software update is in progress,
-- indexing daemons are scanning filesystems and updating their
-  databases,
-starting an application or loading a file from within an application
-takes about the same time as if the storage device was idle. As a
-comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
-applications experience high latencies, or even become unresponsive
-until the background workload terminates (also on SSDs).
-
-Low latency for soft real-time applications
-
-Also soft real-time applications, such as audio and video
-players/streamers, enjoy a low latency and a low drop rate, regardless
-of the background I/O workload. As a consequence, these applications
-do not suffer from almost any glitch due to the background workload.
-
-Higher speed for code-development tasks
-
-If some additional workload happens to be executed in parallel, then
-BFQ executes the I/O-related components of typical code-development
-tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
-NOOP or DEADLINE.
-
-High throughput
-
-On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
-up to 150% higher throughput than DEADLINE and NOOP, with all the
-sequential workloads considered in our tests. With random workloads,
-and with all the workloads on flash-based devices, BFQ achieves,
-instead, about the same throughput as the other schedulers.
-
-Strong fairness, bandwidth and delay guarantees
-
-BFQ distributes the device throughput, and not just the device time,
-among I/O-bound applications in proportion their weights, with any
-workload and regardless of the device parameters. From these bandwidth
-guarantees, it is possible to compute tight per-I/O-request delay
-guarantees by a simple formula. If not configured for strict service
-guarantees, BFQ switches to time-based resource sharing (only) for
-applications that would otherwise cause a throughput loss.
-
-1-2 Server systems
-------------------
-
-Most benefits for server systems follow from the same service
-properties as above. In particular, regardless of whether additional,
-possibly heavy workloads are being served, BFQ guarantees:
-
-. audio and video-streaming with zero or very low jitter and drop
-  rate;
-
-. fast retrieval of WEB pages and embedded objects;
-
-. real-time recording of data in live-dumping applications (e.g.,
-  packet logging);
-
-. responsiveness in local and remote access to a server.
-
-
-2. How does BFQ work?
-=====================
-
-BFQ is a proportional-share I/O scheduler, whose general structure,
-plus a lot of code, are borrowed from CFQ.
-
-- Each process doing I/O on a device is associated with a weight and a
-  (bfq_)queue.
-
-- BFQ grants exclusive access to the device, for a while, to one queue
-  (process) at a time, and implements this service model by
-  associating every queue with a budget, measured in number of
-  sectors.
-
-  - After a queue is granted access to the device, the budget of the
-    queue is decremented, on each request dispatch, by the size of the
-    request.
-
-  - The in-service queue is expired, i.e., its service is suspended,
-    only if one of the following events occurs: 1) the queue finishes
-    its budget, 2) the queue empties, 3) a "budget timeout" fires.
-
-    - The budget timeout prevents processes doing random I/O from
-      holding the device for too long and dramatically reducing
-      throughput.
-
-    - Actually, as in CFQ, a queue associated with a process issuing
-      sync requests may not be expired immediately when it empties. In
-      contrast, BFQ may idle the device for a short time interval,
-      giving the process the chance to go on being served if it issues
-      a new request in time. Device idling typically boosts the
-      throughput on rotational devices and on non-queueing flash-based
-      devices, if processes do synchronous and sequential I/O. In
-      addition, under BFQ, device idling is also instrumental in
-      guaranteeing the desired throughput fraction to processes
-      issuing sync requests (see the description of the slice_idle
-      tunable in this document, or [1, 2], for more details).
-
-      - With respect to idling for service guarantees, if several
-	processes are competing for the device at the same time, but
-	all processes and groups have the same weight, then BFQ
-	guarantees the expected throughput distribution without ever
-	idling the device. Throughput is thus as high as possible in
-	this common scenario.
-
-     - On flash-based storage with internal queueing of commands
-       (typically NCQ), device idling happens to be always detrimental
-       for throughput. So, with these devices, BFQ performs idling
-       only when strictly needed for service guarantees, i.e., for
-       guaranteeing low latency or fairness. In these cases, overall
-       throughput may be sub-optimal. No solution currently exists to
-       provide both strong service guarantees and optimal throughput
-       on devices with internal queueing.
-
-  - If low-latency mode is enabled (default configuration), BFQ
-    executes some special heuristics to detect interactive and soft
-    real-time applications (e.g., video or audio players/streamers),
-    and to reduce their latency. The most important action taken to
-    achieve this goal is to give to the queues associated with these
-    applications more than their fair share of the device
-    throughput. For brevity, we call just "weight-raising" the whole
-    sets of actions taken by BFQ to privilege these queues. In
-    particular, BFQ provides a milder form of weight-raising for
-    interactive applications, and a stronger form for soft real-time
-    applications.
-
-  - BFQ automatically deactivates idling for queues born in a burst of
-    queue creations. In fact, these queues are usually associated with
-    the processes of applications and services that benefit mostly
-    from a high throughput. Examples are systemd during boot, or git
-    grep.
-
-  - As CFQ, BFQ merges queues performing interleaved I/O, i.e.,
-    performing random I/O that becomes mostly sequential if
-    merged. Differently from CFQ, BFQ achieves this goal with a more
-    reactive mechanism, called Early Queue Merge (EQM). EQM is so
-    responsive in detecting interleaved I/O (cooperating processes),
-    that it enables BFQ to achieve a high throughput, by queue
-    merging, even for queues for which CFQ needs a different
-    mechanism, preemption, to get a high throughput. As such EQM is a
-    unified mechanism to achieve a high throughput with interleaved
-    I/O.
-
-  - Queues are scheduled according to a variant of WF2Q+, named
-    B-WF2Q+, and implemented using an augmented rb-tree to preserve an
-    O(log N) overall complexity.  See [2] for more details. B-WF2Q+ is
-    also ready for hierarchical scheduling, details in Section 4.
-
-  - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
-    perfectly fair, and smooth service. In particular, B-WF2Q+
-    guarantees that each queue receives a fraction of the device
-    throughput proportional to its weight, even if the throughput
-    fluctuates, and regardless of: the device parameters, the current
-    workload and the budgets assigned to the queue.
-
-  - The last, budget-independence, property (although probably
-    counterintuitive in the first place) is definitely beneficial, for
-    the following reasons:
-
-    - First, with any proportional-share scheduler, the maximum
-      deviation with respect to an ideal service is proportional to
-      the maximum budget (slice) assigned to queues. As a consequence,
-      BFQ can keep this deviation tight not only because of the
-      accurate service of B-WF2Q+, but also because BFQ *does not*
-      need to assign a larger budget to a queue to let the queue
-      receive a higher fraction of the device throughput.
-
-    - Second, BFQ is free to choose, for every process (queue), the
-      budget that best fits the needs of the process, or best
-      leverages the I/O pattern of the process. In particular, BFQ
-      updates queue budgets with a simple feedback-loop algorithm that
-      allows a high throughput to be achieved, while still providing
-      tight latency guarantees to time-sensitive applications. When
-      the in-service queue expires, this algorithm computes the next
-      budget of the queue so as to:
-
-      - Let large budgets be eventually assigned to the queues
-	associated with I/O-bound applications performing sequential
-	I/O: in fact, the longer these applications are served once
-	got access to the device, the higher the throughput is.
-
-      - Let small budgets be eventually assigned to the queues
-	associated with time-sensitive applications (which typically
-	perform sporadic and short I/O), because, the smaller the
-	budget assigned to a queue waiting for service is, the sooner
-	B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
-
-- If several processes are competing for the device at the same time,
-  but all processes and groups have the same weight, then BFQ
-  guarantees the expected throughput distribution without ever idling
-  the device. It uses preemption instead. Throughput is then much
-  higher in this common scenario.
-
-- ioprio classes are served in strict priority order, i.e.,
-  lower-priority queues are not served as long as there are
-  higher-priority queues.  Among queues in the same class, the
-  bandwidth is distributed in proportion to the weight of each
-  queue. A very thin extra bandwidth is however guaranteed to
-  the Idle class, to prevent it from starving.
-
-
-3. What are BFQ's tunables and how to properly configure BFQ?
-=============================================================
-
-Most BFQ tunables affect service guarantees (basically latency and
-fairness) and throughput. For full details on how to choose the
-desired tradeoff between service guarantees and throughput, see the
-parameters slice_idle, strict_guarantees and low_latency. For details
-on how to maximise throughput, see slice_idle, timeout_sync and
-max_budget. The other performance-related parameters have been
-inherited from, and have been preserved mostly for compatibility with
-CFQ. So far, no performance improvement has been reported after
-changing the latter parameters in BFQ.
-
-In particular, the tunables back_seek-max, back_seek_penalty,
-fifo_expire_async and fifo_expire_sync below are the same as in
-CFQ. Their description is just copied from that for CFQ. Some
-considerations in the description of slice_idle are copied from CFQ
-too.
-
-per-process ioprio and weight
------------------------------
-
-Unless the cgroups interface is used (see "4. BFQ group scheduling"),
-weights can be assigned to processes only indirectly, through I/O
-priorities, and according to the relation:
-weight = (IOPRIO_BE_NR - ioprio) * 10.
-
-Beware that, if low-latency is set, then BFQ automatically raises the
-weight of the queues associated with interactive and soft real-time
-applications. Unset this tunable if you need/want to control weights.
-
-slice_idle
-----------
-
-This parameter specifies how long BFQ should idle for next I/O
-request, when certain sync BFQ queues become empty. By default
-slice_idle is a non-zero value. Idling has a double purpose: boosting
-throughput and making sure that the desired throughput distribution is
-respected (see the description of how BFQ works, and, if needed, the
-papers referred there).
-
-As for throughput, idling can be very helpful on highly seeky media
-like single spindle SATA/SAS disks where we can cut down on overall
-number of seeks and see improved throughput.
-
-Setting slice_idle to 0 will remove all the idling on queues and one
-should see an overall improved throughput on faster storage devices
-like multiple SATA/SAS disks in hardware RAID configuration, as well
-as flash-based storage with internal command queueing (and
-parallelism).
-
-So depending on storage and workload, it might be useful to set
-slice_idle=0.  In general for SATA/SAS disks and software RAID of
-SATA/SAS disks keeping slice_idle enabled should be useful. For any
-configurations where there are multiple spindles behind single LUN
-(Host based hardware RAID controller or for storage arrays), or with
-flash-based fast storage, setting slice_idle=0 might end up in better
-throughput and acceptable latencies.
-
-Idling is however necessary to have service guarantees enforced in
-case of differentiated weights or differentiated I/O-request lengths.
-To see why, suppose that a given BFQ queue A must get several I/O
-requests served for each request served for another queue B. Idling
-ensures that, if A makes a new I/O request slightly after becoming
-empty, then no request of B is dispatched in the middle, and thus A
-does not lose the possibility to get more than one request dispatched
-before the next request of B is dispatched. Note that idling
-guarantees the desired differentiated treatment of queues only in
-terms of I/O-request dispatches. To guarantee that the actual service
-order then corresponds to the dispatch order, the strict_guarantees
-tunable must be set too.
-
-There is an important flipside for idling: apart from the above cases
-where it is beneficial also for throughput, idling can severely impact
-throughput. One important case is random workload. Because of this
-issue, BFQ tends to avoid idling as much as possible, when it is not
-beneficial also for throughput (as detailed in Section 2). As a
-consequence of this behavior, and of further issues described for the
-strict_guarantees tunable, short-term service guarantees may be
-occasionally violated. And, in some cases, these guarantees may be
-more important than guaranteeing maximum throughput. For example, in
-video playing/streaming, a very low drop rate may be more important
-than maximum throughput. In these cases, consider setting the
-strict_guarantees parameter.
-
-slice_idle_us
--------------
-
-Controls the same tuning parameter as slice_idle, but in microseconds.
-Either tunable can be used to set idling behavior.  Afterwards, the
-other tunable will reflect the newly set value in sysfs.
-
-strict_guarantees
------------------
-
-If this parameter is set (default: unset), then BFQ
-
-- always performs idling when the in-service queue becomes empty;
-
-- forces the device to serve one I/O request at a time, by dispatching a
-  new request only if there is no outstanding request.
-
-In the presence of differentiated weights or I/O-request sizes, both
-the above conditions are needed to guarantee that every BFQ queue
-receives its allotted share of the bandwidth. The first condition is
-needed for the reasons explained in the description of the slice_idle
-tunable.  The second condition is needed because all modern storage
-devices reorder internally-queued requests, which may trivially break
-the service guarantees enforced by the I/O scheduler.
-
-Setting strict_guarantees may evidently affect throughput.
-
-back_seek_max
--------------
-
-This specifies, given in Kbytes, the maximum "distance" for backward seeking.
-The distance is the amount of space from the current head location to the
-sectors that are backward in terms of distance.
-
-This parameter allows the scheduler to anticipate requests in the "backward"
-direction and consider them as being the "next" if they are within this
-distance from the current head location.
-
-back_seek_penalty
------------------
-
-This parameter is used to compute the cost of backward seeking. If the
-backward distance of request is just 1/back_seek_penalty from a "front"
-request, then the seeking cost of two requests is considered equivalent.
-
-So scheduler will not bias toward one or the other request (otherwise scheduler
-will bias toward front request). Default value of back_seek_penalty is 2.
-
-fifo_expire_async
------------------
-
-This parameter is used to set the timeout of asynchronous requests. Default
-value of this is 248ms.
-
-fifo_expire_sync
-----------------
-
-This parameter is used to set the timeout of synchronous requests. Default
-value of this is 124ms. In case to favor synchronous requests over asynchronous
-one, this value should be decreased relative to fifo_expire_async.
-
-low_latency
------------
-
-This parameter is used to enable/disable BFQ's low latency mode. By
-default, low latency mode is enabled. If enabled, interactive and soft
-real-time applications are privileged and experience a lower latency,
-as explained in more detail in the description of how BFQ works.
-
-DISABLE this mode if you need full control on bandwidth
-distribution. In fact, if it is enabled, then BFQ automatically
-increases the bandwidth share of privileged applications, as the main
-means to guarantee a lower latency to them.
-
-In addition, as already highlighted at the beginning of this document,
-DISABLE this mode if your only goal is to achieve a high throughput.
-In fact, privileging the I/O of some application over the rest may
-entail a lower throughput. To achieve the highest-possible throughput
-on a non-rotational device, setting slice_idle to 0 may be needed too
-(at the cost of giving up any strong guarantee on fairness and low
-latency).
-
-timeout_sync
-------------
-
-Maximum amount of device time that can be given to a task (queue) once
-it has been selected for service. On devices with costly seeks,
-increasing this time usually increases maximum throughput. On the
-opposite end, increasing this time coarsens the granularity of the
-short-term bandwidth and latency guarantees, especially if the
-following parameter is set to zero.
-
-max_budget
-----------
-
-Maximum amount of service, measured in sectors, that can be provided
-to a BFQ queue once it is set in service (of course within the limits
-of the above timeout). According to what said in the description of
-the algorithm, larger values increase the throughput in proportion to
-the percentage of sequential I/O requests issued. The price of larger
-values is that they coarsen the granularity of short-term bandwidth
-and latency guarantees.
-
-The default value is 0, which enables auto-tuning: BFQ sets max_budget
-to the maximum number of sectors that can be served during
-timeout_sync, according to the estimated peak rate.
-
-For specific devices, some users have occasionally reported to have
-reached a higher throughput by setting max_budget explicitly, i.e., by
-setting max_budget to a higher value than 0. In particular, they have
-set max_budget to higher values than those to which BFQ would have set
-it with auto-tuning. An alternative way to achieve this goal is to
-just increase the value of timeout_sync, leaving max_budget equal to 0.
-
-weights
--------
-
-Read-only parameter, used to show the weights of the currently active
-BFQ queues.
-
-
-4. Group scheduling with BFQ
-============================
-
-BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
-blkio and io. In particular, BFQ supports weight-based proportional
-share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
-
-4-1 Service guarantees provided
--------------------------------
-
-With BFQ, proportional share means true proportional share of the
-device bandwidth, according to group weights. For example, a group
-with weight 200 gets twice the bandwidth, and not just twice the time,
-of a group with weight 100.
-
-BFQ supports hierarchies (group trees) of any depth. Bandwidth is
-distributed among groups and processes in the expected way: for each
-group, the children of the group share the whole bandwidth of the
-group in proportion to their weights. In particular, this implies
-that, for each leaf group, every process of the group receives the
-same share of the whole group bandwidth, unless the ioprio of the
-process is modified.
-
-The resource-sharing guarantee for a group may partially or totally
-switch from bandwidth to time, if providing bandwidth guarantees to
-the group lowers the throughput too much. This switch occurs on a
-per-process basis: if a process of a leaf group causes throughput loss
-if served in such a way to receive its share of the bandwidth, then
-BFQ switches back to just time-based proportional share for that
-process.
-
-4-2 Interface
--------------
-
-To get proportional sharing of bandwidth with BFQ for a given device,
-BFQ must of course be the active scheduler for that device.
-
-Within each group directory, the names of the files associated with
-BFQ-specific cgroup parameters and stats begin with the "bfq."
-prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
-BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
-parameter to set the weight of a group with BFQ is blkio.bfq.weight
-or io.bfq.weight.
-
-As for cgroups-v1 (blkio controller), the exact set of stat files
-created, and kept up-to-date by bfq, depends on whether
-CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all
-the stat files documented in
-Documentation/cgroup-v1/blkio-controller.txt. If, instead,
-CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files
-blkio.bfq.io_service_bytes
-blkio.bfq.io_service_bytes_recursive
-blkio.bfq.io_serviced
-blkio.bfq.io_serviced_recursive
-
-The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum
-throughput sustainable with bfq, because updating the blkio.bfq.*
-stats is rather costly, especially for some of the stats enabled by
-CONFIG_DEBUG_BLK_CGROUP.
-
-Parameters to set
------------------
-
-For each group, there is only the following parameter to set.
-
-weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
-group inside its parent. Available values: 1..10000 (default 100). The
-linear mapping between ioprio and weights, described at the beginning
-of the tunable section, is still valid, but all weights higher than
-IOPRIO_BE_NR*10 are mapped to ioprio 0.
-
-Recall that, if low-latency is set, then BFQ automatically raises the
-weight of the queues associated with interactive and soft real-time
-applications. Unset this tunable if you need/want to control weights.
-
-
-[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
-    Scheduler", Proceedings of the First Workshop on Mobile System
-    Technologies (MST-2015), May 2015.
-    http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
-
-[2] P. Valente and M. Andreolini, "Improving Application
-    Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
-    the 5th Annual International Systems and Storage Conference
-    (SYSTOR '12), June 2012.
-    Slightly extended version:
-    http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
-							results.pdf
-
-[3] https://github.com/Algodev-github/S
diff --git a/Documentation/block/biodoc.rst b/Documentation/block/biodoc.rst
new file mode 100644
index 000000000000..b964796ec9c7
--- /dev/null
+++ b/Documentation/block/biodoc.rst
@@ -0,0 +1,1164 @@
+=====================================================
+Notes on the Generic Block Layer Rewrite in Linux 2.5
+=====================================================
+
+.. note::
+
+	It seems that there are lot of outdated stuff here. This seems
+	to be written somewhat as a task list. Yet, eventually, something
+	here might still be useful.
+
+Notes Written on Jan 15, 2002:
+
+	- Jens Axboe <jens.axboe@oracle.com>
+	- Suparna Bhattacharya <suparna@in.ibm.com>
+
+Last Updated May 2, 2002
+
+September 2003: Updated I/O Scheduler portions
+	- Nick Piggin <npiggin@kernel.dk>
+
+Introduction
+============
+
+These are some notes describing some aspects of the 2.5 block layer in the
+context of the bio rewrite. The idea is to bring out some of the key
+changes and a glimpse of the rationale behind those changes.
+
+Please mail corrections & suggestions to suparna@in.ibm.com.
+
+Credits
+=======
+
+2.5 bio rewrite:
+	- Jens Axboe <jens.axboe@oracle.com>
+
+Many aspects of the generic block layer redesign were driven by and evolved
+over discussions, prior patches and the collective experience of several
+people. See sections 8 and 9 for a list of some related references.
+
+The following people helped with review comments and inputs for this
+document:
+
+	- Christoph Hellwig <hch@infradead.org>
+	- Arjan van de Ven <arjanv@redhat.com>
+	- Randy Dunlap <rdunlap@xenotime.net>
+	- Andre Hedrick <andre@linux-ide.org>
+
+The following people helped with fixes/contributions to the bio patches
+while it was still work-in-progress:
+
+	- David S. Miller <davem@redhat.com>
+
+
+.. Description of Contents:
+
+   1. Scope for tuning of logic to various needs
+     1.1 Tuning based on device or low level driver capabilities
+	- Per-queue parameters
+	- Highmem I/O support
+	- I/O scheduler modularization
+     1.2 Tuning based on high level requirements/capabilities
+	1.2.1 Request Priority/Latency
+     1.3 Direct access/bypass to lower layers for diagnostics and special
+	 device operations
+	1.3.1 Pre-built commands
+   2. New flexible and generic but minimalist i/o structure or descriptor
+      (instead of using buffer heads at the i/o layer)
+     2.1 Requirements/Goals addressed
+     2.2 The bio struct in detail (multi-page io unit)
+     2.3 Changes in the request structure
+   3. Using bios
+     3.1 Setup/teardown (allocation, splitting)
+     3.2 Generic bio helper routines
+       3.2.1 Traversing segments and completion units in a request
+       3.2.2 Setting up DMA scatterlists
+       3.2.3 I/O completion
+       3.2.4 Implications for drivers that do not interpret bios (don't handle
+	  multiple segments)
+     3.3 I/O submission
+   4. The I/O scheduler
+   5. Scalability related changes
+     5.1 Granular locking: Removal of io_request_lock
+     5.2 Prepare for transition to 64 bit sector_t
+   6. Other Changes/Implications
+     6.1 Partition re-mapping handled by the generic block layer
+   7. A few tips on migration of older drivers
+   8. A list of prior/related/impacted patches/ideas
+   9. Other References/Discussion Threads
+
+
+Bio Notes
+=========
+
+Let us discuss the changes in the context of how some overall goals for the
+block layer are addressed.
+
+1. Scope for tuning the generic logic to satisfy various requirements
+=====================================================================
+
+The block layer design supports adaptable abstractions to handle common
+processing with the ability to tune the logic to an appropriate extent
+depending on the nature of the device and the requirements of the caller.
+One of the objectives of the rewrite was to increase the degree of tunability
+and to enable higher level code to utilize underlying device/driver
+capabilities to the maximum extent for better i/o performance. This is
+important especially in the light of ever improving hardware capabilities
+and application/middleware software designed to take advantage of these
+capabilities.
+
+1.1 Tuning based on low level device / driver capabilities
+----------------------------------------------------------
+
+Sophisticated devices with large built-in caches, intelligent i/o scheduling
+optimizations, high memory DMA support, etc may find some of the
+generic processing an overhead, while for less capable devices the
+generic functionality is essential for performance or correctness reasons.
+Knowledge of some of the capabilities or parameters of the device should be
+used at the generic block layer to take the right decisions on
+behalf of the driver.
+
+How is this achieved ?
+
+Tuning at a per-queue level:
+
+i. Per-queue limits/values exported to the generic layer by the driver
+
+Various parameters that the generic i/o scheduler logic uses are set at
+a per-queue level (e.g maximum request size, maximum number of segments in
+a scatter-gather list, logical block size)
+
+Some parameters that were earlier available as global arrays indexed by
+major/minor are now directly associated with the queue. Some of these may
+move into the block device structure in the future. Some characteristics
+have been incorporated into a queue flags field rather than separate fields
+in themselves.  There are blk_queue_xxx functions to set the parameters,
+rather than update the fields directly
+
+Some new queue property settings:
+
+	blk_queue_bounce_limit(q, u64 dma_address)
+		Enable I/O to highmem pages, dma_address being the
+		limit. No highmem default.
+
+	blk_queue_max_sectors(q, max_sectors)
+		Sets two variables that limit the size of the request.
+
+		- The request queue's max_sectors, which is a soft size in
+		  units of 512 byte sectors, and could be dynamically varied
+		  by the core kernel.
+
+		- The request queue's max_hw_sectors, which is a hard limit
+		  and reflects the maximum size request a driver can handle
+		  in units of 512 byte sectors.
+
+		The default for both max_sectors and max_hw_sectors is
+		255. The upper limit of max_sectors is 1024.
+
+	blk_queue_max_phys_segments(q, max_segments)
+		Maximum physical segments you can handle in a request. 128
+		default (driver limit). (See 3.2.2)
+
+	blk_queue_max_hw_segments(q, max_segments)
+		Maximum dma segments the hardware can handle in a request. 128
+		default (host adapter limit, after dma remapping).
+		(See 3.2.2)
+
+	blk_queue_max_segment_size(q, max_seg_size)
+		Maximum size of a clustered segment, 64kB default.
+
+	blk_queue_logical_block_size(q, logical_block_size)
+		Lowest possible sector size that the hardware can operate
+		on, 512 bytes default.
+
+New queue flags:
+
+	- QUEUE_FLAG_CLUSTER (see 3.2.2)
+	- QUEUE_FLAG_QUEUED (see 3.2.4)
+
+
+ii. High-mem i/o capabilities are now considered the default
+
+The generic bounce buffer logic, present in 2.4, where the block layer would
+by default copyin/out i/o requests on high-memory buffers to low-memory buffers
+assuming that the driver wouldn't be able to handle it directly, has been
+changed in 2.5. The bounce logic is now applied only for memory ranges
+for which the device cannot handle i/o. A driver can specify this by
+setting the queue bounce limit for the request queue for the device
+(blk_queue_bounce_limit()). This avoids the inefficiencies of the copyin/out
+where a device is capable of handling high memory i/o.
+
+In order to enable high-memory i/o where the device is capable of supporting
+it, the pci dma mapping routines and associated data structures have now been
+modified to accomplish a direct page -> bus translation, without requiring
+a virtual address mapping (unlike the earlier scheme of virtual address
+-> bus translation). So this works uniformly for high-memory pages (which
+do not have a corresponding kernel virtual address space mapping) and
+low-memory pages.
+
+Note: Please refer to Documentation/DMA-API-HOWTO.txt for a discussion
+on PCI high mem DMA aspects and mapping of scatter gather lists, and support
+for 64 bit PCI.
+
+Special handling is required only for cases where i/o needs to happen on
+pages at physical memory addresses beyond what the device can support. In these
+cases, a bounce bio representing a buffer from the supported memory range
+is used for performing the i/o with copyin/copyout as needed depending on
+the type of the operation.  For example, in case of a read operation, the
+data read has to be copied to the original buffer on i/o completion, so a
+callback routine is set up to do this, while for write, the data is copied
+from the original buffer to the bounce buffer prior to issuing the
+operation. Since an original buffer may be in a high memory area that's not
+mapped in kernel virtual addr, a kmap operation may be required for
+performing the copy, and special care may be needed in the completion path
+as it may not be in irq context. Special care is also required (by way of
+GFP flags) when allocating bounce buffers, to avoid certain highmem
+deadlock possibilities.
+
+It is also possible that a bounce buffer may be allocated from high-memory
+area that's not mapped in kernel virtual addr, but within the range that the
+device can use directly; so the bounce page may need to be kmapped during
+copy operations. [Note: This does not hold in the current implementation,
+though]
+
+There are some situations when pages from high memory may need to
+be kmapped, even if bounce buffers are not necessary. For example a device
+may need to abort DMA operations and revert to PIO for the transfer, in
+which case a virtual mapping of the page is required. For SCSI it is also
+done in some scenarios where the low level driver cannot be trusted to
+handle a single sg entry correctly. The driver is expected to perform the
+kmaps as needed on such occasions as appropriate. A driver could also use
+the blk_queue_bounce() routine on its own to bounce highmem i/o to low
+memory for specific requests if so desired.
+
+iii. The i/o scheduler algorithm itself can be replaced/set as appropriate
+
+As in 2.4, it is possible to plugin a brand new i/o scheduler for a particular
+queue or pick from (copy) existing generic schedulers and replace/override
+certain portions of it. The 2.5 rewrite provides improved modularization
+of the i/o scheduler. There are more pluggable callbacks, e.g for init,
+add request, extract request, which makes it possible to abstract specific
+i/o scheduling algorithm aspects and details outside of the generic loop.
+It also makes it possible to completely hide the implementation details of
+the i/o scheduler from block drivers.
+
+I/O scheduler wrappers are to be used instead of accessing the queue directly.
+See section 4. The I/O scheduler for details.
+
+1.2 Tuning Based on High level code capabilities
+------------------------------------------------
+
+i. Application capabilities for raw i/o
+
+This comes from some of the high-performance database/middleware
+requirements where an application prefers to make its own i/o scheduling
+decisions based on an understanding of the access patterns and i/o
+characteristics
+
+ii. High performance filesystems or other higher level kernel code's
+capabilities
+
+Kernel components like filesystems could also take their own i/o scheduling
+decisions for optimizing performance. Journalling filesystems may need
+some control over i/o ordering.
+
+What kind of support exists at the generic block layer for this ?
+
+The flags and rw fields in the bio structure can be used for some tuning
+from above e.g indicating that an i/o is just a readahead request, or priority
+settings (currently unused). As far as user applications are concerned they
+would need an additional mechanism either via open flags or ioctls, or some
+other upper level mechanism to communicate such settings to block.
+
+1.2.1 Request Priority/Latency
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Todo/Under discussion::
+
+  Arjan's proposed request priority scheme allows higher levels some broad
+  control (high/med/low) over the priority  of an i/o request vs other pending
+  requests in the queue. For example it allows reads for bringing in an
+  executable page on demand to be given a higher priority over pending write
+  requests which haven't aged too much on the queue. Potentially this priority
+  could even be exposed to applications in some manner, providing higher level
+  tunability. Time based aging avoids starvation of lower priority
+  requests. Some bits in the bi_opf flags field in the bio structure are
+  intended to be used for this priority information.
+
+
+1.3 Direct Access to Low level Device/Driver Capabilities (Bypass mode)
+-----------------------------------------------------------------------
+
+(e.g Diagnostics, Systems Management)
+
+There are situations where high-level code needs to have direct access to
+the low level device capabilities or requires the ability to issue commands
+to the device bypassing some of the intermediate i/o layers.
+These could, for example, be special control commands issued through ioctl
+interfaces, or could be raw read/write commands that stress the drive's
+capabilities for certain kinds of fitness tests. Having direct interfaces at
+multiple levels without having to pass through upper layers makes
+it possible to perform bottom up validation of the i/o path, layer by
+layer, starting from the media.
+
+The normal i/o submission interfaces, e.g submit_bio, could be bypassed
+for specially crafted requests which such ioctl or diagnostics
+interfaces would typically use, and the elevator add_request routine
+can instead be used to directly insert such requests in the queue or preferably
+the blk_do_rq routine can be used to place the request on the queue and
+wait for completion. Alternatively, sometimes the caller might just
+invoke a lower level driver specific interface with the request as a
+parameter.
+
+If the request is a means for passing on special information associated with
+the command, then such information is associated with the request->special
+field (rather than misuse the request->buffer field which is meant for the
+request data buffer's virtual mapping).
+
+For passing request data, the caller must build up a bio descriptor
+representing the concerned memory buffer if the underlying driver interprets
+bio segments or uses the block layer end*request* functions for i/o
+completion. Alternatively one could directly use the request->buffer field to
+specify the virtual address of the buffer, if the driver expects buffer
+addresses passed in this way and ignores bio entries for the request type
+involved. In the latter case, the driver would modify and manage the
+request->buffer, request->sector and request->nr_sectors or
+request->current_nr_sectors fields itself rather than using the block layer
+end_request or end_that_request_first completion interfaces.
+(See 2.3 or Documentation/block/request.rst for a brief explanation of
+the request structure fields)
+
+::
+
+  [TBD: end_that_request_last should be usable even in this case;
+  Perhaps an end_that_direct_request_first routine could be implemented to make
+  handling direct requests easier for such drivers; Also for drivers that
+  expect bios, a helper function could be provided for setting up a bio
+  corresponding to a data buffer]
+
+  <JENS: I dont understand the above, why is end_that_request_first() not
+  usable? Or _last for that matter. I must be missing something>
+
+  <SUP: What I meant here was that if the request doesn't have a bio, then
+   end_that_request_first doesn't modify nr_sectors or current_nr_sectors,
+   and hence can't be used for advancing request state settings on the
+   completion of partial transfers. The driver has to modify these fields
+   directly by hand.
+   This is because end_that_request_first only iterates over the bio list,
+   and always returns 0 if there are none associated with the request.
+   _last works OK in this case, and is not a problem, as I mentioned earlier
+  >
+
+1.3.1 Pre-built Commands
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+A request can be created with a pre-built custom command  to be sent directly
+to the device. The cmd block in the request structure has room for filling
+in the command bytes. (i.e rq->cmd is now 16 bytes in size, and meant for
+command pre-building, and the type of the request is now indicated
+through rq->flags instead of via rq->cmd)
+
+The request structure flags can be set up to indicate the type of request
+in such cases (REQ_PC: direct packet command passed to driver, REQ_BLOCK_PC:
+packet command issued via blk_do_rq, REQ_SPECIAL: special request).
+
+It can help to pre-build device commands for requests in advance.
+Drivers can now specify a request prepare function (q->prep_rq_fn) that the
+block layer would invoke to pre-build device commands for a given request,
+or perform other preparatory processing for the request. This is routine is
+called by elv_next_request(), i.e. typically just before servicing a request.
+(The prepare function would not be called for requests that have RQF_DONTPREP
+enabled)
+
+Aside:
+  Pre-building could possibly even be done early, i.e before placing the
+  request on the queue, rather than construct the command on the fly in the
+  driver while servicing the request queue when it may affect latencies in
+  interrupt context or responsiveness in general. One way to add early
+  pre-building would be to do it whenever we fail to merge on a request.
+  Now REQ_NOMERGE is set in the request flags to skip this one in the future,
+  which means that it will not change before we feed it to the device. So
+  the pre-builder hook can be invoked there.
+
+
+2. Flexible and generic but minimalist i/o structure/descriptor
+===============================================================
+
+2.1 Reason for a new structure and requirements addressed
+---------------------------------------------------------
+
+Prior to 2.5, buffer heads were used as the unit of i/o at the generic block
+layer, and the low level request structure was associated with a chain of
+buffer heads for a contiguous i/o request. This led to certain inefficiencies
+when it came to large i/o requests and readv/writev style operations, as it
+forced such requests to be broken up into small chunks before being passed
+on to the generic block layer, only to be merged by the i/o scheduler
+when the underlying device was capable of handling the i/o in one shot.
+Also, using the buffer head as an i/o structure for i/os that didn't originate
+from the buffer cache unnecessarily added to the weight of the descriptors
+which were generated for each such chunk.
+
+The following were some of the goals and expectations considered in the
+redesign of the block i/o data structure in 2.5.
+
+1.  Should be appropriate as a descriptor for both raw and buffered i/o  -
+    avoid cache related fields which are irrelevant in the direct/page i/o path,
+    or filesystem block size alignment restrictions which may not be relevant
+    for raw i/o.
+2.  Ability to represent high-memory buffers (which do not have a virtual
+    address mapping in kernel address space).
+3.  Ability to represent large i/os w/o unnecessarily breaking them up (i.e
+    greater than PAGE_SIZE chunks in one shot)
+4.  At the same time, ability to retain independent identity of i/os from
+    different sources or i/o units requiring individual completion (e.g. for
+    latency reasons)
+5.  Ability to represent an i/o involving multiple physical memory segments
+    (including non-page aligned page fragments, as specified via readv/writev)
+    without unnecessarily breaking it up, if the underlying device is capable of
+    handling it.
+6.  Preferably should be based on a memory descriptor structure that can be
+    passed around different types of subsystems or layers, maybe even
+    networking, without duplication or extra copies of data/descriptor fields
+    themselves in the process
+7.  Ability to handle the possibility of splits/merges as the structure passes
+    through layered drivers (lvm, md, evms), with minimal overhead.
+
+The solution was to define a new structure (bio)  for the block layer,
+instead of using the buffer head structure (bh) directly, the idea being
+avoidance of some associated baggage and limitations. The bio structure
+is uniformly used for all i/o at the block layer ; it forms a part of the
+bh structure for buffered i/o, and in the case of raw/direct i/o kiobufs are
+mapped to bio structures.
+
+2.2 The bio struct
+------------------
+
+The bio structure uses a vector representation pointing to an array of tuples
+of <page, offset, len> to describe the i/o buffer, and has various other
+fields describing i/o parameters and state that needs to be maintained for
+performing the i/o.
+
+Notice that this representation means that a bio has no virtual address
+mapping at all (unlike buffer heads).
+
+::
+
+  struct bio_vec {
+       struct page     *bv_page;
+       unsigned short  bv_len;
+       unsigned short  bv_offset;
+  };
+
+  /*
+   * main unit of I/O for the block layer and lower layers (ie drivers)
+   */
+  struct bio {
+       struct bio          *bi_next;    /* request queue link */
+       struct block_device *bi_bdev;	/* target device */
+       unsigned long       bi_flags;    /* status, command, etc */
+       unsigned long       bi_opf;       /* low bits: r/w, high: priority */
+
+       unsigned int	bi_vcnt;     /* how may bio_vec's */
+       struct bvec_iter	bi_iter;	/* current index into bio_vec array */
+
+       unsigned int	bi_size;     /* total size in bytes */
+       unsigned short	bi_hw_segments; /* segments after DMA remapping */
+       unsigned int	bi_max;	     /* max bio_vecs we can hold
+                                        used as index into pool */
+       struct bio_vec   *bi_io_vec;  /* the actual vec list */
+       bio_end_io_t	*bi_end_io;  /* bi_end_io (bio) */
+       atomic_t		bi_cnt;	     /* pin count: free when it hits zero */
+       void             *bi_private;
+  };
+
+With this multipage bio design:
+
+- Large i/os can be sent down in one go using a bio_vec list consisting
+  of an array of <page, offset, len> fragments (similar to the way fragments
+  are represented in the zero-copy network code)
+- Splitting of an i/o request across multiple devices (as in the case of
+  lvm or raid) is achieved by cloning the bio (where the clone points to
+  the same bi_io_vec array, but with the index and size accordingly modified)
+- A linked list of bios is used as before for unrelated merges [#]_ - this
+  avoids reallocs and makes independent completions easier to handle.
+- Code that traverses the req list can find all the segments of a bio
+  by using rq_for_each_segment.  This handles the fact that a request
+  has multiple bios, each of which can have multiple segments.
+- Drivers which can't process a large bio in one shot can use the bi_iter
+  field to keep track of the next bio_vec entry to process.
+  (e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE)
+  [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying
+  bi_offset an len fields]
+
+.. [#]
+
+	unrelated merges -- a request ends up containing two or more bios that
+	didn't originate from the same place.
+
+bi_end_io() i/o callback gets called on i/o completion of the entire bio.
+
+At a lower level, drivers build a scatter gather list from the merged bios.
+The scatter gather list is in the form of an array of <page, offset, len>
+entries with their corresponding dma address mappings filled in at the
+appropriate time. As an optimization, contiguous physical pages can be
+covered by a single entry where <page> refers to the first page and <len>
+covers the range of pages (up to 16 contiguous pages could be covered this
+way). There is a helper routine (blk_rq_map_sg) which drivers can use to build
+the sg list.
+
+Note: Right now the only user of bios with more than one page is ll_rw_kio,
+which in turn means that only raw I/O uses it (direct i/o may not work
+right now). The intent however is to enable clustering of pages etc to
+become possible. The pagebuf abstraction layer from SGI also uses multi-page
+bios, but that is currently not included in the stock development kernels.
+The same is true of Andrew Morton's work-in-progress multipage bio writeout
+and readahead patches.
+
+2.3 Changes in the Request Structure
+------------------------------------
+
+The request structure is the structure that gets passed down to low level
+drivers. The block layer make_request function builds up a request structure,
+places it on the queue and invokes the drivers request_fn. The driver makes
+use of block layer helper routine elv_next_request to pull the next request
+off the queue. Control or diagnostic functions might bypass block and directly
+invoke underlying driver entry points passing in a specially constructed
+request structure.
+
+Only some relevant fields (mainly those which changed or may be referred
+to in some of the discussion here) are listed below, not necessarily in
+the order in which they occur in the structure (see include/linux/blkdev.h)
+Refer to Documentation/block/request.rst for details about all the request
+structure fields and a quick reference about the layers which are
+supposed to use or modify those fields::
+
+  struct request {
+	struct list_head queuelist;  /* Not meant to be directly accessed by
+					the driver.
+					Used by q->elv_next_request_fn
+					rq->queue is gone
+					*/
+	.
+	.
+	unsigned char cmd[16]; /* prebuilt command data block */
+	unsigned long flags;   /* also includes earlier rq->cmd settings */
+	.
+	.
+	sector_t sector; /* this field is now of type sector_t instead of int
+			    preparation for 64 bit sectors */
+	.
+	.
+
+	/* Number of scatter-gather DMA addr+len pairs after
+	 * physical address coalescing is performed.
+	 */
+	unsigned short nr_phys_segments;
+
+	/* Number of scatter-gather addr+len pairs after
+	 * physical and DMA remapping hardware coalescing is performed.
+	 * This is the number of scatter-gather entries the driver
+	 * will actually have to deal with after DMA mapping is done.
+	 */
+	unsigned short nr_hw_segments;
+
+	/* Various sector counts */
+	unsigned long nr_sectors;  /* no. of sectors left: driver modifiable */
+	unsigned long hard_nr_sectors;  /* block internal copy of above */
+	unsigned int current_nr_sectors; /* no. of sectors left in the
+					   current segment:driver modifiable */
+	unsigned long hard_cur_sectors; /* block internal copy of the above */
+	.
+	.
+	int tag;	/* command tag associated with request */
+	void *special;  /* same as before */
+	char *buffer;   /* valid only for low memory buffers up to
+			 current_nr_sectors */
+	.
+	.
+	struct bio *bio, *biotail;  /* bio list instead of bh */
+	struct request_list *rl;
+  }
+
+See the req_ops and req_flag_bits definitions for an explanation of the various
+flags available. Some bits are used by the block layer or i/o scheduler.
+
+The behaviour of the various sector counts are almost the same as before,
+except that since we have multi-segment bios, current_nr_sectors refers
+to the numbers of sectors in the current segment being processed which could
+be one of the many segments in the current bio (i.e i/o completion unit).
+The nr_sectors value refers to the total number of sectors in the whole
+request that remain to be transferred (no change). The purpose of the
+hard_xxx values is for block to remember these counts every time it hands
+over the request to the driver. These values are updated by block on
+end_that_request_first, i.e. every time the driver completes a part of the
+transfer and invokes block end*request helpers to mark this. The
+driver should not modify these values. The block layer sets up the
+nr_sectors and current_nr_sectors fields (based on the corresponding
+hard_xxx values and the number of bytes transferred) and updates it on
+every transfer that invokes end_that_request_first. It does the same for the
+buffer, bio, bio->bi_iter fields too.
+
+The buffer field is just a virtual address mapping of the current segment
+of the i/o buffer in cases where the buffer resides in low-memory. For high
+memory i/o, this field is not valid and must not be used by drivers.
+
+Code that sets up its own request structures and passes them down to
+a driver needs to be careful about interoperation with the block layer helper
+functions which the driver uses. (Section 1.3)
+
+3. Using bios
+=============
+
+3.1 Setup/Teardown
+------------------
+
+There are routines for managing the allocation, and reference counting, and
+freeing of bios (bio_alloc, bio_get, bio_put).
+
+This makes use of Ingo Molnar's mempool implementation, which enables
+subsystems like bio to maintain their own reserve memory pools for guaranteed
+deadlock-free allocations during extreme VM load. For example, the VM
+subsystem makes use of the block layer to writeout dirty pages in order to be
+able to free up memory space, a case which needs careful handling. The
+allocation logic draws from the preallocated emergency reserve in situations
+where it cannot allocate through normal means. If the pool is empty and it
+can wait, then it would trigger action that would help free up memory or
+replenish the pool (without deadlocking) and wait for availability in the pool.
+If it is in IRQ context, and hence not in a position to do this, allocation
+could fail if the pool is empty. In general mempool always first tries to
+perform allocation without having to wait, even if it means digging into the
+pool as long it is not less that 50% full.
+
+On a free, memory is released to the pool or directly freed depending on
+the current availability in the pool. The mempool interface lets the
+subsystem specify the routines to be used for normal alloc and free. In the
+case of bio, these routines make use of the standard slab allocator.
+
+The caller of bio_alloc is expected to taken certain steps to avoid
+deadlocks, e.g. avoid trying to allocate more memory from the pool while
+already holding memory obtained from the pool.
+
+::
+
+  [TBD: This is a potential issue, though a rare possibility
+   in the bounce bio allocation that happens in the current code, since
+   it ends up allocating a second bio from the same pool while
+   holding the original bio ]
+
+Memory allocated from the pool should be released back within a limited
+amount of time (in the case of bio, that would be after the i/o is completed).
+This ensures that if part of the pool has been used up, some work (in this
+case i/o) must already be in progress and memory would be available when it
+is over. If allocating from multiple pools in the same code path, the order
+or hierarchy of allocation needs to be consistent, just the way one deals
+with multiple locks.
+
+The bio_alloc routine also needs to allocate the bio_vec_list (bvec_alloc())
+for a non-clone bio. There are the 6 pools setup for different size biovecs,
+so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the
+given size from these slabs.
+
+The bio_get() routine may be used to hold an extra reference on a bio prior
+to i/o submission, if the bio fields are likely to be accessed after the
+i/o is issued (since the bio may otherwise get freed in case i/o completion
+happens in the meantime).
+
+The bio_clone_fast() routine may be used to duplicate a bio, where the clone
+shares the bio_vec_list with the original bio (i.e. both point to the
+same bio_vec_list). This would typically be used for splitting i/o requests
+in lvm or md.
+
+3.2 Generic bio helper Routines
+-------------------------------
+
+3.2.1 Traversing segments and completion units in a request
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The macro rq_for_each_segment() should be used for traversing the bios
+in the request list (drivers should avoid directly trying to do it
+themselves). Using these helpers should also make it easier to cope
+with block changes in the future.
+
+::
+
+	struct req_iterator iter;
+	rq_for_each_segment(bio_vec, rq, iter)
+		/* bio_vec is now current segment */
+
+I/O completion callbacks are per-bio rather than per-segment, so drivers
+that traverse bio chains on completion need to keep that in mind. Drivers
+which don't make a distinction between segments and completion units would
+need to be reorganized to support multi-segment bios.
+
+3.2.2 Setting up DMA scatterlists
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The blk_rq_map_sg() helper routine would be used for setting up scatter
+gather lists from a request, so a driver need not do it on its own.
+
+	nr_segments = blk_rq_map_sg(q, rq, scatterlist);
+
+The helper routine provides a level of abstraction which makes it easier
+to modify the internals of request to scatterlist conversion down the line
+without breaking drivers. The blk_rq_map_sg routine takes care of several
+things like collapsing physically contiguous segments (if QUEUE_FLAG_CLUSTER
+is set) and correct segment accounting to avoid exceeding the limits which
+the i/o hardware can handle, based on various queue properties.
+
+- Prevents a clustered segment from crossing a 4GB mem boundary
+- Avoids building segments that would exceed the number of physical
+  memory segments that the driver can handle (phys_segments) and the
+  number that the underlying hardware can handle at once, accounting for
+  DMA remapping (hw_segments)  (i.e. IOMMU aware limits).
+
+Routines which the low level driver can use to set up the segment limits:
+
+blk_queue_max_hw_segments() : Sets an upper limit of the maximum number of
+hw data segments in a request (i.e. the maximum number of address/length
+pairs the host adapter can actually hand to the device at once)
+
+blk_queue_max_phys_segments() : Sets an upper limit on the maximum number
+of physical data segments in a request (i.e. the largest sized scatter list
+a driver could handle)
+
+3.2.3 I/O completion
+^^^^^^^^^^^^^^^^^^^^
+
+The existing generic block layer helper routines end_request,
+end_that_request_first and end_that_request_last can be used for i/o
+completion (and setting things up so the rest of the i/o or the next
+request can be kicked of) as before. With the introduction of multi-page
+bio support, end_that_request_first requires an additional argument indicating
+the number of sectors completed.
+
+3.2.4 Implications for drivers that do not interpret bios
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+(don't handle multiple segments)
+
+Drivers that do not interpret bios e.g those which do not handle multiple
+segments and do not support i/o into high memory addresses (require bounce
+buffers) and expect only virtually mapped buffers, can access the rq->buffer
+field. As before the driver should use current_nr_sectors to determine the
+size of remaining data in the current segment (that is the maximum it can
+transfer in one go unless it interprets segments), and rely on the block layer
+end_request, or end_that_request_first/last to take care of all accounting
+and transparent mapping of the next bio segment when a segment boundary
+is crossed on completion of a transfer. (The end*request* functions should
+be used if only if the request has come down from block/bio path, not for
+direct access requests which only specify rq->buffer without a valid rq->bio)
+
+3.3 I/O Submission
+------------------
+
+The routine submit_bio() is used to submit a single io. Higher level i/o
+routines make use of this:
+
+(a) Buffered i/o:
+
+The routine submit_bh() invokes submit_bio() on a bio corresponding to the
+bh, allocating the bio if required. ll_rw_block() uses submit_bh() as before.
+
+(b) Kiobuf i/o (for raw/direct i/o):
+
+The ll_rw_kio() routine breaks up the kiobuf into page sized chunks and
+maps the array to one or more multi-page bios, issuing submit_bio() to
+perform the i/o on each of these.
+
+The embedded bh array in the kiobuf structure has been removed and no
+preallocation of bios is done for kiobufs. [The intent is to remove the
+blocks array as well, but it's currently in there to kludge around direct i/o.]
+Thus kiobuf allocation has switched back to using kmalloc rather than vmalloc.
+
+Todo/Observation:
+
+ A single kiobuf structure is assumed to correspond to a contiguous range
+ of data, so brw_kiovec() invokes ll_rw_kio for each kiobuf in a kiovec.
+ So right now it wouldn't work for direct i/o on non-contiguous blocks.
+ This is to be resolved.  The eventual direction is to replace kiobuf
+ by kvec's.
+
+ Badari Pulavarty has a patch to implement direct i/o correctly using
+ bio and kvec.
+
+
+(c) Page i/o:
+
+Todo/Under discussion:
+
+ Andrew Morton's multi-page bio patches attempt to issue multi-page
+ writeouts (and reads) from the page cache, by directly building up
+ large bios for submission completely bypassing the usage of buffer
+ heads. This work is still in progress.
+
+ Christoph Hellwig had some code that uses bios for page-io (rather than
+ bh). This isn't included in bio as yet. Christoph was also working on a
+ design for representing virtual/real extents as an entity and modifying
+ some of the address space ops interfaces to utilize this abstraction rather
+ than buffer_heads. (This is somewhat along the lines of the SGI XFS pagebuf
+ abstraction, but intended to be as lightweight as possible).
+
+(d) Direct access i/o:
+
+Direct access requests that do not contain bios would be submitted differently
+as discussed earlier in section 1.3.
+
+Aside:
+
+  Kvec i/o:
+
+  Ben LaHaise's aio code uses a slightly different structure instead
+  of kiobufs, called a kvec_cb. This contains an array of <page, offset, len>
+  tuples (very much like the networking code), together with a callback function
+  and data pointer. This is embedded into a brw_cb structure when passed
+  to brw_kvec_async().
+
+  Now it should be possible to directly map these kvecs to a bio. Just as while
+  cloning, in this case rather than PRE_BUILT bio_vecs, we set the bi_io_vec
+  array pointer to point to the veclet array in kvecs.
+
+  TBD: In order for this to work, some changes are needed in the way multi-page
+  bios are handled today. The values of the tuples in such a vector passed in
+  from higher level code should not be modified by the block layer in the course
+  of its request processing, since that would make it hard for the higher layer
+  to continue to use the vector descriptor (kvec) after i/o completes. Instead,
+  all such transient state should either be maintained in the request structure,
+  and passed on in some way to the endio completion routine.
+
+
+4. The I/O scheduler
+====================
+
+I/O scheduler, a.k.a. elevator, is implemented in two layers.  Generic dispatch
+queue and specific I/O schedulers.  Unless stated otherwise, elevator is used
+to refer to both parts and I/O scheduler to specific I/O schedulers.
+
+Block layer implements generic dispatch queue in `block/*.c`.
+The generic dispatch queue is responsible for requeueing, handling non-fs
+requests and all other subtleties.
+
+Specific I/O schedulers are responsible for ordering normal filesystem
+requests.  They can also choose to delay certain requests to improve
+throughput or whatever purpose.  As the plural form indicates, there are
+multiple I/O schedulers.  They can be built as modules but at least one should
+be built inside the kernel.  Each queue can choose different one and can also
+change to another one dynamically.
+
+A block layer call to the i/o scheduler follows the convention elv_xxx(). This
+calls elevator_xxx_fn in the elevator switch (block/elevator.c). Oh, xxx
+and xxx might not match exactly, but use your imagination. If an elevator
+doesn't implement a function, the switch does nothing or some minimal house
+keeping work.
+
+4.1. I/O scheduler API
+----------------------
+
+The functions an elevator may implement are: (* are mandatory)
+
+=============================== ================================================
+elevator_merge_fn		called to query requests for merge with a bio
+
+elevator_merge_req_fn		called when two requests get merged. the one
+				which gets merged into the other one will be
+				never seen by I/O scheduler again. IOW, after
+				being merged, the request is gone.
+
+elevator_merged_fn		called when a request in the scheduler has been
+				involved in a merge. It is used in the deadline
+				scheduler for example, to reposition the request
+				if its sorting order has changed.
+
+elevator_allow_merge_fn		called whenever the block layer determines
+				that a bio can be merged into an existing
+				request safely. The io scheduler may still
+				want to stop a merge at this point if it
+				results in some sort of conflict internally,
+				this hook allows it to do that. Note however
+				that two *requests* can still be merged at later
+				time. Currently the io scheduler has no way to
+				prevent that. It can only learn about the fact
+				from elevator_merge_req_fn callback.
+
+elevator_dispatch_fn*		fills the dispatch queue with ready requests.
+				I/O schedulers are free to postpone requests by
+				not filling the dispatch queue unless @force
+				is non-zero.  Once dispatched, I/O schedulers
+				are not allowed to manipulate the requests -
+				they belong to generic dispatch queue.
+
+elevator_add_req_fn*		called to add a new request into the scheduler
+
+elevator_former_req_fn
+elevator_latter_req_fn		These return the request before or after the
+				one specified in disk sort order. Used by the
+				block layer to find merge possibilities.
+
+elevator_completed_req_fn	called when a request is completed.
+
+elevator_set_req_fn
+elevator_put_req_fn		Must be used to allocate and free any elevator
+				specific storage for a request.
+
+elevator_activate_req_fn	Called when device driver first sees a request.
+				I/O schedulers can use this callback to
+				determine when actual execution of a request
+				starts.
+elevator_deactivate_req_fn	Called when device driver decides to delay
+				a request by requeueing it.
+
+elevator_init_fn*
+elevator_exit_fn		Allocate and free any elevator specific storage
+				for a queue.
+=============================== ================================================
+
+4.2 Request flows seen by I/O schedulers
+----------------------------------------
+
+All requests seen by I/O schedulers strictly follow one of the following three
+flows.
+
+ set_req_fn ->
+
+ i.   add_req_fn -> (merged_fn ->)* -> dispatch_fn -> activate_req_fn ->
+      (deactivate_req_fn -> activate_req_fn ->)* -> completed_req_fn
+ ii.  add_req_fn -> (merged_fn ->)* -> merge_req_fn
+ iii. [none]
+
+ -> put_req_fn
+
+4.3 I/O scheduler implementation
+--------------------------------
+
+The generic i/o scheduler algorithm attempts to sort/merge/batch requests for
+optimal disk scan and request servicing performance (based on generic
+principles and device capabilities), optimized for:
+
+i.   improved throughput
+ii.  improved latency
+iii. better utilization of h/w & CPU time
+
+Characteristics:
+
+i. Binary tree
+AS and deadline i/o schedulers use red black binary trees for disk position
+sorting and searching, and a fifo linked list for time-based searching. This
+gives good scalability and good availability of information. Requests are
+almost always dispatched in disk sort order, so a cache is kept of the next
+request in sort order to prevent binary tree lookups.
+
+This arrangement is not a generic block layer characteristic however, so
+elevators may implement queues as they please.
+
+ii. Merge hash
+AS and deadline use a hash table indexed by the last sector of a request. This
+enables merging code to quickly look up "back merge" candidates, even when
+multiple I/O streams are being performed at once on one disk.
+
+"Front merges", a new request being merged at the front of an existing request,
+are far less common than "back merges" due to the nature of most I/O patterns.
+Front merges are handled by the binary trees in AS and deadline schedulers.
+
+iii. Plugging the queue to batch requests in anticipation of opportunities for
+     merge/sort optimizations
+
+Plugging is an approach that the current i/o scheduling algorithm resorts to so
+that it collects up enough requests in the queue to be able to take
+advantage of the sorting/merging logic in the elevator. If the
+queue is empty when a request comes in, then it plugs the request queue
+(sort of like plugging the bath tub of a vessel to get fluid to build up)
+till it fills up with a few more requests, before starting to service
+the requests. This provides an opportunity to merge/sort the requests before
+passing them down to the device. There are various conditions when the queue is
+unplugged (to open up the flow again), either through a scheduled task or
+could be on demand. For example wait_on_buffer sets the unplugging going
+through sync_buffer() running blk_run_address_space(mapping). Or the caller
+can do it explicity through blk_unplug(bdev). So in the read case,
+the queue gets explicitly unplugged as part of waiting for completion on that
+buffer.
+
+Aside:
+  This is kind of controversial territory, as it's not clear if plugging is
+  always the right thing to do. Devices typically have their own queues,
+  and allowing a big queue to build up in software, while letting the device be
+  idle for a while may not always make sense. The trick is to handle the fine
+  balance between when to plug and when to open up. Also now that we have
+  multi-page bios being queued in one shot, we may not need to wait to merge
+  a big request from the broken up pieces coming by.
+
+4.4 I/O contexts
+----------------
+
+I/O contexts provide a dynamically allocated per process data area. They may
+be used in I/O schedulers, and in the block layer (could be used for IO statis,
+priorities for example). See `*io_context` in block/ll_rw_blk.c, and as-iosched.c
+for an example of usage in an i/o scheduler.
+
+
+5. Scalability related changes
+==============================
+
+5.1 Granular Locking: io_request_lock replaced by a per-queue lock
+------------------------------------------------------------------
+
+The global io_request_lock has been removed as of 2.5, to avoid
+the scalability bottleneck it was causing, and has been replaced by more
+granular locking. The request queue structure has a pointer to the
+lock to be used for that queue. As a result, locking can now be
+per-queue, with a provision for sharing a lock across queues if
+necessary (e.g the scsi layer sets the queue lock pointers to the
+corresponding adapter lock, which results in a per host locking
+granularity). The locking semantics are the same, i.e. locking is
+still imposed by the block layer, grabbing the lock before
+request_fn execution which it means that lots of older drivers
+should still be SMP safe. Drivers are free to drop the queue
+lock themselves, if required. Drivers that explicitly used the
+io_request_lock for serialization need to be modified accordingly.
+Usually it's as easy as adding a global lock::
+
+	static DEFINE_SPINLOCK(my_driver_lock);
+
+and passing the address to that lock to blk_init_queue().
+
+5.2 64 bit sector numbers (sector_t prepares for 64 bit support)
+----------------------------------------------------------------
+
+The sector number used in the bio structure has been changed to sector_t,
+which could be defined as 64 bit in preparation for 64 bit sector support.
+
+6. Other Changes/Implications
+=============================
+
+6.1 Partition re-mapping handled by the generic block layer
+-----------------------------------------------------------
+
+In 2.5 some of the gendisk/partition related code has been reorganized.
+Now the generic block layer performs partition-remapping early and thus
+provides drivers with a sector number relative to whole device, rather than
+having to take partition number into account in order to arrive at the true
+sector number. The routine blk_partition_remap() is invoked by
+generic_make_request even before invoking the queue specific make_request_fn,
+so the i/o scheduler also gets to operate on whole disk sector numbers. This
+should typically not require changes to block drivers, it just never gets
+to invoke its own partition sector offset calculations since all bios
+sent are offset from the beginning of the device.
+
+
+7. A Few Tips on Migration of older drivers
+===========================================
+
+Old-style drivers that just use CURRENT and ignores clustered requests,
+may not need much change.  The generic layer will automatically handle
+clustered requests, multi-page bios, etc for the driver.
+
+For a low performance driver or hardware that is PIO driven or just doesn't
+support scatter-gather changes should be minimal too.
+
+The following are some points to keep in mind when converting old drivers
+to bio.
+
+Drivers should use elv_next_request to pick up requests and are no longer
+supposed to handle looping directly over the request list.
+(struct request->queue has been removed)
+
+Now end_that_request_first takes an additional number_of_sectors argument.
+It used to handle always just the first buffer_head in a request, now
+it will loop and handle as many sectors (on a bio-segment granularity)
+as specified.
+
+Now bh->b_end_io is replaced by bio->bi_end_io, but most of the time the
+right thing to use is bio_endio(bio) instead.
+
+If the driver is dropping the io_request_lock from its request_fn strategy,
+then it just needs to replace that with q->queue_lock instead.
+
+As described in Sec 1.1, drivers can set max sector size, max segment size
+etc per queue now. Drivers that used to define their own merge functions i
+to handle things like this can now just use the blk_queue_* functions at
+blk_init_queue time.
+
+Drivers no longer have to map a {partition, sector offset} into the
+correct absolute location anymore, this is done by the block layer, so
+where a driver received a request ala this before::
+
+	rq->rq_dev = mk_kdev(3, 5);	/* /dev/hda5 */
+	rq->sector = 0;			/* first sector on hda5 */
+
+it will now see::
+
+	rq->rq_dev = mk_kdev(3, 0);	/* /dev/hda */
+	rq->sector = 123128;		/* offset from start of disk */
+
+As mentioned, there is no virtual mapping of a bio. For DMA, this is
+not a problem as the driver probably never will need a virtual mapping.
+Instead it needs a bus mapping (dma_map_page for a single segment or
+use dma_map_sg for scatter gather) to be able to ship it to the driver. For
+PIO drivers (or drivers that need to revert to PIO transfer once in a
+while (IDE for example)), where the CPU is doing the actual data
+transfer a virtual mapping is needed. If the driver supports highmem I/O,
+(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map
+a bio into the virtual address space.
+
+
+8. Prior/Related/Impacted patches
+=================================
+
+8.1. Earlier kiobuf patches (sct/axboe/chait/hch/mkp)
+-----------------------------------------------------
+
+- orig kiobuf & raw i/o patches (now in 2.4 tree)
+- direct kiobuf based i/o to devices (no intermediate bh's)
+- page i/o using kiobuf
+- kiobuf splitting for lvm (mkp)
+- elevator support for kiobuf request merging (axboe)
+
+8.2. Zero-copy networking (Dave Miller)
+---------------------------------------
+
+8.3. SGI XFS - pagebuf patches - use of kiobufs
+-----------------------------------------------
+8.4. Multi-page pioent patch for bio (Christoph Hellwig)
+--------------------------------------------------------
+8.5. Direct i/o implementation (Andrea Arcangeli) since 2.4.10-pre11
+--------------------------------------------------------------------
+8.6. Async i/o implementation patch (Ben LaHaise)
+-------------------------------------------------
+8.7. EVMS layering design (IBM EVMS team)
+-----------------------------------------
+8.8. Larger page cache size patch (Ben LaHaise) and Large page size (Daniel Phillips)
+-------------------------------------------------------------------------------------
+
+    => larger contiguous physical memory buffers
+
+8.9. VM reservations patch (Ben LaHaise)
+----------------------------------------
+8.10. Write clustering patches ? (Marcelo/Quintela/Riel ?)
+----------------------------------------------------------
+8.11. Block device in page cache patch (Andrea Archangeli) - now in 2.4.10+
+---------------------------------------------------------------------------
+8.12. Multiple block-size transfers for faster raw i/o (Shailabh Nagar, Badari)
+-------------------------------------------------------------------------------
+8.13  Priority based i/o scheduler - prepatches (Arjan van de Ven)
+------------------------------------------------------------------
+8.14  IDE Taskfile i/o patch (Andre Hedrick)
+--------------------------------------------
+8.15  Multi-page writeout and readahead patches (Andrew Morton)
+---------------------------------------------------------------
+8.16  Direct i/o patches for 2.5 using kvec and bio (Badari Pulavarthy)
+-----------------------------------------------------------------------
+
+9. Other References
+===================
+
+9.1 The Splice I/O Model
+------------------------
+
+Larry McVoy (and subsequent discussions on lkml, and Linus' comments - Jan 2001
+
+9.2 Discussions about kiobuf and bh design
+------------------------------------------
+
+On lkml between sct, linus, alan et al - Feb-March 2001 (many of the
+initial thoughts that led to bio were brought up in this discussion thread)
+
+9.3 Discussions on mempool on lkml - Dec 2001.
+----------------------------------------------
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
deleted file mode 100644
index ac18b488cb5e..000000000000
--- a/Documentation/block/biodoc.txt
+++ /dev/null
@@ -1,1077 +0,0 @@
-	Notes on the Generic Block Layer Rewrite in Linux 2.5
-	=====================================================
-
-Notes Written on Jan 15, 2002:
-	Jens Axboe <jens.axboe@oracle.com>
-	Suparna Bhattacharya <suparna@in.ibm.com>
-
-Last Updated May 2, 2002
-September 2003: Updated I/O Scheduler portions
-	Nick Piggin <npiggin@kernel.dk>
-
-Introduction:
-
-These are some notes describing some aspects of the 2.5 block layer in the
-context of the bio rewrite. The idea is to bring out some of the key
-changes and a glimpse of the rationale behind those changes.
-
-Please mail corrections & suggestions to suparna@in.ibm.com.
-
-Credits:
----------
-
-2.5 bio rewrite:
-	Jens Axboe <jens.axboe@oracle.com>
-
-Many aspects of the generic block layer redesign were driven by and evolved
-over discussions, prior patches and the collective experience of several
-people. See sections 8 and 9 for a list of some related references.
-
-The following people helped with review comments and inputs for this
-document:
-	Christoph Hellwig <hch@infradead.org>
-	Arjan van de Ven <arjanv@redhat.com>
-	Randy Dunlap <rdunlap@xenotime.net>
-	Andre Hedrick <andre@linux-ide.org>
-
-The following people helped with fixes/contributions to the bio patches
-while it was still work-in-progress:
-	David S. Miller <davem@redhat.com>
-
-
-Description of Contents:
-------------------------
-
-1. Scope for tuning of logic to various needs
-  1.1 Tuning based on device or low level driver capabilities
-	- Per-queue parameters
-	- Highmem I/O support
-	- I/O scheduler modularization
-  1.2 Tuning based on high level requirements/capabilities
-	1.2.1 Request Priority/Latency
-  1.3 Direct access/bypass to lower layers for diagnostics and special
-      device operations
-	1.3.1 Pre-built commands
-2. New flexible and generic but minimalist i/o structure or descriptor
-   (instead of using buffer heads at the i/o layer)
-  2.1 Requirements/Goals addressed
-  2.2 The bio struct in detail (multi-page io unit)
-  2.3 Changes in the request structure
-3. Using bios
-  3.1 Setup/teardown (allocation, splitting)
-  3.2 Generic bio helper routines
-    3.2.1 Traversing segments and completion units in a request
-    3.2.2 Setting up DMA scatterlists
-    3.2.3 I/O completion
-    3.2.4 Implications for drivers that do not interpret bios (don't handle
- 	  multiple segments)
-  3.3 I/O submission
-4. The I/O scheduler
-5. Scalability related changes
-  5.1 Granular locking: Removal of io_request_lock
-  5.2 Prepare for transition to 64 bit sector_t
-6. Other Changes/Implications
-  6.1 Partition re-mapping handled by the generic block layer
-7. A few tips on migration of older drivers
-8. A list of prior/related/impacted patches/ideas
-9. Other References/Discussion Threads
-
----------------------------------------------------------------------------
-
-Bio Notes
---------
-
-Let us discuss the changes in the context of how some overall goals for the
-block layer are addressed.
-
-1. Scope for tuning the generic logic to satisfy various requirements
-
-The block layer design supports adaptable abstractions to handle common
-processing with the ability to tune the logic to an appropriate extent
-depending on the nature of the device and the requirements of the caller.
-One of the objectives of the rewrite was to increase the degree of tunability
-and to enable higher level code to utilize underlying device/driver
-capabilities to the maximum extent for better i/o performance. This is
-important especially in the light of ever improving hardware capabilities
-and application/middleware software designed to take advantage of these
-capabilities.
-
-1.1 Tuning based on low level device / driver capabilities
-
-Sophisticated devices with large built-in caches, intelligent i/o scheduling
-optimizations, high memory DMA support, etc may find some of the
-generic processing an overhead, while for less capable devices the
-generic functionality is essential for performance or correctness reasons.
-Knowledge of some of the capabilities or parameters of the device should be
-used at the generic block layer to take the right decisions on
-behalf of the driver.
-
-How is this achieved ?
-
-Tuning at a per-queue level:
-
-i. Per-queue limits/values exported to the generic layer by the driver
-
-Various parameters that the generic i/o scheduler logic uses are set at
-a per-queue level (e.g maximum request size, maximum number of segments in
-a scatter-gather list, logical block size)
-
-Some parameters that were earlier available as global arrays indexed by
-major/minor are now directly associated with the queue. Some of these may
-move into the block device structure in the future. Some characteristics
-have been incorporated into a queue flags field rather than separate fields
-in themselves.  There are blk_queue_xxx functions to set the parameters,
-rather than update the fields directly
-
-Some new queue property settings:
-
-	blk_queue_bounce_limit(q, u64 dma_address)
-		Enable I/O to highmem pages, dma_address being the
-		limit. No highmem default.
-
-	blk_queue_max_sectors(q, max_sectors)
-		Sets two variables that limit the size of the request.
-
-		- The request queue's max_sectors, which is a soft size in
-		units of 512 byte sectors, and could be dynamically varied
-		by the core kernel.
-
-		- The request queue's max_hw_sectors, which is a hard limit
-		and reflects the maximum size request a driver can handle
-		in units of 512 byte sectors.
-
-		The default for both max_sectors and max_hw_sectors is
-		255. The upper limit of max_sectors is 1024.
-
-	blk_queue_max_phys_segments(q, max_segments)
-		Maximum physical segments you can handle in a request. 128
-		default (driver limit). (See 3.2.2)
-
-	blk_queue_max_hw_segments(q, max_segments)
-		Maximum dma segments the hardware can handle in a request. 128
-		default (host adapter limit, after dma remapping).
-		(See 3.2.2)
-
-	blk_queue_max_segment_size(q, max_seg_size)
-		Maximum size of a clustered segment, 64kB default.
-
-	blk_queue_logical_block_size(q, logical_block_size)
-		Lowest possible sector size that the hardware can operate
-		on, 512 bytes default.
-
-New queue flags:
-
-	QUEUE_FLAG_CLUSTER (see 3.2.2)
-	QUEUE_FLAG_QUEUED (see 3.2.4)
-
-
-ii. High-mem i/o capabilities are now considered the default
-
-The generic bounce buffer logic, present in 2.4, where the block layer would
-by default copyin/out i/o requests on high-memory buffers to low-memory buffers
-assuming that the driver wouldn't be able to handle it directly, has been
-changed in 2.5. The bounce logic is now applied only for memory ranges
-for which the device cannot handle i/o. A driver can specify this by
-setting the queue bounce limit for the request queue for the device
-(blk_queue_bounce_limit()). This avoids the inefficiencies of the copyin/out
-where a device is capable of handling high memory i/o.
-
-In order to enable high-memory i/o where the device is capable of supporting
-it, the pci dma mapping routines and associated data structures have now been
-modified to accomplish a direct page -> bus translation, without requiring
-a virtual address mapping (unlike the earlier scheme of virtual address
--> bus translation). So this works uniformly for high-memory pages (which
-do not have a corresponding kernel virtual address space mapping) and
-low-memory pages.
-
-Note: Please refer to Documentation/DMA-API-HOWTO.txt for a discussion
-on PCI high mem DMA aspects and mapping of scatter gather lists, and support
-for 64 bit PCI.
-
-Special handling is required only for cases where i/o needs to happen on
-pages at physical memory addresses beyond what the device can support. In these
-cases, a bounce bio representing a buffer from the supported memory range
-is used for performing the i/o with copyin/copyout as needed depending on
-the type of the operation.  For example, in case of a read operation, the
-data read has to be copied to the original buffer on i/o completion, so a
-callback routine is set up to do this, while for write, the data is copied
-from the original buffer to the bounce buffer prior to issuing the
-operation. Since an original buffer may be in a high memory area that's not
-mapped in kernel virtual addr, a kmap operation may be required for
-performing the copy, and special care may be needed in the completion path
-as it may not be in irq context. Special care is also required (by way of
-GFP flags) when allocating bounce buffers, to avoid certain highmem
-deadlock possibilities.
-
-It is also possible that a bounce buffer may be allocated from high-memory
-area that's not mapped in kernel virtual addr, but within the range that the
-device can use directly; so the bounce page may need to be kmapped during
-copy operations. [Note: This does not hold in the current implementation,
-though]
-
-There are some situations when pages from high memory may need to
-be kmapped, even if bounce buffers are not necessary. For example a device
-may need to abort DMA operations and revert to PIO for the transfer, in
-which case a virtual mapping of the page is required. For SCSI it is also
-done in some scenarios where the low level driver cannot be trusted to
-handle a single sg entry correctly. The driver is expected to perform the
-kmaps as needed on such occasions as appropriate. A driver could also use
-the blk_queue_bounce() routine on its own to bounce highmem i/o to low
-memory for specific requests if so desired.
-
-iii. The i/o scheduler algorithm itself can be replaced/set as appropriate
-
-As in 2.4, it is possible to plugin a brand new i/o scheduler for a particular
-queue or pick from (copy) existing generic schedulers and replace/override
-certain portions of it. The 2.5 rewrite provides improved modularization
-of the i/o scheduler. There are more pluggable callbacks, e.g for init,
-add request, extract request, which makes it possible to abstract specific
-i/o scheduling algorithm aspects and details outside of the generic loop.
-It also makes it possible to completely hide the implementation details of
-the i/o scheduler from block drivers.
-
-I/O scheduler wrappers are to be used instead of accessing the queue directly.
-See section 4. The I/O scheduler for details.
-
-1.2 Tuning Based on High level code capabilities
-
-i. Application capabilities for raw i/o
-
-This comes from some of the high-performance database/middleware
-requirements where an application prefers to make its own i/o scheduling
-decisions based on an understanding of the access patterns and i/o
-characteristics
-
-ii. High performance filesystems or other higher level kernel code's
-capabilities
-
-Kernel components like filesystems could also take their own i/o scheduling
-decisions for optimizing performance. Journalling filesystems may need
-some control over i/o ordering.
-
-What kind of support exists at the generic block layer for this ?
-
-The flags and rw fields in the bio structure can be used for some tuning
-from above e.g indicating that an i/o is just a readahead request, or priority
-settings (currently unused). As far as user applications are concerned they
-would need an additional mechanism either via open flags or ioctls, or some
-other upper level mechanism to communicate such settings to block.
-
-1.2.1 Request Priority/Latency
-
-Todo/Under discussion:
-Arjan's proposed request priority scheme allows higher levels some broad
-  control (high/med/low) over the priority  of an i/o request vs other pending
-  requests in the queue. For example it allows reads for bringing in an
-  executable page on demand to be given a higher priority over pending write
-  requests which haven't aged too much on the queue. Potentially this priority
-  could even be exposed to applications in some manner, providing higher level
-  tunability. Time based aging avoids starvation of lower priority
-  requests. Some bits in the bi_opf flags field in the bio structure are
-  intended to be used for this priority information.
-
-
-1.3 Direct Access to Low level Device/Driver Capabilities (Bypass mode)
-    (e.g Diagnostics, Systems Management)
-
-There are situations where high-level code needs to have direct access to
-the low level device capabilities or requires the ability to issue commands
-to the device bypassing some of the intermediate i/o layers.
-These could, for example, be special control commands issued through ioctl
-interfaces, or could be raw read/write commands that stress the drive's
-capabilities for certain kinds of fitness tests. Having direct interfaces at
-multiple levels without having to pass through upper layers makes
-it possible to perform bottom up validation of the i/o path, layer by
-layer, starting from the media.
-
-The normal i/o submission interfaces, e.g submit_bio, could be bypassed
-for specially crafted requests which such ioctl or diagnostics
-interfaces would typically use, and the elevator add_request routine
-can instead be used to directly insert such requests in the queue or preferably
-the blk_do_rq routine can be used to place the request on the queue and
-wait for completion. Alternatively, sometimes the caller might just
-invoke a lower level driver specific interface with the request as a
-parameter.
-
-If the request is a means for passing on special information associated with
-the command, then such information is associated with the request->special
-field (rather than misuse the request->buffer field which is meant for the
-request data buffer's virtual mapping).
-
-For passing request data, the caller must build up a bio descriptor
-representing the concerned memory buffer if the underlying driver interprets
-bio segments or uses the block layer end*request* functions for i/o
-completion. Alternatively one could directly use the request->buffer field to
-specify the virtual address of the buffer, if the driver expects buffer
-addresses passed in this way and ignores bio entries for the request type
-involved. In the latter case, the driver would modify and manage the
-request->buffer, request->sector and request->nr_sectors or
-request->current_nr_sectors fields itself rather than using the block layer
-end_request or end_that_request_first completion interfaces.
-(See 2.3 or Documentation/block/request.txt for a brief explanation of
-the request structure fields)
-
-[TBD: end_that_request_last should be usable even in this case;
-Perhaps an end_that_direct_request_first routine could be implemented to make
-handling direct requests easier for such drivers; Also for drivers that
-expect bios, a helper function could be provided for setting up a bio
-corresponding to a data buffer]
-
-<JENS: I dont understand the above, why is end_that_request_first() not
-usable? Or _last for that matter. I must be missing something>
-<SUP: What I meant here was that if the request doesn't have a bio, then
- end_that_request_first doesn't modify nr_sectors or current_nr_sectors,
- and hence can't be used for advancing request state settings on the
- completion of partial transfers. The driver has to modify these fields 
- directly by hand.
- This is because end_that_request_first only iterates over the bio list,
- and always returns 0 if there are none associated with the request.
- _last works OK in this case, and is not a problem, as I mentioned earlier
->
-
-1.3.1 Pre-built Commands
-
-A request can be created with a pre-built custom command  to be sent directly
-to the device. The cmd block in the request structure has room for filling
-in the command bytes. (i.e rq->cmd is now 16 bytes in size, and meant for
-command pre-building, and the type of the request is now indicated
-through rq->flags instead of via rq->cmd)
-
-The request structure flags can be set up to indicate the type of request
-in such cases (REQ_PC: direct packet command passed to driver, REQ_BLOCK_PC:
-packet command issued via blk_do_rq, REQ_SPECIAL: special request).
-
-It can help to pre-build device commands for requests in advance.
-Drivers can now specify a request prepare function (q->prep_rq_fn) that the
-block layer would invoke to pre-build device commands for a given request,
-or perform other preparatory processing for the request. This is routine is
-called by elv_next_request(), i.e. typically just before servicing a request.
-(The prepare function would not be called for requests that have RQF_DONTPREP
-enabled)
-
-Aside:
-  Pre-building could possibly even be done early, i.e before placing the
-  request on the queue, rather than construct the command on the fly in the
-  driver while servicing the request queue when it may affect latencies in
-  interrupt context or responsiveness in general. One way to add early
-  pre-building would be to do it whenever we fail to merge on a request.
-  Now REQ_NOMERGE is set in the request flags to skip this one in the future,
-  which means that it will not change before we feed it to the device. So
-  the pre-builder hook can be invoked there.
-
-
-2. Flexible and generic but minimalist i/o structure/descriptor.
-
-2.1 Reason for a new structure and requirements addressed
-
-Prior to 2.5, buffer heads were used as the unit of i/o at the generic block
-layer, and the low level request structure was associated with a chain of
-buffer heads for a contiguous i/o request. This led to certain inefficiencies
-when it came to large i/o requests and readv/writev style operations, as it
-forced such requests to be broken up into small chunks before being passed
-on to the generic block layer, only to be merged by the i/o scheduler
-when the underlying device was capable of handling the i/o in one shot.
-Also, using the buffer head as an i/o structure for i/os that didn't originate
-from the buffer cache unnecessarily added to the weight of the descriptors
-which were generated for each such chunk.
-
-The following were some of the goals and expectations considered in the
-redesign of the block i/o data structure in 2.5.
-
-i.  Should be appropriate as a descriptor for both raw and buffered i/o  -
-    avoid cache related fields which are irrelevant in the direct/page i/o path,
-    or filesystem block size alignment restrictions which may not be relevant
-    for raw i/o.
-ii. Ability to represent high-memory buffers (which do not have a virtual
-    address mapping in kernel address space).
-iii.Ability to represent large i/os w/o unnecessarily breaking them up (i.e
-    greater than PAGE_SIZE chunks in one shot)
-iv. At the same time, ability to retain independent identity of i/os from
-    different sources or i/o units requiring individual completion (e.g. for
-    latency reasons)
-v.  Ability to represent an i/o involving multiple physical memory segments
-    (including non-page aligned page fragments, as specified via readv/writev)
-    without unnecessarily breaking it up, if the underlying device is capable of
-    handling it.
-vi. Preferably should be based on a memory descriptor structure that can be
-    passed around different types of subsystems or layers, maybe even
-    networking, without duplication or extra copies of data/descriptor fields
-    themselves in the process
-vii.Ability to handle the possibility of splits/merges as the structure passes
-    through layered drivers (lvm, md, evms), with minimal overhead.
-
-The solution was to define a new structure (bio)  for the block layer,
-instead of using the buffer head structure (bh) directly, the idea being
-avoidance of some associated baggage and limitations. The bio structure
-is uniformly used for all i/o at the block layer ; it forms a part of the
-bh structure for buffered i/o, and in the case of raw/direct i/o kiobufs are
-mapped to bio structures.
-
-2.2 The bio struct
-
-The bio structure uses a vector representation pointing to an array of tuples
-of <page, offset, len> to describe the i/o buffer, and has various other
-fields describing i/o parameters and state that needs to be maintained for
-performing the i/o.
-
-Notice that this representation means that a bio has no virtual address
-mapping at all (unlike buffer heads).
-
-struct bio_vec {
-       struct page     *bv_page;
-       unsigned short  bv_len;
-       unsigned short  bv_offset;
-};
-
-/*
- * main unit of I/O for the block layer and lower layers (ie drivers)
- */
-struct bio {
-       struct bio          *bi_next;    /* request queue link */
-       struct block_device *bi_bdev;	/* target device */
-       unsigned long       bi_flags;    /* status, command, etc */
-       unsigned long       bi_opf;       /* low bits: r/w, high: priority */
-
-       unsigned int	bi_vcnt;     /* how may bio_vec's */
-       struct bvec_iter	bi_iter;	/* current index into bio_vec array */
-
-       unsigned int	bi_size;     /* total size in bytes */
-       unsigned short 	bi_phys_segments; /* segments after physaddr coalesce*/
-       unsigned short	bi_hw_segments; /* segments after DMA remapping */
-       unsigned int	bi_max;	     /* max bio_vecs we can hold
-                                        used as index into pool */
-       struct bio_vec   *bi_io_vec;  /* the actual vec list */
-       bio_end_io_t	*bi_end_io;  /* bi_end_io (bio) */
-       atomic_t		bi_cnt;	     /* pin count: free when it hits zero */
-       void             *bi_private;
-};
-
-With this multipage bio design:
-
-- Large i/os can be sent down in one go using a bio_vec list consisting
-  of an array of <page, offset, len> fragments (similar to the way fragments
-  are represented in the zero-copy network code)
-- Splitting of an i/o request across multiple devices (as in the case of
-  lvm or raid) is achieved by cloning the bio (where the clone points to
-  the same bi_io_vec array, but with the index and size accordingly modified)
-- A linked list of bios is used as before for unrelated merges (*) - this
-  avoids reallocs and makes independent completions easier to handle.
-- Code that traverses the req list can find all the segments of a bio
-  by using rq_for_each_segment.  This handles the fact that a request
-  has multiple bios, each of which can have multiple segments.
-- Drivers which can't process a large bio in one shot can use the bi_iter
-  field to keep track of the next bio_vec entry to process.
-  (e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE)
-  [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying
-   bi_offset an len fields]
-
-(*) unrelated merges -- a request ends up containing two or more bios that
-    didn't originate from the same place.
-
-bi_end_io() i/o callback gets called on i/o completion of the entire bio.
-
-At a lower level, drivers build a scatter gather list from the merged bios.
-The scatter gather list is in the form of an array of <page, offset, len>
-entries with their corresponding dma address mappings filled in at the
-appropriate time. As an optimization, contiguous physical pages can be
-covered by a single entry where <page> refers to the first page and <len>
-covers the range of pages (up to 16 contiguous pages could be covered this
-way). There is a helper routine (blk_rq_map_sg) which drivers can use to build
-the sg list.
-
-Note: Right now the only user of bios with more than one page is ll_rw_kio,
-which in turn means that only raw I/O uses it (direct i/o may not work
-right now). The intent however is to enable clustering of pages etc to
-become possible. The pagebuf abstraction layer from SGI also uses multi-page
-bios, but that is currently not included in the stock development kernels.
-The same is true of Andrew Morton's work-in-progress multipage bio writeout 
-and readahead patches.
-
-2.3 Changes in the Request Structure
-
-The request structure is the structure that gets passed down to low level
-drivers. The block layer make_request function builds up a request structure,
-places it on the queue and invokes the drivers request_fn. The driver makes
-use of block layer helper routine elv_next_request to pull the next request
-off the queue. Control or diagnostic functions might bypass block and directly
-invoke underlying driver entry points passing in a specially constructed
-request structure.
-
-Only some relevant fields (mainly those which changed or may be referred
-to in some of the discussion here) are listed below, not necessarily in
-the order in which they occur in the structure (see include/linux/blkdev.h)
-Refer to Documentation/block/request.txt for details about all the request
-structure fields and a quick reference about the layers which are
-supposed to use or modify those fields.
-
-struct request {
-	struct list_head queuelist;  /* Not meant to be directly accessed by
-					the driver.
-					Used by q->elv_next_request_fn
-					rq->queue is gone
-					*/
-	.
-	.
-	unsigned char cmd[16]; /* prebuilt command data block */
-	unsigned long flags;   /* also includes earlier rq->cmd settings */
-	.
-	.
-	sector_t sector; /* this field is now of type sector_t instead of int
-			    preparation for 64 bit sectors */
-	.
-	.
-
-	/* Number of scatter-gather DMA addr+len pairs after
-	 * physical address coalescing is performed.
-	 */
-	unsigned short nr_phys_segments;
-
-	/* Number of scatter-gather addr+len pairs after
-	 * physical and DMA remapping hardware coalescing is performed.
-	 * This is the number of scatter-gather entries the driver
-	 * will actually have to deal with after DMA mapping is done.
-	 */
-	unsigned short nr_hw_segments;
-
-	/* Various sector counts */
-	unsigned long nr_sectors;  /* no. of sectors left: driver modifiable */
-	unsigned long hard_nr_sectors;  /* block internal copy of above */
-	unsigned int current_nr_sectors; /* no. of sectors left in the
-					   current segment:driver modifiable */
-	unsigned long hard_cur_sectors; /* block internal copy of the above */
-	.
-	.
-	int tag;	/* command tag associated with request */
-	void *special;  /* same as before */
-	char *buffer;   /* valid only for low memory buffers up to
-			 current_nr_sectors */
-	.
-	.
-	struct bio *bio, *biotail;  /* bio list instead of bh */
-	struct request_list *rl;
-}
-	
-See the req_ops and req_flag_bits definitions for an explanation of the various
-flags available. Some bits are used by the block layer or i/o scheduler.
-	
-The behaviour of the various sector counts are almost the same as before,
-except that since we have multi-segment bios, current_nr_sectors refers
-to the numbers of sectors in the current segment being processed which could
-be one of the many segments in the current bio (i.e i/o completion unit).
-The nr_sectors value refers to the total number of sectors in the whole
-request that remain to be transferred (no change). The purpose of the
-hard_xxx values is for block to remember these counts every time it hands
-over the request to the driver. These values are updated by block on
-end_that_request_first, i.e. every time the driver completes a part of the
-transfer and invokes block end*request helpers to mark this. The
-driver should not modify these values. The block layer sets up the
-nr_sectors and current_nr_sectors fields (based on the corresponding
-hard_xxx values and the number of bytes transferred) and updates it on
-every transfer that invokes end_that_request_first. It does the same for the
-buffer, bio, bio->bi_iter fields too.
-
-The buffer field is just a virtual address mapping of the current segment
-of the i/o buffer in cases where the buffer resides in low-memory. For high
-memory i/o, this field is not valid and must not be used by drivers.
-
-Code that sets up its own request structures and passes them down to
-a driver needs to be careful about interoperation with the block layer helper
-functions which the driver uses. (Section 1.3)
-
-3. Using bios
-
-3.1 Setup/Teardown
-
-There are routines for managing the allocation, and reference counting, and
-freeing of bios (bio_alloc, bio_get, bio_put).
-
-This makes use of Ingo Molnar's mempool implementation, which enables
-subsystems like bio to maintain their own reserve memory pools for guaranteed
-deadlock-free allocations during extreme VM load. For example, the VM
-subsystem makes use of the block layer to writeout dirty pages in order to be
-able to free up memory space, a case which needs careful handling. The
-allocation logic draws from the preallocated emergency reserve in situations
-where it cannot allocate through normal means. If the pool is empty and it
-can wait, then it would trigger action that would help free up memory or
-replenish the pool (without deadlocking) and wait for availability in the pool.
-If it is in IRQ context, and hence not in a position to do this, allocation
-could fail if the pool is empty. In general mempool always first tries to
-perform allocation without having to wait, even if it means digging into the
-pool as long it is not less that 50% full.
-
-On a free, memory is released to the pool or directly freed depending on
-the current availability in the pool. The mempool interface lets the
-subsystem specify the routines to be used for normal alloc and free. In the
-case of bio, these routines make use of the standard slab allocator.
-
-The caller of bio_alloc is expected to taken certain steps to avoid
-deadlocks, e.g. avoid trying to allocate more memory from the pool while
-already holding memory obtained from the pool.
-[TBD: This is a potential issue, though a rare possibility
- in the bounce bio allocation that happens in the current code, since
- it ends up allocating a second bio from the same pool while
- holding the original bio ]
-
-Memory allocated from the pool should be released back within a limited
-amount of time (in the case of bio, that would be after the i/o is completed).
-This ensures that if part of the pool has been used up, some work (in this
-case i/o) must already be in progress and memory would be available when it
-is over. If allocating from multiple pools in the same code path, the order
-or hierarchy of allocation needs to be consistent, just the way one deals
-with multiple locks.
-
-The bio_alloc routine also needs to allocate the bio_vec_list (bvec_alloc())
-for a non-clone bio. There are the 6 pools setup for different size biovecs,
-so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the
-given size from these slabs.
-
-The bio_get() routine may be used to hold an extra reference on a bio prior
-to i/o submission, if the bio fields are likely to be accessed after the
-i/o is issued (since the bio may otherwise get freed in case i/o completion
-happens in the meantime).
-
-The bio_clone_fast() routine may be used to duplicate a bio, where the clone
-shares the bio_vec_list with the original bio (i.e. both point to the
-same bio_vec_list). This would typically be used for splitting i/o requests
-in lvm or md.
-
-3.2 Generic bio helper Routines
-
-3.2.1 Traversing segments and completion units in a request
-
-The macro rq_for_each_segment() should be used for traversing the bios
-in the request list (drivers should avoid directly trying to do it
-themselves). Using these helpers should also make it easier to cope
-with block changes in the future.
-
-	struct req_iterator iter;
-	rq_for_each_segment(bio_vec, rq, iter)
-		/* bio_vec is now current segment */
-
-I/O completion callbacks are per-bio rather than per-segment, so drivers
-that traverse bio chains on completion need to keep that in mind. Drivers
-which don't make a distinction between segments and completion units would
-need to be reorganized to support multi-segment bios.
-
-3.2.2 Setting up DMA scatterlists
-
-The blk_rq_map_sg() helper routine would be used for setting up scatter
-gather lists from a request, so a driver need not do it on its own.
-
-	nr_segments = blk_rq_map_sg(q, rq, scatterlist);
-
-The helper routine provides a level of abstraction which makes it easier
-to modify the internals of request to scatterlist conversion down the line
-without breaking drivers. The blk_rq_map_sg routine takes care of several
-things like collapsing physically contiguous segments (if QUEUE_FLAG_CLUSTER
-is set) and correct segment accounting to avoid exceeding the limits which
-the i/o hardware can handle, based on various queue properties.
-
-- Prevents a clustered segment from crossing a 4GB mem boundary
-- Avoids building segments that would exceed the number of physical
-  memory segments that the driver can handle (phys_segments) and the
-  number that the underlying hardware can handle at once, accounting for
-  DMA remapping (hw_segments)  (i.e. IOMMU aware limits).
-
-Routines which the low level driver can use to set up the segment limits:
-
-blk_queue_max_hw_segments() : Sets an upper limit of the maximum number of
-hw data segments in a request (i.e. the maximum number of address/length
-pairs the host adapter can actually hand to the device at once)
-
-blk_queue_max_phys_segments() : Sets an upper limit on the maximum number
-of physical data segments in a request (i.e. the largest sized scatter list
-a driver could handle)
-
-3.2.3 I/O completion
-
-The existing generic block layer helper routines end_request,
-end_that_request_first and end_that_request_last can be used for i/o
-completion (and setting things up so the rest of the i/o or the next
-request can be kicked of) as before. With the introduction of multi-page
-bio support, end_that_request_first requires an additional argument indicating
-the number of sectors completed.
-
-3.2.4 Implications for drivers that do not interpret bios (don't handle
- multiple segments)
-
-Drivers that do not interpret bios e.g those which do not handle multiple
-segments and do not support i/o into high memory addresses (require bounce
-buffers) and expect only virtually mapped buffers, can access the rq->buffer
-field. As before the driver should use current_nr_sectors to determine the
-size of remaining data in the current segment (that is the maximum it can
-transfer in one go unless it interprets segments), and rely on the block layer
-end_request, or end_that_request_first/last to take care of all accounting
-and transparent mapping of the next bio segment when a segment boundary
-is crossed on completion of a transfer. (The end*request* functions should
-be used if only if the request has come down from block/bio path, not for
-direct access requests which only specify rq->buffer without a valid rq->bio)
-
-3.3 I/O Submission
-
-The routine submit_bio() is used to submit a single io. Higher level i/o
-routines make use of this:
-
-(a) Buffered i/o:
-The routine submit_bh() invokes submit_bio() on a bio corresponding to the
-bh, allocating the bio if required. ll_rw_block() uses submit_bh() as before.
-
-(b) Kiobuf i/o (for raw/direct i/o):
-The ll_rw_kio() routine breaks up the kiobuf into page sized chunks and
-maps the array to one or more multi-page bios, issuing submit_bio() to
-perform the i/o on each of these.
-
-The embedded bh array in the kiobuf structure has been removed and no
-preallocation of bios is done for kiobufs. [The intent is to remove the
-blocks array as well, but it's currently in there to kludge around direct i/o.]
-Thus kiobuf allocation has switched back to using kmalloc rather than vmalloc.
-
-Todo/Observation:
-
- A single kiobuf structure is assumed to correspond to a contiguous range
- of data, so brw_kiovec() invokes ll_rw_kio for each kiobuf in a kiovec.
- So right now it wouldn't work for direct i/o on non-contiguous blocks.
- This is to be resolved.  The eventual direction is to replace kiobuf
- by kvec's.
-
- Badari Pulavarty has a patch to implement direct i/o correctly using
- bio and kvec.
-
-
-(c) Page i/o:
-Todo/Under discussion:
-
- Andrew Morton's multi-page bio patches attempt to issue multi-page
- writeouts (and reads) from the page cache, by directly building up
- large bios for submission completely bypassing the usage of buffer
- heads. This work is still in progress.
-
- Christoph Hellwig had some code that uses bios for page-io (rather than
- bh). This isn't included in bio as yet. Christoph was also working on a
- design for representing virtual/real extents as an entity and modifying
- some of the address space ops interfaces to utilize this abstraction rather
- than buffer_heads. (This is somewhat along the lines of the SGI XFS pagebuf
- abstraction, but intended to be as lightweight as possible).
-
-(d) Direct access i/o:
-Direct access requests that do not contain bios would be submitted differently
-as discussed earlier in section 1.3.
-
-Aside:
-
-  Kvec i/o:
-
-  Ben LaHaise's aio code uses a slightly different structure instead
-  of kiobufs, called a kvec_cb. This contains an array of <page, offset, len>
-  tuples (very much like the networking code), together with a callback function
-  and data pointer. This is embedded into a brw_cb structure when passed
-  to brw_kvec_async().
-
-  Now it should be possible to directly map these kvecs to a bio. Just as while
-  cloning, in this case rather than PRE_BUILT bio_vecs, we set the bi_io_vec
-  array pointer to point to the veclet array in kvecs.
-
-  TBD: In order for this to work, some changes are needed in the way multi-page
-  bios are handled today. The values of the tuples in such a vector passed in
-  from higher level code should not be modified by the block layer in the course
-  of its request processing, since that would make it hard for the higher layer
-  to continue to use the vector descriptor (kvec) after i/o completes. Instead,
-  all such transient state should either be maintained in the request structure,
-  and passed on in some way to the endio completion routine.
-
-
-4. The I/O scheduler
-I/O scheduler, a.k.a. elevator, is implemented in two layers.  Generic dispatch
-queue and specific I/O schedulers.  Unless stated otherwise, elevator is used
-to refer to both parts and I/O scheduler to specific I/O schedulers.
-
-Block layer implements generic dispatch queue in block/*.c.
-The generic dispatch queue is responsible for requeueing, handling non-fs
-requests and all other subtleties.
-
-Specific I/O schedulers are responsible for ordering normal filesystem
-requests.  They can also choose to delay certain requests to improve
-throughput or whatever purpose.  As the plural form indicates, there are
-multiple I/O schedulers.  They can be built as modules but at least one should
-be built inside the kernel.  Each queue can choose different one and can also
-change to another one dynamically.
-
-A block layer call to the i/o scheduler follows the convention elv_xxx(). This
-calls elevator_xxx_fn in the elevator switch (block/elevator.c). Oh, xxx
-and xxx might not match exactly, but use your imagination. If an elevator
-doesn't implement a function, the switch does nothing or some minimal house
-keeping work.
-
-4.1. I/O scheduler API
-
-The functions an elevator may implement are: (* are mandatory)
-elevator_merge_fn		called to query requests for merge with a bio
-
-elevator_merge_req_fn		called when two requests get merged. the one
-				which gets merged into the other one will be
-				never seen by I/O scheduler again. IOW, after
-				being merged, the request is gone.
-
-elevator_merged_fn		called when a request in the scheduler has been
-				involved in a merge. It is used in the deadline
-				scheduler for example, to reposition the request
-				if its sorting order has changed.
-
-elevator_allow_merge_fn		called whenever the block layer determines
-				that a bio can be merged into an existing
-				request safely. The io scheduler may still
-				want to stop a merge at this point if it
-				results in some sort of conflict internally,
-				this hook allows it to do that. Note however
-				that two *requests* can still be merged at later
-				time. Currently the io scheduler has no way to
-				prevent that. It can only learn about the fact
-				from elevator_merge_req_fn callback.
-
-elevator_dispatch_fn*		fills the dispatch queue with ready requests.
-				I/O schedulers are free to postpone requests by
-				not filling the dispatch queue unless @force
-				is non-zero.  Once dispatched, I/O schedulers
-				are not allowed to manipulate the requests -
-				they belong to generic dispatch queue.
-
-elevator_add_req_fn*		called to add a new request into the scheduler
-
-elevator_former_req_fn
-elevator_latter_req_fn		These return the request before or after the
-				one specified in disk sort order. Used by the
-				block layer to find merge possibilities.
-
-elevator_completed_req_fn	called when a request is completed.
-
-elevator_may_queue_fn		returns true if the scheduler wants to allow the
-				current context to queue a new request even if
-				it is over the queue limit. This must be used
-				very carefully!!
-
-elevator_set_req_fn
-elevator_put_req_fn		Must be used to allocate and free any elevator
-				specific storage for a request.
-
-elevator_activate_req_fn	Called when device driver first sees a request.
-				I/O schedulers can use this callback to
-				determine when actual execution of a request
-				starts.
-elevator_deactivate_req_fn	Called when device driver decides to delay
-				a request by requeueing it.
-
-elevator_init_fn*
-elevator_exit_fn		Allocate and free any elevator specific storage
-				for a queue.
-
-4.2 Request flows seen by I/O schedulers
-All requests seen by I/O schedulers strictly follow one of the following three
-flows.
-
- set_req_fn ->
-
- i.   add_req_fn -> (merged_fn ->)* -> dispatch_fn -> activate_req_fn ->
-      (deactivate_req_fn -> activate_req_fn ->)* -> completed_req_fn
- ii.  add_req_fn -> (merged_fn ->)* -> merge_req_fn
- iii. [none]
-
- -> put_req_fn
-
-4.3 I/O scheduler implementation
-The generic i/o scheduler algorithm attempts to sort/merge/batch requests for
-optimal disk scan and request servicing performance (based on generic
-principles and device capabilities), optimized for:
-i.   improved throughput
-ii.  improved latency
-iii. better utilization of h/w & CPU time
-
-Characteristics:
-
-i. Binary tree
-AS and deadline i/o schedulers use red black binary trees for disk position
-sorting and searching, and a fifo linked list for time-based searching. This
-gives good scalability and good availability of information. Requests are
-almost always dispatched in disk sort order, so a cache is kept of the next
-request in sort order to prevent binary tree lookups.
-
-This arrangement is not a generic block layer characteristic however, so
-elevators may implement queues as they please.
-
-ii. Merge hash
-AS and deadline use a hash table indexed by the last sector of a request. This
-enables merging code to quickly look up "back merge" candidates, even when
-multiple I/O streams are being performed at once on one disk.
-
-"Front merges", a new request being merged at the front of an existing request,
-are far less common than "back merges" due to the nature of most I/O patterns.
-Front merges are handled by the binary trees in AS and deadline schedulers.
-
-iii. Plugging the queue to batch requests in anticipation of opportunities for
-     merge/sort optimizations
-
-Plugging is an approach that the current i/o scheduling algorithm resorts to so
-that it collects up enough requests in the queue to be able to take
-advantage of the sorting/merging logic in the elevator. If the
-queue is empty when a request comes in, then it plugs the request queue
-(sort of like plugging the bath tub of a vessel to get fluid to build up)
-till it fills up with a few more requests, before starting to service
-the requests. This provides an opportunity to merge/sort the requests before
-passing them down to the device. There are various conditions when the queue is
-unplugged (to open up the flow again), either through a scheduled task or
-could be on demand. For example wait_on_buffer sets the unplugging going
-through sync_buffer() running blk_run_address_space(mapping). Or the caller
-can do it explicity through blk_unplug(bdev). So in the read case,
-the queue gets explicitly unplugged as part of waiting for completion on that
-buffer.
-
-Aside:
-  This is kind of controversial territory, as it's not clear if plugging is
-  always the right thing to do. Devices typically have their own queues,
-  and allowing a big queue to build up in software, while letting the device be
-  idle for a while may not always make sense. The trick is to handle the fine
-  balance between when to plug and when to open up. Also now that we have
-  multi-page bios being queued in one shot, we may not need to wait to merge
-  a big request from the broken up pieces coming by.
-
-4.4 I/O contexts
-I/O contexts provide a dynamically allocated per process data area. They may
-be used in I/O schedulers, and in the block layer (could be used for IO statis,
-priorities for example). See *io_context in block/ll_rw_blk.c, and as-iosched.c
-for an example of usage in an i/o scheduler.
-
-
-5. Scalability related changes
-
-5.1 Granular Locking: io_request_lock replaced by a per-queue lock
-
-The global io_request_lock has been removed as of 2.5, to avoid
-the scalability bottleneck it was causing, and has been replaced by more
-granular locking. The request queue structure has a pointer to the
-lock to be used for that queue. As a result, locking can now be
-per-queue, with a provision for sharing a lock across queues if
-necessary (e.g the scsi layer sets the queue lock pointers to the
-corresponding adapter lock, which results in a per host locking
-granularity). The locking semantics are the same, i.e. locking is
-still imposed by the block layer, grabbing the lock before
-request_fn execution which it means that lots of older drivers
-should still be SMP safe. Drivers are free to drop the queue
-lock themselves, if required. Drivers that explicitly used the
-io_request_lock for serialization need to be modified accordingly.
-Usually it's as easy as adding a global lock:
-
-	static DEFINE_SPINLOCK(my_driver_lock);
-
-and passing the address to that lock to blk_init_queue().
-
-5.2 64 bit sector numbers (sector_t prepares for 64 bit support)
-
-The sector number used in the bio structure has been changed to sector_t,
-which could be defined as 64 bit in preparation for 64 bit sector support.
-
-6. Other Changes/Implications
-
-6.1 Partition re-mapping handled by the generic block layer
-
-In 2.5 some of the gendisk/partition related code has been reorganized.
-Now the generic block layer performs partition-remapping early and thus
-provides drivers with a sector number relative to whole device, rather than
-having to take partition number into account in order to arrive at the true
-sector number. The routine blk_partition_remap() is invoked by
-generic_make_request even before invoking the queue specific make_request_fn,
-so the i/o scheduler also gets to operate on whole disk sector numbers. This
-should typically not require changes to block drivers, it just never gets
-to invoke its own partition sector offset calculations since all bios
-sent are offset from the beginning of the device.
-
-
-7. A Few Tips on Migration of older drivers
-
-Old-style drivers that just use CURRENT and ignores clustered requests,
-may not need much change.  The generic layer will automatically handle
-clustered requests, multi-page bios, etc for the driver.
-
-For a low performance driver or hardware that is PIO driven or just doesn't
-support scatter-gather changes should be minimal too.
-
-The following are some points to keep in mind when converting old drivers
-to bio.
-
-Drivers should use elv_next_request to pick up requests and are no longer
-supposed to handle looping directly over the request list.
-(struct request->queue has been removed)
-
-Now end_that_request_first takes an additional number_of_sectors argument.
-It used to handle always just the first buffer_head in a request, now
-it will loop and handle as many sectors (on a bio-segment granularity)
-as specified.
-
-Now bh->b_end_io is replaced by bio->bi_end_io, but most of the time the
-right thing to use is bio_endio(bio) instead.
-
-If the driver is dropping the io_request_lock from its request_fn strategy,
-then it just needs to replace that with q->queue_lock instead.
-
-As described in Sec 1.1, drivers can set max sector size, max segment size
-etc per queue now. Drivers that used to define their own merge functions i
-to handle things like this can now just use the blk_queue_* functions at
-blk_init_queue time.
-
-Drivers no longer have to map a {partition, sector offset} into the
-correct absolute location anymore, this is done by the block layer, so
-where a driver received a request ala this before:
-
-	rq->rq_dev = mk_kdev(3, 5);	/* /dev/hda5 */
-	rq->sector = 0;			/* first sector on hda5 */
-
-  it will now see
-
-	rq->rq_dev = mk_kdev(3, 0);	/* /dev/hda */
-	rq->sector = 123128;		/* offset from start of disk */
-
-As mentioned, there is no virtual mapping of a bio. For DMA, this is
-not a problem as the driver probably never will need a virtual mapping.
-Instead it needs a bus mapping (dma_map_page for a single segment or
-use dma_map_sg for scatter gather) to be able to ship it to the driver. For
-PIO drivers (or drivers that need to revert to PIO transfer once in a
-while (IDE for example)), where the CPU is doing the actual data
-transfer a virtual mapping is needed. If the driver supports highmem I/O,
-(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map
-a bio into the virtual address space.
-
-
-8. Prior/Related/Impacted patches
-
-8.1. Earlier kiobuf patches (sct/axboe/chait/hch/mkp)
-- orig kiobuf & raw i/o patches (now in 2.4 tree)
-- direct kiobuf based i/o to devices (no intermediate bh's)
-- page i/o using kiobuf
-- kiobuf splitting for lvm (mkp)
-- elevator support for kiobuf request merging (axboe)
-8.2. Zero-copy networking (Dave Miller)
-8.3. SGI XFS - pagebuf patches - use of kiobufs
-8.4. Multi-page pioent patch for bio (Christoph Hellwig)
-8.5. Direct i/o implementation (Andrea Arcangeli) since 2.4.10-pre11
-8.6. Async i/o implementation patch (Ben LaHaise)
-8.7. EVMS layering design (IBM EVMS team)
-8.8. Larger page cache size patch (Ben LaHaise) and
-     Large page size (Daniel Phillips)
-    => larger contiguous physical memory buffers
-8.9. VM reservations patch (Ben LaHaise)
-8.10. Write clustering patches ? (Marcelo/Quintela/Riel ?)
-8.11. Block device in page cache patch (Andrea Archangeli) - now in 2.4.10+
-8.12. Multiple block-size transfers for faster raw i/o (Shailabh Nagar,
-      Badari)
-8.13  Priority based i/o scheduler - prepatches (Arjan van de Ven)
-8.14  IDE Taskfile i/o patch (Andre Hedrick)
-8.15  Multi-page writeout and readahead patches (Andrew Morton)
-8.16  Direct i/o patches for 2.5 using kvec and bio (Badari Pulavarthy)
-
-9. Other References:
-
-9.1 The Splice I/O Model - Larry McVoy (and subsequent discussions on lkml,
-and Linus' comments - Jan 2001)
-9.2 Discussions about kiobuf and bh design on lkml between sct, linus, alan
-et al - Feb-March 2001 (many of the initial thoughts that led to bio were
-brought up in this discussion thread)
-9.3 Discussions on mempool on lkml - Dec 2001.
-
diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst
new file mode 100644
index 000000000000..86fa66c87172
--- /dev/null
+++ b/Documentation/block/biovecs.rst
@@ -0,0 +1,146 @@
+======================================
+Immutable biovecs and biovec iterators
+======================================
+
+Kent Overstreet <kmo@daterainc.com>
+
+As of 3.13, biovecs should never be modified after a bio has been submitted.
+Instead, we have a new struct bvec_iter which represents a range of a biovec -
+the iterator will be modified as the bio is completed, not the biovec.
+
+More specifically, old code that needed to partially complete a bio would
+update bi_sector and bi_size, and advance bi_idx to the next biovec. If it
+ended up partway through a biovec, it would increment bv_offset and decrement
+bv_len by the number of bytes completed in that biovec.
+
+In the new scheme of things, everything that must be mutated in order to
+partially complete a bio is segregated into struct bvec_iter: bi_sector,
+bi_size and bi_idx have been moved there; and instead of modifying bv_offset
+and bv_len, struct bvec_iter has bi_bvec_done, which represents the number of
+bytes completed in the current bvec.
+
+There are a bunch of new helper macros for hiding the gory details - in
+particular, presenting the illusion of partially completed biovecs so that
+normal code doesn't have to deal with bi_bvec_done.
+
+ * Driver code should no longer refer to biovecs directly; we now have
+   bio_iovec() and bio_iter_iovec() macros that return literal struct biovecs,
+   constructed from the raw biovecs but taking into account bi_bvec_done and
+   bi_size.
+
+   bio_for_each_segment() has been updated to take a bvec_iter argument
+   instead of an integer (that corresponded to bi_idx); for a lot of code the
+   conversion just required changing the types of the arguments to
+   bio_for_each_segment().
+
+ * Advancing a bvec_iter is done with bio_advance_iter(); bio_advance() is a
+   wrapper around bio_advance_iter() that operates on bio->bi_iter, and also
+   advances the bio integrity's iter if present.
+
+   There is a lower level advance function - bvec_iter_advance() - which takes
+   a pointer to a biovec, not a bio; this is used by the bio integrity code.
+
+What's all this get us?
+=======================
+
+Having a real iterator, and making biovecs immutable, has a number of
+advantages:
+
+ * Before, iterating over bios was very awkward when you weren't processing
+   exactly one bvec at a time - for example, bio_copy_data() in fs/bio.c,
+   which copies the contents of one bio into another. Because the biovecs
+   wouldn't necessarily be the same size, the old code was tricky convoluted -
+   it had to walk two different bios at the same time, keeping both bi_idx and
+   and offset into the current biovec for each.
+
+   The new code is much more straightforward - have a look. This sort of
+   pattern comes up in a lot of places; a lot of drivers were essentially open
+   coding bvec iterators before, and having common implementation considerably
+   simplifies a lot of code.
+
+ * Before, any code that might need to use the biovec after the bio had been
+   completed (perhaps to copy the data somewhere else, or perhaps to resubmit
+   it somewhere else if there was an error) had to save the entire bvec array
+   - again, this was being done in a fair number of places.
+
+ * Biovecs can be shared between multiple bios - a bvec iter can represent an
+   arbitrary range of an existing biovec, both starting and ending midway
+   through biovecs. This is what enables efficient splitting of arbitrary
+   bios. Note that this means we _only_ use bi_size to determine when we've
+   reached the end of a bio, not bi_vcnt - and the bio_iovec() macro takes
+   bi_size into account when constructing biovecs.
+
+ * Splitting bios is now much simpler. The old bio_split() didn't even work on
+   bios with more than a single bvec! Now, we can efficiently split arbitrary
+   size bios - because the new bio can share the old bio's biovec.
+
+   Care must be taken to ensure the biovec isn't freed while the split bio is
+   still using it, in case the original bio completes first, though. Using
+   bio_chain() when splitting bios helps with this.
+
+ * Submitting partially completed bios is now perfectly fine - this comes up
+   occasionally in stacking block drivers and various code (e.g. md and
+   bcache) had some ugly workarounds for this.
+
+   It used to be the case that submitting a partially completed bio would work
+   fine to _most_ devices, but since accessing the raw bvec array was the
+   norm, not all drivers would respect bi_idx and those would break. Now,
+   since all drivers _must_ go through the bvec iterator - and have been
+   audited to make sure they are - submitting partially completed bios is
+   perfectly fine.
+
+Other implications:
+===================
+
+ * Almost all usage of bi_idx is now incorrect and has been removed; instead,
+   where previously you would have used bi_idx you'd now use a bvec_iter,
+   probably passing it to one of the helper macros.
+
+   I.e. instead of using bio_iovec_idx() (or bio->bi_iovec[bio->bi_idx]), you
+   now use bio_iter_iovec(), which takes a bvec_iter and returns a
+   literal struct bio_vec - constructed on the fly from the raw biovec but
+   taking into account bi_bvec_done (and bi_size).
+
+ * bi_vcnt can't be trusted or relied upon by driver code - i.e. anything that
+   doesn't actually own the bio. The reason is twofold: firstly, it's not
+   actually needed for iterating over the bio anymore - we only use bi_size.
+   Secondly, when cloning a bio and reusing (a portion of) the original bio's
+   biovec, in order to calculate bi_vcnt for the new bio we'd have to iterate
+   over all the biovecs in the new bio - which is silly as it's not needed.
+
+   So, don't use bi_vcnt anymore.
+
+ * The current interface allows the block layer to split bios as needed, so we
+   could eliminate a lot of complexity particularly in stacked drivers. Code
+   that creates bios can then create whatever size bios are convenient, and
+   more importantly stacked drivers don't have to deal with both their own bio
+   size limitations and the limitations of the underlying devices. Thus
+   there's no need to define ->merge_bvec_fn() callbacks for individual block
+   drivers.
+
+Usage of helpers:
+=================
+
+* The following helpers whose names have the suffix of `_all` can only be used
+  on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
+  shouldn't use them because the bio may have been split before it reached the
+  driver.
+
+::
+
+	bio_for_each_segment_all()
+	bio_first_bvec_all()
+	bio_first_page_all()
+	bio_last_bvec_all()
+
+* The following helpers iterate over single-page segment. The passed 'struct
+  bio_vec' will contain a single-page IO vector during the iteration::
+
+	bio_for_each_segment()
+	bio_for_each_segment_all()
+
+* The following helpers iterate over multi-page bvec. The passed 'struct
+  bio_vec' will contain a multi-page IO vector during the iteration::
+
+	bio_for_each_bvec()
+	rq_for_each_bvec()
diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.txt
deleted file mode 100644
index ce6eccaf5df7..000000000000
--- a/Documentation/block/biovecs.txt
+++ /dev/null
@@ -1,144 +0,0 @@
-
-Immutable biovecs and biovec iterators:
-=======================================
-
-Kent Overstreet <kmo@daterainc.com>
-
-As of 3.13, biovecs should never be modified after a bio has been submitted.
-Instead, we have a new struct bvec_iter which represents a range of a biovec -
-the iterator will be modified as the bio is completed, not the biovec.
-
-More specifically, old code that needed to partially complete a bio would
-update bi_sector and bi_size, and advance bi_idx to the next biovec. If it
-ended up partway through a biovec, it would increment bv_offset and decrement
-bv_len by the number of bytes completed in that biovec.
-
-In the new scheme of things, everything that must be mutated in order to
-partially complete a bio is segregated into struct bvec_iter: bi_sector,
-bi_size and bi_idx have been moved there; and instead of modifying bv_offset
-and bv_len, struct bvec_iter has bi_bvec_done, which represents the number of
-bytes completed in the current bvec.
-
-There are a bunch of new helper macros for hiding the gory details - in
-particular, presenting the illusion of partially completed biovecs so that
-normal code doesn't have to deal with bi_bvec_done.
-
- * Driver code should no longer refer to biovecs directly; we now have
-   bio_iovec() and bio_iter_iovec() macros that return literal struct biovecs,
-   constructed from the raw biovecs but taking into account bi_bvec_done and
-   bi_size.
-
-   bio_for_each_segment() has been updated to take a bvec_iter argument
-   instead of an integer (that corresponded to bi_idx); for a lot of code the
-   conversion just required changing the types of the arguments to
-   bio_for_each_segment().
-
- * Advancing a bvec_iter is done with bio_advance_iter(); bio_advance() is a
-   wrapper around bio_advance_iter() that operates on bio->bi_iter, and also
-   advances the bio integrity's iter if present.
-
-   There is a lower level advance function - bvec_iter_advance() - which takes
-   a pointer to a biovec, not a bio; this is used by the bio integrity code.
-
-What's all this get us?
-=======================
-
-Having a real iterator, and making biovecs immutable, has a number of
-advantages:
-
- * Before, iterating over bios was very awkward when you weren't processing
-   exactly one bvec at a time - for example, bio_copy_data() in fs/bio.c,
-   which copies the contents of one bio into another. Because the biovecs
-   wouldn't necessarily be the same size, the old code was tricky convoluted -
-   it had to walk two different bios at the same time, keeping both bi_idx and
-   and offset into the current biovec for each.
-
-   The new code is much more straightforward - have a look. This sort of
-   pattern comes up in a lot of places; a lot of drivers were essentially open
-   coding bvec iterators before, and having common implementation considerably
-   simplifies a lot of code.
-
- * Before, any code that might need to use the biovec after the bio had been
-   completed (perhaps to copy the data somewhere else, or perhaps to resubmit
-   it somewhere else if there was an error) had to save the entire bvec array
-   - again, this was being done in a fair number of places.
-
- * Biovecs can be shared between multiple bios - a bvec iter can represent an
-   arbitrary range of an existing biovec, both starting and ending midway
-   through biovecs. This is what enables efficient splitting of arbitrary
-   bios. Note that this means we _only_ use bi_size to determine when we've
-   reached the end of a bio, not bi_vcnt - and the bio_iovec() macro takes
-   bi_size into account when constructing biovecs.
-
- * Splitting bios is now much simpler. The old bio_split() didn't even work on
-   bios with more than a single bvec! Now, we can efficiently split arbitrary
-   size bios - because the new bio can share the old bio's biovec.
-
-   Care must be taken to ensure the biovec isn't freed while the split bio is
-   still using it, in case the original bio completes first, though. Using
-   bio_chain() when splitting bios helps with this.
-
- * Submitting partially completed bios is now perfectly fine - this comes up
-   occasionally in stacking block drivers and various code (e.g. md and
-   bcache) had some ugly workarounds for this.
-
-   It used to be the case that submitting a partially completed bio would work
-   fine to _most_ devices, but since accessing the raw bvec array was the
-   norm, not all drivers would respect bi_idx and those would break. Now,
-   since all drivers _must_ go through the bvec iterator - and have been
-   audited to make sure they are - submitting partially completed bios is
-   perfectly fine.
-
-Other implications:
-===================
-
- * Almost all usage of bi_idx is now incorrect and has been removed; instead,
-   where previously you would have used bi_idx you'd now use a bvec_iter,
-   probably passing it to one of the helper macros.
-
-   I.e. instead of using bio_iovec_idx() (or bio->bi_iovec[bio->bi_idx]), you
-   now use bio_iter_iovec(), which takes a bvec_iter and returns a
-   literal struct bio_vec - constructed on the fly from the raw biovec but
-   taking into account bi_bvec_done (and bi_size).
-
- * bi_vcnt can't be trusted or relied upon by driver code - i.e. anything that
-   doesn't actually own the bio. The reason is twofold: firstly, it's not
-   actually needed for iterating over the bio anymore - we only use bi_size.
-   Secondly, when cloning a bio and reusing (a portion of) the original bio's
-   biovec, in order to calculate bi_vcnt for the new bio we'd have to iterate
-   over all the biovecs in the new bio - which is silly as it's not needed.
-
-   So, don't use bi_vcnt anymore.
-
- * The current interface allows the block layer to split bios as needed, so we
-   could eliminate a lot of complexity particularly in stacked drivers. Code
-   that creates bios can then create whatever size bios are convenient, and
-   more importantly stacked drivers don't have to deal with both their own bio
-   size limitations and the limitations of the underlying devices. Thus
-   there's no need to define ->merge_bvec_fn() callbacks for individual block
-   drivers.
-
-Usage of helpers:
-=================
-
-* The following helpers whose names have the suffix of "_all" can only be used
-on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
-shouldn't use them because the bio may have been split before it reached the
-driver.
-
-	bio_for_each_segment_all()
-	bio_first_bvec_all()
-	bio_first_page_all()
-	bio_last_bvec_all()
-
-* The following helpers iterate over single-page segment. The passed 'struct
-bio_vec' will contain a single-page IO vector during the iteration
-
-	bio_for_each_segment()
-	bio_for_each_segment_all()
-
-* The following helpers iterate over multi-page bvec. The passed 'struct
-bio_vec' will contain a multi-page IO vector during the iteration
-
-	bio_for_each_bvec()
-	rq_for_each_bvec()
diff --git a/Documentation/block/capability.rst b/Documentation/block/capability.rst
new file mode 100644
index 000000000000..2cf258d64bbe
--- /dev/null
+++ b/Documentation/block/capability.rst
@@ -0,0 +1,18 @@
+===============================
+Generic Block Device Capability
+===============================
+
+This file documents the sysfs file block/<disk>/capability
+
+capability is a hex word indicating which capabilities a specific disk
+supports.  For more information on bits not listed here, see
+include/linux/genhd.h
+
+GENHD_FL_MEDIA_CHANGE_NOTIFY
+----------------------------
+
+Value: 4
+
+When this bit is set, the disk supports Asynchronous Notification
+of media change events.  These events will be broadcast to user
+space via kernel uevent.
diff --git a/Documentation/block/capability.txt b/Documentation/block/capability.txt
deleted file mode 100644
index 2f1729424ef4..000000000000
--- a/Documentation/block/capability.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-Generic Block Device Capability
-===============================================================================
-This file documents the sysfs file block/<disk>/capability
-
-capability is a hex word indicating which capabilities a specific disk
-supports.  For more information on bits not listed here, see
-include/linux/genhd.h
-
-Capability				Value
--------------------------------------------------------------------------------
-GENHD_FL_MEDIA_CHANGE_NOTIFY		4
-	When this bit is set, the disk supports Asynchronous Notification
-	of media change events.  These events will be broadcast to user
-	space via kernel uevent.
-
diff --git a/Documentation/block/cmdline-partition.rst b/Documentation/block/cmdline-partition.rst
new file mode 100644
index 000000000000..530bedff548a
--- /dev/null
+++ b/Documentation/block/cmdline-partition.rst
@@ -0,0 +1,53 @@
+==============================================
+Embedded device command line partition parsing
+==============================================
+
+The "blkdevparts" command line option adds support for reading the
+block device partition table from the kernel command line.
+
+It is typically used for fixed block (eMMC) embedded devices.
+It has no MBR, so saves storage space. Bootloader can be easily accessed
+by absolute address of data on the block device.
+Users can easily change the partition.
+
+The format for the command line is just like mtdparts:
+
+blkdevparts=<blkdev-def>[;<blkdev-def>]
+  <blkdev-def> := <blkdev-id>:<partdef>[,<partdef>]
+    <partdef> := <size>[@<offset>](part-name)
+
+<blkdev-id>
+    block device disk name. Embedded device uses fixed block device.
+    Its disk name is also fixed, such as: mmcblk0, mmcblk1, mmcblk0boot0.
+
+<size>
+    partition size, in bytes, such as: 512, 1m, 1G.
+    size may contain an optional suffix of (upper or lower case):
+
+      K, M, G, T, P, E.
+
+    "-" is used to denote all remaining space.
+
+<offset>
+    partition start address, in bytes.
+    offset may contain an optional suffix of (upper or lower case):
+
+      K, M, G, T, P, E.
+
+(part-name)
+    partition name. Kernel sends uevent with "PARTNAME". Application can
+    create a link to block device partition with the name "PARTNAME".
+    User space application can access partition by partition name.
+
+Example:
+
+    eMMC disk names are "mmcblk0" and "mmcblk0boot0".
+
+  bootargs::
+
+    'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)'
+
+  dmesg::
+
+    mmcblk0: p1(data0) p2(data1) p3()
+    mmcblk0boot0: p1(boot) p2(kernel)
diff --git a/Documentation/block/cmdline-partition.txt b/Documentation/block/cmdline-partition.txt
deleted file mode 100644
index 760a3f7c3ed4..000000000000
--- a/Documentation/block/cmdline-partition.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-Embedded device command line partition parsing
-=====================================================================
-
-The "blkdevparts" command line option adds support for reading the
-block device partition table from the kernel command line.
-
-It is typically used for fixed block (eMMC) embedded devices.
-It has no MBR, so saves storage space. Bootloader can be easily accessed
-by absolute address of data on the block device.
-Users can easily change the partition.
-
-The format for the command line is just like mtdparts:
-
-blkdevparts=<blkdev-def>[;<blkdev-def>]
-  <blkdev-def> := <blkdev-id>:<partdef>[,<partdef>]
-    <partdef> := <size>[@<offset>](part-name)
-
-<blkdev-id>
-    block device disk name. Embedded device uses fixed block device.
-    Its disk name is also fixed, such as: mmcblk0, mmcblk1, mmcblk0boot0.
-
-<size>
-    partition size, in bytes, such as: 512, 1m, 1G.
-    size may contain an optional suffix of (upper or lower case):
-      K, M, G, T, P, E.
-    "-" is used to denote all remaining space.
-
-<offset>
-    partition start address, in bytes.
-    offset may contain an optional suffix of (upper or lower case):
-      K, M, G, T, P, E.
-
-(part-name)
-    partition name. Kernel sends uevent with "PARTNAME". Application can
-    create a link to block device partition with the name "PARTNAME".
-    User space application can access partition by partition name.
-
-Example:
-    eMMC disk names are "mmcblk0" and "mmcblk0boot0".
-
-  bootargs:
-    'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)'
-
-  dmesg:
-    mmcblk0: p1(data0) p2(data1) p3()
-    mmcblk0boot0: p1(boot) p2(kernel)
diff --git a/Documentation/block/data-integrity.rst b/Documentation/block/data-integrity.rst
new file mode 100644
index 000000000000..4f2452a95c43
--- /dev/null
+++ b/Documentation/block/data-integrity.rst
@@ -0,0 +1,291 @@
+==============
+Data Integrity
+==============
+
+1. Introduction
+===============
+
+Modern filesystems feature checksumming of data and metadata to
+protect against data corruption.  However, the detection of the
+corruption is done at read time which could potentially be months
+after the data was written.  At that point the original data that the
+application tried to write is most likely lost.
+
+The solution is to ensure that the disk is actually storing what the
+application meant it to.  Recent additions to both the SCSI family
+protocols (SBC Data Integrity Field, SCC protection proposal) as well
+as SATA/T13 (External Path Protection) try to remedy this by adding
+support for appending integrity metadata to an I/O.  The integrity
+metadata (or protection information in SCSI terminology) includes a
+checksum for each sector as well as an incrementing counter that
+ensures the individual sectors are written in the right order.  And
+for some protection schemes also that the I/O is written to the right
+place on disk.
+
+Current storage controllers and devices implement various protective
+measures, for instance checksumming and scrubbing.  But these
+technologies are working in their own isolated domains or at best
+between adjacent nodes in the I/O path.  The interesting thing about
+DIF and the other integrity extensions is that the protection format
+is well defined and every node in the I/O path can verify the
+integrity of the I/O and reject it if corruption is detected.  This
+allows not only corruption prevention but also isolation of the point
+of failure.
+
+2. The Data Integrity Extensions
+================================
+
+As written, the protocol extensions only protect the path between
+controller and storage device.  However, many controllers actually
+allow the operating system to interact with the integrity metadata
+(IMD).  We have been working with several FC/SAS HBA vendors to enable
+the protection information to be transferred to and from their
+controllers.
+
+The SCSI Data Integrity Field works by appending 8 bytes of protection
+information to each sector.  The data + integrity metadata is stored
+in 520 byte sectors on disk.  Data + IMD are interleaved when
+transferred between the controller and target.  The T13 proposal is
+similar.
+
+Because it is highly inconvenient for operating systems to deal with
+520 (and 4104) byte sectors, we approached several HBA vendors and
+encouraged them to allow separation of the data and integrity metadata
+scatter-gather lists.
+
+The controller will interleave the buffers on write and split them on
+read.  This means that Linux can DMA the data buffers to and from
+host memory without changes to the page cache.
+
+Also, the 16-bit CRC checksum mandated by both the SCSI and SATA specs
+is somewhat heavy to compute in software.  Benchmarks found that
+calculating this checksum had a significant impact on system
+performance for a number of workloads.  Some controllers allow a
+lighter-weight checksum to be used when interfacing with the operating
+system.  Emulex, for instance, supports the TCP/IP checksum instead.
+The IP checksum received from the OS is converted to the 16-bit CRC
+when writing and vice versa.  This allows the integrity metadata to be
+generated by Linux or the application at very low cost (comparable to
+software RAID5).
+
+The IP checksum is weaker than the CRC in terms of detecting bit
+errors.  However, the strength is really in the separation of the data
+buffers and the integrity metadata.  These two distinct buffers must
+match up for an I/O to complete.
+
+The separation of the data and integrity metadata buffers as well as
+the choice in checksums is referred to as the Data Integrity
+Extensions.  As these extensions are outside the scope of the protocol
+bodies (T10, T13), Oracle and its partners are trying to standardize
+them within the Storage Networking Industry Association.
+
+3. Kernel Changes
+=================
+
+The data integrity framework in Linux enables protection information
+to be pinned to I/Os and sent to/received from controllers that
+support it.
+
+The advantage to the integrity extensions in SCSI and SATA is that
+they enable us to protect the entire path from application to storage
+device.  However, at the same time this is also the biggest
+disadvantage. It means that the protection information must be in a
+format that can be understood by the disk.
+
+Generally Linux/POSIX applications are agnostic to the intricacies of
+the storage devices they are accessing.  The virtual filesystem switch
+and the block layer make things like hardware sector size and
+transport protocols completely transparent to the application.
+
+However, this level of detail is required when preparing the
+protection information to send to a disk.  Consequently, the very
+concept of an end-to-end protection scheme is a layering violation.
+It is completely unreasonable for an application to be aware whether
+it is accessing a SCSI or SATA disk.
+
+The data integrity support implemented in Linux attempts to hide this
+from the application.  As far as the application (and to some extent
+the kernel) is concerned, the integrity metadata is opaque information
+that's attached to the I/O.
+
+The current implementation allows the block layer to automatically
+generate the protection information for any I/O.  Eventually the
+intent is to move the integrity metadata calculation to userspace for
+user data.  Metadata and other I/O that originates within the kernel
+will still use the automatic generation interface.
+
+Some storage devices allow each hardware sector to be tagged with a
+16-bit value.  The owner of this tag space is the owner of the block
+device.  I.e. the filesystem in most cases.  The filesystem can use
+this extra space to tag sectors as they see fit.  Because the tag
+space is limited, the block interface allows tagging bigger chunks by
+way of interleaving.  This way, 8*16 bits of information can be
+attached to a typical 4KB filesystem block.
+
+This also means that applications such as fsck and mkfs will need
+access to manipulate the tags from user space.  A passthrough
+interface for this is being worked on.
+
+
+4. Block Layer Implementation Details
+=====================================
+
+4.1 Bio
+-------
+
+The data integrity patches add a new field to struct bio when
+CONFIG_BLK_DEV_INTEGRITY is enabled.  bio_integrity(bio) returns a
+pointer to a struct bip which contains the bio integrity payload.
+Essentially a bip is a trimmed down struct bio which holds a bio_vec
+containing the integrity metadata and the required housekeeping
+information (bvec pool, vector count, etc.)
+
+A kernel subsystem can enable data integrity protection on a bio by
+calling bio_integrity_alloc(bio).  This will allocate and attach the
+bip to the bio.
+
+Individual pages containing integrity metadata can subsequently be
+attached using bio_integrity_add_page().
+
+bio_free() will automatically free the bip.
+
+
+4.2 Block Device
+----------------
+
+Because the format of the protection data is tied to the physical
+disk, each block device has been extended with a block integrity
+profile (struct blk_integrity).  This optional profile is registered
+with the block layer using blk_integrity_register().
+
+The profile contains callback functions for generating and verifying
+the protection data, as well as getting and setting application tags.
+The profile also contains a few constants to aid in completing,
+merging and splitting the integrity metadata.
+
+Layered block devices will need to pick a profile that's appropriate
+for all subdevices.  blk_integrity_compare() can help with that.  DM
+and MD linear, RAID0 and RAID1 are currently supported.  RAID4/5/6
+will require extra work due to the application tag.
+
+
+5.0 Block Layer Integrity API
+=============================
+
+5.1 Normal Filesystem
+---------------------
+
+    The normal filesystem is unaware that the underlying block device
+    is capable of sending/receiving integrity metadata.  The IMD will
+    be automatically generated by the block layer at submit_bio() time
+    in case of a WRITE.  A READ request will cause the I/O integrity
+    to be verified upon completion.
+
+    IMD generation and verification can be toggled using the::
+
+      /sys/block/<bdev>/integrity/write_generate
+
+    and::
+
+      /sys/block/<bdev>/integrity/read_verify
+
+    flags.
+
+
+5.2 Integrity-Aware Filesystem
+------------------------------
+
+    A filesystem that is integrity-aware can prepare I/Os with IMD
+    attached.  It can also use the application tag space if this is
+    supported by the block device.
+
+
+    `bool bio_integrity_prep(bio);`
+
+      To generate IMD for WRITE and to set up buffers for READ, the
+      filesystem must call bio_integrity_prep(bio).
+
+      Prior to calling this function, the bio data direction and start
+      sector must be set, and the bio should have all data pages
+      added.  It is up to the caller to ensure that the bio does not
+      change while I/O is in progress.
+      Complete bio with error if prepare failed for some reson.
+
+
+5.3 Passing Existing Integrity Metadata
+---------------------------------------
+
+    Filesystems that either generate their own integrity metadata or
+    are capable of transferring IMD from user space can use the
+    following calls:
+
+
+    `struct bip * bio_integrity_alloc(bio, gfp_mask, nr_pages);`
+
+      Allocates the bio integrity payload and hangs it off of the bio.
+      nr_pages indicate how many pages of protection data need to be
+      stored in the integrity bio_vec list (similar to bio_alloc()).
+
+      The integrity payload will be freed at bio_free() time.
+
+
+    `int bio_integrity_add_page(bio, page, len, offset);`
+
+      Attaches a page containing integrity metadata to an existing
+      bio.  The bio must have an existing bip,
+      i.e. bio_integrity_alloc() must have been called.  For a WRITE,
+      the integrity metadata in the pages must be in a format
+      understood by the target device with the notable exception that
+      the sector numbers will be remapped as the request traverses the
+      I/O stack.  This implies that the pages added using this call
+      will be modified during I/O!  The first reference tag in the
+      integrity metadata must have a value of bip->bip_sector.
+
+      Pages can be added using bio_integrity_add_page() as long as
+      there is room in the bip bio_vec array (nr_pages).
+
+      Upon completion of a READ operation, the attached pages will
+      contain the integrity metadata received from the storage device.
+      It is up to the receiver to process them and verify data
+      integrity upon completion.
+
+
+5.4 Registering A Block Device As Capable Of Exchanging Integrity Metadata
+--------------------------------------------------------------------------
+
+    To enable integrity exchange on a block device the gendisk must be
+    registered as capable:
+
+    `int blk_integrity_register(gendisk, blk_integrity);`
+
+      The blk_integrity struct is a template and should contain the
+      following::
+
+        static struct blk_integrity my_profile = {
+            .name                   = "STANDARDSBODY-TYPE-VARIANT-CSUM",
+            .generate_fn            = my_generate_fn,
+	    .verify_fn              = my_verify_fn,
+	    .tuple_size             = sizeof(struct my_tuple_size),
+	    .tag_size               = <tag bytes per hw sector>,
+        };
+
+      'name' is a text string which will be visible in sysfs.  This is
+      part of the userland API so chose it carefully and never change
+      it.  The format is standards body-type-variant.
+      E.g. T10-DIF-TYPE1-IP or T13-EPP-0-CRC.
+
+      'generate_fn' generates appropriate integrity metadata (for WRITE).
+
+      'verify_fn' verifies that the data buffer matches the integrity
+      metadata.
+
+      'tuple_size' must be set to match the size of the integrity
+      metadata per sector.  I.e. 8 for DIF and EPP.
+
+      'tag_size' must be set to identify how many bytes of tag space
+      are available per hardware sector.  For DIF this is either 2 or
+      0 depending on the value of the Control Mode Page ATO bit.
+
+----------------------------------------------------------------------
+
+2007-12-24 Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.txt
deleted file mode 100644
index 934c44ea0c57..000000000000
--- a/Documentation/block/data-integrity.txt
+++ /dev/null
@@ -1,281 +0,0 @@
-----------------------------------------------------------------------
-1. INTRODUCTION
-
-Modern filesystems feature checksumming of data and metadata to
-protect against data corruption.  However, the detection of the
-corruption is done at read time which could potentially be months
-after the data was written.  At that point the original data that the
-application tried to write is most likely lost.
-
-The solution is to ensure that the disk is actually storing what the
-application meant it to.  Recent additions to both the SCSI family
-protocols (SBC Data Integrity Field, SCC protection proposal) as well
-as SATA/T13 (External Path Protection) try to remedy this by adding
-support for appending integrity metadata to an I/O.  The integrity
-metadata (or protection information in SCSI terminology) includes a
-checksum for each sector as well as an incrementing counter that
-ensures the individual sectors are written in the right order.  And
-for some protection schemes also that the I/O is written to the right
-place on disk.
-
-Current storage controllers and devices implement various protective
-measures, for instance checksumming and scrubbing.  But these
-technologies are working in their own isolated domains or at best
-between adjacent nodes in the I/O path.  The interesting thing about
-DIF and the other integrity extensions is that the protection format
-is well defined and every node in the I/O path can verify the
-integrity of the I/O and reject it if corruption is detected.  This
-allows not only corruption prevention but also isolation of the point
-of failure.
-
-----------------------------------------------------------------------
-2. THE DATA INTEGRITY EXTENSIONS
-
-As written, the protocol extensions only protect the path between
-controller and storage device.  However, many controllers actually
-allow the operating system to interact with the integrity metadata
-(IMD).  We have been working with several FC/SAS HBA vendors to enable
-the protection information to be transferred to and from their
-controllers.
-
-The SCSI Data Integrity Field works by appending 8 bytes of protection
-information to each sector.  The data + integrity metadata is stored
-in 520 byte sectors on disk.  Data + IMD are interleaved when
-transferred between the controller and target.  The T13 proposal is
-similar.
-
-Because it is highly inconvenient for operating systems to deal with
-520 (and 4104) byte sectors, we approached several HBA vendors and
-encouraged them to allow separation of the data and integrity metadata
-scatter-gather lists.
-
-The controller will interleave the buffers on write and split them on
-read.  This means that Linux can DMA the data buffers to and from
-host memory without changes to the page cache.
-
-Also, the 16-bit CRC checksum mandated by both the SCSI and SATA specs
-is somewhat heavy to compute in software.  Benchmarks found that
-calculating this checksum had a significant impact on system
-performance for a number of workloads.  Some controllers allow a
-lighter-weight checksum to be used when interfacing with the operating
-system.  Emulex, for instance, supports the TCP/IP checksum instead.
-The IP checksum received from the OS is converted to the 16-bit CRC
-when writing and vice versa.  This allows the integrity metadata to be
-generated by Linux or the application at very low cost (comparable to
-software RAID5).
-
-The IP checksum is weaker than the CRC in terms of detecting bit
-errors.  However, the strength is really in the separation of the data
-buffers and the integrity metadata.  These two distinct buffers must
-match up for an I/O to complete.
-
-The separation of the data and integrity metadata buffers as well as
-the choice in checksums is referred to as the Data Integrity
-Extensions.  As these extensions are outside the scope of the protocol
-bodies (T10, T13), Oracle and its partners are trying to standardize
-them within the Storage Networking Industry Association.
-
-----------------------------------------------------------------------
-3. KERNEL CHANGES
-
-The data integrity framework in Linux enables protection information
-to be pinned to I/Os and sent to/received from controllers that
-support it.
-
-The advantage to the integrity extensions in SCSI and SATA is that
-they enable us to protect the entire path from application to storage
-device.  However, at the same time this is also the biggest
-disadvantage. It means that the protection information must be in a
-format that can be understood by the disk.
-
-Generally Linux/POSIX applications are agnostic to the intricacies of
-the storage devices they are accessing.  The virtual filesystem switch
-and the block layer make things like hardware sector size and
-transport protocols completely transparent to the application.
-
-However, this level of detail is required when preparing the
-protection information to send to a disk.  Consequently, the very
-concept of an end-to-end protection scheme is a layering violation.
-It is completely unreasonable for an application to be aware whether
-it is accessing a SCSI or SATA disk.
-
-The data integrity support implemented in Linux attempts to hide this
-from the application.  As far as the application (and to some extent
-the kernel) is concerned, the integrity metadata is opaque information
-that's attached to the I/O.
-
-The current implementation allows the block layer to automatically
-generate the protection information for any I/O.  Eventually the
-intent is to move the integrity metadata calculation to userspace for
-user data.  Metadata and other I/O that originates within the kernel
-will still use the automatic generation interface.
-
-Some storage devices allow each hardware sector to be tagged with a
-16-bit value.  The owner of this tag space is the owner of the block
-device.  I.e. the filesystem in most cases.  The filesystem can use
-this extra space to tag sectors as they see fit.  Because the tag
-space is limited, the block interface allows tagging bigger chunks by
-way of interleaving.  This way, 8*16 bits of information can be
-attached to a typical 4KB filesystem block.
-
-This also means that applications such as fsck and mkfs will need
-access to manipulate the tags from user space.  A passthrough
-interface for this is being worked on.
-
-
-----------------------------------------------------------------------
-4. BLOCK LAYER IMPLEMENTATION DETAILS
-
-4.1 BIO
-
-The data integrity patches add a new field to struct bio when
-CONFIG_BLK_DEV_INTEGRITY is enabled.  bio_integrity(bio) returns a
-pointer to a struct bip which contains the bio integrity payload.
-Essentially a bip is a trimmed down struct bio which holds a bio_vec
-containing the integrity metadata and the required housekeeping
-information (bvec pool, vector count, etc.)
-
-A kernel subsystem can enable data integrity protection on a bio by
-calling bio_integrity_alloc(bio).  This will allocate and attach the
-bip to the bio.
-
-Individual pages containing integrity metadata can subsequently be
-attached using bio_integrity_add_page().
-
-bio_free() will automatically free the bip.
-
-
-4.2 BLOCK DEVICE
-
-Because the format of the protection data is tied to the physical
-disk, each block device has been extended with a block integrity
-profile (struct blk_integrity).  This optional profile is registered
-with the block layer using blk_integrity_register().
-
-The profile contains callback functions for generating and verifying
-the protection data, as well as getting and setting application tags.
-The profile also contains a few constants to aid in completing,
-merging and splitting the integrity metadata.
-
-Layered block devices will need to pick a profile that's appropriate
-for all subdevices.  blk_integrity_compare() can help with that.  DM
-and MD linear, RAID0 and RAID1 are currently supported.  RAID4/5/6
-will require extra work due to the application tag.
-
-
-----------------------------------------------------------------------
-5.0 BLOCK LAYER INTEGRITY API
-
-5.1 NORMAL FILESYSTEM
-
-    The normal filesystem is unaware that the underlying block device
-    is capable of sending/receiving integrity metadata.  The IMD will
-    be automatically generated by the block layer at submit_bio() time
-    in case of a WRITE.  A READ request will cause the I/O integrity
-    to be verified upon completion.
-
-    IMD generation and verification can be toggled using the
-
-      /sys/block/<bdev>/integrity/write_generate
-
-    and
-
-      /sys/block/<bdev>/integrity/read_verify
-
-    flags.
-
-
-5.2 INTEGRITY-AWARE FILESYSTEM
-
-    A filesystem that is integrity-aware can prepare I/Os with IMD
-    attached.  It can also use the application tag space if this is
-    supported by the block device.
-
-
-    bool bio_integrity_prep(bio);
-
-      To generate IMD for WRITE and to set up buffers for READ, the
-      filesystem must call bio_integrity_prep(bio).
-
-      Prior to calling this function, the bio data direction and start
-      sector must be set, and the bio should have all data pages
-      added.  It is up to the caller to ensure that the bio does not
-      change while I/O is in progress.
-      Complete bio with error if prepare failed for some reson.
-
-
-5.3 PASSING EXISTING INTEGRITY METADATA
-
-    Filesystems that either generate their own integrity metadata or
-    are capable of transferring IMD from user space can use the
-    following calls:
-
-
-    struct bip * bio_integrity_alloc(bio, gfp_mask, nr_pages);
-
-      Allocates the bio integrity payload and hangs it off of the bio.
-      nr_pages indicate how many pages of protection data need to be
-      stored in the integrity bio_vec list (similar to bio_alloc()).
-
-      The integrity payload will be freed at bio_free() time.
-
-
-    int bio_integrity_add_page(bio, page, len, offset);
-
-      Attaches a page containing integrity metadata to an existing
-      bio.  The bio must have an existing bip,
-      i.e. bio_integrity_alloc() must have been called.  For a WRITE,
-      the integrity metadata in the pages must be in a format
-      understood by the target device with the notable exception that
-      the sector numbers will be remapped as the request traverses the
-      I/O stack.  This implies that the pages added using this call
-      will be modified during I/O!  The first reference tag in the
-      integrity metadata must have a value of bip->bip_sector.
-
-      Pages can be added using bio_integrity_add_page() as long as
-      there is room in the bip bio_vec array (nr_pages).
-
-      Upon completion of a READ operation, the attached pages will
-      contain the integrity metadata received from the storage device.
-      It is up to the receiver to process them and verify data
-      integrity upon completion.
-
-
-5.4 REGISTERING A BLOCK DEVICE AS CAPABLE OF EXCHANGING INTEGRITY
-    METADATA
-
-    To enable integrity exchange on a block device the gendisk must be
-    registered as capable:
-
-    int blk_integrity_register(gendisk, blk_integrity);
-
-      The blk_integrity struct is a template and should contain the
-      following:
-
-        static struct blk_integrity my_profile = {
-            .name                   = "STANDARDSBODY-TYPE-VARIANT-CSUM",
-            .generate_fn            = my_generate_fn,
-       	    .verify_fn              = my_verify_fn,
-	    .tuple_size             = sizeof(struct my_tuple_size),
-	    .tag_size               = <tag bytes per hw sector>,
-        };
-
-      'name' is a text string which will be visible in sysfs.  This is
-      part of the userland API so chose it carefully and never change
-      it.  The format is standards body-type-variant.
-      E.g. T10-DIF-TYPE1-IP or T13-EPP-0-CRC.
-
-      'generate_fn' generates appropriate integrity metadata (for WRITE).
-
-      'verify_fn' verifies that the data buffer matches the integrity
-      metadata.
-
-      'tuple_size' must be set to match the size of the integrity
-      metadata per sector.  I.e. 8 for DIF and EPP.
-
-      'tag_size' must be set to identify how many bytes of tag space
-      are available per hardware sector.  For DIF this is either 2 or
-      0 depending on the value of the Control Mode Page ATO bit.
-
-----------------------------------------------------------------------
-2007-12-24 Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/Documentation/block/deadline-iosched.rst b/Documentation/block/deadline-iosched.rst
new file mode 100644
index 000000000000..9f5c5a4c370e
--- /dev/null
+++ b/Documentation/block/deadline-iosched.rst
@@ -0,0 +1,72 @@
+==============================
+Deadline IO scheduler tunables
+==============================
+
+This little file attempts to document how the deadline io scheduler works.
+In particular, it will clarify the meaning of the exposed tunables that may be
+of interest to power users.
+
+Selecting IO schedulers
+-----------------------
+Refer to Documentation/block/switching-sched.rst for information on
+selecting an io scheduler on a per-device basis.
+
+------------------------------------------------------------------------------
+
+read_expire	(in ms)
+-----------------------
+
+The goal of the deadline io scheduler is to attempt to guarantee a start
+service time for a request. As we focus mainly on read latencies, this is
+tunable. When a read request first enters the io scheduler, it is assigned
+a deadline that is the current time + the read_expire value in units of
+milliseconds.
+
+
+write_expire	(in ms)
+-----------------------
+
+Similar to read_expire mentioned above, but for writes.
+
+
+fifo_batch	(number of requests)
+------------------------------------
+
+Requests are grouped into ``batches`` of a particular data direction (read or
+write) which are serviced in increasing sector order.  To limit extra seeking,
+deadline expiries are only checked between batches.  fifo_batch controls the
+maximum number of requests per batch.
+
+This parameter tunes the balance between per-request latency and aggregate
+throughput.  When low latency is the primary concern, smaller is better (where
+a value of 1 yields first-come first-served behaviour).  Increasing fifo_batch
+generally improves throughput, at the cost of latency variation.
+
+
+writes_starved	(number of dispatches)
+--------------------------------------
+
+When we have to move requests from the io scheduler queue to the block
+device dispatch queue, we always give a preference to reads. However, we
+don't want to starve writes indefinitely either. So writes_starved controls
+how many times we give preference to reads over writes. When that has been
+done writes_starved number of times, we dispatch some writes based on the
+same criteria as reads.
+
+
+front_merges	(bool)
+----------------------
+
+Sometimes it happens that a request enters the io scheduler that is contiguous
+with a request that is already on the queue. Either it fits in the back of that
+request, or it fits at the front. That is called either a back merge candidate
+or a front merge candidate. Due to the way files are typically laid out,
+back merges are much more common than front merges. For some work loads, you
+may even know that it is a waste of time to spend any time attempting to
+front merge requests. Setting front_merges to 0 disables this functionality.
+Front merges may still occur due to the cached last_merge hint, but since
+that comes at basically 0 cost we leave that on. We simply disable the
+rbtree front sector lookup when the io scheduler merge function is called.
+
+
+Nov 11 2002, Jens Axboe <jens.axboe@oracle.com>
diff --git a/Documentation/block/deadline-iosched.txt b/Documentation/block/deadline-iosched.txt
deleted file mode 100644
index 2d82c80322cb..000000000000
--- a/Documentation/block/deadline-iosched.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-Deadline IO scheduler tunables
-==============================
-
-This little file attempts to document how the deadline io scheduler works.
-In particular, it will clarify the meaning of the exposed tunables that may be
-of interest to power users.
-
-Selecting IO schedulers
------------------------
-Refer to Documentation/block/switching-sched.txt for information on
-selecting an io scheduler on a per-device basis.
-
-
-********************************************************************************
-
-
-read_expire	(in ms)
------------
-
-The goal of the deadline io scheduler is to attempt to guarantee a start
-service time for a request. As we focus mainly on read latencies, this is
-tunable. When a read request first enters the io scheduler, it is assigned
-a deadline that is the current time + the read_expire value in units of
-milliseconds.
-
-
-write_expire	(in ms)
------------
-
-Similar to read_expire mentioned above, but for writes.
-
-
-fifo_batch	(number of requests)
-----------
-
-Requests are grouped into ``batches'' of a particular data direction (read or
-write) which are serviced in increasing sector order.  To limit extra seeking,
-deadline expiries are only checked between batches.  fifo_batch controls the
-maximum number of requests per batch.
-
-This parameter tunes the balance between per-request latency and aggregate
-throughput.  When low latency is the primary concern, smaller is better (where
-a value of 1 yields first-come first-served behaviour).  Increasing fifo_batch
-generally improves throughput, at the cost of latency variation.
-
-
-writes_starved	(number of dispatches)
---------------
-
-When we have to move requests from the io scheduler queue to the block
-device dispatch queue, we always give a preference to reads. However, we
-don't want to starve writes indefinitely either. So writes_starved controls
-how many times we give preference to reads over writes. When that has been
-done writes_starved number of times, we dispatch some writes based on the
-same criteria as reads.
-
-
-front_merges	(bool)
-------------
-
-Sometimes it happens that a request enters the io scheduler that is contiguous
-with a request that is already on the queue. Either it fits in the back of that
-request, or it fits at the front. That is called either a back merge candidate
-or a front merge candidate. Due to the way files are typically laid out,
-back merges are much more common than front merges. For some work loads, you
-may even know that it is a waste of time to spend any time attempting to
-front merge requests. Setting front_merges to 0 disables this functionality.
-Front merges may still occur due to the cached last_merge hint, but since
-that comes at basically 0 cost we leave that on. We simply disable the
-rbtree front sector lookup when the io scheduler merge function is called.
-
-
-Nov 11 2002, Jens Axboe <jens.axboe@oracle.com>
-
-
diff --git a/Documentation/block/index.rst b/Documentation/block/index.rst
new file mode 100644
index 000000000000..3fa7a52fafa4
--- /dev/null
+++ b/Documentation/block/index.rst
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====
+Block
+=====
+
+.. toctree::
+   :maxdepth: 1
+
+   bfq-iosched
+   biodoc
+   biovecs
+   capability
+   cmdline-partition
+   data-integrity
+   deadline-iosched
+   ioprio
+   kyber-iosched
+   null_blk
+   pr
+   queue-sysfs
+   request
+   stat
+   switching-sched
+   writeback_cache_control
diff --git a/Documentation/block/ioprio.rst b/Documentation/block/ioprio.rst
new file mode 100644
index 000000000000..f72b0de65af7
--- /dev/null
+++ b/Documentation/block/ioprio.rst
@@ -0,0 +1,182 @@
+===================
+Block io priorities
+===================
+
+
+Intro
+-----
+
+With the introduction of cfq v3 (aka cfq-ts or time sliced cfq), basic io
+priorities are supported for reads on files.  This enables users to io nice
+processes or process groups, similar to what has been possible with cpu
+scheduling for ages.  This document mainly details the current possibilities
+with cfq; other io schedulers do not support io priorities thus far.
+
+Scheduling classes
+------------------
+
+CFQ implements three generic scheduling classes that determine how io is
+served for a process.
+
+IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given
+higher priority than any other in the system, processes from this class are
+given first access to the disk every time. Thus it needs to be used with some
+care, one io RT process can starve the entire system. Within the RT class,
+there are 8 levels of class data that determine exactly how much time this
+process needs the disk for on each service. In the future this might change
+to be more directly mappable to performance, by passing in a wanted data
+rate instead.
+
+IOPRIO_CLASS_BE: This is the best-effort scheduling class, which is the default
+for any process that hasn't set a specific io priority. The class data
+determines how much io bandwidth the process will get, it's directly mappable
+to the cpu nice levels just more coarsely implemented. 0 is the highest
+BE prio level, 7 is the lowest. The mapping between cpu nice level and io
+nice level is determined as: io_nice = (cpu_nice + 20) / 5.
+
+IOPRIO_CLASS_IDLE: This is the idle scheduling class, processes running at this
+level only get io time when no one else needs the disk. The idle class has no
+class data, since it doesn't really apply here.
+
+Tools
+-----
+
+See below for a sample ionice tool. Usage::
+
+	# ionice -c<class> -n<level> -p<pid>
+
+If pid isn't given, the current process is assumed. IO priority settings
+are inherited on fork, so you can use ionice to start the process at a given
+level::
+
+	# ionice -c2 -n0 /bin/ls
+
+will run ls at the best-effort scheduling class at the highest priority.
+For a running process, you can give the pid instead::
+
+	# ionice -c1 -n2 -p100
+
+will change pid 100 to run at the realtime scheduling class, at priority 2.
+
+ionice.c tool::
+
+  #include <stdio.h>
+  #include <stdlib.h>
+  #include <errno.h>
+  #include <getopt.h>
+  #include <unistd.h>
+  #include <sys/ptrace.h>
+  #include <asm/unistd.h>
+
+  extern int sys_ioprio_set(int, int, int);
+  extern int sys_ioprio_get(int, int);
+
+  #if defined(__i386__)
+  #define __NR_ioprio_set		289
+  #define __NR_ioprio_get		290
+  #elif defined(__ppc__)
+  #define __NR_ioprio_set		273
+  #define __NR_ioprio_get		274
+  #elif defined(__x86_64__)
+  #define __NR_ioprio_set		251
+  #define __NR_ioprio_get		252
+  #elif defined(__ia64__)
+  #define __NR_ioprio_set		1274
+  #define __NR_ioprio_get		1275
+  #else
+  #error "Unsupported arch"
+  #endif
+
+  static inline int ioprio_set(int which, int who, int ioprio)
+  {
+	return syscall(__NR_ioprio_set, which, who, ioprio);
+  }
+
+  static inline int ioprio_get(int which, int who)
+  {
+	return syscall(__NR_ioprio_get, which, who);
+  }
+
+  enum {
+	IOPRIO_CLASS_NONE,
+	IOPRIO_CLASS_RT,
+	IOPRIO_CLASS_BE,
+	IOPRIO_CLASS_IDLE,
+  };
+
+  enum {
+	IOPRIO_WHO_PROCESS = 1,
+	IOPRIO_WHO_PGRP,
+	IOPRIO_WHO_USER,
+  };
+
+  #define IOPRIO_CLASS_SHIFT	13
+
+  const char *to_prio[] = { "none", "realtime", "best-effort", "idle", };
+
+  int main(int argc, char *argv[])
+  {
+	int ioprio = 4, set = 0, ioprio_class = IOPRIO_CLASS_BE;
+	int c, pid = 0;
+
+	while ((c = getopt(argc, argv, "+n:c:p:")) != EOF) {
+		switch (c) {
+		case 'n':
+			ioprio = strtol(optarg, NULL, 10);
+			set = 1;
+			break;
+		case 'c':
+			ioprio_class = strtol(optarg, NULL, 10);
+			set = 1;
+			break;
+		case 'p':
+			pid = strtol(optarg, NULL, 10);
+			break;
+		}
+	}
+
+	switch (ioprio_class) {
+		case IOPRIO_CLASS_NONE:
+			ioprio_class = IOPRIO_CLASS_BE;
+			break;
+		case IOPRIO_CLASS_RT:
+		case IOPRIO_CLASS_BE:
+			break;
+		case IOPRIO_CLASS_IDLE:
+			ioprio = 7;
+			break;
+		default:
+			printf("bad prio class %d\n", ioprio_class);
+			return 1;
+	}
+
+	if (!set) {
+		if (!pid && argv[optind])
+			pid = strtol(argv[optind], NULL, 10);
+
+		ioprio = ioprio_get(IOPRIO_WHO_PROCESS, pid);
+
+		printf("pid=%d, %d\n", pid, ioprio);
+
+		if (ioprio == -1)
+			perror("ioprio_get");
+		else {
+			ioprio_class = ioprio >> IOPRIO_CLASS_SHIFT;
+			ioprio = ioprio & 0xff;
+			printf("%s: prio %d\n", to_prio[ioprio_class], ioprio);
+		}
+	} else {
+		if (ioprio_set(IOPRIO_WHO_PROCESS, pid, ioprio | ioprio_class << IOPRIO_CLASS_SHIFT) == -1) {
+			perror("ioprio_set");
+			return 1;
+		}
+
+		if (argv[optind])
+			execvp(argv[optind], &argv[optind]);
+	}
+
+	return 0;
+  }
+
+
+March 11 2005, Jens Axboe <jens.axboe@oracle.com>
diff --git a/Documentation/block/ioprio.txt b/Documentation/block/ioprio.txt
deleted file mode 100644
index 8ed8c59380b4..000000000000
--- a/Documentation/block/ioprio.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-Block io priorities
-===================
-
-
-Intro
------
-
-With the introduction of cfq v3 (aka cfq-ts or time sliced cfq), basic io
-priorities are supported for reads on files.  This enables users to io nice
-processes or process groups, similar to what has been possible with cpu
-scheduling for ages.  This document mainly details the current possibilities
-with cfq; other io schedulers do not support io priorities thus far.
-
-Scheduling classes
-------------------
-
-CFQ implements three generic scheduling classes that determine how io is
-served for a process.
-
-IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given
-higher priority than any other in the system, processes from this class are
-given first access to the disk every time. Thus it needs to be used with some
-care, one io RT process can starve the entire system. Within the RT class,
-there are 8 levels of class data that determine exactly how much time this
-process needs the disk for on each service. In the future this might change
-to be more directly mappable to performance, by passing in a wanted data
-rate instead.
-
-IOPRIO_CLASS_BE: This is the best-effort scheduling class, which is the default
-for any process that hasn't set a specific io priority. The class data
-determines how much io bandwidth the process will get, it's directly mappable
-to the cpu nice levels just more coarsely implemented. 0 is the highest
-BE prio level, 7 is the lowest. The mapping between cpu nice level and io
-nice level is determined as: io_nice = (cpu_nice + 20) / 5.
-
-IOPRIO_CLASS_IDLE: This is the idle scheduling class, processes running at this
-level only get io time when no one else needs the disk. The idle class has no
-class data, since it doesn't really apply here.
-
-Tools
------
-
-See below for a sample ionice tool. Usage:
-
-# ionice -c<class> -n<level> -p<pid>
-
-If pid isn't given, the current process is assumed. IO priority settings
-are inherited on fork, so you can use ionice to start the process at a given
-level:
-
-# ionice -c2 -n0 /bin/ls
-
-will run ls at the best-effort scheduling class at the highest priority.
-For a running process, you can give the pid instead:
-
-# ionice -c1 -n2 -p100
-
-will change pid 100 to run at the realtime scheduling class, at priority 2.
-
----> snip ionice.c tool <---
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <getopt.h>
-#include <unistd.h>
-#include <sys/ptrace.h>
-#include <asm/unistd.h>
-
-extern int sys_ioprio_set(int, int, int);
-extern int sys_ioprio_get(int, int);
-
-#if defined(__i386__)
-#define __NR_ioprio_set		289
-#define __NR_ioprio_get		290
-#elif defined(__ppc__)
-#define __NR_ioprio_set		273
-#define __NR_ioprio_get		274
-#elif defined(__x86_64__)
-#define __NR_ioprio_set		251
-#define __NR_ioprio_get		252
-#elif defined(__ia64__)
-#define __NR_ioprio_set		1274
-#define __NR_ioprio_get		1275
-#else
-#error "Unsupported arch"
-#endif
-
-static inline int ioprio_set(int which, int who, int ioprio)
-{
-	return syscall(__NR_ioprio_set, which, who, ioprio);
-}
-
-static inline int ioprio_get(int which, int who)
-{
-	return syscall(__NR_ioprio_get, which, who);
-}
-
-enum {
-	IOPRIO_CLASS_NONE,
-	IOPRIO_CLASS_RT,
-	IOPRIO_CLASS_BE,
-	IOPRIO_CLASS_IDLE,
-};
-
-enum {
-	IOPRIO_WHO_PROCESS = 1,
-	IOPRIO_WHO_PGRP,
-	IOPRIO_WHO_USER,
-};
-
-#define IOPRIO_CLASS_SHIFT	13
-
-const char *to_prio[] = { "none", "realtime", "best-effort", "idle", };
-
-int main(int argc, char *argv[])
-{
-	int ioprio = 4, set = 0, ioprio_class = IOPRIO_CLASS_BE;
-	int c, pid = 0;
-
-	while ((c = getopt(argc, argv, "+n:c:p:")) != EOF) {
-		switch (c) {
-		case 'n':
-			ioprio = strtol(optarg, NULL, 10);
-			set = 1;
-			break;
-		case 'c':
-			ioprio_class = strtol(optarg, NULL, 10);
-			set = 1;
-			break;
-		case 'p':
-			pid = strtol(optarg, NULL, 10);
-			break;
-		}
-	}
-
-	switch (ioprio_class) {
-		case IOPRIO_CLASS_NONE:
-			ioprio_class = IOPRIO_CLASS_BE;
-			break;
-		case IOPRIO_CLASS_RT:
-		case IOPRIO_CLASS_BE:
-			break;
-		case IOPRIO_CLASS_IDLE:
-			ioprio = 7;
-			break;
-		default:
-			printf("bad prio class %d\n", ioprio_class);
-			return 1;
-	}
-
-	if (!set) {
-		if (!pid && argv[optind])
-			pid = strtol(argv[optind], NULL, 10);
-
-		ioprio = ioprio_get(IOPRIO_WHO_PROCESS, pid);
-
-		printf("pid=%d, %d\n", pid, ioprio);
-
-		if (ioprio == -1)
-			perror("ioprio_get");
-		else {
-			ioprio_class = ioprio >> IOPRIO_CLASS_SHIFT;
-			ioprio = ioprio & 0xff;
-			printf("%s: prio %d\n", to_prio[ioprio_class], ioprio);
-		}
-	} else {
-		if (ioprio_set(IOPRIO_WHO_PROCESS, pid, ioprio | ioprio_class << IOPRIO_CLASS_SHIFT) == -1) {
-			perror("ioprio_set");
-			return 1;
-		}
-
-		if (argv[optind])
-			execvp(argv[optind], &argv[optind]);
-	}
-
-	return 0;
-}
-
----> snip ionice.c tool <---
-
-
-March 11 2005, Jens Axboe <jens.axboe@oracle.com>
diff --git a/Documentation/block/kyber-iosched.rst b/Documentation/block/kyber-iosched.rst
new file mode 100644
index 000000000000..3e164dd0617c
--- /dev/null
+++ b/Documentation/block/kyber-iosched.rst
@@ -0,0 +1,15 @@
+============================
+Kyber I/O scheduler tunables
+============================
+
+The only two tunables for the Kyber scheduler are the target latencies for
+reads and synchronous writes. Kyber will throttle requests in order to meet
+these target latencies.
+
+read_lat_nsec
+-------------
+Target latency for reads (in nanoseconds).
+
+write_lat_nsec
+--------------
+Target latency for synchronous writes (in nanoseconds).
diff --git a/Documentation/block/kyber-iosched.txt b/Documentation/block/kyber-iosched.txt
deleted file mode 100644
index e94feacd7edc..000000000000
--- a/Documentation/block/kyber-iosched.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Kyber I/O scheduler tunables
-===========================
-
-The only two tunables for the Kyber scheduler are the target latencies for
-reads and synchronous writes. Kyber will throttle requests in order to meet
-these target latencies.
-
-read_lat_nsec
--------------
-Target latency for reads (in nanoseconds).
-
-write_lat_nsec
---------------
-Target latency for synchronous writes (in nanoseconds).
diff --git a/Documentation/block/null_blk.rst b/Documentation/block/null_blk.rst
new file mode 100644
index 000000000000..31451d80783c
--- /dev/null
+++ b/Documentation/block/null_blk.rst
@@ -0,0 +1,126 @@
+========================
+Null block device driver
+========================
+
+1. Overview
+===========
+
+The null block device (/dev/nullb*) is used for benchmarking the various
+block-layer implementations. It emulates a block device of X gigabytes in size.
+The following instances are possible:
+
+  Single-queue block-layer
+
+    - Request-based.
+    - Single submission queue per device.
+    - Implements IO scheduling algorithms (CFQ, Deadline, noop).
+
+  Multi-queue block-layer
+
+    - Request-based.
+    - Configurable submission queues per device.
+
+  No block-layer (Known as bio-based)
+
+    - Bio-based. IO requests are submitted directly to the device driver.
+    - Directly accepts bio data structure and returns them.
+
+All of them have a completion queue for each core in the system.
+
+2. Module parameters applicable for all instances
+=================================================
+
+queue_mode=[0-2]: Default: 2-Multi-queue
+  Selects which block-layer the module should instantiate with.
+
+  =  ============
+  0  Bio-based
+  1  Single-queue
+  2  Multi-queue
+  =  ============
+
+home_node=[0--nr_nodes]: Default: NUMA_NO_NODE
+  Selects what CPU node the data structures are allocated from.
+
+gb=[Size in GB]: Default: 250GB
+  The size of the device reported to the system.
+
+bs=[Block size (in bytes)]: Default: 512 bytes
+  The block size reported to the system.
+
+nr_devices=[Number of devices]: Default: 1
+  Number of block devices instantiated. They are instantiated as /dev/nullb0,
+  etc.
+
+irqmode=[0-2]: Default: 1-Soft-irq
+  The completion mode used for completing IOs to the block-layer.
+
+  =  ===========================================================================
+  0  None.
+  1  Soft-irq. Uses IPI to complete IOs across CPU nodes. Simulates the overhead
+     when IOs are issued from another CPU node than the home the device is
+     connected to.
+  2  Timer: Waits a specific period (completion_nsec) for each IO before
+     completion.
+  =  ===========================================================================
+
+completion_nsec=[ns]: Default: 10,000ns
+  Combined with irqmode=2 (timer). The time each completion event must wait.
+
+submit_queues=[1..nr_cpus]:
+  The number of submission queues attached to the device driver. If unset, it
+  defaults to 1. For multi-queue, it is ignored when use_per_node_hctx module
+  parameter is 1.
+
+hw_queue_depth=[0..qdepth]: Default: 64
+  The hardware queue depth of the device.
+
+III: Multi-queue specific parameters
+
+use_per_node_hctx=[0/1]: Default: 0
+
+  =  =====================================================================
+  0  The number of submit queues are set to the value of the submit_queues
+     parameter.
+  1  The multi-queue block layer is instantiated with a hardware dispatch
+     queue for each CPU node in the system.
+  =  =====================================================================
+
+no_sched=[0/1]: Default: 0
+
+  =  ======================================
+  0  nullb* use default blk-mq io scheduler
+  1  nullb* doesn't use io scheduler
+  =  ======================================
+
+blocking=[0/1]: Default: 0
+
+  =  ===============================================================
+  0  Register as a non-blocking blk-mq driver device.
+  1  Register as a blocking blk-mq driver device, null_blk will set
+     the BLK_MQ_F_BLOCKING flag, indicating that it sometimes/always
+     needs to block in its ->queue_rq() function.
+  =  ===============================================================
+
+shared_tags=[0/1]: Default: 0
+
+  =  ================================================================
+  0  Tag set is not shared.
+  1  Tag set shared between devices for blk-mq. Only makes sense with
+     nr_devices > 1, otherwise there's no tag set to share.
+  =  ================================================================
+
+zoned=[0/1]: Default: 0
+
+  =  ======================================================================
+  0  Block device is exposed as a random-access block device.
+  1  Block device is exposed as a host-managed zoned block device. Requires
+     CONFIG_BLK_DEV_ZONED.
+  =  ======================================================================
+
+zone_size=[MB]: Default: 256
+  Per zone size when exposed as a zoned block device. Must be a power of two.
+
+zone_nr_conv=[nr_conv]: Default: 0
+  The number of conventional zones to create when block device is zoned.  If
+  zone_nr_conv >= nr_zones, it will be reduced to nr_zones - 1.
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
deleted file mode 100644
index 41f0a3d33bbd..000000000000
--- a/Documentation/block/null_blk.txt
+++ /dev/null
@@ -1,99 +0,0 @@
-Null block device driver
-================================================================================
-
-I. Overview
-
-The null block device (/dev/nullb*) is used for benchmarking the various
-block-layer implementations. It emulates a block device of X gigabytes in size.
-The following instances are possible:
-
-  Single-queue block-layer
-    - Request-based.
-    - Single submission queue per device.
-    - Implements IO scheduling algorithms (CFQ, Deadline, noop).
-  Multi-queue block-layer
-    - Request-based.
-    - Configurable submission queues per device.
-  No block-layer (Known as bio-based)
-    - Bio-based. IO requests are submitted directly to the device driver.
-    - Directly accepts bio data structure and returns them.
-
-All of them have a completion queue for each core in the system.
-
-II. Module parameters applicable for all instances:
-
-queue_mode=[0-2]: Default: 2-Multi-queue
-  Selects which block-layer the module should instantiate with.
-
-  0: Bio-based.
-  1: Single-queue.
-  2: Multi-queue.
-
-home_node=[0--nr_nodes]: Default: NUMA_NO_NODE
-  Selects what CPU node the data structures are allocated from.
-
-gb=[Size in GB]: Default: 250GB
-  The size of the device reported to the system.
-
-bs=[Block size (in bytes)]: Default: 512 bytes
-  The block size reported to the system.
-
-nr_devices=[Number of devices]: Default: 1
-  Number of block devices instantiated. They are instantiated as /dev/nullb0,
-  etc.
-
-irqmode=[0-2]: Default: 1-Soft-irq
-  The completion mode used for completing IOs to the block-layer.
-
-  0: None.
-  1: Soft-irq. Uses IPI to complete IOs across CPU nodes. Simulates the overhead
-     when IOs are issued from another CPU node than the home the device is
-     connected to.
-  2: Timer: Waits a specific period (completion_nsec) for each IO before
-     completion.
-
-completion_nsec=[ns]: Default: 10,000ns
-  Combined with irqmode=2 (timer). The time each completion event must wait.
-
-submit_queues=[1..nr_cpus]:
-  The number of submission queues attached to the device driver. If unset, it
-  defaults to 1. For multi-queue, it is ignored when use_per_node_hctx module
-  parameter is 1.
-
-hw_queue_depth=[0..qdepth]: Default: 64
-  The hardware queue depth of the device.
-
-III: Multi-queue specific parameters
-
-use_per_node_hctx=[0/1]: Default: 0
-  0: The number of submit queues are set to the value of the submit_queues
-     parameter.
-  1: The multi-queue block layer is instantiated with a hardware dispatch
-     queue for each CPU node in the system.
-
-no_sched=[0/1]: Default: 0
-  0: nullb* use default blk-mq io scheduler.
-  1: nullb* doesn't use io scheduler.
-
-blocking=[0/1]: Default: 0
-  0: Register as a non-blocking blk-mq driver device.
-  1: Register as a blocking blk-mq driver device, null_blk will set
-     the BLK_MQ_F_BLOCKING flag, indicating that it sometimes/always
-     needs to block in its ->queue_rq() function.
-
-shared_tags=[0/1]: Default: 0
-  0: Tag set is not shared.
-  1: Tag set shared between devices for blk-mq. Only makes sense with
-     nr_devices > 1, otherwise there's no tag set to share.
-
-zoned=[0/1]: Default: 0
-  0: Block device is exposed as a random-access block device.
-  1: Block device is exposed as a host-managed zoned block device. Requires
-     CONFIG_BLK_DEV_ZONED.
-
-zone_size=[MB]: Default: 256
-  Per zone size when exposed as a zoned block device. Must be a power of two.
-
-zone_nr_conv=[nr_conv]: Default: 0
-  The number of conventional zones to create when block device is zoned.  If
-  zone_nr_conv >= nr_zones, it will be reduced to nr_zones - 1.
diff --git a/Documentation/block/pr.rst b/Documentation/block/pr.rst
new file mode 100644
index 000000000000..30ea1c2e39eb
--- /dev/null
+++ b/Documentation/block/pr.rst
@@ -0,0 +1,119 @@
+===============================================
+Block layer support for Persistent Reservations
+===============================================
+
+The Linux kernel supports a user space interface for simplified
+Persistent Reservations which map to block devices that support
+these (like SCSI). Persistent Reservations allow restricting
+access to block devices to specific initiators in a shared storage
+setup.
+
+This document gives a general overview of the support ioctl commands.
+For a more detailed reference please refer the the SCSI Primary
+Commands standard, specifically the section on Reservations and the
+"PERSISTENT RESERVE IN" and "PERSISTENT RESERVE OUT" commands.
+
+All implementations are expected to ensure the reservations survive
+a power loss and cover all connections in a multi path environment.
+These behaviors are optional in SPC but will be automatically applied
+by Linux.
+
+
+The following types of reservations are supported:
+--------------------------------------------------
+
+ - PR_WRITE_EXCLUSIVE
+	Only the initiator that owns the reservation can write to the
+	device.  Any initiator can read from the device.
+
+ - PR_EXCLUSIVE_ACCESS
+	Only the initiator that owns the reservation can access the
+	device.
+
+ - PR_WRITE_EXCLUSIVE_REG_ONLY
+	Only initiators with a registered key can write to the device,
+	Any initiator can read from the device.
+
+ - PR_EXCLUSIVE_ACCESS_REG_ONLY
+	Only initiators with a registered key can access the device.
+
+ - PR_WRITE_EXCLUSIVE_ALL_REGS
+
+	Only initiators with a registered key can write to the device,
+	Any initiator can read from the device.
+	All initiators with a registered key are considered reservation
+	holders.
+	Please reference the SPC spec on the meaning of a reservation
+	holder if you want to use this type.
+
+ - PR_EXCLUSIVE_ACCESS_ALL_REGS
+	Only initiators with a registered key can access the device.
+	All initiators with a registered key are considered reservation
+	holders.
+	Please reference the SPC spec on the meaning of a reservation
+	holder if you want to use this type.
+
+
+The following ioctl are supported:
+----------------------------------
+
+1. IOC_PR_REGISTER
+^^^^^^^^^^^^^^^^^^
+
+This ioctl command registers a new reservation if the new_key argument
+is non-null.  If no existing reservation exists old_key must be zero,
+if an existing reservation should be replaced old_key must contain
+the old reservation key.
+
+If the new_key argument is 0 it unregisters the existing reservation passed
+in old_key.
+
+
+2. IOC_PR_RESERVE
+^^^^^^^^^^^^^^^^^
+
+This ioctl command reserves the device and thus restricts access for other
+devices based on the type argument.  The key argument must be the existing
+reservation key for the device as acquired by the IOC_PR_REGISTER,
+IOC_PR_REGISTER_IGNORE, IOC_PR_PREEMPT or IOC_PR_PREEMPT_ABORT commands.
+
+
+3. IOC_PR_RELEASE
+^^^^^^^^^^^^^^^^^
+
+This ioctl command releases the reservation specified by key and flags
+and thus removes any access restriction implied by it.
+
+
+4. IOC_PR_PREEMPT
+^^^^^^^^^^^^^^^^^
+
+This ioctl command releases the existing reservation referred to by
+old_key and replaces it with a new reservation of type for the
+reservation key new_key.
+
+
+5. IOC_PR_PREEMPT_ABORT
+^^^^^^^^^^^^^^^^^^^^^^^
+
+This ioctl command works like IOC_PR_PREEMPT except that it also aborts
+any outstanding command sent over a connection identified by old_key.
+
+6. IOC_PR_CLEAR
+^^^^^^^^^^^^^^^
+
+This ioctl command unregisters both key and any other reservation key
+registered with the device and drops any existing reservation.
+
+
+Flags
+-----
+
+All the ioctls have a flag field.  Currently only one flag is supported:
+
+ - PR_FL_IGNORE_KEY
+	Ignore the existing reservation key.  This is commonly supported for
+	IOC_PR_REGISTER, and some implementation may support the flag for
+	IOC_PR_RESERVE.
+
+For all unknown flags the kernel will return -EOPNOTSUPP.
diff --git a/Documentation/block/pr.txt b/Documentation/block/pr.txt
deleted file mode 100644
index ac9b8e70e64b..000000000000
--- a/Documentation/block/pr.txt
+++ /dev/null
@@ -1,119 +0,0 @@
-
-Block layer support for Persistent Reservations
-===============================================
-
-The Linux kernel supports a user space interface for simplified
-Persistent Reservations which map to block devices that support
-these (like SCSI). Persistent Reservations allow restricting
-access to block devices to specific initiators in a shared storage
-setup.
-
-This document gives a general overview of the support ioctl commands.
-For a more detailed reference please refer the the SCSI Primary
-Commands standard, specifically the section on Reservations and the
-"PERSISTENT RESERVE IN" and "PERSISTENT RESERVE OUT" commands.
-
-All implementations are expected to ensure the reservations survive
-a power loss and cover all connections in a multi path environment.
-These behaviors are optional in SPC but will be automatically applied
-by Linux.
-
-
-The following types of reservations are supported:
---------------------------------------------------
-
- - PR_WRITE_EXCLUSIVE
-
-	Only the initiator that owns the reservation can write to the
-	device.  Any initiator can read from the device.
-
- - PR_EXCLUSIVE_ACCESS
-
-	Only the initiator that owns the reservation can access the
-	device.
-
- - PR_WRITE_EXCLUSIVE_REG_ONLY
-
-	Only initiators with a registered key can write to the device,
-	Any initiator can read from the device.
-
- - PR_EXCLUSIVE_ACCESS_REG_ONLY
-
-	Only initiators with a registered key can access the device.
-
- - PR_WRITE_EXCLUSIVE_ALL_REGS
-
-	Only initiators with a registered key can write to the device,
-	Any initiator can read from the device.
-	All initiators with a registered key are considered reservation
-	holders.
-	Please reference the SPC spec on the meaning of a reservation
-	holder if you want to use this type. 
-
- - PR_EXCLUSIVE_ACCESS_ALL_REGS
-
-	Only initiators with a registered key can access the device.
-	All initiators with a registered key are considered reservation
-	holders.
-	Please reference the SPC spec on the meaning of a reservation
-	holder if you want to use this type. 
-
-
-The following ioctl are supported:
-----------------------------------
-
-1. IOC_PR_REGISTER
-
-This ioctl command registers a new reservation if the new_key argument
-is non-null.  If no existing reservation exists old_key must be zero,
-if an existing reservation should be replaced old_key must contain
-the old reservation key.
-
-If the new_key argument is 0 it unregisters the existing reservation passed
-in old_key.
-
-
-2. IOC_PR_RESERVE
-
-This ioctl command reserves the device and thus restricts access for other
-devices based on the type argument.  The key argument must be the existing
-reservation key for the device as acquired by the IOC_PR_REGISTER,
-IOC_PR_REGISTER_IGNORE, IOC_PR_PREEMPT or IOC_PR_PREEMPT_ABORT commands.
-
-
-3. IOC_PR_RELEASE
-
-This ioctl command releases the reservation specified by key and flags
-and thus removes any access restriction implied by it.
-
-
-4. IOC_PR_PREEMPT
-
-This ioctl command releases the existing reservation referred to by
-old_key and replaces it with a new reservation of type for the
-reservation key new_key.
-
-
-5. IOC_PR_PREEMPT_ABORT
-
-This ioctl command works like IOC_PR_PREEMPT except that it also aborts
-any outstanding command sent over a connection identified by old_key.
-
-6. IOC_PR_CLEAR
-
-This ioctl command unregisters both key and any other reservation key
-registered with the device and drops any existing reservation.
-
-
-Flags
------
-
-All the ioctls have a flag field.  Currently only one flag is supported:
-
- - PR_FL_IGNORE_KEY
-
-	Ignore the existing reservation key.  This is commonly supported for
-	IOC_PR_REGISTER, and some implementation may support the flag for
-	IOC_PR_RESERVE.
-
-For all unknown flags the kernel will return -EOPNOTSUPP.
diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst
new file mode 100644
index 000000000000..6a8513af9201
--- /dev/null
+++ b/Documentation/block/queue-sysfs.rst
@@ -0,0 +1,254 @@
+=================
+Queue sysfs files
+=================
+
+This text file will detail the queue files that are located in the sysfs tree
+for each block device. Note that stacked devices typically do not export
+any settings, since their queue merely functions are a remapping target.
+These files are the ones found in the /sys/block/xxx/queue/ directory.
+
+Files denoted with a RO postfix are readonly and the RW postfix means
+read-write.
+
+add_random (RW)
+---------------
+This file allows to turn off the disk entropy contribution. Default
+value of this file is '1'(on).
+
+chunk_sectors (RO)
+------------------
+This has different meaning depending on the type of the block device.
+For a RAID device (dm-raid), chunk_sectors indicates the size in 512B sectors
+of the RAID volume stripe segment. For a zoned block device, either host-aware
+or host-managed, chunk_sectors indicates the size in 512B sectors of the zones
+of the device, with the eventual exception of the last zone of the device which
+may be smaller.
+
+dax (RO)
+--------
+This file indicates whether the device supports Direct Access (DAX),
+used by CPU-addressable storage to bypass the pagecache.  It shows '1'
+if true, '0' if not.
+
+discard_granularity (RO)
+------------------------
+This shows the size of internal allocation of the device in bytes, if
+reported by the device. A value of '0' means device does not support
+the discard functionality.
+
+discard_max_hw_bytes (RO)
+-------------------------
+Devices that support discard functionality may have internal limits on
+the number of bytes that can be trimmed or unmapped in a single operation.
+The discard_max_bytes parameter is set by the device driver to the maximum
+number of bytes that can be discarded in a single operation. Discard
+requests issued to the device must not exceed this limit. A discard_max_bytes
+value of 0 means that the device does not support discard functionality.
+
+discard_max_bytes (RW)
+----------------------
+While discard_max_hw_bytes is the hardware limit for the device, this
+setting is the software limit. Some devices exhibit large latencies when
+large discards are issued, setting this value lower will make Linux issue
+smaller discards and potentially help reduce latencies induced by large
+discard operations.
+
+discard_zeroes_data (RO)
+------------------------
+Obsolete. Always zero.
+
+fua (RO)
+--------
+Whether or not the block driver supports the FUA flag for write requests.
+FUA stands for Force Unit Access. If the FUA flag is set that means that
+write requests must bypass the volatile cache of the storage device.
+
+hw_sector_size (RO)
+-------------------
+This is the hardware sector size of the device, in bytes.
+
+io_poll (RW)
+------------
+When read, this file shows whether polling is enabled (1) or disabled
+(0).  Writing '0' to this file will disable polling for this device.
+Writing any non-zero value will enable this feature.
+
+io_poll_delay (RW)
+------------------
+If polling is enabled, this controls what kind of polling will be
+performed. It defaults to -1, which is classic polling. In this mode,
+the CPU will repeatedly ask for completions without giving up any time.
+If set to 0, a hybrid polling mode is used, where the kernel will attempt
+to make an educated guess at when the IO will complete. Based on this
+guess, the kernel will put the process issuing IO to sleep for an amount
+of time, before entering a classic poll loop. This mode might be a
+little slower than pure classic polling, but it will be more efficient.
+If set to a value larger than 0, the kernel will put the process issuing
+IO to sleep for this amount of microseconds before entering classic
+polling.
+
+io_timeout (RW)
+---------------
+io_timeout is the request timeout in milliseconds. If a request does not
+complete in this time then the block driver timeout handler is invoked.
+That timeout handler can decide to retry the request, to fail it or to start
+a device recovery strategy.
+
+iostats (RW)
+-------------
+This file is used to control (on/off) the iostats accounting of the
+disk.
+
+logical_block_size (RO)
+-----------------------
+This is the logical block size of the device, in bytes.
+
+max_discard_segments (RO)
+-------------------------
+The maximum number of DMA scatter/gather entries in a discard request.
+
+max_hw_sectors_kb (RO)
+----------------------
+This is the maximum number of kilobytes supported in a single data transfer.
+
+max_integrity_segments (RO)
+---------------------------
+Maximum number of elements in a DMA scatter/gather list with integrity
+data that will be submitted by the block layer core to the associated
+block driver.
+
+max_sectors_kb (RW)
+-------------------
+This is the maximum number of kilobytes that the block layer will allow
+for a filesystem request. Must be smaller than or equal to the maximum
+size allowed by the hardware.
+
+max_segments (RO)
+-----------------
+Maximum number of elements in a DMA scatter/gather list that is submitted
+to the associated block driver.
+
+max_segment_size (RO)
+---------------------
+Maximum size in bytes of a single element in a DMA scatter/gather list.
+
+minimum_io_size (RO)
+--------------------
+This is the smallest preferred IO size reported by the device.
+
+nomerges (RW)
+-------------
+This enables the user to disable the lookup logic involved with IO
+merging requests in the block layer. By default (0) all merges are
+enabled. When set to 1 only simple one-hit merges will be tried. When
+set to 2 no merge algorithms will be tried (including one-hit or more
+complex tree/hash lookups).
+
+nr_requests (RW)
+----------------
+This controls how many requests may be allocated in the block layer for
+read or write requests. Note that the total allocated number may be twice
+this amount, since it applies only to reads or writes (not the accumulated
+sum).
+
+To avoid priority inversion through request starvation, a request
+queue maintains a separate request pool per each cgroup when
+CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such
+per-block-cgroup request pool.  IOW, if there are N block cgroups,
+each request queue may have up to N request pools, each independently
+regulated by nr_requests.
+
+nr_zones (RO)
+-------------
+For zoned block devices (zoned attribute indicating "host-managed" or
+"host-aware"), this indicates the total number of zones of the device.
+This is always 0 for regular block devices.
+
+optimal_io_size (RO)
+--------------------
+This is the optimal IO size reported by the device.
+
+physical_block_size (RO)
+------------------------
+This is the physical block size of device, in bytes.
+
+read_ahead_kb (RW)
+------------------
+Maximum number of kilobytes to read-ahead for filesystems on this block
+device.
+
+rotational (RW)
+---------------
+This file is used to stat if the device is of rotational type or
+non-rotational type.
+
+rq_affinity (RW)
+----------------
+If this option is '1', the block layer will migrate request completions to the
+cpu "group" that originally submitted the request. For some workloads this
+provides a significant reduction in CPU cycles due to caching effects.
+
+For storage configurations that need to maximize distribution of completion
+processing setting this option to '2' forces the completion to run on the
+requesting cpu (bypassing the "group" aggregation logic).
+
+scheduler (RW)
+--------------
+When read, this file will display the current and available IO schedulers
+for this block device. The currently active IO scheduler will be enclosed
+in [] brackets. Writing an IO scheduler name to this file will switch
+control of this block device to that new IO scheduler. Note that writing
+an IO scheduler name to this file will attempt to load that IO scheduler
+module, if it isn't already present in the system.
+
+write_cache (RW)
+----------------
+When read, this file will display whether the device has write back
+caching enabled or not. It will return "write back" for the former
+case, and "write through" for the latter. Writing to this file can
+change the kernels view of the device, but it doesn't alter the
+device state. This means that it might not be safe to toggle the
+setting from "write back" to "write through", since that will also
+eliminate cache flushes issued by the kernel.
+
+write_same_max_bytes (RO)
+-------------------------
+This is the number of bytes the device can write in a single write-same
+command.  A value of '0' means write-same is not supported by this
+device.
+
+wbt_lat_usec (RW)
+-----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency. If this latency is exceeded in a given
+window of time (see wb_window_usec), then the writeback throttling will start
+scaling back writes. Writing a value of '0' to this file disables the
+feature. Writing a value of '-1' to this file resets the value to the
+default setting.
+
+throttle_sample_time (RW)
+-------------------------
+This is the time window that blk-throttle samples data, in millisecond.
+blk-throttle makes decision based on the samplings. Lower time means cgroups
+have more smooth throughput, but higher CPU overhead. This exists only when
+CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
+
+write_zeroes_max_bytes (RO)
+---------------------------
+For block drivers that support REQ_OP_WRITE_ZEROES, the maximum number of
+bytes that can be zeroed at once. The value 0 means that REQ_OP_WRITE_ZEROES
+is not supported.
+
+zoned (RO)
+----------
+This indicates if the device is a zoned block device and the zone model of the
+device if it is indeed zoned. The possible values indicated by zoned are
+"none" for regular block devices and "host-aware" or "host-managed" for zoned
+block devices. The characteristics of host-aware and host-managed zoned block
+devices are described in the ZBC (Zoned Block Commands) and ZAC
+(Zoned Device ATA Command Set) standards. These standards also define the
+"drive-managed" zone model. However, since drive-managed zoned block devices
+do not support zone commands, they will be treated as regular block devices
+and zoned will report "none".
+
+Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
deleted file mode 100644
index 83b457e24bba..000000000000
--- a/Documentation/block/queue-sysfs.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Queue sysfs files
-=================
-
-This text file will detail the queue files that are located in the sysfs tree
-for each block device. Note that stacked devices typically do not export
-any settings, since their queue merely functions are a remapping target.
-These files are the ones found in the /sys/block/xxx/queue/ directory.
-
-Files denoted with a RO postfix are readonly and the RW postfix means
-read-write.
-
-add_random (RW)
-----------------
-This file allows to turn off the disk entropy contribution. Default
-value of this file is '1'(on).
-
-dax (RO)
---------
-This file indicates whether the device supports Direct Access (DAX),
-used by CPU-addressable storage to bypass the pagecache.  It shows '1'
-if true, '0' if not.
-
-discard_granularity (RO)
------------------------
-This shows the size of internal allocation of the device in bytes, if
-reported by the device. A value of '0' means device does not support
-the discard functionality.
-
-discard_max_hw_bytes (RO)
-----------------------
-Devices that support discard functionality may have internal limits on
-the number of bytes that can be trimmed or unmapped in a single operation.
-The discard_max_bytes parameter is set by the device driver to the maximum
-number of bytes that can be discarded in a single operation. Discard
-requests issued to the device must not exceed this limit. A discard_max_bytes
-value of 0 means that the device does not support discard functionality.
-
-discard_max_bytes (RW)
-----------------------
-While discard_max_hw_bytes is the hardware limit for the device, this
-setting is the software limit. Some devices exhibit large latencies when
-large discards are issued, setting this value lower will make Linux issue
-smaller discards and potentially help reduce latencies induced by large
-discard operations.
-
-hw_sector_size (RO)
--------------------
-This is the hardware sector size of the device, in bytes.
-
-io_poll (RW)
-------------
-When read, this file shows whether polling is enabled (1) or disabled
-(0).  Writing '0' to this file will disable polling for this device.
-Writing any non-zero value will enable this feature.
-
-io_poll_delay (RW)
-------------------
-If polling is enabled, this controls what kind of polling will be
-performed. It defaults to -1, which is classic polling. In this mode,
-the CPU will repeatedly ask for completions without giving up any time.
-If set to 0, a hybrid polling mode is used, where the kernel will attempt
-to make an educated guess at when the IO will complete. Based on this
-guess, the kernel will put the process issuing IO to sleep for an amount
-of time, before entering a classic poll loop. This mode might be a
-little slower than pure classic polling, but it will be more efficient.
-If set to a value larger than 0, the kernel will put the process issuing
-IO to sleep for this amount of microseconds before entering classic
-polling.
-
-io_timeout (RW)
----------------
-io_timeout is the request timeout in milliseconds. If a request does not
-complete in this time then the block driver timeout handler is invoked.
-That timeout handler can decide to retry the request, to fail it or to start
-a device recovery strategy.
-
-iostats (RW)
--------------
-This file is used to control (on/off) the iostats accounting of the
-disk.
-
-logical_block_size (RO)
------------------------
-This is the logical block size of the device, in bytes.
-
-max_hw_sectors_kb (RO)
-----------------------
-This is the maximum number of kilobytes supported in a single data transfer.
-
-max_integrity_segments (RO)
----------------------------
-When read, this file shows the max limit of integrity segments as
-set by block layer which a hardware controller can handle.
-
-max_sectors_kb (RW)
--------------------
-This is the maximum number of kilobytes that the block layer will allow
-for a filesystem request. Must be smaller than or equal to the maximum
-size allowed by the hardware.
-
-max_segments (RO)
------------------
-Maximum number of segments of the device.
-
-max_segment_size (RO)
----------------------
-Maximum segment size of the device.
-
-minimum_io_size (RO)
---------------------
-This is the smallest preferred IO size reported by the device.
-
-nomerges (RW)
--------------
-This enables the user to disable the lookup logic involved with IO
-merging requests in the block layer. By default (0) all merges are
-enabled. When set to 1 only simple one-hit merges will be tried. When
-set to 2 no merge algorithms will be tried (including one-hit or more
-complex tree/hash lookups).
-
-nr_requests (RW)
-----------------
-This controls how many requests may be allocated in the block layer for
-read or write requests. Note that the total allocated number may be twice
-this amount, since it applies only to reads or writes (not the accumulated
-sum).
-
-To avoid priority inversion through request starvation, a request
-queue maintains a separate request pool per each cgroup when
-CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such
-per-block-cgroup request pool.  IOW, if there are N block cgroups,
-each request queue may have up to N request pools, each independently
-regulated by nr_requests.
-
-optimal_io_size (RO)
---------------------
-This is the optimal IO size reported by the device.
-
-physical_block_size (RO)
-------------------------
-This is the physical block size of device, in bytes.
-
-read_ahead_kb (RW)
-------------------
-Maximum number of kilobytes to read-ahead for filesystems on this block
-device.
-
-rotational (RW)
----------------
-This file is used to stat if the device is of rotational type or
-non-rotational type.
-
-rq_affinity (RW)
-----------------
-If this option is '1', the block layer will migrate request completions to the
-cpu "group" that originally submitted the request. For some workloads this
-provides a significant reduction in CPU cycles due to caching effects.
-
-For storage configurations that need to maximize distribution of completion
-processing setting this option to '2' forces the completion to run on the
-requesting cpu (bypassing the "group" aggregation logic).
-
-scheduler (RW)
---------------
-When read, this file will display the current and available IO schedulers
-for this block device. The currently active IO scheduler will be enclosed
-in [] brackets. Writing an IO scheduler name to this file will switch
-control of this block device to that new IO scheduler. Note that writing
-an IO scheduler name to this file will attempt to load that IO scheduler
-module, if it isn't already present in the system.
-
-write_cache (RW)
-----------------
-When read, this file will display whether the device has write back
-caching enabled or not. It will return "write back" for the former
-case, and "write through" for the latter. Writing to this file can
-change the kernels view of the device, but it doesn't alter the
-device state. This means that it might not be safe to toggle the
-setting from "write back" to "write through", since that will also
-eliminate cache flushes issued by the kernel.
-
-write_same_max_bytes (RO)
--------------------------
-This is the number of bytes the device can write in a single write-same
-command.  A value of '0' means write-same is not supported by this
-device.
-
-wb_lat_usec (RW)
-----------------
-If the device is registered for writeback throttling, then this file shows
-the target minimum read latency. If this latency is exceeded in a given
-window of time (see wb_window_usec), then the writeback throttling will start
-scaling back writes. Writing a value of '0' to this file disables the
-feature. Writing a value of '-1' to this file resets the value to the
-default setting.
-
-throttle_sample_time (RW)
--------------------------
-This is the time window that blk-throttle samples data, in millisecond.
-blk-throttle makes decision based on the samplings. Lower time means cgroups
-have more smooth throughput, but higher CPU overhead. This exists only when
-CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
-
-zoned (RO)
-----------
-This indicates if the device is a zoned block device and the zone model of the
-device if it is indeed zoned. The possible values indicated by zoned are
-"none" for regular block devices and "host-aware" or "host-managed" for zoned
-block devices. The characteristics of host-aware and host-managed zoned block
-devices are described in the ZBC (Zoned Block Commands) and ZAC
-(Zoned Device ATA Command Set) standards. These standards also define the
-"drive-managed" zone model. However, since drive-managed zoned block devices
-do not support zone commands, they will be treated as regular block devices
-and zoned will report "none".
-
-nr_zones (RO)
--------------
-For zoned block devices (zoned attribute indicating "host-managed" or
-"host-aware"), this indicates the total number of zones of the device.
-This is always 0 for regular block devices.
-
-chunk_sectors (RO)
-------------------
-This has different meaning depending on the type of the block device.
-For a RAID device (dm-raid), chunk_sectors indicates the size in 512B sectors
-of the RAID volume stripe segment. For a zoned block device, either host-aware
-or host-managed, chunk_sectors indicates the size in 512B sectors of the zones
-of the device, with the eventual exception of the last zone of the device which
-may be smaller.
-
-Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/Documentation/block/request.rst b/Documentation/block/request.rst
new file mode 100644
index 000000000000..747021e1ffdb
--- /dev/null
+++ b/Documentation/block/request.rst
@@ -0,0 +1,99 @@
+============================
+struct request documentation
+============================
+
+Jens Axboe <jens.axboe@oracle.com> 27/05/02
+
+
+.. FIXME:
+   No idea about what does mean - seems just some noise, so comment it
+
+   1.0
+   Index
+
+   2.0 Struct request members classification
+
+       2.1 struct request members explanation
+
+   3.0
+
+
+   2.0
+
+
+
+Short explanation of request members
+====================================
+
+Classification flags:
+
+	=	====================
+	D	driver member
+	B	block layer member
+	I	I/O scheduler member
+	=	====================
+
+Unless an entry contains a D classification, a device driver must not access
+this member. Some members may contain D classifications, but should only be
+access through certain macros or functions (eg ->flags).
+
+<linux/blkdev.h>
+
+=============================== ======= =======================================
+Member				Flag	Comment
+=============================== ======= =======================================
+struct list_head queuelist	BI	Organization on various internal
+					queues
+
+``void *elevator_private``	I	I/O scheduler private data
+
+unsigned char cmd[16]		D	Driver can use this for setting up
+					a cdb before execution, see
+					blk_queue_prep_rq
+
+unsigned long flags		DBI	Contains info about data direction,
+					request type, etc.
+
+int rq_status			D	Request status bits
+
+kdev_t rq_dev			DBI	Target device
+
+int errors			DB	Error counts
+
+sector_t sector			DBI	Target location
+
+unsigned long hard_nr_sectors	B	Used to keep sector sane
+
+unsigned long nr_sectors	DBI	Total number of sectors in request
+
+unsigned long hard_nr_sectors	B	Used to keep nr_sectors sane
+
+unsigned short nr_phys_segments	DB	Number of physical scatter gather
+					segments in a request
+
+unsigned short nr_hw_segments	DB	Number of hardware scatter gather
+					segments in a request
+
+unsigned int current_nr_sectors	DB	Number of sectors in first segment
+					of request
+
+unsigned int hard_cur_sectors	B	Used to keep current_nr_sectors sane
+
+int tag				DB	TCQ tag, if assigned
+
+``void *special``		D	Free to be used by driver
+
+``char *buffer``		D	Map of first segment, also see
+					section on bouncing SECTION
+
+``struct completion *waiting``	D	Can be used by driver to get signalled
+					on request completion
+
+``struct bio *bio``		DBI	First bio in request
+
+``struct bio *biotail``		DBI	Last bio in request
+
+``struct request_queue *q``	DB	Request queue this request belongs to
+
+``struct request_list *rl``	B	Request list this request came from
+=============================== ======= =======================================
diff --git a/Documentation/block/request.txt b/Documentation/block/request.txt
deleted file mode 100644
index 754e104ed369..000000000000
--- a/Documentation/block/request.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-
-struct request documentation
-
-Jens Axboe <jens.axboe@oracle.com> 27/05/02
-
-1.0
-Index
-
-2.0 Struct request members classification
-
-	2.1 struct request members explanation
-
-3.0
-
-
-2.0
-Short explanation of request members
-
-Classification flags:
-
-	D	driver member
-	B	block layer member
-	I	I/O scheduler member
-
-Unless an entry contains a D classification, a device driver must not access
-this member. Some members may contain D classifications, but should only be
-access through certain macros or functions (eg ->flags).
-
-<linux/blkdev.h>
-
-2.1
-Member				Flag	Comment
-------				----	-------
-
-struct list_head queuelist	BI	Organization on various internal
-					queues
-
-void *elevator_private		I	I/O scheduler private data
-
-unsigned char cmd[16]		D	Driver can use this for setting up
-					a cdb before execution, see
-					blk_queue_prep_rq
-
-unsigned long flags		DBI	Contains info about data direction,
-					request type, etc.
-
-int rq_status			D	Request status bits
-
-kdev_t rq_dev			DBI	Target device
-
-int errors			DB	Error counts
-
-sector_t sector			DBI	Target location
-
-unsigned long hard_nr_sectors	B	Used to keep sector sane
-
-unsigned long nr_sectors	DBI	Total number of sectors in request
-
-unsigned long hard_nr_sectors	B	Used to keep nr_sectors sane
-
-unsigned short nr_phys_segments	DB	Number of physical scatter gather
-					segments in a request
-
-unsigned short nr_hw_segments	DB	Number of hardware scatter gather
-					segments in a request
-
-unsigned int current_nr_sectors	DB	Number of sectors in first segment
-					of request
-
-unsigned int hard_cur_sectors	B	Used to keep current_nr_sectors sane
-
-int tag				DB	TCQ tag, if assigned
-
-void *special			D	Free to be used by driver
-
-char *buffer			D	Map of first segment, also see
-					section on bouncing SECTION
-
-struct completion *waiting	D	Can be used by driver to get signalled
-					on request completion
-
-struct bio *bio			DBI	First bio in request
-
-struct bio *biotail		DBI	Last bio in request
-
-struct request_queue *q		DB	Request queue this request belongs to
-
-struct request_list *rl		B	Request list this request came from
diff --git a/Documentation/block/stat.rst b/Documentation/block/stat.rst
new file mode 100644
index 000000000000..9c07bc22b0bc
--- /dev/null
+++ b/Documentation/block/stat.rst
@@ -0,0 +1,93 @@
+===============================================
+Block layer statistics in /sys/block/<dev>/stat
+===============================================
+
+This file documents the contents of the /sys/block/<dev>/stat file.
+
+The stat file provides several statistics about the state of block
+device <dev>.
+
+Q.
+   Why are there multiple statistics in a single file?  Doesn't sysfs
+   normally contain a single value per file?
+
+A.
+   By having a single file, the kernel can guarantee that the statistics
+   represent a consistent snapshot of the state of the device.  If the
+   statistics were exported as multiple files containing one statistic
+   each, it would be impossible to guarantee that a set of readings
+   represent a single point in time.
+
+The stat file consists of a single line of text containing 11 decimal
+values separated by whitespace.  The fields are summarized in the
+following table, and described in more detail below.
+
+
+=============== ============= =================================================
+Name            units         description
+=============== ============= =================================================
+read I/Os       requests      number of read I/Os processed
+read merges     requests      number of read I/Os merged with in-queue I/O
+read sectors    sectors       number of sectors read
+read ticks      milliseconds  total wait time for read requests
+write I/Os      requests      number of write I/Os processed
+write merges    requests      number of write I/Os merged with in-queue I/O
+write sectors   sectors       number of sectors written
+write ticks     milliseconds  total wait time for write requests
+in_flight       requests      number of I/Os currently in flight
+io_ticks        milliseconds  total time this block device has been active
+time_in_queue   milliseconds  total wait time for all requests
+discard I/Os    requests      number of discard I/Os processed
+discard merges  requests      number of discard I/Os merged with in-queue I/O
+discard sectors sectors       number of sectors discarded
+discard ticks   milliseconds  total wait time for discard requests
+=============== ============= =================================================
+
+read I/Os, write I/Os, discard I/0s
+===================================
+
+These values increment when an I/O request completes.
+
+read merges, write merges, discard merges
+=========================================
+
+These values increment when an I/O request is merged with an
+already-queued I/O request.
+
+read sectors, write sectors, discard_sectors
+============================================
+
+These values count the number of sectors read from, written to, or
+discarded from this block device.  The "sectors" in question are the
+standard UNIX 512-byte sectors, not any device- or filesystem-specific
+block size.  The counters are incremented when the I/O completes.
+
+read ticks, write ticks, discard ticks
+======================================
+
+These values count the number of milliseconds that I/O requests have
+waited on this block device.  If there are multiple I/O requests waiting,
+these values will increase at a rate greater than 1000/second; for
+example, if 60 read requests wait for an average of 30 ms, the read_ticks
+field will increase by 60*30 = 1800.
+
+in_flight
+=========
+
+This value counts the number of I/O requests that have been issued to
+the device driver but have not yet completed.  It does not include I/O
+requests that are in the queue but not yet issued to the device driver.
+
+io_ticks
+========
+
+This value counts the number of milliseconds during which the device has
+had I/O requests queued.
+
+time_in_queue
+=============
+
+This value counts the number of milliseconds that I/O requests have waited
+on this block device.  If there are multiple I/O requests waiting, this
+value will increase as the product of the number of milliseconds times the
+number of requests waiting (see "read ticks" above for an example).
diff --git a/Documentation/block/stat.txt b/Documentation/block/stat.txt
deleted file mode 100644
index 0aace9cc536c..000000000000
--- a/Documentation/block/stat.txt
+++ /dev/null
@@ -1,86 +0,0 @@
-Block layer statistics in /sys/block/<dev>/stat
-===============================================
-
-This file documents the contents of the /sys/block/<dev>/stat file.
-
-The stat file provides several statistics about the state of block
-device <dev>.
-
-Q. Why are there multiple statistics in a single file?  Doesn't sysfs
-   normally contain a single value per file?
-A. By having a single file, the kernel can guarantee that the statistics
-   represent a consistent snapshot of the state of the device.  If the
-   statistics were exported as multiple files containing one statistic
-   each, it would be impossible to guarantee that a set of readings
-   represent a single point in time.
-
-The stat file consists of a single line of text containing 11 decimal
-values separated by whitespace.  The fields are summarized in the
-following table, and described in more detail below.
-
-Name            units         description
-----            -----         -----------
-read I/Os       requests      number of read I/Os processed
-read merges     requests      number of read I/Os merged with in-queue I/O
-read sectors    sectors       number of sectors read
-read ticks      milliseconds  total wait time for read requests
-write I/Os      requests      number of write I/Os processed
-write merges    requests      number of write I/Os merged with in-queue I/O
-write sectors   sectors       number of sectors written
-write ticks     milliseconds  total wait time for write requests
-in_flight       requests      number of I/Os currently in flight
-io_ticks        milliseconds  total time this block device has been active
-time_in_queue   milliseconds  total wait time for all requests
-discard I/Os    requests      number of discard I/Os processed
-discard merges  requests      number of discard I/Os merged with in-queue I/O
-discard sectors sectors       number of sectors discarded
-discard ticks   milliseconds  total wait time for discard requests
-
-read I/Os, write I/Os, discard I/0s
-===================================
-
-These values increment when an I/O request completes.
-
-read merges, write merges, discard merges
-=========================================
-
-These values increment when an I/O request is merged with an
-already-queued I/O request.
-
-read sectors, write sectors, discard_sectors
-============================================
-
-These values count the number of sectors read from, written to, or
-discarded from this block device.  The "sectors" in question are the
-standard UNIX 512-byte sectors, not any device- or filesystem-specific
-block size.  The counters are incremented when the I/O completes.
-
-read ticks, write ticks, discard ticks
-======================================
-
-These values count the number of milliseconds that I/O requests have
-waited on this block device.  If there are multiple I/O requests waiting,
-these values will increase at a rate greater than 1000/second; for
-example, if 60 read requests wait for an average of 30 ms, the read_ticks
-field will increase by 60*30 = 1800.
-
-in_flight
-=========
-
-This value counts the number of I/O requests that have been issued to
-the device driver but have not yet completed.  It does not include I/O
-requests that are in the queue but not yet issued to the device driver.
-
-io_ticks
-========
-
-This value counts the number of milliseconds during which the device has
-had I/O requests queued.
-
-time_in_queue
-=============
-
-This value counts the number of milliseconds that I/O requests have waited
-on this block device.  If there are multiple I/O requests waiting, this
-value will increase as the product of the number of milliseconds times the
-number of requests waiting (see "read ticks" above for an example).
diff --git a/Documentation/block/switching-sched.rst b/Documentation/block/switching-sched.rst
new file mode 100644
index 000000000000..42042417380e
--- /dev/null
+++ b/Documentation/block/switching-sched.rst
@@ -0,0 +1,39 @@
+===================
+Switching Scheduler
+===================
+
+To choose IO schedulers at boot time, use the argument 'elevator=deadline'.
+'noop' and 'cfq' (the default) are also available. IO schedulers are assigned
+globally at boot time only presently.
+
+Each io queue has a set of io scheduler tunables associated with it. These
+tunables control how the io scheduler works. You can find these entries
+in::
+
+	/sys/block/<device>/queue/iosched
+
+assuming that you have sysfs mounted on /sys. If you don't have sysfs mounted,
+you can do so by typing::
+
+	# mount none /sys -t sysfs
+
+It is possible to change the IO scheduler for a given block device on
+the fly to select one of mq-deadline, none, bfq, or kyber schedulers -
+which can improve that device's throughput.
+
+To set a specific scheduler, simply do this::
+
+	echo SCHEDNAME > /sys/block/DEV/queue/scheduler
+
+where SCHEDNAME is the name of a defined IO scheduler, and DEV is the
+device name (hda, hdb, sga, or whatever you happen to have).
+
+The list of defined schedulers can be found by simply doing
+a "cat /sys/block/DEV/queue/scheduler" - the list of valid names
+will be displayed, with the currently selected scheduler in brackets::
+
+  # cat /sys/block/sda/queue/scheduler
+  [mq-deadline] kyber bfq none
+  # echo none >/sys/block/sda/queue/scheduler
+  # cat /sys/block/sda/queue/scheduler
+  [none] mq-deadline kyber bfq
diff --git a/Documentation/block/switching-sched.txt b/Documentation/block/switching-sched.txt
deleted file mode 100644
index 3b2612e342f1..000000000000
--- a/Documentation/block/switching-sched.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-To choose IO schedulers at boot time, use the argument 'elevator=deadline'.
-'noop' and 'cfq' (the default) are also available. IO schedulers are assigned
-globally at boot time only presently.
-
-Each io queue has a set of io scheduler tunables associated with it. These
-tunables control how the io scheduler works. You can find these entries
-in:
-
-/sys/block/<device>/queue/iosched
-
-assuming that you have sysfs mounted on /sys. If you don't have sysfs mounted,
-you can do so by typing:
-
-# mount none /sys -t sysfs
-
-As of the Linux 2.6.10 kernel, it is now possible to change the
-IO scheduler for a given block device on the fly (thus making it possible,
-for instance, to set the CFQ scheduler for the system default, but
-set a specific device to use the deadline or noop schedulers - which
-can improve that device's throughput).
-
-To set a specific scheduler, simply do this:
-
-echo SCHEDNAME > /sys/block/DEV/queue/scheduler
-
-where SCHEDNAME is the name of a defined IO scheduler, and DEV is the
-device name (hda, hdb, sga, or whatever you happen to have).
-
-The list of defined schedulers can be found by simply doing
-a "cat /sys/block/DEV/queue/scheduler" - the list of valid names
-will be displayed, with the currently selected scheduler in brackets:
-
-# cat /sys/block/hda/queue/scheduler
-noop deadline [cfq]
-# echo deadline > /sys/block/hda/queue/scheduler
-# cat /sys/block/hda/queue/scheduler
-noop [deadline] cfq
diff --git a/Documentation/block/writeback_cache_control.rst b/Documentation/block/writeback_cache_control.rst
new file mode 100644
index 000000000000..2c752c57c14c
--- /dev/null
+++ b/Documentation/block/writeback_cache_control.rst
@@ -0,0 +1,86 @@
+==========================================
+Explicit volatile write back cache control
+==========================================
+
+Introduction
+------------
+
+Many storage devices, especially in the consumer market, come with volatile
+write back caches.  That means the devices signal I/O completion to the
+operating system before data actually has hit the non-volatile storage.  This
+behavior obviously speeds up various workloads, but it means the operating
+system needs to force data out to the non-volatile storage when it performs
+a data integrity operation like fsync, sync or an unmount.
+
+The Linux block layer provides two simple mechanisms that let filesystems
+control the caching behavior of the storage device.  These mechanisms are
+a forced cache flush, and the Force Unit Access (FUA) flag for requests.
+
+
+Explicit cache flushes
+----------------------
+
+The REQ_PREFLUSH flag can be OR ed into the r/w flags of a bio submitted from
+the filesystem and will make sure the volatile cache of the storage device
+has been flushed before the actual I/O operation is started.  This explicitly
+guarantees that previously completed write requests are on non-volatile
+storage before the flagged bio starts. In addition the REQ_PREFLUSH flag can be
+set on an otherwise empty bio structure, which causes only an explicit cache
+flush without any dependent I/O.  It is recommend to use
+the blkdev_issue_flush() helper for a pure cache flush.
+
+
+Forced Unit Access
+------------------
+
+The REQ_FUA flag can be OR ed into the r/w flags of a bio submitted from the
+filesystem and will make sure that I/O completion for this request is only
+signaled after the data has been committed to non-volatile storage.
+
+
+Implementation details for filesystems
+--------------------------------------
+
+Filesystems can simply set the REQ_PREFLUSH and REQ_FUA bits and do not have to
+worry if the underlying devices need any explicit cache flushing and how
+the Forced Unit Access is implemented.  The REQ_PREFLUSH and REQ_FUA flags
+may both be set on a single bio.
+
+
+Implementation details for make_request_fn based block drivers
+--------------------------------------------------------------
+
+These drivers will always see the REQ_PREFLUSH and REQ_FUA bits as they sit
+directly below the submit_bio interface.  For remapping drivers the REQ_FUA
+bits need to be propagated to underlying devices, and a global flush needs
+to be implemented for bios with the REQ_PREFLUSH bit set.  For real device
+drivers that do not have a volatile cache the REQ_PREFLUSH and REQ_FUA bits
+on non-empty bios can simply be ignored, and REQ_PREFLUSH requests without
+data can be completed successfully without doing any work.  Drivers for
+devices with volatile caches need to implement the support for these
+flags themselves without any help from the block layer.
+
+
+Implementation details for request_fn based block drivers
+---------------------------------------------------------
+
+For devices that do not support volatile write caches there is no driver
+support required, the block layer completes empty REQ_PREFLUSH requests before
+entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from
+requests that have a payload.  For devices with volatile write caches the
+driver needs to tell the block layer that it supports flushing caches by
+doing::
+
+	blk_queue_write_cache(sdkp->disk->queue, true, false);
+
+and handle empty REQ_OP_FLUSH requests in its prep_fn/request_fn.  Note that
+REQ_PREFLUSH requests with a payload are automatically turned into a sequence
+of an empty REQ_OP_FLUSH request followed by the actual write by the block
+layer.  For devices that also support the FUA bit the block layer needs
+to be told to pass through the REQ_FUA bit using::
+
+	blk_queue_write_cache(sdkp->disk->queue, true, true);
+
+and the driver must handle write requests that have the REQ_FUA bit set
+in prep_fn/request_fn.  If the FUA bit is not natively supported the block
+layer turns it into an empty REQ_OP_FLUSH request after the actual write.
diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt
deleted file mode 100644
index 8a6bdada5f6b..000000000000
--- a/Documentation/block/writeback_cache_control.txt
+++ /dev/null
@@ -1,86 +0,0 @@
-
-Explicit volatile write back cache control
-=====================================
-
-Introduction
-------------
-
-Many storage devices, especially in the consumer market, come with volatile
-write back caches.  That means the devices signal I/O completion to the
-operating system before data actually has hit the non-volatile storage.  This
-behavior obviously speeds up various workloads, but it means the operating
-system needs to force data out to the non-volatile storage when it performs
-a data integrity operation like fsync, sync or an unmount.
-
-The Linux block layer provides two simple mechanisms that let filesystems
-control the caching behavior of the storage device.  These mechanisms are
-a forced cache flush, and the Force Unit Access (FUA) flag for requests.
-
-
-Explicit cache flushes
-----------------------
-
-The REQ_PREFLUSH flag can be OR ed into the r/w flags of a bio submitted from
-the filesystem and will make sure the volatile cache of the storage device
-has been flushed before the actual I/O operation is started.  This explicitly
-guarantees that previously completed write requests are on non-volatile
-storage before the flagged bio starts. In addition the REQ_PREFLUSH flag can be
-set on an otherwise empty bio structure, which causes only an explicit cache
-flush without any dependent I/O.  It is recommend to use
-the blkdev_issue_flush() helper for a pure cache flush.
-
-
-Forced Unit Access
------------------
-
-The REQ_FUA flag can be OR ed into the r/w flags of a bio submitted from the
-filesystem and will make sure that I/O completion for this request is only
-signaled after the data has been committed to non-volatile storage.
-
-
-Implementation details for filesystems
---------------------------------------
-
-Filesystems can simply set the REQ_PREFLUSH and REQ_FUA bits and do not have to
-worry if the underlying devices need any explicit cache flushing and how
-the Forced Unit Access is implemented.  The REQ_PREFLUSH and REQ_FUA flags
-may both be set on a single bio.
-
-
-Implementation details for make_request_fn based block drivers
---------------------------------------------------------------
-
-These drivers will always see the REQ_PREFLUSH and REQ_FUA bits as they sit
-directly below the submit_bio interface.  For remapping drivers the REQ_FUA
-bits need to be propagated to underlying devices, and a global flush needs
-to be implemented for bios with the REQ_PREFLUSH bit set.  For real device
-drivers that do not have a volatile cache the REQ_PREFLUSH and REQ_FUA bits
-on non-empty bios can simply be ignored, and REQ_PREFLUSH requests without
-data can be completed successfully without doing any work.  Drivers for
-devices with volatile caches need to implement the support for these
-flags themselves without any help from the block layer.
-
-
-Implementation details for request_fn based block drivers
---------------------------------------------------------------
-
-For devices that do not support volatile write caches there is no driver
-support required, the block layer completes empty REQ_PREFLUSH requests before
-entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from
-requests that have a payload.  For devices with volatile write caches the
-driver needs to tell the block layer that it supports flushing caches by
-doing:
-
-	blk_queue_write_cache(sdkp->disk->queue, true, false);
-
-and handle empty REQ_OP_FLUSH requests in its prep_fn/request_fn.  Note that
-REQ_PREFLUSH requests with a payload are automatically turned into a sequence
-of an empty REQ_OP_FLUSH request followed by the actual write by the block
-layer.  For devices that also support the FUA bit the block layer needs
-to be told to pass through the REQ_FUA bit using:
-
-	blk_queue_write_cache(sdkp->disk->queue, true, true);
-
-and the driver must handle write requests that have the REQ_FUA bit set
-in prep_fn/request_fn.  If the FUA bit is not natively supported the block
-layer turns it into an empty REQ_OP_FLUSH request after the actual write.
diff --git a/Documentation/blockdev/drbd/README.txt b/Documentation/blockdev/drbd/README.txt
deleted file mode 100644
index 627b0a1bf35e..000000000000
--- a/Documentation/blockdev/drbd/README.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-Description
-
-  DRBD is a shared-nothing, synchronously replicated block device. It
-  is designed to serve as a building block for high availability
-  clusters and in this context, is a "drop-in" replacement for shared
-  storage. Simplistically, you could see it as a network RAID 1.
-
-  Please visit http://www.drbd.org to find out more.
-
-The here included files are intended to help understand the implementation
-
-DRBD-8.3-data-packets.svg, DRBD-data-packets.svg  
-  relates some functions, and write packets.
-
-conn-states-8.dot, disk-states-8.dot, node-states-8.dot
-  The sub graphs of DRBD's state transitions
diff --git a/Documentation/blockdev/drbd/data-structure-v9.txt b/Documentation/blockdev/drbd/data-structure-v9.txt
deleted file mode 100644
index 1e52a0e32624..000000000000
--- a/Documentation/blockdev/drbd/data-structure-v9.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-This describes the in kernel data structure for DRBD-9. Starting with
-Linux v3.14 we are reorganizing DRBD to use this data structure.
-
-Basic Data Structure
-====================
-
-A node has a number of DRBD resources.  Each such resource has a number of
-devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD
-device is represented by a block device locally.
-
-The DRBD objects are interconnected to form a matrix as depicted below; a
-drbd_peer_device object sits at each intersection between a drbd_device and a
-drbd_connection:
-
-  /--------------+---------------+.....+---------------\
-  |   resource   |    device     |     |    device     |
-  +--------------+---------------+.....+---------------+
-  |  connection  |  peer_device  |     |  peer_device  |
-  +--------------+---------------+.....+---------------+
-  :              :               :     :               :
-  :              :               :     :               :
-  +--------------+---------------+.....+---------------+
-  |  connection  |  peer_device  |     |  peer_device  |
-  \--------------+---------------+.....+---------------/
-
-In this table, horizontally, devices can be accessed from resources by their
-volume number.  Likewise, peer_devices can be accessed from connections by
-their volume number.  Objects in the vertical direction are connected by double
-linked lists.  There are back pointers from peer_devices to their connections a
-devices, and from connections and devices to their resource.
-
-All resources are in the drbd_resources double-linked list.  In addition, all
-devices can be accessed by their minor device number via the drbd_devices idr.
-
-The drbd_resource, drbd_connection, and drbd_device objects are reference
-counted.  The peer_device objects only serve to establish the links between
-devices and connections; their lifetime is determined by the lifetime of the
-device and connection which they reference.
diff --git a/Documentation/blockdev/floppy.txt b/Documentation/blockdev/floppy.txt
deleted file mode 100644
index e2240f5ab64d..000000000000
--- a/Documentation/blockdev/floppy.txt
+++ /dev/null
@@ -1,245 +0,0 @@
-This file describes the floppy driver.
-
-FAQ list:
-=========
-
- A FAQ list may be found in the fdutils package (see below), and also
-at <http://fdutils.linux.lu/faq.html>.
-
-
-LILO configuration options (Thinkpad users, read this)
-======================================================
-
- The floppy driver is configured using the 'floppy=' option in
-lilo. This option can be typed at the boot prompt, or entered in the
-lilo configuration file.
-
- Example: If your kernel is called linux-2.6.9, type the following line
-at the lilo boot prompt (if you have a thinkpad):
-
- linux-2.6.9 floppy=thinkpad
-
-You may also enter the following line in /etc/lilo.conf, in the description
-of linux-2.6.9:
-
- append = "floppy=thinkpad"
-
- Several floppy related options may be given, example:
-
- linux-2.6.9 floppy=daring floppy=two_fdc
- append = "floppy=daring floppy=two_fdc"
-
- If you give options both in the lilo config file and on the boot
-prompt, the option strings of both places are concatenated, the boot
-prompt options coming last. That's why there are also options to
-restore the default behavior.
-
-
-Module configuration options
-============================
-
- If you use the floppy driver as a module, use the following syntax:
-modprobe floppy floppy="<options>"
-
-Example:
- modprobe floppy floppy="omnibook messages"
-
- If you need certain options enabled every time you load the floppy driver,
-you can put:
-
- options floppy floppy="omnibook messages"
-
-in a configuration file in /etc/modprobe.d/.
-
-
- The floppy driver related options are:
-
- floppy=asus_pci
-	Sets the bit mask to allow only units 0 and 1. (default)
-
- floppy=daring
-	Tells the floppy driver that you have a well behaved floppy controller.
-	This allows more efficient and smoother operation, but may fail on
-	certain controllers. This may speed up certain operations.
-
- floppy=0,daring
-	Tells the floppy driver that your floppy controller should be used
-	with caution.
-
- floppy=one_fdc
-	Tells the floppy driver that you have only one floppy controller.
-	(default)
-
- floppy=two_fdc
- floppy=<address>,two_fdc
-	Tells the floppy driver that you have two floppy controllers.
-	The second floppy controller is assumed to be at <address>.
-	This option is not needed if the second controller is at address
-	0x370, and if you use the 'cmos' option.
-
- floppy=thinkpad
-	Tells the floppy driver that you have a Thinkpad. Thinkpads use an
-	inverted convention for the disk change line.
-
- floppy=0,thinkpad
-	Tells the floppy driver that you don't have a Thinkpad.
-
- floppy=omnibook
- floppy=nodma
-	Tells the floppy driver not to use Dma for data transfers.
-	This is needed on HP Omnibooks, which don't have a workable
-	DMA channel for the floppy driver. This option is also useful
-	if you frequently get "Unable to allocate DMA memory" messages.
-	Indeed, dma memory needs to be continuous in physical memory,
-	and is thus harder to find, whereas non-dma buffers may be
-	allocated in virtual memory. However, I advise against this if
-	you have an FDC without a FIFO (8272A or 82072). 82072A and
-	later are OK. You also need at least a 486 to use nodma.
-	If you use nodma mode, I suggest you also set the FIFO
-	threshold to 10 or lower, in order to limit the number of data
-	transfer interrupts.
-
-	If you have a FIFO-able FDC, the floppy driver automatically
-	falls back on non DMA mode if no DMA-able memory can be found.
-	If you want to avoid this, explicitly ask for 'yesdma'.
-
- floppy=yesdma
-	Tells the floppy driver that a workable DMA channel is available.
-	(default)
-
- floppy=nofifo
-	Disables the FIFO entirely. This is needed if you get "Bus
-	master arbitration error" messages from your Ethernet card (or
-	from other devices) while accessing the floppy.
-
- floppy=usefifo
-	Enables the FIFO. (default)
-
- floppy=<threshold>,fifo_depth
-	Sets the FIFO threshold. This is mostly relevant in DMA
-	mode. If this is higher, the floppy driver tolerates more
-	interrupt latency, but it triggers more interrupts (i.e. it
-	imposes more load on the rest of the system). If this is
-	lower, the interrupt latency should be lower too (faster
-	processor). The benefit of a lower threshold is less
-	interrupts.
-
-	To tune the fifo threshold, switch on over/underrun messages
-	using 'floppycontrol --messages'. Then access a floppy
-	disk. If you get a huge amount of "Over/Underrun - retrying"
-	messages, then the fifo threshold is too low. Try with a
-	higher value, until you only get an occasional Over/Underrun.
-	It is a good idea to compile the floppy driver as a module
-	when doing this tuning. Indeed, it allows to try different
-	fifo values without rebooting the machine for each test. Note
-	that you need to do 'floppycontrol --messages' every time you
-	re-insert the module.
-
-	Usually, tuning the fifo threshold should not be needed, as
-	the default (0xa) is reasonable.
-
- floppy=<drive>,<type>,cmos
-	Sets the CMOS type of <drive> to <type>. This is mandatory if
-	you have more than two floppy drives (only two can be
-	described in the physical CMOS), or if your BIOS uses
-	non-standard CMOS types. The CMOS types are:
-
-		0 - Use the value of the physical CMOS
-		1 - 5 1/4 DD
-		2 - 5 1/4 HD
-		3 - 3 1/2 DD
-		4 - 3 1/2 HD
-		5 - 3 1/2 ED
-		6 - 3 1/2 ED
-	       16 - unknown or not installed
-
-	(Note: there are two valid types for ED drives. This is because 5 was
-	initially chosen to represent floppy *tapes*, and 6 for ED drives.
-	AMI ignored this, and used 5 for ED drives. That's why the floppy
-	driver handles both.)
-
- floppy=unexpected_interrupts
-	Print a warning message when an unexpected interrupt is received.
-	(default)
-
- floppy=no_unexpected_interrupts
- floppy=L40SX
-	Don't print a message when an unexpected interrupt is received. This
-	is needed on IBM L40SX laptops in certain video modes. (There seems
-	to be an interaction between video and floppy. The unexpected
-	interrupts affect only performance, and can be safely ignored.)
-
- floppy=broken_dcl
-	Don't use the disk change line, but assume that the disk was
-	changed whenever the device node is reopened. Needed on some
-	boxes where the disk change line is broken or unsupported.
-	This should be regarded as a stopgap measure, indeed it makes
-	floppy operation less efficient due to unneeded cache
-	flushings, and slightly more unreliable. Please verify your
-	cable, connection and jumper settings if you have any DCL
-	problems. However, some older drives, and also some laptops
-	are known not to have a DCL.
-
- floppy=debug
-	Print debugging messages.
-
- floppy=messages
-	Print informational messages for some operations (disk change
-	notifications, warnings about over and underruns, and about
-	autodetection).
-
- floppy=silent_dcl_clear
-	Uses a less noisy way to clear the disk change line (which
-	doesn't involve seeks). Implied by 'daring' option.
-
- floppy=<nr>,irq
-	Sets the floppy IRQ to <nr> instead of 6.
-
- floppy=<nr>,dma
-	Sets the floppy DMA channel to <nr> instead of 2.
-
- floppy=slow
-	Use PS/2 stepping rate:
-	 " PS/2 floppies have much slower step rates than regular floppies.
-	   It's been recommended that take about 1/4 of the default speed
-	   in some more extreme cases."
-
-
-Supporting utilities and additional documentation:
-==================================================
-
- Additional parameters of the floppy driver can be configured at
-runtime. Utilities which do this can be found in the fdutils package.
-This package also contains a new version of mtools which allows to
-access high capacity disks (up to 1992K on a high density 3 1/2 disk!).
-It also contains additional documentation about the floppy driver.
-
-The latest version can be found at fdutils homepage:
- http://fdutils.linux.lu
-
-The fdutils releases can be found at:
- http://fdutils.linux.lu/download.html
- http://www.tux.org/pub/knaff/fdutils/
- ftp://metalab.unc.edu/pub/Linux/utils/disk-management/
-
-Reporting problems about the floppy driver
-==========================================
-
- If you have a question or a bug report about the floppy driver, mail
-me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use
-comp.os.linux.hardware. As the volume in these groups is rather high,
-be sure to include the word "floppy" (or "FLOPPY") in the subject
-line.  If the reported problem happens when mounting floppy disks, be
-sure to mention also the type of the filesystem in the subject line.
-
- Be sure to read the FAQ before mailing/posting any bug reports!
-
- Alain
-
-Changelog
-=========
-
-10-30-2004 :	Cleanup, updating, add reference to module configuration.
-		James Nelson <james4765@gmail.com>
-
-6-3-2000 :	Original Document
diff --git a/Documentation/blockdev/nbd.txt b/Documentation/blockdev/nbd.txt
deleted file mode 100644
index db242ea2bce8..000000000000
--- a/Documentation/blockdev/nbd.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-Network Block Device (TCP version)
-==================================
-
-1) Overview
------------
-
-What is it: With this compiled in the kernel (or as a module), Linux
-can use a remote server as one of its block devices. So every time
-the client computer wants to read, e.g., /dev/nb0, it sends a
-request over TCP to the server, which will reply with the data read.
-This can be used for stations with low disk space (or even diskless)
-to borrow disk space from another computer.
-Unlike NFS, it is possible to put any filesystem on it, etc.
-
-For more information, or to download the nbd-client and nbd-server
-tools, go to http://nbd.sf.net/.
-
-The nbd kernel module need only be installed on the client
-system, as the nbd-server is completely in userspace. In fact,
-the nbd-server has been successfully ported to other operating
-systems, including Windows.
-
-A) NBD parameters
------------------
-
-max_part
-	Number of partitions per device (default: 0).
-
-nbds_max
-	Number of block devices that should be initialized (default: 16).
-
diff --git a/Documentation/blockdev/paride.txt b/Documentation/blockdev/paride.txt
deleted file mode 100644
index ee6717e3771d..000000000000
--- a/Documentation/blockdev/paride.txt
+++ /dev/null
@@ -1,417 +0,0 @@
-
-		Linux and parallel port IDE devices
-
-PARIDE v1.03   (c) 1997-8  Grant Guenther <grant@torque.net>
-
-1. Introduction
-
-Owing to the simplicity and near universality of the parallel port interface
-to personal computers, many external devices such as portable hard-disk,
-CD-ROM, LS-120 and tape drives use the parallel port to connect to their
-host computer.  While some devices (notably scanners) use ad-hoc methods
-to pass commands and data through the parallel port interface, most 
-external devices are actually identical to an internal model, but with
-a parallel-port adapter chip added in.  Some of the original parallel port
-adapters were little more than mechanisms for multiplexing a SCSI bus.
-(The Iomega PPA-3 adapter used in the ZIP drives is an example of this
-approach).  Most current designs, however, take a different approach.
-The adapter chip reproduces a small ISA or IDE bus in the external device
-and the communication protocol provides operations for reading and writing
-device registers, as well as data block transfer functions.  Sometimes,
-the device being addressed via the parallel cable is a standard SCSI
-controller like an NCR 5380.  The "ditto" family of external tape
-drives use the ISA replicator to interface a floppy disk controller,
-which is then connected to a floppy-tape mechanism.  The vast majority
-of external parallel port devices, however, are now based on standard
-IDE type devices, which require no intermediate controller.  If one
-were to open up a parallel port CD-ROM drive, for instance, one would
-find a standard ATAPI CD-ROM drive, a power supply, and a single adapter
-that interconnected a standard PC parallel port cable and a standard
-IDE cable.  It is usually possible to exchange the CD-ROM device with
-any other device using the IDE interface. 
-
-The document describes the support in Linux for parallel port IDE
-devices.  It does not cover parallel port SCSI devices, "ditto" tape
-drives or scanners.  Many different devices are supported by the 
-parallel port IDE subsystem, including:
-
-	MicroSolutions backpack CD-ROM
-	MicroSolutions backpack PD/CD
-	MicroSolutions backpack hard-drives
-	MicroSolutions backpack 8000t tape drive
-	SyQuest EZ-135, EZ-230 & SparQ drives
-	Avatar Shark
-	Imation Superdisk LS-120
-	Maxell Superdisk LS-120
-	FreeCom Power CD 
-	Hewlett-Packard 5GB and 8GB tape drives
-	Hewlett-Packard 7100 and 7200 CD-RW drives
-
-as well as most of the clone and no-name products on the market.
-
-To support such a wide range of devices, PARIDE, the parallel port IDE
-subsystem, is actually structured in three parts.   There is a base
-paride module which provides a registry and some common methods for
-accessing the parallel ports.  The second component is a set of 
-high-level drivers for each of the different types of supported devices: 
-
-	pd	IDE disk
-	pcd	ATAPI CD-ROM
-	pf	ATAPI disk
-	pt	ATAPI tape
-	pg	ATAPI generic
-
-(Currently, the pg driver is only used with CD-R drives).
-
-The high-level drivers function according to the relevant standards.
-The third component of PARIDE is a set of low-level protocol drivers
-for each of the parallel port IDE adapter chips.  Thanks to the interest
-and encouragement of Linux users from many parts of the world, 
-support is available for almost all known adapter protocols:
-
-        aten    ATEN EH-100                            (HK)
-        bpck    Microsolutions backpack                (US)
-        comm    DataStor (old-type) "commuter" adapter (TW)
-        dstr    DataStor EP-2000                       (TW)
-        epat    Shuttle EPAT                           (UK)
-        epia    Shuttle EPIA                           (UK)
-	fit2    FIT TD-2000			       (US)
-	fit3    FIT TD-3000			       (US)
-	friq    Freecom IQ cable                       (DE)
-        frpw    Freecom Power                          (DE)
-        kbic    KingByte KBIC-951A and KBIC-971A       (TW)
-	ktti    KT Technology PHd adapter              (SG)
-        on20    OnSpec 90c20                           (US)
-        on26    OnSpec 90c26                           (US)
-
-
-2. Using the PARIDE subsystem
-
-While configuring the Linux kernel, you may choose either to build
-the PARIDE drivers into your kernel, or to build them as modules.
-
-In either case, you will need to select "Parallel port IDE device support"
-as well as at least one of the high-level drivers and at least one
-of the parallel port communication protocols.  If you do not know
-what kind of parallel port adapter is used in your drive, you could
-begin by checking the file names and any text files on your DOS 
-installation floppy.  Alternatively, you can look at the markings on
-the adapter chip itself.  That's usually sufficient to identify the
-correct device.  
-
-You can actually select all the protocol modules, and allow the PARIDE
-subsystem to try them all for you.
-
-For the "brand-name" products listed above, here are the protocol
-and high-level drivers that you would use:
-
-	Manufacturer		Model		Driver	Protocol
-	
-	MicroSolutions		CD-ROM		pcd	bpck
-	MicroSolutions		PD drive	pf	bpck
-	MicroSolutions		hard-drive	pd	bpck
-	MicroSolutions          8000t tape      pt      bpck
-	SyQuest			EZ, SparQ	pd	epat
-	Imation			Superdisk	pf	epat
-	Maxell                  Superdisk       pf      friq
-	Avatar			Shark		pd	epat
-	FreeCom			CD-ROM		pcd	frpw
-	Hewlett-Packard		5GB Tape	pt	epat
-	Hewlett-Packard		7200e (CD)	pcd	epat
-	Hewlett-Packard		7200e (CD-R)	pg	epat
-
-2.1  Configuring built-in drivers
-
-We recommend that you get to know how the drivers work and how to
-configure them as loadable modules, before attempting to compile a
-kernel with the drivers built-in.
-
-If you built all of your PARIDE support directly into your kernel,
-and you have just a single parallel port IDE device, your kernel should
-locate it automatically for you.  If you have more than one device,
-you may need to give some command line options to your bootloader
-(eg: LILO), how to do that is beyond the scope of this document.
-
-The high-level drivers accept a number of command line parameters, all
-of which are documented in the source files in linux/drivers/block/paride.
-By default, each driver will automatically try all parallel ports it
-can find, and all protocol types that have been installed, until it finds
-a parallel port IDE adapter.  Once it finds one, the probe stops.  So,
-if you have more than one device, you will need to tell the drivers
-how to identify them.  This requires specifying the port address, the
-protocol identification number and, for some devices, the drive's
-chain ID.  While your system is booting, a number of messages are
-displayed on the console.  Like all such messages, they can be
-reviewed with the 'dmesg' command.  Among those messages will be
-some lines like:
-
-	paride: bpck registered as protocol 0
-	paride: epat registered as protocol 1
-
-The numbers will always be the same until you build a new kernel with
-different protocol selections.  You should note these numbers as you
-will need them to identify the devices.
-
-If you happen to be using a MicroSolutions backpack device, you will
-also need to know the unit ID number for each drive.  This is usually
-the last two digits of the drive's serial number (but read MicroSolutions'
-documentation about this).
-
-As an example, let's assume that you have a MicroSolutions PD/CD drive
-with unit ID number 36 connected to the parallel port at 0x378, a SyQuest 
-EZ-135 connected to the chained port on the PD/CD drive and also an 
-Imation Superdisk connected to port 0x278.  You could give the following 
-options on your boot command:
-
-	pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36
-
-In the last option, pf.drive1 configures device /dev/pf1, the 0x378
-is the parallel port base address, the 0 is the protocol registration
-number and 36 is the chain ID.
-
-Please note:  while PARIDE will work both with and without the 
-PARPORT parallel port sharing system that is included by the
-"Parallel port support" option, PARPORT must be included and enabled
-if you want to use chains of devices on the same parallel port.
-
-2.2  Loading and configuring PARIDE as modules
-
-It is much faster and simpler to get to understand the PARIDE drivers
-if you use them as loadable kernel modules.   
-
-Note 1:  using these drivers with the "kerneld" automatic module loading
-system is not recommended for beginners, and is not documented here.  
-
-Note 2:  if you build PARPORT support as a loadable module, PARIDE must
-also be built as loadable modules, and PARPORT must be loaded before the
-PARIDE modules.
-
-To use PARIDE, you must begin by 
-
-	insmod paride
-
-this loads a base module which provides a registry for the protocols,
-among other tasks.
-
-Then, load as many of the protocol modules as you think you might need.
-As you load each module, it will register the protocols that it supports,
-and print a log message to your kernel log file and your console. For 
-example:
-
-	# insmod epat
-	paride: epat registered as protocol 0
-	# insmod kbic
-	paride: k951 registered as protocol 1
-        paride: k971 registered as protocol 2
-
-Finally, you can load high-level drivers for each kind of device that
-you have connected.  By default, each driver will autoprobe for a single 
-device, but you can support up to four similar devices by giving their
-individual co-ordinates when you load the driver.
-
-For example, if you had two no-name CD-ROM drives both using the
-KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc
-you could give the following command:
-
-	# insmod pcd drive0=0x378,1 drive1=0x3bc,1
-
-For most adapters, giving a port address and protocol number is sufficient,
-but check the source files in linux/drivers/block/paride for more 
-information.  (Hopefully someone will write some man pages one day !).
-
-As another example, here's what happens when PARPORT is installed, and
-a SyQuest EZ-135 is attached to port 0x378:
-
-	# insmod paride
-	paride: version 1.0 installed
-	# insmod epat
-	paride: epat registered as protocol 0
-	# insmod pd
-	pd: pd version 1.0, major 45, cluster 64, nice 0
-	pda: Sharing parport1 at 0x378
-	pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1
-	pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media
-	 pda: pda1
-
-Note that the last line is the output from the generic partition table
-scanner - in this case it reports that it has found a disk with one partition.
-
-2.3  Using a PARIDE device
-
-Once the drivers have been loaded, you can access PARIDE devices in the
-same way as their traditional counterparts.  You will probably need to
-create the device "special files".  Here is a simple script that you can
-cut to a file and execute:
-
-#!/bin/bash
-#
-# mkd -- a script to create the device special files for the PARIDE subsystem
-#
-function mkdev {
-  mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1
-}
-#
-function pd {
-  D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) )
-  mkdev pd$D b 45 $[ $1 * 16 ]
-  for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-  do mkdev pd$D$P b 45 $[ $1 * 16 + $P ]
-  done
-}
-#
-cd /dev
-#
-for u in 0 1 2 3 ; do pd $u ; done
-for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done 
-for u in 0 1 2 3 ; do mkdev pf$u  b 47 $u ; done 
-for u in 0 1 2 3 ; do mkdev pt$u  c 96 $u ; done 
-for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done 
-for u in 0 1 2 3 ; do mkdev pg$u  c 97 $u ; done 
-#
-# end of mkd
-
-With the device files and drivers in place, you can access PARIDE devices
-like any other Linux device.   For example, to mount a CD-ROM in pcd0, use:
-
-	mount /dev/pcd0 /cdrom
-
-If you have a fresh Avatar Shark cartridge, and the drive is pda, you
-might do something like:
-
-	fdisk /dev/pda		-- make a new partition table with
-				   partition 1 of type 83
-
-	mke2fs /dev/pda1	-- to build the file system
-
-	mkdir /shark		-- make a place to mount the disk
-
-	mount /dev/pda1 /shark
-
-Devices like the Imation superdisk work in the same way, except that
-they do not have a partition table.  For example to make a 120MB
-floppy that you could share with a DOS system:
-
-	mkdosfs /dev/pf0
-	mount /dev/pf0 /mnt
-
-
-2.4  The pf driver
-
-The pf driver is intended for use with parallel port ATAPI disk
-devices.  The most common devices in this category are PD drives
-and LS-120 drives.  Traditionally, media for these devices are not
-partitioned.  Consequently, the pf driver does not support partitioned
-media.  This may be changed in a future version of the driver. 
-
-2.5  Using the pt driver
-
-The pt driver for parallel port ATAPI tape drives is a minimal driver.
-It does not yet support many of the standard tape ioctl operations. 
-For best performance, a block size of 32KB should be used.  You will
-probably want to set the parallel port delay to 0, if you can.
-
-2.6  Using the pg driver
-
-The pg driver can be used in conjunction with the cdrecord program
-to create CD-ROMs.  Please get cdrecord version 1.6.1 or later
-from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ .  To record CD-R media 
-your parallel port should ideally be set to EPP mode, and the "port delay" 
-should be set to 0.  With those settings it is possible to record at 2x 
-speed without any buffer underruns.  If you cannot get the driver to work
-in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only.
-
-
-3. Troubleshooting
-
-3.1  Use EPP mode if you can
-
-The most common problems that people report with the PARIDE drivers
-concern the parallel port CMOS settings.  At this time, none of the
-PARIDE protocol modules support ECP mode, or any ECP combination modes.
-If you are able to do so, please set your parallel port into EPP mode
-using your CMOS setup procedure.
-
-3.2  Check the port delay
-
-Some parallel ports cannot reliably transfer data at full speed.  To
-offset the errors, the PARIDE protocol modules introduce a "port
-delay" between each access to the i/o ports.  Each protocol sets
-a default value for this delay.  In most cases, the user can override
-the default and set it to 0 - resulting in somewhat higher transfer
-rates.  In some rare cases (especially with older 486 systems) the
-default delays are not long enough.  if you experience corrupt data
-transfers, or unexpected failures, you may wish to increase the
-port delay.   The delay can be programmed using the "driveN" parameters
-to each of the high-level drivers.  Please see the notes above, or
-read the comments at the beginning of the driver source files in
-linux/drivers/block/paride.
-
-3.3  Some drives need a printer reset
-
-There appear to be a number of "noname" external drives on the market
-that do not always power up correctly.  We have noticed this with some
-drives based on OnSpec and older Freecom adapters.  In these rare cases,
-the adapter can often be reinitialised by issuing a "printer reset" on
-the parallel port.  As the reset operation is potentially disruptive in 
-multiple device environments, the PARIDE drivers will not do it 
-automatically.  You can however, force a printer reset by doing:
-
-	insmod lp reset=1
-	rmmod lp
-
-If you have one of these marginal cases, you should probably build
-your paride drivers as modules, and arrange to do the printer reset
-before loading the PARIDE drivers. 
-
-3.4  Use the verbose option and dmesg if you need help
-
-While a lot of testing has gone into these drivers to make them work
-as smoothly as possible, problems will arise.  If you do have problems,
-please check all the obvious things first:  does the drive work in
-DOS with the manufacturer's drivers ?  If that doesn't yield any useful
-clues, then please make sure that only one drive is hooked to your system,
-and that either (a) PARPORT is enabled or (b) no other device driver
-is using your parallel port (check in /proc/ioports).  Then, load the
-appropriate drivers (you can load several protocol modules if you want)
-as in:
-
-	# insmod paride
-	# insmod epat
-	# insmod bpck
-	# insmod kbic
-	...
-	# insmod pd verbose=1
-
-(using the correct driver for the type of device you have, of course).
-The verbose=1 parameter will cause the drivers to log a trace of their
-activity as they attempt to locate your drive.
-
-Use 'dmesg' to capture a log of all the PARIDE messages (any messages
-beginning with paride:, a protocol module's name or a driver's name) and
-include that with your bug report.  You can submit a bug report in one
-of two ways.  Either send it directly to the author of the PARIDE suite,
-by e-mail to grant@torque.net, or join the linux-parport mailing list
-and post your report there.
-
-3.5  For more information or help
-
-You can join the linux-parport mailing list by sending a mail message
-to 
-		linux-parport-request@torque.net
-
-with the single word 
-
-		subscribe
-
-in the body of the mail message (not in the subject line).   Please be
-sure that your mail program is correctly set up when you do this,  as
-the list manager is a robot that will subscribe you using the reply
-address in your mail headers.  REMOVE any anti-spam gimmicks you may
-have in your mail headers, when sending mail to the list server.
-
-You might also find some useful information on the linux-parport
-web pages (although they are not always up to date) at
-
-	http://web.archive.org/web/*/http://www.torque.net/parport/
-
-
diff --git a/Documentation/blockdev/ramdisk.txt b/Documentation/blockdev/ramdisk.txt
deleted file mode 100644
index 501e12e0323e..000000000000
--- a/Documentation/blockdev/ramdisk.txt
+++ /dev/null
@@ -1,174 +0,0 @@
-Using the RAM disk block device with Linux
-------------------------------------------
-
-Contents:
-
-	1) Overview
-	2) Kernel Command Line Parameters
-	3) Using "rdev -r"
-	4) An Example of Creating a Compressed RAM Disk
-
-
-1) Overview
------------
-
-The RAM disk driver is a way to use main system memory as a block device.  It
-is required for initrd, an initial filesystem used if you need to load modules
-in order to access the root filesystem (see Documentation/admin-guide/initrd.rst).  It can
-also be used for a temporary filesystem for crypto work, since the contents
-are erased on reboot.
-
-The RAM disk dynamically grows as more space is required. It does this by using
-RAM from the buffer cache. The driver marks the buffers it is using as dirty
-so that the VM subsystem does not try to reclaim them later.
-
-The RAM disk supports up to 16 RAM disks by default, and can be reconfigured
-to support an unlimited number of RAM disks (at your own risk).  Just change
-the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu
-and (re)build the kernel.
-
-To use RAM disk support with your system, run './MAKEDEV ram' from the /dev
-directory.  RAM disks are all major number 1, and start with minor number 0
-for /dev/ram0, etc.  If used, modern kernels use /dev/ram0 for an initrd.
-
-The new RAM disk also has the ability to load compressed RAM disk images,
-allowing one to squeeze more programs onto an average installation or
-rescue floppy disk.
-
-
-2) Parameters
----------------------------------
-
-2a) Kernel Command Line Parameters
-
-	ramdisk_size=N
-	==============
-
-This parameter tells the RAM disk driver to set up RAM disks of N k size.  The
-default is 4096 (4 MB).
-
-2b) Module parameters
-
-	rd_nr
-	=====
-	/dev/ramX devices created.
-
-	max_part
-	========
-	Maximum partition number.
-
-	rd_size
-	=======
-	See ramdisk_size.
-
-3) Using "rdev -r"
-------------------
-
-The usage of the word (two bytes) that "rdev -r" sets in the kernel image is
-as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up
-to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit
-14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a
-prompt/wait sequence is to be given before trying to read the RAM disk. Since
-the RAM disk dynamically grows as data is being written into it, a size field
-is not required. Bits 11 to 13 are not currently used and may as well be zero.
-These numbers are no magical secrets, as seen below:
-
-./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK     0x07FF
-./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG          0x8000
-./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG            0x4000
-
-Consider a typical two floppy disk setup, where you will have the
-kernel on disk one, and have already put a RAM disk image onto disk #2.
-
-Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk
-starts at an offset of 0 kB from the beginning of the floppy.
-The command line equivalent is: "ramdisk_start=0"
-
-You want bit 14 as one, indicating that a RAM disk is to be loaded.
-The command line equivalent is: "load_ramdisk=1"
-
-You want bit 15 as one, indicating that you want a prompt/keypress
-sequence so that you have a chance to switch floppy disks.
-The command line equivalent is: "prompt_ramdisk=1"
-
-Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word.
-So to create disk one of the set, you would do:
-
-	/usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0
-	/usr/src/linux# rdev /dev/fd0 /dev/fd0
-	/usr/src/linux# rdev -r /dev/fd0 49152
-
-If you make a boot disk that has LILO, then for the above, you would use:
-	append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1"
-Since the default start = 0 and the default prompt = 1, you could use:
-	append = "load_ramdisk=1"
-
-
-4) An Example of Creating a Compressed RAM Disk
-----------------------------------------------
-
-To create a RAM disk image, you will need a spare block device to
-construct it on. This can be the RAM disk device itself, or an
-unused disk partition (such as an unmounted swap partition). For this
-example, we will use the RAM disk device, "/dev/ram0".
-
-Note: This technique should not be done on a machine with less than 8 MB
-of RAM. If using a spare disk partition instead of /dev/ram0, then this
-restriction does not apply.
-
-a) Decide on the RAM disk size that you want. Say 2 MB for this example.
-   Create it by writing to the RAM disk device. (This step is not currently
-   required, but may be in the future.) It is wise to zero out the
-   area (esp. for disks) so that maximal compression is achieved for
-   the unused blocks of the image that you are about to create.
-
-	dd if=/dev/zero of=/dev/ram0 bs=1k count=2048
-
-b) Make a filesystem on it. Say ext2fs for this example.
-
-	mke2fs -vm0 /dev/ram0 2048
-
-c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...)
-   and unmount it again.
-
-d) Compress the contents of the RAM disk. The level of compression
-   will be approximately 50% of the space used by the files. Unused
-   space on the RAM disk will compress to almost nothing.
-
-	dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz
-
-e) Put the kernel onto the floppy
-
-	dd if=zImage of=/dev/fd0 bs=1k
-
-f) Put the RAM disk image onto the floppy, after the kernel. Use an offset
-   that is slightly larger than the kernel, so that you can put another
-   (possibly larger) kernel onto the same floppy later without overlapping
-   the RAM disk image. An offset of 400 kB for kernels about 350 kB in
-   size would be reasonable. Make sure offset+size of ram_image.gz is
-   not larger than the total space on your floppy (usually 1440 kB).
-
-	dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400
-
-g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc.
-   For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would
-   have 2^15 + 2^14 + 400 = 49552.
-
-	rdev /dev/fd0 /dev/fd0
-	rdev -r /dev/fd0 49552
-
-That is it. You now have your boot/root compressed RAM disk floppy. Some
-users may wish to combine steps (d) and (f) by using a pipe.
-
---------------------------------------------------------------------------
-						Paul Gortmaker 12/95
-
-Changelog:
-----------
-
-10-22-04 :	Updated to reflect changes in command line options, remove
-		obsolete references, general cleanup.
-		James Nelson (james4765@gmail.com)
-
-
-12-95 :		Original Document
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
deleted file mode 100644
index 4df0ce271085..000000000000
--- a/Documentation/blockdev/zram.txt
+++ /dev/null
@@ -1,355 +0,0 @@
-zram: Compressed RAM based block devices
-----------------------------------------
-
-* Introduction
-
-The zram module creates RAM based block devices named /dev/zram<id>
-(<id> = 0, 1, ...). Pages written to these disks are compressed and stored
-in memory itself. These disks allow very fast I/O and compression provides
-good amounts of memory savings. Some of the usecases include /tmp storage,
-use as swap disks, various caches under /var and maybe many more :)
-
-Statistics for individual zram devices are exported through sysfs nodes at
-/sys/block/zram<id>/
-
-* Usage
-
-There are several ways to configure and manage zram device(-s):
-a) using zram and zram_control sysfs attributes
-b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org).
-
-In this document we will describe only 'manual' zram configuration steps,
-IOW, zram and zram_control sysfs attributes.
-
-In order to get a better idea about zramctl please consult util-linux
-documentation, zramctl man-page or `zramctl --help'. Please be informed
-that zram maintainers do not develop/maintain util-linux or zramctl, should
-you have any questions please contact util-linux@vger.kernel.org
-
-Following shows a typical sequence of steps for using zram.
-
-WARNING
-=======
-For the sake of simplicity we skip error checking parts in most of the
-examples below. However, it is your sole responsibility to handle errors.
-
-zram sysfs attributes always return negative values in case of errors.
-The list of possible return codes:
--EBUSY	-- an attempt to modify an attribute that cannot be changed once
-the device has been initialised. Please reset device first;
--ENOMEM	-- zram was not able to allocate enough memory to fulfil your
-needs;
--EINVAL	-- invalid input has been provided.
-
-If you use 'echo', the returned value that is changed by 'echo' utility,
-and, in general case, something like:
-
-	echo 3 > /sys/block/zram0/max_comp_streams
-	if [ $? -ne 0 ];
-		handle_error
-	fi
-
-should suffice.
-
-1) Load Module:
-	modprobe zram num_devices=4
-	This creates 4 devices: /dev/zram{0,1,2,3}
-
-num_devices parameter is optional and tells zram how many devices should be
-pre-created. Default: 1.
-
-2) Set max number of compression streams
-Regardless the value passed to this attribute, ZRAM will always
-allocate multiple compression streams - one per online CPUs - thus
-allowing several concurrent compression operations. The number of
-allocated compression streams goes down when some of the CPUs
-become offline. There is no single-compression-stream mode anymore,
-unless you are running a UP system or has only 1 CPU online.
-
-To find out how many streams are currently available:
-	cat /sys/block/zram0/max_comp_streams
-
-3) Select compression algorithm
-Using comp_algorithm device attribute one can see available and
-currently selected (shown in square brackets) compression algorithms,
-change selected compression algorithm (once the device is initialised
-there is no way to change compression algorithm).
-
-Examples:
-	#show supported compression algorithms
-	cat /sys/block/zram0/comp_algorithm
-	lzo [lz4]
-
-	#select lzo compression algorithm
-	echo lzo > /sys/block/zram0/comp_algorithm
-
-For the time being, the `comp_algorithm' content does not necessarily
-show every compression algorithm supported by the kernel. We keep this
-list primarily to simplify device configuration and one can configure
-a new device with a compression algorithm that is not listed in
-`comp_algorithm'. The thing is that, internally, ZRAM uses Crypto API
-and, if some of the algorithms were built as modules, it's impossible
-to list all of them using, for instance, /proc/crypto or any other
-method. This, however, has an advantage of permitting the usage of
-custom crypto compression modules (implementing S/W or H/W compression).
-
-4) Set Disksize
-Set disk size by writing the value to sysfs node 'disksize'.
-The value can be either in bytes or you can use mem suffixes.
-Examples:
-	# Initialize /dev/zram0 with 50MB disksize
-	echo $((50*1024*1024)) > /sys/block/zram0/disksize
-
-	# Using mem suffixes
-	echo 256K > /sys/block/zram0/disksize
-	echo 512M > /sys/block/zram0/disksize
-	echo 1G > /sys/block/zram0/disksize
-
-Note:
-There is little point creating a zram of greater than twice the size of memory
-since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the
-size of the disk when not in use so a huge zram is wasteful.
-
-5) Set memory limit: Optional
-Set memory limit by writing the value to sysfs node 'mem_limit'.
-The value can be either in bytes or you can use mem suffixes.
-In addition, you could change the value in runtime.
-Examples:
-	# limit /dev/zram0 with 50MB memory
-	echo $((50*1024*1024)) > /sys/block/zram0/mem_limit
-
-	# Using mem suffixes
-	echo 256K > /sys/block/zram0/mem_limit
-	echo 512M > /sys/block/zram0/mem_limit
-	echo 1G > /sys/block/zram0/mem_limit
-
-	# To disable memory limit
-	echo 0 > /sys/block/zram0/mem_limit
-
-6) Activate:
-	mkswap /dev/zram0
-	swapon /dev/zram0
-
-	mkfs.ext4 /dev/zram1
-	mount /dev/zram1 /tmp
-
-7) Add/remove zram devices
-
-zram provides a control interface, which enables dynamic (on-demand) device
-addition and removal.
-
-In order to add a new /dev/zramX device, perform read operation on hot_add
-attribute. This will return either new device's device id (meaning that you
-can use /dev/zram<id>) or error code.
-
-Example:
-	cat /sys/class/zram-control/hot_add
-	1
-
-To remove the existing /dev/zramX device (where X is a device id)
-execute
-	echo X > /sys/class/zram-control/hot_remove
-
-8) Stats:
-Per-device statistics are exported as various nodes under /sys/block/zram<id>/
-
-A brief description of exported device attributes. For more details please
-read Documentation/ABI/testing/sysfs-block-zram.
-
-Name            	access            description
-----            	------            -----------
-disksize          	RW	show and set the device's disk size
-initstate         	RO	shows the initialization state of the device
-reset             	WO	trigger device reset
-mem_used_max      	WO	reset the `mem_used_max' counter (see later)
-mem_limit         	WO	specifies the maximum amount of memory ZRAM can use
-				to store the compressed data
-writeback_limit   	WO	specifies the maximum amount of write IO zram can
-				write out to backing device as 4KB unit
-writeback_limit_enable  RW	show and set writeback_limit feature
-max_comp_streams  	RW	the number of possible concurrent compress operations
-comp_algorithm    	RW	show and change the compression algorithm
-compact           	WO	trigger memory compaction
-debug_stat        	RO	this file is used for zram debugging purposes
-backing_dev	  	RW	set up backend storage for zram to write out
-idle		  	WO	mark allocated slot as idle
-
-
-User space is advised to use the following files to read the device statistics.
-
-File /sys/block/zram<id>/stat
-
-Represents block layer statistics. Read Documentation/block/stat.txt for
-details.
-
-File /sys/block/zram<id>/io_stat
-
-The stat file represents device's I/O statistics not accounted by block
-layer and, thus, not available in zram<id>/stat file. It consists of a
-single line of text and contains the following stats separated by
-whitespace:
- failed_reads     the number of failed reads
- failed_writes    the number of failed writes
- invalid_io       the number of non-page-size-aligned I/O requests
- notify_free      Depending on device usage scenario it may account
-                  a) the number of pages freed because of swap slot free
-                  notifications or b) the number of pages freed because of
-                  REQ_OP_DISCARD requests sent by bio. The former ones are
-                  sent to a swap block device when a swap slot is freed,
-                  which implies that this disk is being used as a swap disk.
-                  The latter ones are sent by filesystem mounted with
-                  discard option, whenever some data blocks are getting
-                  discarded.
-
-File /sys/block/zram<id>/mm_stat
-
-The stat file represents device's mm statistics. It consists of a single
-line of text and contains the following stats separated by whitespace:
- orig_data_size   uncompressed size of data stored in this disk.
-		  This excludes same-element-filled pages (same_pages) since
-		  no memory is allocated for them.
-                  Unit: bytes
- compr_data_size  compressed size of data stored in this disk
- mem_used_total   the amount of memory allocated for this disk. This
-                  includes allocator fragmentation and metadata overhead,
-                  allocated for this disk. So, allocator space efficiency
-                  can be calculated using compr_data_size and this statistic.
-                  Unit: bytes
- mem_limit        the maximum amount of memory ZRAM can use to store
-                  the compressed data
- mem_used_max     the maximum amount of memory zram have consumed to
-                  store the data
- same_pages       the number of same element filled pages written to this disk.
-                  No memory is allocated for such pages.
- pages_compacted  the number of pages freed during compaction
- huge_pages	  the number of incompressible pages
-
-File /sys/block/zram<id>/bd_stat
-
-The stat file represents device's backing device statistics. It consists of
-a single line of text and contains the following stats separated by whitespace:
- bd_count	size of data written in backing device.
-		Unit: 4K bytes
- bd_reads	the number of reads from backing device
-		Unit: 4K bytes
- bd_writes	the number of writes to backing device
-		Unit: 4K bytes
-
-9) Deactivate:
-	swapoff /dev/zram0
-	umount /dev/zram1
-
-10) Reset:
-	Write any positive value to 'reset' sysfs node
-	echo 1 > /sys/block/zram0/reset
-	echo 1 > /sys/block/zram1/reset
-
-	This frees all the memory allocated for the given device and
-	resets the disksize to zero. You must set the disksize again
-	before reusing the device.
-
-* Optional Feature
-
-= writeback
-
-With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page
-to backing storage rather than keeping it in memory.
-To use the feature, admin should set up backing device via
-
-	"echo /dev/sda5 > /sys/block/zramX/backing_dev"
-
-before disksize setting. It supports only partition at this moment.
-If admin want to use incompressible page writeback, they could do via
-
-	"echo huge > /sys/block/zramX/write"
-
-To use idle page writeback, first, user need to declare zram pages
-as idle.
-
-	"echo all > /sys/block/zramX/idle"
-
-From now on, any pages on zram are idle pages. The idle mark
-will be removed until someone request access of the block.
-IOW, unless there is access request, those pages are still idle pages.
-
-Admin can request writeback of those idle pages at right timing via
-
-	"echo idle > /sys/block/zramX/writeback"
-
-With the command, zram writeback idle pages from memory to the storage.
-
-If there are lots of write IO with flash device, potentially, it has
-flash wearout problem so that admin needs to design write limitation
-to guarantee storage health for entire product life.
-
-To overcome the concern, zram supports "writeback_limit" feature.
-The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
-any writeback. IOW, if admin want to apply writeback budget, he should
-enable writeback_limit_enable via
-
-	$ echo 1 > /sys/block/zramX/writeback_limit_enable
-
-Once writeback_limit_enable is set, zram doesn't allow any writeback
-until admin set the budget via /sys/block/zramX/writeback_limit.
-
-(If admin doesn't enable writeback_limit_enable, writeback_limit's value
-assigned via /sys/block/zramX/writeback_limit is meaninless.)
-
-If admin want to limit writeback as per-day 400M, he could do it
-like below.
-
-	$ MB_SHIFT=20
-	$ 4K_SHIFT=12
-	$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
-		/sys/block/zram0/writeback_limit.
-	$ echo 1 > /sys/block/zram0/writeback_limit_enable
-
-If admin want to allow further write again once the bugdet is exausted,
-he could do it like below
-
-	$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
-		/sys/block/zram0/writeback_limit
-
-If admin want to see remaining writeback budget since he set,
-
-	$ cat /sys/block/zramX/writeback_limit
-
-If admin want to disable writeback limit, he could do
-
-	$ echo 0 > /sys/block/zramX/writeback_limit_enable
-
-The writeback_limit count will reset whenever you reset zram(e.g.,
-system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of
-writeback happened until you reset the zram to allocate extra writeback
-budget in next setting is user's job.
-
-If admin want to measure writeback count in a certain period, he could
-know it via /sys/block/zram0/bd_stat's 3rd column.
-
-= memory tracking
-
-With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the
-zram block. It could be useful to catch cold or incompressible
-pages of the process with*pagemap.
-If you enable the feature, you could see block state via
-/sys/kernel/debug/zram/zram0/block_state". The output is as follows,
-
-	  300    75.033841 .wh.
-	  301    63.806904 s...
-	  302    63.806919 ..hi
-
-First column is zram's block index.
-Second column is access time since the system was booted
-Third column is state of the block.
-(s: same page
-w: written page to backing store
-h: huge page
-i: idle page)
-
-First line of above example says 300th block is accessed at 75.033841sec
-and the block's state is huge so it is written back to the backing
-storage. It's a debugging feature so anyone shouldn't rely on it to work
-properly.
-
-Nitin Gupta
-ngupta@vflare.org
diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst
index cb402c59eca5..12a246fcf6cb 100644
--- a/Documentation/bpf/bpf_design_QA.rst
+++ b/Documentation/bpf/bpf_design_QA.rst
@@ -172,11 +172,31 @@ registers which makes BPF inefficient virtual machine for 32-bit
 CPU architectures and 32-bit HW accelerators. Can true 32-bit registers
 be added to BPF in the future?
 
-A: NO. The first thing to improve performance on 32-bit archs is to teach
-LLVM to generate code that uses 32-bit subregisters. Then second step
-is to teach verifier to mark operations where zero-ing upper bits
-is unnecessary. Then JITs can take advantage of those markings and
-drastically reduce size of generated code and improve performance.
+A: NO.
+
+But some optimizations on zero-ing the upper 32 bits for BPF registers are
+available, and can be leveraged to improve the performance of JITed BPF
+programs for 32-bit architectures.
+
+Starting with version 7, LLVM is able to generate instructions that operate
+on 32-bit subregisters, provided the option -mattr=+alu32 is passed for
+compiling a program. Furthermore, the verifier can now mark the
+instructions for which zero-ing the upper bits of the destination register
+is required, and insert an explicit zero-extension (zext) instruction
+(a mov32 variant). This means that for architectures without zext hardware
+support, the JIT back-ends do not need to clear the upper bits for
+subregisters written by alu32 instructions or narrow loads. Instead, the
+back-ends simply need to support code generation for that mov32 variant,
+and to overwrite bpf_jit_needs_zext() to make it return "true" (in order to
+enable zext insertion in the verifier).
+
+Note that it is possible for a JIT back-end to have partial hardware
+support for zext. In that case, if verifier zext insertion is enabled,
+it could lead to the insertion of unnecessary zext instructions. Such
+instructions could be removed by creating a simple peephole inside the JIT
+back-end: if one instruction has hardware support for zext and if the next
+instruction is an explicit zext, then the latter can be skipped when doing
+the code generation.
 
 Q: Does BPF have a stable ABI?
 ------------------------------
diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst
index 8820360d00da..4d565d202ce3 100644
--- a/Documentation/bpf/btf.rst
+++ b/Documentation/bpf/btf.rst
@@ -131,7 +131,7 @@ The following sections detail encoding of each kind.
 ``btf_type`` is followed by a ``u32`` with the following bits arrangement::
 
   #define BTF_INT_ENCODING(VAL)   (((VAL) & 0x0f000000) >> 24)
-  #define BTF_INT_OFFSET(VAL)     (((VAL  & 0x00ff0000)) >> 16)
+  #define BTF_INT_OFFSET(VAL)     (((VAL) & 0x00ff0000) >> 16)
   #define BTF_INT_BITS(VAL)       ((VAL)  & 0x000000ff)
 
 The ``BTF_INT_ENCODING`` has the following attributes::
@@ -151,6 +151,7 @@ for the type. The maximum value of ``BTF_INT_BITS()`` is 128.
 
 The ``BTF_INT_OFFSET()`` specifies the starting bit offset to calculate values
 for this int. For example, a bitfield struct member has:
+
  * btf member bit offset 100 from the start of the structure,
  * btf member pointing to an int type,
  * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4``
@@ -160,6 +161,7 @@ from bits ``100 + 2 = 102``.
 
 Alternatively, the bitfield struct member can be the following to access the
 same bits as the above:
+
  * btf member bit offset 102,
  * btf member pointing to an int type,
  * the int type has ``BTF_INT_OFFSET() = 0`` and ``BTF_INT_BITS() = 4``
diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst
index d3fe4cac0c90..801a6ed3f2e5 100644
--- a/Documentation/bpf/index.rst
+++ b/Documentation/bpf/index.rst
@@ -42,6 +42,7 @@ Program types
 .. toctree::
    :maxdepth: 1
 
+   prog_cgroup_sockopt
    prog_cgroup_sysctl
    prog_flow_dissector
 
diff --git a/Documentation/bpf/prog_cgroup_sockopt.rst b/Documentation/bpf/prog_cgroup_sockopt.rst
new file mode 100644
index 000000000000..c47d974629ae
--- /dev/null
+++ b/Documentation/bpf/prog_cgroup_sockopt.rst
@@ -0,0 +1,93 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============================
+BPF_PROG_TYPE_CGROUP_SOCKOPT
+============================
+
+``BPF_PROG_TYPE_CGROUP_SOCKOPT`` program type can be attached to two
+cgroup hooks:
+
+* ``BPF_CGROUP_GETSOCKOPT`` - called every time process executes ``getsockopt``
+  system call.
+* ``BPF_CGROUP_SETSOCKOPT`` - called every time process executes ``setsockopt``
+  system call.
+
+The context (``struct bpf_sockopt``) has associated socket (``sk``) and
+all input arguments: ``level``, ``optname``, ``optval`` and ``optlen``.
+
+BPF_CGROUP_SETSOCKOPT
+=====================
+
+``BPF_CGROUP_SETSOCKOPT`` is triggered *before* the kernel handling of
+sockopt and it has writable context: it can modify the supplied arguments
+before passing them down to the kernel. This hook has access to the cgroup
+and socket local storage.
+
+If BPF program sets ``optlen`` to -1, the control will be returned
+back to the userspace after all other BPF programs in the cgroup
+chain finish (i.e. kernel ``setsockopt`` handling will *not* be executed).
+
+Note, that ``optlen`` can not be increased beyond the user-supplied
+value. It can only be decreased or set to -1. Any other value will
+trigger ``EFAULT``.
+
+Return Type
+-----------
+
+* ``0`` - reject the syscall, ``EPERM`` will be returned to the userspace.
+* ``1`` - success, continue with next BPF program in the cgroup chain.
+
+BPF_CGROUP_GETSOCKOPT
+=====================
+
+``BPF_CGROUP_GETSOCKOPT`` is triggered *after* the kernel handing of
+sockopt. The BPF hook can observe ``optval``, ``optlen`` and ``retval``
+if it's interested in whatever kernel has returned. BPF hook can override
+the values above, adjust ``optlen`` and reset ``retval`` to 0. If ``optlen``
+has been increased above initial ``getsockopt`` value (i.e. userspace
+buffer is too small), ``EFAULT`` is returned.
+
+This hook has access to the cgroup and socket local storage.
+
+Note, that the only acceptable value to set to ``retval`` is 0 and the
+original value that the kernel returned. Any other value will trigger
+``EFAULT``.
+
+Return Type
+-----------
+
+* ``0`` - reject the syscall, ``EPERM`` will be returned to the userspace.
+* ``1`` - success: copy ``optval`` and ``optlen`` to userspace, return
+  ``retval`` from the syscall (note that this can be overwritten by
+  the BPF program from the parent cgroup).
+
+Cgroup Inheritance
+==================
+
+Suppose, there is the following cgroup hierarchy where each cgroup
+has ``BPF_CGROUP_GETSOCKOPT`` attached at each level with
+``BPF_F_ALLOW_MULTI`` flag::
+
+  A (root, parent)
+   \
+    B (child)
+
+When the application calls ``getsockopt`` syscall from the cgroup B,
+the programs are executed from the bottom up: B, A. First program
+(B) sees the result of kernel's ``getsockopt``. It can optionally
+adjust ``optval``, ``optlen`` and reset ``retval`` to 0. After that
+control will be passed to the second (A) program which will see the
+same context as B including any potential modifications.
+
+Same for ``BPF_CGROUP_SETSOCKOPT``: if the program is attached to
+A and B, the trigger order is B, then A. If B does any changes
+to the input arguments (``level``, ``optname``, ``optval``, ``optlen``),
+then the next program in the chain (A) will see those changes,
+*not* the original input ``setsockopt`` arguments. The potentially
+modified values will be then passed down to the kernel.
+
+Example
+=======
+
+See ``tools/testing/selftests/bpf/progs/sockopt_sk.c`` for an example
+of BPF program that handles socket options.
diff --git a/Documentation/bus-devices/ti-gpmc.txt b/Documentation/bus-devices/ti-gpmc.txt
deleted file mode 100644
index cc9ce57e0a26..000000000000
--- a/Documentation/bus-devices/ti-gpmc.txt
+++ /dev/null
@@ -1,122 +0,0 @@
-GPMC (General Purpose Memory Controller):
-=========================================
-
-GPMC is an unified memory controller dedicated to interfacing external
-memory devices like
- * Asynchronous SRAM like memories and application specific integrated
-   circuit devices.
- * Asynchronous, synchronous, and page mode burst NOR flash devices
-   NAND flash
- * Pseudo-SRAM devices
-
-GPMC is found on Texas Instruments SoC's (OMAP based)
-IP details: http://www.ti.com/lit/pdf/spruh73 section 7.1
-
-
-GPMC generic timing calculation:
-================================
-
-GPMC has certain timings that has to be programmed for proper
-functioning of the peripheral, while peripheral has another set of
-timings. To have peripheral work with gpmc, peripheral timings has to
-be translated to the form gpmc can understand. The way it has to be
-translated depends on the connected peripheral. Also there is a
-dependency for certain gpmc timings on gpmc clock frequency. Hence a
-generic timing routine was developed to achieve above requirements.
-
-Generic routine provides a generic method to calculate gpmc timings
-from gpmc peripheral timings. struct gpmc_device_timings fields has to
-be updated with timings from the datasheet of the peripheral that is
-connected to gpmc. A few of the peripheral timings can be fed either
-in time or in cycles, provision to handle this scenario has been
-provided (refer struct gpmc_device_timings definition). It may so
-happen that timing as specified by peripheral datasheet is not present
-in timing structure, in this scenario, try to correlate peripheral
-timing to the one available. If that doesn't work, try to add a new
-field as required by peripheral, educate generic timing routine to
-handle it, make sure that it does not break any of the existing.
-Then there may be cases where peripheral datasheet doesn't mention
-certain fields of struct gpmc_device_timings, zero those entries.
-
-Generic timing routine has been verified to work properly on
-multiple onenand's and tusb6010 peripherals.
-
-A word of caution: generic timing routine has been developed based
-on understanding of gpmc timings, peripheral timings, available
-custom timing routines, a kind of reverse engineering without
-most of the datasheets & hardware (to be exact none of those supported
-in mainline having custom timing routine) and by simulation.
-
-gpmc timing dependency on peripheral timings:
-[<gpmc_timing>: <peripheral timing1>, <peripheral timing2> ...]
-
-1. common
-cs_on: t_ceasu
-adv_on: t_avdasu, t_ceavd
-
-2. sync common
-sync_clk: clk
-page_burst_access: t_bacc
-clk_activation: t_ces, t_avds
-
-3. read async muxed
-adv_rd_off: t_avdp_r
-oe_on: t_oeasu, t_aavdh
-access: t_iaa, t_oe, t_ce, t_aa
-rd_cycle: t_rd_cycle, t_cez_r, t_oez
-
-4. read async non-muxed
-adv_rd_off: t_avdp_r
-oe_on: t_oeasu
-access: t_iaa, t_oe, t_ce, t_aa
-rd_cycle: t_rd_cycle, t_cez_r, t_oez
-
-5. read sync muxed
-adv_rd_off: t_avdp_r, t_avdh
-oe_on: t_oeasu, t_ach, cyc_aavdh_oe
-access: t_iaa, cyc_iaa, cyc_oe
-rd_cycle: t_cez_r, t_oez, t_ce_rdyz
-
-6. read sync non-muxed
-adv_rd_off: t_avdp_r
-oe_on: t_oeasu
-access: t_iaa, cyc_iaa, cyc_oe
-rd_cycle: t_cez_r, t_oez, t_ce_rdyz
-
-7. write async muxed
-adv_wr_off: t_avdp_w
-we_on, wr_data_mux_bus: t_weasu, t_aavdh, cyc_aavhd_we
-we_off: t_wpl
-cs_wr_off: t_wph
-wr_cycle: t_cez_w, t_wr_cycle
-
-8. write async non-muxed
-adv_wr_off: t_avdp_w
-we_on, wr_data_mux_bus: t_weasu
-we_off: t_wpl
-cs_wr_off: t_wph
-wr_cycle: t_cez_w, t_wr_cycle
-
-9. write sync muxed
-adv_wr_off: t_avdp_w, t_avdh
-we_on, wr_data_mux_bus: t_weasu, t_rdyo, t_aavdh, cyc_aavhd_we
-we_off: t_wpl, cyc_wpl
-cs_wr_off: t_wph
-wr_cycle: t_cez_w, t_ce_rdyz
-
-10. write sync non-muxed
-adv_wr_off: t_avdp_w
-we_on, wr_data_mux_bus: t_weasu, t_rdyo
-we_off: t_wpl, cyc_wpl
-cs_wr_off: t_wph
-wr_cycle: t_cez_w, t_ce_rdyz
-
-
-Note: Many of gpmc timings are dependent on other gpmc timings (a few
-gpmc timings purely dependent on other gpmc timings, a reason that
-some of the gpmc timings are missing above), and it will result in
-indirect dependency of peripheral timings to gpmc timings other than
-mentioned above, refer timing routine for more details. To know what
-these peripheral timings correspond to, please see explanations in
-struct gpmc_device_timings definition. And for gpmc timings refer
-IP details (link above).
diff --git a/Documentation/cdrom/Makefile b/Documentation/cdrom/Makefile
deleted file mode 100644
index a19e321928e1..000000000000
--- a/Documentation/cdrom/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-LATEXFILE = cdrom-standard
-
-all:
-	make clean
-	latex $(LATEXFILE)
-	latex $(LATEXFILE)
-	@if [ -x `which gv` ]; then \
-		`dvips -q -t letter -o $(LATEXFILE).ps $(LATEXFILE).dvi` ;\
-		`gv -antialias -media letter -nocenter $(LATEXFILE).ps` ;\
-	else \
-		`xdvi $(LATEXFILE).dvi &` ;\
-	fi
-	make sortofclean
-
-clean:
-	rm -f $(LATEXFILE).ps $(LATEXFILE).dvi $(LATEXFILE).aux $(LATEXFILE).log 
-
-sortofclean:
-	rm -f $(LATEXFILE).aux $(LATEXFILE).log 
-
-
diff --git a/Documentation/cdrom/cdrom-standard.rst b/Documentation/cdrom/cdrom-standard.rst
new file mode 100644
index 000000000000..dde4f7f7fdbf
--- /dev/null
+++ b/Documentation/cdrom/cdrom-standard.rst
@@ -0,0 +1,1063 @@
+=======================
+A Linux CD-ROM standard
+=======================
+
+:Author: David van Leeuwen <david@ElseWare.cistron.nl>
+:Date: 12 March 1999
+:Updated by: Erik Andersen (andersee@debian.org)
+:Updated by: Jens Axboe (axboe@image.dk)
+
+
+Introduction
+============
+
+Linux is probably the Unix-like operating system that supports
+the widest variety of hardware devices. The reasons for this are
+presumably
+
+- The large list of hardware devices available for the many platforms
+  that Linux now supports (i.e., i386-PCs, Sparc Suns, etc.)
+- The open design of the operating system, such that anybody can write a
+  driver for Linux.
+- There is plenty of source code around as examples of how to write a driver.
+
+The openness of Linux, and the many different types of available
+hardware has allowed Linux to support many different hardware devices.
+Unfortunately, the very openness that has allowed Linux to support
+all these different devices has also allowed the behavior of each
+device driver to differ significantly from one device to another.
+This divergence of behavior has been very significant for CD-ROM
+devices; the way a particular drive reacts to a `standard` *ioctl()*
+call varies greatly from one device driver to another. To avoid making
+their drivers totally inconsistent, the writers of Linux CD-ROM
+drivers generally created new device drivers by understanding, copying,
+and then changing an existing one. Unfortunately, this practice did not
+maintain uniform behavior across all the Linux CD-ROM drivers.
+
+This document describes an effort to establish Uniform behavior across
+all the different CD-ROM device drivers for Linux. This document also
+defines the various *ioctl()'s*, and how the low-level CD-ROM device
+drivers should implement them. Currently (as of the Linux 2.1.\ *x*
+development kernels) several low-level CD-ROM device drivers, including
+both IDE/ATAPI and SCSI, now use this Uniform interface.
+
+When the CD-ROM was developed, the interface between the CD-ROM drive
+and the computer was not specified in the standards. As a result, many
+different CD-ROM interfaces were developed. Some of them had their
+own proprietary design (Sony, Mitsumi, Panasonic, Philips), other
+manufacturers adopted an existing electrical interface and changed
+the functionality (CreativeLabs/SoundBlaster, Teac, Funai) or simply
+adapted their drives to one or more of the already existing electrical
+interfaces (Aztech, Sanyo, Funai, Vertos, Longshine, Optics Storage and
+most of the `NoName` manufacturers). In cases where a new drive really
+brought its own interface or used its own command set and flow control
+scheme, either a separate driver had to be written, or an existing
+driver had to be enhanced. History has delivered us CD-ROM support for
+many of these different interfaces. Nowadays, almost all new CD-ROM
+drives are either IDE/ATAPI or SCSI, and it is very unlikely that any
+manufacturer will create a new interface. Even finding drives for the
+old proprietary interfaces is getting difficult.
+
+When (in the 1.3.70's) I looked at the existing software interface,
+which was expressed through `cdrom.h`, it appeared to be a rather wild
+set of commands and data formats [#f1]_. It seemed that many
+features of the software interface had been added to accommodate the
+capabilities of a particular drive, in an *ad hoc* manner. More
+importantly, it appeared that the behavior of the `standard` commands
+was different for most of the different drivers: e. g., some drivers
+close the tray if an *open()* call occurs when the tray is open, while
+others do not. Some drivers lock the door upon opening the device, to
+prevent an incoherent file system, but others don't, to allow software
+ejection. Undoubtedly, the capabilities of the different drives vary,
+but even when two drives have the same capability their drivers'
+behavior was usually different.
+
+.. [#f1]
+   I cannot recollect what kernel version I looked at, then,
+   presumably 1.2.13 and 1.3.34 --- the latest kernel that I was
+   indirectly involved in.
+
+I decided to start a discussion on how to make all the Linux CD-ROM
+drivers behave more uniformly. I began by contacting the developers of
+the many CD-ROM drivers found in the Linux kernel. Their reactions
+encouraged me to write the Uniform CD-ROM Driver which this document is
+intended to describe. The implementation of the Uniform CD-ROM Driver is
+in the file `cdrom.c`. This driver is intended to be an additional software
+layer that sits on top of the low-level device drivers for each CD-ROM drive.
+By adding this additional layer, it is possible to have all the different
+CD-ROM devices behave **exactly** the same (insofar as the underlying
+hardware will allow).
+
+The goal of the Uniform CD-ROM Driver is **not** to alienate driver developers
+whohave not yet taken steps to support this effort. The goal of Uniform CD-ROM
+Driver is simply to give people writing application programs for CD-ROM drives
+**one** Linux CD-ROM interface with consistent behavior for all
+CD-ROM devices. In addition, this also provides a consistent interface
+between the low-level device driver code and the Linux kernel. Care
+is taken that 100% compatibility exists with the data structures and
+programmer's interface defined in `cdrom.h`. This guide was written to
+help CD-ROM driver developers adapt their code to use the Uniform CD-ROM
+Driver code defined in `cdrom.c`.
+
+Personally, I think that the most important hardware interfaces are
+the IDE/ATAPI drives and, of course, the SCSI drives, but as prices
+of hardware drop continuously, it is also likely that people may have
+more than one CD-ROM drive, possibly of mixed types. It is important
+that these drives behave in the same way. In December 1994, one of the
+cheapest CD-ROM drives was a Philips cm206, a double-speed proprietary
+drive. In the months that I was busy writing a Linux driver for it,
+proprietary drives became obsolete and IDE/ATAPI drives became the
+standard. At the time of the last update to this document (November
+1997) it is becoming difficult to even **find** anything less than a
+16 speed CD-ROM drive, and 24 speed drives are common.
+
+.. _cdrom_api:
+
+Standardizing through another software level
+============================================
+
+At the time this document was conceived, all drivers directly
+implemented the CD-ROM *ioctl()* calls through their own routines. This
+led to the danger of different drivers forgetting to do important things
+like checking that the user was giving the driver valid data. More
+importantly, this led to the divergence of behavior, which has already
+been discussed.
+
+For this reason, the Uniform CD-ROM Driver was created to enforce consistent
+CD-ROM drive behavior, and to provide a common set of services to the various
+low-level CD-ROM device drivers. The Uniform CD-ROM Driver now provides another
+software-level, that separates the *ioctl()* and *open()* implementation
+from the actual hardware implementation. Note that this effort has
+made few changes which will affect a user's application programs. The
+greatest change involved moving the contents of the various low-level
+CD-ROM drivers\' header files to the kernel's cdrom directory. This was
+done to help ensure that the user is only presented with only one cdrom
+interface, the interface defined in `cdrom.h`.
+
+CD-ROM drives are specific enough (i. e., different from other
+block-devices such as floppy or hard disc drives), to define a set
+of common **CD-ROM device operations**, *<cdrom-device>_dops*.
+These operations are different from the classical block-device file
+operations, *<block-device>_fops*.
+
+The routines for the Uniform CD-ROM Driver interface level are implemented
+in the file `cdrom.c`. In this file, the Uniform CD-ROM Driver interfaces
+with the kernel as a block device by registering the following general
+*struct file_operations*::
+
+	struct file_operations cdrom_fops = {
+		NULL,			/∗ lseek ∗/
+		block _read ,		/∗ read—general block-dev read ∗/
+		block _write,		/∗ write—general block-dev write ∗/
+		NULL,			/∗ readdir ∗/
+		NULL,			/∗ select ∗/
+		cdrom_ioctl,		/∗ ioctl ∗/
+		NULL,			/∗ mmap ∗/
+		cdrom_open,		/∗ open ∗/
+		cdrom_release,		/∗ release ∗/
+		NULL,			/∗ fsync ∗/
+		NULL,			/∗ fasync ∗/
+		cdrom_media_changed,	/∗ media change ∗/
+		NULL			/∗ revalidate ∗/
+	};
+
+Every active CD-ROM device shares this *struct*. The routines
+declared above are all implemented in `cdrom.c`, since this file is the
+place where the behavior of all CD-ROM-devices is defined and
+standardized. The actual interface to the various types of CD-ROM
+hardware is still performed by various low-level CD-ROM-device
+drivers. These routines simply implement certain **capabilities**
+that are common to all CD-ROM (and really, all removable-media
+devices).
+
+Registration of a low-level CD-ROM device driver is now done through
+the general routines in `cdrom.c`, not through the Virtual File System
+(VFS) any more. The interface implemented in `cdrom.c` is carried out
+through two general structures that contain information about the
+capabilities of the driver, and the specific drives on which the
+driver operates. The structures are:
+
+cdrom_device_ops
+  This structure contains information about the low-level driver for a
+  CD-ROM device. This structure is conceptually connected to the major
+  number of the device (although some drivers may have different
+  major numbers, as is the case for the IDE driver).
+
+cdrom_device_info
+  This structure contains information about a particular CD-ROM drive,
+  such as its device name, speed, etc. This structure is conceptually
+  connected to the minor number of the device.
+
+Registering a particular CD-ROM drive with the Uniform CD-ROM Driver
+is done by the low-level device driver though a call to::
+
+	register_cdrom(struct cdrom_device_info * <device>_info)
+
+The device information structure, *<device>_info*, contains all the
+information needed for the kernel to interface with the low-level
+CD-ROM device driver. One of the most important entries in this
+structure is a pointer to the *cdrom_device_ops* structure of the
+low-level driver.
+
+The device operations structure, *cdrom_device_ops*, contains a list
+of pointers to the functions which are implemented in the low-level
+device driver. When `cdrom.c` accesses a CD-ROM device, it does it
+through the functions in this structure. It is impossible to know all
+the capabilities of future CD-ROM drives, so it is expected that this
+list may need to be expanded from time to time as new technologies are
+developed. For example, CD-R and CD-R/W drives are beginning to become
+popular, and support will soon need to be added for them. For now, the
+current *struct* is::
+
+	struct cdrom_device_ops {
+		int (*open)(struct cdrom_device_info *, int)
+		void (*release)(struct cdrom_device_info *);
+		int (*drive_status)(struct cdrom_device_info *, int);
+		unsigned int (*check_events)(struct cdrom_device_info *,
+					     unsigned int, int);
+		int (*media_changed)(struct cdrom_device_info *, int);
+		int (*tray_move)(struct cdrom_device_info *, int);
+		int (*lock_door)(struct cdrom_device_info *, int);
+		int (*select_speed)(struct cdrom_device_info *, int);
+		int (*select_disc)(struct cdrom_device_info *, int);
+		int (*get_last_session) (struct cdrom_device_info *,
+					 struct cdrom_multisession *);
+		int (*get_mcn)(struct cdrom_device_info *, struct cdrom_mcn *);
+		int (*reset)(struct cdrom_device_info *);
+		int (*audio_ioctl)(struct cdrom_device_info *,
+				   unsigned int, void *);
+		const int capability;		/* capability flags */
+		int (*generic_packet)(struct cdrom_device_info *,
+				      struct packet_command *);
+	};
+
+When a low-level device driver implements one of these capabilities,
+it should add a function pointer to this *struct*. When a particular
+function is not implemented, however, this *struct* should contain a
+NULL instead. The *capability* flags specify the capabilities of the
+CD-ROM hardware and/or low-level CD-ROM driver when a CD-ROM drive
+is registered with the Uniform CD-ROM Driver.
+
+Note that most functions have fewer parameters than their
+*blkdev_fops* counterparts. This is because very little of the
+information in the structures *inode* and *file* is used. For most
+drivers, the main parameter is the *struct* *cdrom_device_info*, from
+which the major and minor number can be extracted. (Most low-level
+CD-ROM drivers don't even look at the major and minor number though,
+since many of them only support one device.) This will be available
+through *dev* in *cdrom_device_info* described below.
+
+The drive-specific, minor-like information that is registered with
+`cdrom.c`, currently contains the following fields::
+
+  struct cdrom_device_info {
+	const struct cdrom_device_ops * ops; 	/* device operations for this major */
+	struct list_head list;			/* linked list of all device_info */
+	struct gendisk * disk;			/* matching block layer disk */
+	void *  handle;				/* driver-dependent data */
+
+	int mask; 				/* mask of capability: disables them */
+	int speed;				/* maximum speed for reading data */
+	int capacity;				/* number of discs in a jukebox */
+
+	unsigned int options:30;		/* options flags */
+	unsigned mc_flags:2;			/*  media-change buffer flags */
+	unsigned int vfs_events;		/*  cached events for vfs path */
+	unsigned int ioctl_events;		/*  cached events for ioctl path */
+	int use_count;				/*  number of times device is opened */
+	char name[20];				/*  name of the device type */
+
+	__u8 sanyo_slot : 2;			/*  Sanyo 3-CD changer support */
+	__u8 keeplocked : 1;			/*  CDROM_LOCKDOOR status */
+	__u8 reserved : 5;			/*  not used yet */
+	int cdda_method;			/*  see CDDA_* flags */
+	__u8 last_sense;			/*  saves last sense key */
+	__u8 media_written;			/*  dirty flag, DVD+RW bookkeeping */
+	unsigned short mmc3_profile;		/*  current MMC3 profile */
+	int for_data;				/*  unknown:TBD */
+	int (*exit)(struct cdrom_device_info *);/*  unknown:TBD */
+	int mrw_mode_page;			/*  which MRW mode page is in use */
+  };
+
+Using this *struct*, a linked list of the registered minor devices is
+built, using the *next* field. The device number, the device operations
+struct and specifications of properties of the drive are stored in this
+structure.
+
+The *mask* flags can be used to mask out some of the capabilities listed
+in *ops->capability*, if a specific drive doesn't support a feature
+of the driver. The value *speed* specifies the maximum head-rate of the
+drive, measured in units of normal audio speed (176kB/sec raw data or
+150kB/sec file system data). The parameters are declared *const*
+because they describe properties of the drive, which don't change after
+registration.
+
+A few registers contain variables local to the CD-ROM drive. The
+flags *options* are used to specify how the general CD-ROM routines
+should behave. These various flags registers should provide enough
+flexibility to adapt to the different users' wishes (and **not** the
+`arbitrary` wishes of the author of the low-level device driver, as is
+the case in the old scheme). The register *mc_flags* is used to buffer
+the information from *media_changed()* to two separate queues. Other
+data that is specific to a minor drive, can be accessed through *handle*,
+which can point to a data structure specific to the low-level driver.
+The fields *use_count*, *next*, *options* and *mc_flags* need not be
+initialized.
+
+The intermediate software layer that `cdrom.c` forms will perform some
+additional bookkeeping. The use count of the device (the number of
+processes that have the device opened) is registered in *use_count*. The
+function *cdrom_ioctl()* will verify the appropriate user-memory regions
+for read and write, and in case a location on the CD is transferred,
+it will `sanitize` the format by making requests to the low-level
+drivers in a standard format, and translating all formats between the
+user-software and low level drivers. This relieves much of the drivers'
+memory checking and format checking and translation. Also, the necessary
+structures will be declared on the program stack.
+
+The implementation of the functions should be as defined in the
+following sections. Two functions **must** be implemented, namely
+*open()* and *release()*. Other functions may be omitted, their
+corresponding capability flags will be cleared upon registration.
+Generally, a function returns zero on success and negative on error. A
+function call should return only after the command has completed, but of
+course waiting for the device should not use processor time.
+
+::
+
+	int open(struct cdrom_device_info *cdi, int purpose)
+
+*Open()* should try to open the device for a specific *purpose*, which
+can be either:
+
+- Open for reading data, as done by `mount()` (2), or the
+  user commands `dd` or `cat`.
+- Open for *ioctl* commands, as done by audio-CD playing programs.
+
+Notice that any strategic code (closing tray upon *open()*, etc.) is
+done by the calling routine in `cdrom.c`, so the low-level routine
+should only be concerned with proper initialization, such as spinning
+up the disc, etc.
+
+::
+
+	void release(struct cdrom_device_info *cdi)
+
+Device-specific actions should be taken such as spinning down the device.
+However, strategic actions such as ejection of the tray, or unlocking
+the door, should be left over to the general routine *cdrom_release()*.
+This is the only function returning type *void*.
+
+.. _cdrom_drive_status:
+
+::
+
+	int drive_status(struct cdrom_device_info *cdi, int slot_nr)
+
+The function *drive_status*, if implemented, should provide
+information on the status of the drive (not the status of the disc,
+which may or may not be in the drive). If the drive is not a changer,
+*slot_nr* should be ignored. In `cdrom.h` the possibilities are listed::
+
+
+	CDS_NO_INFO		/* no information available */
+	CDS_NO_DISC		/* no disc is inserted, tray is closed */
+	CDS_TRAY_OPEN		/* tray is opened */
+	CDS_DRIVE_NOT_READY	/* something is wrong, tray is moving? */
+	CDS_DISC_OK		/* a disc is loaded and everything is fine */
+
+::
+
+	int media_changed(struct cdrom_device_info *cdi, int disc_nr)
+
+This function is very similar to the original function in $struct
+file_operations*. It returns 1 if the medium of the device *cdi->dev*
+has changed since the last call, and 0 otherwise. The parameter
+*disc_nr* identifies a specific slot in a juke-box, it should be
+ignored for single-disc drives. Note that by `re-routing` this
+function through *cdrom_media_changed()*, we can implement separate
+queues for the VFS and a new *ioctl()* function that can report device
+changes to software (e. g., an auto-mounting daemon).
+
+::
+
+	int tray_move(struct cdrom_device_info *cdi, int position)
+
+This function, if implemented, should control the tray movement. (No
+other function should control this.) The parameter *position* controls
+the desired direction of movement:
+
+- 0 Close tray
+- 1 Open tray
+
+This function returns 0 upon success, and a non-zero value upon
+error. Note that if the tray is already in the desired position, no
+action need be taken, and the return value should be 0.
+
+::
+
+	int lock_door(struct cdrom_device_info *cdi, int lock)
+
+This function (and no other code) controls locking of the door, if the
+drive allows this. The value of *lock* controls the desired locking
+state:
+
+- 0 Unlock door, manual opening is allowed
+- 1 Lock door, tray cannot be ejected manually
+
+This function returns 0 upon success, and a non-zero value upon
+error. Note that if the door is already in the requested state, no
+action need be taken, and the return value should be 0.
+
+::
+
+	int select_speed(struct cdrom_device_info *cdi, int speed)
+
+Some CD-ROM drives are capable of changing their head-speed. There
+are several reasons for changing the speed of a CD-ROM drive. Badly
+pressed CD-ROM s may benefit from less-than-maximum head rate. Modern
+CD-ROM drives can obtain very high head rates (up to *24x* is
+common). It has been reported that these drives can make reading
+errors at these high speeds, reducing the speed can prevent data loss
+in these circumstances. Finally, some of these drives can
+make an annoyingly loud noise, which a lower speed may reduce.
+
+This function specifies the speed at which data is read or audio is
+played back. The value of *speed* specifies the head-speed of the
+drive, measured in units of standard cdrom speed (176kB/sec raw data
+or 150kB/sec file system data). So to request that a CD-ROM drive
+operate at 300kB/sec you would call the CDROM_SELECT_SPEED *ioctl*
+with *speed=2*. The special value `0` means `auto-selection`, i. e.,
+maximum data-rate or real-time audio rate. If the drive doesn't have
+this `auto-selection` capability, the decision should be made on the
+current disc loaded and the return value should be positive. A negative
+return value indicates an error.
+
+::
+
+	int select_disc(struct cdrom_device_info *cdi, int number)
+
+If the drive can store multiple discs (a juke-box) this function
+will perform disc selection. It should return the number of the
+selected disc on success, a negative value on error. Currently, only
+the ide-cd driver supports this functionality.
+
+::
+
+	int get_last_session(struct cdrom_device_info *cdi,
+			     struct cdrom_multisession *ms_info)
+
+This function should implement the old corresponding *ioctl()*. For
+device *cdi->dev*, the start of the last session of the current disc
+should be returned in the pointer argument *ms_info*. Note that
+routines in `cdrom.c` have sanitized this argument: its requested
+format will **always** be of the type *CDROM_LBA* (linear block
+addressing mode), whatever the calling software requested. But
+sanitization goes even further: the low-level implementation may
+return the requested information in *CDROM_MSF* format if it wishes so
+(setting the *ms_info->addr_format* field appropriately, of
+course) and the routines in `cdrom.c` will make the transformation if
+necessary. The return value is 0 upon success.
+
+::
+
+	int get_mcn(struct cdrom_device_info *cdi,
+		    struct cdrom_mcn *mcn)
+
+Some discs carry a `Media Catalog Number` (MCN), also called
+`Universal Product Code` (UPC). This number should reflect the number
+that is generally found in the bar-code on the product. Unfortunately,
+the few discs that carry such a number on the disc don't even use the
+same format. The return argument to this function is a pointer to a
+pre-declared memory region of type *struct cdrom_mcn*. The MCN is
+expected as a 13-character string, terminated by a null-character.
+
+::
+
+	int reset(struct cdrom_device_info *cdi)
+
+This call should perform a hard-reset on the drive (although in
+circumstances that a hard-reset is necessary, a drive may very well not
+listen to commands anymore). Preferably, control is returned to the
+caller only after the drive has finished resetting. If the drive is no
+longer listening, it may be wise for the underlying low-level cdrom
+driver to time out.
+
+::
+
+	int audio_ioctl(struct cdrom_device_info *cdi,
+			unsigned int cmd, void *arg)
+
+Some of the CD-ROM-\ *ioctl()*\ 's defined in `cdrom.h` can be
+implemented by the routines described above, and hence the function
+*cdrom_ioctl* will use those. However, most *ioctl()*\ 's deal with
+audio-control. We have decided to leave these to be accessed through a
+single function, repeating the arguments *cmd* and *arg*. Note that
+the latter is of type *void*, rather than *unsigned long int*.
+The routine *cdrom_ioctl()* does do some useful things,
+though. It sanitizes the address format type to *CDROM_MSF* (Minutes,
+Seconds, Frames) for all audio calls. It also verifies the memory
+location of *arg*, and reserves stack-memory for the argument. This
+makes implementation of the *audio_ioctl()* much simpler than in the
+old driver scheme. For example, you may look up the function
+*cm206_audio_ioctl()* `cm206.c` that should be updated with
+this documentation.
+
+An unimplemented ioctl should return *-ENOSYS*, but a harmless request
+(e. g., *CDROMSTART*) may be ignored by returning 0 (success). Other
+errors should be according to the standards, whatever they are. When
+an error is returned by the low-level driver, the Uniform CD-ROM Driver
+tries whenever possible to return the error code to the calling program.
+(We may decide to sanitize the return value in *cdrom_ioctl()* though, in
+order to guarantee a uniform interface to the audio-player software.)
+
+::
+
+	int dev_ioctl(struct cdrom_device_info *cdi,
+		      unsigned int cmd, unsigned long arg)
+
+Some *ioctl()'s* seem to be specific to certain CD-ROM drives. That is,
+they are introduced to service some capabilities of certain drives. In
+fact, there are 6 different *ioctl()'s* for reading data, either in some
+particular kind of format, or audio data. Not many drives support
+reading audio tracks as data, I believe this is because of protection
+of copyrights of artists. Moreover, I think that if audio-tracks are
+supported, it should be done through the VFS and not via *ioctl()'s*. A
+problem here could be the fact that audio-frames are 2352 bytes long,
+so either the audio-file-system should ask for 75264 bytes at once
+(the least common multiple of 512 and 2352), or the drivers should
+bend their backs to cope with this incoherence (to which I would be
+opposed). Furthermore, it is very difficult for the hardware to find
+the exact frame boundaries, since there are no synchronization headers
+in audio frames. Once these issues are resolved, this code should be
+standardized in `cdrom.c`.
+
+Because there are so many *ioctl()'s* that seem to be introduced to
+satisfy certain drivers [#f2]_, any non-standard *ioctl()*\ s
+are routed through the call *dev_ioctl()*. In principle, `private`
+*ioctl()*\ 's should be numbered after the device's major number, and not
+the general CD-ROM *ioctl* number, `0x53`. Currently the
+non-supported *ioctl()'s* are:
+
+	CDROMREADMODE1, CDROMREADMODE2, CDROMREADAUDIO, CDROMREADRAW,
+	CDROMREADCOOKED, CDROMSEEK, CDROMPLAY-BLK and CDROM-READALL
+
+.. [#f2]
+
+   Is there software around that actually uses these? I'd be interested!
+
+.. _cdrom_capabilities:
+
+CD-ROM capabilities
+-------------------
+
+Instead of just implementing some *ioctl* calls, the interface in
+`cdrom.c` supplies the possibility to indicate the **capabilities**
+of a CD-ROM drive. This can be done by ORing any number of
+capability-constants that are defined in `cdrom.h` at the registration
+phase. Currently, the capabilities are any of::
+
+	CDC_CLOSE_TRAY		/* can close tray by software control */
+	CDC_OPEN_TRAY		/* can open tray */
+	CDC_LOCK		/* can lock and unlock the door */
+	CDC_SELECT_SPEED	/* can select speed, in units of * sim*150 ,kB/s */
+	CDC_SELECT_DISC		/* drive is juke-box */
+	CDC_MULTI_SESSION	/* can read sessions *> rm1* */
+	CDC_MCN			/* can read Media Catalog Number */
+	CDC_MEDIA_CHANGED	/* can report if disc has changed */
+	CDC_PLAY_AUDIO		/* can perform audio-functions (play, pause, etc) */
+	CDC_RESET		/* hard reset device */
+	CDC_IOCTLS		/* driver has non-standard ioctls */
+	CDC_DRIVE_STATUS	/* driver implements drive status */
+
+The capability flag is declared *const*, to prevent drivers from
+accidentally tampering with the contents. The capability fags actually
+inform `cdrom.c` of what the driver can do. If the drive found
+by the driver does not have the capability, is can be masked out by
+the *cdrom_device_info* variable *mask*. For instance, the SCSI CD-ROM
+driver has implemented the code for loading and ejecting CD-ROM's, and
+hence its corresponding flags in *capability* will be set. But a SCSI
+CD-ROM drive might be a caddy system, which can't load the tray, and
+hence for this drive the *cdrom_device_info* struct will have set
+the *CDC_CLOSE_TRAY* bit in *mask*.
+
+In the file `cdrom.c` you will encounter many constructions of the type::
+
+	if (cdo->capability & ∼cdi->mask & CDC _⟨capability⟩) ...
+
+There is no *ioctl* to set the mask... The reason is that
+I think it is better to control the **behavior** rather than the
+**capabilities**.
+
+Options
+-------
+
+A final flag register controls the **behavior** of the CD-ROM
+drives, in order to satisfy different users' wishes, hopefully
+independently of the ideas of the respective author who happened to
+have made the drive's support available to the Linux community. The
+current behavior options are::
+
+	CDO_AUTO_CLOSE	/* try to close tray upon device open() */
+	CDO_AUTO_EJECT	/* try to open tray on last device close() */
+	CDO_USE_FFLAGS	/* use file_pointer->f_flags to indicate purpose for open() */
+	CDO_LOCK	/* try to lock door if device is opened */
+	CDO_CHECK_TYPE	/* ensure disc type is data if opened for data */
+
+The initial value of this register is
+`CDO_AUTO_CLOSE | CDO_USE_FFLAGS | CDO_LOCK`, reflecting my own view on user
+interface and software standards. Before you protest, there are two
+new *ioctl()'s* implemented in `cdrom.c`, that allow you to control the
+behavior by software. These are::
+
+	CDROM_SET_OPTIONS	/* set options specified in (int)arg */
+	CDROM_CLEAR_OPTIONS	/* clear options specified in (int)arg */
+
+One option needs some more explanation: *CDO_USE_FFLAGS*. In the next
+newsection we explain what the need for this option is.
+
+A software package `setcd`, available from the Debian distribution
+and `sunsite.unc.edu`, allows user level control of these flags.
+
+
+The need to know the purpose of opening the CD-ROM device
+=========================================================
+
+Traditionally, Unix devices can be used in two different `modes`,
+either by reading/writing to the device file, or by issuing
+controlling commands to the device, by the device's *ioctl()*
+call. The problem with CD-ROM drives, is that they can be used for
+two entirely different purposes. One is to mount removable
+file systems, CD-ROM's, the other is to play audio CD's. Audio commands
+are implemented entirely through *ioctl()\'s*, presumably because the
+first implementation (SUN?) has been such. In principle there is
+nothing wrong with this, but a good control of the `CD player` demands
+that the device can **always** be opened in order to give the
+*ioctl* commands, regardless of the state the drive is in.
+
+On the other hand, when used as a removable-media disc drive (what the
+original purpose of CD-ROM s is) we would like to make sure that the
+disc drive is ready for operation upon opening the device. In the old
+scheme, some CD-ROM drivers don't do any integrity checking, resulting
+in a number of i/o errors reported by the VFS to the kernel when an
+attempt for mounting a CD-ROM on an empty drive occurs. This is not a
+particularly elegant way to find out that there is no CD-ROM inserted;
+it more-or-less looks like the old IBM-PC trying to read an empty floppy
+drive for a couple of seconds, after which the system complains it
+can't read from it. Nowadays we can **sense** the existence of a
+removable medium in a drive, and we believe we should exploit that
+fact. An integrity check on opening of the device, that verifies the
+availability of a CD-ROM and its correct type (data), would be
+desirable.
+
+These two ways of using a CD-ROM drive, principally for data and
+secondarily for playing audio discs, have different demands for the
+behavior of the *open()* call. Audio use simply wants to open the
+device in order to get a file handle which is needed for issuing
+*ioctl* commands, while data use wants to open for correct and
+reliable data transfer. The only way user programs can indicate what
+their *purpose* of opening the device is, is through the *flags*
+parameter (see `open(2)`). For CD-ROM devices, these flags aren't
+implemented (some drivers implement checking for write-related flags,
+but this is not strictly necessary if the device file has correct
+permission flags). Most option flags simply don't make sense to
+CD-ROM devices: *O_CREAT*, *O_NOCTTY*, *O_TRUNC*, *O_APPEND*, and
+*O_SYNC* have no meaning to a CD-ROM.
+
+We therefore propose to use the flag *O_NONBLOCK* to indicate
+that the device is opened just for issuing *ioctl*
+commands. Strictly, the meaning of *O_NONBLOCK* is that opening and
+subsequent calls to the device don't cause the calling process to
+wait. We could interpret this as don't wait until someone has
+inserted some valid data-CD-ROM. Thus, our proposal of the
+implementation for the *open()* call for CD-ROM s is:
+
+- If no other flags are set than *O_RDONLY*, the device is opened
+  for data transfer, and the return value will be 0 only upon successful
+  initialization of the transfer. The call may even induce some actions
+  on the CD-ROM, such as closing the tray.
+- If the option flag *O_NONBLOCK* is set, opening will always be
+  successful, unless the whole device doesn't exist. The drive will take
+  no actions whatsoever.
+
+And what about standards?
+-------------------------
+
+You might hesitate to accept this proposal as it comes from the
+Linux community, and not from some standardizing institute. What
+about SUN, SGI, HP and all those other Unix and hardware vendors?
+Well, these companies are in the lucky position that they generally
+control both the hardware and software of their supported products,
+and are large enough to set their own standard. They do not have to
+deal with a dozen or more different, competing hardware
+configurations\ [#f3]_.
+
+.. [#f3]
+
+   Incidentally, I think that SUN's approach to mounting CD-ROM s is very
+   good in origin: under Solaris a volume-daemon automatically mounts a
+   newly inserted CD-ROM under `/cdrom/*<volume-name>*`.
+
+   In my opinion they should have pushed this
+   further and have **every** CD-ROM on the local area network be
+   mounted at the similar location, i. e., no matter in which particular
+   machine you insert a CD-ROM, it will always appear at the same
+   position in the directory tree, on every system. When I wanted to
+   implement such a user-program for Linux, I came across the
+   differences in behavior of the various drivers, and the need for an
+   *ioctl* informing about media changes.
+
+We believe that using *O_NONBLOCK* to indicate that a device is being opened
+for *ioctl* commands only can be easily introduced in the Linux
+community. All the CD-player authors will have to be informed, we can
+even send in our own patches to the programs. The use of *O_NONBLOCK*
+has most likely no influence on the behavior of the CD-players on
+other operating systems than Linux. Finally, a user can always revert
+to old behavior by a call to
+*ioctl(file_descriptor, CDROM_CLEAR_OPTIONS, CDO_USE_FFLAGS)*.
+
+The preferred strategy of *open()*
+----------------------------------
+
+The routines in `cdrom.c` are designed in such a way that run-time
+configuration of the behavior of CD-ROM devices (of **any** type)
+can be carried out, by the *CDROM_SET/CLEAR_OPTIONS* *ioctls*. Thus, various
+modes of operation can be set:
+
+`CDO_AUTO_CLOSE | CDO_USE_FFLAGS | CDO_LOCK`
+   This is the default setting. (With *CDO_CHECK_TYPE* it will be better, in
+   the future.) If the device is not yet opened by any other process, and if
+   the device is being opened for data (*O_NONBLOCK* is not set) and the
+   tray is found to be open, an attempt to close the tray is made. Then,
+   it is verified that a disc is in the drive and, if *CDO_CHECK_TYPE* is
+   set, that it contains tracks of type `data mode 1`. Only if all tests
+   are passed is the return value zero. The door is locked to prevent file
+   system corruption. If the drive is opened for audio (*O_NONBLOCK* is
+   set), no actions are taken and a value of 0 will be returned.
+
+`CDO_AUTO_CLOSE | CDO_AUTO_EJECT | CDO_LOCK`
+   This mimics the behavior of the current sbpcd-driver. The option flags are
+   ignored, the tray is closed on the first open, if necessary. Similarly,
+   the tray is opened on the last release, i. e., if a CD-ROM is unmounted,
+   it is automatically ejected, such that the user can replace it.
+
+We hope that these option can convince everybody (both driver
+maintainers and user program developers) to adopt the new CD-ROM
+driver scheme and option flag interpretation.
+
+Description of routines in `cdrom.c`
+====================================
+
+Only a few routines in `cdrom.c` are exported to the drivers. In this
+new section we will discuss these, as well as the functions that `take
+over' the CD-ROM interface to the kernel. The header file belonging
+to `cdrom.c` is called `cdrom.h`. Formerly, some of the contents of this
+file were placed in the file `ucdrom.h`, but this file has now been
+merged back into `cdrom.h`.
+
+::
+
+	struct file_operations cdrom_fops
+
+The contents of this structure were described in cdrom_api_.
+A pointer to this structure is assigned to the *fops* field
+of the *struct gendisk*.
+
+::
+
+	int register_cdrom(struct cdrom_device_info *cdi)
+
+This function is used in about the same way one registers *cdrom_fops*
+with the kernel, the device operations and information structures,
+as described in cdrom_api_, should be registered with the
+Uniform CD-ROM Driver::
+
+	register_cdrom(&<device>_info);
+
+
+This function returns zero upon success, and non-zero upon
+failure. The structure *<device>_info* should have a pointer to the
+driver's *<device>_dops*, as in::
+
+	struct cdrom_device_info <device>_info = {
+		<device>_dops;
+		...
+	}
+
+Note that a driver must have one static structure, *<device>_dops*, while
+it may have as many structures *<device>_info* as there are minor devices
+active. *Register_cdrom()* builds a linked list from these.
+
+
+::
+
+	void unregister_cdrom(struct cdrom_device_info *cdi)
+
+Unregistering device *cdi* with minor number *MINOR(cdi->dev)* removes
+the minor device from the list. If it was the last registered minor for
+the low-level driver, this disconnects the registered device-operation
+routines from the CD-ROM interface. This function returns zero upon
+success, and non-zero upon failure.
+
+::
+
+	int cdrom_open(struct inode * ip, struct file * fp)
+
+This function is not called directly by the low-level drivers, it is
+listed in the standard *cdrom_fops*. If the VFS opens a file, this
+function becomes active. A strategy is implemented in this routine,
+taking care of all capabilities and options that are set in the
+*cdrom_device_ops* connected to the device. Then, the program flow is
+transferred to the device_dependent *open()* call.
+
+::
+
+	void cdrom_release(struct inode *ip, struct file *fp)
+
+This function implements the reverse-logic of *cdrom_open()*, and then
+calls the device-dependent *release()* routine. When the use-count has
+reached 0, the allocated buffers are flushed by calls to *sync_dev(dev)*
+and *invalidate_buffers(dev)*.
+
+
+.. _cdrom_ioctl:
+
+::
+
+	int cdrom_ioctl(struct inode *ip, struct file *fp,
+			unsigned int cmd, unsigned long arg)
+
+This function handles all the standard *ioctl* requests for CD-ROM
+devices in a uniform way. The different calls fall into three
+categories: *ioctl()'s* that can be directly implemented by device
+operations, ones that are routed through the call *audio_ioctl()*, and
+the remaining ones, that are presumable device-dependent. Generally, a
+negative return value indicates an error.
+
+Directly implemented *ioctl()'s*
+--------------------------------
+
+The following `old` CD-ROM *ioctl()*\ 's are implemented by directly
+calling device-operations in *cdrom_device_ops*, if implemented and
+not masked:
+
+`CDROMMULTISESSION`
+	Requests the last session on a CD-ROM.
+`CDROMEJECT`
+	Open tray.
+`CDROMCLOSETRAY`
+	Close tray.
+`CDROMEJECT_SW`
+	If *arg\not=0*, set behavior to auto-close (close
+	tray on first open) and auto-eject (eject on last release), otherwise
+	set behavior to non-moving on *open()* and *release()* calls.
+`CDROM_GET_MCN`
+	Get the Media Catalog Number from a CD.
+
+*Ioctl*s routed through *audio_ioctl()*
+---------------------------------------
+
+The following set of *ioctl()'s* are all implemented through a call to
+the *cdrom_fops* function *audio_ioctl()*. Memory checks and
+allocation are performed in *cdrom_ioctl()*, and also sanitization of
+address format (*CDROM_LBA*/*CDROM_MSF*) is done.
+
+`CDROMSUBCHNL`
+	Get sub-channel data in argument *arg* of type
+	`struct cdrom_subchnl *`.
+`CDROMREADTOCHDR`
+	Read Table of Contents header, in *arg* of type
+	`struct cdrom_tochdr *`.
+`CDROMREADTOCENTRY`
+	Read a Table of Contents entry in *arg* and specified by *arg*
+	of type `struct cdrom_tocentry *`.
+`CDROMPLAYMSF`
+	Play audio fragment specified in Minute, Second, Frame format,
+	delimited by *arg* of type `struct cdrom_msf *`.
+`CDROMPLAYTRKIND`
+	Play audio fragment in track-index format delimited by *arg*
+	of type `struct cdrom_ti *`.
+`CDROMVOLCTRL`
+	Set volume specified by *arg* of type `struct cdrom_volctrl *`.
+`CDROMVOLREAD`
+	Read volume into by *arg* of type `struct cdrom_volctrl *`.
+`CDROMSTART`
+	Spin up disc.
+`CDROMSTOP`
+	Stop playback of audio fragment.
+`CDROMPAUSE`
+	Pause playback of audio fragment.
+`CDROMRESUME`
+	Resume playing.
+
+New *ioctl()'s* in `cdrom.c`
+----------------------------
+
+The following *ioctl()'s* have been introduced to allow user programs to
+control the behavior of individual CD-ROM devices. New *ioctl*
+commands can be identified by the underscores in their names.
+
+`CDROM_SET_OPTIONS`
+	Set options specified by *arg*. Returns the option flag register
+	after modification. Use *arg = \rm0* for reading the current flags.
+`CDROM_CLEAR_OPTIONS`
+	Clear options specified by *arg*. Returns the option flag register
+	after modification.
+`CDROM_SELECT_SPEED`
+	Select head-rate speed of disc specified as by *arg* in units
+	of standard cdrom speed (176\,kB/sec raw data or
+	150kB/sec file system data). The value 0 means `auto-select`,
+	i. e., play audio discs at real time and data discs at maximum speed.
+	The value *arg* is checked against the maximum head rate of the
+	drive found in the *cdrom_dops*.
+`CDROM_SELECT_DISC`
+	Select disc numbered *arg* from a juke-box.
+
+	First disc is numbered 0. The number *arg* is checked against the
+	maximum number of discs in the juke-box found in the *cdrom_dops*.
+`CDROM_MEDIA_CHANGED`
+	Returns 1 if a disc has been changed since the last call.
+	Note that calls to *cdrom_media_changed* by the VFS are treated
+	by an independent queue, so both mechanisms will detect a
+	media change once. For juke-boxes, an extra argument *arg*
+	specifies the slot for which the information is given. The special
+	value *CDSL_CURRENT* requests that information about the currently
+	selected slot be returned.
+`CDROM_DRIVE_STATUS`
+	Returns the status of the drive by a call to
+	*drive_status()*. Return values are defined in cdrom_drive_status_.
+	Note that this call doesn't return information on the
+	current playing activity of the drive; this can be polled through
+	an *ioctl* call to *CDROMSUBCHNL*. For juke-boxes, an extra argument
+	*arg* specifies the slot for which (possibly limited) information is
+	given. The special value *CDSL_CURRENT* requests that information
+	about the currently selected slot be returned.
+`CDROM_DISC_STATUS`
+	Returns the type of the disc currently in the drive.
+	It should be viewed as a complement to *CDROM_DRIVE_STATUS*.
+	This *ioctl* can provide *some* information about the current
+	disc that is inserted in the drive. This functionality used to be
+	implemented in the low level drivers, but is now carried out
+	entirely in Uniform CD-ROM Driver.
+
+	The history of development of the CD's use as a carrier medium for
+	various digital information has lead to many different disc types.
+	This *ioctl* is useful only in the case that CDs have \emph {only
+	one} type of data on them. While this is often the case, it is
+	also very common for CDs to have some tracks with data, and some
+	tracks with audio. Because this is an existing interface, rather
+	than fixing this interface by changing the assumptions it was made
+	under, thereby breaking all user applications that use this
+	function, the Uniform CD-ROM Driver implements this *ioctl* as
+	follows: If the CD in question has audio tracks on it, and it has
+	absolutely no CD-I, XA, or data tracks on it, it will be reported
+	as *CDS_AUDIO*. If it has both audio and data tracks, it will
+	return *CDS_MIXED*. If there are no audio tracks on the disc, and
+	if the CD in question has any CD-I tracks on it, it will be
+	reported as *CDS_XA_2_2*. Failing that, if the CD in question
+	has any XA tracks on it, it will be reported as *CDS_XA_2_1*.
+	Finally, if the CD in question has any data tracks on it,
+	it will be reported as a data CD (*CDS_DATA_1*).
+
+	This *ioctl* can return::
+
+		CDS_NO_INFO	/* no information available */
+		CDS_NO_DISC	/* no disc is inserted, or tray is opened */
+		CDS_AUDIO	/* Audio disc (2352 audio bytes/frame) */
+		CDS_DATA_1	/* data disc, mode 1 (2048 user bytes/frame) */
+		CDS_XA_2_1	/* mixed data (XA), mode 2, form 1 (2048 user bytes) */
+		CDS_XA_2_2	/* mixed data (XA), mode 2, form 1 (2324 user bytes) */
+		CDS_MIXED	/* mixed audio/data disc */
+
+	For some information concerning frame layout of the various disc
+	types, see a recent version of `cdrom.h`.
+
+`CDROM_CHANGER_NSLOTS`
+	Returns the number of slots in a juke-box.
+`CDROMRESET`
+	Reset the drive.
+`CDROM_GET_CAPABILITY`
+	Returns the *capability* flags for the drive. Refer to section
+	cdrom_capabilities_ for more information on these flags.
+`CDROM_LOCKDOOR`
+	 Locks the door of the drive. `arg == 0` unlocks the door,
+	 any other value locks it.
+`CDROM_DEBUG`
+	 Turns on debugging info. Only root is allowed to do this.
+	 Same semantics as CDROM_LOCKDOOR.
+
+
+Device dependent *ioctl()'s*
+----------------------------
+
+Finally, all other *ioctl()'s* are passed to the function *dev_ioctl()*,
+if implemented. No memory allocation or verification is carried out.
+
+How to update your driver
+=========================
+
+- Make a backup of your current driver.
+- Get hold of the files `cdrom.c` and `cdrom.h`, they should be in
+  the directory tree that came with this documentation.
+- Make sure you include `cdrom.h`.
+- Change the 3rd argument of *register_blkdev* from `&<your-drive>_fops`
+  to `&cdrom_fops`.
+- Just after that line, add the following to register with the Uniform
+  CD-ROM Driver::
+
+	register_cdrom(&<your-drive>_info);*
+
+  Similarly, add a call to *unregister_cdrom()* at the appropriate place.
+- Copy an example of the device-operations *struct* to your
+  source, e. g., from `cm206.c` *cm206_dops*, and change all
+  entries to names corresponding to your driver, or names you just
+  happen to like. If your driver doesn't support a certain function,
+  make the entry *NULL*. At the entry *capability* you should list all
+  capabilities your driver currently supports. If your driver
+  has a capability that is not listed, please send me a message.
+- Copy the *cdrom_device_info* declaration from the same example
+  driver, and modify the entries according to your needs. If your
+  driver dynamically determines the capabilities of the hardware, this
+  structure should also be declared dynamically.
+- Implement all functions in your `<device>_dops` structure,
+  according to prototypes listed in  `cdrom.h`, and specifications given
+  in cdrom_api_. Most likely you have already implemented
+  the code in a large part, and you will almost certainly need to adapt the
+  prototype and return values.
+- Rename your `<device>_ioctl()` function to *audio_ioctl* and
+  change the prototype a little. Remove entries listed in the first
+  part in cdrom_ioctl_, if your code was OK, these are
+  just calls to the routines you adapted in the previous step.
+- You may remove all remaining memory checking code in the
+  *audio_ioctl()* function that deals with audio commands (these are
+  listed in the second part of cdrom_ioctl_. There is no
+  need for memory allocation either, so most *case*s in the *switch*
+  statement look similar to::
+
+	case CDROMREADTOCENTRY:
+		get_toc_entry\bigl((struct cdrom_tocentry *) arg);
+
+- All remaining *ioctl* cases must be moved to a separate
+  function, *<device>_ioctl*, the device-dependent *ioctl()'s*. Note that
+  memory checking and allocation must be kept in this code!
+- Change the prototypes of *<device>_open()* and
+  *<device>_release()*, and remove any strategic code (i. e., tray
+  movement, door locking, etc.).
+- Try to recompile the drivers. We advise you to use modules, both
+  for `cdrom.o` and your driver, as debugging is much easier this
+  way.
+
+Thanks
+======
+
+Thanks to all the people involved. First, Erik Andersen, who has
+taken over the torch in maintaining `cdrom.c` and integrating much
+CD-ROM-related code in the 2.1-kernel. Thanks to Scott Snyder and
+Gerd Knorr, who were the first to implement this interface for SCSI
+and IDE-CD drivers and added many ideas for extension of the data
+structures relative to kernel~2.0. Further thanks to Heiko Eißfeldt,
+Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard Mönkeberg and Andrew Kroll,
+the Linux CD-ROM device driver developers who were kind
+enough to give suggestions and criticisms during the writing. Finally
+of course, I want to thank Linus Torvalds for making this possible in
+the first place.
diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex
deleted file mode 100644
index f7cd455973f7..000000000000
--- a/Documentation/cdrom/cdrom-standard.tex
+++ /dev/null
@@ -1,1026 +0,0 @@
-\documentclass{article}
-\def\version{$Id: cdrom-standard.tex,v 1.9 1997/12/28 15:42:49 david Exp $}
-\newcommand{\newsection}[1]{\newpage\section{#1}}
-
-\evensidemargin=0pt
-\oddsidemargin=0pt
-\topmargin=-\headheight \advance\topmargin by -\headsep
-\textwidth=15.99cm \textheight=24.62cm % normal A4, 1'' margin
-
-\def\linux{{\sc Linux}}
-\def\cdrom{{\sc cd-rom}}
-\def\UCD{{\sc Uniform cd-rom Driver}}
-\def\cdromc{{\tt {cdrom.c}}}
-\def\cdromh{{\tt {cdrom.h}}}
-\def\fo{\sl}                    % foreign words
-\def\ie{{\fo i.e.}}
-\def\eg{{\fo e.g.}}
-
-\everymath{\it} \everydisplay{\it}
-\catcode `\_=\active \def_{\_\penalty100 }
-\catcode`\<=\active \def<#1>{{\langle\hbox{\rm#1}\rangle}}
-
-\begin{document}
-\title{A \linux\ \cdrom\ standard}
-\author{David van Leeuwen\\{\normalsize\tt david@ElseWare.cistron.nl}
-\\{\footnotesize updated by Erik Andersen {\tt(andersee@debian.org)}}
-\\{\footnotesize updated by Jens Axboe {\tt(axboe@image.dk)}}}
-\date{12 March 1999}
-
-\maketitle
-
-\newsection{Introduction}
-
-\linux\ is probably the Unix-like operating system that supports
-the widest variety of hardware devices. The reasons for this are
-presumably 
-\begin{itemize} 
-\item 
-  The large list of hardware devices available for the many platforms
-  that \linux\ now supports (\ie, i386-PCs, Sparc Suns, etc.)
-\item 
-  The open design of the operating system, such that anybody can write a
-  driver for \linux.
-\item 
-  There is plenty of source code around as examples of how to write a driver.
-\end{itemize}
-The openness of \linux, and the many different types of available
-hardware has allowed \linux\ to support many different hardware devices.
-Unfortunately, the very openness that has allowed \linux\ to support
-all these different devices has also allowed the behavior of each
-device driver to differ significantly from one device to another.
-This divergence of behavior has been very significant for \cdrom\
-devices; the way a particular drive reacts to a `standard' $ioctl()$
-call varies greatly from one device driver to another. To avoid making
-their drivers totally inconsistent, the writers of \linux\ \cdrom\
-drivers generally created new device drivers by understanding, copying,
-and then changing an existing one. Unfortunately, this practice did not
-maintain uniform behavior across all the \linux\ \cdrom\ drivers. 
-
-This document describes an effort to establish Uniform behavior across
-all the different \cdrom\ device drivers for \linux. This document also
-defines the various $ioctl$s, and how the low-level \cdrom\ device
-drivers should implement them. Currently (as of the \linux\ 2.1.$x$
-development kernels) several low-level \cdrom\ device drivers, including
-both IDE/ATAPI and SCSI, now use this Uniform interface.
-
-When the \cdrom\ was developed, the interface between the \cdrom\ drive
-and the computer was not specified in the standards. As a result, many
-different \cdrom\ interfaces were developed. Some of them had their
-own proprietary design (Sony, Mitsumi, Panasonic, Philips), other
-manufacturers adopted an existing electrical interface and changed
-the functionality (CreativeLabs/SoundBlaster, Teac, Funai) or simply
-adapted their drives to one or more of the already existing electrical
-interfaces (Aztech, Sanyo, Funai, Vertos, Longshine, Optics Storage and
-most of the `NoName' manufacturers). In cases where a new drive really
-brought its own interface or used its own command set and flow control
-scheme, either a separate driver had to be written, or an existing
-driver had to be enhanced. History has delivered us \cdrom\ support for
-many of these different interfaces. Nowadays, almost all new \cdrom\
-drives are either IDE/ATAPI or SCSI, and it is very unlikely that any
-manufacturer will create a new interface. Even finding drives for the
-old proprietary interfaces is getting difficult.
-
-When (in the 1.3.70's) I looked at the existing software interface,
-which was expressed through \cdromh, it appeared to be a rather wild
-set of commands and data formats.\footnote{I cannot recollect what
-kernel version I looked at, then, presumably 1.2.13 and 1.3.34---the
-latest kernel that I was indirectly involved in.} It seemed that many
-features of the software interface had been added to accommodate the
-capabilities of a particular drive, in an {\fo ad hoc\/} manner. More
-importantly, it appeared that the behavior of the `standard' commands
-was different for most of the different drivers: \eg, some drivers
-close the tray if an $open()$ call occurs when the tray is open, while
-others do not. Some drivers lock the door upon opening the device, to
-prevent an incoherent file system, but others don't, to allow software
-ejection. Undoubtedly, the capabilities of the different drives vary,
-but even when two drives have the same capability their drivers'
-behavior was usually different.
-
-I decided to start a discussion on how to make all the \linux\ \cdrom\
-drivers behave more uniformly. I began by contacting the developers of
-the many \cdrom\ drivers found in the \linux\ kernel. Their reactions
-encouraged me to write the \UCD\ which this document is intended to
-describe. The implementation of the \UCD\ is in the file \cdromc. This
-driver is intended to be an additional software layer that sits on top
-of the low-level device drivers for each \cdrom\ drive. By adding this
-additional layer, it is possible to have all the different \cdrom\
-devices behave {\em exactly\/} the same (insofar as the underlying
-hardware will allow).
-
-The goal of the \UCD\ is {\em not\/} to alienate driver developers who
-have not yet taken steps to support this effort. The goal of \UCD\ is
-simply to give people writing application programs for \cdrom\ drives
-{\em one\/} \linux\ \cdrom\ interface with consistent behavior for all
-\cdrom\ devices. In addition, this also provides a consistent interface
-between the low-level device driver code and the \linux\ kernel. Care
-is taken that 100\,\% compatibility exists with the data structures and
-programmer's interface defined in \cdromh. This guide was written to
-help \cdrom\ driver developers adapt their code to use the \UCD\ code
-defined in \cdromc.
-
-Personally, I think that the most important hardware interfaces are
-the IDE/ATAPI drives and, of course, the SCSI drives, but as prices
-of hardware drop continuously, it is also likely that people may have
-more than one \cdrom\ drive, possibly of mixed types. It is important
-that these drives behave in the same way. In December 1994, one of the
-cheapest \cdrom\ drives was a Philips cm206, a double-speed proprietary
-drive. In the months that I was busy writing a \linux\ driver for it,
-proprietary drives became obsolete and IDE/ATAPI drives became the
-standard. At the time of the last update to this document (November
-1997) it is becoming difficult to even {\em find} anything less than a
-16 speed \cdrom\ drive, and 24 speed drives are common.
-
-\newsection{Standardizing through another software level}
-\label{cdrom.c}
-
-At the time this document was conceived, all drivers directly
-implemented the \cdrom\ $ioctl()$ calls through their own routines. This
-led to the danger of different drivers forgetting to do important things
-like checking that the user was giving the driver valid data. More
-importantly, this led to the divergence of behavior, which has already
-been discussed.
-
-For this reason, the \UCD\ was created to enforce consistent \cdrom\
-drive behavior, and to provide a common set of services to the various
-low-level \cdrom\ device drivers. The \UCD\ now provides another
-software-level, that separates the $ioctl()$ and $open()$ implementation
-from the actual hardware implementation. Note that this effort has
-made few changes which will affect a user's application programs. The
-greatest change involved moving the contents of the various low-level
-\cdrom\ drivers' header files to the kernel's cdrom directory. This was
-done to help ensure that the user is only presented with only one cdrom
-interface, the interface defined in \cdromh.
-
-\cdrom\ drives are specific enough (\ie, different from other
-block-devices such as floppy or hard disc drives), to define a set
-of common {\em \cdrom\ device operations}, $<cdrom-device>_dops$.
-These operations are different from the classical block-device file
-operations, $<block-device>_fops$.
-
-The routines for the \UCD\ interface level are implemented in the file
-\cdromc. In this file, the \UCD\ interfaces with the kernel as a block
-device by registering the following general $struct\ file_operations$:
-$$
-\halign{$#$\ \hfil&$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr
-struct& file_operations\ cdrom_fops = \{\hidewidth\cr
-        &NULL,                  & lseek \cr
-        &block_read,            & read---general block-dev read \cr
-        &block_write,           & write---general block-dev write \cr
-        &NULL,                  & readdir \cr
-        &NULL,                  & select \cr
-        &cdrom_ioctl,           & ioctl \cr
-        &NULL,                  & mmap \cr
-        &cdrom_open,            & open \cr
-        &cdrom_release,         & release \cr
-        &NULL,                  & fsync \cr
-        &NULL,                  & fasync \cr
-        &cdrom_media_changed,   & media change \cr
-        &NULL                   & revalidate \cr
-\};\cr
-}
-$$ 
-
-Every active \cdrom\ device shares this $struct$. The routines
-declared above are all implemented in \cdromc, since this file is the
-place where the behavior of all \cdrom-devices is defined and
-standardized. The actual interface to the various types of \cdrom\ 
-hardware is still performed by various low-level \cdrom-device
-drivers. These routines simply implement certain {\em capabilities\/}
-that are common to all \cdrom\ (and really, all removable-media
-devices).
-
-Registration of a low-level \cdrom\ device driver is now done through
-the general routines in \cdromc, not through the Virtual File System
-(VFS) any more. The interface implemented in \cdromc\ is carried out
-through two general structures that contain information about the
-capabilities of the driver, and the specific drives on which the
-driver operates. The structures are:
-\begin{description}
-\item[$cdrom_device_ops$] 
-  This structure contains information about the low-level driver for a
-  \cdrom\ device. This structure is conceptually connected to the major
-  number of the device (although some drivers may have different
-  major numbers, as is the case for the IDE driver).
-\item[$cdrom_device_info$] 
-  This structure contains information about a particular \cdrom\ drive,
-  such as its device name, speed, etc. This structure is conceptually
-  connected to the minor number of the device.
-\end{description}
-
-Registering a particular \cdrom\ drive with the \UCD\ is done by the
-low-level device driver though a call to:
-$$register_cdrom(struct\ cdrom_device_info * <device>_info)  
-$$
-The device information structure, $<device>_info$, contains all the
-information needed for the kernel to interface with the low-level
-\cdrom\ device driver. One of the most important entries in this
-structure is a pointer to the $cdrom_device_ops$ structure of the
-low-level driver.
-
-The device operations structure, $cdrom_device_ops$, contains a list
-of pointers to the functions which are implemented in the low-level
-device driver. When \cdromc\ accesses a \cdrom\ device, it does it
-through the functions in this structure. It is impossible to know all
-the capabilities of future \cdrom\ drives, so it is expected that this
-list may need to be expanded from time to time as new technologies are
-developed. For example, CD-R and CD-R/W drives are beginning to become
-popular, and support will soon need to be added for them. For now, the
-current $struct$ is:
-$$
-\halign{$#$\ \hfil&$#$\ \hfil&\hbox to 10em{$#$\hss}&
-  $/*$ \rm# $*/$\hfil\cr
-struct& cdrom_device_ops\ \{ \hidewidth\cr
-  &int& (* open)(struct\ cdrom_device_info *, int)\cr
-  &void& (* release)(struct\ cdrom_device_info *);\cr 
-  &int& (* drive_status)(struct\ cdrom_device_info *, int);\cr     
-  &unsigned\ int& (* check_events)(struct\ cdrom_device_info *, unsigned\ int, int);\cr
-  &int& (* media_changed)(struct\ cdrom_device_info *, int);\cr 
-  &int& (* tray_move)(struct\ cdrom_device_info *, int);\cr
-  &int& (* lock_door)(struct\ cdrom_device_info *, int);\cr
-  &int& (* select_speed)(struct\ cdrom_device_info *, int);\cr
-  &int& (* select_disc)(struct\ cdrom_device_info *, int);\cr
-  &int& (* get_last_session) (struct\ cdrom_device_info *, 
-        struct\ cdrom_multisession *{});\cr
-  &int& (* get_mcn)(struct\ cdrom_device_info *, struct\ cdrom_mcn *{});\cr
-  &int& (* reset)(struct\ cdrom_device_info *);\cr
-  &int& (* audio_ioctl)(struct\ cdrom_device_info *, unsigned\ int, 
-        void *{});\cr 
-\noalign{\medskip}
-  &const\ int& capability;& capability flags \cr
-  &int& (* generic_packet)(struct\ cdrom_device_info *, struct\ packet_command *{});\cr
-\};\cr
-}
-$$
-When a low-level device driver implements one of these capabilities,
-it should add a function pointer to this $struct$. When a particular
-function is not implemented, however, this $struct$ should contain a
-NULL instead. The $capability$ flags specify the capabilities of the
-\cdrom\ hardware and/or low-level \cdrom\ driver when a \cdrom\ drive
-is registered with the \UCD.
-
-Note that most functions have fewer parameters than their
-$blkdev_fops$ counterparts. This is because very little of the
-information in the structures $inode$ and $file$ is used. For most
-drivers, the main parameter is the $struct$ $cdrom_device_info$, from
-which the major and minor number can be extracted. (Most low-level
-\cdrom\ drivers don't even look at the major and minor number though,
-since many of them only support one device.) This will be available
-through $dev$ in $cdrom_device_info$ described below.
-
-The drive-specific, minor-like information that is registered with
-\cdromc, currently contains the following fields:
-$$
-\halign{$#$\ \hfil&$#$\ \hfil&\hbox to 10em{$#$\hss}&
-  $/*$ \rm# $*/$\hfil\cr
-struct& cdrom_device_info\ \{ \hidewidth\cr
-  & const\ struct\ cdrom_device_ops *& ops;& device operations for this major\cr
-  & struct\ list_head& list;& linked list of all device_info\cr
-  & struct\ gendisk *& disk;& matching block layer disk\cr
-  & void *&  handle;& driver-dependent data\cr
-\noalign{\medskip}
-  & int& mask;& mask of capability: disables them \cr
-  & int& speed;& maximum speed for reading data \cr
-  & int& capacity;& number of discs in a jukebox \cr
-\noalign{\medskip}
-  &unsigned\ int& options : 30;& options flags \cr
-  &unsigned& mc_flags : 2;& media-change buffer flags \cr
-  &unsigned\ int& vfs_events;& cached events for vfs path\cr
-  &unsigned\ int& ioctl_events;& cached events for ioctl path\cr
-  & int& use_count;& number of times device is opened\cr
-  & char& name[20];& name of the device type\cr
-\noalign{\medskip}
-  &__u8& sanyo_slot : 2;& Sanyo 3-CD changer support\cr
-  &__u8& keeplocked : 1;& CDROM_LOCKDOOR status\cr
-  &__u8& reserved : 5;& not used yet\cr
-  & int& cdda_method;& see CDDA_* flags\cr
-  &__u8& last_sense;& saves last sense key\cr
-  &__u8& media_written;& dirty flag, DVD+RW bookkeeping\cr
-  &unsigned\ short& mmc3_profile;& current MMC3 profile\cr
-  & int& for_data;& unknown:TBD\cr
-  & int\ (* exit)\ (struct\ cdrom_device_info *);&& unknown:TBD\cr
-  & int& mrw_mode_page;& which MRW mode page is in use\cr
-\}\cr
-}$$
-Using this $struct$, a linked list of the registered minor devices is
-built, using the $next$ field. The device number, the device operations
-struct and specifications of properties of the drive are stored in this
-structure.
-
-The $mask$ flags can be used to mask out some of the capabilities listed
-in $ops\to capability$, if a specific drive doesn't support a feature
-of the driver. The value $speed$ specifies the maximum head-rate of the
-drive, measured in units of normal audio speed (176\,kB/sec raw data or
-150\,kB/sec file system data).  The parameters are declared $const$
-because they describe properties of the drive, which don't change after
-registration.
-
-A few registers contain variables local to the \cdrom\ drive. The
-flags $options$ are used to specify how the general \cdrom\ routines
-should behave. These various flags registers should provide enough
-flexibility to adapt to the different users' wishes (and {\em not\/} the
-`arbitrary' wishes of the author of the low-level device driver, as is
-the case in the old scheme). The register $mc_flags$ is used to buffer
-the information from $media_changed()$ to two separate queues. Other
-data that is specific to a minor drive, can be accessed through $handle$,
-which can point to a data structure specific to the low-level driver.
-The fields $use_count$, $next$, $options$ and $mc_flags$ need not be
-initialized.
-
-The intermediate software layer that \cdromc\ forms will perform some
-additional bookkeeping. The use count of the device (the number of
-processes that have the device opened) is registered in $use_count$. The
-function $cdrom_ioctl()$ will verify the appropriate user-memory regions
-for read and write, and in case a location on the CD is transferred,
-it will `sanitize' the format by making requests to the low-level
-drivers in a standard format, and translating all formats between the
-user-software and low level drivers. This relieves much of the drivers'
-memory checking and format checking and translation. Also, the necessary
-structures will be declared on the program stack.
-
-The implementation of the functions should be as defined in the
-following sections. Two functions {\em must\/} be implemented, namely
-$open()$ and $release()$. Other functions may be omitted, their
-corresponding capability flags will be cleared upon registration.
-Generally, a function returns zero on success and negative on error. A
-function call should return only after the command has completed, but of
-course waiting for the device should not use processor time.
-
-\subsection{$Int\ open(struct\ cdrom_device_info * cdi, int\ purpose)$}
-
-$Open()$ should try to open the device for a specific $purpose$, which
-can be either:
-\begin{itemize}
-\item[0] Open for reading data, as done by {\tt {mount()}} (2), or the
-user commands {\tt {dd}} or {\tt {cat}}.  
-\item[1] Open for $ioctl$ commands, as done by audio-CD playing
-programs.
-\end{itemize}
-Notice that any strategic code (closing tray upon $open()$, etc.)\ is
-done by the calling routine in \cdromc, so the low-level routine
-should only be concerned with proper initialization, such as spinning
-up the disc, etc. % and device-use count
-
-
-\subsection{$Void\ release(struct\ cdrom_device_info * cdi)$}
-
-
-Device-specific actions should be taken such as spinning down the device.
-However, strategic actions such as ejection of the tray, or unlocking
-the door, should be left over to the general routine $cdrom_release()$.
-This is the only function returning type $void$.
-
-\subsection{$Int\ drive_status(struct\ cdrom_device_info * cdi, int\ slot_nr)$}
-\label{drive status}
-
-The function $drive_status$, if implemented, should provide
-information on the status of the drive (not the status of the disc,
-which may or may not be in the drive). If the drive is not a changer,
-$slot_nr$ should be ignored. In \cdromh\ the possibilities are listed: 
-$$
-\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr
-CDS_NO_INFO& no information available\cr
-CDS_NO_DISC& no disc is inserted, tray is closed\cr
-CDS_TRAY_OPEN& tray is opened\cr
-CDS_DRIVE_NOT_READY& something is wrong, tray is moving?\cr
-CDS_DISC_OK& a disc is loaded and everything is fine\cr
-}
-$$
-
-\subsection{$Int\ media_changed(struct\ cdrom_device_info * cdi, int\ disc_nr)$}
-
-This function is very similar to the original function in $struct\ 
-file_operations$. It returns 1 if the medium of the device $cdi\to
-dev$ has changed since the last call, and 0 otherwise. The parameter
-$disc_nr$ identifies a specific slot in a juke-box, it should be
-ignored for single-disc drives.  Note that by `re-routing' this
-function through $cdrom_media_changed()$, we can implement separate
-queues for the VFS and a new $ioctl()$ function that can report device
-changes to software (\eg, an auto-mounting daemon).
-
-\subsection{$Int\ tray_move(struct\ cdrom_device_info * cdi, int\ position)$}
-
-This function, if implemented, should control the tray movement. (No
-other function should control this.) The parameter $position$ controls
-the desired direction of movement:
-\begin{itemize}
-\item[0] Close tray
-\item[1] Open tray
-\end{itemize}
-This function returns 0 upon success, and a non-zero value upon
-error. Note that if the tray is already in the desired position, no
-action need be taken, and the return value should be 0. 
-
-\subsection{$Int\ lock_door(struct\ cdrom_device_info * cdi, int\ lock)$}
-
-This function (and no other code) controls locking of the door, if the
-drive allows this. The value of $lock$ controls the desired locking
-state:
-\begin{itemize}
-\item[0] Unlock door, manual opening is allowed
-\item[1] Lock door, tray cannot be ejected manually
-\end{itemize}
-This function returns 0 upon success, and a non-zero value upon
-error. Note that if the door is already in the requested state, no
-action need be taken, and the return value should be 0. 
-
-\subsection{$Int\ select_speed(struct\ cdrom_device_info * cdi, int\ speed)$}
-
-Some \cdrom\ drives are capable of changing their head-speed. There
-are several reasons for changing the speed of a \cdrom\ drive. Badly
-pressed \cdrom s may benefit from less-than-maximum head rate. Modern
-\cdrom\ drives can obtain very high head rates (up to $24\times$ is
-common).  It has been reported that these drives can make reading
-errors at these high speeds, reducing the speed can prevent data loss
-in these circumstances.  Finally, some of these drives can
-make an annoyingly loud noise, which a lower speed may reduce. %Finally,
-%although the audio-low-pass filters probably aren't designed for it,
-%more than real-time playback of audio might be used for high-speed
-%copying of audio tracks.
-
-This function specifies the speed at which data is read or audio is
-played back. The value of $speed$ specifies the head-speed of the
-drive, measured in units of standard cdrom speed (176\,kB/sec raw data
-or 150\,kB/sec file system data). So to request that a \cdrom\ drive
-operate at 300\,kB/sec you would call the CDROM_SELECT_SPEED $ioctl$
-with $speed=2$. The special value `0' means `auto-selection', \ie,
-maximum data-rate or real-time audio rate. If the drive doesn't have
-this `auto-selection' capability, the decision should be made on the
-current disc loaded and the return value should be positive. A negative
-return value indicates an error.
-
-\subsection{$Int\ select_disc(struct\ cdrom_device_info * cdi, int\ number)$}
-
-If the drive can store multiple discs (a juke-box) this function
-will perform disc selection. It should return the number of the
-selected disc on success, a negative value on error. Currently, only
-the ide-cd driver supports this functionality.
-
-\subsection{$Int\ get_last_session(struct\ cdrom_device_info * cdi, struct\
-  cdrom_multisession * ms_info)$}
-
-This function should implement the old corresponding $ioctl()$. For
-device $cdi\to dev$, the start of the last session of the current disc
-should be returned in the pointer argument $ms_info$. Note that
-routines in \cdromc\ have sanitized this argument: its requested
-format will {\em always\/} be of the type $CDROM_LBA$ (linear block
-addressing mode), whatever the calling software requested. But
-sanitization goes even further: the low-level implementation may
-return the requested information in $CDROM_MSF$ format if it wishes so
-(setting the $ms_info\rightarrow addr_format$ field appropriately, of
-course) and the routines in \cdromc\ will make the transformation if
-necessary. The return value is 0 upon success.
-
-\subsection{$Int\ get_mcn(struct\ cdrom_device_info * cdi, struct\
-  cdrom_mcn * mcn)$}
-
-Some discs carry a `Media Catalog Number' (MCN), also called
-`Universal Product Code' (UPC). This number should reflect the number
-that is generally found in the bar-code on the product. Unfortunately,
-the few discs that carry such a number on the disc don't even use the
-same format. The return argument to this function is a pointer to a
-pre-declared memory region of type $struct\ cdrom_mcn$. The MCN is
-expected as a 13-character string, terminated by a null-character.
-
-\subsection{$Int\ reset(struct\ cdrom_device_info * cdi)$}
-
-This call should perform a hard-reset on the drive (although in
-circumstances that a hard-reset is necessary, a drive may very well not
-listen to commands anymore). Preferably, control is returned to the
-caller only after the drive has finished resetting. If the drive is no
-longer listening, it may be wise for the underlying low-level cdrom
-driver to time out.
-
-\subsection{$Int\ audio_ioctl(struct\ cdrom_device_info * cdi, unsigned\
-  int\ cmd, void * arg)$}
-
-Some of the \cdrom-$ioctl$s defined in \cdromh\ can be
-implemented by the routines described above, and hence the function
-$cdrom_ioctl$ will use those. However, most $ioctl$s deal with
-audio-control. We have decided to leave these to be accessed through a
-single function, repeating the arguments $cmd$ and $arg$. Note that
-the latter is of type $void*{}$, rather than $unsigned\ long\
-int$. The routine $cdrom_ioctl()$ does do some useful things,
-though. It sanitizes the address format type to $CDROM_MSF$ (Minutes,
-Seconds, Frames) for all audio calls. It also verifies the memory
-location of $arg$, and reserves stack-memory for the argument. This
-makes implementation of the $audio_ioctl()$ much simpler than in the
-old driver scheme. For example, you may look up the function
-$cm206_audio_ioctl()$ in {\tt {cm206.c}} that should be updated with
-this documentation. 
-
-An unimplemented ioctl should return $-ENOSYS$, but a harmless request
-(\eg, $CDROMSTART$) may be ignored by returning 0 (success). Other
-errors should be according to the standards, whatever they are. When
-an error is returned by the low-level driver, the \UCD\ tries whenever
-possible to return the error code to the calling program. (We may decide
-to sanitize the return value in $cdrom_ioctl()$ though, in order to
-guarantee a uniform interface to the audio-player software.)
-
-\subsection{$Int\ dev_ioctl(struct\ cdrom_device_info * cdi, unsigned\ int\
-  cmd, unsigned\ long\ arg)$}
-
-Some $ioctl$s seem to be specific to certain \cdrom\ drives. That is,
-they are introduced to service some capabilities of certain drives. In
-fact, there are 6 different $ioctl$s for reading data, either in some
-particular kind of format, or audio data. Not many drives support
-reading audio tracks as data, I believe this is because of protection
-of copyrights of artists. Moreover, I think that if audio-tracks are
-supported, it should be done through the VFS and not via $ioctl$s. A
-problem here could be the fact that audio-frames are 2352 bytes long,
-so either the audio-file-system should ask for 75264 bytes at once
-(the least common multiple of 512 and 2352), or the drivers should
-bend their backs to cope with this incoherence (to which I would be
-opposed).  Furthermore, it is very difficult for the hardware to find
-the exact frame boundaries, since there are no synchronization headers
-in audio frames.  Once these issues are resolved, this code should be
-standardized in \cdromc.
-
-Because there are so many $ioctl$s that seem to be introduced to
-satisfy certain drivers,\footnote{Is there software around that
-  actually uses these? I'd be interested!} any `non-standard' $ioctl$s
-are routed through the call $dev_ioctl()$. In principle, `private'
-$ioctl$s should be numbered after the device's major number, and not
-the general \cdrom\ $ioctl$ number, {\tt {0x53}}. Currently the
-non-supported $ioctl$s are: {\it CDROMREADMODE1, CDROMREADMODE2,
-  CDROMREADAUDIO, CDROMREADRAW, CDROMREADCOOKED, CDROMSEEK,
-  CDROMPLAY\-BLK and CDROM\-READALL}.
-
-
-\subsection{\cdrom\ capabilities}
-\label{capability}
-
-Instead of just implementing some $ioctl$ calls, the interface in
-\cdromc\ supplies the possibility to indicate the {\em capabilities\/}
-of a \cdrom\ drive. This can be done by ORing any number of
-capability-constants that are defined in \cdromh\ at the registration
-phase. Currently, the capabilities are any of:
-$$
-\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr
-CDC_CLOSE_TRAY& can close tray by software control\cr
-CDC_OPEN_TRAY& can open tray\cr
-CDC_LOCK& can lock and unlock the door\cr
-CDC_SELECT_SPEED& can select speed, in units of $\sim$150\,kB/s\cr
-CDC_SELECT_DISC& drive is juke-box\cr
-CDC_MULTI_SESSION& can read sessions $>\rm1$\cr
-CDC_MCN& can read Media Catalog Number\cr
-CDC_MEDIA_CHANGED& can report if disc has changed\cr
-CDC_PLAY_AUDIO& can perform audio-functions (play, pause, etc)\cr
-CDC_RESET& hard reset device\cr
-CDC_IOCTLS& driver has non-standard ioctls\cr
-CDC_DRIVE_STATUS& driver implements drive status\cr
-}
-$$
-The capability flag is declared $const$, to prevent drivers from
-accidentally tampering with the contents. The capability fags actually
-inform \cdromc\ of what the driver can do. If the drive found
-by the driver does not have the capability, is can be masked out by
-the $cdrom_device_info$ variable $mask$. For instance, the SCSI \cdrom\
-driver has implemented the code for loading and ejecting \cdrom's, and
-hence its corresponding flags in $capability$ will be set. But a SCSI
-\cdrom\ drive might be a caddy system, which can't load the tray, and
-hence for this drive the $cdrom_device_info$ struct will have set
-the $CDC_CLOSE_TRAY$ bit in $mask$.
-
-In the file \cdromc\ you will encounter many constructions of the type
-$$\it
-if\ (cdo\rightarrow capability \mathrel\& \mathord{\sim} cdi\rightarrow mask 
-   \mathrel{\&} CDC_<capability>) \ldots
-$$
-There is no $ioctl$ to set the mask\dots The reason is that
-I think it is better to control the {\em behavior\/} rather than the
-{\em capabilities}.
-
-\subsection{Options}
-
-A final flag register controls the {\em behavior\/} of the \cdrom\
-drives, in order to satisfy different users' wishes, hopefully
-independently of the ideas of the respective author who happened to
-have made the drive's support available to the \linux\ community. The
-current behavior options are:
-$$
-\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr
-CDO_AUTO_CLOSE& try to close tray upon device $open()$\cr
-CDO_AUTO_EJECT& try to open tray on last device $close()$\cr
-CDO_USE_FFLAGS& use $file_pointer\rightarrow f_flags$ to indicate
- purpose for $open()$\cr
-CDO_LOCK& try to lock door if device is opened\cr
-CDO_CHECK_TYPE& ensure disc type is data if opened for data\cr
-}
-$$
-
-The initial value of this register is $CDO_AUTO_CLOSE \mathrel|
-CDO_USE_FFLAGS \mathrel| CDO_LOCK$, reflecting my own view on user
-interface and software standards. Before you protest, there are two
-new $ioctl$s implemented in \cdromc, that allow you to control the
-behavior by software. These are:
-$$
-\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr
-CDROM_SET_OPTIONS& set options specified in $(int)\ arg$\cr
-CDROM_CLEAR_OPTIONS& clear options specified in $(int)\ arg$\cr
-}
-$$
-One option needs some more explanation: $CDO_USE_FFLAGS$. In the next
-newsection we explain what the need for this option is.
-
-A software package {\tt setcd}, available from the Debian distribution
-and {\tt sunsite.unc.edu}, allows user level control of these flags. 
-
-\newsection{The need to know the purpose of opening the \cdrom\ device}
-
-Traditionally, Unix devices can be used in two different `modes',
-either by reading/writing to the device file, or by issuing
-controlling commands to the device, by the device's $ioctl()$
-call. The problem with \cdrom\ drives, is that they can be used for
-two entirely different purposes. One is to mount removable
-file systems, \cdrom s, the other is to play audio CD's. Audio commands
-are implemented entirely through $ioctl$s, presumably because the
-first implementation (SUN?) has been such. In principle there is
-nothing wrong with this, but a good control of the `CD player' demands
-that the device can {\em always\/} be opened in order to give the
-$ioctl$ commands, regardless of the state the drive is in. 
-
-On the other hand, when used as a removable-media disc drive (what the
-original purpose of \cdrom s is) we would like to make sure that the
-disc drive is ready for operation upon opening the device. In the old
-scheme, some \cdrom\ drivers don't do any integrity checking, resulting
-in a number of i/o errors reported by the VFS to the kernel when an
-attempt for mounting a \cdrom\ on an empty drive occurs. This is not a
-particularly elegant way to find out that there is no \cdrom\ inserted;
-it more-or-less looks like the old IBM-PC trying to read an empty floppy
-drive for a couple of seconds, after which the system complains it
-can't read from it. Nowadays we can {\em sense\/} the existence of a
-removable medium in a drive, and we believe we should exploit that
-fact. An integrity check on opening of the device, that verifies the
-availability of a \cdrom\ and its correct type (data), would be
-desirable.
-
-These two ways of using a \cdrom\ drive, principally for data and
-secondarily for playing audio discs, have different demands for the
-behavior of the $open()$ call. Audio use simply wants to open the
-device in order to get a file handle which is needed for issuing
-$ioctl$ commands, while data use wants to open for correct and
-reliable data transfer. The only way user programs can indicate what
-their {\em purpose\/} of opening the device is, is through the $flags$
-parameter (see {\tt {open(2)}}). For \cdrom\ devices, these flags aren't
-implemented (some drivers implement checking for write-related flags,
-but this is not strictly necessary if the device file has correct
-permission flags). Most option flags simply don't make sense to
-\cdrom\ devices: $O_CREAT$, $O_NOCTTY$, $O_TRUNC$, $O_APPEND$, and
-$O_SYNC$ have no meaning to a \cdrom. 
-
-We therefore propose to use the flag $O_NONBLOCK$ to indicate
-that the device is opened just for issuing $ioctl$
-commands. Strictly, the meaning of $O_NONBLOCK$ is that opening and
-subsequent calls to the device don't cause the calling process to
-wait. We could interpret this as ``don't wait until someone has
-inserted some valid data-\cdrom.'' Thus, our proposal of the
-implementation for the $open()$ call for \cdrom s is:
-\begin{itemize}
-\item If no other flags are set than $O_RDONLY$, the device is opened
-for data transfer, and the return value will be 0 only upon successful
-initialization of the transfer. The call may even induce some actions
-on the \cdrom, such as closing the tray.  
-\item If the option flag $O_NONBLOCK$ is set, opening will always be
-successful, unless the whole device doesn't exist. The drive will take
-no actions whatsoever. 
-\end{itemize}
-
-\subsection{And what about standards?}
-
-You might hesitate to accept this proposal as it comes from the
-\linux\ community, and not from some standardizing institute. What
-about SUN, SGI, HP and all those other Unix and hardware vendors?
-Well, these companies are in the lucky position that they generally
-control both the hardware and software of their supported products,
-and are large enough to set their own standard. They do not have to
-deal with a dozen or more different, competing hardware
-configurations.\footnote{Incidentally, I think that SUN's approach to
-mounting \cdrom s is very good in origin: under Solaris a
-volume-daemon automatically mounts a newly inserted \cdrom\ under {\tt
-{/cdrom/$<volume-name>$/}}. In my opinion they should have pushed this
-further and have {\em every\/} \cdrom\ on the local area network be
-mounted at the similar location, \ie, no matter in which particular
-machine you insert a \cdrom, it will always appear at the same
-position in the directory tree, on every system. When I wanted to
-implement such a user-program for \linux, I came across the
-differences in behavior of the various drivers, and the need for an
-$ioctl$ informing about media changes.}
-
-We believe that using $O_NONBLOCK$ to indicate that a device is being opened
-for $ioctl$ commands only can be easily introduced in the \linux\
-community. All the CD-player authors will have to be informed, we can
-even send in our own patches to the programs. The use of $O_NONBLOCK$
-has most likely no influence on the behavior of the CD-players on
-other operating systems than \linux. Finally, a user can always revert
-to old behavior by a call to $ioctl(file_descriptor, CDROM_CLEAR_OPTIONS,
-CDO_USE_FFLAGS)$. 
-
-\subsection{The preferred strategy of $open()$}
-
-The routines in \cdromc\ are designed in such a way that run-time
-configuration of the behavior of \cdrom\ devices (of {\em any\/} type)
-can be carried out, by the $CDROM_SET/CLEAR_OPTIONS$ $ioctls$. Thus, various
-modes of operation can be set:
-\begin{description}
-\item[$CDO_AUTO_CLOSE \mathrel| CDO_USE_FFLAGS \mathrel| CDO_LOCK$] This
-is the default setting. (With $CDO_CHECK_TYPE$ it will be better, in the
-future.) If the device is not yet opened by any other process, and if
-the device is being opened for data ($O_NONBLOCK$ is not set) and the
-tray is found to be open, an attempt to close the tray is made. Then,
-it is verified that a disc is in the drive and, if $CDO_CHECK_TYPE$ is
-set, that it contains tracks of type `data mode 1.' Only if all tests
-are passed is the return value zero. The door is locked to prevent file
-system corruption. If the drive is opened for audio ($O_NONBLOCK$ is
-set), no actions are taken and a value of 0 will be returned. 
-\item[$CDO_AUTO_CLOSE \mathrel| CDO_AUTO_EJECT \mathrel| CDO_LOCK$] This
-mimics the behavior of the current sbpcd-driver. The option flags are
-ignored, the tray is closed on the first open, if necessary. Similarly,
-the tray is opened on the last release, \ie, if a \cdrom\ is unmounted,
-it is automatically ejected, such that the user can replace it.
-\end{description} 
-We hope that these option can convince everybody (both driver
-maintainers and user program developers) to adopt the new \cdrom\
-driver scheme and option flag interpretation.
-
-\newsection{Description of routines in \cdromc}
-
-Only a few routines in \cdromc\ are exported to the drivers. In this
-new section we will discuss these, as well as the functions that `take
-over' the \cdrom\ interface to the kernel. The header file belonging
-to \cdromc\ is called \cdromh. Formerly, some of the contents of this
-file were placed in the file {\tt {ucdrom.h}}, but this file has now been
-merged back into \cdromh.
-
-\subsection{$Struct\ file_operations\ cdrom_fops$}
-
-The contents of this structure were described in section~\ref{cdrom.c}.
-A pointer to this structure is assigned to the $fops$ field
-of the $struct gendisk$.
-
-\subsection{$Int\ register_cdrom( struct\ cdrom_device_info\ * cdi)$}
-
-This function is used in about the same way one registers $cdrom_fops$
-with the kernel, the device operations and information structures,
-as described in section~\ref{cdrom.c}, should be registered with the
-\UCD:
-$$
-register_cdrom(\&<device>_info));
-$$
-This function returns zero upon success, and non-zero upon
-failure. The structure $<device>_info$ should have a pointer to the
-driver's $<device>_dops$, as in 
-$$
-\vbox{\halign{&$#$\hfil\cr
-struct\ &cdrom_device_info\ <device>_info = \{\cr
-& <device>_dops;\cr
-&\ldots\cr
-\}\cr
-}}$$
-Note that a driver must have one static structure, $<device>_dops$, while
-it may have as many structures $<device>_info$ as there are minor devices
-active. $Register_cdrom()$ builds a linked list from these. 
-
-\subsection{$Void\ unregister_cdrom(struct\ cdrom_device_info * cdi)$}
-
-Unregistering device $cdi$ with minor number $MINOR(cdi\to dev)$ removes
-the minor device from the list. If it was the last registered minor for
-the low-level driver, this disconnects the registered device-operation
-routines from the \cdrom\ interface. This function returns zero upon
-success, and non-zero upon failure.
-
-\subsection{$Int\ cdrom_open(struct\ inode * ip, struct\ file * fp)$}
-
-This function is not called directly by the low-level drivers, it is
-listed in the standard $cdrom_fops$. If the VFS opens a file, this
-function becomes active. A strategy is implemented in this routine,
-taking care of all capabilities and options that are set in the
-$cdrom_device_ops$ connected to the device. Then, the program flow is
-transferred to the device_dependent $open()$ call.
-
-\subsection{$Void\ cdrom_release(struct\ inode *ip, struct\ file
-*fp)$}
-
-This function implements the reverse-logic of $cdrom_open()$, and then
-calls the device-dependent $release()$ routine. When the use-count has
-reached 0, the allocated buffers are flushed by calls to $sync_dev(dev)$
-and $invalidate_buffers(dev)$.
-
-
-\subsection{$Int\ cdrom_ioctl(struct\ inode *ip, struct\ file *fp,
-unsigned\ int\ cmd, unsigned\ long\ arg)$}
-\label{cdrom-ioctl}
-
-This function handles all the standard $ioctl$ requests for \cdrom\
-devices in a uniform way. The different calls fall into three
-categories: $ioctl$s that can be directly implemented by device
-operations, ones that are routed through the call $audio_ioctl()$, and
-the remaining ones, that are presumable device-dependent. Generally, a
-negative return value indicates an error.
-
-\subsubsection{Directly implemented $ioctl$s}
-\label{ioctl-direct}
-
-The following `old' \cdrom-$ioctl$s are implemented by directly
-calling device-operations in $cdrom_device_ops$, if implemented and
-not masked:
-\begin{description}
-\item[CDROMMULTISESSION] Requests the last session on a \cdrom.
-\item[CDROMEJECT] Open tray. 
-\item[CDROMCLOSETRAY] Close tray.
-\item[CDROMEJECT_SW] If $arg\not=0$, set behavior to auto-close (close
-tray on first open) and auto-eject (eject on last release), otherwise
-set behavior to non-moving on $open()$ and $release()$ calls.
-\item[CDROM_GET_MCN] Get the Media Catalog Number from a CD.
-\end{description}
-
-\subsubsection{$Ioctl$s routed through $audio_ioctl()$}
-\label{ioctl-audio}
-
-The following set of $ioctl$s are all implemented through a call to
-the $cdrom_fops$ function $audio_ioctl()$. Memory checks and
-allocation are performed in $cdrom_ioctl()$, and also sanitization of
-address format ($CDROM_LBA$/$CDROM_MSF$) is done.
-\begin{description}
-\item[CDROMSUBCHNL] Get sub-channel data in argument $arg$ of type $struct\
-cdrom_subchnl *{}$.
-\item[CDROMREADTOCHDR] Read Table of Contents header, in $arg$ of type
-$struct\ cdrom_tochdr *{}$. 
-\item[CDROMREADTOCENTRY] Read a Table of Contents entry in $arg$ and
-specified by $arg$ of type $struct\ cdrom_tocentry *{}$.
-\item[CDROMPLAYMSF] Play audio fragment specified in Minute, Second,
-Frame format, delimited by $arg$ of type $struct\ cdrom_msf *{}$.
-\item[CDROMPLAYTRKIND] Play audio fragment in track-index format
-delimited by $arg$ of type $struct\ \penalty-1000 cdrom_ti *{}$.
-\item[CDROMVOLCTRL] Set volume specified by $arg$ of type $struct\
-cdrom_volctrl *{}$.
-\item[CDROMVOLREAD] Read volume into by $arg$ of type $struct\
-cdrom_volctrl *{}$.
-\item[CDROMSTART] Spin up disc.
-\item[CDROMSTOP] Stop playback of audio fragment.
-\item[CDROMPAUSE] Pause playback of audio fragment.
-\item[CDROMRESUME] Resume playing.
-\end{description}
-
-\subsubsection{New $ioctl$s in \cdromc}
-
-The following $ioctl$s have been introduced to allow user programs to
-control the behavior of individual \cdrom\ devices. New $ioctl$
-commands can be identified by the underscores in their names.
-\begin{description}
-\item[CDROM_SET_OPTIONS] Set options specified by $arg$. Returns the
-option flag register after modification. Use  $arg = \rm0$ for reading
-the current flags.
-\item[CDROM_CLEAR_OPTIONS] Clear options specified by $arg$. Returns
-  the option flag register after modification.
-\item[CDROM_SELECT_SPEED] Select head-rate speed of disc specified as
-  by $arg$ in units of standard cdrom speed (176\,kB/sec raw data or
-  150\,kB/sec file system data). The value 0 means `auto-select', \ie,
-  play audio discs at real time and data discs at maximum speed. The value
-  $arg$ is checked against the maximum head rate of the drive found in the
-  $cdrom_dops$.
-\item[CDROM_SELECT_DISC] Select disc numbered $arg$ from a juke-box.
-  First disc is numbered 0. The number $arg$ is checked against the
-  maximum number of discs in the juke-box found in the $cdrom_dops$.
-\item[CDROM_MEDIA_CHANGED] Returns 1 if a disc has been changed since
-  the last call. Note that calls to $cdrom_media_changed$ by the VFS
-  are treated by an independent queue, so both mechanisms will detect
-  a media change once. For juke-boxes, an extra argument $arg$
-  specifies the slot for which the information is given. The special
-  value $CDSL_CURRENT$ requests that information about the currently
-  selected slot be returned.
-\item[CDROM_DRIVE_STATUS] Returns the status of the drive by a call to
-  $drive_status()$. Return values are defined in section~\ref{drive
-   status}. Note that this call doesn't return information on the
-  current playing activity of the drive; this can be polled through an
-  $ioctl$ call to $CDROMSUBCHNL$. For juke-boxes, an extra argument
-  $arg$ specifies the slot for which (possibly limited) information is
-  given. The special value $CDSL_CURRENT$ requests that information
-  about the currently selected slot be returned.
-\item[CDROM_DISC_STATUS] Returns the type of the disc currently in the
-  drive.  It should be viewed as a complement to $CDROM_DRIVE_STATUS$.
-  This $ioctl$ can provide \emph {some} information about the current
-  disc that is inserted in the drive.  This functionality used to be
-  implemented in the low level drivers, but is now carried out
-  entirely in \UCD.
-  
-  The history of development of the CD's use as a carrier medium for
-  various digital information has lead to many different disc types.
-  This $ioctl$ is useful only in the case that CDs have \emph {only
-    one} type of data on them.  While this is often the case, it is
-  also very common for CDs to have some tracks with data, and some
-  tracks with audio.  Because this is an existing interface, rather
-  than fixing this interface by changing the assumptions it was made
-  under, thereby breaking all user applications that use this
-  function, the \UCD\ implements this $ioctl$ as follows: If the CD in
-  question has audio tracks on it, and it has absolutely no CD-I, XA,
-  or data tracks on it, it will be reported as $CDS_AUDIO$.  If it has
-  both audio and data tracks, it will return $CDS_MIXED$.  If there
-  are no audio tracks on the disc, and if the CD in question has any
-  CD-I tracks on it, it will be reported as $CDS_XA_2_2$.  Failing
-  that, if the CD in question has any XA tracks on it, it will be
-  reported as $CDS_XA_2_1$.  Finally, if the CD in question has any
-  data tracks on it, it will be reported as a data CD ($CDS_DATA_1$).
-
-  This $ioctl$ can return:
-  $$
-  \halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr
-    CDS_NO_INFO& no information available\cr
-    CDS_NO_DISC& no disc is inserted, or tray is opened\cr
-    CDS_AUDIO& Audio disc (2352 audio bytes/frame)\cr
-    CDS_DATA_1& data disc, mode 1 (2048 user bytes/frame)\cr
-    CDS_XA_2_1& mixed data (XA), mode 2, form 1 (2048 user bytes)\cr
-    CDS_XA_2_2& mixed data (XA), mode 2, form 1 (2324  user bytes)\cr
-    CDS_MIXED& mixed audio/data disc\cr
-    }
-  $$
-  For some information concerning frame layout of the various disc
-  types, see a recent version of \cdromh.
-
-\item[CDROM_CHANGER_NSLOTS] Returns the number of slots in a
-  juke-box. 
-\item[CDROMRESET] Reset the drive. 
-\item[CDROM_GET_CAPABILITY] Returns the $capability$ flags for the
-  drive. Refer to section \ref{capability} for more information on
-  these flags.
-\item[CDROM_LOCKDOOR] Locks the door of the drive. $arg == \rm0$
-  unlocks the door, any other value locks it.
-\item[CDROM_DEBUG] Turns on debugging info. Only root is allowed
-  to do this. Same semantics as CDROM_LOCKDOOR.
-\end{description}
-
-\subsubsection{Device dependent $ioctl$s}
-
-Finally, all other $ioctl$s are passed to the function $dev_ioctl()$,
-if implemented. No memory allocation or verification is carried out. 
-
-\newsection{How to update your driver}
-
-\begin{enumerate}
-\item Make a backup of your current driver. 
-\item Get hold of the files \cdromc\ and \cdromh, they should be in
-  the directory tree that came with this documentation.
-\item Make sure you include \cdromh.
-\item Change the 3rd argument of $register_blkdev$ from
-$\&<your-drive>_fops$ to $\&cdrom_fops$. 
-\item Just after that line, add the following to register with the \UCD:
-  $$register_cdrom(\&<your-drive>_info);$$
-  Similarly, add a call to $unregister_cdrom()$ at the appropriate place.
-\item Copy an example of the device-operations $struct$ to your
-  source, \eg, from {\tt {cm206.c}} $cm206_dops$, and change all
-  entries to names corresponding to your driver, or names you just
-  happen to like. If your driver doesn't support a certain function,
-  make the entry $NULL$. At the entry $capability$ you should list all
-  capabilities your driver currently supports. If your driver
-  has a capability that is not listed, please send me a message.
-\item Copy the $cdrom_device_info$ declaration from the same example
-  driver, and modify the entries according to your needs. If your
-  driver dynamically determines the capabilities of the hardware, this
-  structure should also be declared dynamically. 
-\item Implement all functions in your $<device>_dops$ structure,
-  according to prototypes listed in \cdromh, and specifications given
-  in section~\ref{cdrom.c}. Most likely you have already implemented
-  the code in a large part, and you will almost certainly need to adapt the
-  prototype and return values.
-\item Rename your $<device>_ioctl()$ function to $audio_ioctl$ and
-  change the prototype a little. Remove entries listed in the first
-  part in section~\ref{cdrom-ioctl}, if your code was OK, these are
-  just calls to the routines you adapted in the previous step.
-\item You may remove all remaining memory checking code in the
-  $audio_ioctl()$ function that deals with audio commands (these are
-  listed in the second part of section~\ref{cdrom-ioctl}). There is no
-  need for memory allocation either, so most $case$s in the $switch$
-  statement look similar to:
-  $$
-  case\ CDROMREADTOCENTRY\colon get_toc_entry\bigl((struct\ 
-  cdrom_tocentry *{})\ arg\bigr);
-  $$
-\item All remaining $ioctl$ cases must be moved to a separate
-  function, $<device>_ioctl$, the device-dependent $ioctl$s. Note that
-  memory checking and allocation must be kept in this code!
-\item Change the prototypes of $<device>_open()$ and
-  $<device>_release()$, and remove any strategic code (\ie, tray
-  movement, door locking, etc.).
-\item Try to recompile the drivers. We advise you to use modules, both
-  for {\tt {cdrom.o}} and your driver, as debugging is much easier this
-  way.
-\end{enumerate} 
-
-\newsection{Thanks}
-
-Thanks to all the people involved.  First, Erik Andersen, who has
-taken over the torch in maintaining \cdromc\ and integrating much
-\cdrom-related code in the 2.1-kernel.  Thanks to Scott Snyder and
-Gerd Knorr, who were the first to implement this interface for SCSI
-and IDE-CD drivers and added many ideas for extension of the data
-structures relative to kernel~2.0.  Further thanks to Heiko Ei{\ss}feldt,
-Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew
-Kroll, the \linux\ \cdrom\ device driver developers who were kind
-enough to give suggestions and criticisms during the writing. Finally
-of course, I want to thank Linus Torvalds for making this possible in
-the first place.
-
-\vfill
-$ \version\ $
-\eject
-\end{document}
diff --git a/Documentation/cdrom/ide-cd b/Documentation/cdrom/ide-cd
deleted file mode 100644
index a5f2a7f1ff46..000000000000
--- a/Documentation/cdrom/ide-cd
+++ /dev/null
@@ -1,534 +0,0 @@
-IDE-CD driver documentation
-Originally by scott snyder  <snyder@fnald0.fnal.gov> (19 May 1996)
-Carrying on the torch is: Erik Andersen <andersee@debian.org>
-New maintainers (19 Oct 1998): Jens Axboe <axboe@image.dk>
-
-1. Introduction
----------------
-
-The ide-cd driver should work with all ATAPI ver 1.2 to ATAPI 2.6 compliant 
-CDROM drives which attach to an IDE interface.  Note that some CDROM vendors
-(including Mitsumi, Sony, Creative, Aztech, and Goldstar) have made
-both ATAPI-compliant drives and drives which use a proprietary
-interface.  If your drive uses one of those proprietary interfaces,
-this driver will not work with it (but one of the other CDROM drivers
-probably will).  This driver will not work with `ATAPI' drives which
-attach to the parallel port.  In addition, there is at least one drive
-(CyCDROM CR520ie) which attaches to the IDE port but is not ATAPI;
-this driver will not work with drives like that either (but see the
-aztcd driver).
-
-This driver provides the following features:
-
- - Reading from data tracks, and mounting ISO 9660 filesystems.
-
- - Playing audio tracks.  Most of the CDROM player programs floating
-   around should work; I usually use Workman.
-
- - Multisession support.
-
- - On drives which support it, reading digital audio data directly
-   from audio tracks.  The program cdda2wav can be used for this.
-   Note, however, that only some drives actually support this.
-
- - There is now support for CDROM changers which comply with the 
-   ATAPI 2.6 draft standard (such as the NEC CDR-251).  This additional
-   functionality includes a function call to query which slot is the
-   currently selected slot, a function call to query which slots contain
-   CDs, etc. A sample program which demonstrates this functionality is
-   appended to the end of this file.  The Sanyo 3-disc changer
-   (which does not conform to the standard) is also now supported.
-   Please note the driver refers to the first CD as slot # 0.
-
-
-2. Installation
----------------
-
-0. The ide-cd relies on the ide disk driver.  See
-   Documentation/ide/ide.txt for up-to-date information on the ide
-   driver.
-
-1. Make sure that the ide and ide-cd drivers are compiled into the
-   kernel you're using.  When configuring the kernel, in the section 
-   entitled "Floppy, IDE, and other block devices", say either `Y' 
-   (which will compile the support directly into the kernel) or `M'
-   (to compile support as a module which can be loaded and unloaded)
-   to the options: 
-
-      ATA/ATAPI/MFM/RLL support
-      Include IDE/ATAPI CDROM support
-
-   Depending on what type of IDE interface you have, you may need to
-   specify additional configuration options.  See
-   Documentation/ide/ide.txt.
-
-2. You should also ensure that the iso9660 filesystem is either
-   compiled into the kernel or available as a loadable module.  You
-   can see if a filesystem is known to the kernel by catting
-   /proc/filesystems.
-
-3. The CDROM drive should be connected to the host on an IDE
-   interface.  Each interface on a system is defined by an I/O port
-   address and an IRQ number, the standard assignments being
-   0x1f0 and 14 for the primary interface and 0x170 and 15 for the
-   secondary interface.  Each interface can control up to two devices,
-   where each device can be a hard drive, a CDROM drive, a floppy drive, 
-   or a tape drive.  The two devices on an interface are called `master'
-   and `slave'; this is usually selectable via a jumper on the drive.
-
-   Linux names these devices as follows.  The master and slave devices
-   on the primary IDE interface are called `hda' and `hdb',
-   respectively.  The drives on the secondary interface are called
-   `hdc' and `hdd'.  (Interfaces at other locations get other letters
-   in the third position; see Documentation/ide/ide.txt.)
-
-   If you want your CDROM drive to be found automatically by the
-   driver, you should make sure your IDE interface uses either the
-   primary or secondary addresses mentioned above.  In addition, if
-   the CDROM drive is the only device on the IDE interface, it should
-   be jumpered as `master'.  (If for some reason you cannot configure
-   your system in this manner, you can probably still use the driver.
-   You may have to pass extra configuration information to the kernel
-   when you boot, however.  See Documentation/ide/ide.txt for more
-   information.)
-
-4. Boot the system.  If the drive is recognized, you should see a
-   message which looks like
-
-     hdb: NEC CD-ROM DRIVE:260, ATAPI CDROM drive
-
-   If you do not see this, see section 5 below.
-
-5. You may want to create a symbolic link /dev/cdrom pointing to the
-   actual device.  You can do this with the command
-
-     ln -s  /dev/hdX  /dev/cdrom
-
-   where X should be replaced by the letter indicating where your
-   drive is installed.
-
-6. You should be able to see any error messages from the driver with
-   the `dmesg' command.
-
-
-3. Basic usage
---------------
-
-An ISO 9660 CDROM can be mounted by putting the disc in the drive and 
-typing (as root)
-
-  mount -t iso9660 /dev/cdrom /mnt/cdrom
-
-where it is assumed that /dev/cdrom is a link pointing to the actual
-device (as described in step 5 of the last section) and /mnt/cdrom is
-an empty directory.  You should now be able to see the contents of the
-CDROM under the /mnt/cdrom directory.  If you want to eject the CDROM,
-you must first dismount it with a command like
-
-  umount /mnt/cdrom
-
-Note that audio CDs cannot be mounted.
-
-Some distributions set up /etc/fstab to always try to mount a CDROM
-filesystem on bootup.  It is not required to mount the CDROM in this
-manner, though, and it may be a nuisance if you change CDROMs often.
-You should feel free to remove the cdrom line from /etc/fstab and
-mount CDROMs manually if that suits you better.
-
-Multisession and photocd discs should work with no special handling.
-The hpcdtoppm package (ftp.gwdg.de:/pub/linux/hpcdtoppm/) may be
-useful for reading photocds.
-
-To play an audio CD, you should first unmount and remove any data
-CDROM.  Any of the CDROM player programs should then work (workman,
-workbone, cdplayer, etc.).
-
-On a few drives, you can read digital audio directly using a program
-such as cdda2wav.  The only types of drive which I've heard support
-this are Sony and Toshiba drives.  You will get errors if you try to
-use this function on a drive which does not support it.
-
-For supported changers, you can use the `cdchange' program (appended to
-the end of this file) to switch between changer slots.  Note that the
-drive should be unmounted before attempting this.  The program takes
-two arguments:  the CDROM device, and the slot number to which you wish
-to change.  If the slot number is -1, the drive is unloaded.
-
-
-4. Common problems
-------------------
-
-This section discusses some common problems encountered when trying to
-use the driver, and some possible solutions.  Note that if you are
-experiencing problems, you should probably also review
-Documentation/ide/ide.txt for current information about the underlying
-IDE support code.  Some of these items apply only to earlier versions
-of the driver, but are mentioned here for completeness.
-
-In most cases, you should probably check with `dmesg' for any errors
-from the driver.
-
-a. Drive is not detected during booting.
-
-   - Review the configuration instructions above and in
-     Documentation/ide/ide.txt, and check how your hardware is
-     configured.
-
-   - If your drive is the only device on an IDE interface, it should
-     be jumpered as master, if at all possible.
-
-   - If your IDE interface is not at the standard addresses of 0x170
-     or 0x1f0, you'll need to explicitly inform the driver using a
-     lilo option.  See Documentation/ide/ide.txt.  (This feature was
-     added around kernel version 1.3.30.)
-
-   - If the autoprobing is not finding your drive, you can tell the
-     driver to assume that one exists by using a lilo option of the
-     form `hdX=cdrom', where X is the drive letter corresponding to
-     where your drive is installed.  Note that if you do this and you 
-     see a boot message like
-
-       hdX: ATAPI cdrom (?)
-
-     this does _not_ mean that the driver has successfully detected
-     the drive; rather, it means that the driver has not detected a
-     drive, but is assuming there's one there anyway because you told
-     it so.  If you actually try to do I/O to a drive defined at a
-     nonexistent or nonresponding I/O address, you'll probably get
-     errors with a status value of 0xff.
-
-   - Some IDE adapters require a nonstandard initialization sequence
-     before they'll function properly.  (If this is the case, there
-     will often be a separate MS-DOS driver just for the controller.)
-     IDE interfaces on sound cards often fall into this category.
-
-     Support for some interfaces needing extra initialization is
-     provided in later 1.3.x kernels.  You may need to turn on
-     additional kernel configuration options to get them to work;
-     see Documentation/ide/ide.txt.
-
-     Even if support is not available for your interface, you may be
-     able to get it to work with the following procedure.  First boot
-     MS-DOS and load the appropriate drivers.  Then warm-boot linux
-     (i.e., without powering off).  If this works, it can be automated
-     by running loadlin from the MS-DOS autoexec.
-
-
-b. Timeout/IRQ errors.
-
-  - If you always get timeout errors, interrupts from the drive are
-    probably not making it to the host.
-
-  - IRQ problems may also be indicated by the message
-    `IRQ probe failed (<n>)' while booting.  If <n> is zero, that
-    means that the system did not see an interrupt from the drive when
-    it was expecting one (on any feasible IRQ).  If <n> is negative,
-    that means the system saw interrupts on multiple IRQ lines, when
-    it was expecting to receive just one from the CDROM drive.
-
-  - Double-check your hardware configuration to make sure that the IRQ
-    number of your IDE interface matches what the driver expects.
-    (The usual assignments are 14 for the primary (0x1f0) interface
-    and 15 for the secondary (0x170) interface.)  Also be sure that
-    you don't have some other hardware which might be conflicting with
-    the IRQ you're using.  Also check the BIOS setup for your system;
-    some have the ability to disable individual IRQ levels, and I've
-    had one report of a system which was shipped with IRQ 15 disabled
-    by default.
-
-  - Note that many MS-DOS CDROM drivers will still function even if
-    there are hardware problems with the interrupt setup; they
-    apparently don't use interrupts.
-
-  - If you own a Pioneer DR-A24X, you _will_ get nasty error messages 
-    on boot such as "irq timeout: status=0x50 { DriveReady SeekComplete }"
-    The Pioneer DR-A24X CDROM drives are fairly popular these days.
-    Unfortunately, these drives seem to become very confused when we perform
-    the standard Linux ATA disk drive probe. If you own one of these drives,
-    you can bypass the ATA probing which confuses these CDROM drives, by 
-    adding `append="hdX=noprobe hdX=cdrom"' to your lilo.conf file and running 
-    lilo (again where X is the drive letter corresponding to where your drive 
-    is installed.)
-    
-c. System hangups.
-
-  - If the system locks up when you try to access the CDROM, the most
-    likely cause is that you have a buggy IDE adapter which doesn't
-    properly handle simultaneous transactions on multiple interfaces.
-    The most notorious of these is the CMD640B chip.  This problem can
-    be worked around by specifying the `serialize' option when
-    booting.  Recent kernels should be able to detect the need for
-    this automatically in most cases, but the detection is not
-    foolproof.  See Documentation/ide/ide.txt for more information
-    about the `serialize' option and the CMD640B.
-
-  - Note that many MS-DOS CDROM drivers will work with such buggy
-    hardware, apparently because they never attempt to overlap CDROM
-    operations with other disk activity.
-
-
-d. Can't mount a CDROM.
-
-  - If you get errors from mount, it may help to check `dmesg' to see
-    if there are any more specific errors from the driver or from the
-    filesystem.
-
-  - Make sure there's a CDROM loaded in the drive, and that's it's an
-    ISO 9660 disc.  You can't mount an audio CD.
-
-  - With the CDROM in the drive and unmounted, try something like
-
-      cat /dev/cdrom | od | more
-
-    If you see a dump, then the drive and driver are probably working
-    OK, and the problem is at the filesystem level (i.e., the CDROM is
-    not ISO 9660 or has errors in the filesystem structure).
-
-  - If you see `not a block device' errors, check that the definitions
-    of the device special files are correct.  They should be as
-    follows:
-
-      brw-rw----   1 root     disk       3,   0 Nov 11 18:48 /dev/hda
-      brw-rw----   1 root     disk       3,  64 Nov 11 18:48 /dev/hdb
-      brw-rw----   1 root     disk      22,   0 Nov 11 18:48 /dev/hdc
-      brw-rw----   1 root     disk      22,  64 Nov 11 18:48 /dev/hdd
-
-    Some early Slackware releases had these defined incorrectly.  If
-    these are wrong, you can remake them by running the script
-    scripts/MAKEDEV.ide.  (You may have to make it executable
-    with chmod first.)
-
-    If you have a /dev/cdrom symbolic link, check that it is pointing
-    to the correct device file.
-
-    If you hear people talking of the devices `hd1a' and `hd1b', these
-    were old names for what are now called hdc and hdd.  Those names
-    should be considered obsolete.
-
-  - If mount is complaining that the iso9660 filesystem is not
-    available, but you know it is (check /proc/filesystems), you
-    probably need a newer version of mount.  Early versions would not
-    always give meaningful error messages.
-
-
-e. Directory listings are unpredictably truncated, and `dmesg' shows
-   `buffer botch' error messages from the driver.
-
-  - There was a bug in the version of the driver in 1.2.x kernels
-    which could cause this.  It was fixed in 1.3.0.  If you can't
-    upgrade, you can probably work around the problem by specifying a
-    blocksize of 2048 when mounting.  (Note that you won't be able to
-    directly execute binaries off the CDROM in that case.)
-
-    If you see this in kernels later than 1.3.0, please report it as a
-    bug.
-
-
-f. Data corruption.
-
-  - Random data corruption was occasionally observed with the Hitachi
-    CDR-7730 CDROM. If you experience data corruption, using "hdx=slow"
-    as a command line parameter may work around the problem, at the
-    expense of low system performance.
-
-
-5. cdchange.c
--------------
-
-/*
- * cdchange.c  [-v]  <device>  [<slot>]
- *
- * This loads a CDROM from a specified slot in a changer, and displays 
- * information about the changer status.  The drive should be unmounted before 
- * using this program.
- *
- * Changer information is displayed if either the -v flag is specified
- * or no slot was specified.
- *
- * Based on code originally from Gerhard Zuber <zuber@berlin.snafu.de>.
- * Changer status information, and rewrite for the new Uniform CDROM driver
- * interface by Erik Andersen <andersee@debian.org>.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/ioctl.h>
-#include <linux/cdrom.h>
-
-
-int
-main (int argc, char **argv)
-{
-	char *program;
-	char *device;
-	int fd;           /* file descriptor for CD-ROM device */
-	int status;       /* return status for system calls */
-	int verbose = 0;
-	int slot=-1, x_slot;
-	int total_slots_available;
-
-	program = argv[0];
-
-	++argv;
-	--argc;
-
-	if (argc < 1 || argc > 3) {
-		fprintf (stderr, "usage: %s [-v] <device> [<slot>]\n",
-			 program);
-		fprintf (stderr, "       Slots are numbered 1 -- n.\n");
-		exit (1);
-	}
- 
-       if (strcmp (argv[0], "-v") == 0) {
-                verbose = 1;
-                ++argv;
-                --argc;
-        }
- 
-	device = argv[0];
- 
-	if (argc == 2)
-		slot = atoi (argv[1]) - 1;
-
-	/* open device */ 
-	fd = open(device, O_RDONLY | O_NONBLOCK);
-	if (fd < 0) {
-		fprintf (stderr, "%s: open failed for `%s': %s\n",
-			 program, device, strerror (errno));
-		exit (1);
-	}
-
-	/* Check CD player status */ 
-	total_slots_available = ioctl (fd, CDROM_CHANGER_NSLOTS);
-	if (total_slots_available <= 1 ) {
-		fprintf (stderr, "%s: Device `%s' is not an ATAPI "
-			"compliant CD changer.\n", program, device);
-		exit (1);
-	}
-
-	if (slot >= 0) {
-		if (slot >= total_slots_available) {
-			fprintf (stderr, "Bad slot number.  "
-				 "Should be 1 -- %d.\n",
-				 total_slots_available);
-			exit (1);
-		}
-
-		/* load */ 
-		slot=ioctl (fd, CDROM_SELECT_DISC, slot);
-		if (slot<0) {
-			fflush(stdout);
-				perror ("CDROM_SELECT_DISC ");
-			exit(1);
-		}
-	}
-
-	if (slot < 0 || verbose) {
-
-		status=ioctl (fd, CDROM_SELECT_DISC, CDSL_CURRENT);
-		if (status<0) {
-			fflush(stdout);
-			perror (" CDROM_SELECT_DISC");
-			exit(1);
-		}
-		slot=status;
-
-		printf ("Current slot: %d\n", slot+1);
-		printf ("Total slots available: %d\n",
-			total_slots_available);
-
-		printf ("Drive status: ");
-                status = ioctl (fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
-                if (status<0) {
-                  perror(" CDROM_DRIVE_STATUS");
-                } else switch(status) {
-		case CDS_DISC_OK:
-			printf ("Ready.\n");
-			break;
-		case CDS_TRAY_OPEN:
-			printf ("Tray Open.\n");
-			break;
-		case CDS_DRIVE_NOT_READY:
-			printf ("Drive Not Ready.\n");
-			break;
-		default:
-			printf ("This Should not happen!\n");
-			break;
-		}
-
-		for (x_slot=0; x_slot<total_slots_available; x_slot++) {
-			printf ("Slot %2d: ", x_slot+1);
-             		status = ioctl (fd, CDROM_DRIVE_STATUS, x_slot);
-             		if (status<0) {
-             		     perror(" CDROM_DRIVE_STATUS");
-             		} else switch(status) {
-			case CDS_DISC_OK:
-				printf ("Disc present.");
-				break;
-			case CDS_NO_DISC: 
-				printf ("Empty slot.");
-				break;
-			case CDS_TRAY_OPEN:
-				printf ("CD-ROM tray open.\n");
-				break;
-			case CDS_DRIVE_NOT_READY:
-				printf ("CD-ROM drive not ready.\n");
-				break;
-			case CDS_NO_INFO:
-				printf ("No Information available.");
-				break;
-			default:
-				printf ("This Should not happen!\n");
-				break;
-			}
-		  if (slot == x_slot) {
-                  status = ioctl (fd, CDROM_DISC_STATUS);
-                  if (status<0) {
-			perror(" CDROM_DISC_STATUS");
-                  }
-		  switch (status) {
-			case CDS_AUDIO:
-				printf ("\tAudio disc.\t");
-				break;
-			case CDS_DATA_1:
-			case CDS_DATA_2:
-				printf ("\tData disc type %d.\t", status-CDS_DATA_1+1);
-				break;
-			case CDS_XA_2_1:
-			case CDS_XA_2_2:
-				printf ("\tXA data disc type %d.\t", status-CDS_XA_2_1+1);
-				break;
-			default:
-				printf ("\tUnknown disc type 0x%x!\t", status);
-				break;
-			}
-			}
-                  	status = ioctl (fd, CDROM_MEDIA_CHANGED, x_slot);
-                  	if (status<0) {
-				perror(" CDROM_MEDIA_CHANGED");
-                  	}
-		  	switch (status) {
-			case 1:
-				printf ("Changed.\n");
-				break;
-			default:
-				printf ("\n");
-				break;
-			}
-		}
-	}
-
-	/* close device */
-	status = close (fd);
-	if (status != 0) {
-		fprintf (stderr, "%s: close failed for `%s': %s\n",
-			 program, device, strerror (errno));
-		exit (1);
-	}
- 
-	exit (0);
-}
diff --git a/Documentation/cdrom/ide-cd.rst b/Documentation/cdrom/ide-cd.rst
new file mode 100644
index 000000000000..bdccb74fc92d
--- /dev/null
+++ b/Documentation/cdrom/ide-cd.rst
@@ -0,0 +1,538 @@
+IDE-CD driver documentation
+===========================
+
+:Originally by: scott snyder  <snyder@fnald0.fnal.gov> (19 May 1996)
+:Carrying on the torch is: Erik Andersen <andersee@debian.org>
+:New maintainers (19 Oct 1998): Jens Axboe <axboe@image.dk>
+
+1. Introduction
+---------------
+
+The ide-cd driver should work with all ATAPI ver 1.2 to ATAPI 2.6 compliant
+CDROM drives which attach to an IDE interface.  Note that some CDROM vendors
+(including Mitsumi, Sony, Creative, Aztech, and Goldstar) have made
+both ATAPI-compliant drives and drives which use a proprietary
+interface.  If your drive uses one of those proprietary interfaces,
+this driver will not work with it (but one of the other CDROM drivers
+probably will).  This driver will not work with `ATAPI` drives which
+attach to the parallel port.  In addition, there is at least one drive
+(CyCDROM CR520ie) which attaches to the IDE port but is not ATAPI;
+this driver will not work with drives like that either (but see the
+aztcd driver).
+
+This driver provides the following features:
+
+ - Reading from data tracks, and mounting ISO 9660 filesystems.
+
+ - Playing audio tracks.  Most of the CDROM player programs floating
+   around should work; I usually use Workman.
+
+ - Multisession support.
+
+ - On drives which support it, reading digital audio data directly
+   from audio tracks.  The program cdda2wav can be used for this.
+   Note, however, that only some drives actually support this.
+
+ - There is now support for CDROM changers which comply with the
+   ATAPI 2.6 draft standard (such as the NEC CDR-251).  This additional
+   functionality includes a function call to query which slot is the
+   currently selected slot, a function call to query which slots contain
+   CDs, etc. A sample program which demonstrates this functionality is
+   appended to the end of this file.  The Sanyo 3-disc changer
+   (which does not conform to the standard) is also now supported.
+   Please note the driver refers to the first CD as slot # 0.
+
+
+2. Installation
+---------------
+
+0. The ide-cd relies on the ide disk driver.  See
+   Documentation/ide/ide.rst for up-to-date information on the ide
+   driver.
+
+1. Make sure that the ide and ide-cd drivers are compiled into the
+   kernel you're using.  When configuring the kernel, in the section
+   entitled "Floppy, IDE, and other block devices", say either `Y`
+   (which will compile the support directly into the kernel) or `M`
+   (to compile support as a module which can be loaded and unloaded)
+   to the options::
+
+      ATA/ATAPI/MFM/RLL support
+      Include IDE/ATAPI CDROM support
+
+   Depending on what type of IDE interface you have, you may need to
+   specify additional configuration options.  See
+   Documentation/ide/ide.rst.
+
+2. You should also ensure that the iso9660 filesystem is either
+   compiled into the kernel or available as a loadable module.  You
+   can see if a filesystem is known to the kernel by catting
+   /proc/filesystems.
+
+3. The CDROM drive should be connected to the host on an IDE
+   interface.  Each interface on a system is defined by an I/O port
+   address and an IRQ number, the standard assignments being
+   0x1f0 and 14 for the primary interface and 0x170 and 15 for the
+   secondary interface.  Each interface can control up to two devices,
+   where each device can be a hard drive, a CDROM drive, a floppy drive,
+   or a tape drive.  The two devices on an interface are called `master`
+   and `slave`; this is usually selectable via a jumper on the drive.
+
+   Linux names these devices as follows.  The master and slave devices
+   on the primary IDE interface are called `hda` and `hdb`,
+   respectively.  The drives on the secondary interface are called
+   `hdc` and `hdd`.  (Interfaces at other locations get other letters
+   in the third position; see Documentation/ide/ide.rst.)
+
+   If you want your CDROM drive to be found automatically by the
+   driver, you should make sure your IDE interface uses either the
+   primary or secondary addresses mentioned above.  In addition, if
+   the CDROM drive is the only device on the IDE interface, it should
+   be jumpered as `master`.  (If for some reason you cannot configure
+   your system in this manner, you can probably still use the driver.
+   You may have to pass extra configuration information to the kernel
+   when you boot, however.  See Documentation/ide/ide.rst for more
+   information.)
+
+4. Boot the system.  If the drive is recognized, you should see a
+   message which looks like::
+
+     hdb: NEC CD-ROM DRIVE:260, ATAPI CDROM drive
+
+   If you do not see this, see section 5 below.
+
+5. You may want to create a symbolic link /dev/cdrom pointing to the
+   actual device.  You can do this with the command::
+
+     ln -s  /dev/hdX  /dev/cdrom
+
+   where X should be replaced by the letter indicating where your
+   drive is installed.
+
+6. You should be able to see any error messages from the driver with
+   the `dmesg` command.
+
+
+3. Basic usage
+--------------
+
+An ISO 9660 CDROM can be mounted by putting the disc in the drive and
+typing (as root)::
+
+  mount -t iso9660 /dev/cdrom /mnt/cdrom
+
+where it is assumed that /dev/cdrom is a link pointing to the actual
+device (as described in step 5 of the last section) and /mnt/cdrom is
+an empty directory.  You should now be able to see the contents of the
+CDROM under the /mnt/cdrom directory.  If you want to eject the CDROM,
+you must first dismount it with a command like::
+
+  umount /mnt/cdrom
+
+Note that audio CDs cannot be mounted.
+
+Some distributions set up /etc/fstab to always try to mount a CDROM
+filesystem on bootup.  It is not required to mount the CDROM in this
+manner, though, and it may be a nuisance if you change CDROMs often.
+You should feel free to remove the cdrom line from /etc/fstab and
+mount CDROMs manually if that suits you better.
+
+Multisession and photocd discs should work with no special handling.
+The hpcdtoppm package (ftp.gwdg.de:/pub/linux/hpcdtoppm/) may be
+useful for reading photocds.
+
+To play an audio CD, you should first unmount and remove any data
+CDROM.  Any of the CDROM player programs should then work (workman,
+workbone, cdplayer, etc.).
+
+On a few drives, you can read digital audio directly using a program
+such as cdda2wav.  The only types of drive which I've heard support
+this are Sony and Toshiba drives.  You will get errors if you try to
+use this function on a drive which does not support it.
+
+For supported changers, you can use the `cdchange` program (appended to
+the end of this file) to switch between changer slots.  Note that the
+drive should be unmounted before attempting this.  The program takes
+two arguments:  the CDROM device, and the slot number to which you wish
+to change.  If the slot number is -1, the drive is unloaded.
+
+
+4. Common problems
+------------------
+
+This section discusses some common problems encountered when trying to
+use the driver, and some possible solutions.  Note that if you are
+experiencing problems, you should probably also review
+Documentation/ide/ide.rst for current information about the underlying
+IDE support code.  Some of these items apply only to earlier versions
+of the driver, but are mentioned here for completeness.
+
+In most cases, you should probably check with `dmesg` for any errors
+from the driver.
+
+a. Drive is not detected during booting.
+
+   - Review the configuration instructions above and in
+     Documentation/ide/ide.rst, and check how your hardware is
+     configured.
+
+   - If your drive is the only device on an IDE interface, it should
+     be jumpered as master, if at all possible.
+
+   - If your IDE interface is not at the standard addresses of 0x170
+     or 0x1f0, you'll need to explicitly inform the driver using a
+     lilo option.  See Documentation/ide/ide.rst.  (This feature was
+     added around kernel version 1.3.30.)
+
+   - If the autoprobing is not finding your drive, you can tell the
+     driver to assume that one exists by using a lilo option of the
+     form `hdX=cdrom`, where X is the drive letter corresponding to
+     where your drive is installed.  Note that if you do this and you
+     see a boot message like::
+
+       hdX: ATAPI cdrom (?)
+
+     this does _not_ mean that the driver has successfully detected
+     the drive; rather, it means that the driver has not detected a
+     drive, but is assuming there's one there anyway because you told
+     it so.  If you actually try to do I/O to a drive defined at a
+     nonexistent or nonresponding I/O address, you'll probably get
+     errors with a status value of 0xff.
+
+   - Some IDE adapters require a nonstandard initialization sequence
+     before they'll function properly.  (If this is the case, there
+     will often be a separate MS-DOS driver just for the controller.)
+     IDE interfaces on sound cards often fall into this category.
+
+     Support for some interfaces needing extra initialization is
+     provided in later 1.3.x kernels.  You may need to turn on
+     additional kernel configuration options to get them to work;
+     see Documentation/ide/ide.rst.
+
+     Even if support is not available for your interface, you may be
+     able to get it to work with the following procedure.  First boot
+     MS-DOS and load the appropriate drivers.  Then warm-boot linux
+     (i.e., without powering off).  If this works, it can be automated
+     by running loadlin from the MS-DOS autoexec.
+
+
+b. Timeout/IRQ errors.
+
+  - If you always get timeout errors, interrupts from the drive are
+    probably not making it to the host.
+
+  - IRQ problems may also be indicated by the message
+    `IRQ probe failed (<n>)` while booting.  If <n> is zero, that
+    means that the system did not see an interrupt from the drive when
+    it was expecting one (on any feasible IRQ).  If <n> is negative,
+    that means the system saw interrupts on multiple IRQ lines, when
+    it was expecting to receive just one from the CDROM drive.
+
+  - Double-check your hardware configuration to make sure that the IRQ
+    number of your IDE interface matches what the driver expects.
+    (The usual assignments are 14 for the primary (0x1f0) interface
+    and 15 for the secondary (0x170) interface.)  Also be sure that
+    you don't have some other hardware which might be conflicting with
+    the IRQ you're using.  Also check the BIOS setup for your system;
+    some have the ability to disable individual IRQ levels, and I've
+    had one report of a system which was shipped with IRQ 15 disabled
+    by default.
+
+  - Note that many MS-DOS CDROM drivers will still function even if
+    there are hardware problems with the interrupt setup; they
+    apparently don't use interrupts.
+
+  - If you own a Pioneer DR-A24X, you _will_ get nasty error messages
+    on boot such as "irq timeout: status=0x50 { DriveReady SeekComplete }"
+    The Pioneer DR-A24X CDROM drives are fairly popular these days.
+    Unfortunately, these drives seem to become very confused when we perform
+    the standard Linux ATA disk drive probe. If you own one of these drives,
+    you can bypass the ATA probing which confuses these CDROM drives, by
+    adding `append="hdX=noprobe hdX=cdrom"` to your lilo.conf file and running
+    lilo (again where X is the drive letter corresponding to where your drive
+    is installed.)
+
+c. System hangups.
+
+  - If the system locks up when you try to access the CDROM, the most
+    likely cause is that you have a buggy IDE adapter which doesn't
+    properly handle simultaneous transactions on multiple interfaces.
+    The most notorious of these is the CMD640B chip.  This problem can
+    be worked around by specifying the `serialize` option when
+    booting.  Recent kernels should be able to detect the need for
+    this automatically in most cases, but the detection is not
+    foolproof.  See Documentation/ide/ide.rst for more information
+    about the `serialize` option and the CMD640B.
+
+  - Note that many MS-DOS CDROM drivers will work with such buggy
+    hardware, apparently because they never attempt to overlap CDROM
+    operations with other disk activity.
+
+
+d. Can't mount a CDROM.
+
+  - If you get errors from mount, it may help to check `dmesg` to see
+    if there are any more specific errors from the driver or from the
+    filesystem.
+
+  - Make sure there's a CDROM loaded in the drive, and that's it's an
+    ISO 9660 disc.  You can't mount an audio CD.
+
+  - With the CDROM in the drive and unmounted, try something like::
+
+      cat /dev/cdrom | od | more
+
+    If you see a dump, then the drive and driver are probably working
+    OK, and the problem is at the filesystem level (i.e., the CDROM is
+    not ISO 9660 or has errors in the filesystem structure).
+
+  - If you see `not a block device` errors, check that the definitions
+    of the device special files are correct.  They should be as
+    follows::
+
+      brw-rw----   1 root     disk       3,   0 Nov 11 18:48 /dev/hda
+      brw-rw----   1 root     disk       3,  64 Nov 11 18:48 /dev/hdb
+      brw-rw----   1 root     disk      22,   0 Nov 11 18:48 /dev/hdc
+      brw-rw----   1 root     disk      22,  64 Nov 11 18:48 /dev/hdd
+
+    Some early Slackware releases had these defined incorrectly.  If
+    these are wrong, you can remake them by running the script
+    scripts/MAKEDEV.ide.  (You may have to make it executable
+    with chmod first.)
+
+    If you have a /dev/cdrom symbolic link, check that it is pointing
+    to the correct device file.
+
+    If you hear people talking of the devices `hd1a` and `hd1b`, these
+    were old names for what are now called hdc and hdd.  Those names
+    should be considered obsolete.
+
+  - If mount is complaining that the iso9660 filesystem is not
+    available, but you know it is (check /proc/filesystems), you
+    probably need a newer version of mount.  Early versions would not
+    always give meaningful error messages.
+
+
+e. Directory listings are unpredictably truncated, and `dmesg` shows
+   `buffer botch` error messages from the driver.
+
+  - There was a bug in the version of the driver in 1.2.x kernels
+    which could cause this.  It was fixed in 1.3.0.  If you can't
+    upgrade, you can probably work around the problem by specifying a
+    blocksize of 2048 when mounting.  (Note that you won't be able to
+    directly execute binaries off the CDROM in that case.)
+
+    If you see this in kernels later than 1.3.0, please report it as a
+    bug.
+
+
+f. Data corruption.
+
+  - Random data corruption was occasionally observed with the Hitachi
+    CDR-7730 CDROM. If you experience data corruption, using "hdx=slow"
+    as a command line parameter may work around the problem, at the
+    expense of low system performance.
+
+
+5. cdchange.c
+-------------
+
+::
+
+  /*
+   * cdchange.c  [-v]  <device>  [<slot>]
+   *
+   * This loads a CDROM from a specified slot in a changer, and displays
+   * information about the changer status.  The drive should be unmounted before
+   * using this program.
+   *
+   * Changer information is displayed if either the -v flag is specified
+   * or no slot was specified.
+   *
+   * Based on code originally from Gerhard Zuber <zuber@berlin.snafu.de>.
+   * Changer status information, and rewrite for the new Uniform CDROM driver
+   * interface by Erik Andersen <andersee@debian.org>.
+   */
+
+  #include <stdio.h>
+  #include <stdlib.h>
+  #include <errno.h>
+  #include <string.h>
+  #include <unistd.h>
+  #include <fcntl.h>
+  #include <sys/ioctl.h>
+  #include <linux/cdrom.h>
+
+
+  int
+  main (int argc, char **argv)
+  {
+	char *program;
+	char *device;
+	int fd;           /* file descriptor for CD-ROM device */
+	int status;       /* return status for system calls */
+	int verbose = 0;
+	int slot=-1, x_slot;
+	int total_slots_available;
+
+	program = argv[0];
+
+	++argv;
+	--argc;
+
+	if (argc < 1 || argc > 3) {
+		fprintf (stderr, "usage: %s [-v] <device> [<slot>]\n",
+			 program);
+		fprintf (stderr, "       Slots are numbered 1 -- n.\n");
+		exit (1);
+	}
+
+       if (strcmp (argv[0], "-v") == 0) {
+                verbose = 1;
+                ++argv;
+                --argc;
+        }
+
+	device = argv[0];
+
+	if (argc == 2)
+		slot = atoi (argv[1]) - 1;
+
+	/* open device */
+	fd = open(device, O_RDONLY | O_NONBLOCK);
+	if (fd < 0) {
+		fprintf (stderr, "%s: open failed for `%s`: %s\n",
+			 program, device, strerror (errno));
+		exit (1);
+	}
+
+	/* Check CD player status */
+	total_slots_available = ioctl (fd, CDROM_CHANGER_NSLOTS);
+	if (total_slots_available <= 1 ) {
+		fprintf (stderr, "%s: Device `%s` is not an ATAPI "
+			"compliant CD changer.\n", program, device);
+		exit (1);
+	}
+
+	if (slot >= 0) {
+		if (slot >= total_slots_available) {
+			fprintf (stderr, "Bad slot number.  "
+				 "Should be 1 -- %d.\n",
+				 total_slots_available);
+			exit (1);
+		}
+
+		/* load */
+		slot=ioctl (fd, CDROM_SELECT_DISC, slot);
+		if (slot<0) {
+			fflush(stdout);
+				perror ("CDROM_SELECT_DISC ");
+			exit(1);
+		}
+	}
+
+	if (slot < 0 || verbose) {
+
+		status=ioctl (fd, CDROM_SELECT_DISC, CDSL_CURRENT);
+		if (status<0) {
+			fflush(stdout);
+			perror (" CDROM_SELECT_DISC");
+			exit(1);
+		}
+		slot=status;
+
+		printf ("Current slot: %d\n", slot+1);
+		printf ("Total slots available: %d\n",
+			total_slots_available);
+
+		printf ("Drive status: ");
+                status = ioctl (fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
+                if (status<0) {
+                  perror(" CDROM_DRIVE_STATUS");
+                } else switch(status) {
+		case CDS_DISC_OK:
+			printf ("Ready.\n");
+			break;
+		case CDS_TRAY_OPEN:
+			printf ("Tray Open.\n");
+			break;
+		case CDS_DRIVE_NOT_READY:
+			printf ("Drive Not Ready.\n");
+			break;
+		default:
+			printf ("This Should not happen!\n");
+			break;
+		}
+
+		for (x_slot=0; x_slot<total_slots_available; x_slot++) {
+			printf ("Slot %2d: ", x_slot+1);
+			status = ioctl (fd, CDROM_DRIVE_STATUS, x_slot);
+			if (status<0) {
+			     perror(" CDROM_DRIVE_STATUS");
+			} else switch(status) {
+			case CDS_DISC_OK:
+				printf ("Disc present.");
+				break;
+			case CDS_NO_DISC:
+				printf ("Empty slot.");
+				break;
+			case CDS_TRAY_OPEN:
+				printf ("CD-ROM tray open.\n");
+				break;
+			case CDS_DRIVE_NOT_READY:
+				printf ("CD-ROM drive not ready.\n");
+				break;
+			case CDS_NO_INFO:
+				printf ("No Information available.");
+				break;
+			default:
+				printf ("This Should not happen!\n");
+				break;
+			}
+		  if (slot == x_slot) {
+                  status = ioctl (fd, CDROM_DISC_STATUS);
+                  if (status<0) {
+			perror(" CDROM_DISC_STATUS");
+                  }
+		  switch (status) {
+			case CDS_AUDIO:
+				printf ("\tAudio disc.\t");
+				break;
+			case CDS_DATA_1:
+			case CDS_DATA_2:
+				printf ("\tData disc type %d.\t", status-CDS_DATA_1+1);
+				break;
+			case CDS_XA_2_1:
+			case CDS_XA_2_2:
+				printf ("\tXA data disc type %d.\t", status-CDS_XA_2_1+1);
+				break;
+			default:
+				printf ("\tUnknown disc type 0x%x!\t", status);
+				break;
+			}
+			}
+			status = ioctl (fd, CDROM_MEDIA_CHANGED, x_slot);
+			if (status<0) {
+				perror(" CDROM_MEDIA_CHANGED");
+			}
+			switch (status) {
+			case 1:
+				printf ("Changed.\n");
+				break;
+			default:
+				printf ("\n");
+				break;
+			}
+		}
+	}
+
+	/* close device */
+	status = close (fd);
+	if (status != 0) {
+		fprintf (stderr, "%s: close failed for `%s`: %s\n",
+			 program, device, strerror (errno));
+		exit (1);
+	}
+
+	exit (0);
+  }
diff --git a/Documentation/cdrom/index.rst b/Documentation/cdrom/index.rst
new file mode 100644
index 000000000000..338ad5f94e7c
--- /dev/null
+++ b/Documentation/cdrom/index.rst
@@ -0,0 +1,19 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====
+cdrom
+=====
+
+.. toctree::
+    :maxdepth: 1
+
+    cdrom-standard
+    ide-cd
+    packet-writing
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/cdrom/packet-writing.rst b/Documentation/cdrom/packet-writing.rst
new file mode 100644
index 000000000000..c5c957195a5a
--- /dev/null
+++ b/Documentation/cdrom/packet-writing.rst
@@ -0,0 +1,139 @@
+==============
+Packet writing
+==============
+
+Getting started quick
+---------------------
+
+- Select packet support in the block device section and UDF support in
+  the file system section.
+
+- Compile and install kernel and modules, reboot.
+
+- You need the udftools package (pktsetup, mkudffs, cdrwtool).
+  Download from http://sourceforge.net/projects/linux-udf/
+
+- Grab a new CD-RW disc and format it (assuming CD-RW is hdc, substitute
+  as appropriate)::
+
+	# cdrwtool -d /dev/hdc -q
+
+- Setup your writer::
+
+	# pktsetup dev_name /dev/hdc
+
+- Now you can mount /dev/pktcdvd/dev_name and copy files to it. Enjoy::
+
+	# mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime
+
+
+Packet writing for DVD-RW media
+-------------------------------
+
+DVD-RW discs can be written to much like CD-RW discs if they are in
+the so called "restricted overwrite" mode. To put a disc in restricted
+overwrite mode, run::
+
+	# dvd+rw-format /dev/hdc
+
+You can then use the disc the same way you would use a CD-RW disc::
+
+	# pktsetup dev_name /dev/hdc
+	# mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime
+
+
+Packet writing for DVD+RW media
+-------------------------------
+
+According to the DVD+RW specification, a drive supporting DVD+RW discs
+shall implement "true random writes with 2KB granularity", which means
+that it should be possible to put any filesystem with a block size >=
+2KB on such a disc. For example, it should be possible to do::
+
+	# dvd+rw-format /dev/hdc   (only needed if the disc has never
+	                            been formatted)
+	# mkudffs /dev/hdc
+	# mount /dev/hdc /cdrom -t udf -o rw,noatime
+
+However, some drives don't follow the specification and expect the
+host to perform aligned writes at 32KB boundaries. Other drives do
+follow the specification, but suffer bad performance problems if the
+writes are not 32KB aligned.
+
+Both problems can be solved by using the pktcdvd driver, which always
+generates aligned writes::
+
+	# dvd+rw-format /dev/hdc
+	# pktsetup dev_name /dev/hdc
+	# mkudffs /dev/pktcdvd/dev_name
+	# mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime
+
+
+Packet writing for DVD-RAM media
+--------------------------------
+
+DVD-RAM discs are random writable, so using the pktcdvd driver is not
+necessary. However, using the pktcdvd driver can improve performance
+in the same way it does for DVD+RW media.
+
+
+Notes
+-----
+
+- CD-RW media can usually not be overwritten more than about 1000
+  times, so to avoid unnecessary wear on the media, you should always
+  use the noatime mount option.
+
+- Defect management (ie automatic remapping of bad sectors) has not
+  been implemented yet, so you are likely to get at least some
+  filesystem corruption if the disc wears out.
+
+- Since the pktcdvd driver makes the disc appear as a regular block
+  device with a 2KB block size, you can put any filesystem you like on
+  the disc. For example, run::
+
+	# /sbin/mke2fs /dev/pktcdvd/dev_name
+
+  to create an ext2 filesystem on the disc.
+
+
+Using the pktcdvd sysfs interface
+---------------------------------
+
+Since Linux 2.6.20, the pktcdvd module has a sysfs interface
+and can be controlled by it. For example the "pktcdvd" tool uses
+this interface. (see http://tom.ist-im-web.de/download/pktcdvd )
+
+"pktcdvd" works similar to "pktsetup", e.g.::
+
+	# pktcdvd -a dev_name /dev/hdc
+	# mkudffs /dev/pktcdvd/dev_name
+	# mount -t udf -o rw,noatime /dev/pktcdvd/dev_name /dvdram
+	# cp files /dvdram
+	# umount /dvdram
+	# pktcdvd -r dev_name
+
+
+For a description of the sysfs interface look into the file:
+
+  Documentation/ABI/testing/sysfs-class-pktcdvd
+
+
+Using the pktcdvd debugfs interface
+-----------------------------------
+
+To read pktcdvd device infos in human readable form, do::
+
+	# cat /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/info
+
+For a description of the debugfs interface look into the file:
+
+  Documentation/ABI/testing/debugfs-pktcdvd
+
+
+
+Links
+-----
+
+See http://fy.chalmers.se/~appro/linux/DVD+RW/ for more information
+about DVD writing.
diff --git a/Documentation/cdrom/packet-writing.txt b/Documentation/cdrom/packet-writing.txt
deleted file mode 100644
index 2834170d821e..000000000000
--- a/Documentation/cdrom/packet-writing.txt
+++ /dev/null
@@ -1,132 +0,0 @@
-Getting started quick
----------------------
-
-- Select packet support in the block device section and UDF support in
-  the file system section.
-
-- Compile and install kernel and modules, reboot.
-
-- You need the udftools package (pktsetup, mkudffs, cdrwtool).
-  Download from http://sourceforge.net/projects/linux-udf/
-
-- Grab a new CD-RW disc and format it (assuming CD-RW is hdc, substitute
-  as appropriate):
-	# cdrwtool -d /dev/hdc -q
-
-- Setup your writer
-	# pktsetup dev_name /dev/hdc
-
-- Now you can mount /dev/pktcdvd/dev_name and copy files to it. Enjoy!
-	# mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime
-
-
-Packet writing for DVD-RW media
--------------------------------
-
-DVD-RW discs can be written to much like CD-RW discs if they are in
-the so called "restricted overwrite" mode. To put a disc in restricted
-overwrite mode, run:
-
-	# dvd+rw-format /dev/hdc
-
-You can then use the disc the same way you would use a CD-RW disc:
-
-	# pktsetup dev_name /dev/hdc
-	# mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime
-
-
-Packet writing for DVD+RW media
--------------------------------
-
-According to the DVD+RW specification, a drive supporting DVD+RW discs
-shall implement "true random writes with 2KB granularity", which means
-that it should be possible to put any filesystem with a block size >=
-2KB on such a disc. For example, it should be possible to do:
-
-	# dvd+rw-format /dev/hdc   (only needed if the disc has never
-	                            been formatted)
-	# mkudffs /dev/hdc
-	# mount /dev/hdc /cdrom -t udf -o rw,noatime
-
-However, some drives don't follow the specification and expect the
-host to perform aligned writes at 32KB boundaries. Other drives do
-follow the specification, but suffer bad performance problems if the
-writes are not 32KB aligned.
-
-Both problems can be solved by using the pktcdvd driver, which always
-generates aligned writes.
-
-	# dvd+rw-format /dev/hdc
-	# pktsetup dev_name /dev/hdc
-	# mkudffs /dev/pktcdvd/dev_name
-	# mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime
-
-
-Packet writing for DVD-RAM media
---------------------------------
-
-DVD-RAM discs are random writable, so using the pktcdvd driver is not
-necessary. However, using the pktcdvd driver can improve performance
-in the same way it does for DVD+RW media.
-
-
-Notes
------
-
-- CD-RW media can usually not be overwritten more than about 1000
-  times, so to avoid unnecessary wear on the media, you should always
-  use the noatime mount option.
-
-- Defect management (ie automatic remapping of bad sectors) has not
-  been implemented yet, so you are likely to get at least some
-  filesystem corruption if the disc wears out.
-
-- Since the pktcdvd driver makes the disc appear as a regular block
-  device with a 2KB block size, you can put any filesystem you like on
-  the disc. For example, run:
-
-	# /sbin/mke2fs /dev/pktcdvd/dev_name
-
-  to create an ext2 filesystem on the disc.
-
-
-Using the pktcdvd sysfs interface
----------------------------------
-
-Since Linux 2.6.20, the pktcdvd module has a sysfs interface
-and can be controlled by it. For example the "pktcdvd" tool uses
-this interface. (see http://tom.ist-im-web.de/download/pktcdvd )
-
-"pktcdvd" works similar to "pktsetup", e.g.:
-
-	# pktcdvd -a dev_name /dev/hdc
-	# mkudffs /dev/pktcdvd/dev_name
-	# mount -t udf -o rw,noatime /dev/pktcdvd/dev_name /dvdram
-	# cp files /dvdram
-	# umount /dvdram
-	# pktcdvd -r dev_name
-
-
-For a description of the sysfs interface look into the file:
-
-  Documentation/ABI/testing/sysfs-class-pktcdvd
-
-
-Using the pktcdvd debugfs interface
------------------------------------
-
-To read pktcdvd device infos in human readable form, do:
-
-	# cat /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/info
-
-For a description of the debugfs interface look into the file:
-
-  Documentation/ABI/testing/debugfs-pktcdvd
-
-
-
-Links
------
-
-See http://fy.chalmers.se/~appro/linux/DVD+RW/ for more information
-about DVD writing.
diff --git a/Documentation/cgroup-v1/blkio-controller.txt b/Documentation/cgroup-v1/blkio-controller.txt
deleted file mode 100644
index 673dc34d3f78..000000000000
--- a/Documentation/cgroup-v1/blkio-controller.txt
+++ /dev/null
@@ -1,375 +0,0 @@
-				Block IO Controller
-				===================
-Overview
-========
-cgroup subsys "blkio" implements the block io controller. There seems to be
-a need of various kinds of IO control policies (like proportional BW, max BW)
-both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
-Plan is to use the same cgroup based management interface for blkio controller
-and based on user options switch IO policies in the background.
-
-Currently two IO control policies are implemented. First one is proportional
-weight time based division of disk policy. It is implemented in CFQ. Hence
-this policy takes effect only on leaf nodes when CFQ is being used. The second
-one is throttling policy which can be used to specify upper IO rate limits
-on devices. This policy is implemented in generic block layer and can be
-used on leaf nodes as well as higher level logical devices like device mapper.
-
-HOWTO
-=====
-Proportional Weight division of bandwidth
------------------------------------------
-You can do a very simple testing of running two dd threads in two different
-cgroups. Here is what you can do.
-
-- Enable Block IO controller
-	CONFIG_BLK_CGROUP=y
-
-- Enable group scheduling in CFQ
-	CONFIG_CFQ_GROUP_IOSCHED=y
-
-- Compile and boot into kernel and mount IO controller (blkio); see
-  cgroups.txt, Why are cgroups needed?.
-
-	mount -t tmpfs cgroup_root /sys/fs/cgroup
-	mkdir /sys/fs/cgroup/blkio
-	mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
-
-- Create two cgroups
-	mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2
-
-- Set weights of group test1 and test2
-	echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight
-	echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight
-
-- Create two same size files (say 512MB each) on same disk (file1, file2) and
-  launch two dd threads in different cgroup to read those files.
-
-	sync
-	echo 3 > /proc/sys/vm/drop_caches
-
-	dd if=/mnt/sdb/zerofile1 of=/dev/null &
-	echo $! > /sys/fs/cgroup/blkio/test1/tasks
-	cat /sys/fs/cgroup/blkio/test1/tasks
-
-	dd if=/mnt/sdb/zerofile2 of=/dev/null &
-	echo $! > /sys/fs/cgroup/blkio/test2/tasks
-	cat /sys/fs/cgroup/blkio/test2/tasks
-
-- At macro level, first dd should finish first. To get more precise data, keep
-  on looking at (with the help of script), at blkio.disk_time and
-  blkio.disk_sectors files of both test1 and test2 groups. This will tell how
-  much disk time (in milliseconds), each group got and how many sectors each
-  group dispatched to the disk. We provide fairness in terms of disk time, so
-  ideally io.disk_time of cgroups should be in proportion to the weight.
-
-Throttling/Upper Limit policy
------------------------------
-- Enable Block IO controller
-	CONFIG_BLK_CGROUP=y
-
-- Enable throttling in block layer
-	CONFIG_BLK_DEV_THROTTLING=y
-
-- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)
-        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
-
-- Specify a bandwidth rate on particular device for root group. The format
-  for policy is "<major>:<minor>  <bytes_per_second>".
-
-        echo "8:16  1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
-
-  Above will put a limit of 1MB/second on reads happening for root group
-  on device having major/minor number 8:16.
-
-- Run dd to read a file and see if rate is throttled to 1MB/s or not.
-
-        # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
-        1024+0 records in
-        1024+0 records out
-        4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
-
- Limits for writes can be put using blkio.throttle.write_bps_device file.
-
-Hierarchical Cgroups
-====================
-
-Both CFQ and throttling implement hierarchy support; however,
-throttling's hierarchy support is enabled iff "sane_behavior" is
-enabled from cgroup side, which currently is a development option and
-not publicly available.
-
-If somebody created a hierarchy like as follows.
-
-			root
-			/  \
-		     test1 test2
-			|
-		     test3
-
-CFQ by default and throttling with "sane_behavior" will handle the
-hierarchy correctly.  For details on CFQ hierarchy support, refer to
-Documentation/block/cfq-iosched.txt.  For throttling, all limits apply
-to the whole subtree while all statistics are local to the IOs
-directly generated by tasks in that cgroup.
-
-Throttling without "sane_behavior" enabled from cgroup side will
-practically treat all groups at same level as if it looks like the
-following.
-
-				pivot
-			     /  /   \  \
-			root  test1 test2  test3
-
-Various user visible config options
-===================================
-CONFIG_BLK_CGROUP
-	- Block IO controller.
-
-CONFIG_DEBUG_BLK_CGROUP
-	- Debug help. Right now some additional stats file show up in cgroup
-	  if this option is enabled.
-
-CONFIG_CFQ_GROUP_IOSCHED
-	- Enables group scheduling in CFQ. Currently only 1 level of group
-	  creation is allowed.
-
-CONFIG_BLK_DEV_THROTTLING
-	- Enable block device throttling support in block layer.
-
-Details of cgroup files
-=======================
-Proportional weight policy files
---------------------------------
-- blkio.weight
-	- Specifies per cgroup weight. This is default weight of the group
-	  on all the devices until and unless overridden by per device rule.
-	  (See blkio.weight_device).
-	  Currently allowed range of weights is from 10 to 1000.
-
-- blkio.weight_device
-	- One can specify per cgroup per device rules using this interface.
-	  These rules override the default value of group weight as specified
-	  by blkio.weight.
-
-	  Following is the format.
-
-	  # echo dev_maj:dev_minor weight > blkio.weight_device
-	  Configure weight=300 on /dev/sdb (8:16) in this cgroup
-	  # echo 8:16 300 > blkio.weight_device
-	  # cat blkio.weight_device
-	  dev     weight
-	  8:16    300
-
-	  Configure weight=500 on /dev/sda (8:0) in this cgroup
-	  # echo 8:0 500 > blkio.weight_device
-	  # cat blkio.weight_device
-	  dev     weight
-	  8:0     500
-	  8:16    300
-
-	  Remove specific weight for /dev/sda in this cgroup
-	  # echo 8:0 0 > blkio.weight_device
-	  # cat blkio.weight_device
-	  dev     weight
-	  8:16    300
-
-- blkio.leaf_weight[_device]
-	- Equivalents of blkio.weight[_device] for the purpose of
-          deciding how much weight tasks in the given cgroup has while
-          competing with the cgroup's child cgroups. For details,
-          please refer to Documentation/block/cfq-iosched.txt.
-
-- blkio.time
-	- disk time allocated to cgroup per device in milliseconds. First
-	  two fields specify the major and minor number of the device and
-	  third field specifies the disk time allocated to group in
-	  milliseconds.
-
-- blkio.sectors
-	- number of sectors transferred to/from disk by the group. First
-	  two fields specify the major and minor number of the device and
-	  third field specifies the number of sectors transferred by the
-	  group to/from the device.
-
-- blkio.io_service_bytes
-	- Number of bytes transferred to/from the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of bytes.
-
-- blkio.io_serviced
-	- Number of IOs (bio) issued to the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of IOs.
-
-- blkio.io_service_time
-	- Total amount of time between request dispatch and request completion
-	  for the IOs done by this cgroup. This is in nanoseconds to make it
-	  meaningful for flash devices too. For devices with queue depth of 1,
-	  this time represents the actual service time. When queue_depth > 1,
-	  that is no longer true as requests may be served out of order. This
-	  may cause the service time for a given IO to include the service time
-	  of multiple IOs when served out of order which may result in total
-	  io_service_time > actual time elapsed. This time is further divided by
-	  the type of operation - read or write, sync or async. First two fields
-	  specify the major and minor number of the device, third field
-	  specifies the operation type and the fourth field specifies the
-	  io_service_time in ns.
-
-- blkio.io_wait_time
-	- Total amount of time the IOs for this cgroup spent waiting in the
-	  scheduler queues for service. This can be greater than the total time
-	  elapsed since it is cumulative io_wait_time for all IOs. It is not a
-	  measure of total time the cgroup spent waiting but rather a measure of
-	  the wait_time for its individual IOs. For devices with queue_depth > 1
-	  this metric does not include the time spent waiting for service once
-	  the IO is dispatched to the device but till it actually gets serviced
-	  (there might be a time lag here due to re-ordering of requests by the
-	  device). This is in nanoseconds to make it meaningful for flash
-	  devices too. This time is further divided by the type of operation -
-	  read or write, sync or async. First two fields specify the major and
-	  minor number of the device, third field specifies the operation type
-	  and the fourth field specifies the io_wait_time in ns.
-
-- blkio.io_merged
-	- Total number of bios/requests merged into requests belonging to this
-	  cgroup. This is further divided by the type of operation - read or
-	  write, sync or async.
-
-- blkio.io_queued
-	- Total number of requests queued up at any given instant for this
-	  cgroup. This is further divided by the type of operation - read or
-	  write, sync or async.
-
-- blkio.avg_queue_size
-	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
-	  The average queue size for this cgroup over the entire time of this
-	  cgroup's existence. Queue size samples are taken each time one of the
-	  queues of this cgroup gets a timeslice.
-
-- blkio.group_wait_time
-	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
-	  This is the amount of time the cgroup had to wait since it became busy
-	  (i.e., went from 0 to 1 request queued) to get a timeslice for one of
-	  its queues. This is different from the io_wait_time which is the
-	  cumulative total of the amount of time spent by each IO in that cgroup
-	  waiting in the scheduler queue. This is in nanoseconds. If this is
-	  read when the cgroup is in a waiting (for timeslice) state, the stat
-	  will only report the group_wait_time accumulated till the last time it
-	  got a timeslice and will not include the current delta.
-
-- blkio.empty_time
-	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
-	  This is the amount of time a cgroup spends without any pending
-	  requests when not being served, i.e., it does not include any time
-	  spent idling for one of the queues of the cgroup. This is in
-	  nanoseconds. If this is read when the cgroup is in an empty state,
-	  the stat will only report the empty_time accumulated till the last
-	  time it had a pending request and will not include the current delta.
-
-- blkio.idle_time
-	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
-	  This is the amount of time spent by the IO scheduler idling for a
-	  given cgroup in anticipation of a better request than the existing ones
-	  from other queues/cgroups. This is in nanoseconds. If this is read
-	  when the cgroup is in an idling state, the stat will only report the
-	  idle_time accumulated till the last idle period and will not include
-	  the current delta.
-
-- blkio.dequeue
-	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
-	  gives the statistics about how many a times a group was dequeued
-	  from service tree of the device. First two fields specify the major
-	  and minor number of the device and third field specifies the number
-	  of times a group was dequeued from a particular device.
-
-- blkio.*_recursive
-	- Recursive version of various stats. These files show the
-          same information as their non-recursive counterparts but
-          include stats from all the descendant cgroups.
-
-Throttling/Upper limit policy files
------------------------------------
-- blkio.throttle.read_bps_device
-	- Specifies upper limit on READ rate from the device. IO rate is
-	  specified in bytes per second. Rules are per device. Following is
-	  the format.
-
-  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
-
-- blkio.throttle.write_bps_device
-	- Specifies upper limit on WRITE rate to the device. IO rate is
-	  specified in bytes per second. Rules are per device. Following is
-	  the format.
-
-  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
-
-- blkio.throttle.read_iops_device
-	- Specifies upper limit on READ rate from the device. IO rate is
-	  specified in IO per second. Rules are per device. Following is
-	  the format.
-
-  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
-
-- blkio.throttle.write_iops_device
-	- Specifies upper limit on WRITE rate to the device. IO rate is
-	  specified in io per second. Rules are per device. Following is
-	  the format.
-
-  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
-
-Note: If both BW and IOPS rules are specified for a device, then IO is
-      subjected to both the constraints.
-
-- blkio.throttle.io_serviced
-	- Number of IOs (bio) issued to the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of IOs.
-
-- blkio.throttle.io_service_bytes
-	- Number of bytes transferred to/from the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of bytes.
-
-Common files among various policies
------------------------------------
-- blkio.reset_stats
-	- Writing an int to this file will result in resetting all the stats
-	  for that cgroup.
-
-CFQ sysfs tunable
-=================
-/sys/block/<disk>/queue/iosched/slice_idle
-------------------------------------------
-On a faster hardware CFQ can be slow, especially with sequential workload.
-This happens because CFQ idles on a single queue and single queue might not
-drive deeper request queue depths to keep the storage busy. In such scenarios
-one can try setting slice_idle=0 and that would switch CFQ to IOPS
-(IO operations per second) mode on NCQ supporting hardware.
-
-That means CFQ will not idle between cfq queues of a cfq group and hence be
-able to driver higher queue depth and achieve better throughput. That also
-means that cfq provides fairness among groups in terms of IOPS and not in
-terms of disk time.
-
-/sys/block/<disk>/queue/iosched/group_idle
-------------------------------------------
-If one disables idling on individual cfq queues and cfq service trees by
-setting slice_idle=0, group_idle kicks in. That means CFQ will still idle
-on the group in an attempt to provide fairness among groups.
-
-By default group_idle is same as slice_idle and does not do anything if
-slice_idle is enabled.
-
-One can experience an overall throughput drop if you have created multiple
-groups and put applications in that group which are not driving enough
-IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
-on individual groups and throughput should improve.
diff --git a/Documentation/cgroup-v1/cgroups.txt b/Documentation/cgroup-v1/cgroups.txt
deleted file mode 100644
index 059f7063eea6..000000000000
--- a/Documentation/cgroup-v1/cgroups.txt
+++ /dev/null
@@ -1,677 +0,0 @@
-				CGROUPS
-				-------
-
-Written by Paul Menage <menage@google.com> based on
-Documentation/cgroup-v1/cpusets.txt
-
-Original copyright statements from cpusets.txt:
-Portions Copyright (C) 2004 BULL SA.
-Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-Modified by Paul Jackson <pj@sgi.com>
-Modified by Christoph Lameter <cl@linux.com>
-
-CONTENTS:
-=========
-
-1. Control Groups
-  1.1 What are cgroups ?
-  1.2 Why are cgroups needed ?
-  1.3 How are cgroups implemented ?
-  1.4 What does notify_on_release do ?
-  1.5 What does clone_children do ?
-  1.6 How do I use cgroups ?
-2. Usage Examples and Syntax
-  2.1 Basic Usage
-  2.2 Attaching processes
-  2.3 Mounting hierarchies by name
-3. Kernel API
-  3.1 Overview
-  3.2 Synchronization
-  3.3 Subsystem API
-4. Extended attributes usage
-5. Questions
-
-1. Control Groups
-=================
-
-1.1 What are cgroups ?
-----------------------
-
-Control Groups provide a mechanism for aggregating/partitioning sets of
-tasks, and all their future children, into hierarchical groups with
-specialized behaviour.
-
-Definitions:
-
-A *cgroup* associates a set of tasks with a set of parameters for one
-or more subsystems.
-
-A *subsystem* is a module that makes use of the task grouping
-facilities provided by cgroups to treat groups of tasks in
-particular ways. A subsystem is typically a "resource controller" that
-schedules a resource or applies per-cgroup limits, but it may be
-anything that wants to act on a group of processes, e.g. a
-virtualization subsystem.
-
-A *hierarchy* is a set of cgroups arranged in a tree, such that
-every task in the system is in exactly one of the cgroups in the
-hierarchy, and a set of subsystems; each subsystem has system-specific
-state attached to each cgroup in the hierarchy.  Each hierarchy has
-an instance of the cgroup virtual filesystem associated with it.
-
-At any one time there may be multiple active hierarchies of task
-cgroups. Each hierarchy is a partition of all tasks in the system.
-
-User-level code may create and destroy cgroups by name in an
-instance of the cgroup virtual file system, specify and query to
-which cgroup a task is assigned, and list the task PIDs assigned to
-a cgroup. Those creations and assignments only affect the hierarchy
-associated with that instance of the cgroup file system.
-
-On their own, the only use for cgroups is for simple job
-tracking. The intention is that other subsystems hook into the generic
-cgroup support to provide new attributes for cgroups, such as
-accounting/limiting the resources which processes in a cgroup can
-access. For example, cpusets (see Documentation/cgroup-v1/cpusets.txt) allow
-you to associate a set of CPUs and a set of memory nodes with the
-tasks in each cgroup.
-
-1.2 Why are cgroups needed ?
-----------------------------
-
-There are multiple efforts to provide process aggregations in the
-Linux kernel, mainly for resource-tracking purposes. Such efforts
-include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
-namespaces. These all require the basic notion of a
-grouping/partitioning of processes, with newly forked processes ending
-up in the same group (cgroup) as their parent process.
-
-The kernel cgroup patch provides the minimum essential kernel
-mechanisms required to efficiently implement such groups. It has
-minimal impact on the system fast paths, and provides hooks for
-specific subsystems such as cpusets to provide additional behaviour as
-desired.
-
-Multiple hierarchy support is provided to allow for situations where
-the division of tasks into cgroups is distinctly different for
-different subsystems - having parallel hierarchies allows each
-hierarchy to be a natural division of tasks, without having to handle
-complex combinations of tasks that would be present if several
-unrelated subsystems needed to be forced into the same tree of
-cgroups.
-
-At one extreme, each resource controller or subsystem could be in a
-separate hierarchy; at the other extreme, all subsystems
-would be attached to the same hierarchy.
-
-As an example of a scenario (originally proposed by vatsa@in.ibm.com)
-that can benefit from multiple hierarchies, consider a large
-university server with various users - students, professors, system
-tasks etc. The resource planning for this server could be along the
-following lines:
-
-       CPU :          "Top cpuset"
-                       /       \
-               CPUSet1         CPUSet2
-                  |               |
-               (Professors)    (Students)
-
-               In addition (system tasks) are attached to topcpuset (so
-               that they can run anywhere) with a limit of 20%
-
-       Memory : Professors (50%), Students (30%), system (20%)
-
-       Disk : Professors (50%), Students (30%), system (20%)
-
-       Network : WWW browsing (20%), Network File System (60%), others (20%)
-                               / \
-               Professors (15%)  students (5%)
-
-Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
-into the NFS network class.
-
-At the same time Firefox/Lynx will share an appropriate CPU/Memory class
-depending on who launched it (prof/student).
-
-With the ability to classify tasks differently for different resources
-(by putting those resource subsystems in different hierarchies),
-the admin can easily set up a script which receives exec notifications
-and depending on who is launching the browser he can
-
-    # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
-
-With only a single hierarchy, he now would potentially have to create
-a separate cgroup for every browser launched and associate it with
-appropriate network and other resource class.  This may lead to
-proliferation of such cgroups.
-
-Also let's say that the administrator would like to give enhanced network
-access temporarily to a student's browser (since it is night and the user
-wants to do online gaming :))  OR give one of the student's simulation
-apps enhanced CPU power.
-
-With ability to write PIDs directly to resource classes, it's just a
-matter of:
-
-       # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
-       (after some time)
-       # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
-
-Without this ability, the administrator would have to split the cgroup into
-multiple separate ones and then associate the new cgroups with the
-new resource classes.
-
-
-
-1.3 How are cgroups implemented ?
----------------------------------
-
-Control Groups extends the kernel as follows:
-
- - Each task in the system has a reference-counted pointer to a
-   css_set.
-
- - A css_set contains a set of reference-counted pointers to
-   cgroup_subsys_state objects, one for each cgroup subsystem
-   registered in the system. There is no direct link from a task to
-   the cgroup of which it's a member in each hierarchy, but this
-   can be determined by following pointers through the
-   cgroup_subsys_state objects. This is because accessing the
-   subsystem state is something that's expected to happen frequently
-   and in performance-critical code, whereas operations that require a
-   task's actual cgroup assignments (in particular, moving between
-   cgroups) are less common. A linked list runs through the cg_list
-   field of each task_struct using the css_set, anchored at
-   css_set->tasks.
-
- - A cgroup hierarchy filesystem can be mounted for browsing and
-   manipulation from user space.
-
- - You can list all the tasks (by PID) attached to any cgroup.
-
-The implementation of cgroups requires a few, simple hooks
-into the rest of the kernel, none in performance-critical paths:
-
- - in init/main.c, to initialize the root cgroups and initial
-   css_set at system boot.
-
- - in fork and exit, to attach and detach a task from its css_set.
-
-In addition, a new file system of type "cgroup" may be mounted, to
-enable browsing and modifying the cgroups presently known to the
-kernel.  When mounting a cgroup hierarchy, you may specify a
-comma-separated list of subsystems to mount as the filesystem mount
-options.  By default, mounting the cgroup filesystem attempts to
-mount a hierarchy containing all registered subsystems.
-
-If an active hierarchy with exactly the same set of subsystems already
-exists, it will be reused for the new mount. If no existing hierarchy
-matches, and any of the requested subsystems are in use in an existing
-hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
-is activated, associated with the requested subsystems.
-
-It's not currently possible to bind a new subsystem to an active
-cgroup hierarchy, or to unbind a subsystem from an active cgroup
-hierarchy. This may be possible in future, but is fraught with nasty
-error-recovery issues.
-
-When a cgroup filesystem is unmounted, if there are any
-child cgroups created below the top-level cgroup, that hierarchy
-will remain active even though unmounted; if there are no
-child cgroups then the hierarchy will be deactivated.
-
-No new system calls are added for cgroups - all support for
-querying and modifying cgroups is via this cgroup file system.
-
-Each task under /proc has an added file named 'cgroup' displaying,
-for each active hierarchy, the subsystem names and the cgroup name
-as the path relative to the root of the cgroup file system.
-
-Each cgroup is represented by a directory in the cgroup file system
-containing the following files describing that cgroup:
-
- - tasks: list of tasks (by PID) attached to that cgroup.  This list
-   is not guaranteed to be sorted.  Writing a thread ID into this file
-   moves the thread into this cgroup.
- - cgroup.procs: list of thread group IDs in the cgroup.  This list is
-   not guaranteed to be sorted or free of duplicate TGIDs, and userspace
-   should sort/uniquify the list if this property is required.
-   Writing a thread group ID into this file moves all threads in that
-   group into this cgroup.
- - notify_on_release flag: run the release agent on exit?
- - release_agent: the path to use for release notifications (this file
-   exists in the top cgroup only)
-
-Other subsystems such as cpusets may add additional files in each
-cgroup dir.
-
-New cgroups are created using the mkdir system call or shell
-command.  The properties of a cgroup, such as its flags, are
-modified by writing to the appropriate file in that cgroups
-directory, as listed above.
-
-The named hierarchical structure of nested cgroups allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cgroup allows organizing the work load
-on a system into related sets of tasks.  A task may be re-attached to
-any other cgroup, if allowed by the permissions on the necessary
-cgroup file system directories.
-
-When a task is moved from one cgroup to another, it gets a new
-css_set pointer - if there's an already existing css_set with the
-desired collection of cgroups then that group is reused, otherwise a new
-css_set is allocated. The appropriate existing css_set is located by
-looking into a hash table.
-
-To allow access from a cgroup to the css_sets (and hence tasks)
-that comprise it, a set of cg_cgroup_link objects form a lattice;
-each cg_cgroup_link is linked into a list of cg_cgroup_links for
-a single cgroup on its cgrp_link_list field, and a list of
-cg_cgroup_links for a single css_set on its cg_link_list.
-
-Thus the set of tasks in a cgroup can be listed by iterating over
-each css_set that references the cgroup, and sub-iterating over
-each css_set's task set.
-
-The use of a Linux virtual file system (vfs) to represent the
-cgroup hierarchy provides for a familiar permission and name space
-for cgroups, with a minimum of additional kernel code.
-
-1.4 What does notify_on_release do ?
-------------------------------------
-
-If the notify_on_release flag is enabled (1) in a cgroup, then
-whenever the last task in the cgroup leaves (exits or attaches to
-some other cgroup) and the last child cgroup of that cgroup
-is removed, then the kernel runs the command specified by the contents
-of the "release_agent" file in that hierarchy's root directory,
-supplying the pathname (relative to the mount point of the cgroup
-file system) of the abandoned cgroup.  This enables automatic
-removal of abandoned cgroups.  The default value of
-notify_on_release in the root cgroup at system boot is disabled
-(0).  The default value of other cgroups at creation is the current
-value of their parents' notify_on_release settings. The default value of
-a cgroup hierarchy's release_agent path is empty.
-
-1.5 What does clone_children do ?
----------------------------------
-
-This flag only affects the cpuset controller. If the clone_children
-flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
-configuration from the parent during initialization.
-
-1.6 How do I use cgroups ?
---------------------------
-
-To start a new job that is to be contained within a cgroup, using
-the "cpuset" cgroup subsystem, the steps are something like:
-
- 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
- 2) mkdir /sys/fs/cgroup/cpuset
- 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
-    the /sys/fs/cgroup/cpuset virtual file system.
- 5) Start a task that will be the "founding father" of the new job.
- 6) Attach that task to the new cgroup by writing its PID to the
-    /sys/fs/cgroup/cpuset tasks file for that cgroup.
- 7) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will setup a cgroup
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cgroup:
-
-  mount -t tmpfs cgroup_root /sys/fs/cgroup
-  mkdir /sys/fs/cgroup/cpuset
-  mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
-  cd /sys/fs/cgroup/cpuset
-  mkdir Charlie
-  cd Charlie
-  /bin/echo 2-3 > cpuset.cpus
-  /bin/echo 1 > cpuset.mems
-  /bin/echo $$ > tasks
-  sh
-  # The subshell 'sh' is now running in cgroup Charlie
-  # The next line should display '/Charlie'
-  cat /proc/self/cgroup
-
-2. Usage Examples and Syntax
-============================
-
-2.1 Basic Usage
----------------
-
-Creating, modifying, using cgroups can be done through the cgroup
-virtual filesystem.
-
-To mount a cgroup hierarchy with all available subsystems, type:
-# mount -t cgroup xxx /sys/fs/cgroup
-
-The "xxx" is not interpreted by the cgroup code, but will appear in
-/proc/mounts so may be any useful identifying string that you like.
-
-Note: Some subsystems do not work without some user input first.  For instance,
-if cpusets are enabled the user will have to populate the cpus and mems files
-for each new cgroup created before that group can be used.
-
-As explained in section `1.2 Why are cgroups needed?' you should create
-different hierarchies of cgroups for each single resource or group of
-resources you want to control. Therefore, you should mount a tmpfs on
-/sys/fs/cgroup and create directories for each cgroup resource or resource
-group.
-
-# mount -t tmpfs cgroup_root /sys/fs/cgroup
-# mkdir /sys/fs/cgroup/rg1
-
-To mount a cgroup hierarchy with just the cpuset and memory
-subsystems, type:
-# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
-
-While remounting cgroups is currently supported, it is not recommend
-to use it. Remounting allows changing bound subsystems and
-release_agent. Rebinding is hardly useful as it only works when the
-hierarchy is empty and release_agent itself should be replaced with
-conventional fsnotify. The support for remounting will be removed in
-the future.
-
-To Specify a hierarchy's release_agent:
-# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
-  xxx /sys/fs/cgroup/rg1
-
-Note that specifying 'release_agent' more than once will return failure.
-
-Note that changing the set of subsystems is currently only supported
-when the hierarchy consists of a single (root) cgroup. Supporting
-the ability to arbitrarily bind/unbind subsystems from an existing
-cgroup hierarchy is intended to be implemented in the future.
-
-Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
-tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
-is the cgroup that holds the whole system.
-
-If you want to change the value of release_agent:
-# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
-
-It can also be changed via remount.
-
-If you want to create a new cgroup under /sys/fs/cgroup/rg1:
-# cd /sys/fs/cgroup/rg1
-# mkdir my_cgroup
-
-Now you want to do something with this cgroup.
-# cd my_cgroup
-
-In this directory you can find several files:
-# ls
-cgroup.procs notify_on_release tasks
-(plus whatever files added by the attached subsystems)
-
-Now attach your shell to this cgroup:
-# /bin/echo $$ > tasks
-
-You can also create cgroups inside your cgroup by using mkdir in this
-directory.
-# mkdir my_sub_cs
-
-To remove a cgroup, just use rmdir:
-# rmdir my_sub_cs
-
-This will fail if the cgroup is in use (has cgroups inside, or
-has processes attached, or is held alive by other subsystem-specific
-reference).
-
-2.2 Attaching processes
------------------------
-
-# /bin/echo PID > tasks
-
-Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another:
-
-# /bin/echo PID1 > tasks
-# /bin/echo PID2 > tasks
-	...
-# /bin/echo PIDn > tasks
-
-You can attach the current shell task by echoing 0:
-
-# echo 0 > tasks
-
-You can use the cgroup.procs file instead of the tasks file to move all
-threads in a threadgroup at once. Echoing the PID of any task in a
-threadgroup to cgroup.procs causes all tasks in that threadgroup to be
-attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
-in the writing task's threadgroup.
-
-Note: Since every task is always a member of exactly one cgroup in each
-mounted hierarchy, to remove a task from its current cgroup you must
-move it into a new cgroup (possibly the root cgroup) by writing to the
-new cgroup's tasks file.
-
-Note: Due to some restrictions enforced by some cgroup subsystems, moving
-a process to another cgroup can fail.
-
-2.3 Mounting hierarchies by name
---------------------------------
-
-Passing the name=<x> option when mounting a cgroups hierarchy
-associates the given name with the hierarchy.  This can be used when
-mounting a pre-existing hierarchy, in order to refer to it by name
-rather than by its set of active subsystems.  Each hierarchy is either
-nameless, or has a unique name.
-
-The name should match [\w.-]+
-
-When passing a name=<x> option for a new hierarchy, you need to
-specify subsystems manually; the legacy behaviour of mounting all
-subsystems when none are explicitly specified is not supported when
-you give a subsystem a name.
-
-The name of the subsystem appears as part of the hierarchy description
-in /proc/mounts and /proc/<pid>/cgroups.
-
-
-3. Kernel API
-=============
-
-3.1 Overview
-------------
-
-Each kernel subsystem that wants to hook into the generic cgroup
-system needs to create a cgroup_subsys object. This contains
-various methods, which are callbacks from the cgroup system, along
-with a subsystem ID which will be assigned by the cgroup system.
-
-Other fields in the cgroup_subsys object include:
-
-- subsys_id: a unique array index for the subsystem, indicating which
-  entry in cgroup->subsys[] this subsystem should be managing.
-
-- name: should be initialized to a unique subsystem name. Should be
-  no longer than MAX_CGROUP_TYPE_NAMELEN.
-
-- early_init: indicate if the subsystem needs early initialization
-  at system boot.
-
-Each cgroup object created by the system has an array of pointers,
-indexed by subsystem ID; this pointer is entirely managed by the
-subsystem; the generic cgroup code will never touch this pointer.
-
-3.2 Synchronization
--------------------
-
-There is a global mutex, cgroup_mutex, used by the cgroup
-system. This should be taken by anything that wants to modify a
-cgroup. It may also be taken to prevent cgroups from being
-modified, but more specific locks may be more appropriate in that
-situation.
-
-See kernel/cgroup.c for more details.
-
-Subsystems can take/release the cgroup_mutex via the functions
-cgroup_lock()/cgroup_unlock().
-
-Accessing a task's cgroup pointer may be done in the following ways:
-- while holding cgroup_mutex
-- while holding the task's alloc_lock (via task_lock())
-- inside an rcu_read_lock() section via rcu_dereference()
-
-3.3 Subsystem API
------------------
-
-Each subsystem should:
-
-- add an entry in linux/cgroup_subsys.h
-- define a cgroup_subsys object called <name>_cgrp_subsys
-
-Each subsystem may export the following methods. The only mandatory
-methods are css_alloc/free. Any others that are null are presumed to
-be successful no-ops.
-
-struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-Called to allocate a subsystem state object for a cgroup. The
-subsystem should allocate its subsystem state object for the passed
-cgroup, returning a pointer to the new object on success or a
-ERR_PTR() value. On success, the subsystem pointer should point to
-a structure of type cgroup_subsys_state (typically embedded in a
-larger subsystem-specific object), which will be initialized by the
-cgroup system. Note that this will be called at initialization to
-create the root subsystem state for this subsystem; this case can be
-identified by the passed cgroup object having a NULL parent (since
-it's the root of the hierarchy) and may be an appropriate place for
-initialization code.
-
-int css_online(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-Called after @cgrp successfully completed all allocations and made
-visible to cgroup_for_each_child/descendant_*() iterators. The
-subsystem may choose to fail creation by returning -errno. This
-callback can be used to implement reliable state sharing and
-propagation along the hierarchy. See the comment on
-cgroup_for_each_descendant_pre() for details.
-
-void css_offline(struct cgroup *cgrp);
-(cgroup_mutex held by caller)
-
-This is the counterpart of css_online() and called iff css_online()
-has succeeded on @cgrp. This signifies the beginning of the end of
-@cgrp. @cgrp is being removed and the subsystem should start dropping
-all references it's holding on @cgrp. When all references are dropped,
-cgroup removal will proceed to the next step - css_free(). After this
-callback, @cgrp should be considered dead to the subsystem.
-
-void css_free(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-The cgroup system is about to free @cgrp; the subsystem should free
-its subsystem state object. By the time this method is called, @cgrp
-is completely unused; @cgrp->parent is still valid. (Note - can also
-be called for a newly-created cgroup if an error occurs after this
-subsystem's create() method has been called for the new cgroup).
-
-int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-(cgroup_mutex held by caller)
-
-Called prior to moving one or more tasks into a cgroup; if the
-subsystem returns an error, this will abort the attach operation.
-@tset contains the tasks to be attached and is guaranteed to have at
-least one task in it.
-
-If there are multiple tasks in the taskset, then:
-  - it's guaranteed that all are from the same thread group
-  - @tset contains all tasks from the thread group whether or not
-    they're switching cgroups
-  - the first task is the leader
-
-Each @tset entry also contains the task's old cgroup and tasks which
-aren't switching cgroup can be skipped easily using the
-cgroup_taskset_for_each() iterator. Note that this isn't called on a
-fork. If this method returns 0 (success) then this should remain valid
-while the caller holds cgroup_mutex and it is ensured that either
-attach() or cancel_attach() will be called in future.
-
-void css_reset(struct cgroup_subsys_state *css)
-(cgroup_mutex held by caller)
-
-An optional operation which should restore @css's configuration to the
-initial state.  This is currently only used on the unified hierarchy
-when a subsystem is disabled on a cgroup through
-"cgroup.subtree_control" but should remain enabled because other
-subsystems depend on it.  cgroup core makes such a css invisible by
-removing the associated interface files and invokes this callback so
-that the hidden subsystem can return to the initial neutral state.
-This prevents unexpected resource control from a hidden css and
-ensures that the configuration is in the initial state when it is made
-visible again later.
-
-void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-(cgroup_mutex held by caller)
-
-Called when a task attach operation has failed after can_attach() has succeeded.
-A subsystem whose can_attach() has some side-effects should provide this
-function, so that the subsystem can implement a rollback. If not, not necessary.
-This will be called only about subsystems whose can_attach() operation have
-succeeded. The parameters are identical to can_attach().
-
-void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-(cgroup_mutex held by caller)
-
-Called after the task has been attached to the cgroup, to allow any
-post-attachment activity that requires memory allocations or blocking.
-The parameters are identical to can_attach().
-
-void fork(struct task_struct *task)
-
-Called when a task is forked into a cgroup.
-
-void exit(struct task_struct *task)
-
-Called during task exit.
-
-void free(struct task_struct *task)
-
-Called when the task_struct is freed.
-
-void bind(struct cgroup *root)
-(cgroup_mutex held by caller)
-
-Called when a cgroup subsystem is rebound to a different hierarchy
-and root cgroup. Currently this will only involve movement between
-the default hierarchy (which never has sub-cgroups) and a hierarchy
-that is being created/destroyed (and hence has no sub-cgroups).
-
-4. Extended attribute usage
-===========================
-
-cgroup filesystem supports certain types of extended attributes in its
-directories and files.  The current supported types are:
-	- Trusted (XATTR_TRUSTED)
-	- Security (XATTR_SECURITY)
-
-Both require CAP_SYS_ADMIN capability to set.
-
-Like in tmpfs, the extended attributes in cgroup filesystem are stored
-using kernel memory and it's advised to keep the usage at minimum.  This
-is the reason why user defined extended attributes are not supported, since
-any user can do it and there's no limit in the value size.
-
-The current known users for this feature are SELinux to limit cgroup usage
-in containers and systemd for assorted meta data like main PID in a cgroup
-(systemd creates a cgroup per service).
-
-5. Questions
-============
-
-Q: what's up with this '/bin/echo' ?
-A: bash's builtin 'echo' command does not check calls to write() against
-   errors. If you use it in the cgroup file system, you won't be
-   able to tell whether a command succeeded or failed.
-
-Q: When I attach processes, only the first of the line gets really attached !
-A: We can only return one error code per call to write(). So you should also
-   put only ONE PID.
-
diff --git a/Documentation/cgroup-v1/cpuacct.txt b/Documentation/cgroup-v1/cpuacct.txt
deleted file mode 100644
index 9d73cc0cadb9..000000000000
--- a/Documentation/cgroup-v1/cpuacct.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-CPU Accounting Controller
--------------------------
-
-The CPU accounting controller is used to group tasks using cgroups and
-account the CPU usage of these groups of tasks.
-
-The CPU accounting controller supports multi-hierarchy groups. An accounting
-group accumulates the CPU usage of all of its child groups and the tasks
-directly present in its group.
-
-Accounting groups can be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -ocpuacct none /sys/fs/cgroup
-
-With the above step, the initial or the parent accounting group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
-the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
-by this group which is essentially the CPU time obtained by all the tasks
-in the system.
-
-New accounting groups can be created under the parent group /sys/fs/cgroup.
-
-# cd /sys/fs/cgroup
-# mkdir g1
-# echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it. CPU time consumed by this bash and its children
-can be obtained from g1/cpuacct.usage and the same is accumulated in
-/sys/fs/cgroup/cpuacct.usage also.
-
-cpuacct.stat file lists a few statistics which further divide the
-CPU time obtained by the cgroup into user and system times. Currently
-the following statistics are supported:
-
-user: Time spent by tasks of the cgroup in user mode.
-system: Time spent by tasks of the cgroup in kernel mode.
-
-user and system are in USER_HZ unit.
-
-cpuacct controller uses percpu_counter interface to collect user and
-system times. This has two side effects:
-
-- It is theoretically possible to see wrong values for user and system times.
-  This is because percpu_counter_read() on 32bit systems isn't safe
-  against concurrent writes.
-- It is possible to see slightly outdated values for user and system times
-  due to the batch processing nature of percpu_counter.
diff --git a/Documentation/cgroup-v1/cpusets.txt b/Documentation/cgroup-v1/cpusets.txt
deleted file mode 100644
index 8402dd6de8df..000000000000
--- a/Documentation/cgroup-v1/cpusets.txt
+++ /dev/null
@@ -1,839 +0,0 @@
-				CPUSETS
-				-------
-
-Copyright (C) 2004 BULL SA.
-Written by Simon.Derr@bull.net
-
-Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-Modified by Paul Jackson <pj@sgi.com>
-Modified by Christoph Lameter <cl@linux.com>
-Modified by Paul Menage <menage@google.com>
-Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
-
-CONTENTS:
-=========
-
-1. Cpusets
-  1.1 What are cpusets ?
-  1.2 Why are cpusets needed ?
-  1.3 How are cpusets implemented ?
-  1.4 What are exclusive cpusets ?
-  1.5 What is memory_pressure ?
-  1.6 What is memory spread ?
-  1.7 What is sched_load_balance ?
-  1.8 What is sched_relax_domain_level ?
-  1.9 How do I use cpusets ?
-2. Usage Examples and Syntax
-  2.1 Basic Usage
-  2.2 Adding/removing cpus
-  2.3 Setting flags
-  2.4 Attaching processes
-3. Questions
-4. Contact
-
-1. Cpusets
-==========
-
-1.1 What are cpusets ?
-----------------------
-
-Cpusets provide a mechanism for assigning a set of CPUs and Memory
-Nodes to a set of tasks.   In this document "Memory Node" refers to
-an on-line node that contains memory.
-
-Cpusets constrain the CPU and Memory placement of tasks to only
-the resources within a task's current cpuset.  They form a nested
-hierarchy visible in a virtual file system.  These are the essential
-hooks, beyond what is already present, required to manage dynamic
-job placement on large systems.
-
-Cpusets use the generic cgroup subsystem described in
-Documentation/cgroup-v1/cgroups.txt.
-
-Requests by a task, using the sched_setaffinity(2) system call to
-include CPUs in its CPU affinity mask, and using the mbind(2) and
-set_mempolicy(2) system calls to include Memory Nodes in its memory
-policy, are both filtered through that task's cpuset, filtering out any
-CPUs or Memory Nodes not in that cpuset.  The scheduler will not
-schedule a task on a CPU that is not allowed in its cpus_allowed
-vector, and the kernel page allocator will not allocate a page on a
-node that is not allowed in the requesting task's mems_allowed vector.
-
-User level code may create and destroy cpusets by name in the cgroup
-virtual file system, manage the attributes and permissions of these
-cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
-specify and query to which cpuset a task is assigned, and list the
-task pids assigned to a cpuset.
-
-
-1.2 Why are cpusets needed ?
-----------------------------
-
-The management of large computer systems, with many processors (CPUs),
-complex memory cache hierarchies and multiple Memory Nodes having
-non-uniform access times (NUMA) presents additional challenges for
-the efficient scheduling and memory placement of processes.
-
-Frequently more modest sized systems can be operated with adequate
-efficiency just by letting the operating system automatically share
-the available CPU and Memory resources amongst the requesting tasks.
-
-But larger systems, which benefit more from careful processor and
-memory placement to reduce memory access times and contention,
-and which typically represent a larger investment for the customer,
-can benefit from explicitly placing jobs on properly sized subsets of
-the system.
-
-This can be especially valuable on:
-
-    * Web Servers running multiple instances of the same web application,
-    * Servers running different applications (for instance, a web server
-      and a database), or
-    * NUMA systems running large HPC applications with demanding
-      performance characteristics.
-
-These subsets, or "soft partitions" must be able to be dynamically
-adjusted, as the job mix changes, without impacting other concurrently
-executing jobs. The location of the running jobs pages may also be moved
-when the memory locations are changed.
-
-The kernel cpuset patch provides the minimum essential kernel
-mechanisms required to efficiently implement such subsets.  It
-leverages existing CPU and Memory Placement facilities in the Linux
-kernel to avoid any additional impact on the critical scheduler or
-memory allocator code.
-
-
-1.3 How are cpusets implemented ?
----------------------------------
-
-Cpusets provide a Linux kernel mechanism to constrain which CPUs and
-Memory Nodes are used by a process or set of processes.
-
-The Linux kernel already has a pair of mechanisms to specify on which
-CPUs a task may be scheduled (sched_setaffinity) and on which Memory
-Nodes it may obtain memory (mbind, set_mempolicy).
-
-Cpusets extends these two mechanisms as follows:
-
- - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
-   kernel.
- - Each task in the system is attached to a cpuset, via a pointer
-   in the task structure to a reference counted cgroup structure.
- - Calls to sched_setaffinity are filtered to just those CPUs
-   allowed in that task's cpuset.
- - Calls to mbind and set_mempolicy are filtered to just
-   those Memory Nodes allowed in that task's cpuset.
- - The root cpuset contains all the systems CPUs and Memory
-   Nodes.
- - For any cpuset, one can define child cpusets containing a subset
-   of the parents CPU and Memory Node resources.
- - The hierarchy of cpusets can be mounted at /dev/cpuset, for
-   browsing and manipulation from user space.
- - A cpuset may be marked exclusive, which ensures that no other
-   cpuset (except direct ancestors and descendants) may contain
-   any overlapping CPUs or Memory Nodes.
- - You can list all the tasks (by pid) attached to any cpuset.
-
-The implementation of cpusets requires a few, simple hooks
-into the rest of the kernel, none in performance critical paths:
-
- - in init/main.c, to initialize the root cpuset at system boot.
- - in fork and exit, to attach and detach a task from its cpuset.
- - in sched_setaffinity, to mask the requested CPUs by what's
-   allowed in that task's cpuset.
- - in sched.c migrate_live_tasks(), to keep migrating tasks within
-   the CPUs allowed by their cpuset, if possible.
- - in the mbind and set_mempolicy system calls, to mask the requested
-   Memory Nodes by what's allowed in that task's cpuset.
- - in page_alloc.c, to restrict memory to allowed nodes.
- - in vmscan.c, to restrict page recovery to the current cpuset.
-
-You should mount the "cgroup" filesystem type in order to enable
-browsing and modifying the cpusets presently known to the kernel.  No
-new system calls are added for cpusets - all support for querying and
-modifying cpusets is via this cpuset file system.
-
-The /proc/<pid>/status file for each task has four added lines,
-displaying the task's cpus_allowed (on which CPUs it may be scheduled)
-and mems_allowed (on which Memory Nodes it may obtain memory),
-in the two formats seen in the following example:
-
-  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
-  Cpus_allowed_list:      0-127
-  Mems_allowed:   ffffffff,ffffffff
-  Mems_allowed_list:      0-63
-
-Each cpuset is represented by a directory in the cgroup file system
-containing (on top of the standard cgroup files) the following
-files describing that cpuset:
-
- - cpuset.cpus: list of CPUs in that cpuset
- - cpuset.mems: list of Memory Nodes in that cpuset
- - cpuset.memory_migrate flag: if set, move pages to cpusets nodes
- - cpuset.cpu_exclusive flag: is cpu placement exclusive?
- - cpuset.mem_exclusive flag: is memory placement exclusive?
- - cpuset.mem_hardwall flag:  is memory allocation hardwalled
- - cpuset.memory_pressure: measure of how much paging pressure in cpuset
- - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
- - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
- - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
- - cpuset.sched_relax_domain_level: the searching range when migrating tasks
-
-In addition, only the root cpuset has the following file:
- - cpuset.memory_pressure_enabled flag: compute memory_pressure?
-
-New cpusets are created using the mkdir system call or shell
-command.  The properties of a cpuset, such as its flags, allowed
-CPUs and Memory Nodes, and attached tasks, are modified by writing
-to the appropriate file in that cpusets directory, as listed above.
-
-The named hierarchical structure of nested cpusets allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cpuset allows organizing the work load
-on a system into related sets of tasks such that each set is constrained
-to using the CPUs and Memory Nodes of a particular cpuset.  A task
-may be re-attached to any other cpuset, if allowed by the permissions
-on the necessary cpuset file system directories.
-
-Such management of a system "in the large" integrates smoothly with
-the detailed placement done on individual tasks and memory regions
-using the sched_setaffinity, mbind and set_mempolicy system calls.
-
-The following rules apply to each cpuset:
-
- - Its CPUs and Memory Nodes must be a subset of its parents.
- - It can't be marked exclusive unless its parent is.
- - If its cpu or memory is exclusive, they may not overlap any sibling.
-
-These rules, and the natural hierarchy of cpusets, enable efficient
-enforcement of the exclusive guarantee, without having to scan all
-cpusets every time any of them change to ensure nothing overlaps a
-exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
-to represent the cpuset hierarchy provides for a familiar permission
-and name space for cpusets, with a minimum of additional kernel code.
-
-The cpus and mems files in the root (top_cpuset) cpuset are
-read-only.  The cpus file automatically tracks the value of
-cpu_online_mask using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_states[N_MEMORY]--i.e.,
-nodes with memory--using the cpuset_track_online_nodes() hook.
-
-
-1.4 What are exclusive cpusets ?
---------------------------------
-
-If a cpuset is cpu or mem exclusive, no other cpuset, other than
-a direct ancestor or descendant, may share any of the same CPUs or
-Memory Nodes.
-
-A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
-i.e. it restricts kernel allocations for page, buffer and other data
-commonly shared by the kernel across multiple users.  All cpusets,
-whether hardwalled or not, restrict allocations of memory for user
-space.  This enables configuring a system so that several independent
-jobs can share common kernel data, such as file system pages, while
-isolating each job's user allocation in its own cpuset.  To do this,
-construct a large mem_exclusive cpuset to hold all the jobs, and
-construct child, non-mem_exclusive cpusets for each individual job.
-Only a small amount of typical kernel memory, such as requests from
-interrupt handlers, is allowed to be taken outside even a
-mem_exclusive cpuset.
-
-
-1.5 What is memory_pressure ?
------------------------------
-The memory_pressure of a cpuset provides a simple per-cpuset metric
-of the rate that the tasks in a cpuset are attempting to free up in
-use memory on the nodes of the cpuset to satisfy additional memory
-requests.
-
-This enables batch managers monitoring jobs running in dedicated
-cpusets to efficiently detect what level of memory pressure that job
-is causing.
-
-This is useful both on tightly managed systems running a wide mix of
-submitted jobs, which may choose to terminate or re-prioritize jobs that
-are trying to use more memory than allowed on the nodes assigned to them,
-and with tightly coupled, long running, massively parallel scientific
-computing jobs that will dramatically fail to meet required performance
-goals if they start to use more memory than allowed to them.
-
-This mechanism provides a very economical way for the batch manager
-to monitor a cpuset for signs of memory pressure.  It's up to the
-batch manager or other user code to decide what to do about it and
-take action.
-
-==> Unless this feature is enabled by writing "1" to the special file
-    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
-    code of __alloc_pages() for this metric reduces to simply noticing
-    that the cpuset_memory_pressure_enabled flag is zero.  So only
-    systems that enable this feature will compute the metric.
-
-Why a per-cpuset, running average:
-
-    Because this meter is per-cpuset, rather than per-task or mm,
-    the system load imposed by a batch scheduler monitoring this
-    metric is sharply reduced on large systems, because a scan of
-    the tasklist can be avoided on each set of queries.
-
-    Because this meter is a running average, instead of an accumulating
-    counter, a batch scheduler can detect memory pressure with a
-    single read, instead of having to read and accumulate results
-    for a period of time.
-
-    Because this meter is per-cpuset rather than per-task or mm,
-    the batch scheduler can obtain the key information, memory
-    pressure in a cpuset, with a single read, rather than having to
-    query and accumulate results over all the (dynamically changing)
-    set of tasks in the cpuset.
-
-A per-cpuset simple digital filter (requires a spinlock and 3 words
-of data per-cpuset) is kept, and updated by any task attached to that
-cpuset, if it enters the synchronous (direct) page reclaim code.
-
-A per-cpuset file provides an integer number representing the recent
-(half-life of 10 seconds) rate of direct page reclaims caused by
-the tasks in the cpuset, in units of reclaims attempted per second,
-times 1000.
-
-
-1.6 What is memory spread ?
----------------------------
-There are two boolean flag files per cpuset that control where the
-kernel allocates pages for the file system buffers and related in
-kernel data structures.  They are called 'cpuset.memory_spread_page' and
-'cpuset.memory_spread_slab'.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
-the kernel will spread the file system buffers (page cache) evenly
-over all the nodes that the faulting task is allowed to use, instead
-of preferring to put those pages on the node where the task is running.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
-then the kernel will spread some file system related slab caches,
-such as for inodes and dentries evenly over all the nodes that the
-faulting task is allowed to use, instead of preferring to put those
-pages on the node where the task is running.
-
-The setting of these flags does not affect anonymous data segment or
-stack segment pages of a task.
-
-By default, both kinds of memory spreading are off, and memory
-pages are allocated on the node local to where the task is running,
-except perhaps as modified by the task's NUMA mempolicy or cpuset
-configuration, so long as sufficient free memory pages are available.
-
-When new cpusets are created, they inherit the memory spread settings
-of their parent.
-
-Setting memory spreading causes allocations for the affected page
-or slab caches to ignore the task's NUMA mempolicy and be spread
-instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
-mempolicies will not notice any change in these calls as a result of
-their containing task's memory spread settings.  If memory spreading
-is turned off, then the currently specified NUMA mempolicy once again
-applies to memory page allocations.
-
-Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
-files.  By default they contain "0", meaning that the feature is off
-for that cpuset.  If a "1" is written to that file, then that turns
-the named feature on.
-
-The implementation is simple.
-
-Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
-PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently
-joins that cpuset.  The page allocation calls for the page cache
-is modified to perform an inline check for this PFA_SPREAD_PAGE task
-flag, and if set, a call to a new routine cpuset_mem_spread_node()
-returns the node to prefer for the allocation.
-
-Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
-PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
-pages from the node returned by cpuset_mem_spread_node().
-
-The cpuset_mem_spread_node() routine is also simple.  It uses the
-value of a per-task rotor cpuset_mem_spread_rotor to select the next
-node in the current task's mems_allowed to prefer for the allocation.
-
-This memory placement policy is also known (in other contexts) as
-round-robin or interleave.
-
-This policy can provide substantial improvements for jobs that need
-to place thread local data on the corresponding node, but that need
-to access large file system data sets that need to be spread across
-the several nodes in the jobs cpuset in order to fit.  Without this
-policy, especially for jobs that might have one thread reading in the
-data set, the memory allocation across the nodes in the jobs cpuset
-can become very uneven.
-
-1.7 What is sched_load_balance ?
---------------------------------
-
-The kernel scheduler (kernel/sched/core.c) automatically load balances
-tasks.  If one CPU is underutilized, kernel code running on that
-CPU will look for tasks on other more overloaded CPUs and move those
-tasks to itself, within the constraints of such placement mechanisms
-as cpusets and sched_setaffinity.
-
-The algorithmic cost of load balancing and its impact on key shared
-kernel data structures such as the task list increases more than
-linearly with the number of CPUs being balanced.  So the scheduler
-has support to partition the systems CPUs into a number of sched
-domains such that it only load balances within each sched domain.
-Each sched domain covers some subset of the CPUs in the system;
-no two sched domains overlap; some CPUs might not be in any sched
-domain and hence won't be load balanced.
-
-Put simply, it costs less to balance between two smaller sched domains
-than one big one, but doing so means that overloads in one of the
-two domains won't be load balanced to the other one.
-
-By default, there is one sched domain covering all CPUs, including those
-marked isolated using the kernel boot time "isolcpus=" argument. However,
-the isolated CPUs will not participate in load balancing, and will not
-have tasks running on them unless explicitly assigned.
-
-This default load balancing across all CPUs is not well suited for
-the following two situations:
- 1) On large systems, load balancing across many CPUs is expensive.
-    If the system is managed using cpusets to place independent jobs
-    on separate sets of CPUs, full load balancing is unnecessary.
- 2) Systems supporting realtime on some CPUs need to minimize
-    system overhead on those CPUs, including avoiding task load
-    balancing if that is not needed.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
-setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
-be contained in a single sched domain, ensuring that load balancing
-can move a task (not otherwised pinned, as by sched_setaffinity)
-from any CPU in that cpuset to any other.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
-scheduler will avoid load balancing across the CPUs in that cpuset,
---except-- in so far as is necessary because some overlapping cpuset
-has "sched_load_balance" enabled.
-
-So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
-enabled, then the scheduler will have one sched domain covering all
-CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
-cpusets won't matter, as we're already fully load balancing.
-
-Therefore in the above two situations, the top cpuset flag
-"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
-child cpusets have this flag enabled.
-
-When doing this, you don't usually want to leave any unpinned tasks in
-the top cpuset that might use non-trivial amounts of CPU, as such tasks
-may be artificially constrained to some subset of CPUs, depending on
-the particulars of this flag setting in descendant cpusets.  Even if
-such a task could use spare CPU cycles in some other CPUs, the kernel
-scheduler might not consider the possibility of load balancing that
-task to that underused CPU.
-
-Of course, tasks pinned to a particular CPU can be left in a cpuset
-that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
-else anyway.
-
-There is an impedance mismatch here, between cpusets and sched domains.
-Cpusets are hierarchical and nest.  Sched domains are flat; they don't
-overlap and each CPU is in at most one sched domain.
-
-It is necessary for sched domains to be flat because load balancing
-across partially overlapping sets of CPUs would risk unstable dynamics
-that would be beyond our understanding.  So if each of two partially
-overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
-form a single sched domain that is a superset of both.  We won't move
-a task to a CPU outside its cpuset, but the scheduler load balancing
-code might waste some compute cycles considering that possibility.
-
-This mismatch is why there is not a simple one-to-one relation
-between which cpusets have the flag "cpuset.sched_load_balance" enabled,
-and the sched domain configuration.  If a cpuset enables the flag, it
-will get balancing across all its CPUs, but if it disables the flag,
-it will only be assured of no load balancing if no other overlapping
-cpuset enables the flag.
-
-If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
-one of them has this flag enabled, then the other may find its
-tasks only partially load balanced, just on the overlapping CPUs.
-This is just the general case of the top_cpuset example given a few
-paragraphs above.  In the general case, as in the top cpuset case,
-don't leave tasks that might use non-trivial amounts of CPU in
-such partially load balanced cpusets, as they may be artificially
-constrained to some subset of the CPUs allowed to them, for lack of
-load balancing to the other CPUs.
-
-CPUs in "cpuset.isolcpus" were excluded from load balancing by the
-isolcpus= kernel boot option, and will never be load balanced regardless
-of the value of "cpuset.sched_load_balance" in any cpuset.
-
-1.7.1 sched_load_balance implementation details.
-------------------------------------------------
-
-The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
-to most cpuset flags.)  When enabled for a cpuset, the kernel will
-ensure that it can load balance across all the CPUs in that cpuset
-(makes sure that all the CPUs in the cpus_allowed of that cpuset are
-in the same sched domain.)
-
-If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
-then they will be (must be) both in the same sched domain.
-
-If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
-then by the above that means there is a single sched domain covering
-the whole system, regardless of any other cpuset settings.
-
-The kernel commits to user space that it will avoid load balancing
-where it can.  It will pick as fine a granularity partition of sched
-domains as it can while still providing load balancing for any set
-of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
-
-The internal kernel cpuset to scheduler interface passes from the
-cpuset code to the scheduler code a partition of the load balanced
-CPUs in the system. This partition is a set of subsets (represented
-as an array of struct cpumask) of CPUs, pairwise disjoint, that cover
-all the CPUs that must be load balanced.
-
-The cpuset code builds a new such partition and passes it to the
-scheduler sched domain setup code, to have the sched domains rebuilt
-as necessary, whenever:
- - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
- - or CPUs come or go from a cpuset with this flag enabled,
- - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
-   and with this flag enabled changes,
- - or a cpuset with non-empty CPUs and with this flag enabled is removed,
- - or a cpu is offlined/onlined.
-
-This partition exactly defines what sched domains the scheduler should
-setup - one sched domain for each element (struct cpumask) in the
-partition.
-
-The scheduler remembers the currently active sched domain partitions.
-When the scheduler routine partition_sched_domains() is invoked from
-the cpuset code to update these sched domains, it compares the new
-partition requested with the current, and updates its sched domains,
-removing the old and adding the new, for each change.
-
-
-1.8 What is sched_relax_domain_level ?
---------------------------------------
-
-In sched domain, the scheduler migrates tasks in 2 ways; periodic load
-balance on tick, and at time of some schedule events.
-
-When a task is woken up, scheduler try to move the task on idle CPU.
-For example, if a task A running on CPU X activates another task B
-on the same CPU X, and if CPU Y is X's sibling and performing idle,
-then scheduler migrate task B to CPU Y so that task B can start on
-CPU Y without waiting task A on CPU X.
-
-And if a CPU run out of tasks in its runqueue, the CPU try to pull
-extra tasks from other busy CPUs to help them before it is going to
-be idle.
-
-Of course it takes some searching cost to find movable tasks and/or
-idle CPUs, the scheduler might not search all CPUs in the domain
-every time.  In fact, in some architectures, the searching ranges on
-events are limited in the same socket or node where the CPU locates,
-while the load balance on tick searches all.
-
-For example, assume CPU Z is relatively far from CPU X.  Even if CPU Z
-is idle while CPU X and the siblings are busy, scheduler can't migrate
-woken task B from X to Z since it is out of its searching range.
-As the result, task B on CPU X need to wait task A or wait load balance
-on the next tick.  For some applications in special situation, waiting
-1 tick may be too long.
-
-The 'cpuset.sched_relax_domain_level' file allows you to request changing
-this searching range as you like.  This file takes int value which
-indicates size of searching range in levels ideally as follows,
-otherwise initial value -1 that indicates the cpuset has no request.
-
-  -1  : no request. use system default or follow request of others.
-   0  : no search.
-   1  : search siblings (hyperthreads in a core).
-   2  : search cores in a package.
-   3  : search cpus in a node [= system wide on non-NUMA system]
-   4  : search nodes in a chunk of node [on NUMA system]
-   5  : search system wide [on NUMA system]
-
-The system default is architecture dependent.  The system default
-can be changed using the relax_domain_level= boot parameter.
-
-This file is per-cpuset and affect the sched domain where the cpuset
-belongs to.  Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
-is disabled, then 'cpuset.sched_relax_domain_level' have no effect since
-there is no sched domain belonging the cpuset.
-
-If multiple cpusets are overlapping and hence they form a single sched
-domain, the largest value among those is used.  Be careful, if one
-requests 0 and others are -1 then 0 is used.
-
-Note that modifying this file will have both good and bad effects,
-and whether it is acceptable or not depends on your situation.
-Don't modify this file if you are not sure.
-
-If your situation is:
- - The migration costs between each cpu can be assumed considerably
-   small(for you) due to your special application's behavior or
-   special hardware support for CPU cache etc.
- - The searching cost doesn't have impact(for you) or you can make
-   the searching cost enough small by managing cpuset to compact etc.
- - The latency is required even it sacrifices cache hit rate etc.
-then increasing 'sched_relax_domain_level' would benefit you.
-
-
-1.9 How do I use cpusets ?
---------------------------
-
-In order to minimize the impact of cpusets on critical kernel
-code, such as the scheduler, and due to the fact that the kernel
-does not support one task updating the memory placement of another
-task directly, the impact on a task of changing its cpuset CPU
-or Memory Node placement, or of changing to which cpuset a task
-is attached, is subtle.
-
-If a cpuset has its Memory Nodes modified, then for each task attached
-to that cpuset, the next time that the kernel attempts to allocate
-a page of memory for that task, the kernel will notice the change
-in the task's cpuset, and update its per-task memory placement to
-remain within the new cpusets memory placement.  If the task was using
-mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
-its new cpuset, then the task will continue to use whatever subset
-of MPOL_BIND nodes are still allowed in the new cpuset.  If the task
-was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
-in the new cpuset, then the task will be essentially treated as if it
-was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
-as queried by get_mempolicy(), doesn't change).  If a task is moved
-from one cpuset to another, then the kernel will adjust the task's
-memory placement, as above, the next time that the kernel attempts
-to allocate a page of memory for that task.
-
-If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
-will have its allowed CPU placement changed immediately.  Similarly,
-if a task's pid is written to another cpuset's 'tasks' file, then its
-allowed CPU placement is changed immediately.  If such a task had been
-bound to some subset of its cpuset using the sched_setaffinity() call,
-the task will be allowed to run on any CPU allowed in its new cpuset,
-negating the effect of the prior sched_setaffinity() call.
-
-In summary, the memory placement of a task whose cpuset is changed is
-updated by the kernel, on the next allocation of a page for that task,
-and the processor placement is updated immediately.
-
-Normally, once a page is allocated (given a physical page
-of main memory) then that page stays on whatever node it
-was allocated, so long as it remains allocated, even if the
-cpusets memory placement policy 'cpuset.mems' subsequently changes.
-If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
-tasks are attached to that cpuset, any pages that task had
-allocated to it on nodes in its previous cpuset are migrated
-to the task's new cpuset. The relative placement of the page within
-the cpuset is preserved during these migration operations if possible.
-For example if the page was on the second valid node of the prior cpuset
-then the page will be placed on the second valid node of the new cpuset.
-
-Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
-'cpuset.mems' file is modified, pages allocated to tasks in that
-cpuset, that were on nodes in the previous setting of 'cpuset.mems',
-will be moved to nodes in the new setting of 'mems.'
-Pages that were not in the task's prior cpuset, or in the cpuset's
-prior 'cpuset.mems' setting, will not be moved.
-
-There is an exception to the above.  If hotplug functionality is used
-to remove all the CPUs that are currently assigned to a cpuset,
-then all the tasks in that cpuset will be moved to the nearest ancestor
-with non-empty cpus.  But the moving of some (or all) tasks might fail if
-cpuset is bound with another cgroup subsystem which has some restrictions
-on task attaching.  In this failing case, those tasks will stay
-in the original cpuset, and the kernel will automatically update
-their cpus_allowed to allow all online CPUs.  When memory hotplug
-functionality for removing Memory Nodes is available, a similar exception
-is expected to apply there as well.  In general, the kernel prefers to
-violate cpuset placement, over starving a task that has had all
-its allowed CPUs or Memory Nodes taken offline.
-
-There is a second exception to the above.  GFP_ATOMIC requests are
-kernel internal allocations that must be satisfied, immediately.
-The kernel may drop some request, in rare cases even panic, if a
-GFP_ATOMIC alloc fails.  If the request cannot be satisfied within
-the current task's cpuset, then we relax the cpuset, and look for
-memory anywhere we can find it.  It's better to violate the cpuset
-than stress the kernel.
-
-To start a new job that is to be contained within a cpuset, the steps are:
-
- 1) mkdir /sys/fs/cgroup/cpuset
- 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
-    the /sys/fs/cgroup/cpuset virtual file system.
- 4) Start a task that will be the "founding father" of the new job.
- 5) Attach that task to the new cpuset by writing its pid to the
-    /sys/fs/cgroup/cpuset tasks file for that cpuset.
- 6) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will setup a cpuset
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cpuset:
-
-  mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
-  cd /sys/fs/cgroup/cpuset
-  mkdir Charlie
-  cd Charlie
-  /bin/echo 2-3 > cpuset.cpus
-  /bin/echo 1 > cpuset.mems
-  /bin/echo $$ > tasks
-  sh
-  # The subshell 'sh' is now running in cpuset Charlie
-  # The next line should display '/Charlie'
-  cat /proc/self/cpuset
-
-There are ways to query or modify cpusets:
- - via the cpuset file system directly, using the various cd, mkdir, echo,
-   cat, rmdir commands from the shell, or their equivalent from C.
- - via the C library libcpuset.
- - via the C library libcgroup.
-   (http://sourceforge.net/projects/libcg/)
- - via the python application cset.
-   (http://code.google.com/p/cpuset/)
-
-The sched_setaffinity calls can also be done at the shell prompt using
-SGI's runon or Robert Love's taskset.  The mbind and set_mempolicy
-calls can be done at the shell prompt using the numactl command
-(part of Andi Kleen's numa package).
-
-2. Usage Examples and Syntax
-============================
-
-2.1 Basic Usage
----------------
-
-Creating, modifying, using the cpusets can be done through the cpuset
-virtual filesystem.
-
-To mount it, type:
-# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
-
-Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
-tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
-is the cpuset that holds the whole system.
-
-If you want to create a new cpuset under /sys/fs/cgroup/cpuset:
-# cd /sys/fs/cgroup/cpuset
-# mkdir my_cpuset
-
-Now you want to do something with this cpuset.
-# cd my_cpuset
-
-In this directory you can find several files:
-# ls
-cgroup.clone_children  cpuset.memory_pressure
-cgroup.event_control   cpuset.memory_spread_page
-cgroup.procs           cpuset.memory_spread_slab
-cpuset.cpu_exclusive   cpuset.mems
-cpuset.cpus            cpuset.sched_load_balance
-cpuset.mem_exclusive   cpuset.sched_relax_domain_level
-cpuset.mem_hardwall    notify_on_release
-cpuset.memory_migrate  tasks
-
-Reading them will give you information about the state of this cpuset:
-the CPUs and Memory Nodes it can use, the processes that are using
-it, its properties.  By writing to these files you can manipulate
-the cpuset.
-
-Set some flags:
-# /bin/echo 1 > cpuset.cpu_exclusive
-
-Add some cpus:
-# /bin/echo 0-7 > cpuset.cpus
-
-Add some mems:
-# /bin/echo 0-7 > cpuset.mems
-
-Now attach your shell to this cpuset:
-# /bin/echo $$ > tasks
-
-You can also create cpusets inside your cpuset by using mkdir in this
-directory.
-# mkdir my_sub_cs
-
-To remove a cpuset, just use rmdir:
-# rmdir my_sub_cs
-This will fail if the cpuset is in use (has cpusets inside, or has
-processes attached).
-
-Note that for legacy reasons, the "cpuset" filesystem exists as a
-wrapper around the cgroup filesystem.
-
-The command
-
-mount -t cpuset X /sys/fs/cgroup/cpuset
-
-is equivalent to
-
-mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
-echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
-
-2.2 Adding/removing cpus
-------------------------
-
-This is the syntax to use when writing in the cpus or mems files
-in cpuset directories:
-
-# /bin/echo 1-4 > cpuset.cpus		-> set cpus list to cpus 1,2,3,4
-# /bin/echo 1,2,3,4 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4
-
-To add a CPU to a cpuset, write the new list of CPUs including the
-CPU to be added. To add 6 to the above cpuset:
-
-# /bin/echo 1-4,6 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4,6
-
-Similarly to remove a CPU from a cpuset, write the new list of CPUs
-without the CPU to be removed.
-
-To remove all the CPUs:
-
-# /bin/echo "" > cpuset.cpus		-> clear cpus list
-
-2.3 Setting flags
------------------
-
-The syntax is very simple:
-
-# /bin/echo 1 > cpuset.cpu_exclusive 	-> set flag 'cpuset.cpu_exclusive'
-# /bin/echo 0 > cpuset.cpu_exclusive 	-> unset flag 'cpuset.cpu_exclusive'
-
-2.4 Attaching processes
------------------------
-
-# /bin/echo PID > tasks
-
-Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another:
-
-# /bin/echo PID1 > tasks
-# /bin/echo PID2 > tasks
-	...
-# /bin/echo PIDn > tasks
-
-
-3. Questions
-============
-
-Q: what's up with this '/bin/echo' ?
-A: bash's builtin 'echo' command does not check calls to write() against
-   errors. If you use it in the cpuset file system, you won't be
-   able to tell whether a command succeeded or failed.
-
-Q: When I attach processes, only the first of the line gets really attached !
-A: We can only return one error code per call to write(). So you should also
-   put only ONE pid.
-
-4. Contact
-==========
-
-Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/cgroup-v1/devices.txt b/Documentation/cgroup-v1/devices.txt
deleted file mode 100644
index 3c1095ca02ea..000000000000
--- a/Documentation/cgroup-v1/devices.txt
+++ /dev/null
@@ -1,116 +0,0 @@
-Device Whitelist Controller
-
-1. Description:
-
-Implement a cgroup to track and enforce open and mknod restrictions
-on device files.  A device cgroup associates a device access
-whitelist with each cgroup.  A whitelist entry has 4 fields.
-'type' is a (all), c (char), or b (block).  'all' means it applies
-to all types and all major and minor numbers.  Major and minor are
-either an integer or * for all.  Access is a composition of r
-(read), w (write), and m (mknod).
-
-The root device cgroup starts with rwm to 'all'.  A child device
-cgroup gets a copy of the parent.  Administrators can then remove
-devices from the whitelist or add new entries.  A child cgroup can
-never receive a device access which is denied by its parent.
-
-2. User Interface
-
-An entry is added using devices.allow, and removed using
-devices.deny.  For instance
-
-	echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
-
-allows cgroup 1 to read and mknod the device usually known as
-/dev/null.  Doing
-
-	echo a > /sys/fs/cgroup/1/devices.deny
-
-will remove the default 'a *:* rwm' entry. Doing
-
-	echo a > /sys/fs/cgroup/1/devices.allow
-
-will add the 'a *:* rwm' entry to the whitelist.
-
-3. Security
-
-Any task can move itself between cgroups.  This clearly won't
-suffice, but we can decide the best way to adequately restrict
-movement as people get some experience with this.  We may just want
-to require CAP_SYS_ADMIN, which at least is a separate bit from
-CAP_MKNOD.  We may want to just refuse moving to a cgroup which
-isn't a descendant of the current one.  Or we may want to use
-CAP_MAC_ADMIN, since we really are trying to lock down root.
-
-CAP_SYS_ADMIN is needed to modify the whitelist or move another
-task to a new cgroup.  (Again we'll probably want to change that).
-
-A cgroup may not be granted more permissions than the cgroup's
-parent has.
-
-4. Hierarchy
-
-device cgroups maintain hierarchy by making sure a cgroup never has more
-access permissions than its parent.  Every time an entry is written to
-a cgroup's devices.deny file, all its children will have that entry removed
-from their whitelist and all the locally set whitelist entries will be
-re-evaluated.  In case one of the locally set whitelist entries would provide
-more access than the cgroup's parent, it'll be removed from the whitelist.
-
-Example:
-      A
-     / \
-        B
-
-    group        behavior	exceptions
-    A            allow		"b 8:* rwm", "c 116:1 rw"
-    B            deny		"c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
-
-If a device is denied in group A:
-	# echo "c 116:* r" > A/devices.deny
-it'll propagate down and after revalidating B's entries, the whitelist entry
-"c 116:2 rwm" will be removed:
-
-    group        whitelist entries                        denied devices
-    A            all                                      "b 8:* rwm", "c 116:* rw"
-    B            "c 1:3 rwm", "b 3:* rwm"                 all the rest
-
-In case parent's exceptions change and local exceptions are not allowed
-anymore, they'll be deleted.
-
-Notice that new whitelist entries will not be propagated:
-      A
-     / \
-        B
-
-    group        whitelist entries                        denied devices
-    A            "c 1:3 rwm", "c 1:5 r"                   all the rest
-    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
-
-when adding "c *:3 rwm":
-	# echo "c *:3 rwm" >A/devices.allow
-
-the result:
-    group        whitelist entries                        denied devices
-    A            "c *:3 rwm", "c 1:5 r"                   all the rest
-    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
-
-but now it'll be possible to add new entries to B:
-	# echo "c 2:3 rwm" >B/devices.allow
-	# echo "c 50:3 r" >B/devices.allow
-or even
-	# echo "c *:3 rwm" >B/devices.allow
-
-Allowing or denying all by writing 'a' to devices.allow or devices.deny will
-not be possible once the device cgroups has children.
-
-4.1 Hierarchy (internal implementation)
-
-device cgroups is implemented internally using a behavior (ALLOW, DENY) and a
-list of exceptions.  The internal state is controlled using the same user
-interface to preserve compatibility with the previous whitelist-only
-implementation.  Removal or addition of exceptions that will reduce the access
-to devices will be propagated down the hierarchy.
-For every propagated exception, the effective rules will be re-evaluated based
-on current parent's access rules.
diff --git a/Documentation/cgroup-v1/freezer-subsystem.txt b/Documentation/cgroup-v1/freezer-subsystem.txt
deleted file mode 100644
index e831cb2b8394..000000000000
--- a/Documentation/cgroup-v1/freezer-subsystem.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-The cgroup freezer is useful to batch job management system which start
-and stop sets of tasks in order to schedule the resources of a machine
-according to the desires of a system administrator. This sort of program
-is often used on HPC clusters to schedule access to the cluster as a
-whole. The cgroup freezer uses cgroups to describe the set of tasks to
-be started/stopped by the batch job management system. It also provides
-a means to start and stop the tasks composing the job.
-
-The cgroup freezer will also be useful for checkpointing running groups
-of tasks. The freezer allows the checkpoint code to obtain a consistent
-image of the tasks by attempting to force the tasks in a cgroup into a
-quiescent state. Once the tasks are quiescent another task can
-walk /proc or invoke a kernel interface to gather information about the
-quiesced tasks. Checkpointed tasks can be restarted later should a
-recoverable error occur. This also allows the checkpointed tasks to be
-migrated between nodes in a cluster by copying the gathered information
-to another node and restarting the tasks there.
-
-Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
-and resuming tasks in userspace. Both of these signals are observable
-from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
-blocked, or ignored it can be seen by waiting or ptracing parent tasks.
-SIGCONT is especially unsuitable since it can be caught by the task. Any
-programs designed to watch for SIGSTOP and SIGCONT could be broken by
-attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
-demonstrate this problem using nested bash shells:
-
-	$ echo $$
-	16644
-	$ bash
-	$ echo $$
-	16690
-
-	From a second, unrelated bash shell:
-	$ kill -SIGSTOP 16690
-	$ kill -SIGCONT 16690
-
-	<at this point 16690 exits and causes 16644 to exit too>
-
-This happens because bash can observe both signals and choose how it
-responds to them.
-
-Another example of a program which catches and responds to these
-signals is gdb. In fact any program designed to use ptrace is likely to
-have a problem with this method of stopping and resuming tasks.
-
-In contrast, the cgroup freezer uses the kernel freezer code to
-prevent the freeze/unfreeze cycle from becoming visible to the tasks
-being frozen. This allows the bash example above and gdb to run as
-expected.
-
-The cgroup freezer is hierarchical. Freezing a cgroup freezes all
-tasks belonging to the cgroup and all its descendant cgroups. Each
-cgroup has its own state (self-state) and the state inherited from the
-parent (parent-state). Iff both states are THAWED, the cgroup is
-THAWED.
-
-The following cgroupfs files are created by cgroup freezer.
-
-* freezer.state: Read-write.
-
-  When read, returns the effective state of the cgroup - "THAWED",
-  "FREEZING" or "FROZEN". This is the combined self and parent-states.
-  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
-
-  FREEZING cgroup transitions into FROZEN state when all tasks
-  belonging to the cgroup and its descendants become frozen. Note that
-  a cgroup reverts to FREEZING from FROZEN after a new task is added
-  to the cgroup or one of its descendant cgroups until the new task is
-  frozen.
-
-  When written, sets the self-state of the cgroup. Two values are
-  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
-  if not already freezing, enters FREEZING state along with all its
-  descendant cgroups.
-
-  If THAWED is written, the self-state of the cgroup is changed to
-  THAWED.  Note that the effective state may not change to THAWED if
-  the parent-state is still freezing. If a cgroup's effective state
-  becomes THAWED, all its descendants which are freezing because of
-  the cgroup also leave the freezing state.
-
-* freezer.self_freezing: Read only.
-
-  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
-  This value is 1 iff the last write to freezer.state was "FROZEN".
-
-* freezer.parent_freezing: Read only.
-
-  Shows the parent-state.  0 if none of the cgroup's ancestors is
-  frozen; otherwise, 1.
-
-The root cgroup is non-freezable and the above interface files don't
-exist.
-
-* Examples of usage :
-
-   # mkdir /sys/fs/cgroup/freezer
-   # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
-   # mkdir /sys/fs/cgroup/freezer/0
-   # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
-
-to get status of the freezer subsystem :
-
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   THAWED
-
-to freeze all tasks in the container :
-
-   # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   FREEZING
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   FROZEN
-
-to unfreeze all tasks in the container :
-
-   # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   THAWED
-
-This is the basic mechanism which should do the right thing for user space task
-in a simple scenario.
diff --git a/Documentation/cgroup-v1/hugetlb.txt b/Documentation/cgroup-v1/hugetlb.txt
deleted file mode 100644
index 106245c3aecc..000000000000
--- a/Documentation/cgroup-v1/hugetlb.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-HugeTLB Controller
--------------------
-
-The HugeTLB controller allows to limit the HugeTLB usage per control group and
-enforces the controller limit during page fault. Since HugeTLB doesn't
-support page reclaim, enforcing the limit at page fault time implies that,
-the application will get SIGBUS signal if it tries to access HugeTLB pages
-beyond its limit. This requires the application to know beforehand how much
-HugeTLB pages it would require for its use.
-
-HugeTLB controller can be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -o hugetlb none /sys/fs/cgroup
-
-With the above step, the initial or the parent HugeTLB group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
-the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-
-New groups can be created under the parent group /sys/fs/cgroup.
-
-# cd /sys/fs/cgroup
-# mkdir g1
-# echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it.
-
-Brief summary of control files
-
- hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
- hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb  usage recorded
- hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
- hugetlb.<hugepagesize>.failcnt		   # show the number of allocation failure due to HugeTLB limit
-
-For a system supporting two hugepage size (16M and 16G) the control
-files include:
-
-hugetlb.16GB.limit_in_bytes
-hugetlb.16GB.max_usage_in_bytes
-hugetlb.16GB.usage_in_bytes
-hugetlb.16GB.failcnt
-hugetlb.16MB.limit_in_bytes
-hugetlb.16MB.max_usage_in_bytes
-hugetlb.16MB.usage_in_bytes
-hugetlb.16MB.failcnt
diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt
deleted file mode 100644
index 621e29ffb358..000000000000
--- a/Documentation/cgroup-v1/memcg_test.txt
+++ /dev/null
@@ -1,280 +0,0 @@
-Memory Resource Controller(Memcg)  Implementation Memo.
-Last Updated: 2010/2
-Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
-
-Because VM is getting complex (one of reasons is memcg...), memcg's behavior
-is complex. This is a document for memcg's internal behavior.
-Please note that implementation details can be changed.
-
-(*) Topics on API should be in Documentation/cgroup-v1/memory.txt)
-
-0. How to record usage ?
-   2 objects are used.
-
-   page_cgroup ....an object per page.
-	Allocated at boot or memory hotplug. Freed at memory hot removal.
-
-   swap_cgroup ... an entry per swp_entry.
-	Allocated at swapon(). Freed at swapoff().
-
-   The page_cgroup has USED bit and double count against a page_cgroup never
-   occurs. swap_cgroup is used only when a charged page is swapped-out.
-
-1. Charge
-
-   a page/swp_entry may be charged (usage += PAGE_SIZE) at
-
-	mem_cgroup_try_charge()
-
-2. Uncharge
-  a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
-
-	mem_cgroup_uncharge()
-	  Called when a page's refcount goes down to 0.
-
-	mem_cgroup_uncharge_swap()
-	  Called when swp_entry's refcnt goes down to 0. A charge against swap
-	  disappears.
-
-3. charge-commit-cancel
-	Memcg pages are charged in two steps:
-		mem_cgroup_try_charge()
-		mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
-
-	At try_charge(), there are no flags to say "this page is charged".
-	at this point, usage += PAGE_SIZE.
-
-	At commit(), the page is associated with the memcg.
-
-	At cancel(), simply usage -= PAGE_SIZE.
-
-Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
-
-4. Anonymous
-	Anonymous page is newly allocated at
-		  - page fault into MAP_ANONYMOUS mapping.
-		  - Copy-On-Write.
-
-	4.1 Swap-in.
-	At swap-in, the page is taken from swap-cache. There are 2 cases.
-
-	(a) If the SwapCache is newly allocated and read, it has no charges.
-	(b) If the SwapCache has been mapped by processes, it has been
-	    charged already.
-
-	4.2 Swap-out.
-	At swap-out, typical state transition is below.
-
-	(a) add to swap cache. (marked as SwapCache)
-	    swp_entry's refcnt += 1.
-	(b) fully unmapped.
-	    swp_entry's refcnt += # of ptes.
-	(c) write back to swap.
-	(d) delete from swap cache. (remove from SwapCache)
-	    swp_entry's refcnt -= 1.
-
-
-	Finally, at task exit,
-	(e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
-
-5. Page Cache
-   	Page Cache is charged at
-	- add_to_page_cache_locked().
-
-	The logic is very clear. (About migration, see below)
-	Note: __remove_from_page_cache() is called by remove_from_page_cache()
-	and __remove_mapping().
-
-6. Shmem(tmpfs) Page Cache
-	The best way to understand shmem's page state transition is to read
-	mm/shmem.c.
-	But brief explanation of the behavior of memcg around shmem will be
-	helpful to understand the logic.
-
-	Shmem's page (just leaf page, not direct/indirect block) can be on
-		- radix-tree of shmem's inode.
-		- SwapCache.
-		- Both on radix-tree and SwapCache. This happens at swap-in
-		  and swap-out,
-
-	It's charged when...
-	- A new page is added to shmem's radix-tree.
-	- A swp page is read. (move a charge from swap_cgroup to page_cgroup)
-
-7. Page Migration
-
-	mem_cgroup_migrate()
-
-8. LRU
-        Each memcg has its own private LRU. Now, its handling is under global
-	VM's control (means that it's handled under global pgdat->lru_lock).
-	Almost all routines around memcg's LRU is called by global LRU's
-	list management functions under pgdat->lru_lock.
-
-	A special function is mem_cgroup_isolate_pages(). This scans
-	memcg's private LRU and call __isolate_lru_page() to extract a page
-	from LRU.
-	(By __isolate_lru_page(), the page is removed from both of global and
-	 private LRU.)
-
-
-9. Typical Tests.
-
- Tests for racy cases.
-
- 9.1 Small limit to memcg.
-	When you do test to do racy case, it's good test to set memcg's limit
-	to be very small rather than GB. Many races found in the test under
-	xKB or xxMB limits.
-	(Memory behavior under GB and Memory behavior under MB shows very
-	 different situation.)
-
- 9.2 Shmem
-	Historically, memcg's shmem handling was poor and we saw some amount
-	of troubles here. This is because shmem is page-cache but can be
-	SwapCache. Test with shmem/tmpfs is always good test.
-
- 9.3 Migration
-	For NUMA, migration is an another special case. To do easy test, cpuset
-	is useful. Following is a sample script to do migration.
-
-	mount -t cgroup -o cpuset none /opt/cpuset
-
-	mkdir /opt/cpuset/01
-	echo 1 > /opt/cpuset/01/cpuset.cpus
-	echo 0 > /opt/cpuset/01/cpuset.mems
-	echo 1 > /opt/cpuset/01/cpuset.memory_migrate
-	mkdir /opt/cpuset/02
-	echo 1 > /opt/cpuset/02/cpuset.cpus
-	echo 1 > /opt/cpuset/02/cpuset.mems
-	echo 1 > /opt/cpuset/02/cpuset.memory_migrate
-
-	In above set, when you moves a task from 01 to 02, page migration to
-	node 0 to node 1 will occur. Following is a script to migrate all
-	under cpuset.
-	--
-	move_task()
-	{
-	for pid in $1
-        do
-                /bin/echo $pid >$2/tasks 2>/dev/null
-		echo -n $pid
-		echo -n " "
-        done
-	echo END
-	}
-
-	G1_TASK=`cat ${G1}/tasks`
-	G2_TASK=`cat ${G2}/tasks`
-	move_task "${G1_TASK}" ${G2} &
-	--
- 9.4 Memory hotplug.
-	memory hotplug test is one of good test.
-	to offline memory, do following.
-	# echo offline > /sys/devices/system/memory/memoryXXX/state
-	(XXX is the place of memory)
-	This is an easy way to test page migration, too.
-
- 9.5 mkdir/rmdir
-	When using hierarchy, mkdir/rmdir test should be done.
-	Use tests like the following.
-
-	echo 1 >/opt/cgroup/01/memory/use_hierarchy
-	mkdir /opt/cgroup/01/child_a
-	mkdir /opt/cgroup/01/child_b
-
-	set limit to 01.
-	add limit to 01/child_b
-	run jobs under child_a and child_b
-
-	create/delete following groups at random while jobs are running.
-	/opt/cgroup/01/child_a/child_aa
-	/opt/cgroup/01/child_b/child_bb
-	/opt/cgroup/01/child_c
-
-	running new jobs in new group is also good.
-
- 9.6 Mount with other subsystems.
-	Mounting with other subsystems is a good test because there is a
-	race and lock dependency with other cgroup subsystems.
-
-	example)
-	# mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
-
-	and do task move, mkdir, rmdir etc...under this.
-
- 9.7 swapoff.
-	Besides management of swap is one of complicated parts of memcg,
-	call path of swap-in at swapoff is not same as usual swap-in path..
-	It's worth to be tested explicitly.
-
-	For example, test like following is good.
-	(Shell-A)
-	# mount -t cgroup none /cgroup -o memory
-	# mkdir /cgroup/test
-	# echo 40M > /cgroup/test/memory.limit_in_bytes
-	# echo 0 > /cgroup/test/tasks
-	Run malloc(100M) program under this. You'll see 60M of swaps.
-	(Shell-B)
-	# move all tasks in /cgroup/test to /cgroup
-	# /sbin/swapoff -a
-	# rmdir /cgroup/test
-	# kill malloc task.
-
-	Of course, tmpfs v.s. swapoff test should be tested, too.
-
- 9.8 OOM-Killer
-	Out-of-memory caused by memcg's limit will kill tasks under
-	the memcg. When hierarchy is used, a task under hierarchy
-	will be killed by the kernel.
-	In this case, panic_on_oom shouldn't be invoked and tasks
-	in other groups shouldn't be killed.
-
-	It's not difficult to cause OOM under memcg as following.
-	Case A) when you can swapoff
-	#swapoff -a
-	#echo 50M > /memory.limit_in_bytes
-	run 51M of malloc
-
-	Case B) when you use mem+swap limitation.
-	#echo 50M > memory.limit_in_bytes
-	#echo 50M > memory.memsw.limit_in_bytes
-	run 51M of malloc
-
- 9.9 Move charges at task migration
-	Charges associated with a task can be moved along with task migration.
-
-	(Shell-A)
-	#mkdir /cgroup/A
-	#echo $$ >/cgroup/A/tasks
-	run some programs which uses some amount of memory in /cgroup/A.
-
-	(Shell-B)
-	#mkdir /cgroup/B
-	#echo 1 >/cgroup/B/memory.move_charge_at_immigrate
-	#echo "pid of the program running in group A" >/cgroup/B/tasks
-
-	You can see charges have been moved by reading *.usage_in_bytes or
-	memory.stat of both A and B.
-	See 8.2 of Documentation/cgroup-v1/memory.txt to see what value should be
-	written to move_charge_at_immigrate.
-
- 9.10 Memory thresholds
-	Memory controller implements memory thresholds using cgroups notification
-	API. You can use tools/cgroup/cgroup_event_listener.c to test it.
-
-	(Shell-A) Create cgroup and run event listener
-	# mkdir /cgroup/A
-	# ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
-
-	(Shell-B) Add task to cgroup and try to allocate and free memory
-	# echo $$ >/cgroup/A/tasks
-	# a="$(dd if=/dev/zero bs=1M count=10)"
-	# a=
-
-	You will see message from cgroup_event_listener every time you cross
-	the thresholds.
-
-	Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
-
-	It's good idea to test root cgroup as well.
diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt
deleted file mode 100644
index a33cedf85427..000000000000
--- a/Documentation/cgroup-v1/memory.txt
+++ /dev/null
@@ -1,892 +0,0 @@
-Memory Resource Controller
-
-NOTE: This document is hopelessly outdated and it asks for a complete
-      rewrite. It still contains a useful information so we are keeping it
-      here but make sure to check the current code if you need a deeper
-      understanding.
-
-NOTE: The Memory Resource Controller has generically been referred to as the
-      memory controller in this document. Do not confuse memory controller
-      used here with the memory controller that is used in hardware.
-
-(For editors)
-In this document:
-      When we mention a cgroup (cgroupfs's directory) with memory controller,
-      we call it "memory cgroup". When you see git-log and source code, you'll
-      see patch's title and function names tend to use "memcg".
-      In this document, we avoid using it.
-
-Benefits and Purpose of the memory controller
-
-The memory controller isolates the memory behaviour of a group of tasks
-from the rest of the system. The article on LWN [12] mentions some probable
-uses of the memory controller. The memory controller can be used to
-
-a. Isolate an application or a group of applications
-   Memory-hungry applications can be isolated and limited to a smaller
-   amount of memory.
-b. Create a cgroup with a limited amount of memory; this can be used
-   as a good alternative to booting with mem=XXXX.
-c. Virtualization solutions can control the amount of memory they want
-   to assign to a virtual machine instance.
-d. A CD/DVD burner could control the amount of memory used by the
-   rest of the system to ensure that burning does not fail due to lack
-   of available memory.
-e. There are several other use cases; find one or use the controller just
-   for fun (to learn and hack on the VM subsystem).
-
-Current Status: linux-2.6.34-mmotm(development version of 2010/April)
-
-Features:
- - accounting anonymous pages, file caches, swap caches usage and limiting them.
- - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
- - optionally, memory+swap usage can be accounted and limited.
- - hierarchical accounting
- - soft limit
- - moving (recharging) account at moving a task is selectable.
- - usage threshold notifier
- - memory pressure notifier
- - oom-killer disable knob and oom-notifier
- - Root cgroup has no limit controls.
-
- Kernel memory support is a work in progress, and the current version provides
- basically functionality. (See Section 2.7)
-
-Brief summary of control files.
-
- tasks				 # attach a task(thread) and show list of threads
- cgroup.procs			 # show list of processes
- cgroup.event_control		 # an interface for event_fd()
- memory.usage_in_bytes		 # show current usage for memory
-				 (See 5.5 for details)
- memory.memsw.usage_in_bytes	 # show current usage for memory+Swap
-				 (See 5.5 for details)
- memory.limit_in_bytes		 # set/show limit of memory usage
- memory.memsw.limit_in_bytes	 # set/show limit of memory+Swap usage
- memory.failcnt			 # show the number of memory usage hits limits
- memory.memsw.failcnt		 # show the number of memory+Swap hits limits
- memory.max_usage_in_bytes	 # show max memory usage recorded
- memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded
- memory.soft_limit_in_bytes	 # set/show soft limit of memory usage
- memory.stat			 # show various statistics
- memory.use_hierarchy		 # set/show hierarchical account enabled
- memory.force_empty		 # trigger forced page reclaim
- memory.pressure_level		 # set memory pressure notifications
- memory.swappiness		 # set/show swappiness parameter of vmscan
-				 (See sysctl's vm.swappiness)
- memory.move_charge_at_immigrate # set/show controls of moving charges
- memory.oom_control		 # set/show oom controls.
- memory.numa_stat		 # show the number of memory usage per numa node
-
- memory.kmem.limit_in_bytes      # set/show hard limit for kernel memory
- memory.kmem.usage_in_bytes      # show current kernel memory allocation
- memory.kmem.failcnt             # show the number of kernel memory usage hits limits
- memory.kmem.max_usage_in_bytes  # show max kernel memory usage recorded
-
- memory.kmem.tcp.limit_in_bytes  # set/show hard limit for tcp buf memory
- memory.kmem.tcp.usage_in_bytes  # show current tcp buf memory allocation
- memory.kmem.tcp.failcnt            # show the number of tcp buf memory usage hits limits
- memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded
-
-1. History
-
-The memory controller has a long history. A request for comments for the memory
-controller was posted by Balbir Singh [1]. At the time the RFC was posted
-there were several implementations for memory control. The goal of the
-RFC was to build consensus and agreement for the minimal features required
-for memory control. The first RSS controller was posted by Balbir Singh[2]
-in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
-RSS controller. At OLS, at the resource management BoF, everyone suggested
-that we handle both page cache and RSS together. Another request was raised
-to allow user space handling of OOM. The current memory controller is
-at version 6; it combines both mapped (RSS) and unmapped Page
-Cache Control [11].
-
-2. Memory Control
-
-Memory is a unique resource in the sense that it is present in a limited
-amount. If a task requires a lot of CPU processing, the task can spread
-its processing over a period of hours, days, months or years, but with
-memory, the same physical memory needs to be reused to accomplish the task.
-
-The memory controller implementation has been divided into phases. These
-are:
-
-1. Memory controller
-2. mlock(2) controller
-3. Kernel user memory accounting and slab control
-4. user mappings length controller
-
-The memory controller is the first controller developed.
-
-2.1. Design
-
-The core of the design is a counter called the page_counter. The
-page_counter tracks the current memory usage and limit of the group of
-processes associated with the controller. Each cgroup has a memory controller
-specific data structure (mem_cgroup) associated with it.
-
-2.2. Accounting
-
-		+--------------------+
-		|  mem_cgroup        |
-		|  (page_counter)    |
-		+--------------------+
-		 /            ^      \
-		/             |       \
-           +---------------+  |        +---------------+
-           | mm_struct     |  |....    | mm_struct     |
-           |               |  |        |               |
-           +---------------+  |        +---------------+
-                              |
-                              + --------------+
-                                              |
-           +---------------+           +------+--------+
-           | page          +---------->  page_cgroup|
-           |               |           |               |
-           +---------------+           +---------------+
-
-             (Figure 1: Hierarchy of Accounting)
-
-
-Figure 1 shows the important aspects of the controller
-
-1. Accounting happens per cgroup
-2. Each mm_struct knows about which cgroup it belongs to
-3. Each page has a pointer to the page_cgroup, which in turn knows the
-   cgroup it belongs to
-
-The accounting is done as follows: mem_cgroup_charge_common() is invoked to
-set up the necessary data structures and check if the cgroup that is being
-charged is over its limit. If it is, then reclaim is invoked on the cgroup.
-More details can be found in the reclaim section of this document.
-If everything goes well, a page meta-data-structure called page_cgroup is
-updated. page_cgroup has its own LRU on cgroup.
-(*) page_cgroup structure is allocated at boot/memory-hotplug time.
-
-2.2.1 Accounting details
-
-All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
-Some pages which are never reclaimable and will not be on the LRU
-are not accounted. We just account pages under usual VM management.
-
-RSS pages are accounted at page_fault unless they've already been accounted
-for earlier. A file page will be accounted for as Page Cache when it's
-inserted into inode (radix-tree). While it's mapped into the page tables of
-processes, duplicate accounting is carefully avoided.
-
-An RSS page is unaccounted when it's fully unmapped. A PageCache page is
-unaccounted when it's removed from radix-tree. Even if RSS pages are fully
-unmapped (by kswapd), they may exist as SwapCache in the system until they
-are really freed. Such SwapCaches are also accounted.
-A swapped-in page is not accounted until it's mapped.
-
-Note: The kernel does swapin-readahead and reads multiple swaps at once.
-This means swapped-in pages may contain pages for other tasks than a task
-causing page fault. So, we avoid accounting at swap-in I/O.
-
-At page migration, accounting information is kept.
-
-Note: we just account pages-on-LRU because our purpose is to control amount
-of used pages; not-on-LRU pages tend to be out-of-control from VM view.
-
-2.3 Shared Page Accounting
-
-Shared pages are accounted on the basis of the first touch approach. The
-cgroup that first touches a page is accounted for the page. The principle
-behind this approach is that a cgroup that aggressively uses a shared
-page will eventually get charged for it (once it is uncharged from
-the cgroup that brought it in -- this will happen on memory pressure).
-
-But see section 8.2: when moving a task to another cgroup, its pages may
-be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
-
-Exception: If CONFIG_MEMCG_SWAP is not used.
-When you do swapoff and make swapped-out pages of shmem(tmpfs) to
-be backed into memory in force, charges for pages are accounted against the
-caller of swapoff rather than the users of shmem.
-
-2.4 Swap Extension (CONFIG_MEMCG_SWAP)
-
-Swap Extension allows you to record charge for swap. A swapped-in page is
-charged back to original page allocator if possible.
-
-When swap is accounted, following files are added.
- - memory.memsw.usage_in_bytes.
- - memory.memsw.limit_in_bytes.
-
-memsw means memory+swap. Usage of memory+swap is limited by
-memsw.limit_in_bytes.
-
-Example: Assume a system with 4G of swap. A task which allocates 6G of memory
-(by mistake) under 2G memory limitation will use all swap.
-In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
-By using the memsw limit, you can avoid system OOM which can be caused by swap
-shortage.
-
-* why 'memory+swap' rather than swap.
-The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
-to move account from memory to swap...there is no change in usage of
-memory+swap. In other words, when we want to limit the usage of swap without
-affecting global LRU, memory+swap limit is better than just limiting swap from
-an OS point of view.
-
-* What happens when a cgroup hits memory.memsw.limit_in_bytes
-When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
-in this cgroup. Then, swap-out will not be done by cgroup routine and file
-caches are dropped. But as mentioned above, global LRU can do swapout memory
-from it for sanity of the system's memory management state. You can't forbid
-it by cgroup.
-
-2.5 Reclaim
-
-Each cgroup maintains a per cgroup LRU which has the same structure as
-global VM. When a cgroup goes over its limit, we first try
-to reclaim memory from the cgroup so as to make space for the new
-pages that the cgroup has touched. If the reclaim is unsuccessful,
-an OOM routine is invoked to select and kill the bulkiest task in the
-cgroup. (See 10. OOM Control below.)
-
-The reclaim algorithm has not been modified for cgroups, except that
-pages that are selected for reclaiming come from the per-cgroup LRU
-list.
-
-NOTE: Reclaim does not work for the root cgroup, since we cannot set any
-limits on the root cgroup.
-
-Note2: When panic_on_oom is set to "2", the whole system will panic.
-
-When oom event notifier is registered, event will be delivered.
-(See oom_control section)
-
-2.6 Locking
-
-   lock_page_cgroup()/unlock_page_cgroup() should not be called under
-   the i_pages lock.
-
-   Other lock order is following:
-   PG_locked.
-   mm->page_table_lock
-       pgdat->lru_lock
-	  lock_page_cgroup.
-  In many cases, just lock_page_cgroup() is called.
-  per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
-  pgdat->lru_lock, it has no lock of its own.
-
-2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
-
-With the Kernel memory extension, the Memory Controller is able to limit
-the amount of kernel memory used by the system. Kernel memory is fundamentally
-different than user memory, since it can't be swapped out, which makes it
-possible to DoS the system by consuming too much of this precious resource.
-
-Kernel memory accounting is enabled for all memory cgroups by default. But
-it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel
-at boot time. In this case, kernel memory will not be accounted at all.
-
-Kernel memory limits are not imposed for the root cgroup. Usage for the root
-cgroup may or may not be accounted. The memory used is accumulated into
-memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
-(currently only for tcp).
-The main "kmem" counter is fed into the main counter, so kmem charges will
-also be visible from the user counter.
-
-Currently no soft limit is implemented for kernel memory. It is future work
-to trigger slab reclaim when those limits are reached.
-
-2.7.1 Current Kernel Memory resources accounted
-
-* stack pages: every process consumes some stack pages. By accounting into
-kernel memory, we prevent new processes from being created when the kernel
-memory usage is too high.
-
-* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy
-of each kmem_cache is created every time the cache is touched by the first time
-from inside the memcg. The creation is done lazily, so some objects can still be
-skipped while the cache is being created. All objects in a slab page should
-belong to the same memcg. This only fails to hold when a task is migrated to a
-different memcg during the page allocation by the cache.
-
-* sockets memory pressure: some sockets protocols have memory pressure
-thresholds. The Memory Controller allows them to be controlled individually
-per cgroup, instead of globally.
-
-* tcp memory pressure: sockets memory pressure for the tcp protocol.
-
-2.7.2 Common use cases
-
-Because the "kmem" counter is fed to the main user counter, kernel memory can
-never be limited completely independently of user memory. Say "U" is the user
-limit, and "K" the kernel limit. There are three possible ways limits can be
-set:
-
-    U != 0, K = unlimited:
-    This is the standard memcg limitation mechanism already present before kmem
-    accounting. Kernel memory is completely ignored.
-
-    U != 0, K < U:
-    Kernel memory is a subset of the user memory. This setup is useful in
-    deployments where the total amount of memory per-cgroup is overcommited.
-    Overcommiting kernel memory limits is definitely not recommended, since the
-    box can still run out of non-reclaimable memory.
-    In this case, the admin could set up K so that the sum of all groups is
-    never greater than the total memory, and freely set U at the cost of his
-    QoS.
-    WARNING: In the current implementation, memory reclaim will NOT be
-    triggered for a cgroup when it hits K while staying below U, which makes
-    this setup impractical.
-
-    U != 0, K >= U:
-    Since kmem charges will also be fed to the user counter and reclaim will be
-    triggered for the cgroup for both kinds of memory. This setup gives the
-    admin a unified view of memory, and it is also useful for people who just
-    want to track kernel memory usage.
-
-3. User Interface
-
-3.0. Configuration
-
-a. Enable CONFIG_CGROUPS
-b. Enable CONFIG_MEMCG
-c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
-d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
-
-3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
-# mount -t tmpfs none /sys/fs/cgroup
-# mkdir /sys/fs/cgroup/memory
-# mount -t cgroup none /sys/fs/cgroup/memory -o memory
-
-3.2. Make the new group and move bash into it
-# mkdir /sys/fs/cgroup/memory/0
-# echo $$ > /sys/fs/cgroup/memory/0/tasks
-
-Since now we're in the 0 cgroup, we can alter the memory limit:
-# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-
-NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
-mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.)
-
-NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
-NOTE: We cannot set limits on the root cgroup any more.
-
-# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-4194304
-
-We can check the usage:
-# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
-1216512
-
-A successful write to this file does not guarantee a successful setting of
-this limit to the value written into the file. This can be due to a
-number of factors, such as rounding up to page boundaries or the total
-availability of memory on the system. The user is required to re-read
-this file after a write to guarantee the value committed by the kernel.
-
-# echo 1 > memory.limit_in_bytes
-# cat memory.limit_in_bytes
-4096
-
-The memory.failcnt field gives the number of times that the cgroup limit was
-exceeded.
-
-The memory.stat file gives accounting information. Now, the number of
-caches, RSS and Active pages/Inactive pages are shown.
-
-4. Testing
-
-For testing features and implementation, see memcg_test.txt.
-
-Performance test is also important. To see pure memory controller's overhead,
-testing on tmpfs will give you good numbers of small overheads.
-Example: do kernel make on tmpfs.
-
-Page-fault scalability is also important. At measuring parallel
-page fault test, multi-process test may be better than multi-thread
-test because it has noise of shared objects/status.
-
-But the above two are testing extreme situations.
-Trying usual test under memory controller is always helpful.
-
-4.1 Troubleshooting
-
-Sometimes a user might find that the application under a cgroup is
-terminated by the OOM killer. There are several causes for this:
-
-1. The cgroup limit is too low (just too low to do anything useful)
-2. The user is using anonymous memory and swap is turned off or too low
-
-A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
-some of the pages cached in the cgroup (page cache pages).
-
-To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
-seeing what happens will be helpful.
-
-4.2 Task migration
-
-When a task migrates from one cgroup to another, its charge is not
-carried forward by default. The pages allocated from the original cgroup still
-remain charged to it, the charge is dropped when the page is freed or
-reclaimed.
-
-You can move charges of a task along with task migration.
-See 8. "Move charges at task migration"
-
-4.3 Removing a cgroup
-
-A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
-cgroup might have some charge associated with it, even though all
-tasks have migrated away from it. (because we charge against pages, not
-against tasks.)
-
-We move the stats to root (if use_hierarchy==0) or parent (if
-use_hierarchy==1), and no change on the charge except uncharging
-from the child.
-
-Charges recorded in swap information is not updated at removal of cgroup.
-Recorded information is discarded and a cgroup which uses swap (swapcache)
-will be charged as a new owner of it.
-
-About use_hierarchy, see Section 6.
-
-5. Misc. interfaces.
-
-5.1 force_empty
-  memory.force_empty interface is provided to make cgroup's memory usage empty.
-  When writing anything to this
-
-  # echo 0 > memory.force_empty
-
-  the cgroup will be reclaimed and as many pages reclaimed as possible.
-
-  The typical use case for this interface is before calling rmdir().
-  Though rmdir() offlines memcg, but the memcg may still stay there due to
-  charged file caches. Some out-of-use page caches may keep charged until
-  memory pressure happens. If you want to avoid that, force_empty will be useful.
-
-  Also, note that when memory.kmem.limit_in_bytes is set the charges due to
-  kernel pages will still be seen. This is not considered a failure and the
-  write will still return success. In this case, it is expected that
-  memory.kmem.usage_in_bytes == memory.usage_in_bytes.
-
-  About use_hierarchy, see Section 6.
-
-5.2 stat file
-
-memory.stat file includes following statistics
-
-# per-memory cgroup local status
-cache		- # of bytes of page cache memory.
-rss		- # of bytes of anonymous and swap cache memory (includes
-		transparent hugepages).
-rss_huge	- # of bytes of anonymous transparent hugepages.
-mapped_file	- # of bytes of mapped file (includes tmpfs/shmem)
-pgpgin		- # of charging events to the memory cgroup. The charging
-		event happens each time a page is accounted as either mapped
-		anon page(RSS) or cache page(Page Cache) to the cgroup.
-pgpgout		- # of uncharging events to the memory cgroup. The uncharging
-		event happens each time a page is unaccounted from the cgroup.
-swap		- # of bytes of swap usage
-dirty		- # of bytes that are waiting to get written back to the disk.
-writeback	- # of bytes of file/anon cache that are queued for syncing to
-		disk.
-inactive_anon	- # of bytes of anonymous and swap cache memory on inactive
-		LRU list.
-active_anon	- # of bytes of anonymous and swap cache memory on active
-		LRU list.
-inactive_file	- # of bytes of file-backed memory on inactive LRU list.
-active_file	- # of bytes of file-backed memory on active LRU list.
-unevictable	- # of bytes of memory that cannot be reclaimed (mlocked etc).
-
-# status considering hierarchy (see memory.use_hierarchy settings)
-
-hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy
-			under which the memory cgroup is
-hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to
-			hierarchy under which memory cgroup is.
-
-total_<counter>		- # hierarchical version of <counter>, which in
-			addition to the cgroup's own value includes the
-			sum of all hierarchical children's values of
-			<counter>, i.e. total_cache
-
-# The following additional stats are dependent on CONFIG_DEBUG_VM.
-
-recent_rotated_anon	- VM internal parameter. (see mm/vmscan.c)
-recent_rotated_file	- VM internal parameter. (see mm/vmscan.c)
-recent_scanned_anon	- VM internal parameter. (see mm/vmscan.c)
-recent_scanned_file	- VM internal parameter. (see mm/vmscan.c)
-
-Memo:
-	recent_rotated means recent frequency of LRU rotation.
-	recent_scanned means recent # of scans to LRU.
-	showing for better debug please see the code for meanings.
-
-Note:
-	Only anonymous and swap cache memory is listed as part of 'rss' stat.
-	This should not be confused with the true 'resident set size' or the
-	amount of physical memory used by the cgroup.
-	'rss + mapped_file" will give you resident set size of cgroup.
-	(Note: file and shmem may be shared among other cgroups. In that case,
-	 mapped_file is accounted only when the memory cgroup is owner of page
-	 cache.)
-
-5.3 swappiness
-
-Overrides /proc/sys/vm/swappiness for the particular group. The tunable
-in the root cgroup corresponds to the global swappiness setting.
-
-Please note that unlike during the global reclaim, limit reclaim
-enforces that 0 swappiness really prevents from any swapping even if
-there is a swap storage available. This might lead to memcg OOM killer
-if there are no file pages to reclaim.
-
-5.4 failcnt
-
-A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
-This failcnt(== failure count) shows the number of times that a usage counter
-hit its limit. When a memory cgroup hits a limit, failcnt increases and
-memory under it will be reclaimed.
-
-You can reset failcnt by writing 0 to failcnt file.
-# echo 0 > .../memory.failcnt
-
-5.5 usage_in_bytes
-
-For efficiency, as other kernel components, memory cgroup uses some optimization
-to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
-method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz
-value for efficient access. (Of course, when necessary, it's synchronized.)
-If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
-value in memory.stat(see 5.2).
-
-5.6 numa_stat
-
-This is similar to numa_maps but operates on a per-memcg basis.  This is
-useful for providing visibility into the numa locality information within
-an memcg since the pages are allowed to be allocated from any physical
-node.  One of the use cases is evaluating application performance by
-combining this information with the application's CPU allocation.
-
-Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
-per-node page counts including "hierarchical_<counter>" which sums up all
-hierarchical children's values in addition to the memcg's own value.
-
-The output format of memory.numa_stat is:
-
-total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
-file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
-anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
-unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
-hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
-
-The "total" count is sum of file + anon + unevictable.
-
-6. Hierarchy support
-
-The memory controller supports a deep hierarchy and hierarchical accounting.
-The hierarchy is created by creating the appropriate cgroups in the
-cgroup filesystem. Consider for example, the following cgroup filesystem
-hierarchy
-
-	       root
-	     /  |   \
-            /	|    \
-	   a	b     c
-		      | \
-		      |  \
-		      d   e
-
-In the diagram above, with hierarchical accounting enabled, all memory
-usage of e, is accounted to its ancestors up until the root (i.e, c and root),
-that has memory.use_hierarchy enabled. If one of the ancestors goes over its
-limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
-children of the ancestor.
-
-6.1 Enabling hierarchical accounting and reclaim
-
-A memory cgroup by default disables the hierarchy feature. Support
-can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup
-
-# echo 1 > memory.use_hierarchy
-
-The feature can be disabled by
-
-# echo 0 > memory.use_hierarchy
-
-NOTE1: Enabling/disabling will fail if either the cgroup already has other
-       cgroups created below it, or if the parent cgroup has use_hierarchy
-       enabled.
-
-NOTE2: When panic_on_oom is set to "2", the whole system will panic in
-       case of an OOM event in any cgroup.
-
-7. Soft limits
-
-Soft limits allow for greater sharing of memory. The idea behind soft limits
-is to allow control groups to use as much of the memory as needed, provided
-
-a. There is no memory contention
-b. They do not exceed their hard limit
-
-When the system detects memory contention or low memory, control groups
-are pushed back to their soft limits. If the soft limit of each control
-group is very high, they are pushed back as much as possible to make
-sure that one control group does not starve the others of memory.
-
-Please note that soft limits is a best-effort feature; it comes with
-no guarantees, but it does its best to make sure that when memory is
-heavily contended for, memory is allocated based on the soft limit
-hints/setup. Currently soft limit based reclaim is set up such that
-it gets invoked from balance_pgdat (kswapd).
-
-7.1 Interface
-
-Soft limits can be setup by using the following commands (in this example we
-assume a soft limit of 256 MiB)
-
-# echo 256M > memory.soft_limit_in_bytes
-
-If we want to change this to 1G, we can at any time use
-
-# echo 1G > memory.soft_limit_in_bytes
-
-NOTE1: Soft limits take effect over a long period of time, since they involve
-       reclaiming memory for balancing between memory cgroups
-NOTE2: It is recommended to set the soft limit always below the hard limit,
-       otherwise the hard limit will take precedence.
-
-8. Move charges at task migration
-
-Users can move charges associated with a task along with task migration, that
-is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
-This feature is not supported in !CONFIG_MMU environments because of lack of
-page tables.
-
-8.1 Interface
-
-This feature is disabled by default. It can be enabled (and disabled again) by
-writing to memory.move_charge_at_immigrate of the destination cgroup.
-
-If you want to enable it:
-
-# echo (some positive value) > memory.move_charge_at_immigrate
-
-Note: Each bits of move_charge_at_immigrate has its own meaning about what type
-      of charges should be moved. See 8.2 for details.
-Note: Charges are moved only when you move mm->owner, in other words,
-      a leader of a thread group.
-Note: If we cannot find enough space for the task in the destination cgroup, we
-      try to make space by reclaiming memory. Task migration may fail if we
-      cannot make enough space.
-Note: It can take several seconds if you move charges much.
-
-And if you want disable it again:
-
-# echo 0 > memory.move_charge_at_immigrate
-
-8.2 Type of charges which can be moved
-
-Each bit in move_charge_at_immigrate has its own meaning about what type of
-charges should be moved. But in any case, it must be noted that an account of
-a page or a swap can be moved only when it is charged to the task's current
-(old) memory cgroup.
-
-  bit | what type of charges would be moved ?
- -----+------------------------------------------------------------------------
-   0  | A charge of an anonymous page (or swap of it) used by the target task.
-      | You must enable Swap Extension (see 2.4) to enable move of swap charges.
- -----+------------------------------------------------------------------------
-   1  | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory)
-      | and swaps of tmpfs file) mmapped by the target task. Unlike the case of
-      | anonymous pages, file pages (and swaps) in the range mmapped by the task
-      | will be moved even if the task hasn't done page fault, i.e. they might
-      | not be the task's "RSS", but other task's "RSS" that maps the same file.
-      | And mapcount of the page is ignored (the page can be moved even if
-      | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to
-      | enable move of swap charges.
-
-8.3 TODO
-
-- All of moving charge operations are done under cgroup_mutex. It's not good
-  behavior to hold the mutex too long, so we may need some trick.
-
-9. Memory thresholds
-
-Memory cgroup implements memory thresholds using the cgroups notification
-API (see cgroups.txt). It allows to register multiple memory and memsw
-thresholds and gets notifications when it crosses.
-
-To register a threshold, an application must:
-- create an eventfd using eventfd(2);
-- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
-- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
-  cgroup.event_control.
-
-Application will be notified through eventfd when memory usage crosses
-threshold in any direction.
-
-It's applicable for root and non-root cgroup.
-
-10. OOM Control
-
-memory.oom_control file is for OOM notification and other controls.
-
-Memory cgroup implements OOM notifier using the cgroup notification
-API (See cgroups.txt). It allows to register multiple OOM notification
-delivery and gets notification when OOM happens.
-
-To register a notifier, an application must:
- - create an eventfd using eventfd(2)
- - open memory.oom_control file
- - write string like "<event_fd> <fd of memory.oom_control>" to
-   cgroup.event_control
-
-The application will be notified through eventfd when OOM happens.
-OOM notification doesn't work for the root cgroup.
-
-You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
-
-	#echo 1 > memory.oom_control
-
-If OOM-killer is disabled, tasks under cgroup will hang/sleep
-in memory cgroup's OOM-waitqueue when they request accountable memory.
-
-For running them, you have to relax the memory cgroup's OOM status by
-	* enlarge limit or reduce usage.
-To reduce usage,
-	* kill some tasks.
-	* move some tasks to other group with account migration.
-	* remove some files (on tmpfs?)
-
-Then, stopped tasks will work again.
-
-At reading, current status of OOM is shown.
-	oom_kill_disable 0 or 1 (if 1, oom-killer is disabled)
-	under_oom	 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
-				 be stopped.)
-
-11. Memory Pressure
-
-The pressure level notifications can be used to monitor the memory
-allocation cost; based on the pressure, applications can implement
-different strategies of managing their memory resources. The pressure
-levels are defined as following:
-
-The "low" level means that the system is reclaiming memory for new
-allocations. Monitoring this reclaiming activity might be useful for
-maintaining cache level. Upon notification, the program (typically
-"Activity Manager") might analyze vmstat and act in advance (i.e.
-prematurely shutdown unimportant services).
-
-The "medium" level means that the system is experiencing medium memory
-pressure, the system might be making swap, paging out active file caches,
-etc. Upon this event applications may decide to further analyze
-vmstat/zoneinfo/memcg or internal memory usage statistics and free any
-resources that can be easily reconstructed or re-read from a disk.
-
-The "critical" level means that the system is actively thrashing, it is
-about to out of memory (OOM) or even the in-kernel OOM killer is on its
-way to trigger. Applications should do whatever they can to help the
-system. It might be too late to consult with vmstat or any other
-statistics, so it's advisable to take an immediate action.
-
-By default, events are propagated upward until the event is handled, i.e. the
-events are not pass-through. For example, you have three cgroups: A->B->C. Now
-you set up an event listener on cgroups A, B and C, and suppose group C
-experiences some pressure. In this situation, only group C will receive the
-notification, i.e. groups A and B will not receive it. This is done to avoid
-excessive "broadcasting" of messages, which disturbs the system and which is
-especially bad if we are low on memory or thrashing. Group B, will receive
-notification only if there are no event listers for group C.
-
-There are three optional modes that specify different propagation behavior:
-
- - "default": this is the default behavior specified above. This mode is the
-   same as omitting the optional mode parameter, preserved by backwards
-   compatibility.
-
- - "hierarchy": events always propagate up to the root, similar to the default
-   behavior, except that propagation continues regardless of whether there are
-   event listeners at each level, with the "hierarchy" mode. In the above
-   example, groups A, B, and C will receive notification of memory pressure.
-
- - "local": events are pass-through, i.e. they only receive notifications when
-   memory pressure is experienced in the memcg for which the notification is
-   registered. In the above example, group C will receive notification if
-   registered for "local" notification and the group experiences memory
-   pressure. However, group B will never receive notification, regardless if
-   there is an event listener for group C or not, if group B is registered for
-   local notification.
-
-The level and event notification mode ("hierarchy" or "local", if necessary) are
-specified by a comma-delimited string, i.e. "low,hierarchy" specifies
-hierarchical, pass-through, notification for all ancestor memcgs. Notification
-that is the default, non pass-through behavior, does not specify a mode.
-"medium,local" specifies pass-through notification for the medium level.
-
-The file memory.pressure_level is only used to setup an eventfd. To
-register a notification, an application must:
-
-- create an eventfd using eventfd(2);
-- open memory.pressure_level;
-- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
-  to cgroup.event_control.
-
-Application will be notified through eventfd when memory pressure is at
-the specific level (or higher). Read/write operations to
-memory.pressure_level are no implemented.
-
-Test:
-
-   Here is a small script example that makes a new cgroup, sets up a
-   memory limit, sets up a notification in the cgroup and then makes child
-   cgroup experience a critical pressure:
-
-   # cd /sys/fs/cgroup/memory/
-   # mkdir foo
-   # cd foo
-   # cgroup_event_listener memory.pressure_level low,hierarchy &
-   # echo 8000000 > memory.limit_in_bytes
-   # echo 8000000 > memory.memsw.limit_in_bytes
-   # echo $$ > tasks
-   # dd if=/dev/zero | read x
-
-   (Expect a bunch of notifications, and eventually, the oom-killer will
-   trigger.)
-
-12. TODO
-
-1. Make per-cgroup scanner reclaim not-shared pages first
-2. Teach controller to account for shared-pages
-3. Start reclamation in the background when the limit is
-   not yet hit but the usage is getting closer
-
-Summary
-
-Overall, the memory controller has been a stable controller and has been
-commented and discussed quite extensively in the community.
-
-References
-
-1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
-2. Singh, Balbir. Memory Controller (RSS Control),
-   http://lwn.net/Articles/222762/
-3. Emelianov, Pavel. Resource controllers based on process cgroups
-   http://lkml.org/lkml/2007/3/6/198
-4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
-   http://lkml.org/lkml/2007/4/9/78
-5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
-   http://lkml.org/lkml/2007/5/30/244
-6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
-7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
-   subsystem (v3), http://lwn.net/Articles/235534/
-8. Singh, Balbir. RSS controller v2 test results (lmbench),
-   http://lkml.org/lkml/2007/5/17/232
-9. Singh, Balbir. RSS controller v2 AIM9 results
-   http://lkml.org/lkml/2007/5/18/1
-10. Singh, Balbir. Memory controller v6 test results,
-    http://lkml.org/lkml/2007/8/19/36
-11. Singh, Balbir. Memory controller introduction (v6),
-    http://lkml.org/lkml/2007/8/17/69
-12. Corbet, Jonathan, Controlling memory use in cgroups,
-    http://lwn.net/Articles/243795/
diff --git a/Documentation/cgroup-v1/net_cls.txt b/Documentation/cgroup-v1/net_cls.txt
deleted file mode 100644
index ec182346dea2..000000000000
--- a/Documentation/cgroup-v1/net_cls.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-Network classifier cgroup
--------------------------
-
-The Network classifier cgroup provides an interface to
-tag network packets with a class identifier (classid).
-
-The Traffic Controller (tc) can be used to assign
-different priorities to packets from different cgroups.
-Also, Netfilter (iptables) can use this tag to perform
-actions on such packets.
-
-Creating a net_cls cgroups instance creates a net_cls.classid file.
-This net_cls.classid value is initialized to 0.
-
-You can write hexadecimal values to net_cls.classid; the format for these
-values is 0xAAAABBBB; AAAA is the major handle number and BBBB
-is the minor handle number.
-Reading net_cls.classid yields a decimal result.
-
-Example:
-mkdir /sys/fs/cgroup/net_cls
-mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
-mkdir /sys/fs/cgroup/net_cls/0
-echo 0x100001 >  /sys/fs/cgroup/net_cls/0/net_cls.classid
-	- setting a 10:1 handle.
-
-cat /sys/fs/cgroup/net_cls/0/net_cls.classid
-1048577
-
-configuring tc:
-tc qdisc add dev eth0 root handle 10: htb
-
-tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
- - creating traffic class 10:1
-
-tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
-
-configuring iptables, basic example:
-iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
diff --git a/Documentation/cgroup-v1/net_prio.txt b/Documentation/cgroup-v1/net_prio.txt
deleted file mode 100644
index a82cbd28ea8a..000000000000
--- a/Documentation/cgroup-v1/net_prio.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-Network priority cgroup
--------------------------
-
-The Network priority cgroup provides an interface to allow an administrator to
-dynamically set the priority of network traffic generated by various
-applications
-
-Nominally, an application would set the priority of its traffic via the
-SO_PRIORITY socket option.  This however, is not always possible because:
-
-1) The application may not have been coded to set this value
-2) The priority of application traffic is often a site-specific administrative
-   decision rather than an application defined one.
-
-This cgroup allows an administrator to assign a process to a group which defines
-the priority of egress traffic on a given interface. Network priority groups can
-be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
-
-With the above step, the initial group acting as the parent accounting group
-becomes visible at '/sys/fs/cgroup/net_prio'.  This group includes all tasks in
-the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
-
-Each net_prio cgroup contains two files that are subsystem specific
-
-net_prio.prioidx
-This file is read-only, and is simply informative.  It contains a unique integer
-value that the kernel uses as an internal representation of this cgroup.
-
-net_prio.ifpriomap
-This file contains a map of the priorities assigned to traffic originating from
-processes in this group and egressing the system on various interfaces. It
-contains a list of tuples in the form <ifname priority>.  Contents of this file
-can be modified by echoing a string into the file using the same tuple format.
-for example:
-
-echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap
-
-This command would force any traffic originating from processes belonging to the
-iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
-said traffic set to the value 5. The parent accounting group also has a
-writeable 'net_prio.ifpriomap' file that can be used to set a system default
-priority.
-
-Priorities are set immediately prior to queueing a frame to the device
-queueing discipline (qdisc) so priorities will be assigned prior to the hardware
-queue selection being made.
-
-One usage for the net_prio cgroup is with mqprio qdisc allowing application
-traffic to be steered to hardware/driver based traffic classes. These mappings
-can then be managed by administrators or other networking protocols such as
-DCBX.
-
-A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/cgroup-v1/pids.txt b/Documentation/cgroup-v1/pids.txt
deleted file mode 100644
index e105d708ccde..000000000000
--- a/Documentation/cgroup-v1/pids.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-						   Process Number Controller
-						   =========================
-
-Abstract
---------
-
-The process number controller is used to allow a cgroup hierarchy to stop any
-new tasks from being fork()'d or clone()'d after a certain limit is reached.
-
-Since it is trivial to hit the task limit without hitting any kmemcg limits in
-place, PIDs are a fundamental resource. As such, PID exhaustion must be
-preventable in the scope of a cgroup hierarchy by allowing resource limiting of
-the number of tasks in a cgroup.
-
-Usage
------
-
-In order to use the `pids` controller, set the maximum number of tasks in
-pids.max (this is not available in the root cgroup for obvious reasons). The
-number of processes currently in the cgroup is given by pids.current.
-
-Organisational operations are not blocked by cgroup policies, so it is possible
-to have pids.current > pids.max. This can be done by either setting the limit to
-be smaller than pids.current, or attaching enough processes to the cgroup such
-that pids.current > pids.max. However, it is not possible to violate a cgroup
-policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
-creation of a new process would cause a cgroup policy to be violated.
-
-To set a cgroup to have no limit, set pids.max to "max". This is the default for
-all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
-limit in the hierarchy is followed).
-
-pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
-superset of parent/child/pids.current.
-
-The pids.events file contains event counters:
-  - max: Number of times fork failed because limit was hit.
-
-Example
--------
-
-First, we mount the pids controller:
-# mkdir -p /sys/fs/cgroup/pids
-# mount -t cgroup -o pids none /sys/fs/cgroup/pids
-
-Then we create a hierarchy, set limits and attach processes to it:
-# mkdir -p /sys/fs/cgroup/pids/parent/child
-# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
-# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
-# cat /sys/fs/cgroup/pids/parent/pids.current
-2
-#
-
-It should be noted that attempts to overcome the set limit (2 in this case) will
-fail:
-
-# cat /sys/fs/cgroup/pids/parent/pids.current
-2
-# ( /bin/echo "Here's some processes for you." | cat )
-sh: fork: Resource temporary unavailable
-#
-
-Even if we migrate to a child cgroup (which doesn't have a set limit), we will
-not be able to overcome the most stringent limit in the hierarchy (in this case,
-parent's):
-
-# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
-# cat /sys/fs/cgroup/pids/parent/pids.current
-2
-# cat /sys/fs/cgroup/pids/parent/child/pids.current
-2
-# cat /sys/fs/cgroup/pids/parent/child/pids.max
-max
-# ( /bin/echo "Here's some processes for you." | cat )
-sh: fork: Resource temporary unavailable
-#
-
-We can set a limit that is smaller than pids.current, which will stop any new
-processes from being forked at all (note that the shell itself counts towards
-pids.current):
-
-# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
-# /bin/echo "We can't even spawn a single process now."
-sh: fork: Resource temporary unavailable
-# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
-# /bin/echo "We can't even spawn a single process now."
-sh: fork: Resource temporary unavailable
-#
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt
deleted file mode 100644
index 9bdb7fd03f83..000000000000
--- a/Documentation/cgroup-v1/rdma.txt
+++ /dev/null
@@ -1,109 +0,0 @@
-				RDMA Controller
-				----------------
-
-Contents
---------
-
-1. Overview
-  1-1. What is RDMA controller?
-  1-2. Why RDMA controller needed?
-  1-3. How is RDMA controller implemented?
-2. Usage Examples
-
-1. Overview
-
-1-1. What is RDMA controller?
------------------------------
-
-RDMA controller allows user to limit RDMA/IB specific resources that a given
-set of processes can use. These processes are grouped using RDMA controller.
-
-RDMA controller defines two resources which can be limited for processes of a
-cgroup.
-
-1-2. Why RDMA controller needed?
---------------------------------
-
-Currently user space applications can easily take away all the rdma verb
-specific resources such as AH, CQ, QP, MR etc. Due to which other applications
-in other cgroup or kernel space ULPs may not even get chance to allocate any
-rdma resources. This can lead to service unavailability.
-
-Therefore RDMA controller is needed through which resource consumption
-of processes can be limited. Through this controller different rdma
-resources can be accounted.
-
-1-3. How is RDMA controller implemented?
-----------------------------------------
-
-RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
-resource accounting per cgroup, per device using resource pool structure.
-Each such resource pool is limited up to 64 resources in given resource pool
-by rdma cgroup, which can be extended later if required.
-
-This resource pool object is linked to the cgroup css. Typically there
-are 0 to 4 resource pool instances per cgroup, per device in most use cases.
-But nothing limits to have it more. At present hundreds of RDMA devices per
-single cgroup may not be handled optimally, however there is no
-known use case or requirement for such configuration either.
-
-Since RDMA resources can be allocated from any process and can be freed by any
-of the child processes which shares the address space, rdma resources are
-always owned by the creator cgroup css. This allows process migration from one
-to other cgroup without major complexity of transferring resource ownership;
-because such ownership is not really present due to shared nature of
-rdma resources. Linking resources around css also ensures that cgroups can be
-deleted after processes migrated. This allow progress migration as well with
-active resources, even though that is not a primary use case.
-
-Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
-the caller. Same rdma cgroup should be passed while uncharging the resource.
-This also allows process migrated with active RDMA resource to charge
-to new owner cgroup for new resource. It also allows to uncharge resource of
-a process from previously charged cgroup which is migrated to new cgroup,
-even though that is not a primary use case.
-
-Resource pool object is created in following situations.
-(a) User sets the limit and no previous resource pool exist for the device
-of interest for the cgroup.
-(b) No resource limits were configured, but IB/RDMA stack tries to
-charge the resource. So that it correctly uncharge them when applications are
-running without limits and later on when limits are enforced during uncharging,
-otherwise usage count will drop to negative.
-
-Resource pool is destroyed if all the resource limits are set to max and
-it is the last resource getting deallocated.
-
-User should set all the limit to max value if it intents to remove/unconfigure
-the resource pool for a particular device.
-
-IB stack honors limits enforced by the rdma controller. When application
-query about maximum resource limits of IB device, it returns minimum of
-what is configured by user for a given cgroup and what is supported by
-IB device.
-
-Following resources can be accounted by rdma controller.
-  hca_handle	Maximum number of HCA Handles
-  hca_object 	Maximum number of HCA Objects
-
-2. Usage Examples
------------------
-
-(a) Configure resource limit:
-echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
-echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
-
-(b) Query resource limit:
-cat /sys/fs/cgroup/rdma/2/rdma.max
-#Output:
-mlx4_0 hca_handle=2 hca_object=2000
-ocrdma1 hca_handle=3 hca_object=max
-
-(c) Query current usage:
-cat /sys/fs/cgroup/rdma/2/rdma.current
-#Output:
-mlx4_0 hca_handle=1 hca_object=20
-ocrdma1 hca_handle=1 hca_object=23
-
-(d) Delete resource limit:
-echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/cma/debugfs.txt b/Documentation/cma/debugfs.txt
deleted file mode 100644
index 6cef20a8cedc..000000000000
--- a/Documentation/cma/debugfs.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-The CMA debugfs interface is useful to retrieve basic information out of the
-different CMA areas and to test allocation/release in each of the areas.
-
-Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
-kernel's CMA index. So the first CMA zone would be:
-
-	<debugfs>/cma/cma-0
-
-The structure of the files created under that directory is as follows:
-
- - [RO] base_pfn: The base PFN (Page Frame Number) of the zone.
- - [RO] count: Amount of memory in the CMA area.
- - [RO] order_per_bit: Order of pages represented by one bit.
- - [RO] bitmap: The bitmap of page states in the zone.
- - [WO] alloc: Allocate N pages from that CMA area. For example:
-
-	echo 5 > <debugfs>/cma/cma-2/alloc
-
-would try to allocate 5 pages from the cma-2 area.
-
- - [WO] free: Free N pages from that CMA area, similar to the above.
diff --git a/Documentation/conf.py b/Documentation/conf.py
index 72647a38b5c2..a8fe845832bc 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -16,6 +16,8 @@ import sys
 import os
 import sphinx
 
+from subprocess import check_output
+
 # Get Sphinx version
 major, minor, patch = sphinx.version_info[:3]
 
@@ -34,10 +36,11 @@ needs_sphinx = '1.3'
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure', 'sphinx.ext.ifconfig']
+extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain',
+              'kfigure', 'sphinx.ext.ifconfig', 'automarkup']
 
 # The name of the math extension changed on Sphinx 1.4
-if major == 1 and minor > 3:
+if (major == 1 and minor > 3) or (major > 1):
     extensions.append("sphinx.ext.imgmath")
 else:
     extensions.append("sphinx.ext.pngmath")
@@ -200,7 +203,7 @@ html_context = {
 
 # If true, SmartyPants will be used to convert quotes and dashes to
 # typographically correct entities.
-#html_use_smartypants = True
+html_use_smartypants = False
 
 # Custom sidebar templates, maps document names to template names.
 #html_sidebars = {}
@@ -275,10 +278,21 @@ latex_elements = {
         \\setsansfont{DejaVu Sans}
         \\setromanfont{DejaVu Serif}
         \\setmonofont{DejaVu Sans Mono}
-
      '''
 }
 
+# At least one book (translations) may have Asian characters
+# with are only displayed if xeCJK is used
+
+cjk_cmd = check_output(['fc-list', '--format="%{family[0]}\n"']).decode('utf-8', 'ignore')
+if cjk_cmd.find("Noto Sans CJK SC") >= 0:
+    print ("enabling CJK for LaTeX builder")
+    latex_elements['preamble']  += '''
+	% This is needed for translations
+        \\usepackage{xeCJK}
+        \\setCJKmainfont{Noto Sans CJK SC}
+     '''
+
 # Fix reference escape troubles with Sphinx 1.4.x
 if major == 1 and minor > 3:
     latex_elements['preamble']  += '\\renewcommand*{\\DUrole}[2]{ #2 }\n'
@@ -409,6 +423,21 @@ latex_documents = [
      'The kernel development community', 'manual'),
 ]
 
+# Add all other index files from Documentation/ subdirectories
+for fn in os.listdir('.'):
+    doc = os.path.join(fn, "index")
+    if os.path.exists(doc + ".rst"):
+        has = False
+        for l in latex_documents:
+            if l[0] == doc:
+                has = True
+                break
+        if not has:
+            latex_documents.append((doc, fn + '.tex',
+                                    'Linux %s Documentation' % fn.capitalize(),
+                                    'The kernel development community',
+                                    'manual'))
+
 # The name of an image file (relative to this directory) to place at the top of
 # the title page.
 #latex_logo = None
diff --git a/Documentation/connector/connector.txt b/Documentation/connector/connector.txt
deleted file mode 100644
index ab7ca897fab7..000000000000
--- a/Documentation/connector/connector.txt
+++ /dev/null
@@ -1,196 +0,0 @@
-/*****************************************/
-Kernel Connector.
-/*****************************************/
-
-Kernel connector - new netlink based userspace <-> kernel space easy
-to use communication module.
-
-The Connector driver makes it easy to connect various agents using a
-netlink based network.  One must register a callback and an identifier.
-When the driver receives a special netlink message with the appropriate
-identifier, the appropriate callback will be called.
-
-From the userspace point of view it's quite straightforward:
-
-	socket();
-	bind();
-	send();
-	recv();
-
-But if kernelspace wants to use the full power of such connections, the
-driver writer must create special sockets, must know about struct sk_buff
-handling, etc...  The Connector driver allows any kernelspace agents to use
-netlink based networking for inter-process communication in a significantly
-easier way:
-
-int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *));
-void cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __group, int gfp_mask);
-void cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, int gfp_mask);
-
-struct cb_id
-{
-	__u32			idx;
-	__u32			val;
-};
-
-idx and val are unique identifiers which must be registered in the
-connector.h header for in-kernel usage.  void (*callback) (void *) is a
-callback function which will be called when a message with above idx.val
-is received by the connector core.  The argument for that function must
-be dereferenced to struct cn_msg *.
-
-struct cn_msg
-{
-	struct cb_id		id;
-
-	__u32			seq;
-	__u32			ack;
-
-	__u32			len;		/* Length of the following data */
-	__u8			data[0];
-};
-
-/*****************************************/
-Connector interfaces.
-/*****************************************/
-
-int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *));
-
- Registers new callback with connector core.
-
- struct cb_id *id		- unique connector's user identifier.
-				  It must be registered in connector.h for legal in-kernel users.
- char *name			- connector's callback symbolic name.
- void (*callback) (struct cn..)	- connector's callback.
-				  cn_msg and the sender's credentials
-
-
-void cn_del_callback(struct cb_id *id);
-
- Unregisters new callback with connector core.
-
- struct cb_id *id		- unique connector's user identifier.
-
-
-int cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __groups, int gfp_mask);
-int cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __groups, int gfp_mask);
-
- Sends message to the specified groups.  It can be safely called from
- softirq context, but may silently fail under strong memory pressure.
- If there are no listeners for given group -ESRCH can be returned.
-
- struct cn_msg *		- message header(with attached data).
- u16 len			- for *_multi multiple cn_msg messages can be sent
- u32 port			- destination port.
- 				  If non-zero the message will be sent to the
-				  given port, which should be set to the
-				  original sender.
- u32 __group			- destination group.
-				  If port and __group is zero, then appropriate group will
-				  be searched through all registered connector users,
-				  and message will be delivered to the group which was
-				  created for user with the same ID as in msg.
-				  If __group is not zero, then message will be delivered
-				  to the specified group.
- int gfp_mask			- GFP mask.
-
- Note: When registering new callback user, connector core assigns
- netlink group to the user which is equal to its id.idx.
-
-/*****************************************/
-Protocol description.
-/*****************************************/
-
-The current framework offers a transport layer with fixed headers.  The
-recommended protocol which uses such a header is as following:
-
-msg->seq and msg->ack are used to determine message genealogy.  When
-someone sends a message, they use a locally unique sequence and random
-acknowledge number.  The sequence number may be copied into
-nlmsghdr->nlmsg_seq too.
-
-The sequence number is incremented with each message sent.
-
-If you expect a reply to the message, then the sequence number in the
-received message MUST be the same as in the original message, and the
-acknowledge number MUST be the same + 1.
-
-If we receive a message and its sequence number is not equal to one we
-are expecting, then it is a new message.  If we receive a message and
-its sequence number is the same as one we are expecting, but its
-acknowledge is not equal to the sequence number in the original
-message + 1, then it is a new message.
-
-Obviously, the protocol header contains the above id.
-
-The connector allows event notification in the following form: kernel
-driver or userspace process can ask connector to notify it when
-selected ids will be turned on or off (registered or unregistered its
-callback).  It is done by sending a special command to the connector
-driver (it also registers itself with id={-1, -1}).
-
-As example of this usage can be found in the cn_test.c module which
-uses the connector to request notification and to send messages.
-
-/*****************************************/
-Reliability.
-/*****************************************/
-
-Netlink itself is not a reliable protocol.  That means that messages can
-be lost due to memory pressure or process' receiving queue overflowed,
-so caller is warned that it must be prepared.  That is why the struct
-cn_msg [main connector's message header] contains u32 seq and u32 ack
-fields.
-
-/*****************************************/
-Userspace usage.
-/*****************************************/
-
-2.6.14 has a new netlink socket implementation, which by default does not
-allow people to send data to netlink groups other than 1.
-So, if you wish to use a netlink socket (for example using connector)
-with a different group number, the userspace application must subscribe to
-that group first.  It can be achieved by the following pseudocode:
-
-s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
-
-l_local.nl_family = AF_NETLINK;
-l_local.nl_groups = 12345;
-l_local.nl_pid = 0;
-
-if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
-	perror("bind");
-	close(s);
-	return -1;
-}
-
-{
-	int on = l_local.nl_groups;
-	setsockopt(s, 270, 1, &on, sizeof(on));
-}
-
-Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket
-option.  To drop a multicast subscription, one should call the above socket
-option with the NETLINK_DROP_MEMBERSHIP parameter which is defined as 0.
-
-2.6.14 netlink code only allows to select a group which is less or equal to
-the maximum group number, which is used at netlink_kernel_create() time.
-In case of connector it is CN_NETLINK_USERS + 0xf, so if you want to use
-group number 12345, you must increment CN_NETLINK_USERS to that number.
-Additional 0xf numbers are allocated to be used by non-in-kernel users.
-
-Due to this limitation, group 0xffffffff does not work now, so one can
-not use add/remove connector's group notifications, but as far as I know, 
-only cn_test.c test module used it.
-
-Some work in netlink area is still being done, so things can be changed in
-2.6.15 timeframe, if it will happen, documentation will be updated for that
-kernel.
-
-/*****************************************/
-Code samples
-/*****************************************/
-
-Sample code for a connector test module and user space can be found
-in samples/connector/. To build this code, enable CONFIG_CONNECTOR
-and CONFIG_SAMPLES.
diff --git a/Documentation/console/console.txt b/Documentation/console/console.txt
deleted file mode 100644
index d73c2ab4beda..000000000000
--- a/Documentation/console/console.txt
+++ /dev/null
@@ -1,145 +0,0 @@
-Console Drivers
-===============
-
-The Linux kernel has 2 general types of console drivers.  The first type is
-assigned by the kernel to all the virtual consoles during the boot process.
-This type will be called 'system driver', and only one system driver is allowed
-to exist. The system driver is persistent and it can never be unloaded, though
-it may become inactive.
-
-The second type has to be explicitly loaded and unloaded. This will be called
-'modular driver' by this document. Multiple modular drivers can coexist at
-any time with each driver sharing the console with other drivers including
-the system driver. However, modular drivers cannot take over the console
-that is currently occupied by another modular driver. (Exception: Drivers that
-call do_take_over_console() will succeed in the takeover regardless of the type
-of driver occupying the consoles.) They can only take over the console that is
-occupied by the system driver. In the same token, if the modular driver is
-released by the console, the system driver will take over.
-
-Modular drivers, from the programmer's point of view, have to call:
-
-	 do_take_over_console() - load and bind driver to console layer
-	 give_up_console() - unload driver; it will only work if driver
-			     is fully unbound
-
-In newer kernels, the following are also available:
-
-	 do_register_con_driver()
-	 do_unregister_con_driver()
-
-If sysfs is enabled, the contents of /sys/class/vtconsole can be
-examined. This shows the console backends currently registered by the
-system which are named vtcon<n> where <n> is an integer from 0 to 15. Thus:
-
-       ls /sys/class/vtconsole
-       .  ..  vtcon0  vtcon1
-
-Each directory in /sys/class/vtconsole has 3 files:
-
-     ls /sys/class/vtconsole/vtcon0
-     .  ..  bind  name  uevent
-
-What do these files signify?
-
-     1. bind - this is a read/write file. It shows the status of the driver if
-        read, or acts to bind or unbind the driver to the virtual consoles
-        when written to. The possible values are:
-
-	0 - means the driver is not bound and if echo'ed, commands the driver
-	    to unbind
-
-        1 - means the driver is bound and if echo'ed, commands the driver to
-	    bind
-
-     2. name - read-only file. Shows the name of the driver in this format:
-
-	cat /sys/class/vtconsole/vtcon0/name
-	(S) VGA+
-
-	    '(S)' stands for a (S)ystem driver, i.e., it cannot be directly
-	    commanded to bind or unbind
-
-	    'VGA+' is the name of the driver
-
-	cat /sys/class/vtconsole/vtcon1/name
-	(M) frame buffer device
-
-	    In this case, '(M)' stands for a (M)odular driver, one that can be
-	    directly commanded to bind or unbind.
-
-     3. uevent - ignore this file
-
-When unbinding, the modular driver is detached first, and then the system
-driver takes over the consoles vacated by the driver. Binding, on the other
-hand, will bind the driver to the consoles that are currently occupied by a
-system driver.
-
-NOTE1: Binding and unbinding must be selected in Kconfig. It's under:
-
-Device Drivers -> Character devices -> Support for binding and unbinding
-console drivers
-
-NOTE2: If any of the virtual consoles are in KD_GRAPHICS mode, then binding or
-unbinding will not succeed. An example of an application that sets the console
-to KD_GRAPHICS is X.
-
-How useful is this feature? This is very useful for console driver
-developers. By unbinding the driver from the console layer, one can unload the
-driver, make changes, recompile, reload and rebind the driver without any need
-for rebooting the kernel. For regular users who may want to switch from
-framebuffer console to VGA console and vice versa, this feature also makes
-this possible. (NOTE NOTE NOTE: Please read fbcon.txt under Documentation/fb
-for more details.)
-
-Notes for developers:
-=====================
-
-do_take_over_console() is now broken up into:
-
-     do_register_con_driver()
-     do_bind_con_driver() - private function
-
-give_up_console() is a wrapper to do_unregister_con_driver(), and a driver must
-be fully unbound for this call to succeed. con_is_bound() will check if the
-driver is bound or not.
-
-Guidelines for console driver writers:
-=====================================
-
-In order for binding to and unbinding from the console to properly work,
-console drivers must follow these guidelines:
-
-1. All drivers, except system drivers, must call either do_register_con_driver()
-   or do_take_over_console(). do_register_con_driver() will just add the driver
-   to the console's internal list. It won't take over the
-   console. do_take_over_console(), as it name implies, will also take over (or
-   bind to) the console.
-
-2. All resources allocated during con->con_init() must be released in
-   con->con_deinit().
-
-3. All resources allocated in con->con_startup() must be released when the
-   driver, which was previously bound, becomes unbound.  The console layer
-   does not have a complementary call to con->con_startup() so it's up to the
-   driver to check when it's legal to release these resources. Calling
-   con_is_bound() in con->con_deinit() will help.  If the call returned
-   false(), then it's safe to release the resources.  This balance has to be
-   ensured because con->con_startup() can be called again when a request to
-   rebind the driver to the console arrives.
-
-4. Upon exit of the driver, ensure that the driver is totally unbound. If the
-   condition is satisfied, then the driver must call do_unregister_con_driver()
-   or give_up_console().
-
-5. do_unregister_con_driver() can also be called on conditions which make it
-   impossible for the driver to service console requests.  This can happen
-   with the framebuffer console that suddenly lost all of its drivers.
-
-The current crop of console drivers should still work correctly, but binding
-and unbinding them may cause problems. With minimal fixes, these drivers can
-be made to work correctly.
-
-==========================
-Antonino Daplas <adaplas@pol.net>
-
diff --git a/Documentation/core-api/circular-buffers.rst b/Documentation/core-api/circular-buffers.rst
index 53e51caa3347..50966f66e398 100644
--- a/Documentation/core-api/circular-buffers.rst
+++ b/Documentation/core-api/circular-buffers.rst
@@ -3,7 +3,7 @@ Circular Buffers
 ================
 
 :Author: David Howells <dhowells@redhat.com>
-:Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+:Author: Paul E. McKenney <paulmck@linux.ibm.com>
 
 
 Linux provides a number of features that can be used to implement circular
diff --git a/Documentation/core-api/conf.py b/Documentation/core-api/conf.py
deleted file mode 100644
index db1f7659f3da..000000000000
--- a/Documentation/core-api/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = "Core-API Documentation"
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'core-api.tex', project,
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/gcc-plugins.txt b/Documentation/core-api/gcc-plugins.rst
index 8502f24396fb..8502f24396fb 100644
--- a/Documentation/gcc-plugins.txt
+++ b/Documentation/core-api/gcc-plugins.rst
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index ee1bb8983a88..da0ed972d224 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -34,6 +34,9 @@ Core utilities
    timekeeping
    boot-time-mm
    memory-hotplug
+   protection-keys
+   ../RCU/index
+   gcc-plugins
 
 
 Interfaces for kernel debugging
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index a29c99d13331..08af5caf036d 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -33,6 +33,9 @@ String Conversions
 .. kernel-doc:: lib/kstrtox.c
    :export:
 
+.. kernel-doc:: lib/string_helpers.c
+   :export:
+
 String Manipulation
 -------------------
 
@@ -51,7 +54,7 @@ The Linux kernel provides more basic utility functions.
 Bit Operations
 --------------
 
-.. kernel-doc:: arch/x86/include/asm/bitops.h
+.. kernel-doc:: include/asm-generic/bitops-instrumented.h
    :internal:
 
 Bitmap Operations
@@ -138,6 +141,15 @@ Base 2 log and power Functions
 .. kernel-doc:: include/linux/log2.h
    :internal:
 
+Integer power Functions
+-----------------------
+
+.. kernel-doc:: lib/math/int_pow.c
+   :export:
+
+.. kernel-doc:: lib/math/int_sqrt.c
+   :export:
+
 Division Functions
 ------------------
 
@@ -358,8 +370,6 @@ Read-Copy Update (RCU)
 
 .. kernel-doc:: kernel/rcu/tree.c
 
-.. kernel-doc:: kernel/rcu/tree_plugin.h
-
 .. kernel-doc:: kernel/rcu/tree_exp.h
 
 .. kernel-doc:: kernel/rcu/update.c
diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst
index 75d2bbe9813f..c6224d039bcb 100644
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -119,7 +119,7 @@ Kernel Pointers
 
 For printing kernel pointers which should be hidden from unprivileged
 users. The behaviour of %pK depends on the kptr_restrict sysctl - see
-Documentation/sysctl/kernel.txt for more details.
+Documentation/admin-guide/sysctl/kernel.rst for more details.
 
 Unmodified Addresses
 --------------------
diff --git a/Documentation/x86/protection-keys.rst b/Documentation/core-api/protection-keys.rst
index 49d9833af871..49d9833af871 100644
--- a/Documentation/x86/protection-keys.rst
+++ b/Documentation/core-api/protection-keys.rst
diff --git a/Documentation/core-api/timekeeping.rst b/Documentation/core-api/timekeeping.rst
index 93cbeb9daec0..c0ffa30c7c37 100644
--- a/Documentation/core-api/timekeeping.rst
+++ b/Documentation/core-api/timekeeping.rst
@@ -65,7 +65,7 @@ different format depending on what is required by the user:
 .. c:function:: u64 ktime_get_ns( void )
 		u64 ktime_get_boottime_ns( void )
 		u64 ktime_get_real_ns( void )
-		u64 ktime_get_tai_ns( void )
+		u64 ktime_get_clocktai_ns( void )
 		u64 ktime_get_raw_ns( void )
 
 	Same as the plain ktime_get functions, but returning a u64 number
@@ -99,19 +99,23 @@ Coarse and fast_ns access
 
 Some additional variants exist for more specialized cases:
 
-.. c:function:: ktime_t ktime_get_coarse_boottime( void )
+.. c:function:: ktime_t ktime_get_coarse( void )
+		ktime_t ktime_get_coarse_boottime( void )
 		ktime_t ktime_get_coarse_real( void )
 		ktime_t ktime_get_coarse_clocktai( void )
-		ktime_t ktime_get_coarse_raw( void )
+
+.. c:function:: u64 ktime_get_coarse_ns( void )
+		u64 ktime_get_coarse_boottime_ns( void )
+		u64 ktime_get_coarse_real_ns( void )
+		u64 ktime_get_coarse_clocktai_ns( void )
 
 .. c:function:: void ktime_get_coarse_ts64( struct timespec64 * )
 		void ktime_get_coarse_boottime_ts64( struct timespec64 * )
 		void ktime_get_coarse_real_ts64( struct timespec64 * )
 		void ktime_get_coarse_clocktai_ts64( struct timespec64 * )
-		void ktime_get_coarse_raw_ts64( struct timespec64 * )
 
 	These are quicker than the non-coarse versions, but less accurate,
-	corresponding to CLOCK_MONONOTNIC_COARSE and CLOCK_REALTIME_COARSE
+	corresponding to CLOCK_MONOTONIC_COARSE and CLOCK_REALTIME_COARSE
 	in user space, along with the equivalent boottime/tai/raw
 	timebase not available in user space.
 
diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
index ef6f9f98f595..fcedc5349ace 100644
--- a/Documentation/core-api/xarray.rst
+++ b/Documentation/core-api/xarray.rst
@@ -30,27 +30,27 @@ it called marks.  Each mark may be set or cleared independently of
 the others.  You can iterate over entries which are marked.
 
 Normal pointers may be stored in the XArray directly.  They must be 4-byte
-aligned, which is true for any pointer returned from :c:func:`kmalloc` and
-:c:func:`alloc_page`.  It isn't true for arbitrary user-space pointers,
+aligned, which is true for any pointer returned from kmalloc() and
+alloc_page().  It isn't true for arbitrary user-space pointers,
 nor for function pointers.  You can store pointers to statically allocated
 objects, as long as those objects have an alignment of at least 4.
 
 You can also store integers between 0 and ``LONG_MAX`` in the XArray.
-You must first convert it into an entry using :c:func:`xa_mk_value`.
+You must first convert it into an entry using xa_mk_value().
 When you retrieve an entry from the XArray, you can check whether it is
-a value entry by calling :c:func:`xa_is_value`, and convert it back to
-an integer by calling :c:func:`xa_to_value`.
+a value entry by calling xa_is_value(), and convert it back to
+an integer by calling xa_to_value().
 
 Some users want to store tagged pointers instead of using the marks
-described above.  They can call :c:func:`xa_tag_pointer` to create an
-entry with a tag, :c:func:`xa_untag_pointer` to turn a tagged entry
-back into an untagged pointer and :c:func:`xa_pointer_tag` to retrieve
+described above.  They can call xa_tag_pointer() to create an
+entry with a tag, xa_untag_pointer() to turn a tagged entry
+back into an untagged pointer and xa_pointer_tag() to retrieve
 the tag of an entry.  Tagged pointers use the same bits that are used
 to distinguish value entries from normal pointers, so each user must
 decide whether they want to store value entries or tagged pointers in
 any particular XArray.
 
-The XArray does not support storing :c:func:`IS_ERR` pointers as some
+The XArray does not support storing IS_ERR() pointers as some
 conflict with value entries or internal entries.
 
 An unusual feature of the XArray is the ability to create entries which
@@ -64,89 +64,89 @@ entry will cause the XArray to forget about the range.
 Normal API
 ==========
 
-Start by initialising an XArray, either with :c:func:`DEFINE_XARRAY`
-for statically allocated XArrays or :c:func:`xa_init` for dynamically
+Start by initialising an XArray, either with DEFINE_XARRAY()
+for statically allocated XArrays or xa_init() for dynamically
 allocated ones.  A freshly-initialised XArray contains a ``NULL``
 pointer at every index.
 
-You can then set entries using :c:func:`xa_store` and get entries
-using :c:func:`xa_load`.  xa_store will overwrite any entry with the
+You can then set entries using xa_store() and get entries
+using xa_load().  xa_store will overwrite any entry with the
 new entry and return the previous entry stored at that index.  You can
-use :c:func:`xa_erase` instead of calling :c:func:`xa_store` with a
+use xa_erase() instead of calling xa_store() with a
 ``NULL`` entry.  There is no difference between an entry that has never
 been stored to, one that has been erased and one that has most recently
 had ``NULL`` stored to it.
 
 You can conditionally replace an entry at an index by using
-:c:func:`xa_cmpxchg`.  Like :c:func:`cmpxchg`, it will only succeed if
+xa_cmpxchg().  Like cmpxchg(), it will only succeed if
 the entry at that index has the 'old' value.  It also returns the entry
 which was at that index; if it returns the same entry which was passed as
-'old', then :c:func:`xa_cmpxchg` succeeded.
+'old', then xa_cmpxchg() succeeded.
 
 If you want to only store a new entry to an index if the current entry
-at that index is ``NULL``, you can use :c:func:`xa_insert` which
+at that index is ``NULL``, you can use xa_insert() which
 returns ``-EBUSY`` if the entry is not empty.
 
 You can enquire whether a mark is set on an entry by using
-:c:func:`xa_get_mark`.  If the entry is not ``NULL``, you can set a mark
-on it by using :c:func:`xa_set_mark` and remove the mark from an entry by
-calling :c:func:`xa_clear_mark`.  You can ask whether any entry in the
-XArray has a particular mark set by calling :c:func:`xa_marked`.
+xa_get_mark().  If the entry is not ``NULL``, you can set a mark
+on it by using xa_set_mark() and remove the mark from an entry by
+calling xa_clear_mark().  You can ask whether any entry in the
+XArray has a particular mark set by calling xa_marked().
 
 You can copy entries out of the XArray into a plain array by calling
-:c:func:`xa_extract`.  Or you can iterate over the present entries in
-the XArray by calling :c:func:`xa_for_each`.  You may prefer to use
-:c:func:`xa_find` or :c:func:`xa_find_after` to move to the next present
+xa_extract().  Or you can iterate over the present entries in
+the XArray by calling xa_for_each().  You may prefer to use
+xa_find() or xa_find_after() to move to the next present
 entry in the XArray.
 
-Calling :c:func:`xa_store_range` stores the same entry in a range
+Calling xa_store_range() stores the same entry in a range
 of indices.  If you do this, some of the other operations will behave
 in a slightly odd way.  For example, marking the entry at one index
 may result in the entry being marked at some, but not all of the other
 indices.  Storing into one index may result in the entry retrieved by
 some, but not all of the other indices changing.
 
-Sometimes you need to ensure that a subsequent call to :c:func:`xa_store`
-will not need to allocate memory.  The :c:func:`xa_reserve` function
+Sometimes you need to ensure that a subsequent call to xa_store()
+will not need to allocate memory.  The xa_reserve() function
 will store a reserved entry at the indicated index.  Users of the
 normal API will see this entry as containing ``NULL``.  If you do
-not need to use the reserved entry, you can call :c:func:`xa_release`
+not need to use the reserved entry, you can call xa_release()
 to remove the unused entry.  If another user has stored to the entry
-in the meantime, :c:func:`xa_release` will do nothing; if instead you
-want the entry to become ``NULL``, you should use :c:func:`xa_erase`.
-Using :c:func:`xa_insert` on a reserved entry will fail.
+in the meantime, xa_release() will do nothing; if instead you
+want the entry to become ``NULL``, you should use xa_erase().
+Using xa_insert() on a reserved entry will fail.
 
-If all entries in the array are ``NULL``, the :c:func:`xa_empty` function
+If all entries in the array are ``NULL``, the xa_empty() function
 will return ``true``.
 
 Finally, you can remove all entries from an XArray by calling
-:c:func:`xa_destroy`.  If the XArray entries are pointers, you may wish
+xa_destroy().  If the XArray entries are pointers, you may wish
 to free the entries first.  You can do this by iterating over all present
-entries in the XArray using the :c:func:`xa_for_each` iterator.
+entries in the XArray using the xa_for_each() iterator.
 
 Allocating XArrays
 ------------------
 
-If you use :c:func:`DEFINE_XARRAY_ALLOC` to define the XArray, or
-initialise it by passing ``XA_FLAGS_ALLOC`` to :c:func:`xa_init_flags`,
+If you use DEFINE_XARRAY_ALLOC() to define the XArray, or
+initialise it by passing ``XA_FLAGS_ALLOC`` to xa_init_flags(),
 the XArray changes to track whether entries are in use or not.
 
-You can call :c:func:`xa_alloc` to store the entry at an unused index
+You can call xa_alloc() to store the entry at an unused index
 in the XArray.  If you need to modify the array from interrupt context,
-you can use :c:func:`xa_alloc_bh` or :c:func:`xa_alloc_irq` to disable
+you can use xa_alloc_bh() or xa_alloc_irq() to disable
 interrupts while allocating the ID.
 
-Using :c:func:`xa_store`, :c:func:`xa_cmpxchg` or :c:func:`xa_insert` will
+Using xa_store(), xa_cmpxchg() or xa_insert() will
 also mark the entry as being allocated.  Unlike a normal XArray, storing
-``NULL`` will mark the entry as being in use, like :c:func:`xa_reserve`.
-To free an entry, use :c:func:`xa_erase` (or :c:func:`xa_release` if
+``NULL`` will mark the entry as being in use, like xa_reserve().
+To free an entry, use xa_erase() (or xa_release() if
 you only want to free the entry if it's ``NULL``).
 
 By default, the lowest free entry is allocated starting from 0.  If you
 want to allocate entries starting at 1, it is more efficient to use
-:c:func:`DEFINE_XARRAY_ALLOC1` or ``XA_FLAGS_ALLOC1``.  If you want to
+DEFINE_XARRAY_ALLOC1() or ``XA_FLAGS_ALLOC1``.  If you want to
 allocate IDs up to a maximum, then wrap back around to the lowest free
-ID, you can use :c:func:`xa_alloc_cyclic`.
+ID, you can use xa_alloc_cyclic().
 
 You cannot use ``XA_MARK_0`` with an allocating XArray as this mark
 is used to track whether an entry is free or not.  The other marks are
@@ -155,17 +155,17 @@ available for your use.
 Memory allocation
 -----------------
 
-The :c:func:`xa_store`, :c:func:`xa_cmpxchg`, :c:func:`xa_alloc`,
-:c:func:`xa_reserve` and :c:func:`xa_insert` functions take a gfp_t
+The xa_store(), xa_cmpxchg(), xa_alloc(),
+xa_reserve() and xa_insert() functions take a gfp_t
 parameter in case the XArray needs to allocate memory to store this entry.
 If the entry is being deleted, no memory allocation needs to be performed,
 and the GFP flags specified will be ignored.
 
 It is possible for no memory to be allocatable, particularly if you pass
 a restrictive set of GFP flags.  In that case, the functions return a
-special value which can be turned into an errno using :c:func:`xa_err`.
+special value which can be turned into an errno using xa_err().
 If you don't need to know exactly which error occurred, using
-:c:func:`xa_is_err` is slightly more efficient.
+xa_is_err() is slightly more efficient.
 
 Locking
 -------
@@ -174,54 +174,54 @@ When using the Normal API, you do not have to worry about locking.
 The XArray uses RCU and an internal spinlock to synchronise access:
 
 No lock needed:
- * :c:func:`xa_empty`
- * :c:func:`xa_marked`
+ * xa_empty()
+ * xa_marked()
 
 Takes RCU read lock:
- * :c:func:`xa_load`
- * :c:func:`xa_for_each`
- * :c:func:`xa_find`
- * :c:func:`xa_find_after`
- * :c:func:`xa_extract`
- * :c:func:`xa_get_mark`
+ * xa_load()
+ * xa_for_each()
+ * xa_find()
+ * xa_find_after()
+ * xa_extract()
+ * xa_get_mark()
 
 Takes xa_lock internally:
- * :c:func:`xa_store`
- * :c:func:`xa_store_bh`
- * :c:func:`xa_store_irq`
- * :c:func:`xa_insert`
- * :c:func:`xa_insert_bh`
- * :c:func:`xa_insert_irq`
- * :c:func:`xa_erase`
- * :c:func:`xa_erase_bh`
- * :c:func:`xa_erase_irq`
- * :c:func:`xa_cmpxchg`
- * :c:func:`xa_cmpxchg_bh`
- * :c:func:`xa_cmpxchg_irq`
- * :c:func:`xa_store_range`
- * :c:func:`xa_alloc`
- * :c:func:`xa_alloc_bh`
- * :c:func:`xa_alloc_irq`
- * :c:func:`xa_reserve`
- * :c:func:`xa_reserve_bh`
- * :c:func:`xa_reserve_irq`
- * :c:func:`xa_destroy`
- * :c:func:`xa_set_mark`
- * :c:func:`xa_clear_mark`
+ * xa_store()
+ * xa_store_bh()
+ * xa_store_irq()
+ * xa_insert()
+ * xa_insert_bh()
+ * xa_insert_irq()
+ * xa_erase()
+ * xa_erase_bh()
+ * xa_erase_irq()
+ * xa_cmpxchg()
+ * xa_cmpxchg_bh()
+ * xa_cmpxchg_irq()
+ * xa_store_range()
+ * xa_alloc()
+ * xa_alloc_bh()
+ * xa_alloc_irq()
+ * xa_reserve()
+ * xa_reserve_bh()
+ * xa_reserve_irq()
+ * xa_destroy()
+ * xa_set_mark()
+ * xa_clear_mark()
 
 Assumes xa_lock held on entry:
- * :c:func:`__xa_store`
- * :c:func:`__xa_insert`
- * :c:func:`__xa_erase`
- * :c:func:`__xa_cmpxchg`
- * :c:func:`__xa_alloc`
- * :c:func:`__xa_set_mark`
- * :c:func:`__xa_clear_mark`
+ * __xa_store()
+ * __xa_insert()
+ * __xa_erase()
+ * __xa_cmpxchg()
+ * __xa_alloc()
+ * __xa_set_mark()
+ * __xa_clear_mark()
 
 If you want to take advantage of the lock to protect the data structures
-that you are storing in the XArray, you can call :c:func:`xa_lock`
-before calling :c:func:`xa_load`, then take a reference count on the
-object you have found before calling :c:func:`xa_unlock`.  This will
+that you are storing in the XArray, you can call xa_lock()
+before calling xa_load(), then take a reference count on the
+object you have found before calling xa_unlock().  This will
 prevent stores from removing the object from the array between looking
 up the object and incrementing the refcount.  You can also use RCU to
 avoid dereferencing freed memory, but an explanation of that is beyond
@@ -261,7 +261,7 @@ context and then erase them in softirq context, you can do that this way::
     }
 
 If you are going to modify the XArray from interrupt or softirq context,
-you need to initialise the array using :c:func:`xa_init_flags`, passing
+you need to initialise the array using xa_init_flags(), passing
 ``XA_FLAGS_LOCK_IRQ`` or ``XA_FLAGS_LOCK_BH``.
 
 The above example also shows a common pattern of wanting to extend the
@@ -269,20 +269,20 @@ coverage of the xa_lock on the store side to protect some statistics
 associated with the array.
 
 Sharing the XArray with interrupt context is also possible, either
-using :c:func:`xa_lock_irqsave` in both the interrupt handler and process
-context, or :c:func:`xa_lock_irq` in process context and :c:func:`xa_lock`
+using xa_lock_irqsave() in both the interrupt handler and process
+context, or xa_lock_irq() in process context and xa_lock()
 in the interrupt handler.  Some of the more common patterns have helper
-functions such as :c:func:`xa_store_bh`, :c:func:`xa_store_irq`,
-:c:func:`xa_erase_bh`, :c:func:`xa_erase_irq`, :c:func:`xa_cmpxchg_bh`
-and :c:func:`xa_cmpxchg_irq`.
+functions such as xa_store_bh(), xa_store_irq(),
+xa_erase_bh(), xa_erase_irq(), xa_cmpxchg_bh()
+and xa_cmpxchg_irq().
 
 Sometimes you need to protect access to the XArray with a mutex because
 that lock sits above another mutex in the locking hierarchy.  That does
-not entitle you to use functions like :c:func:`__xa_erase` without taking
+not entitle you to use functions like __xa_erase() without taking
 the xa_lock; the xa_lock is used for lockdep validation and will be used
 for other purposes in the future.
 
-The :c:func:`__xa_set_mark` and :c:func:`__xa_clear_mark` functions are also
+The __xa_set_mark() and __xa_clear_mark() functions are also
 available for situations where you look up an entry and want to atomically
 set or clear a mark.  It may be more efficient to use the advanced API
 in this case, as it will save you from walking the tree twice.
@@ -300,27 +300,27 @@ indeed the normal API is implemented in terms of the advanced API.  The
 advanced API is only available to modules with a GPL-compatible license.
 
 The advanced API is based around the xa_state.  This is an opaque data
-structure which you declare on the stack using the :c:func:`XA_STATE`
+structure which you declare on the stack using the XA_STATE()
 macro.  This macro initialises the xa_state ready to start walking
 around the XArray.  It is used as a cursor to maintain the position
 in the XArray and let you compose various operations together without
 having to restart from the top every time.
 
 The xa_state is also used to store errors.  You can call
-:c:func:`xas_error` to retrieve the error.  All operations check whether
+xas_error() to retrieve the error.  All operations check whether
 the xa_state is in an error state before proceeding, so there's no need
 for you to check for an error after each call; you can make multiple
 calls in succession and only check at a convenient point.  The only
 errors currently generated by the XArray code itself are ``ENOMEM`` and
 ``EINVAL``, but it supports arbitrary errors in case you want to call
-:c:func:`xas_set_err` yourself.
+xas_set_err() yourself.
 
-If the xa_state is holding an ``ENOMEM`` error, calling :c:func:`xas_nomem`
+If the xa_state is holding an ``ENOMEM`` error, calling xas_nomem()
 will attempt to allocate more memory using the specified gfp flags and
 cache it in the xa_state for the next attempt.  The idea is that you take
 the xa_lock, attempt the operation and drop the lock.  The operation
 attempts to allocate memory while holding the lock, but it is more
-likely to fail.  Once you have dropped the lock, :c:func:`xas_nomem`
+likely to fail.  Once you have dropped the lock, xas_nomem()
 can try harder to allocate more memory.  It will return ``true`` if it
 is worth retrying the operation (i.e. that there was a memory error *and*
 more memory was allocated).  If it has previously allocated memory, and
@@ -333,7 +333,7 @@ Internal Entries
 The XArray reserves some entries for its own purposes.  These are never
 exposed through the normal API, but when using the advanced API, it's
 possible to see them.  Usually the best way to handle them is to pass them
-to :c:func:`xas_retry`, and retry the operation if it returns ``true``.
+to xas_retry(), and retry the operation if it returns ``true``.
 
 .. flat-table::
    :widths: 1 1 6
@@ -343,89 +343,89 @@ to :c:func:`xas_retry`, and retry the operation if it returns ``true``.
      - Usage
 
    * - Node
-     - :c:func:`xa_is_node`
+     - xa_is_node()
      - An XArray node.  May be visible when using a multi-index xa_state.
 
    * - Sibling
-     - :c:func:`xa_is_sibling`
+     - xa_is_sibling()
      - A non-canonical entry for a multi-index entry.  The value indicates
        which slot in this node has the canonical entry.
 
    * - Retry
-     - :c:func:`xa_is_retry`
+     - xa_is_retry()
      - This entry is currently being modified by a thread which has the
        xa_lock.  The node containing this entry may be freed at the end
        of this RCU period.  You should restart the lookup from the head
        of the array.
 
    * - Zero
-     - :c:func:`xa_is_zero`
+     - xa_is_zero()
      - Zero entries appear as ``NULL`` through the Normal API, but occupy
        an entry in the XArray which can be used to reserve the index for
        future use.  This is used by allocating XArrays for allocated entries
        which are ``NULL``.
 
 Other internal entries may be added in the future.  As far as possible, they
-will be handled by :c:func:`xas_retry`.
+will be handled by xas_retry().
 
 Additional functionality
 ------------------------
 
-The :c:func:`xas_create_range` function allocates all the necessary memory
+The xas_create_range() function allocates all the necessary memory
 to store every entry in a range.  It will set ENOMEM in the xa_state if
 it cannot allocate memory.
 
-You can use :c:func:`xas_init_marks` to reset the marks on an entry
+You can use xas_init_marks() to reset the marks on an entry
 to their default state.  This is usually all marks clear, unless the
 XArray is marked with ``XA_FLAGS_TRACK_FREE``, in which case mark 0 is set
 and all other marks are clear.  Replacing one entry with another using
-:c:func:`xas_store` will not reset the marks on that entry; if you want
+xas_store() will not reset the marks on that entry; if you want
 the marks reset, you should do that explicitly.
 
-The :c:func:`xas_load` will walk the xa_state as close to the entry
+The xas_load() will walk the xa_state as close to the entry
 as it can.  If you know the xa_state has already been walked to the
 entry and need to check that the entry hasn't changed, you can use
-:c:func:`xas_reload` to save a function call.
+xas_reload() to save a function call.
 
 If you need to move to a different index in the XArray, call
-:c:func:`xas_set`.  This resets the cursor to the top of the tree, which
+xas_set().  This resets the cursor to the top of the tree, which
 will generally make the next operation walk the cursor to the desired
 spot in the tree.  If you want to move to the next or previous index,
-call :c:func:`xas_next` or :c:func:`xas_prev`.  Setting the index does
+call xas_next() or xas_prev().  Setting the index does
 not walk the cursor around the array so does not require a lock to be
 held, while moving to the next or previous index does.
 
-You can search for the next present entry using :c:func:`xas_find`.  This
-is the equivalent of both :c:func:`xa_find` and :c:func:`xa_find_after`;
+You can search for the next present entry using xas_find().  This
+is the equivalent of both xa_find() and xa_find_after();
 if the cursor has been walked to an entry, then it will find the next
 entry after the one currently referenced.  If not, it will return the
-entry at the index of the xa_state.  Using :c:func:`xas_next_entry` to
-move to the next present entry instead of :c:func:`xas_find` will save
+entry at the index of the xa_state.  Using xas_next_entry() to
+move to the next present entry instead of xas_find() will save
 a function call in the majority of cases at the expense of emitting more
 inline code.
 
-The :c:func:`xas_find_marked` function is similar.  If the xa_state has
+The xas_find_marked() function is similar.  If the xa_state has
 not been walked, it will return the entry at the index of the xa_state,
 if it is marked.  Otherwise, it will return the first marked entry after
-the entry referenced by the xa_state.  The :c:func:`xas_next_marked`
-function is the equivalent of :c:func:`xas_next_entry`.
+the entry referenced by the xa_state.  The xas_next_marked()
+function is the equivalent of xas_next_entry().
 
-When iterating over a range of the XArray using :c:func:`xas_for_each`
-or :c:func:`xas_for_each_marked`, it may be necessary to temporarily stop
-the iteration.  The :c:func:`xas_pause` function exists for this purpose.
+When iterating over a range of the XArray using xas_for_each()
+or xas_for_each_marked(), it may be necessary to temporarily stop
+the iteration.  The xas_pause() function exists for this purpose.
 After you have done the necessary work and wish to resume, the xa_state
 is in an appropriate state to continue the iteration after the entry
 you last processed.  If you have interrupts disabled while iterating,
 then it is good manners to pause the iteration and reenable interrupts
 every ``XA_CHECK_SCHED`` entries.
 
-The :c:func:`xas_get_mark`, :c:func:`xas_set_mark` and
-:c:func:`xas_clear_mark` functions require the xa_state cursor to have
+The xas_get_mark(), xas_set_mark() and
+xas_clear_mark() functions require the xa_state cursor to have
 been moved to the appropriate location in the xarray; they will do
-nothing if you have called :c:func:`xas_pause` or :c:func:`xas_set`
+nothing if you have called xas_pause() or xas_set()
 immediately before.
 
-You can call :c:func:`xas_set_update` to have a callback function
+You can call xas_set_update() to have a callback function
 called each time the XArray updates a node.  This is used by the page
 cache workingset code to maintain its list of nodes which contain only
 shadow entries.
@@ -443,25 +443,25 @@ eg indices 64-127 may be tied together, but 2-6 may not be.  This may
 save substantial quantities of memory; for example tying 512 entries
 together will save over 4kB.
 
-You can create a multi-index entry by using :c:func:`XA_STATE_ORDER`
-or :c:func:`xas_set_order` followed by a call to :c:func:`xas_store`.
-Calling :c:func:`xas_load` with a multi-index xa_state will walk the
+You can create a multi-index entry by using XA_STATE_ORDER()
+or xas_set_order() followed by a call to xas_store().
+Calling xas_load() with a multi-index xa_state will walk the
 xa_state to the right location in the tree, but the return value is not
 meaningful, potentially being an internal entry or ``NULL`` even when there
-is an entry stored within the range.  Calling :c:func:`xas_find_conflict`
+is an entry stored within the range.  Calling xas_find_conflict()
 will return the first entry within the range or ``NULL`` if there are no
-entries in the range.  The :c:func:`xas_for_each_conflict` iterator will
+entries in the range.  The xas_for_each_conflict() iterator will
 iterate over every entry which overlaps the specified range.
 
-If :c:func:`xas_load` encounters a multi-index entry, the xa_index
+If xas_load() encounters a multi-index entry, the xa_index
 in the xa_state will not be changed.  When iterating over an XArray
-or calling :c:func:`xas_find`, if the initial index is in the middle
+or calling xas_find(), if the initial index is in the middle
 of a multi-index entry, it will not be altered.  Subsequent calls
 or iterations will move the index to the first index in the range.
 Each entry will only be returned once, no matter how many indices it
 occupies.
 
-Using :c:func:`xas_next` or :c:func:`xas_prev` with a multi-index xa_state
+Using xas_next() or xas_prev() with a multi-index xa_state
 is not supported.  Using either of these functions on a multi-index entry
 will reveal sibling entries; these should be skipped over by the caller.
 
diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt
index 073f128af5a7..55193e680250 100644
--- a/Documentation/cpu-freq/core.txt
+++ b/Documentation/cpu-freq/core.txt
@@ -95,7 +95,7 @@ flags	- flags of the cpufreq driver
 
 3. CPUFreq Table Generation with Operating Performance Point (OPP)
 ==================================================================
-For details about OPP, see Documentation/power/opp.txt
+For details about OPP, see Documentation/power/opp.rst
 
 dev_pm_opp_init_cpufreq_table -
 	This function provides a ready to use conversion routine to translate
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
deleted file mode 100644
index cb61277e2308..000000000000
--- a/Documentation/cputopology.txt
+++ /dev/null
@@ -1,159 +0,0 @@
-===========================================
-How CPU topology info is exported via sysfs
-===========================================
-
-Export CPU topology info via sysfs. Items (attributes) are similar
-to /proc/cpuinfo output of some architectures.  They reside in
-/sys/devices/system/cpu/cpuX/topology/:
-
-physical_package_id:
-
-	physical package id of cpuX. Typically corresponds to a physical
-	socket number, but the actual value is architecture and platform
-	dependent.
-
-core_id:
-
-	the CPU core ID of cpuX. Typically it is the hardware platform's
-	identifier (rather than the kernel's).  The actual value is
-	architecture and platform dependent.
-
-book_id:
-
-	the book ID of cpuX. Typically it is the hardware platform's
-	identifier (rather than the kernel's).	The actual value is
-	architecture and platform dependent.
-
-drawer_id:
-
-	the drawer ID of cpuX. Typically it is the hardware platform's
-	identifier (rather than the kernel's).	The actual value is
-	architecture and platform dependent.
-
-thread_siblings:
-
-	internal kernel map of cpuX's hardware threads within the same
-	core as cpuX.
-
-thread_siblings_list:
-
-	human-readable list of cpuX's hardware threads within the same
-	core as cpuX.
-
-core_siblings:
-
-	internal kernel map of cpuX's hardware threads within the same
-	physical_package_id.
-
-core_siblings_list:
-
-	human-readable list of cpuX's hardware threads within the same
-	physical_package_id.
-
-book_siblings:
-
-	internal kernel map of cpuX's hardware threads within the same
-	book_id.
-
-book_siblings_list:
-
-	human-readable list of cpuX's hardware threads within the same
-	book_id.
-
-drawer_siblings:
-
-	internal kernel map of cpuX's hardware threads within the same
-	drawer_id.
-
-drawer_siblings_list:
-
-	human-readable list of cpuX's hardware threads within the same
-	drawer_id.
-
-Architecture-neutral, drivers/base/topology.c, exports these attributes.
-However, the book and drawer related sysfs files will only be created if
-CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively.
-
-CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
-where they reflect the cpu and cache hierarchy.
-
-For an architecture to support this feature, it must define some of
-these macros in include/asm-XXX/topology.h::
-
-	#define topology_physical_package_id(cpu)
-	#define topology_core_id(cpu)
-	#define topology_book_id(cpu)
-	#define topology_drawer_id(cpu)
-	#define topology_sibling_cpumask(cpu)
-	#define topology_core_cpumask(cpu)
-	#define topology_book_cpumask(cpu)
-	#define topology_drawer_cpumask(cpu)
-
-The type of ``**_id macros`` is int.
-The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter
-correspond with appropriate ``**_siblings`` sysfs attributes (except for
-topology_sibling_cpumask() which corresponds with thread_siblings).
-
-To be consistent on all architectures, include/linux/topology.h
-provides default definitions for any of the above macros that are
-not defined by include/asm-XXX/topology.h:
-
-1) topology_physical_package_id: -1
-2) topology_core_id: 0
-3) topology_sibling_cpumask: just the given CPU
-4) topology_core_cpumask: just the given CPU
-
-For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
-default definitions for topology_book_id() and topology_book_cpumask().
-For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are
-no default definitions for topology_drawer_id() and topology_drawer_cpumask().
-
-Additionally, CPU topology information is provided under
-/sys/devices/system/cpu and includes these files.  The internal
-source for the output is in brackets ("[]").
-
-    =========== ==========================================================
-    kernel_max: the maximum CPU index allowed by the kernel configuration.
-		[NR_CPUS-1]
-
-    offline:	CPUs that are not online because they have been
-		HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit
-		of CPUs allowed by the kernel configuration (kernel_max
-		above). [~cpu_online_mask + cpus >= NR_CPUS]
-
-    online:	CPUs that are online and being scheduled [cpu_online_mask]
-
-    possible:	CPUs that have been allocated resources and can be
-		brought online if they are present. [cpu_possible_mask]
-
-    present:	CPUs that have been identified as being present in the
-		system. [cpu_present_mask]
-    =========== ==========================================================
-
-The format for the above output is compatible with cpulist_parse()
-[see <linux/cpumask.h>].  Some examples follow.
-
-In this example, there are 64 CPUs in the system but cpus 32-63 exceed
-the kernel max which is limited to 0..31 by the NR_CPUS config option
-being 32.  Note also that CPUs 2 and 4-31 are not online but could be
-brought online as they are both present and possible::
-
-     kernel_max: 31
-        offline: 2,4-31,32-63
-         online: 0-1,3
-       possible: 0-31
-        present: 0-31
-
-In this example, the NR_CPUS config option is 128, but the kernel was
-started with possible_cpus=144.  There are 4 CPUs in the system and cpu2
-was manually taken offline (and is the only CPU that can be brought
-online.)::
-
-     kernel_max: 127
-        offline: 2,4-127,128-143
-         online: 0-1,3
-       possible: 0-127
-        present: 0-3
-
-See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter
-as well as more information on the various cpumasks.
diff --git a/Documentation/crypto/api-samples.rst b/Documentation/crypto/api-samples.rst
index f14afaaf2f32..e923f17bc2bd 100644
--- a/Documentation/crypto/api-samples.rst
+++ b/Documentation/crypto/api-samples.rst
@@ -4,111 +4,89 @@ Code Examples
 Code Example For Symmetric Key Cipher Operation
 -----------------------------------------------
 
-::
-
-
-    /* tie all data structures together */
-    struct skcipher_def {
-        struct scatterlist sg;
-        struct crypto_skcipher *tfm;
-        struct skcipher_request *req;
-        struct crypto_wait wait;
-    };
-
-    /* Perform cipher operation */
-    static unsigned int test_skcipher_encdec(struct skcipher_def *sk,
-                         int enc)
-    {
-        int rc;
-
-        if (enc)
-            rc = crypto_wait_req(crypto_skcipher_encrypt(sk->req), &sk->wait);
-        else
-            rc = crypto_wait_req(crypto_skcipher_decrypt(sk->req), &sk->wait);
-
-	if (rc)
-		pr_info("skcipher encrypt returned with result %d\n", rc);
+This code encrypts some data with AES-256-XTS.  For sake of example,
+all inputs are random bytes, the encryption is done in-place, and it's
+assumed the code is running in a context where it can sleep.
 
-        return rc;
-    }
+::
 
-    /* Initialize and trigger cipher operation */
     static int test_skcipher(void)
     {
-        struct skcipher_def sk;
-        struct crypto_skcipher *skcipher = NULL;
-        struct skcipher_request *req = NULL;
-        char *scratchpad = NULL;
-        char *ivdata = NULL;
-        unsigned char key[32];
-        int ret = -EFAULT;
-
-        skcipher = crypto_alloc_skcipher("cbc-aes-aesni", 0, 0);
-        if (IS_ERR(skcipher)) {
-            pr_info("could not allocate skcipher handle\n");
-            return PTR_ERR(skcipher);
-        }
-
-        req = skcipher_request_alloc(skcipher, GFP_KERNEL);
-        if (!req) {
-            pr_info("could not allocate skcipher request\n");
-            ret = -ENOMEM;
-            goto out;
-        }
-
-        skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
-                          crypto_req_done,
-                          &sk.wait);
-
-        /* AES 256 with random key */
-        get_random_bytes(&key, 32);
-        if (crypto_skcipher_setkey(skcipher, key, 32)) {
-            pr_info("key could not be set\n");
-            ret = -EAGAIN;
-            goto out;
-        }
-
-        /* IV will be random */
-        ivdata = kmalloc(16, GFP_KERNEL);
-        if (!ivdata) {
-            pr_info("could not allocate ivdata\n");
-            goto out;
-        }
-        get_random_bytes(ivdata, 16);
-
-        /* Input data will be random */
-        scratchpad = kmalloc(16, GFP_KERNEL);
-        if (!scratchpad) {
-            pr_info("could not allocate scratchpad\n");
-            goto out;
-        }
-        get_random_bytes(scratchpad, 16);
-
-        sk.tfm = skcipher;
-        sk.req = req;
-
-        /* We encrypt one block */
-        sg_init_one(&sk.sg, scratchpad, 16);
-        skcipher_request_set_crypt(req, &sk.sg, &sk.sg, 16, ivdata);
-        crypto_init_wait(&sk.wait);
-
-        /* encrypt data */
-        ret = test_skcipher_encdec(&sk, 1);
-        if (ret)
-            goto out;
-
-        pr_info("Encryption triggered successfully\n");
-
+            struct crypto_skcipher *tfm = NULL;
+            struct skcipher_request *req = NULL;
+            u8 *data = NULL;
+            const size_t datasize = 512; /* data size in bytes */
+            struct scatterlist sg;
+            DECLARE_CRYPTO_WAIT(wait);
+            u8 iv[16];  /* AES-256-XTS takes a 16-byte IV */
+            u8 key[64]; /* AES-256-XTS takes a 64-byte key */
+            int err;
+
+            /*
+             * Allocate a tfm (a transformation object) and set the key.
+             *
+             * In real-world use, a tfm and key are typically used for many
+             * encryption/decryption operations.  But in this example, we'll just do a
+             * single encryption operation with it (which is not very efficient).
+             */
+
+            tfm = crypto_alloc_skcipher("xts(aes)", 0, 0);
+            if (IS_ERR(tfm)) {
+                    pr_err("Error allocating xts(aes) handle: %ld\n", PTR_ERR(tfm));
+                    return PTR_ERR(tfm);
+            }
+
+            get_random_bytes(key, sizeof(key));
+            err = crypto_skcipher_setkey(tfm, key, sizeof(key));
+            if (err) {
+                    pr_err("Error setting key: %d\n", err);
+                    goto out;
+            }
+
+            /* Allocate a request object */
+            req = skcipher_request_alloc(tfm, GFP_KERNEL);
+            if (!req) {
+                    err = -ENOMEM;
+                    goto out;
+            }
+
+            /* Prepare the input data */
+            data = kmalloc(datasize, GFP_KERNEL);
+            if (!data) {
+                    err = -ENOMEM;
+                    goto out;
+            }
+            get_random_bytes(data, datasize);
+
+            /* Initialize the IV */
+            get_random_bytes(iv, sizeof(iv));
+
+            /*
+             * Encrypt the data in-place.
+             *
+             * For simplicity, in this example we wait for the request to complete
+             * before proceeding, even if the underlying implementation is asynchronous.
+             *
+             * To decrypt instead of encrypt, just change crypto_skcipher_encrypt() to
+             * crypto_skcipher_decrypt().
+             */
+            sg_init_one(&sg, data, datasize);
+            skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
+                                               CRYPTO_TFM_REQ_MAY_SLEEP,
+                                          crypto_req_done, &wait);
+            skcipher_request_set_crypt(req, &sg, &sg, datasize, iv);
+            err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+            if (err) {
+                    pr_err("Error encrypting data: %d\n", err);
+                    goto out;
+            }
+
+            pr_debug("Encryption was successful\n");
     out:
-        if (skcipher)
-            crypto_free_skcipher(skcipher);
-        if (req)
+            crypto_free_skcipher(tfm);
             skcipher_request_free(req);
-        if (ivdata)
-            kfree(ivdata);
-        if (scratchpad)
-            kfree(scratchpad);
-        return ret;
+            kfree(data);
+            return err;
     }
 
 
diff --git a/Documentation/crypto/api-skcipher.rst b/Documentation/crypto/api-skcipher.rst
index 4eec4a93f7e3..20ba08dddf2e 100644
--- a/Documentation/crypto/api-skcipher.rst
+++ b/Documentation/crypto/api-skcipher.rst
@@ -5,7 +5,7 @@ Block Cipher Algorithm Definitions
    :doc: Block Cipher Algorithm Definitions
 
 .. kernel-doc:: include/linux/crypto.h
-   :functions: crypto_alg ablkcipher_alg blkcipher_alg cipher_alg
+   :functions: crypto_alg ablkcipher_alg blkcipher_alg cipher_alg compress_alg
 
 Symmetric Key Cipher API
 ------------------------
diff --git a/Documentation/crypto/architecture.rst b/Documentation/crypto/architecture.rst
index ee8ff0762d7f..3eae1ae7f798 100644
--- a/Documentation/crypto/architecture.rst
+++ b/Documentation/crypto/architecture.rst
@@ -208,9 +208,7 @@ the aforementioned cipher types:
 -  CRYPTO_ALG_TYPE_KPP Key-agreement Protocol Primitive (KPP) such as
    an ECDH or DH implementation
 
--  CRYPTO_ALG_TYPE_DIGEST Raw message digest
-
--  CRYPTO_ALG_TYPE_HASH Alias for CRYPTO_ALG_TYPE_DIGEST
+-  CRYPTO_ALG_TYPE_HASH Raw message digest
 
 -  CRYPTO_ALG_TYPE_SHASH Synchronous multi-block hash
 
diff --git a/Documentation/crypto/conf.py b/Documentation/crypto/conf.py
deleted file mode 100644
index 4335d251ddf3..000000000000
--- a/Documentation/crypto/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = 'Linux Kernel Crypto API'
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'crypto-api.tex', 'Linux Kernel Crypto API manual',
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/crypto/crypto_engine.rst b/Documentation/crypto/crypto_engine.rst
index 1d56221dfe35..236c674d6897 100644
--- a/Documentation/crypto/crypto_engine.rst
+++ b/Documentation/crypto/crypto_engine.rst
@@ -1,50 +1,85 @@
-=============
-CRYPTO ENGINE
+.. SPDX-License-Identifier: GPL-2.0
+Crypto Engine
 =============
 
 Overview
 --------
-The crypto engine API (CE), is a crypto queue manager.
+The crypto engine (CE) API is a crypto queue manager.
 
 Requirement
 -----------
-You have to put at start of your tfm_ctx the struct crypto_engine_ctx::
+You must put, at the start of your transform context your_tfm_ctx, the structure
+crypto_engine:
+
+::
 
-  struct your_tfm_ctx {
-        struct crypto_engine_ctx enginectx;
-        ...
-  };
+	struct your_tfm_ctx {
+		struct crypto_engine engine;
+		...
+	};
 
-Why: Since CE manage only crypto_async_request, it cannot know the underlying
-request_type and so have access only on the TFM.
-So using container_of for accessing __ctx is impossible.
-Furthermore, the crypto engine cannot know the "struct your_tfm_ctx",
-so it must assume that crypto_engine_ctx is at start of it.
+The crypto engine only manages asynchronous requests in the form of
+crypto_async_request. It cannot know the underlying request type and thus only
+has access to the transform structure. It is not possible to access the context
+using container_of. In addition, the engine knows nothing about your
+structure "``struct your_tfm_ctx``". The engine assumes (requires) the placement
+of the known member ``struct crypto_engine`` at the beginning.
 
 Order of operations
 -------------------
-You have to obtain a struct crypto_engine via crypto_engine_alloc_init().
-And start it via crypto_engine_start().
-
-Before transferring any request, you have to fill the enginectx.
-- prepare_request: (taking a function pointer) If you need to do some processing before doing the request
-- unprepare_request: (taking a function pointer) Undoing what's done in prepare_request
-- do_one_request: (taking a function pointer) Do encryption for current request
-
-Note: that those three functions get the crypto_async_request associated with the received request.
-So your need to get the original request via container_of(areq, struct yourrequesttype_request, base);
-
-When your driver receive a crypto_request, you have to transfer it to
-the cryptoengine via one of:
-- crypto_transfer_ablkcipher_request_to_engine()
-- crypto_transfer_aead_request_to_engine()
-- crypto_transfer_akcipher_request_to_engine()
-- crypto_transfer_hash_request_to_engine()
-- crypto_transfer_skcipher_request_to_engine()
-
-At the end of the request process, a call to one of the following function is needed:
-- crypto_finalize_ablkcipher_request
-- crypto_finalize_aead_request
-- crypto_finalize_akcipher_request
-- crypto_finalize_hash_request
-- crypto_finalize_skcipher_request
+You are required to obtain a struct crypto_engine via ``crypto_engine_alloc_init()``.
+Start it via ``crypto_engine_start()``. When finished with your work, shut down the
+engine using ``crypto_engine_stop()`` and destroy the engine with
+``crypto_engine_exit()``.
+
+Before transferring any request, you have to fill the context enginectx by
+providing functions for the following:
+
+* ``prepare_crypt_hardware``: Called once before any prepare functions are
+  called.
+
+* ``unprepare_crypt_hardware``: Called once after all unprepare functions have
+  been called.
+
+* ``prepare_cipher_request``/``prepare_hash_request``: Called before each
+  corresponding request is performed. If some processing or other preparatory
+  work is required, do it here.
+
+* ``unprepare_cipher_request``/``unprepare_hash_request``: Called after each
+  request is handled. Clean up / undo what was done in the prepare function.
+
+* ``cipher_one_request``/``hash_one_request``: Handle the current request by
+  performing the operation.
+
+Note that these functions access the crypto_async_request structure
+associated with the received request. You are able to retrieve the original
+request by using:
+
+::
+
+	container_of(areq, struct yourrequesttype_request, base);
+
+When your driver receives a crypto_request, you must to transfer it to
+the crypto engine via one of:
+
+* crypto_transfer_ablkcipher_request_to_engine()
+
+* crypto_transfer_aead_request_to_engine()
+
+* crypto_transfer_akcipher_request_to_engine()
+
+* crypto_transfer_hash_request_to_engine()
+
+* crypto_transfer_skcipher_request_to_engine()
+
+At the end of the request process, a call to one of the following functions is needed:
+
+* crypto_finalize_ablkcipher_request()
+
+* crypto_finalize_aead_request()
+
+* crypto_finalize_akcipher_request()
+
+* crypto_finalize_hash_request()
+
+* crypto_finalize_skcipher_request()
diff --git a/Documentation/dev-tools/conf.py b/Documentation/dev-tools/conf.py
deleted file mode 100644
index 7faafa3f7888..000000000000
--- a/Documentation/dev-tools/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = "Development tools for the kernel"
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'dev-tools.tex', project,
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst
index e6f51260ff32..3621cd5e1eef 100644
--- a/Documentation/dev-tools/kmemleak.rst
+++ b/Documentation/dev-tools/kmemleak.rst
@@ -2,8 +2,8 @@ Kernel Memory Leak Detector
 ===========================
 
 Kmemleak provides a way of detecting possible kernel memory leaks in a
-way similar to a tracing garbage collector
-(https://en.wikipedia.org/wiki/Garbage_collection_%28computer_science%29#Tracing_garbage_collectors),
+way similar to a `tracing garbage collector
+<https://en.wikipedia.org/wiki/Tracing_garbage_collection>`_,
 with the difference that the orphan objects are not freed but only
 reported via /sys/kernel/debug/kmemleak. A similar method is used by the
 Valgrind tool (``memcheck --leak-check``) to detect the memory leaks in
@@ -15,10 +15,13 @@ Usage
 
 CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel
 thread scans the memory every 10 minutes (by default) and prints the
-number of new unreferenced objects found. To display the details of all
-the possible memory leaks::
+number of new unreferenced objects found. If the ``debugfs`` isn't already
+mounted, mount with::
 
   # mount -t debugfs nodev /sys/kernel/debug/
+
+To display the details of all the possible scanned memory leaks::
+
   # cat /sys/kernel/debug/kmemleak
 
 To trigger an intermediate memory scan::
@@ -72,6 +75,9 @@ If CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF are enabled, the kmemleak is
 disabled by default. Passing ``kmemleak=on`` on the kernel command
 line enables the function. 
 
+If you are getting errors like "Error while writing to stdout" or "write_loop:
+Invalid argument", make sure kmemleak is properly enabled.
+
 Basic Algorithm
 ---------------
 
@@ -218,3 +224,37 @@ the pointer is calculated by other methods than the usual container_of
 macro or the pointer is stored in a location not scanned by kmemleak.
 
 Page allocations and ioremap are not tracked.
+
+Testing with kmemleak-test
+--------------------------
+
+To check if you have all set up to use kmemleak, you can use the kmemleak-test
+module, a module that deliberately leaks memory. Set CONFIG_DEBUG_KMEMLEAK_TEST
+as module (it can't be used as bult-in) and boot the kernel with kmemleak
+enabled. Load the module and perform a scan with::
+
+        # modprobe kmemleak-test
+        # echo scan > /sys/kernel/debug/kmemleak
+
+Note that the you may not get results instantly or on the first scanning. When
+kmemleak gets results, it'll log ``kmemleak: <count of leaks> new suspected
+memory leaks``. Then read the file to see then::
+
+        # cat /sys/kernel/debug/kmemleak
+        unreferenced object 0xffff89862ca702e8 (size 32):
+          comm "modprobe", pid 2088, jiffies 4294680594 (age 375.486s)
+          hex dump (first 32 bytes):
+            6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+            6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5  kkkkkkkkkkkkkkk.
+          backtrace:
+            [<00000000e0a73ec7>] 0xffffffffc01d2036
+            [<000000000c5d2a46>] do_one_initcall+0x41/0x1df
+            [<0000000046db7e0a>] do_init_module+0x55/0x200
+            [<00000000542b9814>] load_module+0x203c/0x2480
+            [<00000000c2850256>] __do_sys_finit_module+0xba/0xe0
+            [<000000006564e7ef>] do_syscall_64+0x43/0x110
+            [<000000007c873fa6>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+        ...
+
+Removing the module with ``rmmod kmemleak_test`` should also trigger some
+kmemleak results.
diff --git a/Documentation/dev-tools/sparse.rst b/Documentation/dev-tools/sparse.rst
index c401c952a340..6f4870528226 100644
--- a/Documentation/dev-tools/sparse.rst
+++ b/Documentation/dev-tools/sparse.rst
@@ -81,11 +81,6 @@ of sparse using git to clone::
 
         git://git.kernel.org/pub/scm/devel/sparse/sparse.git
 
-DaveJ has hourly generated tarballs of the git tree available at::
-
-        http://www.codemonkey.org.uk/projects/git-snapshots/sparse/
-
-
 Once you have it, just do::
 
         make
diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt
deleted file mode 100644
index 86786d87d9a8..000000000000
--- a/Documentation/device-mapper/cache-policies.txt
+++ /dev/null
@@ -1,121 +0,0 @@
-Guidance for writing policies
-=============================
-
-Try to keep transactionality out of it.  The core is careful to
-avoid asking about anything that is migrating.  This is a pain, but
-makes it easier to write the policies.
-
-Mappings are loaded into the policy at construction time.
-
-Every bio that is mapped by the target is referred to the policy.
-The policy can return a simple HIT or MISS or issue a migration.
-
-Currently there's no way for the policy to issue background work,
-e.g. to start writing back dirty blocks that are going to be evicted
-soon.
-
-Because we map bios, rather than requests it's easy for the policy
-to get fooled by many small bios.  For this reason the core target
-issues periodic ticks to the policy.  It's suggested that the policy
-doesn't update states (eg, hit counts) for a block more than once
-for each tick.  The core ticks by watching bios complete, and so
-trying to see when the io scheduler has let the ios run.
-
-
-Overview of supplied cache replacement policies
-===============================================
-
-multiqueue (mq)
----------------
-
-This policy is now an alias for smq (see below).
-
-The following tunables are accepted, but have no effect:
-
-	'sequential_threshold <#nr_sequential_ios>'
-	'random_threshold <#nr_random_ios>'
-	'read_promote_adjustment <value>'
-	'write_promote_adjustment <value>'
-	'discard_promote_adjustment <value>'
-
-Stochastic multiqueue (smq)
----------------------------
-
-This policy is the default.
-
-The stochastic multi-queue (smq) policy addresses some of the problems
-with the multiqueue (mq) policy.
-
-The smq policy (vs mq) offers the promise of less memory utilization,
-improved performance and increased adaptability in the face of changing
-workloads.  smq also does not have any cumbersome tuning knobs.
-
-Users may switch from "mq" to "smq" simply by appropriately reloading a
-DM table that is using the cache target.  Doing so will cause all of the
-mq policy's hints to be dropped.  Also, performance of the cache may
-degrade slightly until smq recalculates the origin device's hotspots
-that should be cached.
-
-Memory usage:
-The mq policy used a lot of memory; 88 bytes per cache block on a 64
-bit machine.
-
-smq uses 28bit indexes to implement its data structures rather than
-pointers.  It avoids storing an explicit hit count for each block.  It
-has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of
-the entries (each hotspot block covers a larger area than a single
-cache block).
-
-All this means smq uses ~25bytes per cache block.  Still a lot of
-memory, but a substantial improvement nontheless.
-
-Level balancing:
-mq placed entries in different levels of the multiqueue structures
-based on their hit count (~ln(hit count)).  This meant the bottom
-levels generally had the most entries, and the top ones had very
-few.  Having unbalanced levels like this reduced the efficacy of the
-multiqueue.
-
-smq does not maintain a hit count, instead it swaps hit entries with
-the least recently used entry from the level above.  The overall
-ordering being a side effect of this stochastic process.  With this
-scheme we can decide how many entries occupy each multiqueue level,
-resulting in better promotion/demotion decisions.
-
-Adaptability:
-The mq policy maintained a hit count for each cache block.  For a
-different block to get promoted to the cache its hit count has to
-exceed the lowest currently in the cache.  This meant it could take a
-long time for the cache to adapt between varying IO patterns.
-
-smq doesn't maintain hit counts, so a lot of this problem just goes
-away.  In addition it tracks performance of the hotspot queue, which
-is used to decide which blocks to promote.  If the hotspot queue is
-performing badly then it starts moving entries more quickly between
-levels.  This lets it adapt to new IO patterns very quickly.
-
-Performance:
-Testing smq shows substantially better performance than mq.
-
-cleaner
--------
-
-The cleaner writes back all dirty blocks in a cache to decommission it.
-
-Examples
-========
-
-The syntax for a table is:
-	cache <metadata dev> <cache dev> <origin dev> <block size>
-	<#feature_args> [<feature arg>]*
-	<policy> <#policy_args> [<policy arg>]*
-
-The syntax to send a message using the dmsetup command is:
-	dmsetup message <mapped device> 0 sequential_threshold 1024
-	dmsetup message <mapped device> 0 random_threshold 8
-
-Using dmsetup:
-	dmsetup create blah --table "0 268435456 cache /dev/sdb /dev/sdc \
-	    /dev/sdd 512 0 mq 4 sequential_threshold 1024 random_threshold 8"
-	creates a 128GB large mapped device named 'blah' with the
-	sequential threshold set to 1024 and the random_threshold set to 8.
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
deleted file mode 100644
index 8ae1cf8e94da..000000000000
--- a/Documentation/device-mapper/cache.txt
+++ /dev/null
@@ -1,311 +0,0 @@
-Introduction
-============
-
-dm-cache is a device mapper target written by Joe Thornber, Heinz
-Mauelshagen, and Mike Snitzer.
-
-It aims to improve performance of a block device (eg, a spindle) by
-dynamically migrating some of its data to a faster, smaller device
-(eg, an SSD).
-
-This device-mapper solution allows us to insert this caching at
-different levels of the dm stack, for instance above the data device for
-a thin-provisioning pool.  Caching solutions that are integrated more
-closely with the virtual memory system should give better performance.
-
-The target reuses the metadata library used in the thin-provisioning
-library.
-
-The decision as to what data to migrate and when is left to a plug-in
-policy module.  Several of these have been written as we experiment,
-and we hope other people will contribute others for specific io
-scenarios (eg. a vm image server).
-
-Glossary
-========
-
-  Migration -  Movement of the primary copy of a logical block from one
-	       device to the other.
-  Promotion -  Migration from slow device to fast device.
-  Demotion  -  Migration from fast device to slow device.
-
-The origin device always contains a copy of the logical block, which
-may be out of date or kept in sync with the copy on the cache device
-(depending on policy).
-
-Design
-======
-
-Sub-devices
------------
-
-The target is constructed by passing three devices to it (along with
-other parameters detailed later):
-
-1. An origin device - the big, slow one.
-
-2. A cache device - the small, fast one.
-
-3. A small metadata device - records which blocks are in the cache,
-   which are dirty, and extra hints for use by the policy object.
-   This information could be put on the cache device, but having it
-   separate allows the volume manager to configure it differently,
-   e.g. as a mirror for extra robustness.  This metadata device may only
-   be used by a single cache device.
-
-Fixed block size
-----------------
-
-The origin is divided up into blocks of a fixed size.  This block size
-is configurable when you first create the cache.  Typically we've been
-using block sizes of 256KB - 1024KB.  The block size must be between 64
-sectors (32KB) and 2097152 sectors (1GB) and a multiple of 64 sectors (32KB).
-
-Having a fixed block size simplifies the target a lot.  But it is
-something of a compromise.  For instance, a small part of a block may be
-getting hit a lot, yet the whole block will be promoted to the cache.
-So large block sizes are bad because they waste cache space.  And small
-block sizes are bad because they increase the amount of metadata (both
-in core and on disk).
-
-Cache operating modes
----------------------
-
-The cache has three operating modes: writeback, writethrough and
-passthrough.
-
-If writeback, the default, is selected then a write to a block that is
-cached will go only to the cache and the block will be marked dirty in
-the metadata.
-
-If writethrough is selected then a write to a cached block will not
-complete until it has hit both the origin and cache devices.  Clean
-blocks should remain clean.
-
-If passthrough is selected, useful when the cache contents are not known
-to be coherent with the origin device, then all reads are served from
-the origin device (all reads miss the cache) and all writes are
-forwarded to the origin device; additionally, write hits cause cache
-block invalidates.  To enable passthrough mode the cache must be clean.
-Passthrough mode allows a cache device to be activated without having to
-worry about coherency.  Coherency that exists is maintained, although
-the cache will gradually cool as writes take place.  If the coherency of
-the cache can later be verified, or established through use of the
-"invalidate_cblocks" message, the cache device can be transitioned to
-writethrough or writeback mode while still warm.  Otherwise, the cache
-contents can be discarded prior to transitioning to the desired
-operating mode.
-
-A simple cleaner policy is provided, which will clean (write back) all
-dirty blocks in a cache.  Useful for decommissioning a cache or when
-shrinking a cache.  Shrinking the cache's fast device requires all cache
-blocks, in the area of the cache being removed, to be clean.  If the
-area being removed from the cache still contains dirty blocks the resize
-will fail.  Care must be taken to never reduce the volume used for the
-cache's fast device until the cache is clean.  This is of particular
-importance if writeback mode is used.  Writethrough and passthrough
-modes already maintain a clean cache.  Future support to partially clean
-the cache, above a specified threshold, will allow for keeping the cache
-warm and in writeback mode during resize.
-
-Migration throttling
---------------------
-
-Migrating data between the origin and cache device uses bandwidth.
-The user can set a throttle to prevent more than a certain amount of
-migration occurring at any one time.  Currently we're not taking any
-account of normal io traffic going to the devices.  More work needs
-doing here to avoid migrating during those peak io moments.
-
-For the time being, a message "migration_threshold <#sectors>"
-can be used to set the maximum number of sectors being migrated,
-the default being 2048 sectors (1MB).
-
-Updating on-disk metadata
--------------------------
-
-On-disk metadata is committed every time a FLUSH or FUA bio is written.
-If no such requests are made then commits will occur every second.  This
-means the cache behaves like a physical disk that has a volatile write
-cache.  If power is lost you may lose some recent writes.  The metadata
-should always be consistent in spite of any crash.
-
-The 'dirty' state for a cache block changes far too frequently for us
-to keep updating it on the fly.  So we treat it as a hint.  In normal
-operation it will be written when the dm device is suspended.  If the
-system crashes all cache blocks will be assumed dirty when restarted.
-
-Per-block policy hints
-----------------------
-
-Policy plug-ins can store a chunk of data per cache block.  It's up to
-the policy how big this chunk is, but it should be kept small.  Like the
-dirty flags this data is lost if there's a crash so a safe fallback
-value should always be possible.
-
-Policy hints affect performance, not correctness.
-
-Policy messaging
-----------------
-
-Policies will have different tunables, specific to each one, so we
-need a generic way of getting and setting these.  Device-mapper
-messages are used.  Refer to cache-policies.txt.
-
-Discard bitset resolution
--------------------------
-
-We can avoid copying data during migration if we know the block has
-been discarded.  A prime example of this is when mkfs discards the
-whole block device.  We store a bitset tracking the discard state of
-blocks.  However, we allow this bitset to have a different block size
-from the cache blocks.  This is because we need to track the discard
-state for all of the origin device (compare with the dirty bitset
-which is just for the smaller cache device).
-
-Target interface
-================
-
-Constructor
------------
-
- cache <metadata dev> <cache dev> <origin dev> <block size>
-       <#feature args> [<feature arg>]*
-       <policy> <#policy args> [policy args]*
-
- metadata dev    : fast device holding the persistent metadata
- cache dev	 : fast device holding cached data blocks
- origin dev	 : slow device holding original data blocks
- block size      : cache unit size in sectors
-
- #feature args   : number of feature arguments passed
- feature args    : writethrough or passthrough (The default is writeback.)
-
- policy          : the replacement policy to use
- #policy args    : an even number of arguments corresponding to
-                   key/value pairs passed to the policy
- policy args     : key/value pairs passed to the policy
-		   E.g. 'sequential_threshold 1024'
-		   See cache-policies.txt for details.
-
-Optional feature arguments are:
-   writethrough  : write through caching that prohibits cache block
-		   content from being different from origin block content.
-		   Without this argument, the default behaviour is to write
-		   back cache block contents later for performance reasons,
-		   so they may differ from the corresponding origin blocks.
-
-   passthrough	 : a degraded mode useful for various cache coherency
-		   situations (e.g., rolling back snapshots of
-		   underlying storage).	 Reads and writes always go to
-		   the origin.	If a write goes to a cached origin
-		   block, then the cache block is invalidated.
-		   To enable passthrough mode the cache must be clean.
-
-   metadata2	: use version 2 of the metadata.  This stores the dirty bits
-                  in a separate btree, which improves speed of shutting
-		  down the cache.
-
-   no_discard_passdown	: disable passing down discards from the cache
-			  to the origin's data device.
-
-A policy called 'default' is always registered.  This is an alias for
-the policy we currently think is giving best all round performance.
-
-As the default policy could vary between kernels, if you are relying on
-the characteristics of a specific policy, always request it by name.
-
-Status
-------
-
-<metadata block size> <#used metadata blocks>/<#total metadata blocks>
-<cache block size> <#used cache blocks>/<#total cache blocks>
-<#read hits> <#read misses> <#write hits> <#write misses>
-<#demotions> <#promotions> <#dirty> <#features> <features>*
-<#core args> <core args>* <policy name> <#policy args> <policy args>*
-<cache metadata mode>
-
-metadata block size	 : Fixed block size for each metadata block in
-			     sectors
-#used metadata blocks	 : Number of metadata blocks used
-#total metadata blocks	 : Total number of metadata blocks
-cache block size	 : Configurable block size for the cache device
-			     in sectors
-#used cache blocks	 : Number of blocks resident in the cache
-#total cache blocks	 : Total number of cache blocks
-#read hits		 : Number of times a READ bio has been mapped
-			     to the cache
-#read misses		 : Number of times a READ bio has been mapped
-			     to the origin
-#write hits		 : Number of times a WRITE bio has been mapped
-			     to the cache
-#write misses		 : Number of times a WRITE bio has been
-			     mapped to the origin
-#demotions		 : Number of times a block has been removed
-			     from the cache
-#promotions		 : Number of times a block has been moved to
-			     the cache
-#dirty			 : Number of blocks in the cache that differ
-			     from the origin
-#feature args		 : Number of feature args to follow
-feature args		 : 'writethrough' (optional)
-#core args		 : Number of core arguments (must be even)
-core args		 : Key/value pairs for tuning the core
-			     e.g. migration_threshold
-policy name		 : Name of the policy
-#policy args		 : Number of policy arguments to follow (must be even)
-policy args		 : Key/value pairs e.g. sequential_threshold
-cache metadata mode      : ro if read-only, rw if read-write
-	In serious cases where even a read-only mode is deemed unsafe
-	no further I/O will be permitted and the status will just
-	contain the string 'Fail'.  The userspace recovery tools
-	should then be used.
-needs_check		 : 'needs_check' if set, '-' if not set
-	A metadata operation has failed, resulting in the needs_check
-	flag being set in the metadata's superblock.  The metadata
-	device must be deactivated and checked/repaired before the
-	cache can be made fully operational again.  '-' indicates
-	needs_check is not set.
-
-Messages
---------
-
-Policies will have different tunables, specific to each one, so we
-need a generic way of getting and setting these.  Device-mapper
-messages are used.  (A sysfs interface would also be possible.)
-
-The message format is:
-
-   <key> <value>
-
-E.g.
-   dmsetup message my_cache 0 sequential_threshold 1024
-
-
-Invalidation is removing an entry from the cache without writing it
-back.  Cache blocks can be invalidated via the invalidate_cblocks
-message, which takes an arbitrary number of cblock ranges.  Each cblock
-range's end value is "one past the end", meaning 5-10 expresses a range
-of values from 5 to 9.  Each cblock must be expressed as a decimal
-value, in the future a variant message that takes cblock ranges
-expressed in hexadecimal may be needed to better support efficient
-invalidation of larger caches.  The cache must be in passthrough mode
-when invalidate_cblocks is used.
-
-   invalidate_cblocks [<cblock>|<cblock begin>-<cblock end>]*
-
-E.g.
-   dmsetup message my_cache 0 invalidate_cblocks 2345 3456-4567 5678-6789
-
-Examples
-========
-
-The test suite can be found here:
-
-https://github.com/jthornber/device-mapper-test-suite
-
-dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \
-	/dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0'
-dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \
-	/dev/mapper/ssd /dev/mapper/origin 1024 1 writeback \
-	mq 4 sequential_threshold 1024 random_threshold 8'
diff --git a/Documentation/device-mapper/delay.txt b/Documentation/device-mapper/delay.txt
deleted file mode 100644
index 6426c45273cb..000000000000
--- a/Documentation/device-mapper/delay.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-dm-delay
-========
-
-Device-Mapper's "delay" target delays reads and/or writes
-and maps them to different devices.
-
-Parameters:
-    <device> <offset> <delay> [<write_device> <write_offset> <write_delay>
-			       [<flush_device> <flush_offset> <flush_delay>]]
-
-With separate write parameters, the first set is only used for reads.
-Offsets are specified in sectors.
-Delays are specified in milliseconds.
-
-Example scripts
-===============
-[[
-#!/bin/sh
-# Create device delaying rw operation for 500ms
-echo "0 `blockdev --getsz $1` delay $1 0 500" | dmsetup create delayed
-]]
-
-[[
-#!/bin/sh
-# Create device delaying only write operation for 500ms and
-# splitting reads and writes to different devices $1 $2
-echo "0 `blockdev --getsz $1` delay $1 0 0 $2 0 500" | dmsetup create delayed
-]]
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
deleted file mode 100644
index 3b3e1de21c9c..000000000000
--- a/Documentation/device-mapper/dm-crypt.txt
+++ /dev/null
@@ -1,162 +0,0 @@
-dm-crypt
-=========
-
-Device-Mapper's "crypt" target provides transparent encryption of block devices
-using the kernel crypto API.
-
-For a more detailed description of supported parameters see:
-https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt
-
-Parameters: <cipher> <key> <iv_offset> <device path> \
-	      <offset> [<#opt_params> <opt_params>]
-
-<cipher>
-    Encryption cipher, encryption mode and Initial Vector (IV) generator.
-
-    The cipher specifications format is:
-       cipher[:keycount]-chainmode-ivmode[:ivopts]
-    Examples:
-       aes-cbc-essiv:sha256
-       aes-xts-plain64
-       serpent-xts-plain64
-
-    Cipher format also supports direct specification with kernel crypt API
-    format (selected by capi: prefix). The IV specification is the same
-    as for the first format type.
-    This format is mainly used for specification of authenticated modes.
-
-    The crypto API cipher specifications format is:
-        capi:cipher_api_spec-ivmode[:ivopts]
-    Examples:
-        capi:cbc(aes)-essiv:sha256
-        capi:xts(aes)-plain64
-    Examples of authenticated modes:
-        capi:gcm(aes)-random
-        capi:authenc(hmac(sha256),xts(aes))-random
-        capi:rfc7539(chacha20,poly1305)-random
-
-    The /proc/crypto contains a list of curently loaded crypto modes.
-
-<key>
-    Key used for encryption. It is encoded either as a hexadecimal number
-    or it can be passed as <key_string> prefixed with single colon
-    character (':') for keys residing in kernel keyring service.
-    You can only use key sizes that are valid for the selected cipher
-    in combination with the selected iv mode.
-    Note that for some iv modes the key string can contain additional
-    keys (for example IV seed) so the key contains more parts concatenated
-    into a single string.
-
-<key_string>
-    The kernel keyring key is identified by string in following format:
-    <key_size>:<key_type>:<key_description>.
-
-<key_size>
-    The encryption key size in bytes. The kernel key payload size must match
-    the value passed in <key_size>.
-
-<key_type>
-    Either 'logon' or 'user' kernel key type.
-
-<key_description>
-    The kernel keyring key description crypt target should look for
-    when loading key of <key_type>.
-
-<keycount>
-    Multi-key compatibility mode. You can define <keycount> keys and
-    then sectors are encrypted according to their offsets (sector 0 uses key0;
-    sector 1 uses key1 etc.).  <keycount> must be a power of two.
-
-<iv_offset>
-    The IV offset is a sector count that is added to the sector number
-    before creating the IV.
-
-<device path>
-    This is the device that is going to be used as backend and contains the
-    encrypted data.  You can specify it as a path like /dev/xxx or a device
-    number <major>:<minor>.
-
-<offset>
-    Starting sector within the device where the encrypted data begins.
-
-<#opt_params>
-    Number of optional parameters. If there are no optional parameters,
-    the optional paramaters section can be skipped or #opt_params can be zero.
-    Otherwise #opt_params is the number of following arguments.
-
-    Example of optional parameters section:
-        3 allow_discards same_cpu_crypt submit_from_crypt_cpus
-
-allow_discards
-    Block discard requests (a.k.a. TRIM) are passed through the crypt device.
-    The default is to ignore discard requests.
-
-    WARNING: Assess the specific security risks carefully before enabling this
-    option.  For example, allowing discards on encrypted devices may lead to
-    the leak of information about the ciphertext device (filesystem type,
-    used space etc.) if the discarded blocks can be located easily on the
-    device later.
-
-same_cpu_crypt
-    Perform encryption using the same cpu that IO was submitted on.
-    The default is to use an unbound workqueue so that encryption work
-    is automatically balanced between available CPUs.
-
-submit_from_crypt_cpus
-    Disable offloading writes to a separate thread after encryption.
-    There are some situations where offloading write bios from the
-    encryption threads to a single thread degrades performance
-    significantly.  The default is to offload write bios to the same
-    thread because it benefits CFQ to have writes submitted using the
-    same context.
-
-integrity:<bytes>:<type>
-    The device requires additional <bytes> metadata per-sector stored
-    in per-bio integrity structure. This metadata must by provided
-    by underlying dm-integrity target.
-
-    The <type> can be "none" if metadata is used only for persistent IV.
-
-    For Authenticated Encryption with Additional Data (AEAD)
-    the <type> is "aead". An AEAD mode additionally calculates and verifies
-    integrity for the encrypted device. The additional space is then
-    used for storing authentication tag (and persistent IV if needed).
-
-sector_size:<bytes>
-    Use <bytes> as the encryption unit instead of 512 bytes sectors.
-    This option can be in range 512 - 4096 bytes and must be power of two.
-    Virtual device will announce this size as a minimal IO and logical sector.
-
-iv_large_sectors
-   IV generators will use sector number counted in <sector_size> units
-   instead of default 512 bytes sectors.
-
-   For example, if <sector_size> is 4096 bytes, plain64 IV for the second
-   sector will be 8 (without flag) and 1 if iv_large_sectors is present.
-   The <iv_offset> must be multiple of <sector_size> (in 512 bytes units)
-   if this flag is specified.
-
-Example scripts
-===============
-LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
-encryption with dm-crypt using the 'cryptsetup' utility, see
-https://gitlab.com/cryptsetup/cryptsetup
-
-[[
-#!/bin/sh
-# Create a crypt device using dmsetup
-dmsetup create crypt1 --table "0 `blockdev --getsz $1` crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 $1 0"
-]]
-
-[[
-#!/bin/sh
-# Create a crypt device using dmsetup when encryption key is stored in keyring service
-dmsetup create crypt2 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 :32:logon:my_prefix:my_key 0 $1 0"
-]]
-
-[[
-#!/bin/sh
-# Create a crypt device using cryptsetup and LUKS header with default cipher
-cryptsetup luksFormat $1
-cryptsetup luksOpen $1 crypt1
-]]
diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt
deleted file mode 100644
index 9f0e247d0877..000000000000
--- a/Documentation/device-mapper/dm-flakey.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-dm-flakey
-=========
-
-This target is the same as the linear target except that it exhibits
-unreliable behaviour periodically.  It's been found useful in simulating
-failing devices for testing purposes.
-
-Starting from the time the table is loaded, the device is available for
-<up interval> seconds, then exhibits unreliable behaviour for <down
-interval> seconds, and then this cycle repeats.
-
-Also, consider using this in combination with the dm-delay target too,
-which can delay reads and writes and/or send them to different
-underlying devices.
-
-Table parameters
-----------------
-  <dev path> <offset> <up interval> <down interval> \
-    [<num_features> [<feature arguments>]]
-
-Mandatory parameters:
-    <dev path>: Full pathname to the underlying block-device, or a
-                "major:minor" device-number.
-    <offset>: Starting sector within the device.
-    <up interval>: Number of seconds device is available.
-    <down interval>: Number of seconds device returns errors.
-
-Optional feature parameters:
-  If no feature parameters are present, during the periods of
-  unreliability, all I/O returns errors.
-
-  drop_writes:
-	All write I/O is silently ignored.
-	Read I/O is handled correctly.
-
-  error_writes:
-	All write I/O is failed with an error signalled.
-	Read I/O is handled correctly.
-
-  corrupt_bio_byte <Nth_byte> <direction> <value> <flags>:
-	During <down interval>, replace <Nth_byte> of the data of
-	each matching bio with <value>.
-
-    <Nth_byte>: The offset of the byte to replace.
-		Counting starts at 1, to replace the first byte.
-    <direction>: Either 'r' to corrupt reads or 'w' to corrupt writes.
-		 'w' is incompatible with drop_writes.
-    <value>: The value (from 0-255) to write.
-    <flags>: Perform the replacement only if bio->bi_opf has all the
-	     selected flags set.
-
-Examples:
-  corrupt_bio_byte 32 r 1 0
-	- replaces the 32nd byte of READ bios with the value 1
-
-  corrupt_bio_byte 224 w 0 32
-	- replaces the 224th byte of REQ_META (=32) bios with the value 0
diff --git a/Documentation/device-mapper/dm-init.txt b/Documentation/device-mapper/dm-init.txt
deleted file mode 100644
index 8464ee7c01b8..000000000000
--- a/Documentation/device-mapper/dm-init.txt
+++ /dev/null
@@ -1,114 +0,0 @@
-Early creation of mapped devices
-====================================
-
-It is possible to configure a device-mapper device to act as the root device for
-your system in two ways.
-
-The first is to build an initial ramdisk which boots to a minimal userspace
-which configures the device, then pivot_root(8) in to it.
-
-The second is to create one or more device-mappers using the module parameter
-"dm-mod.create=" through the kernel boot command line argument.
-
-The format is specified as a string of data separated by commas and optionally
-semi-colons, where:
- - a comma is used to separate fields like name, uuid, flags and table
-   (specifies one device)
- - a semi-colon is used to separate devices.
-
-So the format will look like this:
-
- dm-mod.create=<name>,<uuid>,<minor>,<flags>,<table>[,<table>+][;<name>,<uuid>,<minor>,<flags>,<table>[,<table>+]+]
-
-Where,
-	<name>		::= The device name.
-	<uuid>		::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | ""
-	<minor>		::= The device minor number | ""
-	<flags>		::= "ro" | "rw"
-	<table>		::= <start_sector> <num_sectors> <target_type> <target_args>
-	<target_type>	::= "verity" | "linear" | ... (see list below)
-
-The dm line should be equivalent to the one used by the dmsetup tool with the
---concise argument.
-
-Target types
-============
-
-Not all target types are available as there are serious risks in allowing
-activation of certain DM targets without first using userspace tools to check
-the validity of associated metadata.
-
-	"cache":		constrained, userspace should verify cache device
-	"crypt":		allowed
-	"delay":		allowed
-	"era":			constrained, userspace should verify metadata device
-	"flakey":		constrained, meant for test
-	"linear":		allowed
-	"log-writes":		constrained, userspace should verify metadata device
-	"mirror":		constrained, userspace should verify main/mirror device
-	"raid":			constrained, userspace should verify metadata device
-	"snapshot":		constrained, userspace should verify src/dst device
-	"snapshot-origin":	allowed
-	"snapshot-merge":	constrained, userspace should verify src/dst device
-	"striped":		allowed
-	"switch":		constrained, userspace should verify dev path
-	"thin":			constrained, requires dm target message from userspace
-	"thin-pool":		constrained, requires dm target message from userspace
-	"verity":		allowed
-	"writecache":		constrained, userspace should verify cache device
-	"zero":			constrained, not meant for rootfs
-
-If the target is not listed above, it is constrained by default (not tested).
-
-Examples
-========
-An example of booting to a linear array made up of user-mode linux block
-devices:
-
-  dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0
-
-This will boot to a rw dm-linear target of 8192 sectors split across two block
-devices identified by their major:minor numbers.  After boot, udev will rename
-this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned.
-
-An example of multiple device-mappers, with the dm-mod.create="..." contents is shown here
-split on multiple lines for readability:
-
-  vroot,,,ro,
-    0 1740800 verity 254:0 254:0 1740800 sha1
-      76e9be054b15884a9fa85973e9cb274c93afadb6
-      5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe;
-  vram,,,rw,
-    0 32768 linear 1:0 0,
-    32768 32768 linear 1:1 0
-
-Other examples (per target):
-
-"crypt":
-  dm-crypt,,8,ro,
-    0 1048576 crypt aes-xts-plain64
-    babebabebabebabebabebabebabebabebabebabebabebabebabebabebabebabe 0
-    /dev/sda 0 1 allow_discards
-
-"delay":
-  dm-delay,,4,ro,0 409600 delay /dev/sda1 0 500
-
-"linear":
-  dm-linear,,,rw,
-    0 32768 linear /dev/sda1 0,
-    32768 1024000 linear /dev/sda2 0,
-    1056768 204800 linear /dev/sda3 0,
-    1261568 512000 linear /dev/sda4 0
-
-"snapshot-origin":
-  dm-snap-orig,,4,ro,0 409600 snapshot-origin 8:2
-
-"striped":
-  dm-striped,,4,ro,0 1638400 striped 4 4096
-  /dev/sda1 0 /dev/sda2 0 /dev/sda3 0 /dev/sda4 0
-
-"verity":
-  dm-verity,,4,ro,
-    0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256
-    fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd
-    51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584
diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt
deleted file mode 100644
index d63d78ffeb73..000000000000
--- a/Documentation/device-mapper/dm-integrity.txt
+++ /dev/null
@@ -1,233 +0,0 @@
-The dm-integrity target emulates a block device that has additional
-per-sector tags that can be used for storing integrity information.
-
-A general problem with storing integrity tags with every sector is that
-writing the sector and the integrity tag must be atomic - i.e. in case of
-crash, either both sector and integrity tag or none of them is written.
-
-To guarantee write atomicity, the dm-integrity target uses journal, it
-writes sector data and integrity tags into a journal, commits the journal
-and then copies the data and integrity tags to their respective location.
-
-The dm-integrity target can be used with the dm-crypt target - in this
-situation the dm-crypt target creates the integrity data and passes them
-to the dm-integrity target via bio_integrity_payload attached to the bio.
-In this mode, the dm-crypt and dm-integrity targets provide authenticated
-disk encryption - if the attacker modifies the encrypted device, an I/O
-error is returned instead of random data.
-
-The dm-integrity target can also be used as a standalone target, in this
-mode it calculates and verifies the integrity tag internally. In this
-mode, the dm-integrity target can be used to detect silent data
-corruption on the disk or in the I/O path.
-
-There's an alternate mode of operation where dm-integrity uses bitmap
-instead of a journal. If a bit in the bitmap is 1, the corresponding
-region's data and integrity tags are not synchronized - if the machine
-crashes, the unsynchronized regions will be recalculated. The bitmap mode
-is faster than the journal mode, because we don't have to write the data
-twice, but it is also less reliable, because if data corruption happens
-when the machine crashes, it may not be detected.
-
-When loading the target for the first time, the kernel driver will format
-the device. But it will only format the device if the superblock contains
-zeroes. If the superblock is neither valid nor zeroed, the dm-integrity
-target can't be loaded.
-
-To use the target for the first time:
-1. overwrite the superblock with zeroes
-2. load the dm-integrity target with one-sector size, the kernel driver
-	will format the device
-3. unload the dm-integrity target
-4. read the "provided_data_sectors" value from the superblock
-5. load the dm-integrity target with the the target size
-	"provided_data_sectors"
-6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target
-	with the size "provided_data_sectors"
-
-
-Target arguments:
-
-1. the underlying block device
-
-2. the number of reserved sector at the beginning of the device - the
-	dm-integrity won't read of write these sectors
-
-3. the size of the integrity tag (if "-" is used, the size is taken from
-	the internal-hash algorithm)
-
-4. mode:
-	D - direct writes (without journal) - in this mode, journaling is
-		not used and data sectors and integrity tags are written
-		separately. In case of crash, it is possible that the data
-		and integrity tag doesn't match.
-	J - journaled writes - data and integrity tags are written to the
-		journal and atomicity is guaranteed. In case of crash,
-		either both data and tag or none of them are written. The
-		journaled mode degrades write throughput twice because the
-		data have to be written twice.
-	B - bitmap mode - data and metadata are written without any
-		synchronization, the driver maintains a bitmap of dirty
-		regions where data and metadata don't match. This mode can
-		only be used with internal hash.
-	R - recovery mode - in this mode, journal is not replayed,
-		checksums are not checked and writes to the device are not
-		allowed. This mode is useful for data recovery if the
-		device cannot be activated in any of the other standard
-		modes.
-
-5. the number of additional arguments
-
-Additional arguments:
-
-journal_sectors:number
-	The size of journal, this argument is used only if formatting the
-	device. If the device is already formatted, the value from the
-	superblock is used.
-
-interleave_sectors:number
-	The number of interleaved sectors. This values is rounded down to
-	a power of two. If the device is already formatted, the value from
-	the superblock is used.
-
-meta_device:device
-	Don't interleave the data and metadata on on device. Use a
-	separate device for metadata.
-
-buffer_sectors:number
-	The number of sectors in one buffer. The value is rounded down to
-	a power of two.
-
-	The tag area is accessed using buffers, the buffer size is
-	configurable. The large buffer size means that the I/O size will
-	be larger, but there could be less I/Os issued.
-
-journal_watermark:number
-	The journal watermark in percents. When the size of the journal
-	exceeds this watermark, the thread that flushes the journal will
-	be started.
-
-commit_time:number
-	Commit time in milliseconds. When this time passes, the journal is
-	written. The journal is also written immediatelly if the FLUSH
-	request is received.
-
-internal_hash:algorithm(:key)	(the key is optional)
-	Use internal hash or crc.
-	When this argument is used, the dm-integrity target won't accept
-	integrity tags from the upper target, but it will automatically
-	generate and verify the integrity tags.
-
-	You can use a crc algorithm (such as crc32), then integrity target
-	will protect the data against accidental corruption.
-	You can also use a hmac algorithm (for example
-	"hmac(sha256):0123456789abcdef"), in this mode it will provide
-	cryptographic authentication of the data without encryption.
-
-	When this argument is not used, the integrity tags are accepted
-	from an upper layer target, such as dm-crypt. The upper layer
-	target should check the validity of the integrity tags.
-
-recalculate
-	Recalculate the integrity tags automatically. It is only valid
-	when using internal hash.
-
-journal_crypt:algorithm(:key)	(the key is optional)
-	Encrypt the journal using given algorithm to make sure that the
-	attacker can't read the journal. You can use a block cipher here
-	(such as "cbc(aes)") or a stream cipher (for example "chacha20",
-	"salsa20", "ctr(aes)" or "ecb(arc4)").
-
-	The journal contains history of last writes to the block device,
-	an attacker reading the journal could see the last sector nubmers
-	that were written. From the sector numbers, the attacker can infer
-	the size of files that were written. To protect against this
-	situation, you can encrypt the journal.
-
-journal_mac:algorithm(:key)	(the key is optional)
-	Protect sector numbers in the journal from accidental or malicious
-	modification. To protect against accidental modification, use a
-	crc algorithm, to protect against malicious modification, use a
-	hmac algorithm with a key.
-
-	This option is not needed when using internal-hash because in this
-	mode, the integrity of journal entries is checked when replaying
-	the journal. Thus, modified sector number would be detected at
-	this stage.
-
-block_size:number
-	The size of a data block in bytes.  The larger the block size the
-	less overhead there is for per-block integrity metadata.
-	Supported values are 512, 1024, 2048 and 4096 bytes.  If not
-	specified the default block size is 512 bytes.
-
-sectors_per_bit:number
-	In the bitmap mode, this parameter specifies the number of
-	512-byte sectors that corresponds to one bitmap bit.
-
-bitmap_flush_interval:number
-	The bitmap flush interval in milliseconds. The metadata buffers
-	are synchronized when this interval expires.
-
-
-The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
-be changed when reloading the target (load an inactive table and swap the
-tables with suspend and resume). The other arguments should not be changed
-when reloading the target because the layout of disk data depend on them
-and the reloaded target would be non-functional.
-
-
-The layout of the formatted block device:
-* reserved sectors (they are not used by this target, they can be used for
-  storing LUKS metadata or for other purpose), the size of the reserved
-  area is specified in the target arguments
-* superblock (4kiB)
-	* magic string - identifies that the device was formatted
-	* version
-	* log2(interleave sectors)
-	* integrity tag size
-	* the number of journal sections
-	* provided data sectors - the number of sectors that this target
-	  provides (i.e. the size of the device minus the size of all
-	  metadata and padding). The user of this target should not send
-	  bios that access data beyond the "provided data sectors" limit.
-	* flags
-	  SB_FLAG_HAVE_JOURNAL_MAC - a flag is set if journal_mac is used
-	  SB_FLAG_RECALCULATING - recalculating is in progress
-	  SB_FLAG_DIRTY_BITMAP - journal area contains the bitmap of dirty
-		blocks
-	* log2(sectors per block)
-	* a position where recalculating finished
-* journal
-	The journal is divided into sections, each section contains:
-	* metadata area (4kiB), it contains journal entries
-	  every journal entry contains:
-		* logical sector (specifies where the data and tag should
-		  be written)
-		* last 8 bytes of data
-		* integrity tag (the size is specified in the superblock)
-	    every metadata sector ends with
-		* mac (8-bytes), all the macs in 8 metadata sectors form a
-		  64-byte value. It is used to store hmac of sector
-		  numbers in the journal section, to protect against a
-		  possibility that the attacker tampers with sector
-		  numbers in the journal.
-		* commit id
-	* data area (the size is variable; it depends on how many journal
-	  entries fit into the metadata area)
-	    every sector in the data area contains:
-		* data (504 bytes of data, the last 8 bytes are stored in
-		  the journal entry)
-		* commit id
-	To test if the whole journal section was written correctly, every
-	512-byte sector of the journal ends with 8-byte commit id. If the
-	commit id matches on all sectors in a journal section, then it is
-	assumed that the section was written correctly. If the commit id
-	doesn't match, the section was written partially and it should not
-	be replayed.
-* one or more runs of interleaved tags and data. Each run contains:
-	* tag area - it contains integrity tags. There is one tag for each
-	  sector in the data area
-	* data area - it contains data sectors. The number of data sectors
-	  in one run must be a power of two. log2 of this value is stored
-	  in the superblock.
diff --git a/Documentation/device-mapper/dm-io.txt b/Documentation/device-mapper/dm-io.txt
deleted file mode 100644
index 3b5d9a52cdcf..000000000000
--- a/Documentation/device-mapper/dm-io.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-dm-io
-=====
-
-Dm-io provides synchronous and asynchronous I/O services. There are three
-types of I/O services available, and each type has a sync and an async
-version.
-
-The user must set up an io_region structure to describe the desired location
-of the I/O. Each io_region indicates a block-device along with the starting
-sector and size of the region.
-
-   struct io_region {
-      struct block_device *bdev;
-      sector_t sector;
-      sector_t count;
-   };
-
-Dm-io can read from one io_region or write to one or more io_regions. Writes
-to multiple regions are specified by an array of io_region structures.
-
-The first I/O service type takes a list of memory pages as the data buffer for
-the I/O, along with an offset into the first page.
-
-   struct page_list {
-      struct page_list *next;
-      struct page *page;
-   };
-
-   int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
-                  struct page_list *pl, unsigned int offset,
-                  unsigned long *error_bits);
-   int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
-                   struct page_list *pl, unsigned int offset,
-                   io_notify_fn fn, void *context);
-
-The second I/O service type takes an array of bio vectors as the data buffer
-for the I/O. This service can be handy if the caller has a pre-assembled bio,
-but wants to direct different portions of the bio to different devices.
-
-   int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where,
-                       int rw, struct bio_vec *bvec,
-                       unsigned long *error_bits);
-   int dm_io_async_bvec(unsigned int num_regions, struct io_region *where,
-                        int rw, struct bio_vec *bvec,
-                        io_notify_fn fn, void *context);
-
-The third I/O service type takes a pointer to a vmalloc'd memory buffer as the
-data buffer for the I/O. This service can be handy if the caller needs to do
-I/O to a large region but doesn't want to allocate a large number of individual
-memory pages.
-
-   int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
-                     void *data, unsigned long *error_bits);
-   int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
-                      void *data, io_notify_fn fn, void *context);
-
-Callers of the asynchronous I/O services must include the name of a completion
-callback routine and a pointer to some context data for the I/O.
-
-   typedef void (*io_notify_fn)(unsigned long error, void *context);
-
-The "error" parameter in this callback, as well as the "*error" parameter in
-all of the synchronous versions, is a bitset (instead of a simple error value).
-In the case of an write-I/O to multiple regions, this bitset allows dm-io to
-indicate success or failure on each individual region.
-
-Before using any of the dm-io services, the user should call dm_io_get()
-and specify the number of pages they expect to perform I/O on concurrently.
-Dm-io will attempt to resize its mempool to make sure enough pages are
-always available in order to avoid unnecessary waiting while performing I/O.
-
-When the user is finished using the dm-io services, they should call
-dm_io_put() and specify the same number of pages that were given on the
-dm_io_get() call.
-
diff --git a/Documentation/device-mapper/dm-log.txt b/Documentation/device-mapper/dm-log.txt
deleted file mode 100644
index c155ac569c44..000000000000
--- a/Documentation/device-mapper/dm-log.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-Device-Mapper Logging
-=====================
-The device-mapper logging code is used by some of the device-mapper
-RAID targets to track regions of the disk that are not consistent.
-A region (or portion of the address space) of the disk may be
-inconsistent because a RAID stripe is currently being operated on or
-a machine died while the region was being altered.  In the case of
-mirrors, a region would be considered dirty/inconsistent while you
-are writing to it because the writes need to be replicated for all
-the legs of the mirror and may not reach the legs at the same time.
-Once all writes are complete, the region is considered clean again.
-
-There is a generic logging interface that the device-mapper RAID
-implementations use to perform logging operations (see
-dm_dirty_log_type in include/linux/dm-dirty-log.h).  Various different
-logging implementations are available and provide different
-capabilities.  The list includes:
-
-Type		Files
-====		=====
-disk		drivers/md/dm-log.c
-core		drivers/md/dm-log.c
-userspace	drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h
-
-The "disk" log type
--------------------
-This log implementation commits the log state to disk.  This way, the
-logging state survives reboots/crashes.
-
-The "core" log type
--------------------
-This log implementation keeps the log state in memory.  The log state
-will not survive a reboot or crash, but there may be a small boost in
-performance.  This method can also be used if no storage device is
-available for storing log state.
-
-The "userspace" log type
-------------------------
-This log type simply provides a way to export the log API to userspace,
-so log implementations can be done there.  This is done by forwarding most
-logging requests to userspace, where a daemon receives and processes the
-request.
-
-The structure used for communication between kernel and userspace are
-located in include/linux/dm-log-userspace.h.  Due to the frequency,
-diversity, and 2-way communication nature of the exchanges between
-kernel and userspace, 'connector' is used as the interface for
-communication.
-
-There are currently two userspace log implementations that leverage this
-framework - "clustered-disk" and "clustered-core".  These implementations
-provide a cluster-coherent log for shared-storage.  Device-mapper mirroring
-can be used in a shared-storage environment when the cluster log implementations
-are employed.
diff --git a/Documentation/device-mapper/dm-queue-length.txt b/Documentation/device-mapper/dm-queue-length.txt
deleted file mode 100644
index f4db2562175c..000000000000
--- a/Documentation/device-mapper/dm-queue-length.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-dm-queue-length
-===============
-
-dm-queue-length is a path selector module for device-mapper targets,
-which selects a path with the least number of in-flight I/Os.
-The path selector name is 'queue-length'.
-
-Table parameters for each path: [<repeat_count>]
-	<repeat_count>: The number of I/Os to dispatch using the selected
-			path before switching to the next path.
-			If not given, internal default is used. To check
-			the default value, see the activated table.
-
-Status for each path: <status> <fail-count> <in-flight>
-	<status>: 'A' if the path is active, 'F' if the path is failed.
-	<fail-count>: The number of path failures.
-	<in-flight>: The number of in-flight I/Os on the path.
-
-
-Algorithm
-=========
-
-dm-queue-length increments/decrements 'in-flight' when an I/O is
-dispatched/completed respectively.
-dm-queue-length selects a path with the minimum 'in-flight'.
-
-
-Examples
-========
-In case that 2 paths (sda and sdb) are used with repeat_count == 128.
-
-# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \
-  dmsetup create test
-#
-# dmsetup table
-test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128
-#
-# dmsetup status
-test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
deleted file mode 100644
index 2355bef14653..000000000000
--- a/Documentation/device-mapper/dm-raid.txt
+++ /dev/null
@@ -1,354 +0,0 @@
-dm-raid
-=======
-
-The device-mapper RAID (dm-raid) target provides a bridge from DM to MD.
-It allows the MD RAID drivers to be accessed using a device-mapper
-interface.
-
-
-Mapping Table Interface
------------------------
-The target is named "raid" and it accepts the following parameters:
-
-  <raid_type> <#raid_params> <raid_params> \
-    <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>]
-
-<raid_type>:
-  raid0		RAID0 striping (no resilience)
-  raid1		RAID1 mirroring
-  raid4		RAID4 with dedicated last parity disk
-  raid5_n 	RAID5 with dedicated last parity disk supporting takeover
-		Same as raid4
-		-Transitory layout
-  raid5_la	RAID5 left asymmetric
-		- rotating parity 0 with data continuation
-  raid5_ra	RAID5 right asymmetric
-		- rotating parity N with data continuation
-  raid5_ls	RAID5 left symmetric
-		- rotating parity 0 with data restart
-  raid5_rs 	RAID5 right symmetric
-		- rotating parity N with data restart
-  raid6_zr	RAID6 zero restart
-		- rotating parity zero (left-to-right) with data restart
-  raid6_nr	RAID6 N restart
-		- rotating parity N (right-to-left) with data restart
-  raid6_nc	RAID6 N continue
-		- rotating parity N (right-to-left) with data continuation
-  raid6_n_6	RAID6 with dedicate parity disks
-		- parity and Q-syndrome on the last 2 disks;
-		  layout for takeover from/to raid4/raid5_n
-  raid6_la_6	Same as "raid_la" plus dedicated last Q-syndrome disk
-		- layout for takeover from raid5_la from/to raid6
-  raid6_ra_6	Same as "raid5_ra" dedicated last Q-syndrome disk
-		- layout for takeover from raid5_ra from/to raid6
-  raid6_ls_6	Same as "raid5_ls" dedicated last Q-syndrome disk
-		- layout for takeover from raid5_ls from/to raid6
-  raid6_rs_6	Same as "raid5_rs" dedicated last Q-syndrome disk
-		- layout for takeover from raid5_rs from/to raid6
-  raid10        Various RAID10 inspired algorithms chosen by additional params
-		(see raid10_format and raid10_copies below)
-		- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
-		- RAID1E: Integrated Adjacent Stripe Mirroring
-		- RAID1E: Integrated Offset Stripe Mirroring
-		-  and other similar RAID10 variants
-
-  Reference: Chapter 4 of
-  http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
-
-<#raid_params>: The number of parameters that follow.
-
-<raid_params> consists of
-    Mandatory parameters:
-        <chunk_size>: Chunk size in sectors.  This parameter is often known as
-		      "stripe size".  It is the only mandatory parameter and
-		      is placed first.
-
-    followed by optional parameters (in any order):
-	[sync|nosync]   Force or prevent RAID initialization.
-
-	[rebuild <idx>]	Rebuild drive number 'idx' (first drive is 0).
-
-	[daemon_sleep <ms>]
-		Interval between runs of the bitmap daemon that
-		clear bits.  A longer interval means less bitmap I/O but
-		resyncing after a failure is likely to take longer.
-
-	[min_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
-	[max_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
-	[write_mostly <idx>]		   Mark drive index 'idx' write-mostly.
-	[max_write_behind <sectors>]       See '--write-behind=' (man mdadm)
-	[stripe_cache <sectors>]           Stripe cache size (RAID 4/5/6 only)
-	[region_size <sectors>]
-		The region_size multiplied by the number of regions is the
-		logical size of the array.  The bitmap records the device
-		synchronisation state for each region.
-
-        [raid10_copies   <# copies>]
-        [raid10_format   <near|far|offset>]
-		These two options are used to alter the default layout of
-		a RAID10 configuration.  The number of copies is can be
-		specified, but the default is 2.  There are also three
-		variations to how the copies are laid down - the default
-		is "near".  Near copies are what most people think of with
-		respect to mirroring.  If these options are left unspecified,
-		or 'raid10_copies 2' and/or 'raid10_format near' are given,
-		then the layouts for 2, 3 and 4 devices	are:
-		2 drives         3 drives          4 drives
-		--------         ----------        --------------
-		A1  A1           A1  A1  A2        A1  A1  A2  A2
-		A2  A2           A2  A3  A3        A3  A3  A4  A4
-		A3  A3           A4  A4  A5        A5  A5  A6  A6
-		A4  A4           A5  A6  A6        A7  A7  A8  A8
-		..  ..           ..  ..  ..        ..  ..  ..  ..
-		The 2-device layout is equivalent 2-way RAID1.  The 4-device
-		layout is what a traditional RAID10 would look like.  The
-		3-device layout is what might be called a 'RAID1E - Integrated
-		Adjacent Stripe Mirroring'.
-
-		If 'raid10_copies 2' and 'raid10_format far', then the layouts
-		for 2, 3 and 4 devices are:
-		2 drives             3 drives             4 drives
-		--------             --------------       --------------------
-		A1  A2               A1   A2   A3         A1   A2   A3   A4
-		A3  A4               A4   A5   A6         A5   A6   A7   A8
-		A5  A6               A7   A8   A9         A9   A10  A11  A12
-		..  ..               ..   ..   ..         ..   ..   ..   ..
-		A2  A1               A3   A1   A2         A2   A1   A4   A3
-		A4  A3               A6   A4   A5         A6   A5   A8   A7
-		A6  A5               A9   A7   A8         A10  A9   A12  A11
-		..  ..               ..   ..   ..         ..   ..   ..   ..
-
-		If 'raid10_copies 2' and 'raid10_format offset', then the
-		layouts for 2, 3 and 4 devices are:
-		2 drives       3 drives           4 drives
-		--------       ------------       -----------------
-		A1  A2         A1  A2  A3         A1  A2  A3  A4
-		A2  A1         A3  A1  A2         A2  A1  A4  A3
-		A3  A4         A4  A5  A6         A5  A6  A7  A8
-		A4  A3         A6  A4  A5         A6  A5  A8  A7
-		A5  A6         A7  A8  A9         A9  A10 A11 A12
-		A6  A5         A9  A7  A8         A10 A9  A12 A11
-		..  ..         ..  ..  ..         ..  ..  ..  ..
-		Here we see layouts closely akin to 'RAID1E - Integrated
-		Offset Stripe Mirroring'.
-
-        [delta_disks <N>]
-		The delta_disks option value (-251 < N < +251) triggers
-		device removal (negative value) or device addition (positive
-		value) to any reshape supporting raid levels 4/5/6 and 10.
-		RAID levels 4/5/6 allow for addition of devices (metadata
-		and data device tuple), raid10_near and raid10_offset only
-		allow for device addition. raid10_far does not support any
-		reshaping at all.
-		A minimum of devices have to be kept to enforce resilience,
-		which is 3 devices for raid4/5 and 4 devices for raid6.
-
-        [data_offset <sectors>]
-		This option value defines the offset into each data device
-		where the data starts. This is used to provide out-of-place
-		reshaping space to avoid writing over data while
-		changing the layout of stripes, hence an interruption/crash
-		may happen at any time without the risk of losing data.
-		E.g. when adding devices to an existing raid set during
-		forward reshaping, the out-of-place space will be allocated
-		at the beginning of each raid device. The kernel raid4/5/6/10
-		MD personalities supporting such device addition will read the data from
-		the existing first stripes (those with smaller number of stripes)
-		starting at data_offset to fill up a new stripe with the larger
-		number of stripes, calculate the redundancy blocks (CRC/Q-syndrome)
-		and write that new stripe to offset 0. Same will be applied to all
-		N-1 other new stripes. This out-of-place scheme is used to change
-		the RAID type (i.e. the allocation algorithm) as well, e.g.
-		changing from raid5_ls to raid5_n.
-
-	[journal_dev <dev>]
-		This option adds a journal device to raid4/5/6 raid sets and
-		uses it to close the 'write hole' caused by the non-atomic updates
-		to the component devices which can cause data loss during recovery.
-		The journal device is used as writethrough thus causing writes to
-		be throttled versus non-journaled raid4/5/6 sets.
-		Takeover/reshape is not possible with a raid4/5/6 journal device;
-		it has to be deconfigured before requesting these.
-
-	[journal_mode <mode>]
-		This option sets the caching mode on journaled raid4/5/6 raid sets
-		(see 'journal_dev <dev>' above) to 'writethrough' or 'writeback'.
-		If 'writeback' is selected the journal device has to be resilient
-		and must not suffer from the 'write hole' problem itself (e.g. use
-		raid1 or raid10) to avoid a single point of failure.
-
-<#raid_devs>: The number of devices composing the array.
-	Each device consists of two entries.  The first is the device
-	containing the metadata (if any); the second is the one containing the
-	data. A Maximum of 64 metadata/data device entries are supported
-	up to target version 1.8.0.
-	1.9.0 supports up to 253 which is enforced by the used MD kernel runtime.
-
-	If a drive has failed or is missing at creation time, a '-' can be
-	given for both the metadata and data drives for a given position.
-
-
-Example Tables
---------------
-# RAID4 - 4 data drives, 1 parity (no metadata devices)
-# No metadata devices specified to hold superblock/bitmap info
-# Chunk size of 1MiB
-# (Lines separated for easy reading)
-
-0 1960893648 raid \
-        raid4 1 2048 \
-        5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
-
-# RAID4 - 4 data drives, 1 parity (with metadata devices)
-# Chunk size of 1MiB, force RAID initialization,
-#       min recovery rate at 20 kiB/sec/disk
-
-0 1960893648 raid \
-        raid4 4 2048 sync min_recovery_rate 20 \
-        5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82
-
-
-Status Output
--------------
-'dmsetup table' displays the table used to construct the mapping.
-The optional parameters are always printed in the order listed
-above with "sync" or "nosync" always output ahead of the other
-arguments, regardless of the order used when originally loading the table.
-Arguments that can be repeated are ordered by value.
-
-
-'dmsetup status' yields information on the state and health of the array.
-The output is as follows (normally a single line, but expanded here for
-clarity):
-1: <s> <l> raid \
-2:      <raid_type> <#devices> <health_chars> \
-3:      <sync_ratio> <sync_action> <mismatch_cnt>
-
-Line 1 is the standard output produced by device-mapper.
-Line 2 & 3 are produced by the raid target and are best explained by example:
-        0 1960893648 raid raid4 5 AAAAA 2/490221568 init 0
-Here we can see the RAID type is raid4, there are 5 devices - all of
-which are 'A'live, and the array is 2/490221568 complete with its initial
-recovery.  Here is a fuller description of the individual fields:
-	<raid_type>     Same as the <raid_type> used to create the array.
-	<health_chars>  One char for each device, indicating: 'A' = alive and
-			in-sync, 'a' = alive but not in-sync, 'D' = dead/failed.
-	<sync_ratio>    The ratio indicating how much of the array has undergone
-			the process described by 'sync_action'.  If the
-			'sync_action' is "check" or "repair", then the process
-			of "resync" or "recover" can be considered complete.
-	<sync_action>   One of the following possible states:
-			idle    - No synchronization action is being performed.
-			frozen  - The current action has been halted.
-			resync  - Array is undergoing its initial synchronization
-				  or is resynchronizing after an unclean shutdown
-				  (possibly aided by a bitmap).
-			recover - A device in the array is being rebuilt or
-				  replaced.
-			check   - A user-initiated full check of the array is
-				  being performed.  All blocks are read and
-				  checked for consistency.  The number of
-				  discrepancies found are recorded in
-				  <mismatch_cnt>.  No changes are made to the
-				  array by this action.
-			repair  - The same as "check", but discrepancies are
-				  corrected.
-			reshape - The array is undergoing a reshape.
-	<mismatch_cnt>  The number of discrepancies found between mirror copies
-			in RAID1/10 or wrong parity values found in RAID4/5/6.
-			This value is valid only after a "check" of the array
-			is performed.  A healthy array has a 'mismatch_cnt' of 0.
-	<data_offset>   The current data offset to the start of the user data on
-			each component device of a raid set (see the respective
-			raid parameter to support out-of-place reshaping).
-	<journal_char>	'A' - active write-through journal device.
-			'a' - active write-back journal device.
-			'D' - dead journal device.
-			'-' - no journal device.
-
-
-Message Interface
------------------
-The dm-raid target will accept certain actions through the 'message' interface.
-('man dmsetup' for more information on the message interface.)  These actions
-include:
-	"idle"   - Halt the current sync action.
-	"frozen" - Freeze the current sync action.
-	"resync" - Initiate/continue a resync.
-	"recover"- Initiate/continue a recover process.
-	"check"  - Initiate a check (i.e. a "scrub") of the array.
-	"repair" - Initiate a repair of the array.
-
-
-Discard Support
----------------
-The implementation of discard support among hardware vendors varies.
-When a block is discarded, some storage devices will return zeroes when
-the block is read.  These devices set the 'discard_zeroes_data'
-attribute.  Other devices will return random data.  Confusingly, some
-devices that advertise 'discard_zeroes_data' will not reliably return
-zeroes when discarded blocks are read!  Since RAID 4/5/6 uses blocks
-from a number of devices to calculate parity blocks and (for performance
-reasons) relies on 'discard_zeroes_data' being reliable, it is important
-that the devices be consistent.  Blocks may be discarded in the middle
-of a RAID 4/5/6 stripe and if subsequent read results are not
-consistent, the parity blocks may be calculated differently at any time;
-making the parity blocks useless for redundancy.  It is important to
-understand how your hardware behaves with discards if you are going to
-enable discards with RAID 4/5/6.
-
-Since the behavior of storage devices is unreliable in this respect,
-even when reporting 'discard_zeroes_data', by default RAID 4/5/6
-discard support is disabled -- this ensures data integrity at the
-expense of losing some performance.
-
-Storage devices that properly support 'discard_zeroes_data' are
-increasingly whitelisted in the kernel and can thus be trusted.
-
-For trusted devices, the following dm-raid module parameter can be set
-to safely enable discard support for RAID 4/5/6:
-    'devices_handle_discards_safely'
-
-
-Version History
----------------
-1.0.0	Initial version.  Support for RAID 4/5/6
-1.1.0	Added support for RAID 1
-1.2.0	Handle creation of arrays that contain failed devices.
-1.3.0	Added support for RAID 10
-1.3.1	Allow device replacement/rebuild for RAID 10
-1.3.2   Fix/improve redundancy checking for RAID10
-1.4.0	Non-functional change.  Removes arg from mapping function.
-1.4.1   RAID10 fix redundancy validation checks (commit 55ebbb5).
-1.4.2   Add RAID10 "far" and "offset" algorithm support.
-1.5.0   Add message interface to allow manipulation of the sync_action.
-	New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt.
-1.5.1   Add ability to restore transiently failed devices on resume.
-1.5.2   'mismatch_cnt' is zero unless [last_]sync_action is "check".
-1.6.0   Add discard support (and devices_handle_discard_safely module param).
-1.7.0   Add support for MD RAID0 mappings.
-1.8.0   Explicitly check for compatible flags in the superblock metadata
-	and reject to start the raid set if any are set by a newer
-	target version, thus avoiding data corruption on a raid set
-	with a reshape in progress.
-1.9.0   Add support for RAID level takeover/reshape/region size
-	and set size reduction.
-1.9.1   Fix activation of existing RAID 4/10 mapped devices
-1.9.2   Don't emit '- -' on the status table line in case the constructor
-	fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and
-	'D' on the status line.  If '- -' is passed into the constructor, emit
-	'- -' on the table line and '-' as the status line health character.
-1.10.0  Add support for raid4/5/6 journal device
-1.10.1  Fix data corruption on reshape request
-1.11.0  Fix table line argument order
-	(wrong raid10_copies/raid10_format sequence)
-1.11.1  Add raid4/5/6 journal write-back support via journal_mode option
-1.12.1  Fix for MD deadlock between mddev_suspend() and md_write_start() available
-1.13.0  Fix dev_health status at end of "recover" (was 'a', now 'A')
-1.13.1  Fix deadlock caused by early md_stop_writes().  Also fix size an
-	state races.
-1.13.2  Fix raid redundancy validation and avoid keeping raid set frozen
-1.14.0  Fix reshape race on small devices.  Fix stripe adding reshape
-	deadlock/potential data corruption.  Update superblock when
-	specific devices are requested via rebuild.  Fix RAID leg
-	rebuild errors.
diff --git a/Documentation/device-mapper/dm-service-time.txt b/Documentation/device-mapper/dm-service-time.txt
deleted file mode 100644
index fb1d4a0cf122..000000000000
--- a/Documentation/device-mapper/dm-service-time.txt
+++ /dev/null
@@ -1,91 +0,0 @@
-dm-service-time
-===============
-
-dm-service-time is a path selector module for device-mapper targets,
-which selects a path with the shortest estimated service time for
-the incoming I/O.
-
-The service time for each path is estimated by dividing the total size
-of in-flight I/Os on a path with the performance value of the path.
-The performance value is a relative throughput value among all paths
-in a path-group, and it can be specified as a table argument.
-
-The path selector name is 'service-time'.
-
-Table parameters for each path: [<repeat_count> [<relative_throughput>]]
-	<repeat_count>: The number of I/Os to dispatch using the selected
-			path before switching to the next path.
-			If not given, internal default is used.  To check
-			the default value, see the activated table.
-	<relative_throughput>: The relative throughput value of the path
-			among all paths in the path-group.
-			The valid range is 0-100.
-			If not given, minimum value '1' is used.
-			If '0' is given, the path isn't selected while
-			other paths having a positive value are available.
-
-Status for each path: <status> <fail-count> <in-flight-size> \
-		      <relative_throughput>
-	<status>: 'A' if the path is active, 'F' if the path is failed.
-	<fail-count>: The number of path failures.
-	<in-flight-size>: The size of in-flight I/Os on the path.
-	<relative_throughput>: The relative throughput value of the path
-			among all paths in the path-group.
-
-
-Algorithm
-=========
-
-dm-service-time adds the I/O size to 'in-flight-size' when the I/O is
-dispatched and subtracts when completed.
-Basically, dm-service-time selects a path having minimum service time
-which is calculated by:
-
-	('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput'
-
-However, some optimizations below are used to reduce the calculation
-as much as possible.
-
-	1. If the paths have the same 'relative_throughput', skip
-	   the division and just compare the 'in-flight-size'.
-
-	2. If the paths have the same 'in-flight-size', skip the division
-	   and just compare the 'relative_throughput'.
-
-	3. If some paths have non-zero 'relative_throughput' and others
-	   have zero 'relative_throughput', ignore those paths with zero
-	   'relative_throughput'.
-
-If such optimizations can't be applied, calculate service time, and
-compare service time.
-If calculated service time is equal, the path having maximum
-'relative_throughput' may be better.  So compare 'relative_throughput'
-then.
-
-
-Examples
-========
-In case that 2 paths (sda and sdb) are used with repeat_count == 128
-and sda has an average throughput 1GB/s and sdb has 4GB/s,
-'relative_throughput' value may be '1' for sda and '4' for sdb.
-
-# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
-  dmsetup create test
-#
-# dmsetup table
-test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4
-#
-# dmsetup status
-test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4
-
-
-Or '2' for sda and '8' for sdb would be also true.
-
-# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \
-  dmsetup create test
-#
-# dmsetup table
-test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8
-#
-# dmsetup status
-test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8
diff --git a/Documentation/device-mapper/dm-uevent.txt b/Documentation/device-mapper/dm-uevent.txt
deleted file mode 100644
index 07edbd85c714..000000000000
--- a/Documentation/device-mapper/dm-uevent.txt
+++ /dev/null
@@ -1,97 +0,0 @@
-The device-mapper uevent code adds the capability to device-mapper to create
-and send kobject uevents (uevents).  Previously device-mapper events were only
-available through the ioctl interface.  The advantage of the uevents interface
-is the event contains environment attributes providing increased context for
-the event avoiding the need to query the state of the device-mapper device after
-the event is received.
-
-There are two functions currently for device-mapper events.  The first function
-listed creates the event and the second function sends the event(s).
-
-void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
-                    const char *path, unsigned nr_valid_paths)
-
-void dm_send_uevents(struct list_head *events, struct kobject *kobj)
-
-
-The variables added to the uevent environment are:
-
-Variable Name: DM_TARGET
-Uevent Action(s): KOBJ_CHANGE
-Type: string
-Description:
-Value: Name of device-mapper target that generated the event.
-
-Variable Name: DM_ACTION
-Uevent Action(s): KOBJ_CHANGE
-Type: string
-Description:
-Value: Device-mapper specific action that caused the uevent action.
-	PATH_FAILED - A path has failed.
-	PATH_REINSTATED - A path has been reinstated.
-
-Variable Name: DM_SEQNUM
-Uevent Action(s): KOBJ_CHANGE
-Type: unsigned integer
-Description: A sequence number for this specific device-mapper device.
-Value: Valid unsigned integer range.
-
-Variable Name: DM_PATH
-Uevent Action(s): KOBJ_CHANGE
-Type: string
-Description: Major and minor number of the path device pertaining to this
-event.
-Value: Path name in the form of "Major:Minor"
-
-Variable Name: DM_NR_VALID_PATHS
-Uevent Action(s): KOBJ_CHANGE
-Type: unsigned integer
-Description:
-Value: Valid unsigned integer range.
-
-Variable Name: DM_NAME
-Uevent Action(s): KOBJ_CHANGE
-Type: string
-Description: Name of the device-mapper device.
-Value: Name
-
-Variable Name: DM_UUID
-Uevent Action(s): KOBJ_CHANGE
-Type: string
-Description: UUID of the device-mapper device.
-Value: UUID. (Empty string if there isn't one.)
-
-An example of the uevents generated as captured by udevmonitor is shown
-below.
-
-1.) Path failure.
-UEVENT[1192521009.711215] change@/block/dm-3
-ACTION=change
-DEVPATH=/block/dm-3
-SUBSYSTEM=block
-DM_TARGET=multipath
-DM_ACTION=PATH_FAILED
-DM_SEQNUM=1
-DM_PATH=8:32
-DM_NR_VALID_PATHS=0
-DM_NAME=mpath2
-DM_UUID=mpath-35333333000002328
-MINOR=3
-MAJOR=253
-SEQNUM=1130
-
-2.) Path reinstate.
-UEVENT[1192521132.989927] change@/block/dm-3
-ACTION=change
-DEVPATH=/block/dm-3
-SUBSYSTEM=block
-DM_TARGET=multipath
-DM_ACTION=PATH_REINSTATED
-DM_SEQNUM=2
-DM_PATH=8:32
-DM_NR_VALID_PATHS=1
-DM_NAME=mpath2
-DM_UUID=mpath-35333333000002328
-MINOR=3
-MAJOR=253
-SEQNUM=1131
diff --git a/Documentation/device-mapper/dm-zoned.txt b/Documentation/device-mapper/dm-zoned.txt
deleted file mode 100644
index 736fcc78d193..000000000000
--- a/Documentation/device-mapper/dm-zoned.txt
+++ /dev/null
@@ -1,144 +0,0 @@
-dm-zoned
-========
-
-The dm-zoned device mapper target exposes a zoned block device (ZBC and
-ZAC compliant devices) as a regular block device without any write
-pattern constraints. In effect, it implements a drive-managed zoned
-block device which hides from the user (a file system or an application
-doing raw block device accesses) the sequential write constraints of
-host-managed zoned block devices and can mitigate the potential
-device-side performance degradation due to excessive random writes on
-host-aware zoned block devices.
-
-For a more detailed description of the zoned block device models and
-their constraints see (for SCSI devices):
-
-http://www.t10.org/drafts.htm#ZBC_Family
-
-and (for ATA devices):
-
-http://www.t13.org/Documents/UploadedDocuments/docs2015/di537r05-Zoned_Device_ATA_Command_Set_ZAC.pdf
-
-The dm-zoned implementation is simple and minimizes system overhead (CPU
-and memory usage as well as storage capacity loss). For a 10TB
-host-managed disk with 256 MB zones, dm-zoned memory usage per disk
-instance is at most 4.5 MB and as little as 5 zones will be used
-internally for storing metadata and performaing reclaim operations.
-
-dm-zoned target devices are formatted and checked using the dmzadm
-utility available at:
-
-https://github.com/hgst/dm-zoned-tools
-
-Algorithm
-=========
-
-dm-zoned implements an on-disk buffering scheme to handle non-sequential
-write accesses to the sequential zones of a zoned block device.
-Conventional zones are used for caching as well as for storing internal
-metadata.
-
-The zones of the device are separated into 2 types:
-
-1) Metadata zones: these are conventional zones used to store metadata.
-Metadata zones are not reported as useable capacity to the user.
-
-2) Data zones: all remaining zones, the vast majority of which will be
-sequential zones used exclusively to store user data. The conventional
-zones of the device may be used also for buffering user random writes.
-Data in these zones may be directly mapped to the conventional zone, but
-later moved to a sequential zone so that the conventional zone can be
-reused for buffering incoming random writes.
-
-dm-zoned exposes a logical device with a sector size of 4096 bytes,
-irrespective of the physical sector size of the backend zoned block
-device being used. This allows reducing the amount of metadata needed to
-manage valid blocks (blocks written).
-
-The on-disk metadata format is as follows:
-
-1) The first block of the first conventional zone found contains the
-super block which describes the on disk amount and position of metadata
-blocks.
-
-2) Following the super block, a set of blocks is used to describe the
-mapping of the logical device blocks. The mapping is done per chunk of
-blocks, with the chunk size equal to the zoned block device size. The
-mapping table is indexed by chunk number and each mapping entry
-indicates the zone number of the device storing the chunk of data. Each
-mapping entry may also indicate if the zone number of a conventional
-zone used to buffer random modification to the data zone.
-
-3) A set of blocks used to store bitmaps indicating the validity of
-blocks in the data zones follows the mapping table. A valid block is
-defined as a block that was written and not discarded. For a buffered
-data chunk, a block is always valid only in the data zone mapping the
-chunk or in the buffer zone of the chunk.
-
-For a logical chunk mapped to a conventional zone, all write operations
-are processed by directly writing to the zone. If the mapping zone is a
-sequential zone, the write operation is processed directly only if the
-write offset within the logical chunk is equal to the write pointer
-offset within of the sequential data zone (i.e. the write operation is
-aligned on the zone write pointer). Otherwise, write operations are
-processed indirectly using a buffer zone. In that case, an unused
-conventional zone is allocated and assigned to the chunk being
-accessed. Writing a block to the buffer zone of a chunk will
-automatically invalidate the same block in the sequential zone mapping
-the chunk. If all blocks of the sequential zone become invalid, the zone
-is freed and the chunk buffer zone becomes the primary zone mapping the
-chunk, resulting in native random write performance similar to a regular
-block device.
-
-Read operations are processed according to the block validity
-information provided by the bitmaps. Valid blocks are read either from
-the sequential zone mapping a chunk, or if the chunk is buffered, from
-the buffer zone assigned. If the accessed chunk has no mapping, or the
-accessed blocks are invalid, the read buffer is zeroed and the read
-operation terminated.
-
-After some time, the limited number of convnetional zones available may
-be exhausted (all used to map chunks or buffer sequential zones) and
-unaligned writes to unbuffered chunks become impossible. To avoid this
-situation, a reclaim process regularly scans used conventional zones and
-tries to reclaim the least recently used zones by copying the valid
-blocks of the buffer zone to a free sequential zone. Once the copy
-completes, the chunk mapping is updated to point to the sequential zone
-and the buffer zone freed for reuse.
-
-Metadata Protection
-===================
-
-To protect metadata against corruption in case of sudden power loss or
-system crash, 2 sets of metadata zones are used. One set, the primary
-set, is used as the main metadata region, while the secondary set is
-used as a staging area. Modified metadata is first written to the
-secondary set and validated by updating the super block in the secondary
-set, a generation counter is used to indicate that this set contains the
-newest metadata. Once this operation completes, in place of metadata
-block updates can be done in the primary metadata set. This ensures that
-one of the set is always consistent (all modifications committed or none
-at all). Flush operations are used as a commit point. Upon reception of
-a flush request, metadata modification activity is temporarily blocked
-(for both incoming BIO processing and reclaim process) and all dirty
-metadata blocks are staged and updated. Normal operation is then
-resumed. Flushing metadata thus only temporarily delays write and
-discard requests. Read requests can be processed concurrently while
-metadata flush is being executed.
-
-Usage
-=====
-
-A zoned block device must first be formatted using the dmzadm tool. This
-will analyze the device zone configuration, determine where to place the
-metadata sets on the device and initialize the metadata sets.
-
-Ex:
-
-dmzadm --format /dev/sdxx
-
-For a formatted device, the target can be created normally with the
-dmsetup utility. The only parameter that dm-zoned requires is the
-underlying zoned block device name. Ex:
-
-echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | dmsetup create dmz-`basename ${dev}`
diff --git a/Documentation/device-mapper/era.txt b/Documentation/device-mapper/era.txt
deleted file mode 100644
index 3c6d01be3560..000000000000
--- a/Documentation/device-mapper/era.txt
+++ /dev/null
@@ -1,108 +0,0 @@
-Introduction
-============
-
-dm-era is a target that behaves similar to the linear target.  In
-addition it keeps track of which blocks were written within a user
-defined period of time called an 'era'.  Each era target instance
-maintains the current era as a monotonically increasing 32-bit
-counter.
-
-Use cases include tracking changed blocks for backup software, and
-partially invalidating the contents of a cache to restore cache
-coherency after rolling back a vendor snapshot.
-
-Constructor
-===========
-
- era <metadata dev> <origin dev> <block size>
-
- metadata dev    : fast device holding the persistent metadata
- origin dev	 : device holding data blocks that may change
- block size      : block size of origin data device, granularity that is
-		     tracked by the target
-
-Messages
-========
-
-None of the dm messages take any arguments.
-
-checkpoint
-----------
-
-Possibly move to a new era.  You shouldn't assume the era has
-incremented.  After sending this message, you should check the
-current era via the status line.
-
-take_metadata_snap
-------------------
-
-Create a clone of the metadata, to allow a userland process to read it.
-
-drop_metadata_snap
-------------------
-
-Drop the metadata snapshot.
-
-Status
-======
-
-<metadata block size> <#used metadata blocks>/<#total metadata blocks>
-<current era> <held metadata root | '-'>
-
-metadata block size	 : Fixed block size for each metadata block in
-			     sectors
-#used metadata blocks	 : Number of metadata blocks used
-#total metadata blocks	 : Total number of metadata blocks
-current era		 : The current era
-held metadata root	 : The location, in blocks, of the metadata root
-			     that has been 'held' for userspace read
-			     access. '-' indicates there is no held root
-
-Detailed use case
-=================
-
-The scenario of invalidating a cache when rolling back a vendor
-snapshot was the primary use case when developing this target:
-
-Taking a vendor snapshot
-------------------------
-
-- Send a checkpoint message to the era target
-- Make a note of the current era in its status line
-- Take vendor snapshot (the era and snapshot should be forever
-  associated now).
-
-Rolling back to an vendor snapshot
-----------------------------------
-
-- Cache enters passthrough mode (see: dm-cache's docs in cache.txt)
-- Rollback vendor storage
-- Take metadata snapshot
-- Ascertain which blocks have been written since the snapshot was taken
-  by checking each block's era
-- Invalidate those blocks in the caching software
-- Cache returns to writeback/writethrough mode
-
-Memory usage
-============
-
-The target uses a bitset to record writes in the current era.  It also
-has a spare bitset ready for switching over to a new era.  Other than
-that it uses a few 4k blocks for updating metadata.
-
-   (4 * nr_blocks) bytes + buffers
-
-Resilience
-==========
-
-Metadata is updated on disk before a write to a previously unwritten
-block is performed.  As such dm-era should not be effected by a hard
-crash such as power failure.
-
-Userland tools
-==============
-
-Userland tools are found in the increasingly poorly named
-thin-provisioning-tools project:
-
-    https://github.com/jthornber/thin-provisioning-tools
diff --git a/Documentation/device-mapper/kcopyd.txt b/Documentation/device-mapper/kcopyd.txt
deleted file mode 100644
index 820382c4cecf..000000000000
--- a/Documentation/device-mapper/kcopyd.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-kcopyd
-======
-
-Kcopyd provides the ability to copy a range of sectors from one block-device
-to one or more other block-devices, with an asynchronous completion
-notification. It is used by dm-snapshot and dm-mirror.
-
-Users of kcopyd must first create a client and indicate how many memory pages
-to set aside for their copy jobs. This is done with a call to
-kcopyd_client_create().
-
-   int kcopyd_client_create(unsigned int num_pages,
-                            struct kcopyd_client **result);
-
-To start a copy job, the user must set up io_region structures to describe
-the source and destinations of the copy. Each io_region indicates a
-block-device along with the starting sector and size of the region. The source
-of the copy is given as one io_region structure, and the destinations of the
-copy are given as an array of io_region structures.
-
-   struct io_region {
-      struct block_device *bdev;
-      sector_t sector;
-      sector_t count;
-   };
-
-To start the copy, the user calls kcopyd_copy(), passing in the client
-pointer, pointers to the source and destination io_regions, the name of a
-completion callback routine, and a pointer to some context data for the copy.
-
-   int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
-                   unsigned int num_dests, struct io_region *dests,
-                   unsigned int flags, kcopyd_notify_fn fn, void *context);
-
-   typedef void (*kcopyd_notify_fn)(int read_err, unsigned int write_err,
-				    void *context);
-
-When the copy completes, kcopyd will call the user's completion routine,
-passing back the user's context pointer. It will also indicate if a read or
-write error occurred during the copy.
-
-When a user is done with all their copy jobs, they should call
-kcopyd_client_destroy() to delete the kcopyd client, which will release the
-associated memory pages.
-
-   void kcopyd_client_destroy(struct kcopyd_client *kc);
-
diff --git a/Documentation/device-mapper/linear.txt b/Documentation/device-mapper/linear.txt
deleted file mode 100644
index 7cb98d89d3f8..000000000000
--- a/Documentation/device-mapper/linear.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-dm-linear
-=========
-
-Device-Mapper's "linear" target maps a linear range of the Device-Mapper
-device onto a linear range of another device.  This is the basic building
-block of logical volume managers.
-
-Parameters: <dev path> <offset>
-    <dev path>: Full pathname to the underlying block-device, or a
-                "major:minor" device-number.
-    <offset>: Starting sector within the device.
-
-
-Example scripts
-===============
-[[
-#!/bin/sh
-# Create an identity mapping for a device
-echo "0 `blockdev --getsz $1` linear $1 0" | dmsetup create identity
-]]
-
-
-[[
-#!/bin/sh
-# Join 2 devices together
-size1=`blockdev --getsz $1`
-size2=`blockdev --getsz $2`
-echo "0 $size1 linear $1 0
-$size1 $size2 linear $2 0" | dmsetup create joined
-]]
-
-
-[[
-#!/usr/bin/perl -w
-# Split a device into 4M chunks and then join them together in reverse order.
-
-my $name = "reverse";
-my $extent_size = 4 * 1024 * 2;
-my $dev = $ARGV[0];
-my $table = "";
-my $count = 0;
-
-if (!defined($dev)) {
-        die("Please specify a device.\n");
-}
-
-my $dev_size = `blockdev --getsz $dev`;
-my $extents = int($dev_size / $extent_size) -
-              (($dev_size % $extent_size) ? 1 : 0);
-
-while ($extents > 0) {
-        my $this_start = $count * $extent_size;
-        $extents--;
-        $count++;
-        my $this_offset = $extents * $extent_size;
-
-        $table .= "$this_start $extent_size linear $dev $this_offset\n";
-}
-
-`echo \"$table\" | dmsetup create $name`;
-]]
diff --git a/Documentation/device-mapper/log-writes.txt b/Documentation/device-mapper/log-writes.txt
deleted file mode 100644
index b638d124be6a..000000000000
--- a/Documentation/device-mapper/log-writes.txt
+++ /dev/null
@@ -1,140 +0,0 @@
-dm-log-writes
-=============
-
-This target takes 2 devices, one to pass all IO to normally, and one to log all
-of the write operations to.  This is intended for file system developers wishing
-to verify the integrity of metadata or data as the file system is written to.
-There is a log_write_entry written for every WRITE request and the target is
-able to take arbitrary data from userspace to insert into the log.  The data
-that is in the WRITE requests is copied into the log to make the replay happen
-exactly as it happened originally.
-
-Log Ordering
-============
-
-We log things in order of completion once we are sure the write is no longer in
-cache.  This means that normal WRITE requests are not actually logged until the
-next REQ_PREFLUSH request.  This is to make it easier for userspace to replay
-the log in a way that correlates to what is on disk and not what is in cache,
-to make it easier to detect improper waiting/flushing.
-
-This works by attaching all WRITE requests to a list once the write completes.
-Once we see a REQ_PREFLUSH request we splice this list onto the request and once
-the FLUSH request completes we log all of the WRITEs and then the FLUSH.  Only
-completed WRITEs, at the time the REQ_PREFLUSH is issued, are added in order to
-simulate the worst case scenario with regard to power failures.  Consider the
-following example (W means write, C means complete):
-
-W1,W2,W3,C3,C2,Wflush,C1,Cflush
-
-The log would show the following
-
-W3,W2,flush,W1....
-
-Again this is to simulate what is actually on disk, this allows us to detect
-cases where a power failure at a particular point in time would create an
-inconsistent file system.
-
-Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as
-they complete as those requests will obviously bypass the device cache.
-
-Any REQ_OP_DISCARD requests are treated like WRITE requests.  Otherwise we would
-have all the DISCARD requests, and then the WRITE requests and then the FLUSH
-request.  Consider the following example:
-
-WRITE block 1, DISCARD block 1, FLUSH
-
-If we logged DISCARD when it completed, the replay would look like this
-
-DISCARD 1, WRITE 1, FLUSH
-
-which isn't quite what happened and wouldn't be caught during the log replay.
-
-Target interface
-================
-
-i) Constructor
-
-   log-writes <dev_path> <log_dev_path>
-
-   dev_path	: Device that all of the IO will go to normally.
-   log_dev_path : Device where the log entries are written to.
-
-ii) Status
-
-    <#logged entries> <highest allocated sector>
-
-    #logged entries	       : Number of logged entries
-    highest allocated sector   : Highest allocated sector
-
-iii) Messages
-
-    mark <description>
-
-	You can use a dmsetup message to set an arbitrary mark in a log.
-	For example say you want to fsck a file system after every
-	write, but first you need to replay up to the mkfs to make sure
-	we're fsck'ing something reasonable, you would do something like
-	this:
-
-	  mkfs.btrfs -f /dev/mapper/log
-	  dmsetup message log 0 mark mkfs
-	  <run test>
-
-	  This would allow you to replay the log up to the mkfs mark and
-	  then replay from that point on doing the fsck check in the
-	  interval that you want.
-
-	Every log has a mark at the end labeled "dm-log-writes-end".
-
-Userspace component
-===================
-
-There is a userspace tool that will replay the log for you in various ways.
-It can be found here: https://github.com/josefbacik/log-writes
-
-Example usage
-=============
-
-Say you want to test fsync on your file system.  You would do something like
-this:
-
-TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
-dmsetup create log --table "$TABLE"
-mkfs.btrfs -f /dev/mapper/log
-dmsetup message log 0 mark mkfs
-
-mount /dev/mapper/log /mnt/btrfs-test
-<some test that does fsync at the end>
-dmsetup message log 0 mark fsync
-md5sum /mnt/btrfs-test/foo
-umount /mnt/btrfs-test
-
-dmsetup remove log
-replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync
-mount /dev/sdb /mnt/btrfs-test
-md5sum /mnt/btrfs-test/foo
-<verify md5sum's are correct>
-
-Another option is to do a complicated file system operation and verify the file
-system is consistent during the entire operation.  You could do this with:
-
-TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
-dmsetup create log --table "$TABLE"
-mkfs.btrfs -f /dev/mapper/log
-dmsetup message log 0 mark mkfs
-
-mount /dev/mapper/log /mnt/btrfs-test
-<fsstress to dirty the fs>
-btrfs filesystem balance /mnt/btrfs-test
-umount /mnt/btrfs-test
-dmsetup remove log
-
-replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs
-btrfsck /dev/sdb
-replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \
-	--fsck "btrfsck /dev/sdb" --check fua
-
-And that will replay the log until it sees a FUA request, run the fsck command
-and if the fsck passes it will replay to the next FUA, until it is completed or
-the fsck command exists abnormally.
diff --git a/Documentation/device-mapper/persistent-data.txt b/Documentation/device-mapper/persistent-data.txt
deleted file mode 100644
index a333bcb3a6c2..000000000000
--- a/Documentation/device-mapper/persistent-data.txt
+++ /dev/null
@@ -1,84 +0,0 @@
-Introduction
-============
-
-The more-sophisticated device-mapper targets require complex metadata
-that is managed in kernel.  In late 2010 we were seeing that various
-different targets were rolling their own data structures, for example:
-
-- Mikulas Patocka's multisnap implementation
-- Heinz Mauelshagen's thin provisioning target
-- Another btree-based caching target posted to dm-devel
-- Another multi-snapshot target based on a design of Daniel Phillips
-
-Maintaining these data structures takes a lot of work, so if possible
-we'd like to reduce the number.
-
-The persistent-data library is an attempt to provide a re-usable
-framework for people who want to store metadata in device-mapper
-targets.  It's currently used by the thin-provisioning target and an
-upcoming hierarchical storage target.
-
-Overview
-========
-
-The main documentation is in the header files which can all be found
-under drivers/md/persistent-data.
-
-The block manager
------------------
-
-dm-block-manager.[hc]
-
-This provides access to the data on disk in fixed sized-blocks.  There
-is a read/write locking interface to prevent concurrent accesses, and
-keep data that is being used in the cache.
-
-Clients of persistent-data are unlikely to use this directly.
-
-The transaction manager
------------------------
-
-dm-transaction-manager.[hc]
-
-This restricts access to blocks and enforces copy-on-write semantics.
-The only way you can get hold of a writable block through the
-transaction manager is by shadowing an existing block (ie. doing
-copy-on-write) or allocating a fresh one.  Shadowing is elided within
-the same transaction so performance is reasonable.  The commit method
-ensures that all data is flushed before it writes the superblock.
-On power failure your metadata will be as it was when last committed.
-
-The Space Maps
---------------
-
-dm-space-map.h
-dm-space-map-metadata.[hc]
-dm-space-map-disk.[hc]
-
-On-disk data structures that keep track of reference counts of blocks.
-Also acts as the allocator of new blocks.  Currently two
-implementations: a simpler one for managing blocks on a different
-device (eg. thinly-provisioned data blocks); and one for managing
-the metadata space.  The latter is complicated by the need to store
-its own data within the space it's managing.
-
-The data structures
--------------------
-
-dm-btree.[hc]
-dm-btree-remove.c
-dm-btree-spine.c
-dm-btree-internal.h
-
-Currently there is only one data structure, a hierarchical btree.
-There are plans to add more.  For example, something with an
-array-like interface would see a lot of use.
-
-The btree is 'hierarchical' in that you can define it to be composed
-of nested btrees, and take multiple keys.  For example, the
-thin-provisioning target uses a btree with two levels of nesting.
-The first maps a device id to a mapping tree, and that in turn maps a
-virtual block to a physical block.
-
-Values stored in the btrees can have arbitrary size.  Keys are always
-64bits, although nesting allows you to use multiple keys.
diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt
deleted file mode 100644
index b8bbb516f989..000000000000
--- a/Documentation/device-mapper/snapshot.txt
+++ /dev/null
@@ -1,176 +0,0 @@
-Device-mapper snapshot support
-==============================
-
-Device-mapper allows you, without massive data copying:
-
-*) To create snapshots of any block device i.e. mountable, saved states of
-the block device which are also writable without interfering with the
-original content;
-*) To create device "forks", i.e. multiple different versions of the
-same data stream.
-*) To merge a snapshot of a block device back into the snapshot's origin
-device.
-
-In the first two cases, dm copies only the chunks of data that get
-changed and uses a separate copy-on-write (COW) block device for
-storage.
-
-For snapshot merge the contents of the COW storage are merged back into
-the origin device.
-
-
-There are three dm targets available:
-snapshot, snapshot-origin, and snapshot-merge.
-
-*) snapshot-origin <origin>
-
-which will normally have one or more snapshots based on it.
-Reads will be mapped directly to the backing device. For each write, the
-original data will be saved in the <COW device> of each snapshot to keep
-its visible content unchanged, at least until the <COW device> fills up.
-
-
-*) snapshot <origin> <COW device> <persistent?> <chunksize>
-
-A snapshot of the <origin> block device is created. Changed chunks of
-<chunksize> sectors will be stored on the <COW device>.  Writes will
-only go to the <COW device>.  Reads will come from the <COW device> or
-from <origin> for unchanged data.  <COW device> will often be
-smaller than the origin and if it fills up the snapshot will become
-useless and be disabled, returning errors.  So it is important to monitor
-the amount of free space and expand the <COW device> before it fills up.
-
-<persistent?> is P (Persistent) or N (Not persistent - will not survive
-after reboot).  O (Overflow) can be added as a persistent store option
-to allow userspace to advertise its support for seeing "Overflow" in the
-snapshot status.  So supported store types are "P", "PO" and "N".
-
-The difference between persistent and transient is with transient
-snapshots less metadata must be saved on disk - they can be kept in
-memory by the kernel.
-
-When loading or unloading the snapshot target, the corresponding
-snapshot-origin or snapshot-merge target must be suspended. A failure to
-suspend the origin target could result in data corruption.
-
-
-* snapshot-merge <origin> <COW device> <persistent> <chunksize>
-
-takes the same table arguments as the snapshot target except it only
-works with persistent snapshots.  This target assumes the role of the
-"snapshot-origin" target and must not be loaded if the "snapshot-origin"
-is still present for <origin>.
-
-Creates a merging snapshot that takes control of the changed chunks
-stored in the <COW device> of an existing snapshot, through a handover
-procedure, and merges these chunks back into the <origin>.  Once merging
-has started (in the background) the <origin> may be opened and the merge
-will continue while I/O is flowing to it.  Changes to the <origin> are
-deferred until the merging snapshot's corresponding chunk(s) have been
-merged.  Once merging has started the snapshot device, associated with
-the "snapshot" target, will return -EIO when accessed.
-
-
-How snapshot is used by LVM2
-============================
-When you create the first LVM2 snapshot of a volume, four dm devices are used:
-
-1) a device containing the original mapping table of the source volume;
-2) a device used as the <COW device>;
-3) a "snapshot" device, combining #1 and #2, which is the visible snapshot
-   volume;
-4) the "original" volume (which uses the device number used by the original
-   source volume), whose table is replaced by a "snapshot-origin" mapping
-   from device #1.
-
-A fixed naming scheme is used, so with the following commands:
-
-lvcreate -L 1G -n base volumeGroup
-lvcreate -L 100M --snapshot -n snap volumeGroup/base
-
-we'll have this situation (with volumes in above order):
-
-# dmsetup table|grep volumeGroup
-
-volumeGroup-base-real: 0 2097152 linear 8:19 384
-volumeGroup-snap-cow: 0 204800 linear 8:19 2097536
-volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16
-volumeGroup-base: 0 2097152 snapshot-origin 254:11
-
-# ls -lL /dev/mapper/volumeGroup-*
-brw-------  1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
-brw-------  1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
-brw-------  1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
-brw-------  1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base
-
-
-How snapshot-merge is used by LVM2
-==================================
-A merging snapshot assumes the role of the "snapshot-origin" while
-merging.  As such the "snapshot-origin" is replaced with
-"snapshot-merge".  The "-real" device is not changed and the "-cow"
-device is renamed to <origin name>-cow to aid LVM2's cleanup of the
-merging snapshot after it completes.  The "snapshot" that hands over its
-COW device to the "snapshot-merge" is deactivated (unless using lvchange
---refresh); but if it is left active it will simply return I/O errors.
-
-A snapshot will merge into its origin with the following command:
-
-lvconvert --merge volumeGroup/snap
-
-we'll now have this situation:
-
-# dmsetup table|grep volumeGroup
-
-volumeGroup-base-real: 0 2097152 linear 8:19 384
-volumeGroup-base-cow: 0 204800 linear 8:19 2097536
-volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
-
-# ls -lL /dev/mapper/volumeGroup-*
-brw-------  1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
-brw-------  1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
-brw-------  1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
-
-
-How to determine when a merging is complete
-===========================================
-The snapshot-merge and snapshot status lines end with:
-  <sectors_allocated>/<total_sectors> <metadata_sectors>
-
-Both <sectors_allocated> and <total_sectors> include both data and metadata.
-During merging, the number of sectors allocated gets smaller and
-smaller.  Merging has finished when the number of sectors holding data
-is zero, in other words <sectors_allocated> == <metadata_sectors>.
-
-Here is a practical example (using a hybrid of lvm and dmsetup commands):
-
-# lvs
-  LV      VG          Attr   LSize Origin  Snap%  Move Log Copy%  Convert
-  base    volumeGroup owi-a- 4.00g
-  snap    volumeGroup swi-a- 1.00g base  18.97
-
-# dmsetup status volumeGroup-snap
-0 8388608 snapshot 397896/2097152 1560
-                                  ^^^^ metadata sectors
-
-# lvconvert --merge -b volumeGroup/snap
-  Merging of volume snap started.
-
-# lvs volumeGroup/snap
-  LV      VG          Attr   LSize Origin  Snap%  Move Log Copy%  Convert
-  base    volumeGroup Owi-a- 4.00g          17.23
-
-# dmsetup status volumeGroup-base
-0 8388608 snapshot-merge 281688/2097152 1104
-
-# dmsetup status volumeGroup-base
-0 8388608 snapshot-merge 180480/2097152 712
-
-# dmsetup status volumeGroup-base
-0 8388608 snapshot-merge 16/2097152 16
-
-Merging has finished.
-
-# lvs
-  LV      VG          Attr   LSize Origin  Snap%  Move Log Copy%  Convert
-  base    volumeGroup owi-a- 4.00g
diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt
deleted file mode 100644
index 170ac02a1f50..000000000000
--- a/Documentation/device-mapper/statistics.txt
+++ /dev/null
@@ -1,223 +0,0 @@
-DM statistics
-=============
-
-Device Mapper supports the collection of I/O statistics on user-defined
-regions of a DM device.	 If no regions are defined no statistics are
-collected so there isn't any performance impact.  Only bio-based DM
-devices are currently supported.
-
-Each user-defined region specifies a starting sector, length and step.
-Individual statistics will be collected for each step-sized area within
-the range specified.
-
-The I/O statistics counters for each step-sized area of a region are
-in the same format as /sys/block/*/stat or /proc/diskstats (see:
-Documentation/iostats.txt).  But two extra counters (12 and 13) are
-provided: total time spent reading and writing.  When the histogram
-argument is used, the 14th parameter is reported that represents the
-histogram of latencies.  All these counters may be accessed by sending
-the @stats_print message to the appropriate DM device via dmsetup.
-
-The reported times are in milliseconds and the granularity depends on
-the kernel ticks.  When the option precise_timestamps is used, the
-reported times are in nanoseconds.
-
-Each region has a corresponding unique identifier, which we call a
-region_id, that is assigned when the region is created.	 The region_id
-must be supplied when querying statistics about the region, deleting the
-region, etc.  Unique region_ids enable multiple userspace programs to
-request and process statistics for the same DM device without stepping
-on each other's data.
-
-The creation of DM statistics will allocate memory via kmalloc or
-fallback to using vmalloc space.  At most, 1/4 of the overall system
-memory may be allocated by DM statistics.  The admin can see how much
-memory is used by reading
-/sys/module/dm_mod/parameters/stats_current_allocated_bytes
-
-Messages
-========
-
-    @stats_create <range> <step>
-		[<number_of_optional_arguments> <optional_arguments>...]
-		[<program_id> [<aux_data>]]
-
-	Create a new region and return the region_id.
-
-	<range>
-	  "-" - whole device
-	  "<start_sector>+<length>" - a range of <length> 512-byte sectors
-				      starting with <start_sector>.
-
-	<step>
-	  "<area_size>" - the range is subdivided into areas each containing
-			  <area_size> sectors.
-	  "/<number_of_areas>" - the range is subdivided into the specified
-				 number of areas.
-
-	<number_of_optional_arguments>
-	  The number of optional arguments
-
-	<optional_arguments>
-	  The following optional arguments are supported
-	  precise_timestamps - use precise timer with nanosecond resolution
-		instead of the "jiffies" variable.  When this argument is
-		used, the resulting times are in nanoseconds instead of
-		milliseconds.  Precise timestamps are a little bit slower
-		to obtain than jiffies-based timestamps.
-	  histogram:n1,n2,n3,n4,... - collect histogram of latencies.  The
-		numbers n1, n2, etc are times that represent the boundaries
-		of the histogram.  If precise_timestamps is not used, the
-		times are in milliseconds, otherwise they are in
-		nanoseconds.  For each range, the kernel will report the
-		number of requests that completed within this range. For
-		example, if we use "histogram:10,20,30", the kernel will
-		report four numbers a:b:c:d. a is the number of requests
-		that took 0-10 ms to complete, b is the number of requests
-		that took 10-20 ms to complete, c is the number of requests
-		that took 20-30 ms to complete and d is the number of
-		requests that took more than 30 ms to complete.
-
-	<program_id>
-	  An optional parameter.  A name that uniquely identifies
-	  the userspace owner of the range.  This groups ranges together
-	  so that userspace programs can identify the ranges they
-	  created and ignore those created by others.
-	  The kernel returns this string back in the output of
-	  @stats_list message, but it doesn't use it for anything else.
-	  If we omit the number of optional arguments, program id must not
-	  be a number, otherwise it would be interpreted as the number of
-	  optional arguments.
-
-	<aux_data>
-	  An optional parameter.  A word that provides auxiliary data
-	  that is useful to the client program that created the range.
-	  The kernel returns this string back in the output of
-	  @stats_list message, but it doesn't use this value for anything.
-
-    @stats_delete <region_id>
-
-	Delete the region with the specified id.
-
-	<region_id>
-	  region_id returned from @stats_create
-
-    @stats_clear <region_id>
-
-	Clear all the counters except the in-flight i/o counters.
-
-	<region_id>
-	  region_id returned from @stats_create
-
-    @stats_list [<program_id>]
-
-	List all regions registered with @stats_create.
-
-	<program_id>
-	  An optional parameter.
-	  If this parameter is specified, only matching regions
-	  are returned.
-	  If it is not specified, all regions are returned.
-
-	Output format:
-	  <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
-	        precise_timestamps histogram:n1,n2,n3,...
-
-	The strings "precise_timestamps" and "histogram" are printed only
-	if they were specified when creating the region.
-
-    @stats_print <region_id> [<starting_line> <number_of_lines>]
-
-	Print counters for each step-sized area of a region.
-
-	<region_id>
-	  region_id returned from @stats_create
-
-	<starting_line>
-	  The index of the starting line in the output.
-	  If omitted, all lines are returned.
-
-	<number_of_lines>
-	  The number of lines to include in the output.
-	  If omitted, all lines are returned.
-
-	Output format for each step-sized area of a region:
-
-	  <start_sector>+<length> counters
-
-	  The first 11 counters have the same meaning as
-	  /sys/block/*/stat or /proc/diskstats.
-
-	  Please refer to Documentation/iostats.txt for details.
-
-	  1. the number of reads completed
-	  2. the number of reads merged
-	  3. the number of sectors read
-	  4. the number of milliseconds spent reading
-	  5. the number of writes completed
-	  6. the number of writes merged
-	  7. the number of sectors written
-	  8. the number of milliseconds spent writing
-	  9. the number of I/Os currently in progress
-	  10. the number of milliseconds spent doing I/Os
-	  11. the weighted number of milliseconds spent doing I/Os
-
-	  Additional counters:
-	  12. the total time spent reading in milliseconds
-	  13. the total time spent writing in milliseconds
-
-    @stats_print_clear <region_id> [<starting_line> <number_of_lines>]
-
-	Atomically print and then clear all the counters except the
-	in-flight i/o counters.	 Useful when the client consuming the
-	statistics does not want to lose any statistics (those updated
-	between printing and clearing).
-
-	<region_id>
-	  region_id returned from @stats_create
-
-	<starting_line>
-	  The index of the starting line in the output.
-	  If omitted, all lines are printed and then cleared.
-
-	<number_of_lines>
-	  The number of lines to process.
-	  If omitted, all lines are printed and then cleared.
-
-    @stats_set_aux <region_id> <aux_data>
-
-	Store auxiliary data aux_data for the specified region.
-
-	<region_id>
-	  region_id returned from @stats_create
-
-	<aux_data>
-	  The string that identifies data which is useful to the client
-	  program that created the range.  The kernel returns this
-	  string back in the output of @stats_list message, but it
-	  doesn't use this value for anything.
-
-Examples
-========
-
-Subdivide the DM device 'vol' into 100 pieces and start collecting
-statistics on them:
-
-  dmsetup message vol 0 @stats_create - /100
-
-Set the auxiliary data string to "foo bar baz" (the escape for each
-space must also be escaped, otherwise the shell will consume them):
-
-  dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz
-
-List the statistics:
-
-  dmsetup message vol 0 @stats_list
-
-Print the statistics:
-
-  dmsetup message vol 0 @stats_print 0
-
-Delete the statistics:
-
-  dmsetup message vol 0 @stats_delete 0
diff --git a/Documentation/device-mapper/striped.txt b/Documentation/device-mapper/striped.txt
deleted file mode 100644
index 07ec492cceee..000000000000
--- a/Documentation/device-mapper/striped.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-dm-stripe
-=========
-
-Device-Mapper's "striped" target is used to create a striped (i.e. RAID-0)
-device across one or more underlying devices. Data is written in "chunks",
-with consecutive chunks rotating among the underlying devices. This can
-potentially provide improved I/O throughput by utilizing several physical
-devices in parallel.
-
-Parameters: <num devs> <chunk size> [<dev path> <offset>]+
-    <num devs>: Number of underlying devices.
-    <chunk size>: Size of each chunk of data. Must be at least as
-                  large as the system's PAGE_SIZE.
-    <dev path>: Full pathname to the underlying block-device, or a
-                "major:minor" device-number.
-    <offset>: Starting sector within the device.
-
-One or more underlying devices can be specified. The striped device size must
-be a multiple of the chunk size multiplied by the number of underlying devices.
-
-
-Example scripts
-===============
-
-[[
-#!/usr/bin/perl -w
-# Create a striped device across any number of underlying devices. The device
-# will be called "stripe_dev" and have a chunk-size of 128k.
-
-my $chunk_size = 128 * 2;
-my $dev_name = "stripe_dev";
-my $num_devs = @ARGV;
-my @devs = @ARGV;
-my ($min_dev_size, $stripe_dev_size, $i);
-
-if (!$num_devs) {
-        die("Specify at least one device\n");
-}
-
-$min_dev_size = `blockdev --getsz $devs[0]`;
-for ($i = 1; $i < $num_devs; $i++) {
-        my $this_size = `blockdev --getsz $devs[$i]`;
-        $min_dev_size = ($min_dev_size < $this_size) ?
-                        $min_dev_size : $this_size;
-}
-
-$stripe_dev_size = $min_dev_size * $num_devs;
-$stripe_dev_size -= $stripe_dev_size % ($chunk_size * $num_devs);
-
-$table = "0 $stripe_dev_size striped $num_devs $chunk_size";
-for ($i = 0; $i < $num_devs; $i++) {
-        $table .= " $devs[$i] 0";
-}
-
-`echo $table | dmsetup create $dev_name`;
-]]
-
diff --git a/Documentation/device-mapper/switch.txt b/Documentation/device-mapper/switch.txt
deleted file mode 100644
index 5bd4831db4a8..000000000000
--- a/Documentation/device-mapper/switch.txt
+++ /dev/null
@@ -1,138 +0,0 @@
-dm-switch
-=========
-
-The device-mapper switch target creates a device that supports an
-arbitrary mapping of fixed-size regions of I/O across a fixed set of
-paths.  The path used for any specific region can be switched
-dynamically by sending the target a message.
-
-It maps I/O to underlying block devices efficiently when there is a large
-number of fixed-sized address regions but there is no simple pattern
-that would allow for a compact representation of the mapping such as
-dm-stripe.
-
-Background
-----------
-
-Dell EqualLogic and some other iSCSI storage arrays use a distributed
-frameless architecture.  In this architecture, the storage group
-consists of a number of distinct storage arrays ("members") each having
-independent controllers, disk storage and network adapters.  When a LUN
-is created it is spread across multiple members.  The details of the
-spreading are hidden from initiators connected to this storage system.
-The storage group exposes a single target discovery portal, no matter
-how many members are being used.  When iSCSI sessions are created, each
-session is connected to an eth port on a single member.  Data to a LUN
-can be sent on any iSCSI session, and if the blocks being accessed are
-stored on another member the I/O will be forwarded as required.  This
-forwarding is invisible to the initiator.  The storage layout is also
-dynamic, and the blocks stored on disk may be moved from member to
-member as needed to balance the load.
-
-This architecture simplifies the management and configuration of both
-the storage group and initiators.  In a multipathing configuration, it
-is possible to set up multiple iSCSI sessions to use multiple network
-interfaces on both the host and target to take advantage of the
-increased network bandwidth.  An initiator could use a simple round
-robin algorithm to send I/O across all paths and let the storage array
-members forward it as necessary, but there is a performance advantage to
-sending data directly to the correct member.
-
-A device-mapper table already lets you map different regions of a
-device onto different targets.  However in this architecture the LUN is
-spread with an address region size on the order of 10s of MBs, which
-means the resulting table could have more than a million entries and
-consume far too much memory.
-
-Using this device-mapper switch target we can now build a two-layer
-device hierarchy:
-
-    Upper Tier - Determine which array member the I/O should be sent to.
-    Lower Tier - Load balance amongst paths to a particular member.
-
-The lower tier consists of a single dm multipath device for each member.
-Each of these multipath devices contains the set of paths directly to
-the array member in one priority group, and leverages existing path
-selectors to load balance amongst these paths.  We also build a
-non-preferred priority group containing paths to other array members for
-failover reasons.
-
-The upper tier consists of a single dm-switch device.  This device uses
-a bitmap to look up the location of the I/O and choose the appropriate
-lower tier device to route the I/O.  By using a bitmap we are able to
-use 4 bits for each address range in a 16 member group (which is very
-large for us).  This is a much denser representation than the dm table
-b-tree can achieve.
-
-Construction Parameters
-=======================
-
-    <num_paths> <region_size> <num_optional_args> [<optional_args>...]
-    [<dev_path> <offset>]+
-
-<num_paths>
-    The number of paths across which to distribute the I/O.
-
-<region_size>
-    The number of 512-byte sectors in a region. Each region can be redirected
-    to any of the available paths.
-
-<num_optional_args>
-    The number of optional arguments. Currently, no optional arguments
-    are supported and so this must be zero.
-
-<dev_path>
-    The block device that represents a specific path to the device.
-
-<offset>
-    The offset of the start of data on the specific <dev_path> (in units
-    of 512-byte sectors). This number is added to the sector number when
-    forwarding the request to the specific path. Typically it is zero.
-
-Messages
-========
-
-set_region_mappings <index>:<path_nr> [<index>]:<path_nr> [<index>]:<path_nr>...
-
-Modify the region table by specifying which regions are redirected to
-which paths.
-
-<index>
-    The region number (region size was specified in constructor parameters).
-    If index is omitted, the next region (previous index + 1) is used.
-    Expressed in hexadecimal (WITHOUT any prefix like 0x).
-
-<path_nr>
-    The path number in the range 0 ... (<num_paths> - 1).
-    Expressed in hexadecimal (WITHOUT any prefix like 0x).
-
-R<n>,<m>
-    This parameter allows repetitive patterns to be loaded quickly. <n> and <m>
-    are hexadecimal numbers. The last <n> mappings are repeated in the next <m>
-    slots.
-
-Status
-======
-
-No status line is reported.
-
-Example
-=======
-
-Assume that you have volumes vg1/switch0 vg1/switch1 vg1/switch2 with
-the same size.
-
-Create a switch device with 64kB region size:
-    dmsetup create switch --table "0 `blockdev --getsz /dev/vg1/switch0`
-	switch 3 128 0 /dev/vg1/switch0 0 /dev/vg1/switch1 0 /dev/vg1/switch2 0"
-
-Set mappings for the first 7 entries to point to devices switch0, switch1,
-switch2, switch0, switch1, switch2, switch1:
-    dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1
-
-Set repetitive mapping. This command:
-    dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10
-is equivalent to:
-    dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \
-	:1 :2 :1 :2 :1 :2 :1 :2 :1 :2
-
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
deleted file mode 100644
index 883e7ca5f745..000000000000
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ /dev/null
@@ -1,411 +0,0 @@
-Introduction
-============
-
-This document describes a collection of device-mapper targets that
-between them implement thin-provisioning and snapshots.
-
-The main highlight of this implementation, compared to the previous
-implementation of snapshots, is that it allows many virtual devices to
-be stored on the same data volume.  This simplifies administration and
-allows the sharing of data between volumes, thus reducing disk usage.
-
-Another significant feature is support for an arbitrary depth of
-recursive snapshots (snapshots of snapshots of snapshots ...).  The
-previous implementation of snapshots did this by chaining together
-lookup tables, and so performance was O(depth).  This new
-implementation uses a single data structure to avoid this degradation
-with depth.  Fragmentation may still be an issue, however, in some
-scenarios.
-
-Metadata is stored on a separate device from data, giving the
-administrator some freedom, for example to:
-
-- Improve metadata resilience by storing metadata on a mirrored volume
-  but data on a non-mirrored one.
-
-- Improve performance by storing the metadata on SSD.
-
-Status
-======
-
-These targets are considered safe for production use.  But different use
-cases will have different performance characteristics, for example due
-to fragmentation of the data volume.
-
-If you find this software is not performing as expected please mail
-dm-devel@redhat.com with details and we'll try our best to improve
-things for you.
-
-Userspace tools for checking and repairing the metadata have been fully
-developed and are available as 'thin_check' and 'thin_repair'.  The name
-of the package that provides these utilities varies by distribution (on
-a Red Hat distribution it is named 'device-mapper-persistent-data').
-
-Cookbook
-========
-
-This section describes some quick recipes for using thin provisioning.
-They use the dmsetup program to control the device-mapper driver
-directly.  End users will be advised to use a higher-level volume
-manager such as LVM2 once support has been added.
-
-Pool device
------------
-
-The pool device ties together the metadata volume and the data volume.
-It maps I/O linearly to the data volume and updates the metadata via
-two mechanisms:
-
-- Function calls from the thin targets
-
-- Device-mapper 'messages' from userspace which control the creation of new
-  virtual devices amongst other things.
-
-Setting up a fresh pool device
-------------------------------
-
-Setting up a pool device requires a valid metadata device, and a
-data device.  If you do not have an existing metadata device you can
-make one by zeroing the first 4k to indicate empty metadata.
-
-    dd if=/dev/zero of=$metadata_dev bs=4096 count=1
-
-The amount of metadata you need will vary according to how many blocks
-are shared between thin devices (i.e. through snapshots).  If you have
-less sharing than average you'll need a larger-than-average metadata device.
-
-As a guide, we suggest you calculate the number of bytes to use in the
-metadata device as 48 * $data_dev_size / $data_block_size but round it up
-to 2MB if the answer is smaller.  If you're creating large numbers of
-snapshots which are recording large amounts of change, you may find you
-need to increase this.
-
-The largest size supported is 16GB: If the device is larger,
-a warning will be issued and the excess space will not be used.
-
-Reloading a pool table
-----------------------
-
-You may reload a pool's table, indeed this is how the pool is resized
-if it runs out of space.  (N.B. While specifying a different metadata
-device when reloading is not forbidden at the moment, things will go
-wrong if it does not route I/O to exactly the same on-disk location as
-previously.)
-
-Using an existing pool device
------------------------------
-
-    dmsetup create pool \
-	--table "0 20971520 thin-pool $metadata_dev $data_dev \
-		 $data_block_size $low_water_mark"
-
-$data_block_size gives the smallest unit of disk space that can be
-allocated at a time expressed in units of 512-byte sectors.
-$data_block_size must be between 128 (64KB) and 2097152 (1GB) and a
-multiple of 128 (64KB).  $data_block_size cannot be changed after the
-thin-pool is created.  People primarily interested in thin provisioning
-may want to use a value such as 1024 (512KB).  People doing lots of
-snapshotting may want a smaller value such as 128 (64KB).  If you are
-not zeroing newly-allocated data, a larger $data_block_size in the
-region of 256000 (128MB) is suggested.
-
-$low_water_mark is expressed in blocks of size $data_block_size.  If
-free space on the data device drops below this level then a dm event
-will be triggered which a userspace daemon should catch allowing it to
-extend the pool device.  Only one such event will be sent.
-
-No special event is triggered if a just resumed device's free space is below
-the low water mark. However, resuming a device always triggers an
-event; a userspace daemon should verify that free space exceeds the low
-water mark when handling this event.
-
-A low water mark for the metadata device is maintained in the kernel and
-will trigger a dm event if free space on the metadata device drops below
-it.
-
-Updating on-disk metadata
--------------------------
-
-On-disk metadata is committed every time a FLUSH or FUA bio is written.
-If no such requests are made then commits will occur every second.  This
-means the thin-provisioning target behaves like a physical disk that has
-a volatile write cache.  If power is lost you may lose some recent
-writes.  The metadata should always be consistent in spite of any crash.
-
-If data space is exhausted the pool will either error or queue IO
-according to the configuration (see: error_if_no_space).  If metadata
-space is exhausted or a metadata operation fails: the pool will error IO
-until the pool is taken offline and repair is performed to 1) fix any
-potential inconsistencies and 2) clear the flag that imposes repair.
-Once the pool's metadata device is repaired it may be resized, which
-will allow the pool to return to normal operation.  Note that if a pool
-is flagged as needing repair, the pool's data and metadata devices
-cannot be resized until repair is performed.  It should also be noted
-that when the pool's metadata space is exhausted the current metadata
-transaction is aborted.  Given that the pool will cache IO whose
-completion may have already been acknowledged to upper IO layers
-(e.g. filesystem) it is strongly suggested that consistency checks
-(e.g. fsck) be performed on those layers when repair of the pool is
-required.
-
-Thin provisioning
------------------
-
-i) Creating a new thinly-provisioned volume.
-
-  To create a new thinly- provisioned volume you must send a message to an
-  active pool device, /dev/mapper/pool in this example.
-
-    dmsetup message /dev/mapper/pool 0 "create_thin 0"
-
-  Here '0' is an identifier for the volume, a 24-bit number.  It's up
-  to the caller to allocate and manage these identifiers.  If the
-  identifier is already in use, the message will fail with -EEXIST.
-
-ii) Using a thinly-provisioned volume.
-
-  Thinly-provisioned volumes are activated using the 'thin' target:
-
-    dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
-
-  The last parameter is the identifier for the thinp device.
-
-Internal snapshots
-------------------
-
-i) Creating an internal snapshot.
-
-  Snapshots are created with another message to the pool.
-
-  N.B.  If the origin device that you wish to snapshot is active, you
-  must suspend it before creating the snapshot to avoid corruption.
-  This is NOT enforced at the moment, so please be careful!
-
-    dmsetup suspend /dev/mapper/thin
-    dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
-    dmsetup resume /dev/mapper/thin
-
-  Here '1' is the identifier for the volume, a 24-bit number.  '0' is the
-  identifier for the origin device.
-
-ii) Using an internal snapshot.
-
-  Once created, the user doesn't have to worry about any connection
-  between the origin and the snapshot.  Indeed the snapshot is no
-  different from any other thinly-provisioned device and can be
-  snapshotted itself via the same method.  It's perfectly legal to
-  have only one of them active, and there's no ordering requirement on
-  activating or removing them both.  (This differs from conventional
-  device-mapper snapshots.)
-
-  Activate it exactly the same way as any other thinly-provisioned volume:
-
-    dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1"
-
-External snapshots
-------------------
-
-You can use an external _read only_ device as an origin for a
-thinly-provisioned volume.  Any read to an unprovisioned area of the
-thin device will be passed through to the origin.  Writes trigger
-the allocation of new blocks as usual.
-
-One use case for this is VM hosts that want to run guests on
-thinly-provisioned volumes but have the base image on another device
-(possibly shared between many VMs).
-
-You must not write to the origin device if you use this technique!
-Of course, you may write to the thin device and take internal snapshots
-of the thin volume.
-
-i) Creating a snapshot of an external device
-
-  This is the same as creating a thin device.
-  You don't mention the origin at this stage.
-
-    dmsetup message /dev/mapper/pool 0 "create_thin 0"
-
-ii) Using a snapshot of an external device.
-
-  Append an extra parameter to the thin target specifying the origin:
-
-    dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image"
-
-  N.B. All descendants (internal snapshots) of this snapshot require the
-  same extra origin parameter.
-
-Deactivation
-------------
-
-All devices using a pool must be deactivated before the pool itself
-can be.
-
-    dmsetup remove thin
-    dmsetup remove snap
-    dmsetup remove pool
-
-Reference
-=========
-
-'thin-pool' target
-------------------
-
-i) Constructor
-
-    thin-pool <metadata dev> <data dev> <data block size (sectors)> \
-	      <low water mark (blocks)> [<number of feature args> [<arg>]*]
-
-    Optional feature arguments:
-
-      skip_block_zeroing: Skip the zeroing of newly-provisioned blocks.
-
-      ignore_discard: Disable discard support.
-
-      no_discard_passdown: Don't pass discards down to the underlying
-			   data device, but just remove the mapping.
-
-      read_only: Don't allow any changes to be made to the pool
-		 metadata.  This mode is only available after the
-		 thin-pool has been created and first used in full
-		 read/write mode.  It cannot be specified on initial
-		 thin-pool creation.
-
-      error_if_no_space: Error IOs, instead of queueing, if no space.
-
-    Data block size must be between 64KB (128 sectors) and 1GB
-    (2097152 sectors) inclusive.
-
-
-ii) Status
-
-    <transaction id> <used metadata blocks>/<total metadata blocks>
-    <used data blocks>/<total data blocks> <held metadata root>
-    ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space
-    needs_check|- metadata_low_watermark
-
-    transaction id:
-	A 64-bit number used by userspace to help synchronise with metadata
-	from volume managers.
-
-    used data blocks / total data blocks
-	If the number of free blocks drops below the pool's low water mark a
-	dm event will be sent to userspace.  This event is edge-triggered and
-	it will occur only once after each resume so volume manager writers
-	should register for the event and then check the target's status.
-
-    held metadata root:
-	The location, in blocks, of the metadata root that has been
-	'held' for userspace read access.  '-' indicates there is no
-	held root.
-
-    discard_passdown|no_discard_passdown
-	Whether or not discards are actually being passed down to the
-	underlying device.  When this is enabled when loading the table,
-	it can get disabled if the underlying device doesn't support it.
-
-    ro|rw|out_of_data_space
-	If the pool encounters certain types of device failures it will
-	drop into a read-only metadata mode in which no changes to
-	the pool metadata (like allocating new blocks) are permitted.
-
-	In serious cases where even a read-only mode is deemed unsafe
-	no further I/O will be permitted and the status will just
-	contain the string 'Fail'.  The userspace recovery tools
-	should then be used.
-
-    error_if_no_space|queue_if_no_space
-	If the pool runs out of data or metadata space, the pool will
-	either queue or error the IO destined to the data device.  The
-	default is to queue the IO until more space is added or the
-	'no_space_timeout' expires.  The 'no_space_timeout' dm-thin-pool
-	module parameter can be used to change this timeout -- it
-	defaults to 60 seconds but may be disabled using a value of 0.
-
-    needs_check
-	A metadata operation has failed, resulting in the needs_check
-	flag being set in the metadata's superblock.  The metadata
-	device must be deactivated and checked/repaired before the
-	thin-pool can be made fully operational again.  '-' indicates
-	needs_check is not set.
-
-    metadata_low_watermark:
-	Value of metadata low watermark in blocks.  The kernel sets this
-	value internally but userspace needs to know this value to
-	determine if an event was caused by crossing this threshold.
-
-iii) Messages
-
-    create_thin <dev id>
-
-	Create a new thinly-provisioned device.
-	<dev id> is an arbitrary unique 24-bit identifier chosen by
-	the caller.
-
-    create_snap <dev id> <origin id>
-
-	Create a new snapshot of another thinly-provisioned device.
-	<dev id> is an arbitrary unique 24-bit identifier chosen by
-	the caller.
-	<origin id> is the identifier of the thinly-provisioned device
-	of which the new device will be a snapshot.
-
-    delete <dev id>
-
-	Deletes a thin device.  Irreversible.
-
-    set_transaction_id <current id> <new id>
-
-	Userland volume managers, such as LVM, need a way to
-	synchronise their external metadata with the internal metadata of the
-	pool target.  The thin-pool target offers to store an
-	arbitrary 64-bit transaction id and return it on the target's
-	status line.  To avoid races you must provide what you think
-	the current transaction id is when you change it with this
-	compare-and-swap message.
-
-    reserve_metadata_snap
-
-        Reserve a copy of the data mapping btree for use by userland.
-        This allows userland to inspect the mappings as they were when
-        this message was executed.  Use the pool's status command to
-        get the root block associated with the metadata snapshot.
-
-    release_metadata_snap
-
-        Release a previously reserved copy of the data mapping btree.
-
-'thin' target
--------------
-
-i) Constructor
-
-    thin <pool dev> <dev id> [<external origin dev>]
-
-    pool dev:
-	the thin-pool device, e.g. /dev/mapper/my_pool or 253:0
-
-    dev id:
-	the internal device identifier of the device to be
-	activated.
-
-    external origin dev:
-	an optional block device outside the pool to be treated as a
-	read-only snapshot origin: reads to unprovisioned areas of the
-	thin target will be mapped to this device.
-
-The pool doesn't store any size against the thin devices.  If you
-load a thin target that is smaller than you've been using previously,
-then you'll have no access to blocks mapped beyond the end.  If you
-load a target that is bigger than before, then extra blocks will be
-provisioned as and when needed.
-
-ii) Status
-
-     <nr mapped sectors> <highest mapped sector>
-
-	If the pool has encountered device errors and failed, the status
-	will just contain the string 'Fail'.  The userspace recovery
-	tools should then be used.
-
-    In the case where <nr mapped sectors> is 0, there is no highest
-    mapped sector and the value of <highest mapped sector> is unspecified.
diff --git a/Documentation/device-mapper/unstriped.txt b/Documentation/device-mapper/unstriped.txt
deleted file mode 100644
index 0b2a306c54ee..000000000000
--- a/Documentation/device-mapper/unstriped.txt
+++ /dev/null
@@ -1,124 +0,0 @@
-Introduction
-============
-
-The device-mapper "unstriped" target provides a transparent mechanism to
-unstripe a device-mapper "striped" target to access the underlying disks
-without having to touch the true backing block-device.  It can also be
-used to unstripe a hardware RAID-0 to access backing disks.
-
-Parameters:
-<number of stripes> <chunk size> <stripe #> <dev_path> <offset>
-
-<number of stripes>
-        The number of stripes in the RAID 0.
-
-<chunk size>
-	The amount of 512B sectors in the chunk striping.
-
-<dev_path>
-	The block device you wish to unstripe.
-
-<stripe #>
-        The stripe number within the device that corresponds to physical
-        drive you wish to unstripe.  This must be 0 indexed.
-
-
-Why use this module?
-====================
-
-An example of undoing an existing dm-stripe
--------------------------------------------
-
-This small bash script will setup 4 loop devices and use the existing
-striped target to combine the 4 devices into one.  It then will use
-the unstriped target ontop of the striped device to access the
-individual backing loop devices.  We write data to the newly exposed
-unstriped devices and verify the data written matches the correct
-underlying device on the striped array.
-
-#!/bin/bash
-
-MEMBER_SIZE=$((128 * 1024 * 1024))
-NUM=4
-SEQ_END=$((${NUM}-1))
-CHUNK=256
-BS=4096
-
-RAID_SIZE=$((${MEMBER_SIZE}*${NUM}/512))
-DM_PARMS="0 ${RAID_SIZE} striped ${NUM} ${CHUNK}"
-COUNT=$((${MEMBER_SIZE} / ${BS}))
-
-for i in $(seq 0 ${SEQ_END}); do
-  dd if=/dev/zero of=member-${i} bs=${MEMBER_SIZE} count=1 oflag=direct
-  losetup /dev/loop${i} member-${i}
-  DM_PARMS+=" /dev/loop${i} 0"
-done
-
-echo $DM_PARMS | dmsetup create raid0
-for i in $(seq 0 ${SEQ_END}); do
-  echo "0 1 unstriped ${NUM} ${CHUNK} ${i} /dev/mapper/raid0 0" | dmsetup create set-${i}
-done;
-
-for i in $(seq 0 ${SEQ_END}); do
-  dd if=/dev/urandom of=/dev/mapper/set-${i} bs=${BS} count=${COUNT} oflag=direct
-  diff /dev/mapper/set-${i} member-${i}
-done;
-
-for i in $(seq 0 ${SEQ_END}); do
-  dmsetup remove set-${i}
-done
-
-dmsetup remove raid0
-
-for i in $(seq 0 ${SEQ_END}); do
-  losetup -d /dev/loop${i}
-  rm -f member-${i}
-done
-
-Another example
----------------
-
-Intel NVMe drives contain two cores on the physical device.
-Each core of the drive has segregated access to its LBA range.
-The current LBA model has a RAID 0 128k chunk on each core, resulting
-in a 256k stripe across the two cores:
-
-   Core 0:       Core 1:
-  __________    __________
-  | LBA 512|    | LBA 768|
-  | LBA 0  |    | LBA 256|
-  ----------    ----------
-
-The purpose of this unstriping is to provide better QoS in noisy
-neighbor environments. When two partitions are created on the
-aggregate drive without this unstriping, reads on one partition
-can affect writes on another partition.  This is because the partitions
-are striped across the two cores.  When we unstripe this hardware RAID 0
-and make partitions on each new exposed device the two partitions are now
-physically separated.
-
-With the dm-unstriped target we're able to segregate an fio script that
-has read and write jobs that are independent of each other.  Compared to
-when we run the test on a combined drive with partitions, we were able
-to get a 92% reduction in read latency using this device mapper target.
-
-
-Example dmsetup usage
-=====================
-
-unstriped ontop of Intel NVMe device that has 2 cores
------------------------------------------------------
-dmsetup create nvmset0 --table '0 512 unstriped 2 256 0 /dev/nvme0n1 0'
-dmsetup create nvmset1 --table '0 512 unstriped 2 256 1 /dev/nvme0n1 0'
-
-There will now be two devices that expose Intel NVMe core 0 and 1
-respectively:
-/dev/mapper/nvmset0
-/dev/mapper/nvmset1
-
-unstriped ontop of striped with 4 drives using 128K chunk size
---------------------------------------------------------------
-dmsetup create raid_disk0 --table '0 512 unstriped 4 256 0 /dev/mapper/striped 0'
-dmsetup create raid_disk1 --table '0 512 unstriped 4 256 1 /dev/mapper/striped 0'
-dmsetup create raid_disk2 --table '0 512 unstriped 4 256 2 /dev/mapper/striped 0'
-dmsetup create raid_disk3 --table '0 512 unstriped 4 256 3 /dev/mapper/striped 0'
diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt
deleted file mode 100644
index b3d2e4a42255..000000000000
--- a/Documentation/device-mapper/verity.txt
+++ /dev/null
@@ -1,219 +0,0 @@
-dm-verity
-==========
-
-Device-Mapper's "verity" target provides transparent integrity checking of
-block devices using a cryptographic digest provided by the kernel crypto API.
-This target is read-only.
-
-Construction Parameters
-=======================
-    <version> <dev> <hash_dev>
-    <data_block_size> <hash_block_size>
-    <num_data_blocks> <hash_start_block>
-    <algorithm> <digest> <salt>
-    [<#opt_params> <opt_params>]
-
-<version>
-    This is the type of the on-disk hash format.
-
-    0 is the original format used in the Chromium OS.
-      The salt is appended when hashing, digests are stored continuously and
-      the rest of the block is padded with zeroes.
-
-    1 is the current format that should be used for new devices.
-      The salt is prepended when hashing and each digest is
-      padded with zeroes to the power of two.
-
-<dev>
-    This is the device containing data, the integrity of which needs to be
-    checked.  It may be specified as a path, like /dev/sdaX, or a device number,
-    <major>:<minor>.
-
-<hash_dev>
-    This is the device that supplies the hash tree data.  It may be
-    specified similarly to the device path and may be the same device.  If the
-    same device is used, the hash_start should be outside the configured
-    dm-verity device.
-
-<data_block_size>
-    The block size on a data device in bytes.
-    Each block corresponds to one digest on the hash device.
-
-<hash_block_size>
-    The size of a hash block in bytes.
-
-<num_data_blocks>
-    The number of data blocks on the data device.  Additional blocks are
-    inaccessible.  You can place hashes to the same partition as data, in this
-    case hashes are placed after <num_data_blocks>.
-
-<hash_start_block>
-    This is the offset, in <hash_block_size>-blocks, from the start of hash_dev
-    to the root block of the hash tree.
-
-<algorithm>
-    The cryptographic hash algorithm used for this device.  This should
-    be the name of the algorithm, like "sha1".
-
-<digest>
-    The hexadecimal encoding of the cryptographic hash of the root hash block
-    and the salt.  This hash should be trusted as there is no other authenticity
-    beyond this point.
-
-<salt>
-    The hexadecimal encoding of the salt value.
-
-<#opt_params>
-    Number of optional parameters. If there are no optional parameters,
-    the optional paramaters section can be skipped or #opt_params can be zero.
-    Otherwise #opt_params is the number of following arguments.
-
-    Example of optional parameters section:
-        1 ignore_corruption
-
-ignore_corruption
-    Log corrupted blocks, but allow read operations to proceed normally.
-
-restart_on_corruption
-    Restart the system when a corrupted block is discovered. This option is
-    not compatible with ignore_corruption and requires user space support to
-    avoid restart loops.
-
-ignore_zero_blocks
-    Do not verify blocks that are expected to contain zeroes and always return
-    zeroes instead. This may be useful if the partition contains unused blocks
-    that are not guaranteed to contain zeroes.
-
-use_fec_from_device <fec_dev>
-    Use forward error correction (FEC) to recover from corruption if hash
-    verification fails. Use encoding data from the specified device. This
-    may be the same device where data and hash blocks reside, in which case
-    fec_start must be outside data and hash areas.
-
-    If the encoding data covers additional metadata, it must be accessible
-    on the hash device after the hash blocks.
-
-    Note: block sizes for data and hash devices must match. Also, if the
-    verity <dev> is encrypted the <fec_dev> should be too.
-
-fec_roots <num>
-    Number of generator roots. This equals to the number of parity bytes in
-    the encoding data. For example, in RS(M, N) encoding, the number of roots
-    is M-N.
-
-fec_blocks <num>
-    The number of encoding data blocks on the FEC device. The block size for
-    the FEC device is <data_block_size>.
-
-fec_start <offset>
-    This is the offset, in <data_block_size> blocks, from the start of the
-    FEC device to the beginning of the encoding data.
-
-check_at_most_once
-    Verify data blocks only the first time they are read from the data device,
-    rather than every time.  This reduces the overhead of dm-verity so that it
-    can be used on systems that are memory and/or CPU constrained.  However, it
-    provides a reduced level of security because only offline tampering of the
-    data device's content will be detected, not online tampering.
-
-    Hash blocks are still verified each time they are read from the hash device,
-    since verification of hash blocks is less performance critical than data
-    blocks, and a hash block will not be verified any more after all the data
-    blocks it covers have been verified anyway.
-
-Theory of operation
-===================
-
-dm-verity is meant to be set up as part of a verified boot path.  This
-may be anything ranging from a boot using tboot or trustedgrub to just
-booting from a known-good device (like a USB drive or CD).
-
-When a dm-verity device is configured, it is expected that the caller
-has been authenticated in some way (cryptographic signatures, etc).
-After instantiation, all hashes will be verified on-demand during
-disk access.  If they cannot be verified up to the root node of the
-tree, the root hash, then the I/O will fail.  This should detect
-tampering with any data on the device and the hash data.
-
-Cryptographic hashes are used to assert the integrity of the device on a
-per-block basis. This allows for a lightweight hash computation on first read
-into the page cache. Block hashes are stored linearly, aligned to the nearest
-block size.
-
-If forward error correction (FEC) support is enabled any recovery of
-corrupted data will be verified using the cryptographic hash of the
-corresponding data. This is why combining error correction with
-integrity checking is essential.
-
-Hash Tree
----------
-
-Each node in the tree is a cryptographic hash.  If it is a leaf node, the hash
-of some data block on disk is calculated. If it is an intermediary node,
-the hash of a number of child nodes is calculated.
-
-Each entry in the tree is a collection of neighboring nodes that fit in one
-block.  The number is determined based on block_size and the size of the
-selected cryptographic digest algorithm.  The hashes are linearly-ordered in
-this entry and any unaligned trailing space is ignored but included when
-calculating the parent node.
-
-The tree looks something like:
-
-alg = sha256, num_blocks = 32768, block_size = 4096
-
-                                 [   root    ]
-                                /    . . .    \
-                     [entry_0]                 [entry_1]
-                    /  . . .  \                 . . .   \
-         [entry_0_0]   . . .  [entry_0_127]    . . . .  [entry_1_127]
-           / ... \             /   . . .  \             /           \
-     blk_0 ... blk_127  blk_16256   blk_16383      blk_32640 . . . blk_32767
-
-
-On-disk format
-==============
-
-The verity kernel code does not read the verity metadata on-disk header.
-It only reads the hash blocks which directly follow the header.
-It is expected that a user-space tool will verify the integrity of the
-verity header.
-
-Alternatively, the header can be omitted and the dmsetup parameters can
-be passed via the kernel command-line in a rooted chain of trust where
-the command-line is verified.
-
-Directly following the header (and with sector number padded to the next hash
-block boundary) are the hash blocks which are stored a depth at a time
-(starting from the root), sorted in order of increasing index.
-
-The full specification of kernel parameters and on-disk metadata format
-is available at the cryptsetup project's wiki page
-  https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity
-
-Status
-======
-V (for Valid) is returned if every check performed so far was valid.
-If any check failed, C (for Corruption) is returned.
-
-Example
-=======
-Set up a device:
-  # dmsetup create vroot --readonly --table \
-    "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 "\
-    "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\
-    "1234000000000000000000000000000000000000000000000000000000000000"
-
-A command line tool veritysetup is available to compute or verify
-the hash tree or activate the kernel device. This is available from
-the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/
-(as a libcryptsetup extension).
-
-Create hash on the device:
-  # veritysetup format /dev/sda1 /dev/sda2
-  ...
-  Root hash: 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076
-
-Activate the device:
-  # veritysetup create vroot /dev/sda1 /dev/sda2 \
-    4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076
diff --git a/Documentation/device-mapper/writecache.txt b/Documentation/device-mapper/writecache.txt
deleted file mode 100644
index 01532b3008ae..000000000000
--- a/Documentation/device-mapper/writecache.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-The writecache target caches writes on persistent memory or on SSD. It
-doesn't cache reads because reads are supposed to be cached in page cache
-in normal RAM.
-
-When the device is constructed, the first sector should be zeroed or the
-first sector should contain valid superblock from previous invocation.
-
-Constructor parameters:
-1. type of the cache device - "p" or "s"
-	p - persistent memory
-	s - SSD
-2. the underlying device that will be cached
-3. the cache device
-4. block size (4096 is recommended; the maximum block size is the page
-   size)
-5. the number of optional parameters (the parameters with an argument
-   count as two)
-	start_sector n		(default: 0)
-		offset from the start of cache device in 512-byte sectors
-	high_watermark n	(default: 50)
-		start writeback when the number of used blocks reach this
-		watermark
-	low_watermark x		(default: 45)
-		stop writeback when the number of used blocks drops below
-		this watermark
-	writeback_jobs n	(default: unlimited)
-		limit the number of blocks that are in flight during
-		writeback. Setting this value reduces writeback
-		throughput, but it may improve latency of read requests
-	autocommit_blocks n	(default: 64 for pmem, 65536 for ssd)
-		when the application writes this amount of blocks without
-		issuing the FLUSH request, the blocks are automatically
-		commited
-	autocommit_time ms	(default: 1000)
-		autocommit time in milliseconds. The data is automatically
-		commited if this time passes and no FLUSH request is
-		received
-	fua			(by default on)
-		applicable only to persistent memory - use the FUA flag
-		when writing data from persistent memory back to the
-		underlying device
-	nofua
-		applicable only to persistent memory - don't use the FUA
-		flag when writing back data and send the FLUSH request
-		afterwards
-		- some underlying devices perform better with fua, some
-		  with nofua. The user should test it
-
-Status:
-1. error indicator - 0 if there was no error, otherwise error number
-2. the number of blocks
-3. the number of free blocks
-4. the number of blocks under writeback
-
-Messages:
-	flush
-		flush the cache device. The message returns successfully
-		if the cache device was flushed without an error
-	flush_on_suspend
-		flush the cache device on next suspend. Use this message
-		when you are going to remove the cache device. The proper
-		sequence for removing the cache device is:
-		1. send the "flush_on_suspend" message
-		2. load an inactive table with a linear target that maps
-		   to the underlying device
-		3. suspend the device
-		4. ask for status and verify that there are no errors
-		5. resume the device, so that it will use the linear
-		   target
-		6. the cache device is now inactive and it can be deleted
diff --git a/Documentation/device-mapper/zero.txt b/Documentation/device-mapper/zero.txt
deleted file mode 100644
index 20fb38e7fa7e..000000000000
--- a/Documentation/device-mapper/zero.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-dm-zero
-=======
-
-Device-Mapper's "zero" target provides a block-device that always returns
-zero'd data on reads and silently drops writes. This is similar behavior to
-/dev/zero, but as a block-device instead of a character-device.
-
-Dm-zero has no target-specific parameters.
-
-One very interesting use of dm-zero is for creating "sparse" devices in
-conjunction with dm-snapshot. A sparse device reports a device-size larger
-than the amount of actual storage space available for that device. A user can
-write data anywhere within the sparse device and read it back like a normal
-device. Reads to previously unwritten areas will return a zero'd buffer. When
-enough data has been written to fill up the actual storage space, the sparse
-device is deactivated. This can be very useful for testing device and
-filesystem limitations.
-
-To create a sparse device, start by creating a dm-zero device that's the
-desired size of the sparse device. For this example, we'll assume a 10TB
-sparse device.
-
-TEN_TERABYTES=`expr 10 \* 1024 \* 1024 \* 1024 \* 2`   # 10 TB in sectors
-echo "0 $TEN_TERABYTES zero" | dmsetup create zero1
-
-Then create a snapshot of the zero device, using any available block-device as
-the COW device. The size of the COW device will determine the amount of real
-space available to the sparse device. For this example, we'll assume /dev/sdb1
-is an available 10GB partition.
-
-echo "0 $TEN_TERABYTES snapshot /dev/mapper/zero1 /dev/sdb1 p 128" | \
-   dmsetup create sparse1
-
-This will create a 10TB sparse device called /dev/mapper/sparse1 that has
-10GB of actual storage space available. If more than 10GB of data is written
-to this device, it will start returning I/O errors.
-
diff --git a/Documentation/devicetree/bindings/Makefile b/Documentation/devicetree/bindings/Makefile
index 63b139f9ae28..6b0dfd5c17ba 100644
--- a/Documentation/devicetree/bindings/Makefile
+++ b/Documentation/devicetree/bindings/Makefile
@@ -5,7 +5,7 @@ DT_MK_SCHEMA ?= dt-mk-schema
 DT_MK_SCHEMA_FLAGS := $(if $(DT_SCHEMA_FILES), -u)
 
 quiet_cmd_chk_binding = CHKDT   $(patsubst $(srctree)/%,%,$<)
-      cmd_chk_binding = $(DT_DOC_CHECKER) $< ; \
+      cmd_chk_binding = $(DT_DOC_CHECKER) -u $(srctree)/$(src) $< ; \
                         $(DT_EXTRACT_EX) $< > $@
 
 $(obj)/%.example.dts: $(src)/%.yaml FORCE
@@ -25,7 +25,7 @@ DT_DOCS = $(shell \
 DT_SCHEMA_FILES ?= $(addprefix $(src)/,$(DT_DOCS))
 
 extra-y += $(patsubst $(src)/%.yaml,%.example.dts, $(DT_SCHEMA_FILES))
-extra-y += $(patsubst $(src)/%.yaml,%.example.dtb, $(DT_SCHEMA_FILES))
+extra-y += $(patsubst $(src)/%.yaml,%.example.dt.yaml, $(DT_SCHEMA_FILES))
 
 $(obj)/$(DT_TMP_SCHEMA): $(DT_SCHEMA_FILES) FORCE
 	$(call if_changed,mk_schema)
diff --git a/Documentation/devicetree/bindings/arm/al,alpine.txt b/Documentation/devicetree/bindings/arm/al,alpine.txt
deleted file mode 100644
index d00debe2e86f..000000000000
--- a/Documentation/devicetree/bindings/arm/al,alpine.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-Annapurna Labs Alpine Platform Device Tree Bindings
----------------------------------------------------------------
-
-Boards in the Alpine family shall have the following properties:
-
-* Required root node properties:
-compatible: must contain "al,alpine"
-
-* Example:
-
-/ {
-	model = "Annapurna Labs Alpine Dev Board";
-	compatible = "al,alpine";
-
-	...
-}
diff --git a/Documentation/devicetree/bindings/arm/al,alpine.yaml b/Documentation/devicetree/bindings/arm/al,alpine.yaml
new file mode 100644
index 000000000000..a70dff277e05
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/al,alpine.yaml
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/al,alpine.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Annapurna Labs Alpine Platform Device Tree Bindings
+
+maintainers:
+  - Tsahee Zidenberg <tsahee@annapurnalabs.com>
+  - Antoine Tenart <antoine.tenart@bootlin.com>
+
+properties:
+  compatible:
+    items:
+      - const: al,alpine
+  model:
+    items:
+      - const: "Annapurna Labs Alpine Dev Board"
+
+...
diff --git a/Documentation/devicetree/bindings/arm/amlogic.txt b/Documentation/devicetree/bindings/arm/amlogic.txt
deleted file mode 100644
index 061f7b98a07f..000000000000
--- a/Documentation/devicetree/bindings/arm/amlogic.txt
+++ /dev/null
@@ -1,142 +0,0 @@
-Amlogic MesonX device tree bindings
--------------------------------------------
-
-Work in progress statement:
-
-Device tree files and bindings applying to Amlogic SoCs and boards are
-considered "unstable". Any Amlogic device tree binding may change at
-any time. Be sure to use a device tree binary and a kernel image
-generated from the same source tree.
-
-Please refer to Documentation/devicetree/bindings/ABI.txt for a definition of a
-stable binding/ABI.
-
----------------------------------------------------------------
-
-Boards with the Amlogic Meson6 SoC shall have the following properties:
-  Required root node property:
-    compatible: "amlogic,meson6"
-
-Boards with the Amlogic Meson8 SoC shall have the following properties:
-  Required root node property:
-    compatible: "amlogic,meson8";
-
-Boards with the Amlogic Meson8b SoC shall have the following properties:
-  Required root node property:
-    compatible: "amlogic,meson8b";
-
-Boards with the Amlogic Meson8m2 SoC shall have the following properties:
-  Required root node property:
-    compatible: "amlogic,meson8m2";
-
-Boards with the Amlogic Meson GXBaby SoC shall have the following properties:
-  Required root node property:
-    compatible: "amlogic,meson-gxbb";
-
-Boards with the Amlogic Meson GXL S905X SoC shall have the following properties:
-  Required root node property:
-    compatible: "amlogic,s905x", "amlogic,meson-gxl";
-
-Boards with the Amlogic Meson GXL S905D SoC shall have the following properties:
-  Required root node property:
-    compatible: "amlogic,s905d", "amlogic,meson-gxl";
-
-Boards with the Amlogic Meson GXL S805X SoC shall have the following properties:
-  Required root node property:
-    compatible: "amlogic,s805x", "amlogic,meson-gxl";
-
-Boards with the Amlogic Meson GXL S905W SoC shall have the following properties:
-  Required root node property:
-    compatible: "amlogic,s905w", "amlogic,meson-gxl";
-
-Boards with the Amlogic Meson GXM S912 SoC shall have the following properties:
-  Required root node property:
-    compatible: "amlogic,s912", "amlogic,meson-gxm";
-
-Boards with the Amlogic Meson AXG A113D SoC shall have the following properties:
-  Required root node property:
-    compatible: "amlogic,a113d", "amlogic,meson-axg";
-
-Boards with the Amlogic Meson G12A S905D2 SoC shall have the following properties:
-  Required root node property:
-    compatible: "amlogic,g12a";
-
-Board compatible values (alphabetically, grouped by SoC):
-
-  - "geniatech,atv1200" (Meson6)
-
-  - "minix,neo-x8" (Meson8)
-
-  - "endless,ec100" (Meson8b)
-  - "hardkernel,odroid-c1" (Meson8b)
-  - "tronfy,mxq" (Meson8b)
-
-  - "tronsmart,mxiii-plus" (Meson8m2)
-
-  - "amlogic,p200" (Meson gxbb)
-  - "amlogic,p201" (Meson gxbb)
-  - "friendlyarm,nanopi-k2" (Meson gxbb)
-  - "hardkernel,odroid-c2" (Meson gxbb)
-  - "nexbox,a95x" (Meson gxbb or Meson gxl s905x)
-  - "tronsmart,vega-s95-pro", "tronsmart,vega-s95" (Meson gxbb)
-  - "tronsmart,vega-s95-meta", "tronsmart,vega-s95" (Meson gxbb)
-  - "tronsmart,vega-s95-telos", "tronsmart,vega-s95" (Meson gxbb)
-  - "wetek,hub" (Meson gxbb)
-  - "wetek,play2" (Meson gxbb)
-
-  - "amlogic,p212" (Meson gxl s905x)
-  - "hwacom,amazetv" (Meson gxl s905x)
-  - "khadas,vim" (Meson gxl s905x)
-  - "libretech,cc" (Meson gxl s905x)
-
-  - "amlogic,p230" (Meson gxl s905d)
-  - "amlogic,p231" (Meson gxl s905d)
-  - "phicomm,n1" (Meson gxl s905d)
-
-  - "amlogic,p241" (Meson gxl s805x)
-  - "libretech,aml-s805x-ac" (Meson gxl s805x)
-
-  - "amlogic,p281" (Meson gxl s905w)
-  - "oranth,tx3-mini" (Meson gxl s905w)
-
-  - "amlogic,q200" (Meson gxm s912)
-  - "amlogic,q201" (Meson gxm s912)
-  - "khadas,vim2" (Meson gxm s912)
-  - "kingnovel,r-box-pro" (Meson gxm S912)
-  - "nexbox,a1" (Meson gxm s912)
-  - "tronsmart,vega-s96" (Meson gxm s912)
-
-  - "amlogic,s400" (Meson axg a113d)
-
-  - "amlogic,u200" (Meson g12a s905d2)
-  - "amediatech,x96-max" (Meson g12a s905x2)
-  - "seirobotics,sei510" (Meson g12a s905x2)
-
-Amlogic Meson Firmware registers Interface
-------------------------------------------
-
-The Meson SoCs have a register bank with status and data shared with the
-secure firmware.
-
-Required properties:
- - compatible: For Meson GX SoCs, must be "amlogic,meson-gx-ao-secure", "syscon"
-
-Properties should indentify components of this register interface :
-
-Meson GX SoC Information
-------------------------
-A firmware register encodes the SoC type, package and revision information on
-the Meson GX SoCs.
-If present, the following property should be added :
-
-Optional properties:
-  - amlogic,has-chip-id: If present, the interface gives the current SoC version.
-
-Example
--------
-
-ao-secure@140 {
-	compatible = "amlogic,meson-gx-ao-secure", "syscon";
-	reg = <0x0 0x140 0x0 0x140>;
-	amlogic,has-chip-id;
-};
diff --git a/Documentation/devicetree/bindings/arm/amlogic.yaml b/Documentation/devicetree/bindings/arm/amlogic.yaml
new file mode 100644
index 000000000000..325c6fd3566d
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/amlogic.yaml
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/amlogic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Amlogic MesonX device tree bindings
+
+maintainers:
+  - Kevin Hilman <khilman@baylibre.com>
+
+description: |+
+  Work in progress statement:
+
+  Device tree files and bindings applying to Amlogic SoCs and boards are
+  considered "unstable". Any Amlogic device tree binding may change at
+  any time. Be sure to use a device tree binary and a kernel image
+  generated from the same source tree.
+
+  Please refer to Documentation/devicetree/bindings/ABI.txt for a definition of a
+  stable binding/ABI.
+
+properties:
+  $nodename:
+    const: '/'
+  compatible:
+    oneOf:
+      - description: Boards with the Amlogic Meson6 SoC
+        items:
+          - enum:
+              - geniatech,atv1200
+          - const: amlogic,meson6
+
+      - description: Boards with the Amlogic Meson8 SoC
+        items:
+          - enum:
+              - minix,neo-x8
+          - const: amlogic,meson8
+
+      - description: Boards with the Amlogic Meson8m2 SoC
+        items:
+          - enum:
+              - tronsmart,mxiii-plus
+          - const: amlogic,meson8m2
+
+      - description: Boards with the Amlogic Meson8b SoC
+        items:
+          - enum:
+              - endless,ec100
+              - hardkernel,odroid-c1
+              - tronfy,mxq
+          - const: amlogic,meson8b
+
+      - description: Boards with the Amlogic Meson GXBaby SoC
+        items:
+          - enum:
+              - amlogic,p200
+              - amlogic,p201
+              - friendlyarm,nanopi-k2
+              - hardkernel,odroid-c2
+              - nexbox,a95x
+              - wetek,hub
+              - wetek,play2
+          - const: amlogic,meson-gxbb
+
+      - description: Tronsmart Vega S95 devices
+        items:
+          - enum:
+              - tronsmart,vega-s95-pro
+              - tronsmart,vega-s95-meta
+              - tronsmart,vega-s95-telos
+          - const: tronsmart,vega-s95
+          - const: amlogic,meson-gxbb
+
+      - description: Boards with the Amlogic Meson GXL S805X SoC
+        items:
+          - enum:
+              - amlogic,p241
+              - libretech,aml-s805x-ac
+          - const: amlogic,s805x
+          - const: amlogic,meson-gxl
+
+      - description: Boards with the Amlogic Meson GXL S905W SoC
+        items:
+          - enum:
+              - amlogic,p281
+              - oranth,tx3-mini
+          - const: amlogic,s905w
+          - const: amlogic,meson-gxl
+
+      - description: Boards with the Amlogic Meson GXL S905X SoC
+        items:
+          - enum:
+              - amediatech,x96-max
+              - amlogic,p212
+              - hwacom,amazetv
+              - khadas,vim
+              - libretech,cc
+              - nexbox,a95x
+              - seirobotics,sei510
+          - const: amlogic,s905x
+          - const: amlogic,meson-gxl
+
+      - description: Boards with the Amlogic Meson GXL S905D SoC
+        items:
+          - enum:
+              - amlogic,p230
+              - amlogic,p231
+              - phicomm,n1
+          - const: amlogic,s905d
+          - const: amlogic,meson-gxl
+
+      - description: Boards with the Amlogic Meson GXM S912 SoC
+        items:
+          - enum:
+              - amlogic,q200
+              - amlogic,q201
+              - khadas,vim2
+              - kingnovel,r-box-pro
+              - nexbox,a1
+              - tronsmart,vega-s96
+          - const: amlogic,s912
+          - const: amlogic,meson-gxm
+
+      - description: Boards with the Amlogic Meson AXG A113D SoC
+        items:
+          - enum:
+              - amlogic,s400
+          - const: amlogic,a113d
+          - const: amlogic,meson-axg
+
+      - description: Boards with the Amlogic Meson G12A S905D2 SoC
+        items:
+          - enum:
+              - amlogic,u200
+          - const: amlogic,g12a
+
+      - description: Boards with the Amlogic Meson G12B S922X SoC
+        items:
+          - enum:
+              - hardkernel,odroid-n2
+          - const: amlogic,g12b
+
+...
diff --git a/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.txt b/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.txt
new file mode 100644
index 000000000000..c67d9f48fb91
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.txt
@@ -0,0 +1,28 @@
+Amlogic Meson Firmware registers Interface
+------------------------------------------
+
+The Meson SoCs have a register bank with status and data shared with the
+secure firmware.
+
+Required properties:
+ - compatible: For Meson GX SoCs, must be "amlogic,meson-gx-ao-secure", "syscon"
+
+Properties should indentify components of this register interface :
+
+Meson GX SoC Information
+------------------------
+A firmware register encodes the SoC type, package and revision information on
+the Meson GX SoCs.
+If present, the following property should be added :
+
+Optional properties:
+  - amlogic,has-chip-id: If present, the interface gives the current SoC version.
+
+Example
+-------
+
+ao-secure@140 {
+	compatible = "amlogic,meson-gx-ao-secure", "syscon";
+	reg = <0x0 0x140 0x0 0x140>;
+	amlogic,has-chip-id;
+};
diff --git a/Documentation/devicetree/bindings/arm/arm,scmi.txt b/Documentation/devicetree/bindings/arm/arm,scmi.txt
index 5f3719ab7075..317a2fc3667a 100644
--- a/Documentation/devicetree/bindings/arm/arm,scmi.txt
+++ b/Documentation/devicetree/bindings/arm/arm,scmi.txt
@@ -6,7 +6,7 @@ that are provided by the hardware platform it is running on, including power
 and performance functions.
 
 This binding is intended to define the interface the firmware implementing
-the SCMI as described in ARM document number ARM DUI 0922B ("ARM System Control
+the SCMI as described in ARM document number ARM DEN 0056A ("ARM System Control
 and Management Interface Platform Design Document")[0] provide for OSPM in
 the device tree.
 
diff --git a/Documentation/devicetree/bindings/arm/arm-boards b/Documentation/devicetree/bindings/arm/arm-boards
index b6e810c2781a..6758ece324b1 100644
--- a/Documentation/devicetree/bindings/arm/arm-boards
+++ b/Documentation/devicetree/bindings/arm/arm-boards
@@ -197,7 +197,7 @@ Required nodes:
 The description for the board must include:
    - a "psci" node describing the boot method used for the secondary CPUs.
      A detailed description of the bindings used for "psci" nodes is present
-     in the psci.txt file.
+     in the psci.yaml file.
    - a "cpus" node describing the available cores and their associated
      "enable-method"s. For more details see cpus.txt file.
 
@@ -216,7 +216,7 @@ Example:
 		#size-cells = <0>;
 
 		A57_0: cpu@0 {
-			compatible = "arm,cortex-a57","arm,armv8";
+			compatible = "arm,cortex-a57";
 			reg = <0x0 0x0>;
 			device_type = "cpu";
 			enable-method = "psci";
@@ -225,7 +225,7 @@ Example:
 		.....
 
 		A53_0: cpu@100 {
-			compatible = "arm,cortex-a53","arm,armv8";
+			compatible = "arm,cortex-a53";
 			reg = <0x0 0x100>;
 			device_type = "cpu";
 			enable-method = "psci";
diff --git a/Documentation/devicetree/bindings/arm/atmel-at91.txt b/Documentation/devicetree/bindings/arm/atmel-at91.txt
deleted file mode 100644
index 99dee23c74a4..000000000000
--- a/Documentation/devicetree/bindings/arm/atmel-at91.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-Atmel AT91 device tree bindings.
-================================
-
-Boards with a SoC of the Atmel AT91 or SMART family shall have the following
-properties:
-
-Required root node properties:
-compatible: must be one of:
- * "atmel,at91rm9200"
-
- * "atmel,at91sam9" for SoCs using an ARM926EJ-S core, shall be extended with
-   the specific SoC family or compatible:
-    o "atmel,at91sam9260"
-    o "atmel,at91sam9261"
-    o "atmel,at91sam9263"
-    o "atmel,at91sam9x5" for the 5 series, shall be extended with the specific
-      SoC compatible:
-       - "atmel,at91sam9g15"
-       - "atmel,at91sam9g25"
-       - "atmel,at91sam9g35"
-       - "atmel,at91sam9x25"
-       - "atmel,at91sam9x35"
-    o "atmel,at91sam9g20"
-    o "atmel,at91sam9g45"
-    o "atmel,at91sam9n12"
-    o "atmel,at91sam9rl"
-    o "atmel,at91sam9xe"
-    o "microchip,sam9x60"
- * "atmel,sama5" for SoCs using a Cortex-A5, shall be extended with the specific
-   SoC family:
-    o "atmel,sama5d2" shall be extended with the specific SoC compatible:
-       - "atmel,sama5d27"
-    o "atmel,sama5d3" shall be extended with the specific SoC compatible:
-       - "atmel,sama5d31"
-       - "atmel,sama5d33"
-       - "atmel,sama5d34"
-       - "atmel,sama5d35"
-       - "atmel,sama5d36"
-    o "atmel,sama5d4" shall be extended with the specific SoC compatible:
-       - "atmel,sama5d41"
-       - "atmel,sama5d42"
-       - "atmel,sama5d43"
-       - "atmel,sama5d44"
-
- * "atmel,samv7" for MCUs using a Cortex-M7, shall be extended with the specific
-   SoC family:
-    o "atmel,sams70" shall be extended with the specific MCU compatible:
-       - "atmel,sams70j19"
-       - "atmel,sams70j20"
-       - "atmel,sams70j21"
-       - "atmel,sams70n19"
-       - "atmel,sams70n20"
-       - "atmel,sams70n21"
-       - "atmel,sams70q19"
-       - "atmel,sams70q20"
-       - "atmel,sams70q21"
-    o "atmel,samv70" shall be extended with the specific MCU compatible:
-       - "atmel,samv70j19"
-       - "atmel,samv70j20"
-       - "atmel,samv70n19"
-       - "atmel,samv70n20"
-       - "atmel,samv70q19"
-       - "atmel,samv70q20"
-    o "atmel,samv71" shall be extended with the specific MCU compatible:
-       - "atmel,samv71j19"
-       - "atmel,samv71j20"
-       - "atmel,samv71j21"
-       - "atmel,samv71n19"
-       - "atmel,samv71n20"
-       - "atmel,samv71n21"
-       - "atmel,samv71q19"
-       - "atmel,samv71q20"
-       - "atmel,samv71q21"
diff --git a/Documentation/devicetree/bindings/arm/atmel-at91.yaml b/Documentation/devicetree/bindings/arm/atmel-at91.yaml
new file mode 100644
index 000000000000..6e168abcd4d1
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/atmel-at91.yaml
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/atmel-at91.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Atmel AT91 device tree bindings.
+
+maintainers:
+  - Alexandre Belloni <alexandre.belloni@bootlin.com>
+  - Ludovic Desroches <ludovic.desroches@microchip.com>
+
+description: |
+  Boards with a SoC of the Atmel AT91 or SMART family shall have the following
+
+properties:
+  $nodename:
+    const: '/'
+  compatible:
+    oneOf:
+      - items:
+          - const: atmel,at91rm9200
+      - items:
+          - enum:
+              - olimex,sam9-l9260
+          - enum:
+              - atmel,at91sam9260
+              - atmel,at91sam9261
+              - atmel,at91sam9263
+              - atmel,at91sam9g20
+              - atmel,at91sam9g45
+              - atmel,at91sam9n12
+              - atmel,at91sam9rl
+              - atmel,at91sam9xe
+              - atmel,at91sam9x60
+          - const: atmel,at91sam9
+
+      - items:
+          - enum:
+              - atmel,at91sam9g15
+              - atmel,at91sam9g25
+              - atmel,at91sam9g35
+              - atmel,at91sam9x25
+              - atmel,at91sam9x35
+          - const: atmel,at91sam9x5
+          - const: atmel,at91sam9
+
+      - items:
+          - const: atmel,sama5d27
+          - const: atmel,sama5d2
+          - const: atmel,sama5
+
+      - description: Nattis v2 board with Natte v2 power board
+        items:
+          - const: axentia,nattis-2
+          - const: axentia,natte-2
+          - const: axentia,linea
+          - const: atmel,sama5d31
+          - const: atmel,sama5d3
+          - const: atmel,sama5
+
+      - description: TSE-850 v3 board
+        items:
+          - const: axentia,tse850v3
+          - const: axentia,linea
+          - const: atmel,sama5d31
+          - const: atmel,sama5d3
+          - const: atmel,sama5
+
+      - items:
+          - const: axentia,linea
+          - const: atmel,sama5d31
+          - const: atmel,sama5d3
+          - const: atmel,sama5
+
+      - items:
+          - enum:
+              - atmel,sama5d31
+              - atmel,sama5d33
+              - atmel,sama5d34
+              - atmel,sama5d35
+              - atmel,sama5d36
+          - const: atmel,sama5d3
+          - const: atmel,sama5
+
+      - items:
+          - enum:
+              - atmel,sama5d41
+              - atmel,sama5d42
+              - atmel,sama5d43
+              - atmel,sama5d44
+          - const: atmel,sama5d4
+          - const: atmel,sama5
+
+      - items:
+          - enum:
+              - atmel,sams70j19
+              - atmel,sams70j20
+              - atmel,sams70j21
+              - atmel,sams70n19
+              - atmel,sams70n20
+              - atmel,sams70n21
+              - atmel,sams70q19
+              - atmel,sams70q20
+              - atmel,sams70q21
+          - const: atmel,sams70
+          - const: atmel,samv7
+
+      - items:
+          - enum:
+              - atmel,samv70j19
+              - atmel,samv70j20
+              - atmel,samv70n19
+              - atmel,samv70n20
+              - atmel,samv70q19
+              - atmel,samv70q20
+          - const: atmel,samv70
+          - const: atmel,samv7
+
+      - items:
+          - enum:
+              - atmel,samv71j19
+              - atmel,samv71j20
+              - atmel,samv71j21
+              - atmel,samv71n19
+              - atmel,samv71n20
+              - atmel,samv71n21
+              - atmel,samv71q19
+              - atmel,samv71q20
+              - atmel,samv71q21
+          - const: atmel,samv71
+          - const: atmel,samv7
+
+...
diff --git a/Documentation/devicetree/bindings/arm/axxia.txt b/Documentation/devicetree/bindings/arm/axxia.txt
deleted file mode 100644
index 7b4ef9c07696..000000000000
--- a/Documentation/devicetree/bindings/arm/axxia.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Axxia AXM55xx device tree bindings
-
-Boards using the AXM55xx SoC need to have the following properties:
-
-Required root node property:
-
-  - compatible = "lsi,axm5516"
-
-Boards:
-
-  LSI AXM5516 Validation board (Amarillo)
-	compatible = "lsi,axm5516-amarillo", "lsi,axm5516"
diff --git a/Documentation/devicetree/bindings/arm/axxia.yaml b/Documentation/devicetree/bindings/arm/axxia.yaml
new file mode 100644
index 000000000000..98780a569f22
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/axxia.yaml
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/axxia.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Axxia AXM55xx device tree bindings
+
+maintainers:
+  - Anders Berg <anders.berg@lsi.com>
+
+properties:
+  compatible:
+    description: LSI AXM5516 Validation board (Amarillo)
+    items:
+      - const: lsi,axm5516-amarillo
+      - const: lsi,axm5516
+
+...
diff --git a/Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt b/Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
index 298291211ea4..f1de3247c1b7 100644
--- a/Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
+++ b/Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
@@ -26,8 +26,8 @@ Required properties:
 		processor core is clocked by the internal CPU clock, so it
 		is enabled with CPU clock by default.
 
-- cpu : the CPU phandle the debug module is affined to. When omitted
-	the module is considered to belong to CPU0.
+- cpu : the CPU phandle the debug module is affined to. Do not assume it
+        to default to CPU0 if omitted.
 
 Optional properties:
 
diff --git a/Documentation/devicetree/bindings/arm/coresight.txt b/Documentation/devicetree/bindings/arm/coresight.txt
index 8a88ddebc1a2..fcc3bacfd8bc 100644
--- a/Documentation/devicetree/bindings/arm/coresight.txt
+++ b/Documentation/devicetree/bindings/arm/coresight.txt
@@ -59,6 +59,11 @@ its hardware characteristcs.
 
 	* port or ports: see "Graph bindings for Coresight" below.
 
+* Additional required property for Embedded Trace Macrocell (version 3.x and
+  version 4.x):
+	* cpu: the cpu phandle this ETM/PTM is affined to. Do not
+	  assume it to default to CPU0 if omitted.
+
 * Additional required properties for System Trace Macrocells (STM):
 	* reg: along with the physical base address and length of the register
 	  set as described above, another entry is required to describe the
@@ -87,9 +92,6 @@ its hardware characteristcs.
 	* arm,cp14: must be present if the system accesses ETM/PTM management
 	  registers via co-processor 14.
 
-	* cpu: the cpu phandle this ETM/PTM is affined to. When omitted the
-	  source is considered to belong to CPU0.
-
 * Optional property for TMC:
 
 	* arm,buffer-size: size of contiguous buffer space for TMC ETR
diff --git a/Documentation/devicetree/bindings/arm/cpu-capacity.txt b/Documentation/devicetree/bindings/arm/cpu-capacity.txt
index 96fa46cb133c..380e21c5fc7e 100644
--- a/Documentation/devicetree/bindings/arm/cpu-capacity.txt
+++ b/Documentation/devicetree/bindings/arm/cpu-capacity.txt
@@ -118,7 +118,7 @@ cpus {
 	};
 
 	A57_0: cpu@0 {
-		compatible = "arm,cortex-a57","arm,armv8";
+		compatible = "arm,cortex-a57";
 		reg = <0x0 0x0>;
 		device_type = "cpu";
 		enable-method = "psci";
@@ -129,7 +129,7 @@ cpus {
 	};
 
 	A57_1: cpu@1 {
-		compatible = "arm,cortex-a57","arm,armv8";
+		compatible = "arm,cortex-a57";
 		reg = <0x0 0x1>;
 		device_type = "cpu";
 		enable-method = "psci";
@@ -140,7 +140,7 @@ cpus {
 	};
 
 	A53_0: cpu@100 {
-		compatible = "arm,cortex-a53","arm,armv8";
+		compatible = "arm,cortex-a53";
 		reg = <0x0 0x100>;
 		device_type = "cpu";
 		enable-method = "psci";
@@ -151,7 +151,7 @@ cpus {
 	};
 
 	A53_1: cpu@101 {
-		compatible = "arm,cortex-a53","arm,armv8";
+		compatible = "arm,cortex-a53";
 		reg = <0x0 0x101>;
 		device_type = "cpu";
 		enable-method = "psci";
@@ -162,7 +162,7 @@ cpus {
 	};
 
 	A53_2: cpu@102 {
-		compatible = "arm,cortex-a53","arm,armv8";
+		compatible = "arm,cortex-a53";
 		reg = <0x0 0x102>;
 		device_type = "cpu";
 		enable-method = "psci";
@@ -173,7 +173,7 @@ cpus {
 	};
 
 	A53_3: cpu@103 {
-		compatible = "arm,cortex-a53","arm,armv8";
+		compatible = "arm,cortex-a53";
 		reg = <0x0 0x103>;
 		device_type = "cpu";
 		enable-method = "psci";
diff --git a/Documentation/devicetree/bindings/arm/cpus.yaml b/Documentation/devicetree/bindings/arm/cpus.yaml
index 591bbd012d63..aa40b074b864 100644
--- a/Documentation/devicetree/bindings/arm/cpus.yaml
+++ b/Documentation/devicetree/bindings/arm/cpus.yaml
@@ -39,281 +39,242 @@ description: |+
   described below.
 
 properties:
-  $nodename:
-    const: cpus
-    description: Container of cpu nodes
-
-  '#address-cells':
-    enum: [1, 2]
+  reg:
+    maxItems: 1
     description: |
-      Definition depends on ARM architecture version and configuration:
+      Usage and definition depend on ARM architecture version and
+      configuration:
 
       On uniprocessor ARM architectures previous to v7
-        value must be 1, to enable a simple enumeration
-        scheme for processors that do not have a HW CPU
-        identification register.
-      On 32-bit ARM 11 MPcore, ARM v7 or later systems
-        value must be 1, that corresponds to CPUID/MPIDR
-        registers sizes.
-      On ARM v8 64-bit systems value should be set to 2,
-        that corresponds to the MPIDR_EL1 register size.
-        If MPIDR_EL1[63:32] value is equal to 0 on all CPUs
-        in the system, #address-cells can be set to 1, since
-        MPIDR_EL1[63:32] bits are not used for CPUs
-        identification.
-
-  '#size-cells':
-    const: 0
-
-patternProperties:
-  '^cpu@[0-9a-f]+$':
-    type: object
-    properties:
-      device_type:
-        const: cpu
-
-      reg:
-        maxItems: 1
-        description: |
-          Usage and definition depend on ARM architecture version and
-          configuration:
-
-          On uniprocessor ARM architectures previous to v7
-          this property is required and must be set to 0.
-
-          On ARM 11 MPcore based systems this property is
-            required and matches the CPUID[11:0] register bits.
-
-            Bits [11:0] in the reg cell must be set to
-            bits [11:0] in CPU ID register.
-
-            All other bits in the reg cell must be set to 0.
-
-          On 32-bit ARM v7 or later systems this property is
-            required and matches the CPU MPIDR[23:0] register
-            bits.
-
-            Bits [23:0] in the reg cell must be set to
-            bits [23:0] in MPIDR.
-
-            All other bits in the reg cell must be set to 0.
-
-          On ARM v8 64-bit systems this property is required
-            and matches the MPIDR_EL1 register affinity bits.
+      this property is required and must be set to 0.
+
+      On ARM 11 MPcore based systems this property is
+        required and matches the CPUID[11:0] register bits.
+
+        Bits [11:0] in the reg cell must be set to
+        bits [11:0] in CPU ID register.
+
+        All other bits in the reg cell must be set to 0.
+
+      On 32-bit ARM v7 or later systems this property is
+        required and matches the CPU MPIDR[23:0] register
+        bits.
+
+        Bits [23:0] in the reg cell must be set to
+        bits [23:0] in MPIDR.
+
+        All other bits in the reg cell must be set to 0.
+
+      On ARM v8 64-bit systems this property is required
+        and matches the MPIDR_EL1 register affinity bits.
+
+        * If cpus node's #address-cells property is set to 2
+
+          The first reg cell bits [7:0] must be set to
+          bits [39:32] of MPIDR_EL1.
+
+          The second reg cell bits [23:0] must be set to
+          bits [23:0] of MPIDR_EL1.
+
+        * If cpus node's #address-cells property is set to 1
+
+          The reg cell bits [23:0] must be set to bits [23:0]
+          of MPIDR_EL1.
+
+      All other bits in the reg cells must be set to 0.
+
+  compatible:
+    enum:
+      - arm,arm710t
+      - arm,arm720t
+      - arm,arm740t
+      - arm,arm7ej-s
+      - arm,arm7tdmi
+      - arm,arm7tdmi-s
+      - arm,arm9es
+      - arm,arm9ej-s
+      - arm,arm920t
+      - arm,arm922t
+      - arm,arm925
+      - arm,arm926e-s
+      - arm,arm926ej-s
+      - arm,arm940t
+      - arm,arm946e-s
+      - arm,arm966e-s
+      - arm,arm968e-s
+      - arm,arm9tdmi
+      - arm,arm1020e
+      - arm,arm1020t
+      - arm,arm1022e
+      - arm,arm1026ej-s
+      - arm,arm1136j-s
+      - arm,arm1136jf-s
+      - arm,arm1156t2-s
+      - arm,arm1156t2f-s
+      - arm,arm1176jzf
+      - arm,arm1176jz-s
+      - arm,arm1176jzf-s
+      - arm,arm11mpcore
+      - arm,armv8 # Only for s/w models
+      - arm,cortex-a5
+      - arm,cortex-a7
+      - arm,cortex-a8
+      - arm,cortex-a9
+      - arm,cortex-a12
+      - arm,cortex-a15
+      - arm,cortex-a17
+      - arm,cortex-a53
+      - arm,cortex-a57
+      - arm,cortex-a72
+      - arm,cortex-a73
+      - arm,cortex-m0
+      - arm,cortex-m0+
+      - arm,cortex-m1
+      - arm,cortex-m3
+      - arm,cortex-m4
+      - arm,cortex-r4
+      - arm,cortex-r5
+      - arm,cortex-r7
+      - brcm,brahma-b15
+      - brcm,brahma-b53
+      - brcm,vulcan
+      - cavium,thunder
+      - cavium,thunder2
+      - faraday,fa526
+      - intel,sa110
+      - intel,sa1100
+      - marvell,feroceon
+      - marvell,mohawk
+      - marvell,pj4a
+      - marvell,pj4b
+      - marvell,sheeva-v5
+      - marvell,sheeva-v7
+      - nvidia,tegra132-denver
+      - nvidia,tegra186-denver
+      - nvidia,tegra194-carmel
+      - qcom,krait
+      - qcom,kryo
+      - qcom,kryo385
+      - qcom,scorpion
+
+  enable-method:
+    allOf:
+      - $ref: '/schemas/types.yaml#/definitions/string'
+      - oneOf:
+          # On ARM v8 64-bit this property is required
+          - enum:
+              - psci
+              - spin-table
+          # On ARM 32-bit systems this property is optional
+          - enum:
+              - actions,s500-smp
+              - allwinner,sun6i-a31
+              - allwinner,sun8i-a23
+              - allwinner,sun9i-a80-smp
+              - allwinner,sun8i-a83t-smp
+              - amlogic,meson8-smp
+              - amlogic,meson8b-smp
+              - arm,realview-smp
+              - brcm,bcm11351-cpu-method
+              - brcm,bcm23550
+              - brcm,bcm2836-smp
+              - brcm,bcm63138
+              - brcm,bcm-nsp-smp
+              - brcm,brahma-b15
+              - marvell,armada-375-smp
+              - marvell,armada-380-smp
+              - marvell,armada-390-smp
+              - marvell,armada-xp-smp
+              - marvell,98dx3236-smp
+              - mediatek,mt6589-smp
+              - mediatek,mt81xx-tz-smp
+              - qcom,gcc-msm8660
+              - qcom,kpss-acc-v1
+              - qcom,kpss-acc-v2
+              - renesas,apmu
+              - renesas,r9a06g032-smp
+              - rockchip,rk3036-smp
+              - rockchip,rk3066-smp
+              - socionext,milbeaut-m10v-smp
+              - ste,dbx500-smp
+
+  cpu-release-addr:
+    $ref: '/schemas/types.yaml#/definitions/uint64'
+
+    description:
+      Required for systems that have an "enable-method"
+        property value of "spin-table".
+      On ARM v8 64-bit systems must be a two cell
+        property identifying a 64-bit zero-initialised
+        memory location.
+
+  cpu-idle-states:
+    $ref: '/schemas/types.yaml#/definitions/phandle-array'
+    description: |
+      List of phandles to idle state nodes supported
+      by this cpu (see ./idle-states.txt).
+
+  capacity-dmips-mhz:
+    $ref: '/schemas/types.yaml#/definitions/uint32'
+    description:
+      u32 value representing CPU capacity (see ./cpu-capacity.txt) in
+      DMIPS/MHz, relative to highest capacity-dmips-mhz
+      in the system.
+
+  dynamic-power-coefficient:
+    $ref: '/schemas/types.yaml#/definitions/uint32'
+    description:
+      A u32 value that represents the running time dynamic
+      power coefficient in units of uW/MHz/V^2. The
+      coefficient can either be calculated from power
+      measurements or derived by analysis.
+
+      The dynamic power consumption of the CPU  is
+      proportional to the square of the Voltage (V) and
+      the clock frequency (f). The coefficient is used to
+      calculate the dynamic power as below -
+
+      Pdyn = dynamic-power-coefficient * V^2 * f
+
+      where voltage is in V, frequency is in MHz.
+
+  qcom,saw:
+    $ref: '/schemas/types.yaml#/definitions/phandle'
+    description: |
+      Specifies the SAW* node associated with this CPU.
 
-            * If cpus node's #address-cells property is set to 2
+      Required for systems that have an "enable-method" property
+      value of "qcom,kpss-acc-v1" or "qcom,kpss-acc-v2"
 
-              The first reg cell bits [7:0] must be set to
-              bits [39:32] of MPIDR_EL1.
+      * arm/msm/qcom,saw2.txt
 
-              The second reg cell bits [23:0] must be set to
-              bits [23:0] of MPIDR_EL1.
+  qcom,acc:
+    $ref: '/schemas/types.yaml#/definitions/phandle'
+    description: |
+      Specifies the ACC* node associated with this CPU.
 
-            * If cpus node's #address-cells property is set to 1
+      Required for systems that have an "enable-method" property
+      value of "qcom,kpss-acc-v1" or "qcom,kpss-acc-v2"
 
-              The reg cell bits [23:0] must be set to bits [23:0]
-              of MPIDR_EL1.
+      * arm/msm/qcom,kpss-acc.txt
 
-          All other bits in the reg cells must be set to 0.
+  rockchip,pmu:
+    $ref: '/schemas/types.yaml#/definitions/phandle'
+    description: |
+      Specifies the syscon node controlling the cpu core power domains.
 
-      compatible:
-        items:
-          - enum:
-              - arm,arm710t
-              - arm,arm720t
-              - arm,arm740t
-              - arm,arm7ej-s
-              - arm,arm7tdmi
-              - arm,arm7tdmi-s
-              - arm,arm9es
-              - arm,arm9ej-s
-              - arm,arm920t
-              - arm,arm922t
-              - arm,arm925
-              - arm,arm926e-s
-              - arm,arm926ej-s
-              - arm,arm940t
-              - arm,arm946e-s
-              - arm,arm966e-s
-              - arm,arm968e-s
-              - arm,arm9tdmi
-              - arm,arm1020e
-              - arm,arm1020t
-              - arm,arm1022e
-              - arm,arm1026ej-s
-              - arm,arm1136j-s
-              - arm,arm1136jf-s
-              - arm,arm1156t2-s
-              - arm,arm1156t2f-s
-              - arm,arm1176jzf
-              - arm,arm1176jz-s
-              - arm,arm1176jzf-s
-              - arm,arm11mpcore
-              - arm,armv8 # Only for s/w models
-              - arm,cortex-a5
-              - arm,cortex-a7
-              - arm,cortex-a8
-              - arm,cortex-a9
-              - arm,cortex-a12
-              - arm,cortex-a15
-              - arm,cortex-a17
-              - arm,cortex-a53
-              - arm,cortex-a57
-              - arm,cortex-a72
-              - arm,cortex-a73
-              - arm,cortex-m0
-              - arm,cortex-m0+
-              - arm,cortex-m1
-              - arm,cortex-m3
-              - arm,cortex-m4
-              - arm,cortex-r4
-              - arm,cortex-r5
-              - arm,cortex-r7
-              - brcm,brahma-b15
-              - brcm,brahma-b53
-              - brcm,vulcan
-              - cavium,thunder
-              - cavium,thunder2
-              - faraday,fa526
-              - intel,sa110
-              - intel,sa1100
-              - marvell,feroceon
-              - marvell,mohawk
-              - marvell,pj4a
-              - marvell,pj4b
-              - marvell,sheeva-v5
-              - marvell,sheeva-v7
-              - nvidia,tegra132-denver
-              - nvidia,tegra186-denver
-              - nvidia,tegra194-carmel
-              - qcom,krait
-              - qcom,kryo
-              - qcom,kryo385
-              - qcom,scorpion
-
-      enable-method:
-        allOf:
-          - $ref: '/schemas/types.yaml#/definitions/string'
-          - oneOf:
-            # On ARM v8 64-bit this property is required
-            - enum:
-                - psci
-                - spin-table
-            # On ARM 32-bit systems this property is optional
-            - enum:
-                - actions,s500-smp
-                - allwinner,sun6i-a31
-                - allwinner,sun8i-a23
-                - allwinner,sun9i-a80-smp
-                - allwinner,sun8i-a83t-smp
-                - amlogic,meson8-smp
-                - amlogic,meson8b-smp
-                - arm,realview-smp
-                - brcm,bcm11351-cpu-method
-                - brcm,bcm23550
-                - brcm,bcm2836-smp
-                - brcm,bcm63138
-                - brcm,bcm-nsp-smp
-                - brcm,brahma-b15
-                - marvell,armada-375-smp
-                - marvell,armada-380-smp
-                - marvell,armada-390-smp
-                - marvell,armada-xp-smp
-                - marvell,98dx3236-smp
-                - mediatek,mt6589-smp
-                - mediatek,mt81xx-tz-smp
-                - qcom,gcc-msm8660
-                - qcom,kpss-acc-v1
-                - qcom,kpss-acc-v2
-                - renesas,apmu
-                - renesas,r9a06g032-smp
-                - rockchip,rk3036-smp
-                - rockchip,rk3066-smp
-                - socionext,milbeaut-m10v-smp
-                - ste,dbx500-smp
-
-      cpu-release-addr:
-        $ref: '/schemas/types.yaml#/definitions/uint64'
-
-        description:
-          Required for systems that have an "enable-method"
-            property value of "spin-table".
-          On ARM v8 64-bit systems must be a two cell
-            property identifying a 64-bit zero-initialised
-            memory location.
-
-      cpu-idle-states:
-        $ref: '/schemas/types.yaml#/definitions/phandle-array'
-        description: |
-          List of phandles to idle state nodes supported
-          by this cpu (see ./idle-states.txt).
-
-      capacity-dmips-mhz:
-        $ref: '/schemas/types.yaml#/definitions/uint32'
-        description:
-          u32 value representing CPU capacity (see ./cpu-capacity.txt) in
-          DMIPS/MHz, relative to highest capacity-dmips-mhz
-          in the system.
-
-      dynamic-power-coefficient:
-        $ref: '/schemas/types.yaml#/definitions/uint32'
-        description:
-          A u32 value that represents the running time dynamic
-          power coefficient in units of uW/MHz/V^2. The
-          coefficient can either be calculated from power
-          measurements or derived by analysis.
-
-          The dynamic power consumption of the CPU  is
-          proportional to the square of the Voltage (V) and
-          the clock frequency (f). The coefficient is used to
-          calculate the dynamic power as below -
-
-          Pdyn = dynamic-power-coefficient * V^2 * f
-
-          where voltage is in V, frequency is in MHz.
-
-      qcom,saw:
-        $ref: '/schemas/types.yaml#/definitions/phandle'
-        description: |
-          Specifies the SAW* node associated with this CPU.
-
-          Required for systems that have an "enable-method" property
-          value of "qcom,kpss-acc-v1" or "qcom,kpss-acc-v2"
-
-          * arm/msm/qcom,saw2.txt
-
-      qcom,acc:
-        $ref: '/schemas/types.yaml#/definitions/phandle'
-        description: |
-          Specifies the ACC* node associated with this CPU.
-
-          Required for systems that have an "enable-method" property
-          value of "qcom,kpss-acc-v1" or "qcom,kpss-acc-v2"
-
-          * arm/msm/qcom,kpss-acc.txt
-
-      rockchip,pmu:
-        $ref: '/schemas/types.yaml#/definitions/phandle'
-        description: |
-          Specifies the syscon node controlling the cpu core power domains.
-
-          Optional for systems that have an "enable-method"
-          property value of "rockchip,rk3066-smp"
-          While optional, it is the preferred way to get access to
-          the cpu-core power-domains.
-
-    required:
-      - device_type
-      - reg
-      - compatible
-
-    dependencies:
-      cpu-release-addr: [enable-method]
-      rockchip,pmu: [enable-method]
+      Optional for systems that have an "enable-method"
+      property value of "rockchip,rk3066-smp"
+      While optional, it is the preferred way to get access to
+      the cpu-core power-domains.
 
 required:
-  - '#address-cells'
-  - '#size-cells'
+  - device_type
+  - reg
+  - compatible
+
+dependencies:
+  rockchip,pmu: [enable-method]
 
 examples:
   - |
diff --git a/Documentation/devicetree/bindings/arm/digicolor.txt b/Documentation/devicetree/bindings/arm/digicolor.txt
deleted file mode 100644
index 658553f40b23..000000000000
--- a/Documentation/devicetree/bindings/arm/digicolor.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-Conexant Digicolor Platforms Device Tree Bindings
-
-Each device tree must specify which Conexant Digicolor SoC it uses.
-Must be the following compatible string:
-
-  cnxt,cx92755
diff --git a/Documentation/devicetree/bindings/arm/digicolor.yaml b/Documentation/devicetree/bindings/arm/digicolor.yaml
new file mode 100644
index 000000000000..d9c80b827e9b
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/digicolor.yaml
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/digicolor.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Conexant Digicolor Platforms Device Tree Bindings
+
+maintainers:
+  - Baruch Siach <baruch@tkos.co.il>
+
+properties:
+  compatible:
+    const: cnxt,cx92755
+
+...
diff --git a/Documentation/devicetree/bindings/arm/emtrion.txt b/Documentation/devicetree/bindings/arm/emtrion.txt
deleted file mode 100644
index 83329aefc483..000000000000
--- a/Documentation/devicetree/bindings/arm/emtrion.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Emtrion Devicetree Bindings
-===========================
-
-emCON Series:
--------------
-
-Required root node properties
-	- compatible:
-	- "emtrion,emcon-mx6", "fsl,imx6q"; 		: emCON-MX6D or emCON-MX6Q SoM
-	- "emtrion,emcon-mx6-avari", "fsl,imx6q";	: emCON-MX6D or emCON-MX6Q SoM on Avari Base
-	- "emtrion,emcon-mx6", "fsl,imx6dl"; 		: emCON-MX6S or emCON-MX6DL SoM
-	- "emtrion,emcon-mx6-avari", "fsl,imx6dl";	: emCON-MX6S or emCON-MX6DL SoM on Avari Base
diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
index 5d7dbabbb784..a575e42f7fec 100644
--- a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
+++ b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
@@ -133,6 +133,28 @@ RTC bindings based on SCU Message Protocol
 Required properties:
 - compatible: should be "fsl,imx8qxp-sc-rtc";
 
+OCOTP bindings based on SCU Message Protocol
+------------------------------------------------------------
+Required properties:
+- compatible:		Should be "fsl,imx8qxp-scu-ocotp"
+- #address-cells:	Must be 1. Contains byte index
+- #size-cells:		Must be 1. Contains byte length
+
+Optional Child nodes:
+
+- Data cells of ocotp:
+  Detailed bindings are described in bindings/nvmem/nvmem.txt
+
+Watchdog bindings based on SCU Message Protocol
+------------------------------------------------------------
+
+Required properties:
+- compatible: should be:
+              "fsl,imx8qxp-sc-wdt"
+              followed by "fsl,imx-sc-wdt";
+Optional properties:
+- timeout-sec: contains the watchdog timeout in seconds.
+
 Example (imx8qxp):
 -------------
 aliases {
@@ -177,6 +199,16 @@ firmware {
 			...
 		};
 
+		ocotp: imx8qx-ocotp {
+			compatible = "fsl,imx8qxp-scu-ocotp";
+			#address-cells = <1>;
+			#size-cells = <1>;
+
+			fec_mac0: mac@2c4 {
+				reg = <0x2c4 8>;
+			};
+		};
+
 		pd: imx8qx-pd {
 			compatible = "fsl,imx8qxp-scu-pd", "fsl,scu-pd";
 			#power-domain-cells = <1>;
@@ -185,6 +217,11 @@ firmware {
 		rtc: rtc {
 			compatible = "fsl,imx8qxp-sc-rtc";
 		};
+
+		watchdog {
+			compatible = "fsl,imx8qxp-sc-wdt", "fsl,imx-sc-wdt";
+			timeout-sec = <60>;
+		};
 	};
 };
 
diff --git a/Documentation/devicetree/bindings/arm/fsl.yaml b/Documentation/devicetree/bindings/arm/fsl.yaml
index 407138ebc0d0..7294ac36f4c0 100644
--- a/Documentation/devicetree/bindings/arm/fsl.yaml
+++ b/Documentation/devicetree/bindings/arm/fsl.yaml
@@ -15,6 +15,13 @@ properties:
     const: '/'
   compatible:
     oneOf:
+      - description: i.MX1 based Boards
+        items:
+          - enum:
+              - armadeus,imx1-apf9328
+              - fsl,imx1ads
+          - const: fsl,imx1
+
       - description: i.MX23 based Boards
         items:
           - enum:
@@ -51,6 +58,25 @@ properties:
           - const: i2se,duckbill-2
           - const: fsl,imx28
 
+      - description: i.MX31 based Boards
+        items:
+          - enum:
+              - buglabs,imx31-bug
+              - logicpd,imx31-lite
+          - const: fsl,imx31
+
+      - description: i.MX35 based Boards
+        items:
+          - enum:
+              - fsl,imx35-pdk
+          - const: fsl,imx35
+
+      - description: i.MX35 Eukrea CPUIMX35 Board
+        items:
+          - const: eukrea,mbimxsd35-baseboard
+          - const: eukrea,cpuimx35
+          - const: fsl,imx35
+
       - description: i.MX50 based Boards
         items:
           - enum:
@@ -80,6 +106,8 @@ properties:
       - description: i.MX6Q based Boards
         items:
           - enum:
+              - emtrion,emcon-mx6         # emCON-MX6D or emCON-MX6Q SoM
+              - emtrion,emcon-mx6-avari   # emCON-MX6D or emCON-MX6Q SoM on Avari Base
               - fsl,imx6q-arm2
               - fsl,imx6q-sabreauto
               - fsl,imx6q-sabrelite
@@ -99,6 +127,8 @@ properties:
         items:
           - enum:
               - eckelmann,imx6dl-ci4x10
+              - emtrion,emcon-mx6         # emCON-MX6S or emCON-MX6DL SoM
+              - emtrion,emcon-mx6-avari   # emCON-MX6S or emCON-MX6DL SoM on Avari Base
               - fsl,imx6dl-sabreauto      # i.MX6 DualLite/Solo SABRE Automotive Board
               - fsl,imx6dl-sabresd        # i.MX6 DualLite SABRE Smart Device Board
               - technologic,imx6dl-ts4900
@@ -156,6 +186,7 @@ properties:
         items:
           - enum:
               - fsl,imx7d-sdb             # i.MX7 SabreSD Board
+              - novtech,imx7d-meerkat96   # i.MX7 Meerkat96 Board
               - tq,imx7d-mba7             # i.MX7D TQ MBa7 with TQMa7D SoM
               - zii,imx7d-rpu2            # ZII RPU2 Board
           - const: fsl,imx7d
@@ -171,12 +202,25 @@ properties:
           - const: compulab,cl-som-imx7
           - const: fsl,imx7d
 
+      - description: i.MX7ULP based Boards
+        items:
+          - enum:
+              - fsl,imx7ulp-evk           # i.MX7ULP Evaluation Kit
+          - const: fsl,imx7ulp
+
       - description: i.MX8MM based Boards
         items:
           - enum:
               - fsl,imx8mm-evk            # i.MX8MM EVK Board
           - const: fsl,imx8mm
 
+      - description: i.MX8MQ based Boards
+        items:
+          - enum:
+              - fsl,imx8mq-evk            # i.MX8MQ EVK Board
+              - purism,librem5-devkit     # Purism Librem5 devkit
+          - const: fsl,imx8mq
+
       - description: i.MX8QXP based Boards
         items:
           - enum:
diff --git a/Documentation/devicetree/bindings/arm/idle-states.txt b/Documentation/devicetree/bindings/arm/idle-states.txt
index 45730ba60af5..2d325bed37e5 100644
--- a/Documentation/devicetree/bindings/arm/idle-states.txt
+++ b/Documentation/devicetree/bindings/arm/idle-states.txt
@@ -241,9 +241,13 @@ processor idle states, defined as device tree nodes, are listed.
 			   - "psci"
 			# On ARM 32-bit systems this property is optional
 
-The nodes describing the idle states (state) can only be defined within the
-idle-states node, any other configuration is considered invalid and therefore
-must be ignored.
+This assumes that the "enable-method" property is set to "psci" in the cpu
+node[6] that is responsible for setting up CPU idle management in the OS
+implementation.
+
+The nodes describing the idle states (state) can only be defined
+within the idle-states node, any other configuration is considered invalid
+and therefore must be ignored.
 
 ===========================================
 4 - state node
@@ -687,7 +691,7 @@ cpus {
     Documentation/devicetree/bindings/arm/cpus.yaml
 
 [2] ARM Linux Kernel documentation - PSCI bindings
-    Documentation/devicetree/bindings/arm/psci.txt
+    Documentation/devicetree/bindings/arm/psci.yaml
 
 [3] ARM Server Base System Architecture (SBSA)
     http://infocenter.arm.com/help/index.jsp
@@ -697,3 +701,6 @@ cpus {
 
 [5] Devicetree Specification
     https://www.devicetree.org/specifications/
+
+[6] ARM Linux Kernel documentation - Booting AArch64 Linux
+    Documentation/arm64/booting.rst
diff --git a/Documentation/devicetree/bindings/arm/mediatek.txt b/Documentation/devicetree/bindings/arm/mediatek.txt
deleted file mode 100644
index 56ac7896d6d8..000000000000
--- a/Documentation/devicetree/bindings/arm/mediatek.txt
+++ /dev/null
@@ -1,89 +0,0 @@
-MediaTek SoC based Platforms Device Tree Bindings
-
-Boards with a MediaTek SoC shall have the following property:
-
-Required root node property:
-
-compatible: Must contain one of
-   "mediatek,mt2701"
-   "mediatek,mt2712"
-   "mediatek,mt6580"
-   "mediatek,mt6589"
-   "mediatek,mt6592"
-   "mediatek,mt6755"
-   "mediatek,mt6765"
-   "mediatek,mt6795"
-   "mediatek,mt6797"
-   "mediatek,mt7622"
-   "mediatek,mt7623"
-   "mediatek,mt7629"
-   "mediatek,mt8127"
-   "mediatek,mt8135"
-   "mediatek,mt8173"
-   "mediatek,mt8183"
-
-
-Supported boards:
-
-- Evaluation board for MT2701:
-    Required root node properties:
-      - compatible = "mediatek,mt2701-evb", "mediatek,mt2701";
-- Evaluation board for MT2712:
-    Required root node properties:
-      - compatible = "mediatek,mt2712-evb", "mediatek,mt2712";
-- Evaluation board for MT6580:
-    Required root node properties:
-      - compatible = "mediatek,mt6580-evbp1", "mediatek,mt6580";
-- bq Aquaris5 smart phone:
-    Required root node properties:
-      - compatible = "mundoreader,bq-aquaris5", "mediatek,mt6589";
-- Evaluation board for MT6592:
-    Required root node properties:
-      - compatible = "mediatek,mt6592-evb", "mediatek,mt6592";
-- Evaluation phone for MT6755(Helio P10):
-    Required root node properties:
-      - compatible = "mediatek,mt6755-evb", "mediatek,mt6755";
-- Evaluation board for MT6765(Helio P22):
-    Required root node properties:
-      - compatible = "mediatek,mt6765-evb", "mediatek,mt6765";
-- Evaluation board for MT6795(Helio X10):
-    Required root node properties:
-      - compatible = "mediatek,mt6795-evb", "mediatek,mt6795";
-- Evaluation board for MT6797(Helio X20):
-    Required root node properties:
-      - compatible = "mediatek,mt6797-evb", "mediatek,mt6797";
-- Mediatek X20 Development Board:
-    Required root node properties:
-      - compatible = "archermind,mt6797-x20-dev", "mediatek,mt6797";
-- Reference board variant 1 for MT7622:
-    Required root node properties:
-      - compatible = "mediatek,mt7622-rfb1", "mediatek,mt7622";
-- Bananapi BPI-R64 for MT7622:
-    Required root node properties:
-      - compatible = "bananapi,bpi-r64", "mediatek,mt7622";
-- Reference board for MT7623a with eMMC:
-    Required root node properties:
-      - compatible = "mediatek,mt7623a-rfb-emmc", "mediatek,mt7623";
-- Reference board for MT7623a with NAND:
-    Required root node properties:
-      - compatible = "mediatek,mt7623a-rfb-nand", "mediatek,mt7623";
-- Reference board for MT7623n with eMMC:
-    Required root node properties:
-      - compatible = "mediatek,mt7623n-rfb-emmc", "mediatek,mt7623";
-- Bananapi BPI-R2 board:
-      - compatible = "bananapi,bpi-r2", "mediatek,mt7623";
-- Reference board for MT7629:
-    Required root node properties:
-      - compatible = "mediatek,mt7629-rfb", "mediatek,mt7629";
-- MTK mt8127 tablet moose EVB:
-    Required root node properties:
-      - compatible = "mediatek,mt8127-moose", "mediatek,mt8127";
-- MTK mt8135 tablet EVB:
-    Required root node properties:
-      - compatible = "mediatek,mt8135-evbp1", "mediatek,mt8135";
-- MTK mt8173 tablet EVB:
-    Required root node properties:
-      - compatible = "mediatek,mt8173-evb", "mediatek,mt8173";
-- Evaluation board for MT8183:
-    Required root node properties:
-      - compatible = "mediatek,mt8183-evb", "mediatek,mt8183";
diff --git a/Documentation/devicetree/bindings/arm/mediatek.yaml b/Documentation/devicetree/bindings/arm/mediatek.yaml
new file mode 100644
index 000000000000..a4ad2eb926f9
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/mediatek.yaml
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/mediatek.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek SoC based Platforms Device Tree Bindings
+
+maintainers:
+  - Sean Wang <sean.wang@mediatek.com>
+  - Matthias Brugger <matthias.bgg@gmail.com>
+description: |
+  Boards with a MediaTek SoC shall have the following properties.
+
+properties:
+  $nodename:
+    const: '/'
+  compatible:
+    oneOf:
+      - items:
+          - enum:
+              - mediatek,mt2701-evb
+          - const: mediatek,mt2701
+
+      - items:
+          - enum:
+              - mediatek,mt2712-evb
+          - const: mediatek,mt2712
+      - items:
+          - enum:
+              - mediatek,mt6580-evbp1
+          - const: mediatek,mt6580
+      - items:
+          - enum:
+              - mundoreader,bq-aquaris5
+          - const: mediatek,mt6589
+      - items:
+          - enum:
+              - mediatek,mt6592-evb
+          - const: mediatek,mt6592
+      - items:
+          - enum:
+              - mediatek,mt6755-evb
+          - const: mediatek,mt6755
+      - items:
+          - enum:
+              - mediatek,mt6765-evb
+          - const: mediatek,mt6765
+      - items:
+          - enum:
+              - mediatek,mt6795-evb
+          - const: mediatek,mt6795
+      - items:
+          - enum:
+              - archermind,mt6797-x20-dev
+              - mediatek,mt6797-evb
+          - const: mediatek,mt6797
+      - items:
+          - enum:
+              - bananapi,bpi-r64
+              - mediatek,mt7622-rfb1
+          - const: mediatek,mt7622
+      - items:
+          - enum:
+              - mediatek,mt7623a-rfb-emmc
+              - mediatek,mt7623a-rfb-nand
+              - mediatek,mt7623n-rfb-emmc
+              - bananapi,bpi-r2
+          - const: mediatek,mt7623
+
+      - items:
+          - enum:
+              - mediatek,mt7629-rfb
+          - const: mediatek,mt7629
+      - items:
+          - enum:
+              - mediatek,mt8127-moose
+          - const: mediatek,mt8127
+      - items:
+          - enum:
+              - mediatek,mt8135-evbp1
+          - const: mediatek,mt8135
+      - items:
+          - enum:
+              - mediatek,mt8173-evb
+          - const: mediatek,mt8173
+      - items:
+          - enum:
+              - mediatek,mt8183-evb
+          - const: mediatek,mt8183
+...
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt
index f3cef1a6d95c..07c9d813465c 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt
@@ -10,6 +10,7 @@ Required Properties:
 	- "mediatek,mt7622-audsys", "syscon"
 	- "mediatek,mt7623-audsys", "mediatek,mt2701-audsys", "syscon"
 	- "mediatek,mt8183-audiosys", "syscon"
+	- "mediatek,mt8516-audsys", "syscon"
 - #clock-cells: Must be 1
 
 The AUDSYS controller uses the common clk binding from
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt
index 30cb645c0e54..f5518f26a914 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt
@@ -9,6 +9,8 @@ Required Properties:
 	- "mediatek,mt7622-sgmiisys", "syscon"
 	- "mediatek,mt7629-sgmiisys", "syscon"
 - #clock-cells: Must be 1
+- mediatek,physpeed: Should be one of "auto", "1000" or "2500" to match up
+		     the capability of the target PHY.
 
 The SGMIISYS controller uses the common clk binding from
 Documentation/devicetree/bindings/clock/clock-bindings.txt
diff --git a/Documentation/devicetree/bindings/arm/moxart.txt b/Documentation/devicetree/bindings/arm/moxart.txt
deleted file mode 100644
index 11087edb0658..000000000000
--- a/Documentation/devicetree/bindings/arm/moxart.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-MOXA ART device tree bindings
-
-Boards with the MOXA ART SoC shall have the following properties:
-
-Required root node property:
-
-compatible = "moxa,moxart";
-
-Boards:
-
-- UC-7112-LX: embedded computer
-  compatible = "moxa,moxart-uc-7112-lx", "moxa,moxart"
diff --git a/Documentation/devicetree/bindings/arm/moxart.yaml b/Documentation/devicetree/bindings/arm/moxart.yaml
new file mode 100644
index 000000000000..c068df59fad2
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/moxart.yaml
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/moxart.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MOXA ART device tree bindings
+
+maintainers:
+  - Jonas Jensen <jonas.jensen@gmail.com>
+
+properties:
+  compatible:
+    description: UC-7112-LX embedded computer
+    items:
+      - const: moxa,moxart-uc-7112-lx
+      - const: moxa,moxart
+
+...
diff --git a/Documentation/devicetree/bindings/arm/nxp/lpc32xx.txt b/Documentation/devicetree/bindings/arm/nxp/lpc32xx.txt
deleted file mode 100644
index 56ec8ddc4a3b..000000000000
--- a/Documentation/devicetree/bindings/arm/nxp/lpc32xx.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-NXP LPC32xx Platforms Device Tree Bindings
-------------------------------------------
-
-Boards with the NXP LPC32xx SoC shall have the following properties:
-
-Required root node property:
-
-compatible: must be "nxp,lpc3220", "nxp,lpc3230", "nxp,lpc3240" or "nxp,lpc3250"
diff --git a/Documentation/devicetree/bindings/arm/nxp/lpc32xx.yaml b/Documentation/devicetree/bindings/arm/nxp/lpc32xx.yaml
new file mode 100644
index 000000000000..07f39d3eee7e
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/nxp/lpc32xx.yaml
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/nxp/lpc32xx.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP LPC32xx Platforms Device Tree Bindings
+
+maintainers:
+  - Roland Stigge <stigge@antcom.de>
+
+properties:
+  compatible:
+    oneOf:
+      - enum:
+          - nxp,lpc3220
+          - nxp,lpc3230
+          - nxp,lpc3240
+      - items:
+        - enum:
+            - ea,ea3250
+            - phytec,phy3250
+        - const: nxp,lpc3250
+
+...
diff --git a/Documentation/devicetree/bindings/arm/omap/crossbar.txt b/Documentation/devicetree/bindings/arm/omap/crossbar.txt
index 4cd5d873fc3a..a43e4c7aba3d 100644
--- a/Documentation/devicetree/bindings/arm/omap/crossbar.txt
+++ b/Documentation/devicetree/bindings/arm/omap/crossbar.txt
@@ -41,7 +41,7 @@ Examples:
 Consumer:
 ========
 See Documentation/devicetree/bindings/interrupt-controller/interrupts.txt and
-Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt for
+Documentation/devicetree/bindings/interrupt-controller/arm,gic.yaml for
 further details.
 
 An interrupt consumer on an SoC using crossbar will use:
diff --git a/Documentation/devicetree/bindings/arm/omap/omap.txt b/Documentation/devicetree/bindings/arm/omap/omap.txt
index 1c1e48fd94b5..b301f753ed2c 100644
--- a/Documentation/devicetree/bindings/arm/omap/omap.txt
+++ b/Documentation/devicetree/bindings/arm/omap/omap.txt
@@ -160,6 +160,9 @@ Boards:
 - AM335X phyCORE-AM335x: Development kit
   compatible = "phytec,am335x-pcm-953", "phytec,am335x-phycore-som", "ti,am33xx"
 
+- AM335x phyBOARD-REGOR: Single Board Computer
+  compatible = "phytec,am335x-regor", "phytec,am335x-phycore-som", "ti,am33xx"
+
 - AM335X UC-8100-ME-T: Communication-centric industrial computing platform
   compatible = "moxa,uc-8100-me-t", "ti,am33xx";
 
diff --git a/Documentation/devicetree/bindings/arm/psci.txt b/Documentation/devicetree/bindings/arm/psci.txt
deleted file mode 100644
index a2c4f1d52492..000000000000
--- a/Documentation/devicetree/bindings/arm/psci.txt
+++ /dev/null
@@ -1,111 +0,0 @@
-* Power State Coordination Interface (PSCI)
-
-Firmware implementing the PSCI functions described in ARM document number
-ARM DEN 0022A ("Power State Coordination Interface System Software on ARM
-processors") can be used by Linux to initiate various CPU-centric power
-operations.
-
-Issue A of the specification describes functions for CPU suspend, hotplug
-and migration of secure software.
-
-Functions are invoked by trapping to the privilege level of the PSCI
-firmware (specified as part of the binding below) and passing arguments
-in a manner similar to that specified by AAPCS:
-
-	 r0		=> 32-bit Function ID / return value
-	{r1 - r3}	=> Parameters
-
-Note that the immediate field of the trapping instruction must be set
-to #0.
-
-
-Main node required properties:
-
- - compatible    : should contain at least one of:
-
-     * "arm,psci"     : For implementations complying to PSCI versions prior
-			to 0.2.
-			For these cases function IDs must be provided.
-
-     * "arm,psci-0.2" : For implementations complying to PSCI 0.2.
-			Function IDs are not required and should be ignored by
-			an OS with PSCI 0.2 support, but are permitted to be
-			present for compatibility with existing software when
-			"arm,psci" is later in the compatible list.
-
-     * "arm,psci-1.0" : For implementations complying to PSCI 1.0.
-			PSCI 1.0 is backward compatible with PSCI 0.2 with
-			minor specification updates, as defined in the PSCI
-			specification[2].
-
- - method        : The method of calling the PSCI firmware. Permitted
-                   values are:
-
-                   "smc" : SMC #0, with the register assignments specified
-		           in this binding.
-
-                   "hvc" : HVC #0, with the register assignments specified
-		           in this binding.
-
-Main node optional properties:
-
- - cpu_suspend   : Function ID for CPU_SUSPEND operation
-
- - cpu_off       : Function ID for CPU_OFF operation
-
- - cpu_on        : Function ID for CPU_ON operation
-
- - migrate       : Function ID for MIGRATE operation
-
-Device tree nodes that require usage of PSCI CPU_SUSPEND function (ie idle
-state nodes, as per bindings in [1]) must specify the following properties:
-
-- arm,psci-suspend-param
-		Usage: Required for state nodes[1] if the corresponding
-                       idle-states node entry-method property is set
-                       to "psci".
-		Value type: <u32>
-		Definition: power_state parameter to pass to the PSCI
-			    suspend call.
-
-Example:
-
-Case 1: PSCI v0.1 only.
-
-	psci {
-		compatible	= "arm,psci";
-		method		= "smc";
-		cpu_suspend	= <0x95c10000>;
-		cpu_off		= <0x95c10001>;
-		cpu_on		= <0x95c10002>;
-		migrate		= <0x95c10003>;
-	};
-
-Case 2: PSCI v0.2 only
-
-	psci {
-		compatible	= "arm,psci-0.2";
-		method		= "smc";
-	};
-
-Case 3: PSCI v0.2 and PSCI v0.1.
-
-	A DTB may provide IDs for use by kernels without PSCI 0.2 support,
-	enabling firmware and hypervisors to support existing and new kernels.
-	These IDs will be ignored by kernels with PSCI 0.2 support, which will
-	use the standard PSCI 0.2 IDs exclusively.
-
-	psci {
-		compatible = "arm,psci-0.2", "arm,psci";
-		method = "hvc";
-
-		cpu_on = < arbitrary value >;
-		cpu_off = < arbitrary value >;
-
-		...
-	};
-
-[1] Kernel documentation - ARM idle states bindings
-    Documentation/devicetree/bindings/arm/idle-states.txt
-[2] Power State Coordination Interface (PSCI) specification
-    http://infocenter.arm.com/help/topic/com.arm.doc.den0022c/DEN0022C_Power_State_Coordination_Interface.pdf
diff --git a/Documentation/devicetree/bindings/arm/psci.yaml b/Documentation/devicetree/bindings/arm/psci.yaml
new file mode 100644
index 000000000000..7abdf58b335e
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/psci.yaml
@@ -0,0 +1,163 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/psci.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Power State Coordination Interface (PSCI)
+
+maintainers:
+  - Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+
+description: |+
+  Firmware implementing the PSCI functions described in ARM document number
+  ARM DEN 0022A ("Power State Coordination Interface System Software on ARM
+  processors") can be used by Linux to initiate various CPU-centric power
+  operations.
+
+  Issue A of the specification describes functions for CPU suspend, hotplug
+  and migration of secure software.
+
+  Functions are invoked by trapping to the privilege level of the PSCI
+  firmware (specified as part of the binding below) and passing arguments
+  in a manner similar to that specified by AAPCS:
+
+     r0       => 32-bit Function ID / return value
+    {r1 - r3}	=> Parameters
+
+  Note that the immediate field of the trapping instruction must be set
+  to #0.
+
+  [2] Power State Coordination Interface (PSCI) specification
+    http://infocenter.arm.com/help/topic/com.arm.doc.den0022c/DEN0022C_Power_State_Coordination_Interface.pdf
+
+properties:
+  compatible:
+    oneOf:
+      - description:
+          For implementations complying to PSCI versions prior to 0.2.
+        const: arm,psci
+
+      - description:
+          For implementations complying to PSCI 0.2.
+        const: arm,psci-0.2
+
+      - description:
+          For implementations complying to PSCI 0.2.
+          Function IDs are not required and should be ignored by an OS with
+          PSCI 0.2 support, but are permitted to be present for compatibility
+          with existing software when "arm,psci" is later in the compatible
+          list.
+        items:
+          - const: arm,psci-0.2
+          - const: arm,psci
+
+      - description:
+          For implementations complying to PSCI 1.0.
+        const: arm,psci-1.0
+
+      - description:
+          For implementations complying to PSCI 1.0.
+          PSCI 1.0 is backward compatible with PSCI 0.2 with minor
+          specification updates, as defined in the PSCI specification[2].
+        items:
+          - const: arm,psci-1.0
+          - const: arm,psci-0.2
+
+  method:
+    description: The method of calling the PSCI firmware.
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/string-array
+      - enum:
+          # SMC #0, with the register assignments specified in this binding.
+          - smc
+          # HVC #0, with the register assignments specified in this binding.
+          - hvc
+
+  cpu_suspend:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description: Function ID for CPU_SUSPEND operation
+
+  cpu_off:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description: Function ID for CPU_OFF operation
+
+  cpu_on:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description: Function ID for CPU_ON operation
+
+  migrate:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description: Function ID for MIGRATE operation
+
+  arm,psci-suspend-param:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description: |
+      power_state parameter to pass to the PSCI suspend call.
+
+      Device tree nodes that require usage of PSCI CPU_SUSPEND function (ie
+      idle state nodes with entry-method property is set to "psci", as per
+      bindings in [1]) must specify this property.
+
+      [1] Kernel documentation - ARM idle states bindings
+        Documentation/devicetree/bindings/arm/idle-states.txt
+
+
+required:
+  - compatible
+  - method
+
+allOf:
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: arm,psci
+    then:
+      required:
+        - cpu_off
+        - cpu_on
+
+examples:
+  - |+
+
+    // Case 1: PSCI v0.1 only.
+
+    psci {
+      compatible      = "arm,psci";
+      method          = "smc";
+      cpu_suspend     = <0x95c10000>;
+      cpu_off         = <0x95c10001>;
+      cpu_on          = <0x95c10002>;
+      migrate         = <0x95c10003>;
+    };
+
+  - |+
+
+    // Case 2: PSCI v0.2 only
+
+    psci {
+      compatible      = "arm,psci-0.2";
+      method          = "smc";
+    };
+
+
+  - |+
+
+    // Case 3: PSCI v0.2 and PSCI v0.1.
+
+    /*
+     * A DTB may provide IDs for use by kernels without PSCI 0.2 support,
+     * enabling firmware and hypervisors to support existing and new kernels.
+     * These IDs will be ignored by kernels with PSCI 0.2 support, which will
+     * use the standard PSCI 0.2 IDs exclusively.
+     */
+
+    psci {
+      compatible = "arm,psci-0.2", "arm,psci";
+      method = "hvc";
+
+      cpu_on = <0x95c10002>;
+      cpu_off = <0x95c10001>;
+    };
+...
diff --git a/Documentation/devicetree/bindings/arm/qcom.yaml b/Documentation/devicetree/bindings/arm/qcom.yaml
index f6316ab66385..54ef6b6b9189 100644
--- a/Documentation/devicetree/bindings/arm/qcom.yaml
+++ b/Documentation/devicetree/bindings/arm/qcom.yaml
@@ -102,6 +102,15 @@ properties:
           - const: qcom,msm8960
 
       - items:
+          - enum:
+              - fairphone,fp2
+              - lge,hammerhead
+              - sony,xperia-amami
+              - sony,xperia-castor
+              - sony,xperia-honami
+          - const: qcom,msm8974
+
+      - items:
           - const: qcom,msm8916-mtp/1
           - const: qcom,msm8916-mtp
           - const: qcom,msm8916
@@ -110,6 +119,11 @@ properties:
           - const: qcom,msm8996-mtp
 
       - items:
+          - enum:
+              - qcom,ipq4019-ap-dk04.1-c3
+              - qcom,ipq4019-ap-dk07.1-c1
+              - qcom,ipq4019-ap-dk07.1-c2
+              - qcom,ipq4019-dk04.1-c1
           - const: qcom,ipq4019
 
       - items:
diff --git a/Documentation/devicetree/bindings/arm/rda.txt b/Documentation/devicetree/bindings/arm/rda.txt
deleted file mode 100644
index 43c80762c428..000000000000
--- a/Documentation/devicetree/bindings/arm/rda.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-RDA Micro platforms device tree bindings
-----------------------------------------
-
-RDA8810PL SoC
-=============
-
-Required root node properties:
-
- - compatible :  must contain "rda,8810pl"
-
-
-Boards:
-
-Root node property compatible must contain, depending on board:
-
- - Orange Pi 2G-IoT: "xunlong,orangepi-2g-iot"
- - Orange Pi i96: "xunlong,orangepi-i96"
diff --git a/Documentation/devicetree/bindings/arm/rda.yaml b/Documentation/devicetree/bindings/arm/rda.yaml
new file mode 100644
index 000000000000..51cec2b63b04
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/rda.yaml
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/rda.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: RDA Micro platforms device tree bindings
+
+maintainers:
+  - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+
+properties:
+  compatible:
+    items:
+      - enum:
+          - xunlong,orangepi-2g-iot     # Orange Pi 2G-IoT
+          - xunlong,orangepi-i96        # Orange Pi i96
+      - const: rda,8810pl
+
+...
diff --git a/Documentation/devicetree/bindings/arm/renesas.yaml b/Documentation/devicetree/bindings/arm/renesas.yaml
index 19f379863d50..08c923f8c257 100644
--- a/Documentation/devicetree/bindings/arm/renesas.yaml
+++ b/Documentation/devicetree/bindings/arm/renesas.yaml
@@ -106,6 +106,14 @@ properties:
 
       - description: RZ/G2M (R8A774A1)
         items:
+          - enum:
+              - hoperun,hihope-rzg2m # HopeRun HiHope RZ/G2M platform
+          - const: renesas,r8a774a1
+
+      - items:
+          - enum:
+              - hoperun,hihope-rzg2-ex # HopeRun expansion board for HiHope RZ/G2 platforms
+          - const: hoperun,hihope-rzg2m
           - const: renesas,r8a774a1
 
       - description: RZ/G2E (R8A774C0)
diff --git a/Documentation/devicetree/bindings/arm/rockchip.yaml b/Documentation/devicetree/bindings/arm/rockchip.yaml
index 5c6bbf10abc9..34865042f4e4 100644
--- a/Documentation/devicetree/bindings/arm/rockchip.yaml
+++ b/Documentation/devicetree/bindings/arm/rockchip.yaml
@@ -316,6 +316,19 @@ properties:
           - const: haoyu,marsboard-rk3066
           - const: rockchip,rk3066a
 
+      - description: Hugsun X99 TV Box
+        items:
+          - const: hugsun,x99
+          - const: rockchip,rk3399
+
+      - description: Khadas Edge series boards
+        items:
+          - enum:
+              - khadas,edge
+              - khadas,edge-captain
+              - khadas,edge-v
+          - const: rockchip,rk3399
+
       - description: mqmaker MiQi
         items:
           - const: mqmaker,miqi
diff --git a/Documentation/devicetree/bindings/arm/stm32/mlahb.txt b/Documentation/devicetree/bindings/arm/stm32/mlahb.txt
new file mode 100644
index 000000000000..25307aa1eb9b
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/stm32/mlahb.txt
@@ -0,0 +1,37 @@
+ML-AHB interconnect bindings
+
+These bindings describe the STM32 SoCs ML-AHB interconnect bus which connects
+a Cortex-M subsystem with dedicated memories.
+The MCU SRAM and RETRAM memory parts can be accessed through different addresses
+(see "RAM aliases" in [1]) using different buses (see [2]) : balancing the
+Cortex-M firmware accesses among those ports allows to tune the system
+performance.
+
+[1]: https://www.st.com/resource/en/reference_manual/dm00327659.pdf
+[2]: https://wiki.st.com/stm32mpu/wiki/STM32MP15_RAM_mapping
+
+Required properties:
+- compatible: should be "simple-bus"
+- dma-ranges: describes memory addresses translation between the local CPU and
+	   the remote Cortex-M processor. Each memory region, is declared with
+	   3 parameters:
+		 - param 1: device base address (Cortex-M processor address)
+		 - param 2: physical base address (local CPU address)
+		 - param 3: size of the memory region.
+
+The Cortex-M remote processor accessed via the mlahb interconnect is described
+by a child node.
+
+Example:
+mlahb {
+	compatible = "simple-bus";
+	#address-cells = <1>;
+	#size-cells = <1>;
+	dma-ranges = <0x00000000 0x38000000 0x10000>,
+		     <0x10000000 0x10000000 0x60000>,
+		     <0x30000000 0x30000000 0x60000>;
+
+	m4_rproc: m4@10000000 {
+		...
+	};
+};
diff --git a/Documentation/devicetree/bindings/arm/stm32/stm32.txt b/Documentation/devicetree/bindings/arm/stm32/stm32.txt
deleted file mode 100644
index 6808ed9ddfd5..000000000000
--- a/Documentation/devicetree/bindings/arm/stm32/stm32.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-STMicroelectronics STM32 Platforms Device Tree Bindings
-
-Each device tree must specify which STM32 SoC it uses,
-using one of the following compatible strings:
-
-  st,stm32f429
-  st,stm32f469
-  st,stm32f746
-  st,stm32h743
-  st,stm32mp157
diff --git a/Documentation/devicetree/bindings/arm/stm32/stm32.yaml b/Documentation/devicetree/bindings/arm/stm32/stm32.yaml
new file mode 100644
index 000000000000..4d194f1eb03a
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/stm32/stm32.yaml
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/stm32/stm32.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: STMicroelectronics STM32 Platforms Device Tree Bindings
+
+maintainers:
+  - Alexandre Torgue <alexandre.torgue@st.com>
+
+properties:
+  compatible:
+    oneOf:
+      - items:
+          - const: st,stm32f429
+
+      - items:
+          - const: st,stm32f469
+
+      - items:
+          - const: st,stm32f746
+
+      - items:
+          - const: st,stm32h743
+
+      - items:
+          - enum:
+              - arrow,stm32mp157a-avenger96 # Avenger96
+          - const: st,stm32mp157
+...
diff --git a/Documentation/devicetree/bindings/arm/sunxi.yaml b/Documentation/devicetree/bindings/arm/sunxi.yaml
index 285f4fc8519d..000a00d12d6a 100644
--- a/Documentation/devicetree/bindings/arm/sunxi.yaml
+++ b/Documentation/devicetree/bindings/arm/sunxi.yaml
@@ -263,7 +263,7 @@ properties:
 
       - description: ICNova A20 SWAC
         items:
-          - const: swac,icnova-a20-swac
+          - const: incircuit,icnova-a20-swac
           - const: incircuit,icnova-a20
           - const: allwinner,sun7i-a20
 
diff --git a/Documentation/devicetree/bindings/arm/ti/k3.txt b/Documentation/devicetree/bindings/arm/ti/k3.txt
index 6a059cabb2da..333e7256126a 100644
--- a/Documentation/devicetree/bindings/arm/ti/k3.txt
+++ b/Documentation/devicetree/bindings/arm/ti/k3.txt
@@ -13,6 +13,9 @@ architecture it uses, using one of the following compatible values:
 - AM654
   compatible = "ti,am654";
 
+- J721E
+  compatible = "ti,j721e";
+
 Boards
 ------
 
diff --git a/Documentation/devicetree/bindings/arm/xen.txt b/Documentation/devicetree/bindings/arm/xen.txt
index c9b9321434ea..db5c56db30ec 100644
--- a/Documentation/devicetree/bindings/arm/xen.txt
+++ b/Documentation/devicetree/bindings/arm/xen.txt
@@ -54,7 +54,7 @@ hypervisor {
 };
 
 The format and meaning of the "xen,uefi-*" parameters are similar to those in
-Documentation/arm/uefi.txt, which are provided by the regular UEFI stub. However
+Documentation/arm/uefi.rst, which are provided by the regular UEFI stub. However
 they differ because they are provided by the Xen hypervisor, together with a set
 of UEFI runtime services implemented via hypercalls, see
 http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,platform.h.html.
diff --git a/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml b/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml
new file mode 100644
index 000000000000..be32f087c529
--- /dev/null
+++ b/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bus/allwinner,sun8i-a23-rsb.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A23 RSB Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  "#address-cells":
+    const: 1
+
+  "#size-cells":
+    const: 0
+
+  compatible:
+    oneOf:
+      - const: allwinner,sun8i-a23-rsb
+      - items:
+        - const: allwinner,sun8i-a83t-rsb
+        - const: allwinner,sun8i-a23-rsb
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+  resets:
+    maxItems: 1
+
+  clock-frequency:
+    minimum: 1
+    maximum: 20000000
+
+patternProperties:
+  "^.*@[0-9a-fA-F]+$":
+    type: object
+    properties:
+      reg:
+        maxItems: 1
+
+    required:
+      - reg
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - resets
+
+examples:
+  - |
+    rsb@1f03400 {
+        compatible = "allwinner,sun8i-a23-rsb";
+        reg = <0x01f03400 0x400>;
+        interrupts = <0 39 4>;
+        clocks = <&apb0_gates 3>;
+        clock-frequency = <3000000>;
+        resets = <&apb0_rst 3>;
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        pmic@3e3 {
+            compatible = "...";
+            reg = <0x3e3>;
+
+            /* ... */
+        };
+    };
+
+additionalProperties: false
diff --git a/Documentation/devicetree/bindings/bus/sunxi-rsb.txt b/Documentation/devicetree/bindings/bus/sunxi-rsb.txt
deleted file mode 100644
index eb3ed628c6f1..000000000000
--- a/Documentation/devicetree/bindings/bus/sunxi-rsb.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-Allwinner Reduced Serial Bus (RSB) controller
-
-The RSB controller found on later Allwinner SoCs is an SMBus like 2 wire
-serial bus with 1 master and up to 15 slaves. It is represented by a node
-for the controller itself, and child nodes representing the slave devices.
-
-Required properties :
-
- - reg             : Offset and length of the register set for the controller.
- - compatible      : Shall be "allwinner,sun8i-a23-rsb".
- - interrupts      : The interrupt line associated to the RSB controller.
- - clocks          : The gate clk associated to the RSB controller.
- - resets          : The reset line associated to the RSB controller.
- - #address-cells  : shall be 1
- - #size-cells     : shall be 0
-
-Optional properties :
-
- - clock-frequency : Desired RSB bus clock frequency in Hz. Maximum is 20MHz.
-		     If not set this defaults to 3MHz.
-
-Child nodes:
-
-An RSB controller node can contain zero or more child nodes representing
-slave devices on the bus.  Child 'reg' properties should contain the slave
-device's hardware address. The hardware address is hardwired in the device,
-which can normally be found in the datasheet.
-
-Example:
-
-	rsb@1f03400 {
-		compatible = "allwinner,sun8i-a23-rsb";
-		reg = <0x01f03400 0x400>;
-		interrupts = <0 39 4>;
-		clocks = <&apb0_gates 3>;
-		clock-frequency = <3000000>;
-		resets = <&apb0_rst 3>;
-		#address-cells = <1>;
-		#size-cells = <0>;
-
-		pmic@3e3 {
-			compatible = "...";
-			reg = <0x3e3>;
-
-			/* ... */
-		};
-	};
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml
new file mode 100644
index 000000000000..c935405458fe
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml
@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/phy/allwinner,sun4i-a10-ccu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner Clock Control Unit Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  "#clock-cells":
+    const: 1
+
+  "#reset-cells":
+    const: 1
+
+  compatible:
+    enum:
+      - allwinner,sun4i-a10-ccu
+      - allwinner,sun5i-a10s-ccu
+      - allwinner,sun5i-a13-ccu
+      - allwinner,sun6i-a31-ccu
+      - allwinner,sun7i-a20-ccu
+      - allwinner,sun8i-a23-ccu
+      - allwinner,sun8i-a33-ccu
+      - allwinner,sun8i-a83t-ccu
+      - allwinner,sun8i-a83t-r-ccu
+      - allwinner,sun8i-h3-ccu
+      - allwinner,sun8i-h3-r-ccu
+      - allwinner,sun8i-r40-ccu
+      - allwinner,sun8i-v3s-ccu
+      - allwinner,sun9i-a80-ccu
+      - allwinner,sun50i-a64-ccu
+      - allwinner,sun50i-a64-r-ccu
+      - allwinner,sun50i-h5-ccu
+      - allwinner,sun50i-h6-ccu
+      - allwinner,sun50i-h6-r-ccu
+      - allwinner,suniv-f1c100s-ccu
+      - nextthing,gr8-ccu
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    minItems: 2
+    maxItems: 4
+    items:
+      - description: High Frequency Oscillator (usually at 24MHz)
+      - description: Low Frequency Oscillator (usually at 32kHz)
+      - description: Internal Oscillator
+      - description: Peripherals PLL
+
+  clock-names:
+    minItems: 2
+    maxItems: 4
+    items:
+      - const: hosc
+      - const: losc
+      - const: iosc
+      - const: pll-periph
+
+required:
+  - "#clock-cells"
+  - "#reset-cells"
+  - compatible
+  - reg
+  - clocks
+  - clock-names
+
+if:
+  properties:
+    compatible:
+      enum:
+        - allwinner,sun8i-a83t-r-ccu
+        - allwinner,sun8i-h3-r-ccu
+        - allwinner,sun50i-a64-r-ccu
+        - allwinner,sun50i-h6-r-ccu
+
+then:
+  properties:
+    clocks:
+      minItems: 4
+      maxItems: 4
+
+    clock-names:
+      minItems: 4
+      maxItems: 4
+
+else:
+  if:
+    properties:
+      compatible:
+        const: allwinner,sun50i-h6-ccu
+
+  then:
+    properties:
+      clocks:
+        minItems: 3
+        maxItems: 3
+
+      clock-names:
+        minItems: 3
+        maxItems: 3
+
+  else:
+    properties:
+      clocks:
+        minItems: 2
+        maxItems: 2
+
+      clock-names:
+        minItems: 2
+        maxItems: 2
+
+additionalProperties: false
+
+examples:
+  - |
+    ccu: clock@1c20000 {
+        compatible = "allwinner,sun8i-h3-ccu";
+        reg = <0x01c20000 0x400>;
+        clocks = <&osc24M>, <&osc32k>;
+        clock-names = "hosc", "losc";
+        #clock-cells = <1>;
+        #reset-cells = <1>;
+    };
+
+  - |
+    r_ccu: clock@1f01400 {
+        compatible = "allwinner,sun50i-a64-r-ccu";
+        reg = <0x01f01400 0x100>;
+        clocks = <&osc24M>, <&osc32k>, <&iosc>, <&ccu 11>;
+        clock-names = "hosc", "losc", "iosc", "pll-periph";
+        #clock-cells = <1>;
+        #reset-cells = <1>;
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt b/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt
index 5c8b105be4d6..6eaa52092313 100644
--- a/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt
+++ b/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt
@@ -10,6 +10,7 @@ Required Properties:
 		"amlogic,gxl-clkc" for GXL and GXM SoC,
 		"amlogic,axg-clkc" for AXG SoC.
 		"amlogic,g12a-clkc" for G12A SoC.
+		"amlogic,g12b-clkc" for G12B SoC.
 - clocks : list of clock phandle, one for each entry clock-names.
 - clock-names : should contain the following:
   * "xtal": the platform xtal
diff --git a/Documentation/devicetree/bindings/clock/at91-clock.txt b/Documentation/devicetree/bindings/clock/at91-clock.txt
index b520280e33ff..13f45db3b66d 100644
--- a/Documentation/devicetree/bindings/clock/at91-clock.txt
+++ b/Documentation/devicetree/bindings/clock/at91-clock.txt
@@ -9,10 +9,11 @@ Slow Clock controller:
 Required properties:
 - compatible : shall be one of the following:
 	"atmel,at91sam9x5-sckc",
-	"atmel,sama5d3-sckc" or
-	"atmel,sama5d4-sckc":
+	"atmel,sama5d3-sckc",
+	"atmel,sama5d4-sckc" or
+	"microchip,sam9x60-sckc":
 		at91 SCKC (Slow Clock Controller)
-- #clock-cells : shall be 0.
+- #clock-cells : shall be 1 for "microchip,sam9x60-sckc" otherwise shall be 0.
 - clocks : shall be the input parent clock phandle for the clock.
 
 Optional properties:
diff --git a/Documentation/devicetree/bindings/clock/brcm,bcm63xx-clocks.txt b/Documentation/devicetree/bindings/clock/brcm,bcm63xx-clocks.txt
new file mode 100644
index 000000000000..3041657e2f96
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/brcm,bcm63xx-clocks.txt
@@ -0,0 +1,22 @@
+Gated Clock Controller Bindings for MIPS based BCM63XX SoCs
+
+Required properties:
+- compatible: must be one of:
+	 "brcm,bcm3368-clocks"
+	 "brcm,bcm6328-clocks"
+	 "brcm,bcm6358-clocks"
+	 "brcm,bcm6362-clocks"
+	 "brcm,bcm6368-clocks"
+	 "brcm,bcm63268-clocks"
+
+- reg: Address and length of the register set
+- #clock-cells: must be <1>
+
+
+Example:
+
+clkctl: clock-controller@10000004 {
+	compatible = "brcm,bcm6328-clocks";
+	reg = <0x10000004 0x4>;
+	#clock-cells = <1>;
+};
diff --git a/Documentation/devicetree/bindings/clock/cirrus,lochnagar.txt b/Documentation/devicetree/bindings/clock/cirrus,lochnagar.txt
index b8d8ef3bdc5f..52a064c789ee 100644
--- a/Documentation/devicetree/bindings/clock/cirrus,lochnagar.txt
+++ b/Documentation/devicetree/bindings/clock/cirrus,lochnagar.txt
@@ -40,6 +40,7 @@ Optional properties:
        input audio clocks from host system.
      - ln-psia1-mclk, ln-psia2-mclk : Optional input audio clocks from
        external connector.
+     - ln-spdif-mclk : Optional input audio clock from SPDIF.
      - ln-spdif-clkout : Optional input audio clock from SPDIF.
      - ln-adat-mclk : Optional input audio clock from ADAT.
      - ln-pmic-32k : On board fixed clock.
diff --git a/Documentation/devicetree/bindings/clock/mvebu-core-clock.txt b/Documentation/devicetree/bindings/clock/mvebu-core-clock.txt
index 796c260c183d..d8f5c490f893 100644
--- a/Documentation/devicetree/bindings/clock/mvebu-core-clock.txt
+++ b/Documentation/devicetree/bindings/clock/mvebu-core-clock.txt
@@ -59,6 +59,7 @@ Required properties:
 	"marvell,dove-core-clock" - for Dove SoC core clocks
 	"marvell,kirkwood-core-clock" - for Kirkwood SoC (except mv88f6180)
 	"marvell,mv88f6180-core-clock" - for Kirkwood MV88f6180 SoC
+	"marvell,mv98dx1135-core-clock" - for Kirkwood 98dx1135 SoC
 	"marvell,mv88f5181-core-clock" - for Orion MV88F5181 SoC
 	"marvell,mv88f5182-core-clock" - for Orion MV88F5182 SoC
 	"marvell,mv88f5281-core-clock" - for Orion MV88F5281 SoC
diff --git a/Documentation/devicetree/bindings/clock/qcom,gpucc.txt b/Documentation/devicetree/bindings/clock/qcom,gpucc.txt
index 4e5215ef1acd..269afe8a757e 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gpucc.txt
+++ b/Documentation/devicetree/bindings/clock/qcom,gpucc.txt
@@ -2,13 +2,15 @@ Qualcomm Graphics Clock & Reset Controller Binding
 --------------------------------------------------
 
 Required properties :
-- compatible : shall contain "qcom,sdm845-gpucc"
+- compatible : shall contain "qcom,sdm845-gpucc" or "qcom,msm8998-gpucc"
 - reg : shall contain base register location and length
 - #clock-cells : from common clock binding, shall contain 1
 - #reset-cells : from common reset binding, shall contain 1
 - #power-domain-cells : from generic power domain binding, shall contain 1
 - clocks : shall contain the XO clock
+	   shall contain the gpll0 out main clock (msm8998)
 - clock-names : shall be "xo"
+		shall be "gpll0" (msm8998)
 
 Example:
 	gpucc: clock-controller@5090000 {
diff --git a/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt b/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt
index d60b99756bb9..aed713cf0831 100644
--- a/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt
+++ b/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt
@@ -13,6 +13,7 @@ Required Properties:
 	- external (optional) RGMII_REFCLK
   - clock-names: Must be:
         clock-names = "mclk", "rtc", "jtag", "rgmii_ref_ext";
+  - #power-domain-cells: Must be 0
 
 Examples
 --------
@@ -27,6 +28,7 @@ Examples
 		clocks = <&ext_mclk>, <&ext_rtc_clk>,
 				<&ext_jtag_clk>, <&ext_rgmii_ref>;
 		clock-names = "mclk", "rtc", "jtag", "rgmii_ref_ext";
+		#power-domain-cells = <0>;
 	};
 
   - Other nodes can use the clocks provided by SYSCTRL as in:
@@ -38,6 +40,7 @@ Examples
 		interrupts = <GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>;
 		reg-shift = <2>;
 		reg-io-width = <4>;
-		clocks = <&sysctrl R9A06G032_CLK_UART0>;
-		clock-names = "baudclk";
+		clocks = <&sysctrl R9A06G032_CLK_UART0>, <&sysctrl R9A06G032_HCLK_UART0>;
+		clock-names = "baudclk", "apb_pclk";
+		power-domains = <&sysctrl>;
 	};
diff --git a/Documentation/devicetree/bindings/clock/samsung,s5pv210-clock.txt b/Documentation/devicetree/bindings/clock/samsung,s5pv210-clock.txt
index 15b48e20a061..a86c83bf9d4e 100644
--- a/Documentation/devicetree/bindings/clock/samsung,s5pv210-clock.txt
+++ b/Documentation/devicetree/bindings/clock/samsung,s5pv210-clock.txt
@@ -35,7 +35,7 @@ board device tree, including the system base clock, as selected by XOM[0]
 pin of the SoC. Refer to generic fixed rate clock bindings
 documentation[1] for more information how to specify these clocks.
 
-[1] Documentation/devicetree/bindings/clock/fixed-clock.txt
+[1] Documentation/devicetree/bindings/clock/fixed-clock.yaml
 
 Example: Clock controller node:
 
diff --git a/Documentation/devicetree/bindings/clock/silabs,si5341.txt b/Documentation/devicetree/bindings/clock/silabs,si5341.txt
new file mode 100644
index 000000000000..a70c333e4cd4
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/silabs,si5341.txt
@@ -0,0 +1,162 @@
+Binding for Silicon Labs Si5341 and Si5340 programmable i2c clock generator.
+
+Reference
+[1] Si5341 Data Sheet
+    https://www.silabs.com/documents/public/data-sheets/Si5341-40-D-DataSheet.pdf
+[2] Si5341 Reference Manual
+    https://www.silabs.com/documents/public/reference-manuals/Si5341-40-D-RM.pdf
+
+The Si5341 and Si5340 are programmable i2c clock generators with up to 10 output
+clocks. The chip contains a PLL that sources 5 (or 4) multisynth clocks, which
+in turn can be directed to any of the 10 (or 4) outputs through a divider.
+The internal structure of the clock generators can be found in [2].
+
+The driver can be used in "as is" mode, reading the current settings from the
+chip at boot, in case you have a (pre-)programmed device. If the PLL is not
+configured when the driver probes, it assumes the driver must fully initialize
+it.
+
+The device type, speed grade and revision are determined runtime by probing.
+
+The driver currently only supports XTAL input mode, and does not support any
+fancy input configurations. They can still be programmed into the chip and
+the driver will leave them "as is".
+
+==I2C device node==
+
+Required properties:
+- compatible: shall be one of the following:
+	"silabs,si5340" - Si5340 A/B/C/D
+	"silabs,si5341" - Si5341 A/B/C/D
+- reg: i2c device address, usually 0x74
+- #clock-cells: from common clock binding; shall be set to 2.
+	The first value is "0" for outputs, "1" for synthesizers.
+	The second value is the output or synthesizer index.
+- clocks: from common clock binding; list of parent clock  handles,
+	corresponding to inputs. Use a fixed clock for the "xtal" input.
+	At least one must be present.
+- clock-names: One of: "xtal", "in0", "in1", "in2"
+- vdd-supply: Regulator node for VDD
+
+Optional properties:
+- vdda-supply: Regulator node for VDDA
+- vdds-supply: Regulator node for VDDS
+- silabs,pll-m-num, silabs,pll-m-den: Numerator and denominator for PLL
+  feedback divider. Must be such that the PLL output is in the valid range. For
+  example, to create 14GHz from a 48MHz xtal, use m-num=14000 and m-den=48. Only
+  the fraction matters, using 3500 and 12 will deliver the exact same result.
+  If these are not specified, and the PLL is not yet programmed when the driver
+  probes, the PLL will be set to 14GHz.
+- silabs,reprogram: When present, the driver will always assume the device must
+  be initialized, and always performs the soft-reset routine. Since this will
+  temporarily stop all output clocks, don't do this if the chip is generating
+  the CPU clock for example.
+- interrupts: Interrupt for INTRb pin.
+- #address-cells: shall be set to 1.
+- #size-cells: shall be set to 0.
+
+
+== Child nodes: Outputs ==
+
+The child nodes list the output clocks.
+
+Each of the clock outputs can be overwritten individually by using a child node.
+If a child node for a clock output is not set, the configuration remains
+unchanged.
+
+Required child node properties:
+- reg: number of clock output.
+
+Optional child node properties:
+- vdd-supply: Regulator node for VDD for this output. The driver selects default
+	values for common-mode and amplitude based on the voltage.
+- silabs,format: Output format, one of:
+	1 = differential (defaults to LVDS levels)
+	2 = low-power (defaults to HCSL levels)
+	4 = LVCMOS
+- silabs,common-mode: Manually override output common mode, see [2] for values
+- silabs,amplitude: Manually override output amplitude, see [2] for values
+- silabs,synth-master: boolean. If present, this output is allowed to change the
+	multisynth frequency dynamically.
+- silabs,silabs,disable-high: boolean. If set, the clock output is driven HIGH
+	when disabled, otherwise it's driven LOW.
+
+==Example==
+
+/* 48MHz reference crystal */
+ref48: ref48M {
+	compatible = "fixed-clock";
+	#clock-cells = <0>;
+	clock-frequency = <48000000>;
+};
+
+i2c-master-node {
+	/* Programmable clock (for logic) */
+	si5341: clock-generator@74 {
+		reg = <0x74>;
+		compatible = "silabs,si5341";
+		#clock-cells = <2>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+		clocks = <&ref48>;
+		clock-names = "xtal";
+
+		silabs,pll-m-num = <14000>; /* PLL at 14.0 GHz */
+		silabs,pll-m-den = <48>;
+		silabs,reprogram; /* Chips are not programmed, always reset */
+
+		out@0 {
+			reg = <0>;
+			silabs,format = <1>; /* LVDS 3v3 */
+			silabs,common-mode = <3>;
+			silabs,amplitude = <3>;
+			silabs,synth-master;
+		};
+
+		/*
+		 * Output 6 configuration:
+		 *  LVDS 1v8
+		 */
+		out@6 {
+			reg = <6>;
+			silabs,format = <1>; /* LVDS 1v8 */
+			silabs,common-mode = <13>;
+			silabs,amplitude = <3>;
+		};
+
+		/*
+		 * Output 8 configuration:
+		 *  HCSL 3v3
+		 */
+		out@8 {
+			reg = <8>;
+			silabs,format = <2>;
+			silabs,common-mode = <11>;
+			silabs,amplitude = <3>;
+		};
+	};
+};
+
+some-video-node {
+	/* Standard clock bindings */
+	clock-names = "pixel";
+	clocks = <&si5341 0 7>; /* Output 7 */
+
+	/* Set output 7 to use syntesizer 3 as its parent */
+	assigned-clocks = <&si5341 0 7>, <&si5341 1 3>;
+	assigned-clock-parents = <&si5341 1 3>;
+	/* Set output 7 to 148.5 MHz using a synth frequency of 594 MHz */
+	assigned-clock-rates = <148500000>, <594000000>;
+};
+
+some-audio-node {
+	clock-names = "i2s-clk";
+	clocks = <&si5341 0 0>;
+	/*
+	 * since output 0 is a synth-master, the synth will be automatically set
+	 * to an appropriate frequency when the audio driver requests another
+	 * frequency. We give control over synth 2 to this output here.
+	 */
+	assigned-clocks = <&si5341 0 0>;
+	assigned-clock-parents = <&si5341 1 2>;
+};
diff --git a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt b/Documentation/devicetree/bindings/clock/sunxi-ccu.txt
deleted file mode 100644
index e3bd88ae456b..000000000000
--- a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt
+++ /dev/null
@@ -1,62 +0,0 @@
-Allwinner Clock Control Unit Binding
-------------------------------------
-
-Required properties :
-- compatible: must contain one of the following compatibles:
-		- "allwinner,sun4i-a10-ccu"
-		- "allwinner,sun5i-a10s-ccu"
-		- "allwinner,sun5i-a13-ccu"
-		- "allwinner,sun6i-a31-ccu"
-		- "allwinner,sun7i-a20-ccu"
-		- "allwinner,sun8i-a23-ccu"
-		- "allwinner,sun8i-a33-ccu"
-		- "allwinner,sun8i-a83t-ccu"
-		- "allwinner,sun8i-a83t-r-ccu"
-		- "allwinner,sun8i-h3-ccu"
-		- "allwinner,sun8i-h3-r-ccu"
-+		- "allwinner,sun8i-r40-ccu"
-		- "allwinner,sun8i-v3s-ccu"
-		- "allwinner,sun9i-a80-ccu"
-		- "allwinner,sun50i-a64-ccu"
-		- "allwinner,sun50i-a64-r-ccu"
-		- "allwinner,sun50i-h5-ccu"
-		- "allwinner,sun50i-h6-ccu"
-		- "allwinner,sun50i-h6-r-ccu"
-		- "allwinner,suniv-f1c100s-ccu"
-		- "nextthing,gr8-ccu"
-
-- reg: Must contain the registers base address and length
-- clocks: phandle to the oscillators feeding the CCU. Two are needed:
-  - "hosc": the high frequency oscillator (usually at 24MHz)
-  - "losc": the low frequency oscillator (usually at 32kHz)
-	    On the A83T, this is the internal 16MHz oscillator divided by 512
-- clock-names: Must contain the clock names described just above
-- #clock-cells : must contain 1
-- #reset-cells : must contain 1
-
-For the main CCU on H6, one more clock is needed:
-- "iosc": the SoC's internal frequency oscillator
-
-For the PRCM CCUs on A83T/H3/A64/H6, two more clocks are needed:
-- "pll-periph": the SoC's peripheral PLL from the main CCU
-- "iosc": the SoC's internal frequency oscillator
-
-Example for generic CCU:
-ccu: clock@1c20000 {
-	compatible = "allwinner,sun8i-h3-ccu";
-	reg = <0x01c20000 0x400>;
-	clocks = <&osc24M>, <&osc32k>;
-	clock-names = "hosc", "losc";
-	#clock-cells = <1>;
-	#reset-cells = <1>;
-};
-
-Example for PRCM CCU:
-r_ccu: clock@1f01400 {
-	compatible = "allwinner,sun50i-a64-r-ccu";
-	reg = <0x01f01400 0x100>;
-	clocks = <&osc24M>, <&osc32k>, <&iosc>, <&ccu CLK_PLL_PERIPH0>;
-	clock-names = "hosc", "losc", "iosc", "pll-periph";
-	#clock-cells = <1>;
-	#reset-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/common-properties.txt b/Documentation/devicetree/bindings/common-properties.txt
index a3448bfa1c82..98a28130e100 100644
--- a/Documentation/devicetree/bindings/common-properties.txt
+++ b/Documentation/devicetree/bindings/common-properties.txt
@@ -5,30 +5,29 @@ Endianness
 ----------
 
 The Devicetree Specification does not define any properties related to hardware
-byteswapping, but endianness issues show up frequently in porting Linux to
+byte swapping, but endianness issues show up frequently in porting drivers to
 different machine types.  This document attempts to provide a consistent
-way of handling byteswapping across drivers.
+way of handling byte swapping across drivers.
 
 Optional properties:
  - big-endian: Boolean; force big endian register accesses
    unconditionally (e.g. ioread32be/iowrite32be).  Use this if you
-   know the peripheral always needs to be accessed in BE mode.
+   know the peripheral always needs to be accessed in big endian (BE) mode.
  - little-endian: Boolean; force little endian register accesses
    unconditionally (e.g. readl/writel).  Use this if you know the
-   peripheral always needs to be accessed in LE mode.
+   peripheral always needs to be accessed in little endian (LE) mode.
  - native-endian: Boolean; always use register accesses matched to the
    endianness of the kernel binary (e.g. LE vmlinux -> readl/writel,
-   BE vmlinux -> ioread32be/iowrite32be).  In this case no byteswaps
+   BE vmlinux -> ioread32be/iowrite32be).  In this case no byte swaps
    will ever be performed.  Use this if the hardware "self-adjusts"
    register endianness based on the CPU's configured endianness.
 
 If a binding supports these properties, then the binding should also
 specify the default behavior if none of these properties are present.
 In such cases, little-endian is the preferred default, but it is not
-a requirement.  The of_device_is_big_endian() and of_fdt_is_big_endian()
-helper functions do assume that little-endian is the default, because
-most existing (PCI-based) drivers implicitly default to LE by using
-readl/writel for MMIO accesses.
+a requirement.  Some implementations assume that little-endian is
+the default, because most existing (PCI-based) drivers implicitly
+default to LE for their MMIO accesses.
 
 Examples:
 Scenario 1 : CPU in LE mode & device in LE mode.
diff --git a/Documentation/devicetree/bindings/cpufreq/imx-cpufreq-dt.txt b/Documentation/devicetree/bindings/cpufreq/imx-cpufreq-dt.txt
new file mode 100644
index 000000000000..87bff5add3f9
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/imx-cpufreq-dt.txt
@@ -0,0 +1,37 @@
+i.MX CPUFreq-DT OPP bindings
+================================
+
+Certain i.MX SoCs support different OPPs depending on the "market segment" and
+"speed grading" value which are written in fuses. These bits are combined with
+the opp-supported-hw values for each OPP to check if the OPP is allowed.
+
+Required properties:
+--------------------
+
+For each opp entry in 'operating-points-v2' table:
+- opp-supported-hw: Two bitmaps indicating:
+  - Supported speed grade mask
+  - Supported market segment mask
+    0: Consumer
+    1: Extended Consumer
+    2: Industrial
+    3: Automotive
+
+Example:
+--------
+
+opp_table {
+	compatible = "operating-points-v2";
+	opp-1000000000 {
+		opp-hz = /bits/ 64 <1000000000>;
+		/* grade >= 0, consumer only */
+		opp-supported-hw = <0xf>, <0x3>;
+	};
+
+	opp-1300000000 {
+		opp-hz = /bits/ 64 <1300000000>;
+		opp-microvolt = <1000000>;
+		/* grade >= 1, all segments */
+		opp-supported-hw = <0xe>, <0x7>;
+	};
+}
diff --git a/Documentation/devicetree/bindings/crypto/atmel-crypto.txt b/Documentation/devicetree/bindings/crypto/atmel-crypto.txt
index 6b458bb2440d..f2aab3dc2b52 100644
--- a/Documentation/devicetree/bindings/crypto/atmel-crypto.txt
+++ b/Documentation/devicetree/bindings/crypto/atmel-crypto.txt
@@ -66,16 +66,3 @@ sha@f8034000 {
 	dmas = <&dma1 2 17>;
 	dma-names = "tx";
 };
-
-* Eliptic Curve Cryptography (I2C)
-
-Required properties:
-- compatible : must be "atmel,atecc508a".
-- reg: I2C bus address of the device.
-- clock-frequency: must be present in the i2c controller node.
-
-Example:
-atecc508a@c0 {
-	compatible = "atmel,atecc508a";
-	reg = <0xC0>;
-};
diff --git a/Documentation/devicetree/bindings/csky/pmu.txt b/Documentation/devicetree/bindings/csky/pmu.txt
new file mode 100644
index 000000000000..728d05ca6a1c
--- /dev/null
+++ b/Documentation/devicetree/bindings/csky/pmu.txt
@@ -0,0 +1,38 @@
+===============================
+C-SKY Performance Monitor Units
+===============================
+
+C-SKY Performance Monitor is designed for ck807/ck810/ck860 SMP soc and
+it could count cpu's events for helping analysis performance issues.
+
+============================
+PMU node bindings definition
+============================
+
+	Description: Describes PMU
+
+	PROPERTIES
+
+	- compatible
+		Usage: required
+		Value type: <string>
+		Definition: must be "csky,csky-pmu"
+	- interrupts
+		Usage: required
+		Value type: <u32 IRQ_TYPE_XXX>
+		Definition: must be pmu irq num defined by soc
+	- count-width
+		Usage: optional
+		Value type: <u32>
+		Definition: the width of pmu counter
+
+Examples:
+---------
+#include <dt-bindings/interrupt-controller/irq.h>
+
+	pmu: performace-monitor {
+		compatible = "csky,csky-pmu";
+		interrupts = <23 IRQ_TYPE_EDGE_RISING>;
+		interrupt-parent = <&intc>;
+		count-width = <48>;
+        };
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-mipi-dsi.yaml b/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-mipi-dsi.yaml
new file mode 100644
index 000000000000..47950fced28d
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-mipi-dsi.yaml
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/allwinner,sun6i-a31-mipi-dsi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A31 MIPI-DSI Controller Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  "#address-cells": true
+  "#size-cells": true
+
+  compatible:
+    const: allwinner,sun6i-a31-mipi-dsi
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    items:
+      - description: Bus Clock
+      - description: Module Clock
+
+  clock-names:
+    items:
+      - const: bus
+      - const: mod
+
+  resets:
+    maxItems: 1
+
+  phys:
+    maxItems: 1
+
+  phy-names:
+    const: dphy
+
+  port:
+    type: object
+    description:
+      A port node with endpoint definitions as defined in
+      Documentation/devicetree/bindings/media/video-interfaces.txt. That
+      port should be the input endpoint, usually coming from the
+      associated TCON.
+
+patternProperties:
+  "^panel@[0-9]+$": true
+
+required:
+  - "#address-cells"
+  - "#size-cells"
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - clock-names
+  - phys
+  - phy-names
+  - resets
+  - port
+
+additionalProperties: false
+
+examples:
+  - |
+    dsi0: dsi@1ca0000 {
+        compatible = "allwinner,sun6i-a31-mipi-dsi";
+        reg = <0x01ca0000 0x1000>;
+        interrupts = <0 89 4>;
+        clocks = <&ccu 23>, <&ccu 96>;
+        clock-names = "bus", "mod";
+        resets = <&ccu 4>;
+        phys = <&dphy0>;
+        phy-names = "dphy";
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        panel@0 {
+                compatible = "bananapi,lhr050h41", "ilitek,ili9881c";
+                reg = <0>;
+                power-gpios = <&pio 1 7 0>; /* PB07 */
+                reset-gpios = <&r_pio 0 5 1>; /* PL05 */
+                backlight = <&pwm_bl>;
+        };
+
+        port {
+            dsi0_in_tcon0: endpoint {
+                remote-endpoint = <&tcon0_out_dsi0>;
+            };
+        };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/display/arm,komeda.txt b/Documentation/devicetree/bindings/display/arm,komeda.txt
index 02b226532ebd..8513695ee47f 100644
--- a/Documentation/devicetree/bindings/display/arm,komeda.txt
+++ b/Documentation/devicetree/bindings/display/arm,komeda.txt
@@ -7,10 +7,13 @@ Required properties:
 - clocks: A list of phandle + clock-specifier pairs, one for each entry
     in 'clock-names'
 - clock-names: A list of clock names. It should contain:
-      - "mclk": for the main processor clock
-      - "pclk": for the APB interface clock
+      - "aclk": for the main processor clock
 - #address-cells: Must be 1
 - #size-cells: Must be 0
+- iommus: configure the stream id to IOMMU, Must be configured if want to
+    enable iommu in display. for how to configure this node please reference
+        devicetree/bindings/iommu/arm,smmu-v3.txt,
+        devicetree/bindings/iommu/iommu.txt
 
 Required properties for sub-node: pipeline@nq
 Each device contains one or two pipeline sub-nodes (at least one), each
@@ -20,7 +23,6 @@ pipeline node should provide properties:
     in 'clock-names'
 - clock-names: should contain:
       - "pxclk": pixel clock
-      - "aclk": AXI interface clock
 
 - port: each pipeline connect to an encoder input port. The connection is
     modeled using the OF graph bindings specified in
@@ -42,12 +44,15 @@ Example:
 		compatible = "arm,mali-d71";
 		reg = <0xc00000 0x20000>;
 		interrupts = <0 168 4>;
-		clocks = <&dpu_mclk>, <&dpu_aclk>;
-		clock-names = "mclk", "pclk";
+		clocks = <&dpu_aclk>;
+		clock-names = "aclk";
+		iommus = <&smmu 0>, <&smmu 1>, <&smmu 2>, <&smmu 3>,
+			<&smmu 4>, <&smmu 5>, <&smmu 6>, <&smmu 7>,
+			<&smmu 8>, <&smmu 9>;
 
 		dp0_pipe0: pipeline@0 {
-			clocks = <&fpgaosc2>, <&dpu_aclk>;
-			clock-names = "pxclk", "aclk";
+			clocks = <&fpgaosc2>;
+			clock-names = "pxclk";
 			reg = <0>;
 
 			port {
@@ -58,8 +63,8 @@ Example:
 		};
 
 		dp0_pipe1: pipeline@1 {
-			clocks = <&fpgaosc2>, <&dpu_aclk>;
-			clock-names = "pxclk", "aclk";
+			clocks = <&fpgaosc2>;
+			clock-names = "pxclk";
 			reg = <1>;
 
 			port {
diff --git a/Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.txt b/Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.txt
index a41d280c3f9f..db680413e89c 100644
--- a/Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.txt
+++ b/Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.txt
@@ -12,10 +12,12 @@ following device-specific properties.
 Required properties:
 
 - compatible : Shall contain one or more of
+  - "renesas,r8a774a1-hdmi" for R8A774A1 (RZ/G2M) compatible HDMI TX
   - "renesas,r8a7795-hdmi" for R8A7795 (R-Car H3) compatible HDMI TX
   - "renesas,r8a7796-hdmi" for R8A7796 (R-Car M3-W) compatible HDMI TX
   - "renesas,r8a77965-hdmi" for R8A77965 (R-Car M3-N) compatible HDMI TX
-  - "renesas,rcar-gen3-hdmi" for the generic R-Car Gen3 compatible HDMI TX
+  - "renesas,rcar-gen3-hdmi" for the generic R-Car Gen3 and RZ/G2 compatible
+			     HDMI TX
 
     When compatible with generic versions, nodes must list the SoC-specific
     version corresponding to the platform first, followed by the
diff --git a/Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt b/Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt
index 900a884ad9f5..c6a196d0b075 100644
--- a/Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt
+++ b/Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt
@@ -9,6 +9,7 @@ Required properties:
 - compatible : Shall contain one of
   - "renesas,r8a7743-lvds" for R8A7743 (RZ/G1M) compatible LVDS encoders
   - "renesas,r8a7744-lvds" for R8A7744 (RZ/G1N) compatible LVDS encoders
+  - "renesas,r8a774a1-lvds" for R8A774A1 (RZ/G2M) compatible LVDS encoders
   - "renesas,r8a774c0-lvds" for R8A774C0 (RZ/G2E) compatible LVDS encoders
   - "renesas,r8a7790-lvds" for R8A7790 (R-Car H2) compatible LVDS encoders
   - "renesas,r8a7791-lvds" for R8A7791 (R-Car M2-W) compatible LVDS encoders
@@ -45,14 +46,24 @@ OF graph bindings specified in Documentation/devicetree/bindings/graph.txt.
 
 Each port shall have a single endpoint.
 
+Optional properties:
+
+- renesas,companion : phandle to the companion LVDS encoder. This property is
+  mandatory for the first LVDS encoder on D3 and E3 SoCs, and shall point to
+  the second encoder to be used as a companion in dual-link mode. It shall not
+  be set for any other LVDS encoder.
+
 
 Example:
 
 	lvds0: lvds@feb90000 {
-		compatible = "renesas,r8a7790-lvds";
-		reg = <0 0xfeb90000 0 0x1c>;
-		clocks = <&cpg CPG_MOD 726>;
-		resets = <&cpg 726>;
+		compatible = "renesas,r8a77990-lvds";
+		reg = <0 0xfeb90000 0 0x20>;
+		clocks = <&cpg CPG_MOD 727>;
+		power-domains = <&sysc R8A77990_PD_ALWAYS_ON>;
+		resets = <&cpg 727>;
+
+		renesas,companion = <&lvds1>;
 
 		ports {
 			#address-cells = <1>;
diff --git a/Documentation/devicetree/bindings/display/bridge/sii902x.txt b/Documentation/devicetree/bindings/display/bridge/sii902x.txt
index 72d2dc6c3e6b..2df44b7d3821 100644
--- a/Documentation/devicetree/bindings/display/bridge/sii902x.txt
+++ b/Documentation/devicetree/bindings/display/bridge/sii902x.txt
@@ -5,10 +5,44 @@ Required properties:
 	- reg: i2c address of the bridge
 
 Optional properties:
-	- interrupts: describe the interrupt line used to inform the host 
+	- interrupts: describe the interrupt line used to inform the host
 	  about hotplug events.
 	- reset-gpios: OF device-tree gpio specification for RST_N pin.
 
+	HDMI audio properties:
+	- #sound-dai-cells: <0> or <1>. <0> if only i2s or spdif pin
+	   is wired, <1> if the both are wired. HDMI audio is
+	   configured only if this property is found.
+	- sil,i2s-data-lanes: Array of up to 4 integers with values of 0-3
+	   Each integer indicates which i2s pin is connected to which
+	   audio fifo. The first integer selects i2s audio pin for the
+	   first audio fifo#0 (HDMI channels 1&2), second for fifo#1
+	   (HDMI channels 3&4), and so on. There is 4 fifos and 4 i2s
+	   pins (SD0 - SD3). Any i2s pin can be connected to any fifo,
+	   but there can be no gaps. E.g. an i2s pin must be mapped to
+	   fifo#0 and fifo#1 before mapping a channel to fifo#2. Default
+	   value is <0>, describing SD0 pin beiging routed to hdmi audio
+	   fifo #0.
+	- clocks: phandle and clock specifier for each clock listed in
+           the clock-names property
+	- clock-names: "mclk"
+	   Describes SII902x MCLK input. MCLK is used to produce
+	   HDMI audio CTS values. This property is required if
+	   "#sound-dai-cells"-property is present. This property follows
+	   Documentation/devicetree/bindings/clock/clock-bindings.txt
+	   consumer binding.
+
+	If HDMI audio is configured the sii902x device becomes an I2S
+	and/or spdif audio codec component (e.g a digital audio sink),
+	that can be used in configuring a full audio devices with
+	simple-card or audio-graph-card binding. See their binding
+	documents on how to describe the way the sii902x device is
+	connected to the rest of the audio system:
+	Documentation/devicetree/bindings/sound/simple-card.txt
+	Documentation/devicetree/bindings/sound/audio-graph-card.txt
+	Note: In case of the audio-graph-card binding the used port
+	index should be 3.
+
 Optional subnodes:
 	- video input: this subnode can contain a video input port node
 	  to connect the bridge to a display controller output (See this
@@ -21,6 +55,12 @@ Example:
 		compatible = "sil,sii9022";
 		reg = <0x39>;
 		reset-gpios = <&pioA 1 0>;
+
+		#sound-dai-cells = <0>;
+		sil,i2s-data-lanes = < 0 1 2 >;
+		clocks = <&mclk>;
+		clock-names = "mclk";
+
 		ports {
 			#address-cells = <1>;
 			#size-cells = <0>;
diff --git a/Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.txt b/Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.txt
index 37f0c04d5a28..d17d1e5820d7 100644
--- a/Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.txt
+++ b/Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.txt
@@ -28,6 +28,12 @@ Optional video port nodes:
 - port@1: Second LVDS input port
 - port@3: Second digital CMOS/TTL parallel output
 
+The device can operate in single-link mode or dual-link mode. In single-link
+mode, all pixels are received on port@0, and port@1 shall not contain any
+endpoint. In dual-link mode, even-numbered pixels are received on port@0 and
+odd-numbered pixels on port@1, and both port@0 and port@1 shall contain
+endpoints.
+
 Example:
 --------
 
diff --git a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt
index e3f6aa6a214d..583c5e9dbe6b 100644
--- a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt
+++ b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt
@@ -12,6 +12,7 @@ Optional properties:
                    (active high shutdown input)
  - reset-gpios: OF device-tree gpio specification for RSTX pin
                 (active low system reset)
+ - toshiba,hpd-pin: TC358767 GPIO pin number to which HPD is connected to (0 or 1)
  - ports: the ports node can contain video interface port nodes to connect
    to a DPI/DSI source and to an eDP/DP sink according to [1][2]:
     - port@0: DSI input port
diff --git a/Documentation/devicetree/bindings/display/ingenic,lcd.txt b/Documentation/devicetree/bindings/display/ingenic,lcd.txt
new file mode 100644
index 000000000000..7b536c8c6dde
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/ingenic,lcd.txt
@@ -0,0 +1,44 @@
+Ingenic JZ47xx LCD driver
+
+Required properties:
+- compatible: one of:
+  * ingenic,jz4740-lcd
+  * ingenic,jz4725b-lcd
+- reg: LCD registers location and length
+- clocks: LCD pixclock and device clock specifiers.
+	   The device clock is only required on the JZ4740.
+- clock-names: "lcd_pclk" and "lcd"
+- interrupts: Specifies the interrupt line the LCD controller is connected to.
+
+Example:
+
+panel {
+	compatible = "sharp,ls020b1dd01d";
+
+	backlight = <&backlight>;
+	power-supply = <&vcc>;
+
+	port {
+		panel_input: endpoint {
+			remote-endpoint = <&panel_output>;
+		};
+	};
+};
+
+
+lcd: lcd-controller@13050000 {
+	compatible = "ingenic,jz4725b-lcd";
+	reg = <0x13050000 0x1000>;
+
+	interrupt-parent = <&intc>;
+	interrupts = <31>;
+
+	clocks = <&cgu JZ4725B_CLK_LCD>;
+	clock-names = "lcd";
+
+	port {
+		panel_output: endpoint {
+			remote-endpoint = <&panel_input>;
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/display/msm/dpu.txt b/Documentation/devicetree/bindings/display/msm/dpu.txt
index ad2e8830324e..a61dd40f3792 100644
--- a/Documentation/devicetree/bindings/display/msm/dpu.txt
+++ b/Documentation/devicetree/bindings/display/msm/dpu.txt
@@ -28,6 +28,11 @@ Required properties:
 - #address-cells: number of address cells for the MDSS children. Should be 1.
 - #size-cells: Should be 1.
 - ranges: parent bus address space is the same as the child bus address space.
+- interconnects : interconnect path specifier for MDSS according to
+  Documentation/devicetree/bindings/interconnect/interconnect.txt. Should be
+  2 paths corresponding to 2 AXI ports.
+- interconnect-names : MDSS will have 2 port names to differentiate between the
+  2 interconnect paths defined with interconnect specifier.
 
 Optional properties:
 - assigned-clocks: list of clock specifiers for clocks needing rate assignment
@@ -86,6 +91,11 @@ Example:
 		interrupt-controller;
 		#interrupt-cells = <1>;
 
+		interconnects = <&rsc_hlos MASTER_MDP0 &rsc_hlos SLAVE_EBI1>,
+				<&rsc_hlos MASTER_MDP1 &rsc_hlos SLAVE_EBI1>;
+
+		interconnect-names = "mdp0-mem", "mdp1-mem";
+
 		iommus = <&apps_iommu 0>;
 
 		#address-cells = <2>;
diff --git a/Documentation/devicetree/bindings/display/msm/dsi.txt b/Documentation/devicetree/bindings/display/msm/dsi.txt
index 9ae946942720..af95586c898f 100644
--- a/Documentation/devicetree/bindings/display/msm/dsi.txt
+++ b/Documentation/devicetree/bindings/display/msm/dsi.txt
@@ -88,6 +88,7 @@ Required properties:
   * "qcom,dsi-phy-28nm-8960"
   * "qcom,dsi-phy-14nm"
   * "qcom,dsi-phy-10nm"
+  * "qcom,dsi-phy-10nm-8998"
 - reg: Physical base address and length of the registers of PLL, PHY. Some
   revisions require the PHY regulator base address, whereas others require the
   PHY lane base address. See below for each PHY revision.
diff --git a/Documentation/devicetree/bindings/display/panel/armadeus,st0700-adapt.txt b/Documentation/devicetree/bindings/display/panel/armadeus,st0700-adapt.txt
new file mode 100644
index 000000000000..a30d63db3c8f
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/armadeus,st0700-adapt.txt
@@ -0,0 +1,9 @@
+Armadeus ST0700 Adapt. A Santek ST0700I5Y-RBSLW 7.0" WVGA (800x480) TFT with
+an adapter board.
+
+Required properties:
+- compatible: "armadeus,st0700-adapt"
+- power-supply: see panel-common.txt
+
+Optional properties:
+- backlight: see panel-common.txt
diff --git a/Documentation/devicetree/bindings/display/panel/edt,et-series.txt b/Documentation/devicetree/bindings/display/panel/edt,et-series.txt
index f56b99ebd9be..be8684327ee4 100644
--- a/Documentation/devicetree/bindings/display/panel/edt,et-series.txt
+++ b/Documentation/devicetree/bindings/display/panel/edt,et-series.txt
@@ -6,6 +6,22 @@ Display bindings for EDT Display Technology Corp. Displays which are
 compatible with the simple-panel binding, which is specified in
 simple-panel.txt
 
+3,5" QVGA TFT Panels
+--------------------
++-----------------+---------------------+-------------------------------------+
+| Identifier      | compatbile          | description                         |
++=================+=====================+=====================================+
+| ET035012DM6     | edt,et035012dm6     | 3.5" QVGA TFT LCD panel             |
++-----------------+---------------------+-------------------------------------+
+
+4,3" WVGA TFT Panels
+--------------------
+
++-----------------+---------------------+-------------------------------------+
+| Identifier      | compatbile          | description                         |
++=================+=====================+=====================================+
+| ETM0430G0DH6    | edt,etm0430g0dh6    | 480x272 TFT Display                 |
++-----------------+---------------------+-------------------------------------+
 
 5,7" WVGA TFT Panels
 --------------------
diff --git a/Documentation/devicetree/bindings/display/panel/evervision,vgg804821.txt b/Documentation/devicetree/bindings/display/panel/evervision,vgg804821.txt
new file mode 100644
index 000000000000..82d22e191ac3
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/evervision,vgg804821.txt
@@ -0,0 +1,12 @@
+Evervision Electronics Co. Ltd. VGG804821 5.0" WVGA TFT LCD Panel
+
+Required properties:
+- compatible: should be "evervision,vgg804821"
+- power-supply: See simple-panel.txt
+
+Optional properties:
+- backlight: See simple-panel.txt
+- enable-gpios: See simple-panel.txt
+
+This binding is compatible with the simple-panel binding, which is specified
+in simple-panel.txt in this directory.
diff --git a/Documentation/devicetree/bindings/display/panel/friendlyarm,hd702e.txt b/Documentation/devicetree/bindings/display/panel/friendlyarm,hd702e.txt
new file mode 100644
index 000000000000..6c9156fc3478
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/friendlyarm,hd702e.txt
@@ -0,0 +1,32 @@
+FriendlyELEC HD702E 800x1280 LCD panel
+
+HD702E lcd is FriendlyELEC developed eDP LCD panel with 800x1280
+resolution. It has built in Goodix, GT9271 captive touchscreen
+with backlight adjustable via PWM.
+
+Required properties:
+- compatible: should be "friendlyarm,hd702e"
+- power-supply: regulator to provide the supply voltage
+
+Optional properties:
+- backlight: phandle of the backlight device attached to the panel
+
+Optional nodes:
+- Video port for LCD panel input.
+
+This binding is compatible with the simple-panel binding, which is specified
+in simple-panel.txt in this directory.
+
+Example:
+
+	panel {
+		compatible ="friendlyarm,hd702e", "simple-panel";
+		backlight = <&backlight>;
+		power-supply = <&vcc3v3_sys>;
+
+		port {
+			panel_in_edp: endpoint {
+				remote-endpoint = <&edp_out_panel>;
+			};
+		};
+	};
diff --git a/Documentation/devicetree/bindings/display/panel/koe,tx14d24vm1bpa.txt b/Documentation/devicetree/bindings/display/panel/koe,tx14d24vm1bpa.txt
new file mode 100644
index 000000000000..be7ac666807b
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/koe,tx14d24vm1bpa.txt
@@ -0,0 +1,42 @@
+Kaohsiung Opto-Electronics Inc. 5.7" QVGA (320 x 240) TFT LCD panel
+
+Required properties:
+- compatible: should be "koe,tx14d24vm1bpa"
+- backlight: phandle of the backlight device attached to the panel
+- power-supply: single regulator to provide the supply voltage
+
+Required nodes:
+- port: Parallel port mapping to connect this display
+
+This panel needs single power supply voltage. Its backlight is conntrolled
+via PWM signal.
+
+Example:
+--------
+
+Example device-tree definition when connected to iMX53 based board
+
+	lcd_panel: lcd-panel {
+		compatible = "koe,tx14d24vm1bpa";
+		backlight = <&backlight_lcd>;
+		power-supply = <&reg_3v3>;
+
+		port {
+			lcd_panel_in: endpoint {
+				remote-endpoint = <&lcd_display_out>;
+			};
+		};
+	};
+
+Then one needs to extend the dispX node:
+
+	lcd_display: disp1 {
+
+		port@1 {
+			reg = <1>;
+
+			lcd_display_out: endpoint {
+				remote-endpoint = <&lcd_panel_in>;
+			};
+		};
+	};
diff --git a/Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2045-53ts.txt b/Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2045-53ts.txt
new file mode 100644
index 000000000000..85c0b2cacfda
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2045-53ts.txt
@@ -0,0 +1,11 @@
+One Stop Displays OSD101T2045-53TS 10.1" 1920x1200 panel
+
+Required properties:
+- compatible: should be "osddisplays,osd101t2045-53ts"
+- power-supply: as specified in the base binding
+
+Optional properties:
+- backlight: as specified in the base binding
+
+This binding is compatible with the simple-panel binding, which is specified
+in simple-panel.txt in this directory.
diff --git a/Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2587-53ts.txt b/Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2587-53ts.txt
new file mode 100644
index 000000000000..9d88e96003fc
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2587-53ts.txt
@@ -0,0 +1,14 @@
+One Stop Displays OSD101T2587-53TS 10.1" 1920x1200 panel
+
+The panel is similar to OSD101T2045-53TS, but it needs additional
+MIPI_DSI_TURN_ON_PERIPHERAL message from the host.
+
+Required properties:
+- compatible: should be "osddisplays,osd101t2587-53ts"
+- power-supply: as specified in the base binding
+
+Optional properties:
+- backlight: as specified in the base binding
+
+This binding is compatible with the simple-panel binding, which is specified
+in simple-panel.txt in this directory.
diff --git a/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.txt b/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.txt
new file mode 100644
index 000000000000..9fb9ebeef8e4
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.txt
@@ -0,0 +1,33 @@
+Samsung s6e63m0 AMOLED LCD panel
+
+Required properties:
+  - compatible: "samsung,s6e63m0"
+  - reset-gpios: GPIO spec for reset pin
+  - vdd3-supply: VDD regulator
+  - vci-supply: VCI regulator
+
+The panel must obey rules for SPI slave device specified in document [1].
+
+The device node can contain one 'port' child node with one child
+'endpoint' node, according to the bindings defined in [2]. This
+node should describe panel's video bus.
+
+[1]: Documentation/devicetree/bindings/spi/spi-bus.txt
+[2]: Documentation/devicetree/bindings/media/video-interfaces.txt
+
+Example:
+
+		s6e63m0: display@0 {
+			compatible = "samsung,s6e63m0";
+			reg = <0>;
+			reset-gpio = <&mp05 5 1>;
+			vdd3-supply = <&ldo12_reg>;
+			vci-supply = <&ldo11_reg>;
+			spi-max-frequency = <1200000>;
+
+			port {
+				lcd_ep: endpoint {
+					remote-endpoint = <&fimd_ep>;
+				};
+			};
+		};
diff --git a/Documentation/devicetree/bindings/display/panel/tfc,s9700rtwv43tr-01b.txt b/Documentation/devicetree/bindings/display/panel/tfc,s9700rtwv43tr-01b.txt
new file mode 100644
index 000000000000..dfb572f085eb
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/tfc,s9700rtwv43tr-01b.txt
@@ -0,0 +1,15 @@
+TFC S9700RTWV43TR-01B 7" Three Five Corp 800x480 LCD panel with
+resistive touch
+
+The panel is found on TI AM335x-evm.
+
+Required properties:
+- compatible: should be "tfc,s9700rtwv43tr-01b"
+- power-supply: See panel-common.txt
+
+Optional properties:
+- enable-gpios: GPIO pin to enable or disable the panel, if there is one
+- backlight: phandle of the backlight device attached to the panel
+
+This binding is compatible with the simple-panel binding, which is specified
+in simple-panel.txt in this directory.
diff --git a/Documentation/devicetree/bindings/display/panel/vl050_8048nt_c01.txt b/Documentation/devicetree/bindings/display/panel/vl050_8048nt_c01.txt
new file mode 100644
index 000000000000..b42bf06bbd99
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/vl050_8048nt_c01.txt
@@ -0,0 +1,12 @@
+VXT 800x480 color TFT LCD panel
+
+Required properties:
+- compatible: should be "vxt,vl050-8048nt-c01"
+- power-supply: as specified in the base binding
+
+Optional properties:
+- backlight: as specified in the base binding
+- enable-gpios: as specified in the base binding
+
+This binding is compatible with the simple-panel binding, which is specified
+in simple-panel.txt in this directory.
diff --git a/Documentation/devicetree/bindings/display/renesas,du.txt b/Documentation/devicetree/bindings/display/renesas,du.txt
index aedb22b4d161..c97dfacad281 100644
--- a/Documentation/devicetree/bindings/display/renesas,du.txt
+++ b/Documentation/devicetree/bindings/display/renesas,du.txt
@@ -7,6 +7,7 @@ Required Properties:
     - "renesas,du-r8a7744" for R8A7744 (RZ/G1N) compatible DU
     - "renesas,du-r8a7745" for R8A7745 (RZ/G1E) compatible DU
     - "renesas,du-r8a77470" for R8A77470 (RZ/G1C) compatible DU
+    - "renesas,du-r8a774a1" for R8A774A1 (RZ/G2M) compatible DU
     - "renesas,du-r8a774c0" for R8A774C0 (RZ/G2E) compatible DU
     - "renesas,du-r8a7779" for R8A7779 (R-Car H1) compatible DU
     - "renesas,du-r8a7790" for R8A7790 (R-Car H2) compatible DU
@@ -58,6 +59,7 @@ corresponding to each DU output.
  R8A7744 (RZ/G1N)       DPAD 0         LVDS 0         -              -
  R8A7745 (RZ/G1E)       DPAD 0         DPAD 1         -              -
  R8A77470 (RZ/G1C)      DPAD 0         DPAD 1         LVDS 0         -
+ R8A774A1 (RZ/G2M)      DPAD 0         HDMI 0         LVDS 0         -
  R8A774C0 (RZ/G2E)      DPAD 0         LVDS 0         LVDS 1         -
  R8A7779 (R-Car H1)     DPAD 0         DPAD 1         -              -
  R8A7790 (R-Car H2)     DPAD 0         LVDS 0         LVDS 1         -
diff --git a/Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt b/Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt
index 39143424a474..3d32ce137e7f 100644
--- a/Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt
+++ b/Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt
@@ -12,6 +12,7 @@ following device-specific properties.
 Required properties:
 
 - compatible: should be one of the following:
+		"rockchip,rk3228-dw-hdmi"
 		"rockchip,rk3288-dw-hdmi"
 		"rockchip,rk3328-dw-hdmi"
 		"rockchip,rk3399-dw-hdmi"
@@ -38,6 +39,13 @@ Optional properties
 - phys: from general PHY binding: the phandle for the PHY device.
 - phy-names: Should be "hdmi" if phys references an external phy.
 
+Optional pinctrl entry:
+- If you have both a "unwedge" and "default" pinctrl entry, dw_hdmi
+  will switch to the unwedge pinctrl state for 10ms if it ever gets an
+  i2c timeout.  It's intended that this unwedge pinctrl entry will
+  cause the SDA line to be driven low to work around a hardware
+  errata.
+
 Example:
 
 hdmi: hdmi@ff980000 {
diff --git a/Documentation/devicetree/bindings/display/simple-framebuffer.yaml b/Documentation/devicetree/bindings/display/simple-framebuffer.yaml
index b052d76cf8b6..678776b6012a 100644
--- a/Documentation/devicetree/bindings/display/simple-framebuffer.yaml
+++ b/Documentation/devicetree/bindings/display/simple-framebuffer.yaml
@@ -126,6 +126,28 @@ required:
   # but usually they will be filled by the bootloader.
   - compatible
 
+allOf:
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: allwinner,simple-framebuffer
+
+    then:
+      required:
+        - allwinner,pipeline
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: amlogic,simple-framebuffer
+
+    then:
+      required:
+        - amlogic,pipeline
+
+
 additionalProperties: false
 
 examples:
@@ -139,7 +161,8 @@ examples:
       #size-cells = <1>;
       stdout-path = "display0";
       framebuffer0: framebuffer@1d385000 {
-        compatible = "simple-framebuffer";
+        compatible = "allwinner,simple-framebuffer", "simple-framebuffer";
+        allwinner,pipeline = "de_be0-lcd0";
         reg = <0x1d385000 3840000>;
         width = <1600>;
         height = <1200>;
diff --git a/Documentation/devicetree/bindings/display/st,stm32-ltdc.txt b/Documentation/devicetree/bindings/display/st,stm32-ltdc.txt
index 3eb1b48b47dd..60c54da4e526 100644
--- a/Documentation/devicetree/bindings/display/st,stm32-ltdc.txt
+++ b/Documentation/devicetree/bindings/display/st,stm32-ltdc.txt
@@ -40,6 +40,8 @@ Mandatory nodes specific to STM32 DSI:
 - panel or bridge node: A node containing the panel or bridge description as
   documented in [6].
   - port: panel or bridge port node, connected to the DSI output port (port@1).
+Optional properties:
+- phy-dsi-supply: phandle of the regulator that provides the supply voltage.
 
 Note: You can find more documentation in the following references
 [1] Documentation/devicetree/bindings/clock/clock-bindings.txt
@@ -101,6 +103,7 @@ Example 2: DSI panel
 			clock-names = "pclk", "ref";
 			resets = <&rcc STM32F4_APB2_RESET(DSI)>;
 			reset-names = "apb";
+			phy-dsi-supply = <&reg18>;
 
 			ports {
 				#address-cells = <1>;
diff --git a/Documentation/devicetree/bindings/display/sunxi/sun6i-dsi.txt b/Documentation/devicetree/bindings/display/sunxi/sun6i-dsi.txt
deleted file mode 100644
index 6a6cf5de08b0..000000000000
--- a/Documentation/devicetree/bindings/display/sunxi/sun6i-dsi.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-Allwinner A31 DSI Encoder
-=========================
-
-The DSI pipeline consists of two separate blocks: the DSI controller
-itself, and its associated D-PHY.
-
-DSI Encoder
------------
-
-The DSI Encoder generates the DSI signal from the TCON's.
-
-Required properties:
-  - compatible: value must be one of:
-    * allwinner,sun6i-a31-mipi-dsi
-  - reg: base address and size of memory-mapped region
-  - interrupts: interrupt associated to this IP
-  - clocks: phandles to the clocks feeding the DSI encoder
-    * bus: the DSI interface clock
-    * mod: the DSI module clock
-  - clock-names: the clock names mentioned above
-  - phys: phandle to the D-PHY
-  - phy-names: must be "dphy"
-  - resets: phandle to the reset controller driving the encoder
-
-  - ports: A ports node with endpoint definitions as defined in
-    Documentation/devicetree/bindings/media/video-interfaces.txt. The
-    first port should be the input endpoint, usually coming from the
-    associated TCON.
-
-Any MIPI-DSI device attached to this should be described according to
-the bindings defined in ../mipi-dsi-bus.txt
-
-D-PHY
------
-
-Required properties:
-  - compatible: value must be one of:
-    * allwinner,sun6i-a31-mipi-dphy
-  - reg: base address and size of memory-mapped region
-  - clocks: phandles to the clocks feeding the DSI encoder
-    * bus: the DSI interface clock
-    * mod: the DSI module clock
-  - clock-names: the clock names mentioned above
-  - resets: phandle to the reset controller driving the encoder
-
-Example:
-
-dsi0: dsi@1ca0000 {
-	compatible = "allwinner,sun6i-a31-mipi-dsi";
-	reg = <0x01ca0000 0x1000>;
-	interrupts = <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>;
-	clocks = <&ccu CLK_BUS_MIPI_DSI>,
-		 <&ccu CLK_DSI_SCLK>;
-	clock-names = "bus", "mod";
-	resets = <&ccu RST_BUS_MIPI_DSI>;
-	phys = <&dphy0>;
-	phy-names = "dphy";
-	#address-cells = <1>;
-	#size-cells = <0>;
-
-	panel@0 {
-		compatible = "bananapi,lhr050h41", "ilitek,ili9881c";
-		reg = <0>;
-		power-gpios = <&pio 1 7 GPIO_ACTIVE_HIGH>; /* PB07 */
-		reset-gpios = <&r_pio 0 5 GPIO_ACTIVE_LOW>; /* PL05 */
-		backlight = <&pwm_bl>;
-	};
-
-	ports {
-		#address-cells = <1>;
-		#size-cells = <0>;
-
-		port@0 {
-			#address-cells = <1>;
-			#size-cells = <0>;
-			reg = <0>;
-
-			dsi0_in_tcon0: endpoint {
-				remote-endpoint = <&tcon0_out_dsi0>;
-			};
-		};
-	};
-};
-
-dphy0: d-phy@1ca1000 {
-	compatible = "allwinner,sun6i-a31-mipi-dphy";
-	reg = <0x01ca1000 0x1000>;
-	clocks = <&ccu CLK_BUS_MIPI_DSI>,
-		 <&ccu CLK_DSI_DPHY>;
-	clock-names = "bus", "mod";
-	resets = <&ccu RST_BUS_MIPI_DSI>;
-	#phy-cells = <0>;
-};
diff --git a/Documentation/devicetree/bindings/dma/8250_mtk_dma.txt b/Documentation/devicetree/bindings/dma/8250_mtk_dma.txt
deleted file mode 100644
index 3fe0961bcf64..000000000000
--- a/Documentation/devicetree/bindings/dma/8250_mtk_dma.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-* Mediatek UART APDMA Controller
-
-Required properties:
-- compatible should contain:
-  * "mediatek,mt2712-uart-dma" for MT2712 compatible APDMA
-  * "mediatek,mt6577-uart-dma" for MT6577 and all of the above
-
-- reg: The base address of the APDMA register bank.
-
-- interrupts: A single interrupt specifier.
-
-- clocks : Must contain an entry for each entry in clock-names.
-  See ../clocks/clock-bindings.txt for details.
-- clock-names: The APDMA clock for register accesses
-
-Examples:
-
-	apdma: dma-controller@11000380 {
-		compatible = "mediatek,mt2712-uart-dma";
-		reg = <0 0x11000380 0 0x400>;
-		interrupts = <GIC_SPI 63 IRQ_TYPE_LEVEL_LOW>,
-			     <GIC_SPI 64 IRQ_TYPE_LEVEL_LOW>,
-			     <GIC_SPI 65 IRQ_TYPE_LEVEL_LOW>,
-			     <GIC_SPI 66 IRQ_TYPE_LEVEL_LOW>,
-			     <GIC_SPI 67 IRQ_TYPE_LEVEL_LOW>,
-			     <GIC_SPI 68 IRQ_TYPE_LEVEL_LOW>,
-			     <GIC_SPI 69 IRQ_TYPE_LEVEL_LOW>,
-			     <GIC_SPI 70 IRQ_TYPE_LEVEL_LOW>;
-		clocks = <&pericfg CLK_PERI_AP_DMA>;
-		clock-names = "apdma";
-		#dma-cells = <1>;
-	};
-
diff --git a/Documentation/devicetree/bindings/dma/arm-pl330.txt b/Documentation/devicetree/bindings/dma/arm-pl330.txt
index db7e2260f9c5..2c7fd1941abb 100644
--- a/Documentation/devicetree/bindings/dma/arm-pl330.txt
+++ b/Documentation/devicetree/bindings/dma/arm-pl330.txt
@@ -16,6 +16,9 @@ Optional properties:
   - dma-channels: contains the total number of DMA channels supported by the DMAC
   - dma-requests: contains the total number of DMA requests supported by the DMAC
   - arm,pl330-broken-no-flushp: quirk for avoiding to execute DMAFLUSHP
+  - resets: contains an entry for each entry in reset-names.
+	    See ../reset/reset.txt for details.
+  - reset-names: must contain at least "dma", and optional is "dma-ocp".
 
 Example:
 
diff --git a/Documentation/devicetree/bindings/dma/fsl-edma.txt b/Documentation/devicetree/bindings/dma/fsl-edma.txt
index 97e213e07660..29dd3ccb1235 100644
--- a/Documentation/devicetree/bindings/dma/fsl-edma.txt
+++ b/Documentation/devicetree/bindings/dma/fsl-edma.txt
@@ -9,15 +9,16 @@ group, DMAMUX0 or DMAMUX1, but not both.
 Required properties:
 - compatible :
 	- "fsl,vf610-edma" for eDMA used similar to that on Vybrid vf610 SoC
+	- "fsl,imx7ulp-edma" for eDMA2 used similar to that on i.mx7ulp
 - reg : Specifies base physical address(s) and size of the eDMA registers.
 	The 1st region is eDMA control register's address and size.
 	The 2nd and the 3rd regions are programmable channel multiplexing
 	control register's address and size.
 - interrupts : A list of interrupt-specifiers, one for each entry in
-	interrupt-names.
-- interrupt-names : Should contain:
-	"edma-tx" - the transmission interrupt
-	"edma-err" - the error interrupt
+	interrupt-names on vf610 similar SoC. But for i.mx7ulp per channel
+	per transmission interrupt, total 16 channel interrupt and 1
+	error interrupt(located in the last), no interrupt-names list on
+	i.mx7ulp for clean on dts.
 - #dma-cells : Must be <2>.
 	The 1st cell specifies the DMAMUX(0 for DMAMUX0 and 1 for DMAMUX1).
 	Specific request source can only be multiplexed by specific channels
@@ -28,6 +29,7 @@ Required properties:
 - clock-names : A list of channel group clock names. Should contain:
 	"dmamux0" - clock name of mux0 group
 	"dmamux1" - clock name of mux1 group
+	Note: No dmamux0 on i.mx7ulp, but another 'dma' clk added on i.mx7ulp.
 - clocks : A list of phandle and clock-specifier pairs, one for each entry in
 	clock-names.
 
@@ -35,6 +37,10 @@ Optional properties:
 - big-endian: If present registers and hardware scatter/gather descriptors
 	of the eDMA are implemented in big endian mode, otherwise in little
 	mode.
+- interrupt-names : Should contain the below on vf610 similar SoC but not used
+	on i.mx7ulp similar SoC:
+	"edma-tx" - the transmission interrupt
+	"edma-err" - the error interrupt
 
 
 Examples:
@@ -52,8 +58,36 @@ edma0: dma-controller@40018000 {
 	clock-names = "dmamux0", "dmamux1";
 	clocks = <&clks VF610_CLK_DMAMUX0>,
 		<&clks VF610_CLK_DMAMUX1>;
-};
+}; /* vf610 */
 
+edma1: dma-controller@40080000 {
+	#dma-cells = <2>;
+	compatible = "fsl,imx7ulp-edma";
+	reg = <0x40080000 0x2000>,
+		<0x40210000 0x1000>;
+	dma-channels = <32>;
+	interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>,
+		     /* last is eDMA2-ERR interrupt */
+		     <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>;
+	clock-names = "dma", "dmamux0";
+	clocks = <&pcc2 IMX7ULP_CLK_DMA1>,
+		 <&pcc2 IMX7ULP_CLK_DMA_MUX1>;
+}; /* i.mx7ulp */
 
 * DMA clients
 DMA client drivers that uses the DMA function must use the format described
diff --git a/Documentation/devicetree/bindings/dma/fsl-qdma.txt b/Documentation/devicetree/bindings/dma/fsl-qdma.txt
index 6a0ff9059e72..da371c4d406c 100644
--- a/Documentation/devicetree/bindings/dma/fsl-qdma.txt
+++ b/Documentation/devicetree/bindings/dma/fsl-qdma.txt
@@ -7,6 +7,7 @@ Required properties:
 
 - compatible:		Must be one of
 			 "fsl,ls1021a-qdma": for LS1021A Board
+			 "fsl,ls1028a-qdma": for LS1028A Board
 			 "fsl,ls1043a-qdma": for ls1043A Board
 			 "fsl,ls1046a-qdma": for ls1046A Board
 - reg:			Should contain the register's base address and length.
diff --git a/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt b/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt
new file mode 100644
index 000000000000..5d6f98c43e3d
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt
@@ -0,0 +1,54 @@
+* Mediatek UART APDMA Controller
+
+Required properties:
+- compatible should contain:
+  * "mediatek,mt2712-uart-dma" for MT2712 compatible APDMA
+  * "mediatek,mt6577-uart-dma" for MT6577 and all of the above
+
+- reg: The base address of the APDMA register bank.
+
+- interrupts: A single interrupt specifier.
+ One interrupt per dma-requests, or 8 if no dma-requests property is present
+
+- dma-requests: The number of DMA channels
+
+- clocks : Must contain an entry for each entry in clock-names.
+  See ../clocks/clock-bindings.txt for details.
+- clock-names: The APDMA clock for register accesses
+
+- mediatek,dma-33bits: Present if the DMA requires support
+
+Examples:
+
+	apdma: dma-controller@11000400 {
+		compatible = "mediatek,mt2712-uart-dma";
+		reg = <0 0x11000400 0 0x80>,
+		      <0 0x11000480 0 0x80>,
+		      <0 0x11000500 0 0x80>,
+		      <0 0x11000580 0 0x80>,
+		      <0 0x11000600 0 0x80>,
+		      <0 0x11000680 0 0x80>,
+		      <0 0x11000700 0 0x80>,
+		      <0 0x11000780 0 0x80>,
+		      <0 0x11000800 0 0x80>,
+		      <0 0x11000880 0 0x80>,
+		      <0 0x11000900 0 0x80>,
+		      <0 0x11000980 0 0x80>;
+		interrupts = <GIC_SPI 103 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 104 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 105 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 106 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 107 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 108 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 109 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 110 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 111 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 112 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 113 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 114 IRQ_TYPE_LEVEL_LOW>;
+		dma-requests = <12>;
+		clocks = <&pericfg CLK_PERI_AP_DMA>;
+		clock-names = "apdma";
+		mediatek,dma-33bits;
+		#dma-cells = <1>;
+	};
diff --git a/Documentation/devicetree/bindings/dma/sun6i-dma.txt b/Documentation/devicetree/bindings/dma/sun6i-dma.txt
index 7fccc20d8331..cae31f4e77ba 100644
--- a/Documentation/devicetree/bindings/dma/sun6i-dma.txt
+++ b/Documentation/devicetree/bindings/dma/sun6i-dma.txt
@@ -28,12 +28,17 @@ Example:
 	};
 
 ------------------------------------------------------------------------------
-For A64 DMA controller:
+For A64 and H6 DMA controller:
 
 Required properties:
-- compatible:	"allwinner,sun50i-a64-dma"
+- compatible:	Must be one of
+		  "allwinner,sun50i-a64-dma"
+		  "allwinner,sun50i-h6-dma"
 - dma-channels: Number of DMA channels supported by the controller.
 		Refer to Documentation/devicetree/bindings/dma/dma.txt
+- clocks:	In addition to parent AHB clock, it should also contain mbus
+		clock (H6 only)
+- clock-names:	Should contain "bus" and "mbus" (H6 only)
 - all properties above, i.e. reg, interrupts, clocks, resets and #dma-cells
 
 Optional properties:
diff --git a/Documentation/devicetree/bindings/extcon/extcon-fsa9480.txt b/Documentation/devicetree/bindings/extcon/extcon-fsa9480.txt
new file mode 100644
index 000000000000..d592c21245f2
--- /dev/null
+++ b/Documentation/devicetree/bindings/extcon/extcon-fsa9480.txt
@@ -0,0 +1,19 @@
+FAIRCHILD SEMICONDUCTOR FSA9480 MICROUSB SWITCH
+
+The FSA9480 is a USB port accessory detector and switch. The FSA9480 is fully
+controlled using I2C and enables USB data, stereo and mono audio, video,
+microphone, and UART data to use a common connector port.
+
+Required properties:
+ - compatible : Must be "fcs,fsa9480"
+ - reg : Specifies i2c slave address. Must be 0x25.
+ - interrupts : Should contain one entry specifying interrupt signal of
+   interrupt parent to which interrupt pin of the chip is connected.
+
+ Example:
+	musb@25 {
+		compatible = "fcs,fsa9480";
+		reg = <0x25>;
+		interrupt-parent = <&gph2>;
+		interrupts = <7 0>;
+	};
diff --git a/Documentation/devicetree/bindings/gpio/gpio-davinci.txt b/Documentation/devicetree/bindings/gpio/gpio-davinci.txt
index 553b92a7e87b..bc6b4b62df83 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-davinci.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio-davinci.txt
@@ -5,6 +5,7 @@ Required Properties:
 			"ti,keystone-gpio": for Keystone 2 66AK2H/K, 66AK2L,
 						66AK2E SoCs
 			"ti,k2g-gpio", "ti,keystone-gpio": for 66AK2G
+			"ti,am654-gpio", "ti,keystone-gpio": for TI K3 AM654
 
 - reg: Physical base address of the controller and the size of memory mapped
        registers.
@@ -145,3 +146,20 @@ gpio0: gpio@260bf00 {
 	ti,ngpio = <32>;
 	ti,davinci-gpio-unbanked = <32>;
 };
+
+Example for K3 AM654:
+
+wkup_gpio0: wkup_gpio0@42110000 {
+	compatible = "ti,am654-gpio", "ti,keystone-gpio";
+	reg = <0x42110000 0x100>;
+	gpio-controller;
+	#gpio-cells = <2>;
+	interrupt-parent = <&intr_wkup_gpio>;
+	interrupts = <59 128>, <59 129>, <59 130>, <59 131>;
+	interrupt-controller;
+	#interrupt-cells = <2>;
+	ti,ngpio = <56>;
+	ti,davinci-gpio-unbanked = <0>;
+	clocks = <&k3_clks 59 0>;
+	clock-names = "gpio";
+};
diff --git a/Documentation/devicetree/bindings/gpio/pl061-gpio.txt b/Documentation/devicetree/bindings/gpio/pl061-gpio.txt
deleted file mode 100644
index 89058d375b7c..000000000000
--- a/Documentation/devicetree/bindings/gpio/pl061-gpio.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-ARM PL061 GPIO controller
-
-Required properties:
-- compatible : "arm,pl061", "arm,primecell"
-- #gpio-cells : Should be two. The first cell is the pin number and the
-  second cell is used to specify optional parameters:
-  - bit 0 specifies polarity (0 for normal, 1 for inverted)
-- gpio-controller : Marks the device node as a GPIO controller.
-- interrupts : Interrupt mapping for GPIO IRQ.
-- gpio-ranges : Interaction with the PINCTRL subsystem.
diff --git a/Documentation/devicetree/bindings/gpio/pl061-gpio.yaml b/Documentation/devicetree/bindings/gpio/pl061-gpio.yaml
new file mode 100644
index 000000000000..313b17229247
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/pl061-gpio.yaml
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/pl061-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM PL061 GPIO controller
+
+maintainers:
+  - Linus Walleij <linus.walleij@linaro.org>
+  - Rob Herring <robh@kernel.org>
+
+# We need a select here so we don't match all nodes with 'arm,primecell'
+select:
+  properties:
+    compatible:
+      contains:
+        const: arm,pl061
+  required:
+    - compatible
+
+properties:
+  $nodename:
+    pattern: "^gpio@[0-9a-f]+$"
+
+  compatible:
+    items:
+      - const: arm,pl061
+      - const: arm,primecell
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    oneOf:
+      - maxItems: 1
+      - maxItems: 8
+
+  interrupt-controller: true
+
+  "#interrupt-cells":
+    const: 2
+
+  clocks:
+    maxItems: 1
+
+  clock-names: true
+
+  "#gpio-cells":
+    const: 2
+
+  gpio-controller: true
+
+  gpio-ranges:
+    maxItems: 8
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - interrupt-controller
+  - "#interrupt-cells"
+  - clocks
+  - "#gpio-cells"
+  - gpio-controller
+
+additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/gpu/arm,mali-midgard.txt b/Documentation/devicetree/bindings/gpu/arm,mali-midgard.txt
index 1b1a74129141..9b298edec5b2 100644
--- a/Documentation/devicetree/bindings/gpu/arm,mali-midgard.txt
+++ b/Documentation/devicetree/bindings/gpu/arm,mali-midgard.txt
@@ -15,7 +15,9 @@ Required properties:
     + "arm,mali-t860"
     + "arm,mali-t880"
   * which must be preceded by one of the following vendor specifics:
+    + "allwinner,sun50i-h6-mali"
     + "amlogic,meson-gxm-mali"
+    + "samsung,exynos5433-mali"
     + "rockchip,rk3288-mali"
     + "rockchip,rk3399-mali"
 
@@ -31,21 +33,36 @@ Optional properties:
 
 - clocks : Phandle to clock for the Mali Midgard device.
 
+- clock-names : Specify the names of the clocks specified in clocks
+  when multiple clocks are present.
+    * core: clock driving the GPU itself (When only one clock is present,
+      assume it's this clock.)
+    * bus: bus clock for the GPU
+
 - mali-supply : Phandle to regulator for the Mali device. Refer to
   Documentation/devicetree/bindings/regulator/regulator.txt for details.
 
 - operating-points-v2 : Refer to Documentation/devicetree/bindings/opp/opp.txt
   for details.
 
+- #cooling-cells: Refer to Documentation/devicetree/bindings/thermal/thermal.txt
+  for details.
+
 - resets : Phandle of the GPU reset line.
 
 Vendor-specific bindings
 ------------------------
 
 The Mali GPU is integrated very differently from one SoC to
-another. In order to accomodate those differences, you have the option
+another. In order to accommodate those differences, you have the option
 to specify one more vendor-specific compatible, among:
 
+- "allwinner,sun50i-h6-mali"
+  Required properties:
+  - clocks : phandles to core and bus clocks
+  - clock-names : must contain "core" and "bus"
+  - resets: phandle to GPU reset line
+
 - "amlogic,meson-gxm-mali"
   Required properties:
   - resets : Should contain phandles of :
@@ -65,6 +82,7 @@ gpu@ffa30000 {
 	mali-supply = <&vdd_gpu>;
 	operating-points-v2 = <&gpu_opp_table>;
 	power-domains = <&power RK3288_PD_GPU>;
+	#cooling-cells = <2>;
 };
 
 gpu_opp_table: opp_table0 {
diff --git a/Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt b/Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt
index ae63f09fda7d..b352a6851a06 100644
--- a/Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt
+++ b/Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt
@@ -17,6 +17,7 @@ Required properties:
       + amlogic,meson8b-mali
       + amlogic,meson-gxbb-mali
       + amlogic,meson-gxl-mali
+      + samsung,exynos4210-mali
       + rockchip,rk3036-mali
       + rockchip,rk3066-mali
       + rockchip,rk3188-mali
diff --git a/Documentation/devicetree/bindings/hwlock/omap-hwspinlock.txt b/Documentation/devicetree/bindings/hwlock/omap-hwspinlock.txt
index 2c9804f4f4ac..8d365f89694c 100644
--- a/Documentation/devicetree/bindings/hwlock/omap-hwspinlock.txt
+++ b/Documentation/devicetree/bindings/hwlock/omap-hwspinlock.txt
@@ -1,12 +1,16 @@
-OMAP4+ HwSpinlock Driver
-========================
+TI HwSpinlock for OMAP and K3 based SoCs
+=========================================
 
 Required properties:
-- compatible:		Should be "ti,omap4-hwspinlock" for
-			    OMAP44xx, OMAP54xx, AM33xx, AM43xx, DRA7xx SoCs
+- compatible:		Should be one of the following,
+			  "ti,omap4-hwspinlock" for
+				OMAP44xx, OMAP54xx, AM33xx, AM43xx, DRA7xx SoCs
+			  "ti,am654-hwspinlock" for
+				K3 AM65x and J721E SoCs
 - reg:			Contains the hwspinlock module register address space
 			(base address and length)
 - ti,hwmods:		Name of the hwmod associated with the hwspinlock device
+			(for OMAP architecture based SoCs only)
 - #hwlock-cells:	Should be 1. The OMAP hwspinlock users will use a
 			0-indexed relative hwlock number as the argument
 			specifier value for requesting a specific hwspinlock
@@ -17,10 +21,21 @@ Please look at the generic hwlock binding for usage information for consumers,
 
 Example:
 
-/* OMAP4 */
+1. OMAP4 SoCs
 hwspinlock: spinlock@4a0f6000 {
 	compatible = "ti,omap4-hwspinlock";
 	reg = <0x4a0f6000 0x1000>;
 	ti,hwmods = "spinlock";
 	#hwlock-cells = <1>;
 };
+
+2. AM65x SoCs and J721E SoCs
+&cbass_main {
+	cbass_main_navss: interconnect0 {
+		hwspinlock: spinlock@30e00000 {
+			compatible = "ti,am654-hwspinlock";
+			reg = <0x00 0x30e00000 0x00 0x1000>;
+			#hwlock-cells = <1>;
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/i2c/allwinner,sun6i-a31-p2wi.yaml b/Documentation/devicetree/bindings/i2c/allwinner,sun6i-a31-p2wi.yaml
new file mode 100644
index 000000000000..f9d526b7da01
--- /dev/null
+++ b/Documentation/devicetree/bindings/i2c/allwinner,sun6i-a31-p2wi.yaml
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/i2c/allwinner,sun6i-a31-p2wi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A31 P2WI (Push/Pull 2 Wires Interface) Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+allOf:
+  - $ref: /schemas/i2c/i2c-controller.yaml#
+
+properties:
+  compatible:
+    const: allwinner,sun6i-a31-p2wi
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+  resets:
+    maxItems: 1
+
+  clock-frequency:
+    minimum: 1
+    maximum: 6000000
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - resets
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+examples:
+  - |
+    i2c@1f03400 {
+        compatible = "allwinner,sun6i-a31-p2wi";
+        reg = <0x01f03400 0x400>;
+        interrupts = <0 39 4>;
+        clocks = <&apb0_gates 3>;
+        clock-frequency = <100000>;
+        resets = <&apb0_rst 3>;
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        axp221: pmic@68 {
+            compatible = "x-powers,axp221";
+            reg = <0x68>;
+        };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/i2c/i2c-mt7621.txt b/Documentation/devicetree/bindings/i2c/i2c-mt7621.txt
new file mode 100644
index 000000000000..bc36f0eb94cd
--- /dev/null
+++ b/Documentation/devicetree/bindings/i2c/i2c-mt7621.txt
@@ -0,0 +1,25 @@
+MediaTek MT7621/MT7628 I2C master controller
+
+Required properties:
+
+- compatible: Should be one of the following:
+  - "mediatek,mt7621-i2c": for MT7621/MT7628/MT7688 platforms
+- #address-cells: should be 1.
+- #size-cells: should be 0.
+- reg: Address and length of the register set for the device
+- resets: phandle to the reset controller asserting this device in
+          reset
+  See ../reset/reset.txt for details.
+
+Optional properties :
+
+Example:
+
+i2c: i2c@900 {
+	compatible = "mediatek,mt7621-i2c";
+	reg = <0x900 0x100>;
+	#address-cells = <1>;
+	#size-cells = <0>;
+	resets = <&rstctrl 16>;
+	reset-names = "i2c";
+};
diff --git a/Documentation/devicetree/bindings/i2c/i2c-mv64xxx.txt b/Documentation/devicetree/bindings/i2c/i2c-mv64xxx.txt
deleted file mode 100644
index 0ffe65a316ae..000000000000
--- a/Documentation/devicetree/bindings/i2c/i2c-mv64xxx.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-
-* Marvell MV64XXX I2C controller
-
-Required properties :
-
- - reg             : Offset and length of the register set for the device
- - compatible      : Should be either:
-                     - "allwinner,sun4i-a10-i2c"
-                     - "allwinner,sun6i-a31-i2c"
-                     - "marvell,mv64xxx-i2c"
-                     - "marvell,mv78230-i2c"
-                     - "marvell,mv78230-a0-i2c"
-                       * Note: Only use "marvell,mv78230-a0-i2c" for a
-                         very rare, initial version of the SoC which
-                         had broken offload support.  Linux
-                         auto-detects this and sets it appropriately.
- - interrupts      : The interrupt number
-
-Optional properties :
-
- - clock-frequency : Desired I2C bus clock frequency in Hz. If not set the
-default frequency is 100kHz
-
- - resets          : phandle to the parent reset controller. Mandatory
-                     whenever you're using the "allwinner,sun6i-a31-i2c"
-                     compatible.
-
- - clocks:	   : pointers to the reference clocks for this device, the
-		     first one is the one used for the clock on the i2c bus,
-		     the second one is the clock used to acces the registers
-		     of the controller
-
- - clock-names	   : names of used clocks, mandatory if the second clock is
-		     used, the name must be "core", and "reg" (the latter is
-		     only for Armada 7K/8K).
-
-Examples:
-
-	i2c@11000 {
-		compatible = "marvell,mv64xxx-i2c";
-		reg = <0x11000 0x20>;
-		interrupts = <29>;
-		clock-frequency = <100000>;
-	};
-
-For the Armada XP:
-
-	i2c@11000 {
-		compatible = "marvell,mv78230-i2c", "marvell,mv64xxx-i2c";
-		reg = <0x11000 0x100>;
-		interrupts = <29>;
-		clock-frequency = <100000>;
-	};
-
-For the Armada 7040:
-
-	i2c@701000 {
-		compatible = "marvell,mv78230-i2c";
-		reg = <0x701000 0x20>;
-		interrupts = <29>;
-		clock-frequency = <100000>;
-		clock-names = "core", "reg";
-		clocks = <&core_clock>, <&reg_clock>;
-	};
diff --git a/Documentation/devicetree/bindings/i2c/i2c-ocores.txt b/Documentation/devicetree/bindings/i2c/i2c-ocores.txt
index 17bef9a34e50..6b25a80ae8d3 100644
--- a/Documentation/devicetree/bindings/i2c/i2c-ocores.txt
+++ b/Documentation/devicetree/bindings/i2c/i2c-ocores.txt
@@ -1,9 +1,13 @@
 Device tree configuration for i2c-ocores
 
 Required properties:
-- compatible      : "opencores,i2c-ocores" or "aeroflexgaisler,i2cmst"
+- compatible      : "opencores,i2c-ocores"
+                    "aeroflexgaisler,i2cmst"
+                    "sifive,fu540-c000-i2c", "sifive,i2c0"
+                    For Opencore based I2C IP block reimplemented in
+                    FU540-C000 SoC. Please refer to sifive-blocks-ip-versioning.txt
+                    for additional details.
 - reg             : bus address start and address range size of device
-- interrupts      : interrupt number
 - clocks          : handle to the controller clock; see the note below.
                     Mutually exclusive with opencores,ip-clock-frequency
 - opencores,ip-clock-frequency: frequency of the controller clock in Hz;
@@ -12,6 +16,7 @@ Required properties:
 - #size-cells     : should be <0>
 
 Optional properties:
+- interrupts      : interrupt number.
 - clock-frequency : frequency of bus clock in Hz; see the note below.
                     Defaults to 100 KHz when the property is not specified
 - reg-shift       : device register offsets are shifted by this value
diff --git a/Documentation/devicetree/bindings/i2c/i2c-omap.txt b/Documentation/devicetree/bindings/i2c/i2c-omap.txt
index 4b90ba9f31b7..a44573d7c118 100644
--- a/Documentation/devicetree/bindings/i2c/i2c-omap.txt
+++ b/Documentation/devicetree/bindings/i2c/i2c-omap.txt
@@ -7,6 +7,7 @@ Required properties :
 	"ti,omap3-i2c" for OMAP3 SoCs
 	"ti,omap4-i2c" for OMAP4+ SoCs
 	"ti,am654-i2c", "ti,omap4-i2c" for AM654 SoCs
+	"ti,j721e-i2c", "ti,omap4-i2c" for J721E SoCs
 - ti,hwmods : Must be "i2c<n>", n being the instance number (1-based)
 - #address-cells = <1>;
 - #size-cells = <0>;
diff --git a/Documentation/devicetree/bindings/i2c/i2c-stm32.txt b/Documentation/devicetree/bindings/i2c/i2c-stm32.txt
index f334738f7a35..ce3df2fff6c8 100644
--- a/Documentation/devicetree/bindings/i2c/i2c-stm32.txt
+++ b/Documentation/devicetree/bindings/i2c/i2c-stm32.txt
@@ -21,6 +21,8 @@ Optional properties:
   100000 and 400000.
   For STM32F7, STM32H7 and STM32MP1 SoCs, Standard-mode, Fast-mode and Fast-mode
   Plus are supported, possible values are 100000, 400000 and 1000000.
+- dmas: List of phandles to rx and tx DMA channels. Refer to stm32-dma.txt.
+- dma-names: List of dma names. Valid names are: "rx" and "tx".
 - i2c-scl-rising-time-ns: I2C SCL Rising time for the board (default: 25)
   For STM32F7, STM32H7 and STM32MP1 only.
 - i2c-scl-falling-time-ns: I2C SCL Falling time for the board (default: 10)
diff --git a/Documentation/devicetree/bindings/i2c/i2c-sun6i-p2wi.txt b/Documentation/devicetree/bindings/i2c/i2c-sun6i-p2wi.txt
deleted file mode 100644
index 49df0053347a..000000000000
--- a/Documentation/devicetree/bindings/i2c/i2c-sun6i-p2wi.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-
-* Allwinner P2WI (Push/Pull 2 Wire Interface) controller
-
-Required properties :
-
- - reg             : Offset and length of the register set for the device.
- - compatible      : Should one of the following:
-                     - "allwinner,sun6i-a31-p2wi"
- - interrupts      : The interrupt line connected to the P2WI peripheral.
- - clocks          : The gate clk connected to the P2WI peripheral.
- - resets          : The reset line connected to the P2WI peripheral.
-
-Optional properties :
-
- - clock-frequency : Desired P2WI bus clock frequency in Hz. If not set the
-default frequency is 100kHz
-
-A P2WI may contain one child node encoding a P2WI slave device.
-
-Slave device properties:
-  Required properties:
-   - reg           : the I2C slave address used during the initialization
-                     process to switch from I2C to P2WI mode
-
-Example:
-
-	p2wi@1f03400 {
-		compatible = "allwinner,sun6i-a31-p2wi";
-		reg = <0x01f03400 0x400>;
-		interrupts = <0 39 4>;
-		clocks = <&apb0_gates 3>;
-		clock-frequency = <6000000>;
-		resets = <&apb0_rst 3>;
-
-		axp221: pmic@68 {
-			compatible = "x-powers,axp221";
-			reg = <0x68>;
-
-			/* ... */
-		};
-	};
diff --git a/Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml b/Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml
new file mode 100644
index 000000000000..001f2b7abad0
--- /dev/null
+++ b/Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/i2c/marvell,mv64xxx-i2c.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell MV64XXX I2C Controller Device Tree Bindings
+
+maintainers:
+  - Gregory CLEMENT <gregory.clement@bootlin.com>
+
+properties:
+  compatible:
+    oneOf:
+      - const: allwinner,sun4i-a10-i2c
+      - items:
+          - const: allwinner,sun7i-a20-i2c
+          - const: allwinner,sun4i-a10-i2c
+      - const: allwinner,sun6i-a31-i2c
+      - items:
+          - const: allwinner,sun8i-a23-i2c
+          - const: allwinner,sun6i-a31-i2c
+      - items:
+          - const: allwinner,sun8i-a83t-i2c
+          - const: allwinner,sun6i-a31-i2c
+      - items:
+          - const: allwinner,sun50i-a64-i2c
+          - const: allwinner,sun6i-a31-i2c
+
+      - const: marvell,mv64xxx-i2c
+      - const: marvell,mv78230-i2c
+      - const: marvell,mv78230-a0-i2c
+
+    description:
+      Only use "marvell,mv78230-a0-i2c" for a very rare, initial
+      version of the SoC which had broken offload support. Linux
+      auto-detects this and sets it appropriately.
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    minItems: 1
+    maxItems: 2
+    items:
+      - description: Reference clock for the I2C bus
+      - description: Bus clock (Only for Armada 7K/8K)
+
+  clock-names:
+    minItems: 1
+    maxItems: 2
+    items:
+      - const: core
+      - const: reg
+    description:
+      Mandatory if two clocks are used (only for Armada 7k and 8k).
+
+  resets:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+allOf:
+  - $ref: /schemas/i2c/i2c-controller.yaml#
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - allwinner,sun4i-a10-i2c
+              - allwinner,sun6i-a31-i2c
+
+    then:
+      required:
+        - clocks
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: allwinner,sun6i-a31-i2c
+
+    then:
+      required:
+        - resets
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+examples:
+  - |
+    i2c@11000 {
+        compatible = "marvell,mv64xxx-i2c";
+        reg = <0x11000 0x20>;
+        interrupts = <29>;
+        clock-frequency = <100000>;
+    };
+
+  - |
+    i2c@11000 {
+        compatible = "marvell,mv78230-i2c";
+        reg = <0x11000 0x100>;
+        interrupts = <29>;
+        clock-frequency = <100000>;
+    };
+
+  - |
+    i2c@701000 {
+        compatible = "marvell,mv78230-i2c";
+        reg = <0x701000 0x20>;
+        interrupts = <29>;
+        clock-frequency = <100000>;
+        clock-names = "core", "reg";
+        clocks = <&core_clock>, <&reg_clock>;
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt b/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt
index 69da2115abdc..1cf6182f888c 100644
--- a/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt
+++ b/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt
@@ -38,6 +38,6 @@ Example:
 
 		nunchuk: nunchuk@52 {
 			compatible = "nintendo,nunchuk";
-			reg = <0x52 0x80000010 0>;
+			reg = <0x52 0x0 0x10>;
 		};
 	};
diff --git a/Documentation/devicetree/bindings/i3c/i3c.txt b/Documentation/devicetree/bindings/i3c/i3c.txt
index ab729a0a86ae..4ffe059f0fec 100644
--- a/Documentation/devicetree/bindings/i3c/i3c.txt
+++ b/Documentation/devicetree/bindings/i3c/i3c.txt
@@ -39,7 +39,9 @@ valid here, but several new properties have been added.
 New constraint on existing properties:
 --------------------------------------
 - reg: contains 3 cells
-  + first cell : still encoding the I2C address
+  + first cell : still encoding the I2C address. 10 bit addressing is not
+    supported. Devices with 10 bit address can't be properly passed through
+    DEFSLVS command.
 
   + second cell: shall be 0
 
diff --git a/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml b/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml
new file mode 100644
index 000000000000..7ba167e2e1ea
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/accelerometers/adi,adxl345.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices ADXL345/ADXL375 3-Axis Digital Accelerometers
+
+maintainers:
+  - Michael Hennerich <michael.hennerich@analog.com>
+
+description: |
+  Analog Devices ADXL345/ADXL375 3-Axis Digital Accelerometers that supports
+  both I2C & SPI interfaces.
+    http://www.analog.com/en/products/mems/accelerometers/adxl345.html
+    http://www.analog.com/en/products/sensors-mems/accelerometers/adxl375.html
+
+properties:
+  compatible:
+    enum:
+      - adi,adxl345
+      - adi,adxl375
+
+  reg:
+    maxItems: 1
+
+  spi-cpha: true
+
+  spi-cpol: true
+
+  interrupts:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+    i2c0 {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        /* Example for a I2C device node */
+        accelerometer@2a {
+            compatible = "adi,adxl345";
+            reg = <0x53>;
+            interrupt-parent = <&gpio0>;
+            interrupts = <0 IRQ_TYPE_LEVEL_HIGH>;
+        };
+    };
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+    spi0 {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        /* Example for a SPI device node */
+        accelerometer@0 {
+            compatible = "adi,adxl345";
+            reg = <0>;
+            spi-max-frequency = <5000000>;
+            spi-cpol;
+            spi-cpha;
+            interrupt-parent = <&gpio0>;
+            interrupts = <0 IRQ_TYPE_LEVEL_HIGH>;
+        };
+    };
diff --git a/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml b/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml
new file mode 100644
index 000000000000..a7fafb9bf5c6
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/accelerometers/adi,adxl372.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices ADXL372 3-Axis, +/-(200g) Digital Accelerometer
+
+maintainers:
+  - Stefan Popa <stefan.popa@analog.com>
+
+description: |
+  Analog Devices ADXL372 3-Axis, +/-(200g) Digital Accelerometer that supports
+  both I2C & SPI interfaces
+    https://www.analog.com/en/products/adxl372.html
+
+properties:
+  compatible:
+    enum:
+      - adi,adxl372
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+examples:
+  - |
+        #include <dt-bindings/gpio/gpio.h>
+        #include <dt-bindings/interrupt-controller/irq.h>
+        i2c0 {
+                #address-cells = <1>;
+                #size-cells = <0>;
+
+                /* Example for a I2C device node */
+                accelerometer@53 {
+                        compatible = "adi,adxl372";
+                        reg = <0x53>;
+                        interrupt-parent = <&gpio>;
+                        interrupts = <25 IRQ_TYPE_EDGE_FALLING>;
+                };
+        };
+  - |
+        #include <dt-bindings/gpio/gpio.h>
+        #include <dt-bindings/interrupt-controller/irq.h>
+        spi0 {
+                #address-cells = <1>;
+                #size-cells = <0>;
+
+                accelerometer@0 {
+                        compatible = "adi,adxl372";
+                        reg = <0>;
+                        spi-max-frequency = <1000000>;
+                        interrupt-parent = <&gpio>;
+                        interrupts = <25 IRQ_TYPE_EDGE_FALLING>;
+                };
+        };
diff --git a/Documentation/devicetree/bindings/iio/accel/adxl345.txt b/Documentation/devicetree/bindings/iio/accel/adxl345.txt
deleted file mode 100644
index f9525f6e3d43..000000000000
--- a/Documentation/devicetree/bindings/iio/accel/adxl345.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-Analog Devices ADXL345/ADXL375 3-Axis Digital Accelerometers
-
-http://www.analog.com/en/products/mems/accelerometers/adxl345.html
-http://www.analog.com/en/products/sensors-mems/accelerometers/adxl375.html
-
-Required properties:
- - compatible : should be one of
-		"adi,adxl345"
-		"adi,adxl375"
- - reg : the I2C address or SPI chip select number of the sensor
-
-Required properties for SPI bus usage:
- - spi-max-frequency : set maximum clock frequency, must be 5000000
- - spi-cpol and spi-cpha : must be defined for adxl345 to enable SPI mode 3
-
-Optional properties:
- - interrupts: interrupt mapping for IRQ as documented in
-   Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
-
-Example for a I2C device node:
-
-	accelerometer@2a {
-		compatible = "adi,adxl345";
-		reg = <0x53>;
-		interrupt-parent = <&gpio1>;
-		interrupts = <0 IRQ_TYPE_LEVEL_HIGH>;
-	};
-
-Example for a SPI device node:
-
-	accelerometer@0 {
-		compatible = "adi,adxl345";
-		reg = <0>;
-		spi-max-frequency = <5000000>;
-		spi-cpol;
-		spi-cpha;
-		interrupt-parent = <&gpio1>;
-		interrupts = <0 IRQ_TYPE_LEVEL_HIGH>;
-	};
diff --git a/Documentation/devicetree/bindings/iio/accel/adxl372.txt b/Documentation/devicetree/bindings/iio/accel/adxl372.txt
deleted file mode 100644
index a289964756a7..000000000000
--- a/Documentation/devicetree/bindings/iio/accel/adxl372.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-Analog Devices ADXL372 3-Axis, +/-(200g) Digital Accelerometer
-
-http://www.analog.com/media/en/technical-documentation/data-sheets/adxl372.pdf
-
-Required properties:
- - compatible : should be "adi,adxl372"
- - reg: the I2C address or SPI chip select number for the device
-
-Required properties for SPI bus usage:
- - spi-max-frequency: Max SPI frequency to use
-
-Optional properties:
- - interrupts: interrupt mapping for IRQ as documented in
-   Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
-
-Example for a I2C device node:
-
-	accelerometer@53 {
-		compatible = "adi,adxl372";
-		reg = <0x53>;
-		interrupt-parent = <&gpio>;
-		interrupts = <25 IRQ_TYPE_EDGE_FALLING>;
-	};
-
-Example for a SPI device node:
-
-	accelerometer@0 {
-		compatible = "adi,adxl372";
-		reg = <0>;
-		spi-max-frequency = <1000000>;
-		interrupt-parent = <&gpio>;
-		interrupts = <25 IRQ_TYPE_EDGE_FALLING>;
-	};
diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7124.txt b/Documentation/devicetree/bindings/iio/adc/adi,ad7124.txt
deleted file mode 100644
index 416273dce569..000000000000
--- a/Documentation/devicetree/bindings/iio/adc/adi,ad7124.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-Analog Devices AD7124 ADC device driver
-
-Required properties for the AD7124:
-	- compatible: Must be one of "adi,ad7124-4" or "adi,ad7124-8"
-	- reg: SPI chip select number for the device
-	- spi-max-frequency: Max SPI frequency to use
-		see: Documentation/devicetree/bindings/spi/spi-bus.txt
-	- clocks: phandle to the master clock (mclk)
-		see: Documentation/devicetree/bindings/clock/clock-bindings.txt
-	- clock-names: Must be "mclk".
-	- interrupts: IRQ line for the ADC
-		see: Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
-
-	  Required properties:
-		* #address-cells: Must be 1.
-		* #size-cells: Must be 0.
-
-	  Subnode(s) represent the external channels which are connected to the ADC.
-	  Each subnode represents one channel and has the following properties:
-		Required properties:
-			* reg: The channel number. It can have up to 4 channels on ad7124-4
-			  and 8 channels on ad7124-8, numbered from 0 to 15.
-			* diff-channels: see: Documentation/devicetree/bindings/iio/adc/adc.txt
-
-		Optional properties:
-			* bipolar: see: Documentation/devicetree/bindings/iio/adc/adc.txt
-			* adi,reference-select: Select the reference source to use when
-			  converting on the the specific channel. Valid values are:
-			  0: REFIN1(+)/REFIN1(−).
-			  1: REFIN2(+)/REFIN2(−).
-			  3: AVDD
-			  If this field is left empty, internal reference is selected.
-
-Optional properties:
-	- refin1-supply: refin1 supply can be used as reference for conversion.
-	- refin2-supply: refin2 supply can be used as reference for conversion.
-	- avdd-supply: avdd supply can be used as reference for conversion.
-
-Example:
-	adc@0 {
-		compatible = "adi,ad7124-4";
-		reg = <0>;
-		spi-max-frequency = <5000000>;
-		interrupts = <25 2>;
-		interrupt-parent = <&gpio>;
-		refin1-supply = <&adc_vref>;
-		clocks = <&ad7124_mclk>;
-		clock-names = "mclk";
-
-		#address-cells = <1>;
-		#size-cells = <0>;
-
-		channel@0 {
-			reg = <0>;
-			diff-channels = <0 1>;
-			adi,reference-select = <0>;
-		};
-
-		channel@1 {
-			reg = <1>;
-			bipolar;
-			diff-channels = <2 3>;
-			adi,reference-select = <0>;
-		};
-
-		channel@2 {
-			reg = <2>;
-			diff-channels = <4 5>;
-		};
-
-		channel@3 {
-			reg = <3>;
-			diff-channels = <6 7>;
-		};
-	};
diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml
new file mode 100644
index 000000000000..9692b7f719f5
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+# Copyright 2019 Analog Devices Inc.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bindings/iio/adc/adi,ad7124.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices AD7124 ADC device driver
+
+maintainers:
+  - Stefan Popa <stefan.popa@analog.com>
+
+description: |
+  Bindings for the Analog Devices AD7124 ADC device. Datasheet can be
+  found here:
+    https://www.analog.com/media/en/technical-documentation/data-sheets/AD7124-8.pdf
+
+properties:
+  compatible:
+    enum:
+      - adi,ad7124-4
+      - adi,ad7124-8
+
+  reg:
+    description: SPI chip select number for the device
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+    description: phandle to the master clock (mclk)
+
+  clock-names:
+    items:
+      - const: mclk
+
+  interrupts:
+    description: IRQ line for the ADC
+    maxItems: 1
+
+  '#address-cells':
+    const: 1
+
+  '#size-cells':
+    const: 0
+
+  refin1-supply:
+    description: refin1 supply can be used as reference for conversion.
+    maxItems: 1
+
+  refin2-supply:
+    description: refin2 supply can be used as reference for conversion.
+    maxItems: 1
+
+  avdd-supply:
+    description: avdd supply can be used as reference for conversion.
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - clock-names
+  - interrupts
+
+patternProperties:
+  "^channel@([0-9]|1[0-5])$":
+    type: object
+    description: |
+      Represents the external channels which are connected to the ADC.
+      See Documentation/devicetree/bindings/iio/adc/adc.txt.
+
+    properties:
+      reg:
+        description: |
+          The channel number. It can have up to 8 channels on ad7124-4
+          and 16 channels on ad7124-8, numbered from 0 to 15.
+        items:
+         minimum: 0
+         maximum: 15
+
+      adi,reference-select:
+        description: |
+          Select the reference source to use when converting on
+          the specific channel. Valid values are:
+          0: REFIN1(+)/REFIN1(−).
+          1: REFIN2(+)/REFIN2(−).
+          3: AVDD
+          If this field is left empty, internal reference is selected.
+        allOf:
+          - $ref: /schemas/types.yaml#/definitions/uint32
+          - enum: [0, 1, 3]
+
+      diff-channels:
+        description: see Documentation/devicetree/bindings/iio/adc/adc.txt
+        items:
+          minimum: 0
+          maximum: 15
+
+      bipolar:
+        description: see Documentation/devicetree/bindings/iio/adc/adc.txt
+        type: boolean
+
+      adi,buffered-positive:
+        description: Enable buffered mode for positive input.
+        type: boolean
+
+      adi,buffered-negative:
+        description: Enable buffered mode for negative input.
+        type: boolean
+
+    required:
+      - reg
+      - diff-channels
+
+examples:
+  - |
+    spi {
+      #address-cells = <1>;
+      #size-cells = <0>;
+
+      adc@0 {
+        compatible = "adi,ad7124-4";
+        reg = <0>;
+        spi-max-frequency = <5000000>;
+        interrupts = <25 2>;
+        interrupt-parent = <&gpio>;
+        refin1-supply = <&adc_vref>;
+        clocks = <&ad7124_mclk>;
+        clock-names = "mclk";
+
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        channel@0 {
+          reg = <0>;
+          diff-channels = <0 1>;
+          adi,reference-select = <0>;
+          adi,buffered-positive;
+        };
+
+        channel@1 {
+          reg = <1>;
+          bipolar;
+          diff-channels = <2 3>;
+          adi,reference-select = <0>;
+          adi,buffered-positive;
+          adi,buffered-negative;
+        };
+
+        channel@2 {
+          reg = <2>;
+          diff-channels = <4 5>;
+        };
+
+        channel@3 {
+          reg = <3>;
+          diff-channels = <6 7>;
+        };
+      };
+    };
diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7780.txt b/Documentation/devicetree/bindings/iio/adc/adi,ad7780.txt
deleted file mode 100644
index 440e52555349..000000000000
--- a/Documentation/devicetree/bindings/iio/adc/adi,ad7780.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-* Analog Devices AD7170/AD7171/AD7780/AD7781
-
-Data sheets:
-
-- AD7170:
-	* https://www.analog.com/media/en/technical-documentation/data-sheets/AD7170.pdf
-- AD7171:
-	* https://www.analog.com/media/en/technical-documentation/data-sheets/AD7171.pdf
-- AD7780:
-	* https://www.analog.com/media/en/technical-documentation/data-sheets/ad7780.pdf
-- AD7781:
-	* https://www.analog.com/media/en/technical-documentation/data-sheets/AD7781.pdf
-
-Required properties:
-
-- compatible: should be one of
-	* "adi,ad7170"
-	* "adi,ad7171"
-	* "adi,ad7780"
-	* "adi,ad7781"
-- reg: spi chip select number for the device
-- vref-supply: the regulator supply for the ADC reference voltage
-
-Optional properties:
-
-- powerdown-gpios:  must be the device tree identifier of the PDRST pin. If
-		    specified, it will be asserted during driver probe. As the
-		    line is active high, it should be marked GPIO_ACTIVE_HIGH.
-- adi,gain-gpios:   must be the device tree identifier of the GAIN pin. Only for
-		    the ad778x chips. If specified, it will be asserted during
-		    driver probe. As the line is active low, it should be marked
-		    GPIO_ACTIVE_LOW.
-- adi,filter-gpios: must be the device tree identifier of the FILTER pin. Only
-		    for the ad778x chips. If specified, it will be asserted
-		    during driver probe. As the line is active low, it should be
-		    marked GPIO_ACTIVE_LOW.
-
-Example:
-
-adc@0 {
-	compatible =  "adi,ad7780";
-	reg =	      <0>;
-	vref-supply = <&vdd_supply>
-
-	powerdown-gpios  = <&gpio 12 GPIO_ACTIVE_HIGH>;
-	adi,gain-gpios   = <&gpio  5 GPIO_ACTIVE_LOW>;
-	adi,filter-gpios = <&gpio 15 GPIO_ACTIVE_LOW>;
-};
diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7780.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7780.yaml
new file mode 100644
index 000000000000..d1109416963c
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7780.yaml
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/adc/adi,ad7780.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices AD7170/AD7171/AD7780/AD7781 analog to digital converters
+
+maintainers:
+  - Michael Hennerich <michael.hennerich@analog.com>
+
+description: |
+  The ad7780 is a sigma-delta analog to digital converter. This driver provides
+  reading voltage values and status bits from both the ad778x and ad717x series.
+  Its interface also allows writing on the FILTER and GAIN GPIO pins on the
+  ad778x.
+
+  Specifications on the converters can be found at:
+    AD7170:
+      https://www.analog.com/media/en/technical-documentation/data-sheets/AD7170.pdf
+    AD7171:
+      https://www.analog.com/media/en/technical-documentation/data-sheets/AD7171.pdf
+    AD7780:
+      https://www.analog.com/media/en/technical-documentation/data-sheets/ad7780.pdf
+    AD7781:
+      https://www.analog.com/media/en/technical-documentation/data-sheets/AD7781.pdf
+
+properties:
+  compatible:
+    enum:
+      - adi,ad7170
+      - adi,ad7171
+      - adi,ad7780
+      - adi,ad7781
+
+  reg:
+    maxItems: 1
+
+  avdd-supply:
+    description:
+      The regulator supply for the ADC reference voltage.
+    maxItems: 1
+
+  powerdown-gpios:
+    description:
+      Must be the device tree identifier of the PDRST pin. If
+      specified, it will be asserted during driver probe. As the
+      line is active high, it should be marked GPIO_ACTIVE_HIGH.
+    maxItems: 1
+
+  adi,gain-gpios:
+    description:
+      Must be the device tree identifier of the GAIN pin. Only for
+      the ad778x chips. If specified, it will be asserted during
+      driver probe. As the line is active low, it should be marked
+      GPIO_ACTIVE_LOW.
+    maxItems: 1
+
+  adi,filter-gpios:
+    description:
+      Must be the device tree identifier of the FILTER pin. Only
+      for the ad778x chips. If specified, it will be asserted
+      during driver probe. As the line is active low, it should be
+      marked GPIO_ACTIVE_LOW.
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    spi0 {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        adc@0 {
+            compatible = "adi,ad7780";
+            reg = <0>;
+
+            avdd-supply      = <&vdd_supply>;
+            powerdown-gpios  = <&gpio0 12 GPIO_ACTIVE_HIGH>;
+            adi,gain-gpios   = <&gpio1  5 GPIO_ACTIVE_LOW>;
+            adi,filter-gpios = <&gpio2 15 GPIO_ACTIVE_LOW>;
+        };
+    };
diff --git a/Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml b/Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml
index 8a4100ceeaf2..d76ece97c76c 100644
--- a/Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml
+++ b/Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml
@@ -61,6 +61,6 @@ examples:
         compatible = "avia,hx711";
         sck-gpios = <&gpio3 10 GPIO_ACTIVE_HIGH>;
         dout-gpios = <&gpio0 7 GPIO_ACTIVE_HIGH>;
-        avdd-suppy = <&avdd>;
+        avdd-supply = <&avdd>;
         clock-frequency = <100000>;
     };
diff --git a/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt b/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt
index 0df9befdaecc..78c06e05c8e5 100644
--- a/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt
+++ b/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt
@@ -13,8 +13,10 @@ Required properties:
   - compatible: Should be one of:
     - "mediatek,mt2701-auxadc": For MT2701 family of SoCs
     - "mediatek,mt2712-auxadc": For MT2712 family of SoCs
+    - "mediatek,mt6765-auxadc": For MT6765 family of SoCs
     - "mediatek,mt7622-auxadc": For MT7622 family of SoCs
     - "mediatek,mt8173-auxadc": For MT8173 family of SoCs
+    - "mediatek,mt8183-auxadc", "mediatek,mt8173-auxadc": For MT8183 family of SoCs
   - reg: Address range of the AUXADC unit.
   - clocks: Should contain a clock specifier for each entry in clock-names
   - clock-names: Should contain "main".
diff --git a/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt b/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt
index 8346bcb04ad7..93a0bd2efc05 100644
--- a/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt
+++ b/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt
@@ -38,6 +38,7 @@ Required properties:
     It's required on stm32h7.
 - clock-names: Must be "adc" and/or "bus" depending on part used.
 - interrupt-controller: Identifies the controller node as interrupt-parent
+- vdda-supply: Phandle to the vdda input analog voltage.
 - vref-supply: Phandle to the vref input analog reference voltage.
 - #interrupt-cells = <1>;
 - #address-cells = <1>;
diff --git a/Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.txt b/Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.txt
deleted file mode 100644
index 6eee2709b5b6..000000000000
--- a/Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-* Sensirion SPS30 particulate matter sensor
-
-Required properties:
-- compatible: must be "sensirion,sps30"
-- reg: the I2C address of the sensor
-
-Example:
-
-sps30@69 {
-	compatible = "sensirion,sps30";
-	reg = <0x69>;
-};
diff --git a/Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.yaml b/Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.yaml
new file mode 100644
index 000000000000..50a50a0d7070
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.yaml
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/chemical/sensirion,sps30.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Sensirion SPS30 particulate matter sensor
+
+maintainers:
+  - Tomasz Duszynski <tduszyns@gmail.com>
+
+description: |
+  Air pollution sensor capable of measuring mass concentration of dust
+  particles.
+
+properties:
+  compatible:
+    enum:
+      - sensirion,sps30
+  reg:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+
+examples:
+  - |
+    i2c {
+      #address-cells = <1>;
+      #size-cells = <0>;
+
+      air-pollution-sensor@69 {
+        compatible = "sensirion,sps30";
+        reg = <0x69>;
+      };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/iio/frequency/adf4371.yaml b/Documentation/devicetree/bindings/iio/frequency/adf4371.yaml
new file mode 100644
index 000000000000..7ec3ec94356b
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/frequency/adf4371.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/frequency/adf4371.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices ADF4371/ADF4372 Wideband Synthesizers
+
+maintainers:
+  - Popa Stefan <stefan.popa@analog.com>
+
+description: |
+  Analog Devices ADF4371/ADF4372 SPI Wideband Synthesizers
+  https://www.analog.com/media/en/technical-documentation/data-sheets/adf4371.pdf
+  https://www.analog.com/media/en/technical-documentation/data-sheets/adf4372.pdf
+
+properties:
+  compatible:
+    enum:
+      - adi,adf4371
+      - adi,adf4372
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    description:
+      Definition of the external clock (see clock/clock-bindings.txt)
+    maxItems: 1
+
+  clock-names:
+    description:
+      Must be "clkin"
+    maxItems: 1
+
+  adi,mute-till-lock-en:
+    type: boolean
+    description:
+      If this property is present, then the supply current to RF8P and RF8N
+      output stage will shut down until the ADF4371/ADF4372 achieves lock as
+      measured by the digital lock detect circuitry.
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - clock-names
+
+examples:
+  - |
+    spi0 {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        frequency@0 {
+                compatible = "adi,adf4371";
+                reg = <0>;
+                spi-max-frequency = <1000000>;
+                clocks = <&adf4371_clkin>;
+                clock-names = "clkin";
+        };
+    };
+...
diff --git a/Documentation/devicetree/bindings/iio/light/isl29018.txt b/Documentation/devicetree/bindings/iio/light/isl29018.txt
deleted file mode 100644
index b9bbde3e13ed..000000000000
--- a/Documentation/devicetree/bindings/iio/light/isl29018.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-* ISL 29018/29023/29035 I2C ALS, Proximity, and Infrared sensor
-
-Required properties:
-
-  - compatible: Should be one of
-		"isil,isl29018"
-		"isil,isl29023"
-		"isil,isl29035"
-  - reg: the I2C address of the device
-
-Optional properties:
-
-  - interrupts: the sole interrupt generated by the device
-
-  Refer to interrupt-controller/interrupts.txt for generic interrupt client
-  node bindings.
-
-  - vcc-supply: phandle to the regulator that provides power to the sensor.
-
-Example:
-
-isl29018@44 {
-	compatible = "isil,isl29018";
-	reg = <0x44>;
-	interrupt-parent = <&gpio>;
-	interrupts = <TEGRA_GPIO(Z, 2) IRQ_TYPE_LEVEL_HIGH>;
-};
diff --git a/Documentation/devicetree/bindings/iio/light/isl29018.yaml b/Documentation/devicetree/bindings/iio/light/isl29018.yaml
new file mode 100644
index 000000000000..cbb00be8f359
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/light/isl29018.yaml
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/light/isl29018.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: |
+  Intersil 29018/29023/29035 Ambient Light, Infrared Light, and Proximity Sensor
+
+maintainers:
+  - Brian Masney <masneyb@onstation.org>
+
+description: |
+  Ambient and infrared light sensing with proximity detection over an i2c
+  interface.
+
+  https://www.renesas.com/us/en/www/doc/datasheet/isl29018.pdf
+  https://www.renesas.com/us/en/www/doc/datasheet/isl29023.pdf
+  https://www.renesas.com/us/en/www/doc/datasheet/isl29035.pdf
+
+properties:
+  compatible:
+    enum:
+      - isil,isl29018
+      - isil,isl29023
+      - isil,isl29035
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  vcc-supply:
+    description: Regulator that provides power to the sensor
+
+required:
+  - compatible
+  - reg
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    i2c {
+
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        sensor@44 {
+                compatible = "isil,isl29018";
+                reg = <0x44>;
+                interrupts-extended = <&msmgpio 61 IRQ_TYPE_LEVEL_HIGH>;
+        };
+    };
+...
diff --git a/Documentation/devicetree/bindings/iio/light/tsl2583.txt b/Documentation/devicetree/bindings/iio/light/tsl2583.txt
deleted file mode 100644
index 059dffa1829a..000000000000
--- a/Documentation/devicetree/bindings/iio/light/tsl2583.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-* TAOS TSL 2580/2581/2583 ALS sensor
-
-Required properties:
-
-  - compatible: Should be one of
-		"amstaos,tsl2580"
-		"amstaos,tsl2581"
-		"amstaos,tsl2583"
-  - reg: the I2C address of the device
-
-Optional properties:
-
-  - interrupts: the sole interrupt generated by the device
-
-  Refer to interrupt-controller/interrupts.txt for generic interrupt client
-  node bindings.
-
-  - vcc-supply: phandle to the regulator that provides power to the sensor.
-
-Example:
-
-tsl2581@29 {
-	compatible = "amstaos,tsl2581";
-	reg = <0x29>;
-};
diff --git a/Documentation/devicetree/bindings/iio/light/tsl2583.yaml b/Documentation/devicetree/bindings/iio/light/tsl2583.yaml
new file mode 100644
index 000000000000..e86ef64ecf03
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/light/tsl2583.yaml
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/light/tsl2583.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: AMS/TAOS Ambient Light Sensor (ALS)
+
+maintainers:
+  - Brian Masney <masneyb@onstation.org>
+
+description: |
+  Ambient light sensing with an i2c interface.
+
+properties:
+  compatible:
+    enum:
+      - amstaos,tsl2580
+      - amstaos,tsl2581
+      - amstaos,tsl2583
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  vcc-supply:
+    description: Regulator that provides power to the sensor
+
+required:
+  - compatible
+  - reg
+
+examples:
+  - |
+    i2c {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        light-sensor@29 {
+                compatible = "amstaos,tsl2581";
+                reg = <0x29>;
+        };
+    };
+...
diff --git a/Documentation/devicetree/bindings/iio/light/tsl2772.txt b/Documentation/devicetree/bindings/iio/light/tsl2772.txt
deleted file mode 100644
index 1c5e6f17a1df..000000000000
--- a/Documentation/devicetree/bindings/iio/light/tsl2772.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-* AMS/TAOS ALS and proximity sensor
-
-Required properties:
-
-  - compatible: Should be one of
-		"amstaos,tsl2571"
-		"amstaos,tsl2671"
-		"amstaos,tmd2671"
-		"amstaos,tsl2771"
-		"amstaos,tmd2771"
-		"amstaos,tsl2572"
-		"amstaos,tsl2672"
-		"amstaos,tmd2672"
-		"amstaos,tsl2772"
-		"amstaos,tmd2772"
-		"avago,apds9930"
-  - reg: the I2C address of the device
-
-Optional properties:
-
-  - amstaos,proximity-diodes - proximity diodes to enable. <0>, <1>, or <0 1>
-                               are the only valid values.
-  - led-max-microamp - current for the proximity LED. Must be 100000, 50000,
-                       25000, or 13000.
-  - vdd-supply: phandle to the regulator that provides power to the sensor.
-  - vddio-supply: phandle to the regulator that provides power to the bus.
-  - interrupts: the sole interrupt generated by the device
-
-  Refer to interrupt-controller/interrupts.txt for generic interrupt client
-  node bindings.
-
-Example:
-
-tsl2772@39 {
-	compatible = "amstaos,tsl2772";
-	reg = <0x39>;
-	interrupts-extended = <&msmgpio 61 IRQ_TYPE_EDGE_FALLING>;
-	vdd-supply = <&pm8941_l17>;
-	vddio-supply = <&pm8941_lvs1>;
-	amstaos,proximity-diodes = <0>;
-	led-max-microamp = <100000>;
-};
diff --git a/Documentation/devicetree/bindings/iio/light/tsl2772.yaml b/Documentation/devicetree/bindings/iio/light/tsl2772.yaml
new file mode 100644
index 000000000000..ed2c3d5eadf5
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/light/tsl2772.yaml
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/light/tsl2772.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: AMS/TAOS Ambient Light Sensor (ALS) and Proximity Detector
+
+maintainers:
+  - Brian Masney <masneyb@onstation.org>
+
+description: |
+  Ambient light sensing and proximity detection with an i2c interface.
+  https://ams.com/documents/20143/36005/TSL2772_DS000181_2-00.pdf
+
+properties:
+  compatible:
+    enum:
+      - amstaos,tsl2571
+      - amstaos,tsl2671
+      - amstaos,tmd2671
+      - amstaos,tsl2771
+      - amstaos,tmd2771
+      - amstaos,tsl2572
+      - amstaos,tsl2672
+      - amstaos,tmd2672
+      - amstaos,tsl2772
+      - amstaos,tmd2772
+      - avago,apds9930
+
+  reg:
+    maxItems: 1
+
+  amstaos,proximity-diodes:
+    description: Proximity diodes to enable
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32-array
+      - minItems: 1
+        maxItems: 2
+        items:
+          minimum: 0
+          maximum: 1
+
+  interrupts:
+    maxItems: 1
+
+  led-max-microamp:
+    description: Current for the proximity LED
+    enum:
+      - 13000
+      - 25000
+      - 50000
+      - 100000
+
+  vdd-supply:
+    description: Regulator that provides power to the sensor
+
+  vddio-supply:
+    description: Regulator that provides power to the bus
+
+required:
+  - compatible
+  - reg
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    i2c {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        sensor@39 {
+                compatible = "amstaos,tsl2772";
+                reg = <0x39>;
+                interrupts-extended = <&msmgpio 61 IRQ_TYPE_EDGE_FALLING>;
+                vdd-supply = <&pm8941_l17>;
+                vddio-supply = <&pm8941_lvs1>;
+                amstaos,proximity-diodes = <0>;
+                led-max-microamp = <100000>;
+        };
+    };
+...
diff --git a/Documentation/devicetree/bindings/input/elan_i2c.txt b/Documentation/devicetree/bindings/input/elan_i2c.txt
index 797607460735..9963247706f2 100644
--- a/Documentation/devicetree/bindings/input/elan_i2c.txt
+++ b/Documentation/devicetree/bindings/input/elan_i2c.txt
@@ -13,9 +13,20 @@ Optional properties:
   pinctrl binding [1]).
 - vcc-supply: a phandle for the regulator supplying 3.3V power.
 - elan,trackpoint: touchpad can support a trackpoint (boolean)
+- elan,clickpad: touchpad is a clickpad (the entire surface is a button)
+- elan,middle-button: touchpad has a physical middle button
+- elan,x_traces: number of antennas on the x axis
+- elan,y_traces: number of antennas on the y axis
+- some generic touchscreen properties [2]:
+  * touchscreen-size-x
+  * touchscreen-size-y
+  * touchscreen-x-mm
+  * touchscreen-y-mm
+
 
 [0]: Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
 [1]: Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
+[2]: Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt
 
 Example:
 	&i2c1 {
diff --git a/Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt b/Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt
index 496125c6bfb7..507b737612ea 100644
--- a/Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt
+++ b/Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt
@@ -5,6 +5,7 @@ Required properties:
  - compatible: should be one of the following string:
 		"allwinner,sun4i-a10-lradc-keys"
 		"allwinner,sun8i-a83t-r-lradc"
+		"allwinner,sun50i-a64-lradc", "allwinner,sun8i-a83t-r-lradc"
  - reg: mmio address range of the chip
  - interrupts: interrupt to which the chip is connected
  - vref-supply: powersupply for the lradc reference voltage
diff --git a/Documentation/devicetree/bindings/interrupt-controller/amazon,al-fic.txt b/Documentation/devicetree/bindings/interrupt-controller/amazon,al-fic.txt
new file mode 100644
index 000000000000..4e82fd575cec
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/amazon,al-fic.txt
@@ -0,0 +1,29 @@
+Amazon's Annapurna Labs Fabric Interrupt Controller
+
+Required properties:
+
+- compatible: should be "amazon,al-fic"
+- reg: physical base address and size of the registers
+- interrupt-controller: identifies the node as an interrupt controller
+- #interrupt-cells: must be 2.
+  First cell defines the index of the interrupt within the controller.
+  Second cell is used to specify the trigger type and must be one of the
+  following:
+    - bits[3:0] trigger type and level flags
+	1 = low-to-high edge triggered
+	4 = active high level-sensitive
+- interrupt-parent: specifies the parent interrupt controller.
+- interrupts: describes which input line in the interrupt parent, this
+  fic's output is connected to. This field property depends on the parent's
+  binding
+
+Example:
+
+amazon_fic: interrupt-controller@0xfd8a8500 {
+	compatible = "amazon,al-fic";
+	interrupt-controller;
+	#interrupt-cells = <2>;
+	reg = <0x0 0xfd8a8500 0x0 0x1000>;
+	interrupt-parent = <&gic>;
+	interrupts = <GIC_SPI 0x0 IRQ_TYPE_LEVEL_HIGH>;
+};
diff --git a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt
index 1502a51548bb..7d531d5fff29 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt
@@ -15,6 +15,7 @@ Required properties:
     "amlogic,meson-gxbb-gpio-intc" for GXBB SoCs (S905) or
     "amlogic,meson-gxl-gpio-intc" for GXL SoCs (S905X, S912)
     "amlogic,meson-axg-gpio-intc" for AXG SoCs (A113D, A113X)
+    "amlogic,meson-g12a-gpio-intc" for G12A SoCs (S905D2, S905X2, S905Y2)
 - reg : Specifies base physical address and size of the registers.
 - interrupt-controller : Identifies the node as an interrupt controller.
 - #interrupt-cells : Specifies the number of cells needed to encode an
diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic.yaml b/Documentation/devicetree/bindings/interrupt-controller/arm,gic.yaml
index 54838d4ea44c..9a47820ef346 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic.yaml
+++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic.yaml
@@ -92,6 +92,8 @@ properties:
     minItems: 2
     maxItems: 4
 
+  ranges: true
+
   interrupts:
     description: Interrupt source of the parent interrupt controller on
       secondary GICs, or VGIC maintenance interrupt on primary GIC (see
@@ -197,28 +199,28 @@ examples:
     interrupt-controller@e1101000 {
       compatible = "arm,gic-400";
       #interrupt-cells = <3>;
-      #address-cells = <2>;
-      #size-cells = <2>;
+      #address-cells = <1>;
+      #size-cells = <1>;
       interrupt-controller;
       interrupts = <1 8 0xf04>;
-      ranges = <0 0 0 0xe1100000 0 0x100000>;
-      reg = <0x0 0xe1110000 0 0x01000>,
-            <0x0 0xe112f000 0 0x02000>,
-            <0x0 0xe1140000 0 0x10000>,
-            <0x0 0xe1160000 0 0x10000>;
+      ranges = <0 0xe1100000 0x100000>;
+      reg = <0xe1110000 0x01000>,
+            <0xe112f000 0x02000>,
+            <0xe1140000 0x10000>,
+            <0xe1160000 0x10000>;
 
-      v2m0: v2m@8000 {
+      v2m0: v2m@80000 {
         compatible = "arm,gic-v2m-frame";
         msi-controller;
-        reg = <0x0 0x80000 0 0x1000>;
+        reg = <0x80000 0x1000>;
       };
 
       //...
 
-      v2mN: v2m@9000 {
+      v2mN: v2m@90000 {
         compatible = "arm,gic-v2m-frame";
         msi-controller;
-        reg = <0x0 0x90000 0 0x1000>;
+        reg = <0x90000 0x1000>;
       };
     };
 ...
diff --git a/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt b/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt
index ab921f1698fb..e13405355166 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt
@@ -6,11 +6,16 @@ C-SKY Multi-processors Interrupt Controller is designed for ck807/ck810/ck860
 SMP soc, and it also could be used in non-SMP system.
 
 Interrupt number definition:
-
   0-15  : software irq, and we use 15 as our IPI_IRQ.
  16-31  : private  irq, and we use 16 as the co-processor timer.
  31-1024: common irq for soc ip.
 
+Interrupt triger mode: (Defined in dt-bindings/interrupt-controller/irq.h)
+ IRQ_TYPE_LEVEL_HIGH (default)
+ IRQ_TYPE_LEVEL_LOW
+ IRQ_TYPE_EDGE_RISING
+ IRQ_TYPE_EDGE_FALLING
+
 =============================
 intc node bindings definition
 =============================
@@ -26,15 +31,22 @@ intc node bindings definition
 	- #interrupt-cells
 		Usage: required
 		Value type: <u32>
-		Definition: must be <1>
+		Definition: <2>
 	- interrupt-controller:
 		Usage: required
 
-Examples:
+Examples: ("interrupts = <irq_num IRQ_TYPE_XXX>")
 ---------
+#include <dt-bindings/interrupt-controller/irq.h>
 
 	intc: interrupt-controller {
 		compatible = "csky,mpintc";
-		#interrupt-cells = <1>;
+		#interrupt-cells = <2>;
 		interrupt-controller;
 	};
+
+	device: device-example {
+		...
+		interrupts = <34 IRQ_TYPE_EDGE_RISING>;
+		interrupt-parent = <&intc>;
+	};
diff --git a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt
index 930fb462fd9f..0ebfc952cb34 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt
@@ -23,7 +23,7 @@ Required properties:
 - marvell,spi-base     : List of GIC base SPI interrupts, one for each
                          ODMI frame. Those SPI interrupts are 0-based,
                          i.e marvell,spi-base = <128> will use SPI #96.
-                         See Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt
+                         See Documentation/devicetree/bindings/interrupt-controller/arm,gic.yaml
                          for details about the GIC Device Tree binding.
 
 Example:
diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,rza1-irqc.txt b/Documentation/devicetree/bindings/interrupt-controller/renesas,rza1-irqc.txt
new file mode 100644
index 000000000000..727b7e4cd6e0
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,rza1-irqc.txt
@@ -0,0 +1,43 @@
+DT bindings for the Renesas RZ/A1 Interrupt Controller
+
+The RZ/A1 Interrupt Controller is a front-end for the GIC found on Renesas
+RZ/A1 and RZ/A2 SoCs:
+  - IRQ sense select for 8 external interrupts, 1:1-mapped to 8 GIC SPI
+    interrupts,
+  - NMI edge select.
+
+Required properties:
+  - compatible: Must be "renesas,<soctype>-irqc", and "renesas,rza1-irqc" as
+		fallback.
+		Examples with soctypes are:
+		  - "renesas,r7s72100-irqc" (RZ/A1H)
+		  - "renesas,r7s9210-irqc" (RZ/A2M)
+  - #interrupt-cells: Must be 2 (an interrupt index and flags, as defined
+				 in interrupts.txt in this directory)
+  - #address-cells: Must be zero
+  - interrupt-controller: Marks the device as an interrupt controller
+  - reg: Base address and length of the memory resource used by the interrupt
+         controller
+  - interrupt-map: Specifies the mapping from external interrupts to GIC
+		   interrupts
+  - interrupt-map-mask: Must be <7 0>
+
+Example:
+
+	irqc: interrupt-controller@fcfef800 {
+		compatible = "renesas,r7s72100-irqc", "renesas,rza1-irqc";
+		#interrupt-cells = <2>;
+		#address-cells = <0>;
+		interrupt-controller;
+		reg = <0xfcfef800 0x6>;
+		interrupt-map =
+			<0 0 &gic GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>,
+			<1 0 &gic GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>,
+			<2 0 &gic GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>,
+			<3 0 &gic GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>,
+			<4 0 &gic GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>,
+			<5 0 &gic GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>,
+			<6 0 &gic GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>,
+			<7 0 &gic GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>;
+		interrupt-map-mask = <7 0>;
+	};
diff --git a/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt b/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt
index 3538a214fff1..352f5e9c759b 100644
--- a/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt
+++ b/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt
@@ -36,4 +36,4 @@ Example:
             kcs_chan = <2>;
             status = "disabled";
         };
-    };
-\ No newline at end of file
+    };
diff --git a/Documentation/devicetree/bindings/leds/backlight/lm3630a-backlight.yaml b/Documentation/devicetree/bindings/leds/backlight/lm3630a-backlight.yaml
index 4d61fe0a98a4..dc129d9a329e 100644
--- a/Documentation/devicetree/bindings/leds/backlight/lm3630a-backlight.yaml
+++ b/Documentation/devicetree/bindings/leds/backlight/lm3630a-backlight.yaml
@@ -23,16 +23,17 @@ properties:
   reg:
     maxItems: 1
 
-  ti,linear-mapping-mode:
-    description: |
-      Enable linear mapping mode. If disabled, then it will use exponential
-      mapping mode in which the ramp up/down appears to have a more uniform
-      transition to the human eye.
-    type: boolean
+  '#address-cells':
+    const: 1
+
+  '#size-cells':
+    const: 0
 
 required:
   - compatible
   - reg
+  - '#address-cells'
+  - '#size-cells'
 
 patternProperties:
   "^led@[01]$":
@@ -48,7 +49,6 @@ patternProperties:
           in this property. The two current sinks can be controlled
           independently with both banks, or bank A can be configured to control
           both sinks with the led-sources property.
-        maxItems: 1
         minimum: 0
         maximum: 1
 
@@ -73,6 +73,13 @@ patternProperties:
         minimum: 0
         maximum: 255
 
+      ti,linear-mapping-mode:
+        description: |
+          Enable linear mapping mode. If disabled, then it will use exponential
+          mapping mode in which the ramp up/down appears to have a more uniform
+          transition to the human eye.
+        type: boolean
+
     required:
       - reg
 
diff --git a/Documentation/devicetree/bindings/leds/irled/spi-ir-led.txt b/Documentation/devicetree/bindings/leds/irled/spi-ir-led.txt
index 896b6997cf30..21882c8d4b0c 100644
--- a/Documentation/devicetree/bindings/leds/irled/spi-ir-led.txt
+++ b/Documentation/devicetree/bindings/leds/irled/spi-ir-led.txt
@@ -15,7 +15,7 @@ Optional properties:
 	- power-supply: specifies the power source. It can either be a regulator
 	  or a gpio which enables a regulator, i.e. a regulator-fixed as
 	  described in
-	  Documentation/devicetree/bindings/regulator/fixed-regulator.txt
+	  Documentation/devicetree/bindings/regulator/fixed-regulator.yaml
 
 Example:
 
diff --git a/Documentation/devicetree/bindings/leds/leds-lm36274.txt b/Documentation/devicetree/bindings/leds/leds-lm36274.txt
new file mode 100644
index 000000000000..39c230d59a4d
--- /dev/null
+++ b/Documentation/devicetree/bindings/leds/leds-lm36274.txt
@@ -0,0 +1,85 @@
+* Texas Instruments LM36274 4-Channel LCD Backlight Driver w/Integrated Bias
+
+The LM36274 is an integrated four-channel WLED driver and LCD bias supply.
+The backlight boost provides the power to bias four parallel LED strings with
+up to 29V total output voltage. The 11-bit LED current is programmable via
+the I2C bus and/or controlled via a logic level PWM input from 60 uA to 30 mA.
+
+Parent device properties are documented in
+Documentation/devicetree/bindings/mfd/ti-lmu.txt
+
+Regulator properties are documented in
+Documentation/devicetree/bindings/regulator/lm363x-regulator.txt
+
+Required backlight properties:
+	- compatible:
+		"ti,lm36274-backlight"
+	- reg : 0
+	- #address-cells : 1
+	- #size-cells : 0
+	- led-sources : Indicates which LED strings will be enabled.
+			Values from 0-3, sources is 0 based so strings will be
+			source value + 1.
+
+Optional backlight properties:
+	- label : see Documentation/devicetree/bindings/leds/common.txt
+	- linux,default-trigger :
+	   see Documentation/devicetree/bindings/leds/common.txt
+
+Example:
+
+HVLED string 1 and 3 are controlled by control bank A and HVLED 2 string is
+controlled by control bank B.
+
+lm36274@11 {
+	compatible = "ti,lm36274";
+	#address-cells = <1>;
+	#size-cells = <0>;
+	reg = <0x11>;
+
+	enable-gpios = <&gpio1 28 GPIO_ACTIVE_HIGH>;
+
+	regulators {
+		#address-cells = <1>;
+		#size-cells = <0>;
+		compatible = "ti,lm363x-regulator";
+
+		enable-gpios = <&pioC 0 GPIO_ACTIVE_HIGH>,
+			       <&pioC 1 GPIO_ACTIVE_HIGH>;
+
+		vboost {
+			regulator-name = "lcd_boost";
+			regulator-min-microvolt = <4000000>;
+			regulator-max-microvolt = <7150000>;
+			regulator-always-on;
+		};
+
+		vpos {
+			regulator-name = "lcd_vpos";
+			regulator-min-microvolt = <4000000>;
+			regulator-max-microvolt = <6500000>;
+		};
+
+		vneg {
+			regulator-name = "lcd_vneg";
+			regulator-min-microvolt = <4000000>;
+			regulator-max-microvolt = <6500000>;
+		};
+	};
+
+	backlight {
+		#address-cells = <1>;
+		#size-cells = <0>;
+		compatible = "ti,lm36274-backlight";
+
+		led@0 {
+			reg = <0>;
+			led-sources = <0 2>;
+			label = "white:backlight_cluster";
+			linux,default-trigger = "backlight";
+		};
+	};
+};
+
+For more product information please see the link below:
+http://www.ti.com/lit/ds/symlink/lm36274.pdf
diff --git a/Documentation/devicetree/bindings/leds/leds-lm3697.txt b/Documentation/devicetree/bindings/leds/leds-lm3697.txt
new file mode 100644
index 000000000000..63992d732959
--- /dev/null
+++ b/Documentation/devicetree/bindings/leds/leds-lm3697.txt
@@ -0,0 +1,73 @@
+* Texas Instruments - LM3697 Highly Efficient White LED Driver
+
+The LM3697 11-bit LED driver provides high-
+performance backlight dimming for 1, 2, or 3 series
+LED strings while delivering up to 90% efficiency.
+
+This device is suitable for display and keypad lighting
+
+Required properties:
+	- compatible:
+		"ti,lm3697"
+	- reg :  I2C slave address
+	- #address-cells : 1
+	- #size-cells : 0
+
+Optional properties:
+	- enable-gpios : GPIO pin to enable/disable the device
+	- vled-supply : LED supply
+
+Required child properties:
+	- reg : 0 - LED is Controlled by bank A
+		1 - LED is Controlled by bank B
+	- led-sources : Indicates which HVLED string is associated to which
+			control bank.  This is a zero based property so
+			HVLED1 = 0, HVLED2 = 1, HVLED3 = 2.
+			Additional information is contained
+			in Documentation/devicetree/bindings/leds/common.txt
+
+Optional child properties:
+	- ti,brightness-resolution - see Documentation/devicetree/bindings/mfd/ti-lmu.txt
+	- ramp-up-us: see Documentation/devicetree/bindings/mfd/ti-lmu.txt
+	- ramp-down-us: see Documentation/devicetree/bindings/mfd/ti-lmu.txt
+	- label : see Documentation/devicetree/bindings/leds/common.txt
+	- linux,default-trigger :
+	   see Documentation/devicetree/bindings/leds/common.txt
+
+Example:
+
+HVLED string 1 and 3 are controlled by control bank A and HVLED 2 string is
+controlled by control bank B.
+
+led-controller@36 {
+	compatible = "ti,lm3697";
+	#address-cells = <1>;
+	#size-cells = <0>;
+	reg = <0x36>;
+
+	enable-gpios = <&gpio1 28 GPIO_ACTIVE_HIGH>;
+	vled-supply = <&vbatt>;
+
+	led@0 {
+		reg = <0>;
+		led-sources = <0 2>;
+		ti,brightness-resolution = <2047>;
+		ramp-up-us = <5000>;
+		ramp-down-us = <1000>;
+		label = "white:first_backlight_cluster";
+		linux,default-trigger = "backlight";
+	};
+
+	led@1 {
+		reg = <1>;
+		led-sources = <1>;
+		ti,brightness-resolution = <255>;
+		ramp-up-us = <500>;
+		ramp-down-us = <1000>;
+		label = "white:second_backlight_cluster";
+		linux,default-trigger = "backlight";
+	};
+}
+
+For more product information please see the link below:
+http://www.ti.com/lit/ds/symlink/lm3697.pdf
diff --git a/Documentation/devicetree/bindings/leds/leds-spi-byte.txt b/Documentation/devicetree/bindings/leds/leds-spi-byte.txt
new file mode 100644
index 000000000000..28b6b2d9091e
--- /dev/null
+++ b/Documentation/devicetree/bindings/leds/leds-spi-byte.txt
@@ -0,0 +1,44 @@
+* Single Byte SPI LED Device Driver.
+
+The driver can be used for controllers with a very simple SPI protocol:
+- one LED is controlled by a single byte on MOSI
+- the value of the byte gives the brightness between two values (lowest to
+  highest)
+- no return value is necessary (no MISO signal)
+
+The value for lowest and highest brightness is dependent on the device and
+therefore on the compatible string.
+
+Depending on the compatible string some special functions (like hardware
+accelerated blinking) might can be supported too.
+
+The driver currently only supports one LED. The properties of the LED are
+configured in a sub-node in the device node.
+
+Required properties:
+- compatible: should be one of
+   * "ubnt,acb-spi-led"		microcontroller (SONiX 8F26E611LA) based device
+				used for example in Ubiquiti airCube ISP
+
+Property rules described in Documentation/devicetree/bindings/spi/spi-bus.txt
+apply.
+
+LED sub-node properties:
+- label:
+	see Documentation/devicetree/bindings/leds/common.txt
+- default-state:
+	see Documentation/devicetree/bindings/leds/common.txt
+	Only "on" and "off" are supported.
+
+Example:
+
+led-controller@0 {
+	compatible = "ubnt,acb-spi-led";
+	reg = <0>;
+	spi-max-frequency = <100000>;
+
+	led {
+		label = "white:status";
+		default-state = "on";
+	};
+};
diff --git a/Documentation/devicetree/bindings/mailbox/omap-mailbox.txt b/Documentation/devicetree/bindings/mailbox/omap-mailbox.txt
index 0ef372656a3e..35c3f56b7f7b 100644
--- a/Documentation/devicetree/bindings/mailbox/omap-mailbox.txt
+++ b/Documentation/devicetree/bindings/mailbox/omap-mailbox.txt
@@ -1,4 +1,4 @@
-OMAP2+ Mailbox Driver
+OMAP2+ and K3 Mailbox
 =====================
 
 The OMAP mailbox hardware facilitates communication between different processors
@@ -7,7 +7,7 @@ various processor subsystems and is connected on an interconnect bus. The
 communication is achieved through a set of registers for message storage and
 interrupt configuration registers.
 
-Each mailbox IP block has a certain number of h/w fifo queues and output
+Each mailbox IP block/cluster has a certain number of h/w fifo queues and output
 interrupt lines. An output interrupt line is routed to an interrupt controller
 within a processor subsystem, and there can be more than one line going to a
 specific processor's interrupt controller. The interrupt line connections are
@@ -23,12 +23,16 @@ All the current OMAP SoCs except for the newest DRA7xx SoC has a single IP
 instance. DRA7xx has multiple instances with different number of h/w fifo queues
 and interrupt lines between different instances. The interrupt lines can also be
 routed to different processor sub-systems on DRA7xx as they are routed through
-the Crossbar, a kind of interrupt router/multiplexer.
+the Crossbar, a kind of interrupt router/multiplexer. The K3 AM65x and J721E
+SoCs has each of these instances form a cluster and combine multiple clusters
+into a single IP block present within the Main NavSS. The interrupt lines from
+all these clusters are multiplexed and routed to different processor subsystems
+over a limited number of common interrupt output lines of an Interrupt Router.
 
 Mailbox Device Node:
 ====================
-A Mailbox device node is used to represent a Mailbox IP instance within a SoC.
-The sub-mailboxes are represented as child nodes of this parent node.
+A Mailbox device node is used to represent a Mailbox IP instance/cluster within
+a SoC. The sub-mailboxes are represented as child nodes of this parent node.
 
 Required properties:
 --------------------
@@ -37,12 +41,12 @@ Required properties:
 			    "ti,omap3-mailbox" for OMAP3430, OMAP3630 SoCs
 			    "ti,omap4-mailbox" for OMAP44xx, OMAP54xx, AM33xx,
 						   AM43xx and DRA7xx SoCs
+			    "ti,am654-mailbox" for K3 AM65x and J721E SoCs
 - reg:			Contains the mailbox register address range (base
 			address and length)
 - interrupts:		Contains the interrupt information for the mailbox
 			device. The format is dependent on which interrupt
-			controller the OMAP device uses
-- ti,hwmods:		Name of the hwmod associated with the mailbox
+			controller the Mailbox device uses
 - #mbox-cells:		Common mailbox binding property to identify the number
 			of cells required for the mailbox specifier. Should be
 			1
@@ -50,6 +54,23 @@ Required properties:
 			device can interrupt
 - ti,mbox-num-fifos:	Number of h/w fifo queues within the mailbox IP block
 
+SoC-specific Required properties:
+---------------------------------
+The following are mandatory properties for the OMAP architecture based SoCs
+only:
+- ti,hwmods:		Name of the hwmod associated with the mailbox. This
+			should be defined in the mailbox node only if the node
+			is not defined as a child node of a corresponding sysc
+			interconnect node.
+
+The following are mandatory properties for the K3 AM65x and J721E SoCs only:
+- interrupt-parent:	Should contain a phandle to the TI-SCI interrupt
+			controller node that is used to dynamically program
+			the interrupt routes between the IP and the main GIC
+			controllers. See the following binding for additional
+			details,
+			Documentation/devicetree/bindings/interrupt-controller/ti,sci-intr.txt
+
 Child Nodes:
 ============
 A child node is used for representing the actual sub-mailbox device that is
@@ -98,7 +119,7 @@ to be used by the client user.
 Example:
 --------
 
-/* OMAP4 */
+1. /* OMAP4 */
 mailbox: mailbox@4a0f4000 {
 	compatible = "ti,omap4-mailbox";
 	reg = <0x4a0f4000 0x200>;
@@ -123,7 +144,7 @@ dsp {
 	...
 };
 
-/* AM33xx */
+2. /* AM33xx */
 mailbox: mailbox@480c8000 {
 	compatible = "ti,omap4-mailbox";
 	reg = <0x480C8000 0x200>;
@@ -137,3 +158,23 @@ mailbox: mailbox@480c8000 {
 		ti,mbox-rx = <0 0 3>;
 	};
 };
+
+3. /* AM65x */
+&cbass_main {
+	cbass_main_navss: interconnect0 {
+		mailbox0_cluster0: mailbox@31f80000 {
+			compatible = "ti,am654-mailbox";
+			reg = <0x00 0x31f80000 0x00 0x200>;
+			#mbox-cells = <1>;
+			ti,mbox-num-users = <4>;
+			ti,mbox-num-fifos = <16>;
+			interrupt-parent = <&intr_main_navss>;
+			interrupts = <164 0>;
+
+			mbox_mcu_r5fss0_core0: mbox-mcu-r5fss0-core0 {
+				ti,mbox-tx = <1 0 0>;
+				ti,mbox-rx = <0 0 0>;
+			};
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/media/allegro.txt b/Documentation/devicetree/bindings/media/allegro.txt
new file mode 100644
index 000000000000..a92e2fbf26c9
--- /dev/null
+++ b/Documentation/devicetree/bindings/media/allegro.txt
@@ -0,0 +1,43 @@
+Device-tree bindings for the Allegro DVT video IP codecs present in the Xilinx
+ZynqMP SoC. The IP core may either be a H.264/H.265 encoder or H.264/H.265
+decoder ip core.
+
+Each actual codec engines is controlled by a microcontroller (MCU). Host
+software uses a provided mailbox interface to communicate with the MCU. The
+MCU share an interrupt.
+
+Required properties:
+  - compatible: value should be one of the following
+    "allegro,al5e-1.1", "allegro,al5e": encoder IP core
+    "allegro,al5d-1.1", "allegro,al5d": decoder IP core
+  - reg: base and length of the memory mapped register region and base and
+    length of the memory mapped sram
+  - reg-names: must include "regs" and "sram"
+  - interrupts: shared interrupt from the MCUs to the processing system
+  - clocks: must contain an entry for each entry in clock-names
+  - clock-names: must include "core_clk", "mcu_clk", "m_axi_core_aclk",
+    "m_axi_mcu_aclk", "s_axi_lite_aclk"
+
+Example:
+	al5e: video-codec@a0009000 {
+		compatible = "allegro,al5e-1.1", "allegro,al5e";
+		reg = <0 0xa0009000 0 0x1000>,
+		      <0 0xa0000000 0 0x8000>;
+		reg-names = "regs", "sram";
+		interrupts = <0 96 4>;
+		clocks = <&xlnx_vcu 0>, <&xlnx_vcu 1>,
+			 <&clkc 71>, <&clkc 71>, <&clkc 71>;
+		clock-names = "core_clk", "mcu_clk", "m_axi_core_aclk",
+			      "m_axi_mcu_aclk", "s_axi_lite_aclk"
+	};
+	al5d: video-codec@a0029000 {
+		compatible = "allegro,al5d-1.1", "allegro,al5d";
+		reg = <0 0xa0029000 0 0x1000>,
+		      <0 0xa0020000 0 0x8000>;
+		reg-names = "regs", "sram";
+		interrupts = <0 96 4>;
+		clocks = <&xlnx_vcu 2>, <&xlnx_vcu 3>,
+			 <&clkc 71>, <&clkc 71>, <&clkc 71>;
+		clock-names = "core_clk", "mcu_clk", "m_axi_core_aclk",
+			      "m_axi_mcu_aclk", "s_axi_lite_aclk"
+	};
diff --git a/Documentation/devicetree/bindings/media/amlogic,vdec.txt b/Documentation/devicetree/bindings/media/amlogic,vdec.txt
new file mode 100644
index 000000000000..aabdd01bcf32
--- /dev/null
+++ b/Documentation/devicetree/bindings/media/amlogic,vdec.txt
@@ -0,0 +1,71 @@
+Amlogic Video Decoder
+================================
+
+The video decoding IP lies within the DOS memory region,
+except for the hardware bitstream parser that makes use of an undocumented
+region.
+
+It makes use of the following blocks:
+
+- ESPARSER is a bitstream parser that outputs to a VIFIFO. Further VDEC blocks
+then feed from this VIFIFO.
+- VDEC_1 can decode MPEG-1, MPEG-2, MPEG-4 part 2, MJPEG, H.263, H.264, VC-1.
+- VDEC_HEVC can decode HEVC and VP9.
+
+Both VDEC_1 and VDEC_HEVC share the "vdec" IRQ and as such cannot run
+concurrently.
+
+Device Tree Bindings:
+---------------------
+
+VDEC: Video Decoder
+--------------------------
+
+Required properties:
+- compatible: value should be different for each SoC family as :
+	- GXBB (S905) : "amlogic,gxbb-vdec"
+	- GXL (S905X, S905D) : "amlogic,gxl-vdec"
+	- GXM (S912) : "amlogic,gxm-vdec"
+- reg: base address and size of he following memory-mapped regions :
+	- dos
+	- esparser
+- reg-names: should contain the names of the previous memory regions
+- interrupts: should contain the following IRQs:
+	- vdec
+	- esparser
+- interrupt-names: should contain the names of the previous interrupts
+- amlogic,ao-sysctrl: should point to the AOBUS sysctrl node
+- amlogic,canvas: should point to a canvas provider node
+- clocks: should contain the following clocks :
+	- dos_parser
+	- dos
+	- vdec_1
+	- vdec_hevc
+- clock-names: should contain the names of the previous clocks
+- resets: should contain the parser reset
+- reset-names: should be "esparser"
+
+Example:
+
+vdec: video-decoder@c8820000 {
+	compatible = "amlogic,gxbb-vdec";
+	reg = <0x0 0xc8820000 0x0 0x10000>,
+	      <0x0 0xc110a580 0x0 0xe4>;
+	reg-names = "dos", "esparser";
+
+	interrupts = <GIC_SPI 44 IRQ_TYPE_EDGE_RISING>,
+		     <GIC_SPI 32 IRQ_TYPE_EDGE_RISING>;
+	interrupt-names = "vdec", "esparser";
+
+	amlogic,ao-sysctrl = <&sysctrl_AO>;
+	amlogic,canvas = <&canvas>;
+
+	clocks = <&clkc CLKID_DOS_PARSER>,
+		 <&clkc CLKID_DOS>,
+		 <&clkc CLKID_VDEC_1>,
+		 <&clkc CLKID_VDEC_HEVC>;
+	clock-names = "dos_parser", "dos", "vdec_1", "vdec_hevc";
+
+	resets = <&reset RESET_PARSER>;
+	reset-names = "esparser";
+};
diff --git a/Documentation/devicetree/bindings/media/imx7-csi.txt b/Documentation/devicetree/bindings/media/imx7-csi.txt
index 3c07bc676bc3..443aef07356e 100644
--- a/Documentation/devicetree/bindings/media/imx7-csi.txt
+++ b/Documentation/devicetree/bindings/media/imx7-csi.txt
@@ -14,8 +14,7 @@ Required properties:
 - interrupts    : should contain CSI interrupt;
 - clocks        : list of clock specifiers, see
         Documentation/devicetree/bindings/clock/clock-bindings.txt for details;
-- clock-names   : must contain "axi", "mclk" and "dcic" entries, matching
-                 entries in the clock property;
+- clock-names   : must contain "mclk";
 
 The device node shall contain one 'port' child node with one child 'endpoint'
 node, according to the bindings defined in:
@@ -32,10 +31,8 @@ example:
                         compatible = "fsl,imx7-csi";
                         reg = <0x30710000 0x10000>;
                         interrupts = <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>;
-                        clocks = <&clks IMX7D_CLK_DUMMY>,
-                                        <&clks IMX7D_CSI_MCLK_ROOT_CLK>,
-                                        <&clks IMX7D_CLK_DUMMY>;
-                        clock-names = "axi", "mclk", "dcic";
+                        clocks = <&clks IMX7D_CSI_MCLK_ROOT_CLK>;
+                        clock-names = "mclk";
 
                         port {
                                 csi_from_csi_mux: endpoint {
diff --git a/Documentation/devicetree/bindings/media/marvell,mmp2-ccic.txt b/Documentation/devicetree/bindings/media/marvell,mmp2-ccic.txt
new file mode 100644
index 000000000000..7ec2c8c8a3b9
--- /dev/null
+++ b/Documentation/devicetree/bindings/media/marvell,mmp2-ccic.txt
@@ -0,0 +1,50 @@
+Marvell MMP2 camera host interface
+
+Required properties:
+ - compatible: Should be "marvell,mmp2-ccic".
+ - reg: Register base and size.
+ - interrupts: The interrupt number.
+ - #clock-cells: Must be 0.
+
+Optional properties:
+ - clocks: Reference to the input clock as specified by
+           Documentation/devicetree/bindings/clock/clock-bindings.txt.
+ - clock-names: Names of the clocks used; "axi" for the AXI bus interface,
+                "func" for the peripheral clock and "phy" for the parallel
+                video bus interface.
+ - clock-output-names: Optional clock source for sensors. Shall be "mclk".
+
+Required subnodes:
+ - port: The parallel bus interface port with a single endpoint linked to
+         the sensor's endpoint as described in
+         Documentation/devicetree/bindings/media/video-interfaces.txt.
+
+Required endpoint properties:
+ - bus-type: data bus type, <5> or <6> for Parallel or Bt.656 respectively
+ - pclk-sample: pixel clock polarity
+ - hsync-active: horizontal synchronization polarity (only required for
+   parallel bus)
+ - vsync-active: vertical synchronization polarity (only required for
+   parallel bus)
+
+Example:
+
+	camera0: camera@d420a000 {
+		compatible = "marvell,mmp2-ccic";
+		reg = <0xd420a000 0x800>;
+		interrupts = <42>;
+		clocks = <&soc_clocks MMP2_CLK_CCIC0>;
+		clock-names = "axi";
+		#clock-cells = <0>;
+		clock-output-names = "mclk";
+
+		port {
+			camera0_0: endpoint {
+				remote-endpoint = <&ov7670_0>;
+                                bus-type = <5>;      /* Parallel */
+                                hsync-active = <1>;  /* Active high */
+                                vsync-active = <1>;  /* Active high */
+                                pclk-sample = <0>;   /* Falling */
+			};
+		};
+	};
diff --git a/Documentation/devicetree/bindings/media/st,stm32-dcmi.txt b/Documentation/devicetree/bindings/media/st,stm32-dcmi.txt
index 249790a93017..3122ded82eb4 100644
--- a/Documentation/devicetree/bindings/media/st,stm32-dcmi.txt
+++ b/Documentation/devicetree/bindings/media/st,stm32-dcmi.txt
@@ -11,7 +11,7 @@ Required properties:
 - clock-names: must contain "mclk", which is the DCMI peripherial clock
 - pinctrl: the pincontrol settings to configure muxing properly
            for pins that connect to DCMI device.
-           See Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt.
+           See Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml.
 - dmas: phandle to DMA controller node,
         see Documentation/devicetree/bindings/dma/stm32-dma.txt
 - dma-names: must contain "tx", which is the transmit channel from DCMI to DMA
diff --git a/Documentation/devicetree/bindings/media/sun6i-csi.txt b/Documentation/devicetree/bindings/media/sun6i-csi.txt
index 0dd540bb03db..a2e3e56f0257 100644
--- a/Documentation/devicetree/bindings/media/sun6i-csi.txt
+++ b/Documentation/devicetree/bindings/media/sun6i-csi.txt
@@ -6,6 +6,7 @@ Allwinner V3s SoC features a CSI module(CSI1) with parallel interface.
 Required properties:
   - compatible: value must be one of:
     * "allwinner,sun6i-a31-csi"
+    * "allwinner,sun8i-a83t-csi"
     * "allwinner,sun8i-h3-csi"
     * "allwinner,sun8i-v3s-csi"
     * "allwinner,sun50i-a64-csi"
diff --git a/Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt b/Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt
index f936b5589b19..59b8dcc118ee 100644
--- a/Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt
+++ b/Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt
@@ -5,6 +5,7 @@ controller in Ingenic JZ4780
 
 Required properties:
 - compatible: Should be set to one of:
+    "ingenic,jz4740-nemc" (JZ4740)
     "ingenic,jz4780-nemc" (JZ4780)
 - reg: Should specify the NEMC controller registers location and length.
 - clocks: Clock for the NEMC controller.
diff --git a/Documentation/devicetree/bindings/mfd/atmel-usart.txt b/Documentation/devicetree/bindings/mfd/atmel-usart.txt
index 7f0cd72f47d2..699fd3c9ace8 100644
--- a/Documentation/devicetree/bindings/mfd/atmel-usart.txt
+++ b/Documentation/devicetree/bindings/mfd/atmel-usart.txt
@@ -17,17 +17,24 @@ Required properties for USART in SPI mode:
 - cs-gpios: chipselects (internal cs not supported)
 - atmel,usart-mode : Must be <AT91_USART_MODE_SPI> (found in dt-bindings/mfd/at91-usart.h)
 
+Optional properties in serial and SPI mode:
+- dma bindings for dma transfer:
+	- dmas: DMA specifier, consisting of a phandle to DMA controller node,
+		memory peripheral interface and USART DMA channel ID, FIFO configuration.
+		The order of DMA channels is fixed. The first DMA channel must be TX
+		associated channel and the second one must be RX associated channel.
+		Refer to dma.txt and atmel-dma.txt for details.
+	- dma-names: "tx" for TX channel.
+		     "rx" for RX channel.
+		     The order of dma-names is also fixed. The first name must be "tx"
+		     and the second one must be "rx" as in the examples below.
+
 Optional properties in serial mode:
 - atmel,use-dma-rx: use of PDC or DMA for receiving data
 - atmel,use-dma-tx: use of PDC or DMA for transmitting data
 - {rts,cts,dtr,dsr,rng,dcd}-gpios: specify a GPIO for RTS/CTS/DTR/DSR/RI/DCD line respectively.
   It will use specified PIO instead of the peripheral function pin for the USART feature.
   If unsure, don't specify this property.
-- add dma bindings for dma transfer:
-	- dmas: DMA specifier, consisting of a phandle to DMA controller node,
-		memory peripheral interface and USART DMA channel ID, FIFO configuration.
-		Refer to dma.txt and atmel-dma.txt for details.
-	- dma-names: "rx" for RX channel, "tx" for TX channel.
 - atmel,fifo-size: maximum number of data the RX and TX FIFOs can store for FIFO
   capable USARTs.
 - rs485-rts-delay, rs485-rx-during-tx, linux,rs485-enabled-at-boot-time: see rs485.txt
@@ -81,5 +88,8 @@ Example:
 		interrupts = <12 IRQ_TYPE_LEVEL_HIGH 5>;
 		clocks = <&usart0_clk>;
 		clock-names = "usart";
+		dmas = <&dma0 2 AT91_DMA_CFG_PER_ID(3)>,
+		       <&dma0 2 (AT91_DMA_CFG_PER_ID(4) | AT91_DMA_CFG_FIFOCFG_ASAP)>;
+		dma-names = "tx", "rx";
 		cs-gpios = <&pioB 3 0>;
 	};
diff --git a/Documentation/devicetree/bindings/mfd/cros-ec.txt b/Documentation/devicetree/bindings/mfd/cros-ec.txt
index 6245c9b1a68b..4860eabd0f72 100644
--- a/Documentation/devicetree/bindings/mfd/cros-ec.txt
+++ b/Documentation/devicetree/bindings/mfd/cros-ec.txt
@@ -3,7 +3,7 @@ ChromeOS Embedded Controller
 Google's ChromeOS EC is a Cortex-M device which talks to the AP and
 implements various function such as keyboard and battery charging.
 
-The EC can be connect through various means (I2C, SPI, LPC) and the
+The EC can be connect through various means (I2C, SPI, LPC, RPMSG) and the
 compatible string used depends on the interface. Each connection method has
 its own driver which connects to the top level interface-agnostic EC driver.
 Other Linux driver (such as cros-ec-keyb for the matrix keyboard) connect to
@@ -17,6 +17,9 @@ Required properties (SPI):
 - compatible: "google,cros-ec-spi"
 - reg: SPI chip select
 
+Required properties (RPMSG):
+- compatible: "google,cros-ec-rpmsg"
+
 Optional properties (SPI):
 - google,cros-ec-spi-pre-delay: Some implementations of the EC need a little
   time to wake up from sleep before they can receive SPI transfers at a high
diff --git a/Documentation/devicetree/bindings/mfd/lp87565.txt b/Documentation/devicetree/bindings/mfd/lp87565.txt
index a48df7c08ab0..41671e0dc26b 100644
--- a/Documentation/devicetree/bindings/mfd/lp87565.txt
+++ b/Documentation/devicetree/bindings/mfd/lp87565.txt
@@ -41,3 +41,39 @@ lp87565_pmic: pmic@60 {
 		};
 	};
 };
+
+TI LP87561 PMIC:
+
+This is a single output 4-phase regulator configuration
+
+Required properties:
+  - compatible:	"ti,lp87561-q1"
+  - reg:		I2C slave address.
+  - gpio-controller:	Marks the device node as a GPIO Controller.
+  - #gpio-cells:	Should be two.  The first cell is the pin number and
+			the second cell is used to specify flags.
+			See ../gpio/gpio.txt for more information.
+  - xxx-in-supply:	Phandle to parent supply node of each regulator
+			populated under regulators node. xxx should match
+			the supply_name populated in driver.
+Example:
+
+lp87561_pmic: pmic@62 {
+	compatible = "ti,lp87561-q1";
+	reg = <0x62>;
+	gpio-controller;
+	#gpio-cells = <2>;
+
+	buck3210-in-supply = <&vsys_3v3>;
+
+	regulators: regulators {
+		buck3210_reg: buck3210 {
+			/* VDD_CORE */
+			regulator-name = "buck3210";
+			regulator-min-microvolt = <800000>;
+			regulator-max-microvolt = <800000>;
+			regulator-always-on;
+			regulator-boot-on;
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/mfd/madera.txt b/Documentation/devicetree/bindings/mfd/madera.txt
index db3266088386..cad0f2800502 100644
--- a/Documentation/devicetree/bindings/mfd/madera.txt
+++ b/Documentation/devicetree/bindings/mfd/madera.txt
@@ -11,10 +11,14 @@ bindings/sound/madera.txt
 Required properties:
 
   - compatible : One of the following chip-specific strings:
+        "cirrus,cs47l15"
         "cirrus,cs47l35"
         "cirrus,cs47l85"
         "cirrus,cs47l90"
         "cirrus,cs47l91"
+        "cirrus,cs42l92"
+        "cirrus,cs47l92"
+        "cirrus,cs47l93"
         "cirrus,wm1840"
 
   - reg : I2C slave address when connected using I2C, chip select number when
@@ -22,7 +26,7 @@ Required properties:
 
   - DCVDD-supply : Power supply for the device as defined in
     bindings/regulator/regulator.txt
-    Mandatory on CS47L35, CS47L90, CS47L91
+    Mandatory on CS47L15, CS47L35, CS47L90, CS47L91, CS42L92, CS47L92, CS47L93
     Optional on CS47L85, WM1840
 
   - AVDD-supply, DBVDD1-supply, DBVDD2-supply, CPVDD1-supply, CPVDD2-supply :
@@ -35,7 +39,7 @@ Required properties:
     (CS47L85, WM1840)
 
   - SPKVDD-supply : Power supply for the device
-    (CS47L35)
+    (CS47L15, CS47L35)
 
   - interrupt-controller : Indicates that this device is an interrupt controller
 
diff --git a/Documentation/devicetree/bindings/mfd/rk808.txt b/Documentation/devicetree/bindings/mfd/rk808.txt
index 1683ec3245bc..04df07f6f793 100644
--- a/Documentation/devicetree/bindings/mfd/rk808.txt
+++ b/Documentation/devicetree/bindings/mfd/rk808.txt
@@ -3,11 +3,15 @@ RK8XX Power Management Integrated Circuit
 The rk8xx family current members:
 rk805
 rk808
+rk809
+rk817
 rk818
 
 Required properties:
 - compatible: "rockchip,rk805"
 - compatible: "rockchip,rk808"
+- compatible: "rockchip,rk809"
+- compatible: "rockchip,rk817"
 - compatible: "rockchip,rk818"
 - reg: I2C slave address
 - interrupts: the interrupt outputs of the controller.
@@ -45,6 +49,23 @@ Optional RK808 properties:
   the gpio controller. If DVS GPIOs aren't present, voltage changes will happen
   very quickly with no slow ramp time.
 
+Optional shared RK809 and RK817 properties:
+- vcc1-supply:  The input supply for DCDC_REG1
+- vcc2-supply:  The input supply for DCDC_REG2
+- vcc3-supply:  The input supply for DCDC_REG3
+- vcc4-supply:  The input supply for DCDC_REG4
+- vcc5-supply:  The input supply for LDO_REG1, LDO_REG2, LDO_REG3
+- vcc6-supply:  The input supply for LDO_REG4, LDO_REG5, LDO_REG6
+- vcc7-supply:  The input supply for LDO_REG7, LDO_REG8, LDO_REG9
+
+Optional RK809 properties:
+- vcc8-supply:  The input supply for SWITCH_REG1
+- vcc9-supply:  The input supply for DCDC_REG5, SWITCH_REG2
+
+Optional RK817 properties:
+- vcc8-supply:  The input supply for BOOST
+- vcc9-supply:  The input supply for OTG_SWITCH
+
 Optional RK818 properties:
 - vcc1-supply:  The input supply for DCDC_REG1
 - vcc2-supply:  The input supply for DCDC_REG2
@@ -86,6 +107,21 @@ number as described in RK808 datasheet.
 	- SWITCH_REGn
 		- valid values for n are 1 to 2
 
+Following regulators of the RK809 and RK817 PMIC blocks are supported. Note that
+the 'n' in regulator name, as in DCDC_REGn or LDOn, represents the DCDC or LDO
+number as described in RK809 and RK817 datasheets.
+
+	- DCDC_REGn
+		- valid values for n are 1 to 5 for RK809.
+		- valid values for n are 1 to 4 for RK817.
+	- LDO_REGn
+		- valid values for n are 1 to 9 for RK809.
+		- valid values for n are 1 to 9 for RK817.
+	- SWITCH_REGn
+		- valid values for n are 1 to 2 for RK809.
+	- BOOST for RK817
+	- OTG_SWITCH for RK817
+
 Following regulators of the RK818 PMIC block are supported. Note that
 the 'n' in regulator name, as in DCDC_REGn or LDOn, represents the DCDC or LDO
 number as described in RK818 datasheet.
@@ -98,6 +134,14 @@ number as described in RK818 datasheet.
 	- HDMI_SWITCH
 	- OTG_SWITCH
 
+It is necessary to configure three pins for both the RK809 and RK817, the three
+pins are "gpio_ts" "gpio_gt" "gpio_slp".
+	The gpio_gt and gpio_ts pins support the gpio function.
+	The gpio_slp pin is for controlling the pmic states, as below:
+		- reset
+		- power down
+		- sleep
+
 Standard regulator bindings are used inside regulator subnodes. Check
   Documentation/devicetree/bindings/regulator/regulator.txt
 for more details
diff --git a/Documentation/devicetree/bindings/mfd/rohm,bd70528-pmic.txt b/Documentation/devicetree/bindings/mfd/rohm,bd70528-pmic.txt
new file mode 100644
index 000000000000..c3c02ce73cde
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/rohm,bd70528-pmic.txt
@@ -0,0 +1,102 @@
+* ROHM BD70528 Power Management Integrated Circuit bindings
+
+BD70528MWV is an ultra-low quiescent current general purpose, single-chip,
+power management IC for battery-powered portable devices. The IC
+integrates 3 ultra-low current consumption buck converters, 3 LDOs and 2
+LED Drivers. Also included are 4 GPIOs, a real-time clock (RTC), a 32kHz
+clock gate, high-accuracy VREF for use with an external ADC, flexible
+dual-input power path, 10 bit SAR ADC for battery temperature monitor and
+1S battery charger with scalable charge currents.
+
+Required properties:
+ - compatible		: Should be "rohm,bd70528"
+ - reg			: I2C slave address.
+ - interrupts		: The interrupt line the device is connected to.
+ - interrupt-controller	: To indicate BD70528 acts as an interrupt controller.
+ - #interrupt-cells	: Should be 2. Usage is compliant to the 2 cells
+			  variant of ../interrupt-controller/interrupts.txt
+ - gpio-controller	: To indicate BD70528 acts as a GPIO controller.
+ - #gpio-cells		: Should be 2. The first cell is the pin number and
+			  the second cell is used to specify flags. See
+			  ../gpio/gpio.txt for more information.
+ - #clock-cells		: Should be 0.
+ - regulators:		: List of child nodes that specify the regulators.
+			  Please see ../regulator/rohm,bd70528-regulator.txt
+
+Optional properties:
+ - clock-output-names	: Should contain name for output clock.
+
+Example:
+/* External oscillator */
+osc: oscillator {
+	compatible = "fixed-clock";
+	#clock-cells = <1>;
+	clock-frequency  = <32768>;
+	clock-output-names = "osc";
+};
+
+pmic: pmic@4b {
+	compatible = "rohm,bd70528";
+	reg = <0x4b>;
+	interrupt-parent = <&gpio1>;
+	interrupts = <29 GPIO_ACTIVE_LOW>;
+	clocks = <&osc 0>;
+	#clock-cells = <0>;
+	clock-output-names = "bd70528-32k-out";
+	#gpio-cells = <2>;
+	gpio-controller;
+	interrupt-controller;
+	#interrupt-cells = <2>;
+
+	regulators {
+		buck1: BUCK1 {
+			regulator-name = "buck1";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <3400000>;
+			regulator-boot-on;
+			regulator-ramp-delay = <125>;
+		};
+		buck2: BUCK2 {
+			regulator-name = "buck2";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-boot-on;
+			regulator-ramp-delay = <125>;
+		};
+		buck3: BUCK3 {
+			regulator-name = "buck3";
+			regulator-min-microvolt = <800000>;
+			regulator-max-microvolt = <1800000>;
+			regulator-boot-on;
+			regulator-ramp-delay = <250>;
+		};
+		ldo1: LDO1 {
+			regulator-name = "ldo1";
+			regulator-min-microvolt = <1650000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-boot-on;
+		};
+		ldo2: LDO2 {
+			regulator-name = "ldo2";
+			regulator-min-microvolt = <1650000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-boot-on;
+		};
+
+		ldo3: LDO3 {
+			regulator-name = "ldo3";
+			regulator-min-microvolt = <1650000>;
+			regulator-max-microvolt = <3300000>;
+		};
+		led_ldo1: LED_LDO1 {
+			regulator-name = "led_ldo1";
+			regulator-min-microvolt = <200000>;
+			regulator-max-microvolt = <300000>;
+		};
+		led_ldo2: LED_LDO2 {
+			regulator-name = "led_ldo2";
+			regulator-min-microvolt = <200000>;
+			regulator-max-microvolt = <300000>;
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt b/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt
index d5f68ac78d15..f22d74c7a8db 100644
--- a/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt
+++ b/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt
@@ -8,6 +8,8 @@ and 6 LDOs.
 
 Datasheet for BD71837 is available at:
 https://www.rohm.com/datasheet/BD71837MWV/bd71837mwv-e
+Datasheet for BD71847 is available at:
+https://www.rohm.com/datasheet/BD71847AMWV/bd71847amwv-e
 
 Required properties:
  - compatible		: Should be "rohm,bd71837" for bd71837
@@ -38,6 +40,14 @@ target state is set to READY by default. If SNVS state is used the boot
 crucial regulators must have the regulator-always-on and regulator-boot-on
 properties set in regulator node.
 
+- rohm,short-press-ms	: Short press duration in milliseconds
+- rohm,long-press-ms	: Long press duration in milliseconds
+
+Configure the "short press" and "long press" timers for the power button.
+Values are rounded to what hardware supports (500ms multiple for short and
+1000ms multiple for long). If these properties are not present the existing
+configuration (from bootloader or OTP) is not touched.
+
 Example:
 
 	/* external oscillator node */
diff --git a/Documentation/devicetree/bindings/mfd/ti-lmu.txt b/Documentation/devicetree/bindings/mfd/ti-lmu.txt
index 86ca786d54fc..2296b8f24de4 100644
--- a/Documentation/devicetree/bindings/mfd/ti-lmu.txt
+++ b/Documentation/devicetree/bindings/mfd/ti-lmu.txt
@@ -8,7 +8,7 @@ TI LMU driver supports lighting devices below.
   LM3632       Backlight and regulator
   LM3633       Backlight, LED and fault monitor
   LM3695       Backlight
-  LM3697       Backlight and fault monitor
+  LM36274      Backlight and regulator
 
 Required properties:
   - compatible: Should be one of:
@@ -16,15 +16,32 @@ Required properties:
                 "ti,lm3632"
                 "ti,lm3633"
                 "ti,lm3695"
-                "ti,lm3697"
+		"ti,lm36274"
   - reg: I2C slave address.
          0x11 for LM3632
          0x29 for LM3631
-         0x36 for LM3633, LM3697
+         0x36 for LM3633
          0x63 for LM3695
+         0x11 for LM36274
 
-Optional property:
+Optional properties:
   - enable-gpios: A GPIO specifier for hardware enable pin.
+  - ramp-up-us: Current ramping from one brightness level to
+		the a higher brightness level.
+		Range from 2048 us - 117.44 s
+  - ramp-down-us: Current ramping from one brightness level to
+		  the a lower brightness level.
+		  Range from 2048 us - 117.44 s
+  - ti,brightness-resolution - This determines whether to use 8 bit brightness
+			       mode or 11 bit brightness mode.  If this value is
+			       not set the device is defaulted to the preferred
+			       8bit brightness mode per 7.3.4.1 of the data
+			       sheet.  This setting can either be in the parent
+			       node or as part of the LED child nodes.  This
+			       is determined by the part itself if the strings
+			       have a common brightness register or individual
+			       brightness registers.
+			       The values are 255 (8bit) or 2047 (11bit).
 
 Required node:
   - backlight: All LMU devices have backlight child nodes.
@@ -35,14 +52,15 @@ Optional nodes:
     Required properties:
       - compatible: Should be one of:
                     "ti,lm3633-fault-monitor"
-                    "ti,lm3697-fault-monitor"
   - leds: LED properties for LM3633. Please refer to [2].
+	  LED properties for LM36274. Please refer to [4].
   - regulators: Regulator properties for LM3631 and LM3632.
                 Please refer to [3].
 
 [1] ../leds/backlight/ti-lmu-backlight.txt
 [2] ../leds/leds-lm3633.txt
 [3] ../regulator/lm363x-regulator.txt
+[4] ../leds/leds-lm36274.txt
 
 lm3631@29 {
 	compatible = "ti,lm3631";
@@ -90,7 +108,7 @@ lm3631@29 {
 
 		lcd_bl {
 			led-sources = <0 1>;
-			ramp-up-msec = <300>;
+			ramp-up-us = <300000>;
 		};
 	};
 };
@@ -152,15 +170,15 @@ lm3633@36 {
 		main {
 			label = "main_lcd";
 			led-sources = <1 2>;
-			ramp-up-msec = <500>;
-			ramp-down-msec = <500>;
+			ramp-up-us = <500000>;
+			ramp-down-us = <500000>;
 		};
 
 		front {
 			label = "front_lcd";
 			led-sources = <0>;
-			ramp-up-msec = <1000>;
-			ramp-down-msec = <0>;
+			ramp-up-us = <1000000>;
+			ramp-down-us = <0>;
 		};
 	};
 
@@ -201,23 +219,51 @@ lm3695@63 {
 	};
 };
 
-lm3697@36 {
-	compatible = "ti,lm3697";
-	reg = <0x36>;
+lm36274@11 {
+	compatible = "ti,lm36274";
+	#address-cells = <1>;
+	#size-cells = <0>;
+	reg = <0x11>;
 
 	enable-gpios = <&pioC 2 GPIO_ACTIVE_HIGH>;
+	regulators {
+		#address-cells = <1>;
+		#size-cells = <0>;
+		compatible = "ti,lm363x-regulator";
 
-	backlight {
-		compatible = "ti,lm3697-backlight";
+		enable-gpios = <&pioC 0 GPIO_ACTIVE_HIGH>,
+			       <&pioC 1 GPIO_ACTIVE_HIGH>;
 
-		lcd {
-			led-sources = <0 1 2>;
-			ramp-up-msec = <200>;
-			ramp-down-msec = <200>;
+		vboost {
+			regulator-name = "lcd_boost";
+			regulator-min-microvolt = <4000000>;
+			regulator-max-microvolt = <7150000>;
+			regulator-always-on;
+		};
+
+		vpos {
+			regulator-name = "lcd_vpos";
+			regulator-min-microvolt = <4000000>;
+			regulator-max-microvolt = <6500000>;
+		};
+
+		vneg {
+			regulator-name = "lcd_vneg";
+			regulator-min-microvolt = <4000000>;
+			regulator-max-microvolt = <6500000>;
 		};
 	};
 
-	fault-monitor {
-		compatible = "ti,lm3697-fault-monitor";
+	backlight {
+		#address-cells = <1>;
+		#size-cells = <0>;
+		compatible = "ti,lm36274-backlight";
+
+		led@0 {
+			reg = <0>;
+			led-sources = <0 2>;
+			label = "white:backlight_cluster";
+			linux,default-trigger = "backlight";
+		};
 	};
 };
diff --git a/Documentation/devicetree/bindings/misc/fsl,dpaa2-console.txt b/Documentation/devicetree/bindings/misc/fsl,dpaa2-console.txt
new file mode 100644
index 000000000000..1442ba5d2d98
--- /dev/null
+++ b/Documentation/devicetree/bindings/misc/fsl,dpaa2-console.txt
@@ -0,0 +1,11 @@
+DPAA2 console support
+
+Required properties:
+
+    - compatible
+        Value type: <string>
+        Definition: Must be "fsl,dpaa2-console".
+    - reg
+        Value type: <prop-encoded-array>
+        Definition: A standard property.  Specifies the region where the MCFBA
+                    (MC firmware base address) register can be found.
diff --git a/Documentation/devicetree/bindings/misc/olpc,xo1.75-ec.txt b/Documentation/devicetree/bindings/misc/olpc,xo1.75-ec.txt
new file mode 100644
index 000000000000..8c4d649cdd8f
--- /dev/null
+++ b/Documentation/devicetree/bindings/misc/olpc,xo1.75-ec.txt
@@ -0,0 +1,23 @@
+OLPC XO-1.75 Embedded Controller
+
+Required properties:
+- compatible: Should be "olpc,xo1.75-ec".
+- cmd-gpios: gpio specifier of the CMD pin
+
+The embedded controller requires the SPI controller driver to signal readiness
+to receive a transfer (that is, when TX FIFO contains the response data) by
+strobing the ACK pin with the ready signal. See the "ready-gpios" property of the
+SSP binding as documented in:
+<Documentation/devicetree/bindings/spi/spi-pxa2xx.txt>.
+
+Example:
+	&ssp3 {
+		spi-slave;
+		ready-gpios = <&gpio 125 GPIO_ACTIVE_HIGH>;
+
+		slave {
+			compatible = "olpc,xo1.75-ec";
+			spi-cpha;
+			cmd-gpios = <&gpio 155 GPIO_ACTIVE_HIGH>;
+		};
+	};
diff --git a/Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt b/Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt
new file mode 100644
index 000000000000..e3289634fa30
--- /dev/null
+++ b/Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt
@@ -0,0 +1,58 @@
+* Xilinx SDFEC(16nm) IP *
+
+The Soft Decision Forward Error Correction (SDFEC) Engine is a Hard IP block
+which provides high-throughput LDPC and Turbo Code implementations.
+The LDPC decode & encode functionality is capable of covering a range of
+customer specified Quasi-cyclic (QC) codes. The Turbo decode functionality
+principally covers codes used by LTE. The FEC Engine offers significant
+power and area savings versus implementations done in the FPGA fabric.
+
+
+Required properties:
+- compatible: Must be "xlnx,sd-fec-1.1"
+- clock-names : List of input clock names from the following:
+    - "core_clk", Main processing clock for processing core (required)
+    - "s_axi_aclk", AXI4-Lite memory-mapped slave interface clock (required)
+    - "s_axis_din_aclk", DIN AXI4-Stream Slave interface clock (optional)
+    - "s_axis_din_words-aclk", DIN_WORDS AXI4-Stream Slave interface clock (optional)
+    - "s_axis_ctrl_aclk",  Control input AXI4-Stream Slave interface clock (optional)
+    - "m_axis_dout_aclk", DOUT AXI4-Stream Master interface clock (optional)
+    - "m_axis_dout_words_aclk", DOUT_WORDS AXI4-Stream Master interface clock (optional)
+    - "m_axis_status_aclk", Status output AXI4-Stream Master interface clock (optional)
+- clocks : Clock phandles (see clock_bindings.txt for details).
+- reg: Should contain Xilinx SDFEC 16nm Hardened IP block registers
+  location and length.
+- xlnx,sdfec-code : Should contain "ldpc" or "turbo" to describe the codes
+  being used.
+- xlnx,sdfec-din-words : A value 0 indicates that the DIN_WORDS interface is
+  driven with a fixed value and is not present on the device, a value of 1
+  configures the DIN_WORDS to be block based, while a value of 2 configures the
+  DIN_WORDS input to be supplied for each AXI transaction.
+- xlnx,sdfec-din-width : Configures the DIN AXI stream where a value of 1
+  configures a width of "1x128b", 2 a width of "2x128b" and 4 configures a width
+  of "4x128b".
+- xlnx,sdfec-dout-words : A value 0 indicates that the DOUT_WORDS interface is
+  driven with a fixed value and is not present on the device, a value of 1
+  configures the DOUT_WORDS to be block based, while a value of 2 configures the
+  DOUT_WORDS input to be supplied for each AXI transaction.
+- xlnx,sdfec-dout-width : Configures the DOUT AXI stream where a value of 1
+  configures a width of "1x128b", 2 a width of "2x128b" and 4 configures a width
+  of "4x128b".
+Optional properties:
+- interrupts: should contain SDFEC interrupt number
+
+Example
+---------------------------------------
+	sd_fec_0: sd-fec@a0040000 {
+		compatible = "xlnx,sd-fec-1.1";
+		clock-names = "core_clk","s_axi_aclk","s_axis_ctrl_aclk","s_axis_din_aclk","m_axis_status_aclk","m_axis_dout_aclk";
+		clocks = <&misc_clk_2>,<&misc_clk_0>,<&misc_clk_1>,<&misc_clk_1>,<&misc_clk_1>, <&misc_clk_1>;
+		reg = <0x0 0xa0040000 0x0 0x40000>;
+		interrupt-parent = <&axi_intc>;
+		interrupts = <1 0>;
+		xlnx,sdfec-code = "ldpc";
+		xlnx,sdfec-din-words = <0>;
+		xlnx,sdfec-din-width = <2>;
+		xlnx,sdfec-dout-words = <0>;
+		xlnx,sdfec-dout-width = <1>;
+	};
diff --git a/Documentation/devicetree/bindings/mmc/allwinner,sun4i-a10-mmc.yaml b/Documentation/devicetree/bindings/mmc/allwinner,sun4i-a10-mmc.yaml
new file mode 100644
index 000000000000..df0280edef97
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/allwinner,sun4i-a10-mmc.yaml
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mmc/allwinner,sun4i-a10-mmc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 MMC Controller Device Tree Bindings
+
+allOf:
+  - $ref: "mmc-controller.yaml"
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  "#address-cells": true
+  "#size-cells": true
+
+  compatible:
+    oneOf:
+      - const: allwinner,sun4i-a10-mmc
+      - const: allwinner,sun5i-a13-mmc
+      - const: allwinner,sun7i-a20-mmc
+      - const: allwinner,sun8i-a83t-emmc
+      - const: allwinner,sun9i-a80-mmc
+      - const: allwinner,sun50i-a64-emmc
+      - const: allwinner,sun50i-a64-mmc
+      - items:
+          - const: allwinner,sun8i-a83t-mmc
+          - const: allwinner,sun7i-a20-mmc
+      - items:
+          - const: allwinner,sun50i-h6-emmc
+          - const: allwinner,sun50i-a64-emmc
+      - items:
+          - const: allwinner,sun50i-h6-mmc
+          - const: allwinner,sun50i-a64-mmc
+      - items:
+          - const: allwinner,sun8i-r40-emmc
+          - const: allwinner,sun50i-a64-emmc
+      - items:
+          - const: allwinner,sun8i-r40-mmc
+          - const: allwinner,sun50i-a64-mmc
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    minItems: 2
+    maxItems: 4
+    items:
+      - description: Bus Clock
+      - description: Module Clock
+      - description: Output Clock
+      - description: Sample Clock
+
+  clock-names:
+    minItems: 2
+    maxItems: 4
+    items:
+      - const: ahb
+      - const: mmc
+      - const: output
+      - const: sample
+
+  resets:
+    maxItems: 1
+
+  reset-names:
+    const: ahb
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - clock-names
+
+examples:
+  - |
+    mmc0: mmc@1c0f000 {
+        compatible = "allwinner,sun5i-a13-mmc";
+        reg = <0x01c0f000 0x1000>;
+        clocks = <&ahb_gates 8>, <&mmc0_clk>;
+        clock-names = "ahb", "mmc";
+        interrupts = <32>;
+        bus-width = <4>;
+        cd-gpios = <&pio 7 1 0>;
+    };
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt b/Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt
index 13e70409e8ac..ccc5358db131 100644
--- a/Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt
+++ b/Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt
@@ -22,6 +22,10 @@ Required properties:
   clock rate requested by the MMC core.
 - resets     : phandle of the internal reset line
 
+Optional properties:
+- amlogic,dram-access-quirk: set when controller's internal DMA engine cannot access the
+  DRAM memory, like on the G12A dedicated SDIO controller.
+
 Example:
 
 	sd_emmc_a: mmc@70000 {
diff --git a/Documentation/devicetree/bindings/mmc/mmc-controller.yaml b/Documentation/devicetree/bindings/mmc/mmc-controller.yaml
new file mode 100644
index 000000000000..080754e0ef35
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/mmc-controller.yaml
@@ -0,0 +1,374 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mmc/mmc-controller.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MMC Controller Generic Binding
+
+maintainers:
+  - Ulf Hansson <ulf.hansson@linaro.org>
+
+description: |
+  These properties are common to multiple MMC host controllers. Any host
+  that requires the respective functionality should implement them using
+  these definitions.
+
+properties:
+  $nodename:
+    pattern: "^mmc(@.*)?$"
+
+  "#address-cells":
+    const: 1
+    description: |
+      The cell is the slot ID if a function subnode is used.
+
+  "#size-cells":
+    const: 0
+
+  # Card Detection.
+  # If none of these properties are supplied, the host native card
+  # detect will be used. Only one of them should be provided.
+
+  broken-cd:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      There is no card detection available; polling must be used.
+
+  cd-gpios:
+    description:
+      The card detection will be done using the GPIO provided.
+
+  non-removable:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      Non-removable slot (like eMMC); assume always present.
+
+  # *NOTE* on CD and WP polarity. To use common for all SD/MMC host
+  # controllers line polarity properties, we have to fix the meaning
+  # of the "normal" and "inverted" line levels. We choose to follow
+  # the SDHCI standard, which specifies both those lines as "active
+  # low." Therefore, using the "cd-inverted" property means, that the
+  # CD line is active high, i.e. it is high, when a card is
+  # inserted. Similar logic applies to the "wp-inverted" property.
+  #
+  # CD and WP lines can be implemented on the hardware in one of two
+  # ways: as GPIOs, specified in cd-gpios and wp-gpios properties, or
+  # as dedicated pins. Polarity of dedicated pins can be specified,
+  # using *-inverted properties. GPIO polarity can also be specified
+  # using the GPIO_ACTIVE_LOW flag. This creates an ambiguity in the
+  # latter case. We choose to use the XOR logic for GPIO CD and WP
+  # lines.  This means, the two properties are "superimposed," for
+  # example leaving the GPIO_ACTIVE_LOW flag clear and specifying the
+  # respective *-inverted property property results in a
+  # double-inversion and actually means the "normal" line polarity is
+  # in effect.
+  wp-inverted:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      The Write Protect line polarity is inverted.
+
+  cd-inverted:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      The CD line polarity is inverted.
+
+  # Other properties
+
+  bus-width:
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32
+      - enum: [1, 4, 8]
+        default: 1
+    description:
+      Number of data lines.
+
+  max-frequency:
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32
+      - minimum: 400000
+      - maximum: 200000000
+    description:
+      Maximum operating frequency of the bus.
+
+  disable-wp:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      When set, no physical write-protect line is present. This
+      property should only be specified when the controller has a
+      dedicated write-protect detection logic. If a GPIO is always
+      used for the write-protect detection. If a GPIO is always used
+      for the write-protect detection logic, it is sufficient to not
+      specify the wp-gpios property in the absence of a write-protect
+      line.
+
+  wp-gpios:
+    description:
+      GPIO to use for the write-protect detection.
+
+  cd-debounce-delay-ms:
+    description:
+      Set delay time before detecting card after card insert
+      interrupt.
+
+  no-1-8-v:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      When specified, denotes that 1.8V card voltage is not supported
+      on this system, even if the controller claims it.
+
+  cap-sd-highspeed:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      SD high-speed timing is supported.
+
+  cap-mmc-highspeed:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      MMC high-speed timing is supported.
+
+  sd-uhs-sdr12:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      SD UHS SDR12 speed is supported.
+
+  sd-uhs-sdr25:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      SD UHS SDR25 speed is supported.
+
+  sd-uhs-sdr50:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      SD UHS SDR50 speed is supported.
+
+  sd-uhs-sdr104:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      SD UHS SDR104 speed is supported.
+
+  sd-uhs-ddr50:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      SD UHS DDR50 speed is supported.
+
+  cap-power-off-card:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      Powering off the card is safe.
+
+  cap-mmc-hw-reset:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      eMMC hardware reset is supported
+
+  cap-sdio-irq:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      enable SDIO IRQ signalling on this interface
+
+  full-pwr-cycle:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      Full power cycle of the card is supported.
+
+  mmc-ddr-1_2v:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      eMMC high-speed DDR mode (1.2V I/O) is supported.
+
+  mmc-ddr-1_8v:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      eMMC high-speed DDR mode (1.8V I/O) is supported.
+
+  mmc-ddr-3_3v:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      eMMC high-speed DDR mode (3.3V I/O) is supported.
+
+  mmc-hs200-1_2v:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      eMMC HS200 mode (1.2V I/O) is supported.
+
+  mmc-hs200-1_8v:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      eMMC HS200 mode (1.8V I/O) is supported.
+
+  mmc-hs400-1_2v:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      eMMC HS400 mode (1.2V I/O) is supported.
+
+  mmc-hs400-1_8v:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      eMMC HS400 mode (1.8V I/O) is supported.
+
+  mmc-hs400-enhanced-strobe:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      eMMC HS400 enhanced strobe mode is supported
+
+  dsr:
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32
+      - minimum: 0
+      - maximum: 0xffff
+    description:
+      Value the card Driver Stage Register (DSR) should be programmed
+      with.
+
+  no-sdio:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      Controller is limited to send SDIO commands during
+      initialization.
+
+  no-sd:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      Controller is limited to send SD commands during initialization.
+
+  no-mmc:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      Controller is limited to send MMC commands during
+      initialization.
+
+  fixed-emmc-driver-type:
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32
+      - minimum: 0
+      - maximum: 4
+    description:
+      For non-removable eMMC, enforce this driver type. The value is
+      the driver type as specified in the eMMC specification (table
+      206 in spec version 5.1)
+
+  post-power-on-delay-ms:
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32
+      - default: 10
+    description:
+      It was invented for MMC pwrseq-simple which could be referred to
+      mmc-pwrseq-simple.txt. But now it\'s reused as a tunable delay
+      waiting for I/O signalling and card power supply to be stable,
+      regardless of whether pwrseq-simple is used. Default to 10ms if
+      no available.
+
+  supports-cqe:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      The presence of this property indicates that the corresponding
+      MMC host controller supports HW command queue feature.
+
+  disable-cqe-dcmd:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      The presence of this property indicates that the MMC
+      controller\'s command queue engine (CQE) does not support direct
+      commands (DCMDs).
+
+  keep-power-in-suspend:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      SDIO only. Preserves card power during a suspend/resume cycle.
+
+  # Deprecated: enable-sdio-wakeup
+  wakeup-source:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      SDIO only. Enables wake up of host system on SDIO IRQ assertion.
+
+  vmmc-supply:
+    description:
+      Supply for the card power
+
+  vqmmc-supply:
+    description:
+      Supply for the bus IO line power
+
+  mmc-pwrseq:
+    $ref: /schemas/types.yaml#/definitions/phandle
+    description:
+      System-on-Chip designs may specify a specific MMC power
+      sequence. To successfully detect an (e)MMC/SD/SDIO card, that
+      power sequence must be maintained while initializing the card.
+
+patternProperties:
+  "^.*@[0-9]+$":
+    type: object
+    description: |
+      On embedded systems the cards connected to a host may need
+      additional properties. These can be specified in subnodes to the
+      host controller node. The subnodes are identified by the
+      standard \'reg\' property. Which information exactly can be
+      specified depends on the bindings for the SDIO function driver
+      for the subnode, as specified by the compatible string.
+
+    properties:
+      compatible:
+        description: |
+          Name of SDIO function following generic names recommended
+          practice
+
+      reg:
+        items:
+          - minimum: 0
+            maximum: 7
+            description:
+              Must contain the SDIO function number of the function this
+              subnode describes. A value of 0 denotes the memory SD
+              function, values from 1 to 7 denote the SDIO functions.
+
+      broken-hpi:
+        $ref: /schemas/types.yaml#/definitions/flag
+        description:
+          Use this to indicate that the mmc-card has a broken hpi
+          implementation, and that hpi should not be used.
+
+    required:
+      - reg
+
+dependencies:
+  cd-debounce-delay-ms: [ cd-gpios ]
+  fixed-emmc-driver-type: [ non-removable ]
+
+examples:
+  - |
+    sdhci@ab000000 {
+        compatible = "sdhci";
+        reg = <0xab000000 0x200>;
+        interrupts = <23>;
+        bus-width = <4>;
+        cd-gpios = <&gpio 69 0>;
+        cd-inverted;
+        wp-gpios = <&gpio 70 0>;
+        max-frequency = <50000000>;
+        keep-power-in-suspend;
+        wakeup-source;
+        mmc-pwrseq = <&sdhci0_pwrseq>;
+    };
+
+  - |
+    mmc3: mmc@1c12000 {
+        #address-cells = <1>;
+        #size-cells = <0>;
+        pinctrl-names = "default";
+        pinctrl-0 = <&mmc3_pins_a>;
+        vmmc-supply = <&reg_vmmc3>;
+        bus-width = <4>;
+        non-removable;
+        mmc-pwrseq = <&sdhci0_pwrseq>;
+
+        brcmf: bcrmf@1 {
+            reg = <1>;
+            compatible = "brcm,bcm43xx-fmac";
+            interrupt-parent = <&pio>;
+            interrupts = <10 8>;
+            interrupt-names = "host-wake";
+        };
+    };
diff --git a/Documentation/devicetree/bindings/mmc/mmc.txt b/Documentation/devicetree/bindings/mmc/mmc.txt
index c269dbe384fe..bf9d7d3febf1 100644
--- a/Documentation/devicetree/bindings/mmc/mmc.txt
+++ b/Documentation/devicetree/bindings/mmc/mmc.txt
@@ -1,177 +1 @@
-These properties are common to multiple MMC host controllers. Any host
-that requires the respective functionality should implement them using
-these definitions.
-
-Interpreted by the OF core:
-- reg: Registers location and length.
-- interrupts: Interrupts used by the MMC controller.
-
-Card detection:
-If no property below is supplied, host native card detect is used.
-Only one of the properties in this section should be supplied:
-  - broken-cd: There is no card detection available; polling must be used.
-  - cd-gpios: Specify GPIOs for card detection, see gpio binding
-  - non-removable: non-removable slot (like eMMC); assume always present.
-
-Optional properties:
-- bus-width: Number of data lines, can be <1>, <4>, or <8>.  The default
-  will be <1> if the property is absent.
-- wp-gpios: Specify GPIOs for write protection, see gpio binding
-- cd-inverted: when present, polarity on the CD line is inverted. See the note
-  below for the case, when a GPIO is used for the CD line
-- cd-debounce-delay-ms: Set delay time before detecting card after card insert interrupt.
-  It's only valid when cd-gpios is present.
-- wp-inverted: when present, polarity on the WP line is inverted. See the note
-  below for the case, when a GPIO is used for the WP line
-- disable-wp: When set no physical WP line is present. This property should
-  only be specified when the controller has a dedicated write-protect
-  detection logic. If a GPIO is always used for the write-protect detection
-  logic it is sufficient to not specify wp-gpios property in the absence of a WP
-  line.
-- max-frequency: maximum operating clock frequency
-- no-1-8-v: when present, denotes that 1.8v card voltage is not supported on
-  this system, even if the controller claims it is.
-- cap-sd-highspeed: SD high-speed timing is supported
-- cap-mmc-highspeed: MMC high-speed timing is supported
-- sd-uhs-sdr12: SD UHS SDR12 speed is supported
-- sd-uhs-sdr25: SD UHS SDR25 speed is supported
-- sd-uhs-sdr50: SD UHS SDR50 speed is supported
-- sd-uhs-sdr104: SD UHS SDR104 speed is supported
-- sd-uhs-ddr50: SD UHS DDR50 speed is supported
-- cap-power-off-card: powering off the card is safe
-- cap-mmc-hw-reset: eMMC hardware reset is supported
-- cap-sdio-irq: enable SDIO IRQ signalling on this interface
-- full-pwr-cycle: full power cycle of the card is supported
-- mmc-ddr-3_3v: eMMC high-speed DDR mode(3.3V I/O) is supported
-- mmc-ddr-1_8v: eMMC high-speed DDR mode(1.8V I/O) is supported
-- mmc-ddr-1_2v: eMMC high-speed DDR mode(1.2V I/O) is supported
-- mmc-hs200-1_8v: eMMC HS200 mode(1.8V I/O) is supported
-- mmc-hs200-1_2v: eMMC HS200 mode(1.2V I/O) is supported
-- mmc-hs400-1_8v: eMMC HS400 mode(1.8V I/O) is supported
-- mmc-hs400-1_2v: eMMC HS400 mode(1.2V I/O) is supported
-- mmc-hs400-enhanced-strobe: eMMC HS400 enhanced strobe mode is supported
-- dsr: Value the card's (optional) Driver Stage Register (DSR) should be
-  programmed with. Valid range: [0 .. 0xffff].
-- no-sdio: controller is limited to send sdio cmd during initialization
-- no-sd: controller is limited to send sd cmd during initialization
-- no-mmc: controller is limited to send mmc cmd during initialization
-- fixed-emmc-driver-type: for non-removable eMMC, enforce this driver type.
-  The value <n> is the driver type as specified in the eMMC specification
-  (table 206 in spec version 5.1).
-- post-power-on-delay-ms : It was invented for MMC pwrseq-simple which could
-  be referred to mmc-pwrseq-simple.txt. But now it's reused as a tunable delay
-  waiting for I/O signalling and card power supply to be stable, regardless of
-  whether pwrseq-simple is used. Default to 10ms if no available.
-- supports-cqe : The presence of this property indicates that the corresponding
-  MMC host controller supports HW command queue feature.
-- disable-cqe-dcmd: This property indicates that the MMC controller's command
-  queue engine (CQE) does not support direct commands (DCMDs).
-
-*NOTE* on CD and WP polarity. To use common for all SD/MMC host controllers line
-polarity properties, we have to fix the meaning of the "normal" and "inverted"
-line levels. We choose to follow the SDHCI standard, which specifies both those
-lines as "active low." Therefore, using the "cd-inverted" property means, that
-the CD line is active high, i.e. it is high, when a card is inserted. Similar
-logic applies to the "wp-inverted" property.
-
-CD and WP lines can be implemented on the hardware in one of two ways: as GPIOs,
-specified in cd-gpios and wp-gpios properties, or as dedicated pins. Polarity of
-dedicated pins can be specified, using *-inverted properties. GPIO polarity can
-also be specified using the GPIO_ACTIVE_LOW flag. This creates an ambiguity
-in the latter case. We choose to use the XOR logic for GPIO CD and WP lines.
-This means, the two properties are "superimposed," for example leaving the
-GPIO_ACTIVE_LOW flag clear and specifying the respective *-inverted property
-property results in a double-inversion and actually means the "normal" line
-polarity is in effect.
-
-Optional SDIO properties:
-- keep-power-in-suspend: Preserves card power during a suspend/resume cycle
-- wakeup-source: Enables wake up of host system on SDIO IRQ assertion
-		 (Legacy property supported: "enable-sdio-wakeup")
-
-MMC power
----------
-
-Controllers may implement power control from both the connected cards and
-the IO signaling (for example to change to high-speed 1.8V signalling). If
-the system supports this, then the following two properties should point
-to valid regulator nodes:
-
-- vqmmc-supply: supply node for IO line power
-- vmmc-supply: supply node for card's power
-
-
-MMC power sequences:
---------------------
-
-System on chip designs may specify a specific MMC power sequence. To
-successfully detect an (e)MMC/SD/SDIO card, that power sequence must be
-maintained while initializing the card.
-
-Optional property:
-- mmc-pwrseq: phandle to the MMC power sequence node. See "mmc-pwrseq-*"
-	for documentation of MMC power sequence bindings.
-
-
-Use of Function subnodes
-------------------------
-
-On embedded systems the cards connected to a host may need additional
-properties. These can be specified in subnodes to the host controller node.
-The subnodes are identified by the standard 'reg' property.
-Which information exactly can be specified depends on the bindings for the
-SDIO function driver for the subnode, as specified by the compatible string.
-
-Required host node properties when using function subnodes:
-- #address-cells: should be one. The cell is the slot id.
-- #size-cells: should be zero.
-
-Required function subnode properties:
-- reg: Must contain the SDIO function number of the function this subnode
-       describes. A value of 0 denotes the memory SD function, values from
-       1 to 7 denote the SDIO functions.
-
-Optional function subnode properties:
-- compatible: name of SDIO function following generic names recommended practice
-
-
-Examples
---------
-
-Basic example:
-
-sdhci@ab000000 {
-	compatible = "sdhci";
-	reg = <0xab000000 0x200>;
-	interrupts = <23>;
-	bus-width = <4>;
-	cd-gpios = <&gpio 69 0>;
-	cd-inverted;
-	wp-gpios = <&gpio 70 0>;
-	max-frequency = <50000000>;
-	keep-power-in-suspend;
-	wakeup-source;
-	mmc-pwrseq = <&sdhci0_pwrseq>
-}
-
-Example with sdio function subnode:
-
-mmc3: mmc@1c12000 {
-	#address-cells = <1>;
-	#size-cells = <0>;
-
-	pinctrl-names = "default";
-	pinctrl-0 = <&mmc3_pins_a>;
-	vmmc-supply = <&reg_vmmc3>;
-	bus-width = <4>;
-	non-removable;
-	mmc-pwrseq = <&sdhci0_pwrseq>
-
-	brcmf: bcrmf@1 {
-		reg = <1>;
-		compatible = "brcm,bcm43xx-fmac";
-		interrupt-parent = <&pio>;
-		interrupts = <10 8>; /* PH10 / EINT10 */
-		interrupt-names = "host-wake";
-	};
-};
+This file has moved to mmc-controller.yaml.
diff --git a/Documentation/devicetree/bindings/mmc/renesas,sdhi.txt b/Documentation/devicetree/bindings/mmc/renesas,sdhi.txt
new file mode 100644
index 000000000000..dd08d038a65c
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/renesas,sdhi.txt
@@ -0,0 +1,111 @@
+* Renesas SDHI SD/MMC controller
+
+Required properties:
+- compatible: should contain one or more of the following:
+		"renesas,sdhi-sh73a0" - SDHI IP on SH73A0 SoC
+		"renesas,sdhi-r7s72100" - SDHI IP on R7S72100 SoC
+		"renesas,sdhi-r7s9210" - SDHI IP on R7S9210 SoC
+		"renesas,sdhi-r8a73a4" - SDHI IP on R8A73A4 SoC
+		"renesas,sdhi-r8a7740" - SDHI IP on R8A7740 SoC
+		"renesas,sdhi-r8a7743" - SDHI IP on R8A7743 SoC
+		"renesas,sdhi-r8a7744" - SDHI IP on R8A7744 SoC
+		"renesas,sdhi-r8a7745" - SDHI IP on R8A7745 SoC
+		"renesas,sdhi-r8a774a1" - SDHI IP on R8A774A1 SoC
+		"renesas,sdhi-r8a774c0" - SDHI IP on R8A774C0 SoC
+		"renesas,sdhi-r8a77470" - SDHI IP on R8A77470 SoC
+		"renesas,sdhi-mmc-r8a77470" - SDHI/MMC IP on R8A77470 SoC
+		"renesas,sdhi-r8a7778" - SDHI IP on R8A7778 SoC
+		"renesas,sdhi-r8a7779" - SDHI IP on R8A7779 SoC
+		"renesas,sdhi-r8a7790" - SDHI IP on R8A7790 SoC
+		"renesas,sdhi-r8a7791" - SDHI IP on R8A7791 SoC
+		"renesas,sdhi-r8a7792" - SDHI IP on R8A7792 SoC
+		"renesas,sdhi-r8a7793" - SDHI IP on R8A7793 SoC
+		"renesas,sdhi-r8a7794" - SDHI IP on R8A7794 SoC
+		"renesas,sdhi-r8a7795" - SDHI IP on R8A7795 SoC
+		"renesas,sdhi-r8a7796" - SDHI IP on R8A7796 SoC
+		"renesas,sdhi-r8a77965" - SDHI IP on R8A77965 SoC
+		"renesas,sdhi-r8a77970" - SDHI IP on R8A77970 SoC
+		"renesas,sdhi-r8a77980" - SDHI IP on R8A77980 SoC
+		"renesas,sdhi-r8a77990" - SDHI IP on R8A77990 SoC
+		"renesas,sdhi-r8a77995" - SDHI IP on R8A77995 SoC
+		"renesas,sdhi-shmobile" - a generic sh-mobile SDHI controller
+		"renesas,rcar-gen1-sdhi" - a generic R-Car Gen1 SDHI controller
+		"renesas,rcar-gen2-sdhi" - a generic R-Car Gen2 and RZ/G1 SDHI
+					   (not SDHI/MMC) controller
+		"renesas,rcar-gen3-sdhi" - a generic R-Car Gen3 or RZ/G2
+					   SDHI controller
+
+
+		When compatible with the generic version, nodes must list
+		the SoC-specific version corresponding to the platform
+		first followed by the generic version.
+
+- clocks: Most controllers only have 1 clock source per channel. However, on
+	  some variations of this controller, the internal card detection
+	  logic that exists in this controller is sectioned off to be run by a
+	  separate second clock source to allow the main core clock to be turned
+	  off to save power.
+	  If 2 clocks are specified by the hardware, you must name them as
+	  "core" and "cd". If the controller only has 1 clock, naming is not
+	  required.
+	  Devices which have more than 1 clock are listed below:
+	  2: R7S72100, R7S9210
+
+Optional properties:
+- pinctrl-names: should be "default", "state_uhs"
+- pinctrl-0: should contain default/high speed pin ctrl
+- pinctrl-1: should contain uhs mode pin ctrl
+
+Example: R8A7790 (R-Car H2) SDHI controller nodes
+
+	sdhi0: sd@ee100000 {
+		compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
+		reg = <0 0xee100000 0 0x328>;
+		interrupts = <GIC_SPI 165 IRQ_TYPE_LEVEL_HIGH>;
+		clocks = <&cpg CPG_MOD 314>;
+		dmas = <&dmac0 0xcd>, <&dmac0 0xce>,
+		       <&dmac1 0xcd>, <&dmac1 0xce>;
+		dma-names = "tx", "rx", "tx", "rx";
+		max-frequency = <195000000>;
+		power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
+		resets = <&cpg 314>;
+	};
+
+	sdhi1: sd@ee120000 {
+		compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
+		reg = <0 0xee120000 0 0x328>;
+		interrupts = <GIC_SPI 166 IRQ_TYPE_LEVEL_HIGH>;
+		clocks = <&cpg CPG_MOD 313>;
+		dmas = <&dmac0 0xc9>, <&dmac0 0xca>,
+		       <&dmac1 0xc9>, <&dmac1 0xca>;
+		dma-names = "tx", "rx", "tx", "rx";
+		max-frequency = <195000000>;
+		power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
+		resets = <&cpg 313>;
+	};
+
+	sdhi2: sd@ee140000 {
+		compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
+		reg = <0 0xee140000 0 0x100>;
+		interrupts = <GIC_SPI 167 IRQ_TYPE_LEVEL_HIGH>;
+		clocks = <&cpg CPG_MOD 312>;
+		dmas = <&dmac0 0xc1>, <&dmac0 0xc2>,
+		       <&dmac1 0xc1>, <&dmac1 0xc2>;
+		dma-names = "tx", "rx", "tx", "rx";
+		max-frequency = <97500000>;
+		power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
+		resets = <&cpg 312>;
+	};
+
+	sdhi3: sd@ee160000 {
+		compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
+		reg = <0 0xee160000 0 0x100>;
+		interrupts = <GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>;
+		clocks = <&cpg CPG_MOD 311>;
+		dmas = <&dmac0 0xd3>, <&dmac0 0xd4>,
+		       <&dmac1 0xd3>, <&dmac1 0xd4>;
+		dma-names = "tx", "rx", "tx", "rx";
+		max-frequency = <97500000>;
+		power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
+		resets = <&cpg 311>;
+	};
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-am654.txt b/Documentation/devicetree/bindings/mmc/sdhci-am654.txt
index 15dbbbace27e..50e87df47971 100644
--- a/Documentation/devicetree/bindings/mmc/sdhci-am654.txt
+++ b/Documentation/devicetree/bindings/mmc/sdhci-am654.txt
@@ -8,7 +8,10 @@ Only deviations are documented here.
   [3] Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
 
 Required Properties:
-	- compatible: should be "ti,am654-sdhci-5.1"
+	- compatible: should be one of:
+			"ti,am654-sdhci-5.1": SDHCI on AM654 device.
+			"ti,j721e-sdhci-8bit": 8 bit SDHCI on J721E device.
+			"ti,j721e-sdhci-4bit": 4 bit SDHCI on J721E device.
 	- reg: Must be two entries.
 		- The first should be the sdhci register space
 		- The second should the subsystem/phy register space
@@ -16,9 +19,13 @@ Required Properties:
 	- clock-names: Tuple including "clk_xin" and "clk_ahb"
 	- interrupts: Interrupt specifiers
 	- ti,otap-del-sel: Output Tap Delay select
+
+Optional Properties (Required for ti,am654-sdhci-5.1 and ti,j721e-sdhci-8bit):
 	- ti,trm-icp: DLL trim select
 	- ti,driver-strength-ohm: driver strength in ohms.
 				  Valid values are 33, 40, 50, 66 and 100 ohms.
+Optional Properties:
+	- ti,strobe-sel: strobe select delay for HS400 speed mode. Default value: 0x0.
 
 Example:
 
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt b/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt
index 45c9978aad7b..eb7eb1b529f0 100644
--- a/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt
+++ b/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt
@@ -14,10 +14,31 @@ Required properties:
 - clock-names: Should contain the following:
 	"sdio" - SDIO source clock (required)
 	"enable" - gate clock which used for enabling/disabling the device (required)
+	"2x_enable" - gate clock controlling the device for some special platforms (optional)
 
 Optional properties:
 - assigned-clocks: the same with "sdio" clock
 - assigned-clock-parents: the default parent of "sdio" clock
+- pinctrl-names: should be "default", "state_uhs"
+- pinctrl-0: should contain default/high speed pin control
+- pinctrl-1: should contain uhs mode pin control
+
+PHY DLL delays are used to delay the data valid window, and align the window
+to sampling clock. PHY DLL delays can be configured by following properties,
+and each property contains 4 cells which are used to configure the clock data
+write line delay value, clock read command line delay value, clock read data
+positive edge delay value and clock read data negative edge delay value.
+Each cell's delay value unit is cycle of the PHY clock.
+
+- sprd,phy-delay-legacy: Delay value for legacy timing.
+- sprd,phy-delay-sd-highspeed: Delay value for SD high-speed timing.
+- sprd,phy-delay-sd-uhs-sdr50: Delay value for SD UHS SDR50 timing.
+- sprd,phy-delay-sd-uhs-sdr104: Delay value for SD UHS SDR50 timing.
+- sprd,phy-delay-mmc-highspeed: Delay value for MMC high-speed timing.
+- sprd,phy-delay-mmc-ddr52: Delay value for MMC DDR52 timing.
+- sprd,phy-delay-mmc-hs200: Delay value for MMC HS200 timing.
+- sprd,phy-delay-mmc-hs400: Delay value for MMC HS400 timing.
+- sprd,phy-delay-mmc-hs400es: Delay value for MMC HS400 enhanced strobe timing.
 
 Examples:
 
@@ -32,6 +53,11 @@ sdio0: sdio@20600000 {
 	assigned-clocks = <&ap_clk CLK_EMMC_2X>;
 	assigned-clock-parents = <&rpll CLK_RPLL_390M>;
 
+	pinctrl-names = "default", "state_uhs";
+	pinctrl-0 = <&sd0_pins_default>;
+	pinctrl-1 = <&sd0_pins_uhs>;
+
+	sprd,phy-delay-sd-uhs-sdr104 = <0x3f 0x7f 0x2e 0x2e>;
 	bus-width = <8>;
 	non-removable;
 	no-sdio;
diff --git a/Documentation/devicetree/bindings/mmc/sunxi-mmc.txt b/Documentation/devicetree/bindings/mmc/sunxi-mmc.txt
deleted file mode 100644
index e9cb3ec5e502..000000000000
--- a/Documentation/devicetree/bindings/mmc/sunxi-mmc.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-* Allwinner sunxi MMC controller
-
-The highspeed MMC host controller on Allwinner SoCs provides an interface
-for MMC, SD and SDIO types of memory cards.
-
-Supported maximum speeds are the ones of the eMMC standard 4.5 as well
-as the speed of SD standard 3.0.
-Absolute maximum transfer rate is 200MB/s
-
-Required properties:
- - compatible : should be one of:
-   * "allwinner,sun4i-a10-mmc"
-   * "allwinner,sun5i-a13-mmc"
-   * "allwinner,sun7i-a20-mmc"
-   * "allwinner,sun8i-a83t-emmc"
-   * "allwinner,sun9i-a80-mmc"
-   * "allwinner,sun50i-a64-emmc"
-   * "allwinner,sun50i-a64-mmc"
-   * "allwinner,sun50i-h6-emmc", "allwinner.sun50i-a64-emmc"
-   * "allwinner,sun50i-h6-mmc", "allwinner.sun50i-a64-mmc"
- - reg : mmc controller base registers
- - clocks : a list with 4 phandle + clock specifier pairs
- - clock-names : must contain "ahb", "mmc", "output" and "sample"
- - interrupts : mmc controller interrupt
-
-Optional properties:
- - resets : phandle + reset specifier pair
- - reset-names : must contain "ahb"
- - for cd, bus-width and additional generic mmc parameters
-   please refer to mmc.txt within this directory
-
-Examples:
-	- Within .dtsi:
-	mmc0: mmc@1c0f000 {
-		compatible = "allwinner,sun5i-a13-mmc";
-		reg = <0x01c0f000 0x1000>;
-		clocks = <&ahb_gates 8>, <&mmc0_clk>, <&mmc0_output_clk>, <&mmc0_sample_clk>;
-		clock-names = "ahb", "mod", "output", "sample";
-		interrupts = <0 32 4>;
-		status = "disabled";
-	};
-
-	- Within dts:
-	mmc0: mmc@1c0f000 {
-		pinctrl-names = "default", "default";
-		pinctrl-0 = <&mmc0_pins_a>;
-		pinctrl-1 = <&mmc0_cd_pin_reference_design>;
-		bus-width = <4>;
-		cd-gpios = <&pio 7 1 0>; /* PH1 */
-		cd-inverted;
-		status = "okay";
-	};
diff --git a/Documentation/devicetree/bindings/mmc/tmio_mmc.txt b/Documentation/devicetree/bindings/mmc/tmio_mmc.txt
deleted file mode 100644
index 2b4f17ca9087..000000000000
--- a/Documentation/devicetree/bindings/mmc/tmio_mmc.txt
+++ /dev/null
@@ -1,120 +0,0 @@
-* Toshiba Mobile IO SD/MMC controller
-
-The tmio-mmc driver doesn't probe its devices actively, instead its binding to
-devices is managed by either MFD drivers or by the sh_mobile_sdhi platform
-driver. Those drivers supply the tmio-mmc driver with platform data, that either
-describe hardware capabilities, known to them, or are obtained by them from
-their own platform data or from their DT information. In the latter case all
-compulsory and any optional properties, common to all SD/MMC drivers, as
-described in mmc.txt, can be used. Additionally the following tmio_mmc-specific
-optional bindings can be used.
-
-Required properties:
-- compatible: should contain one or more of the following:
-		"renesas,sdhi-sh73a0" - SDHI IP on SH73A0 SoC
-		"renesas,sdhi-r7s72100" - SDHI IP on R7S72100 SoC
-		"renesas,sdhi-r7s9210" - SDHI IP on R7S9210 SoC
-		"renesas,sdhi-r8a73a4" - SDHI IP on R8A73A4 SoC
-		"renesas,sdhi-r8a7740" - SDHI IP on R8A7740 SoC
-		"renesas,sdhi-r8a7743" - SDHI IP on R8A7743 SoC
-		"renesas,sdhi-r8a7744" - SDHI IP on R8A7744 SoC
-		"renesas,sdhi-r8a7745" - SDHI IP on R8A7745 SoC
-		"renesas,sdhi-r8a774a1" - SDHI IP on R8A774A1 SoC
-		"renesas,sdhi-r8a774c0" - SDHI IP on R8A774C0 SoC
-		"renesas,sdhi-r8a77470" - SDHI IP on R8A77470 SoC
-		"renesas,sdhi-mmc-r8a77470" - SDHI/MMC IP on R8A77470 SoC
-		"renesas,sdhi-r8a7778" - SDHI IP on R8A7778 SoC
-		"renesas,sdhi-r8a7779" - SDHI IP on R8A7779 SoC
-		"renesas,sdhi-r8a7790" - SDHI IP on R8A7790 SoC
-		"renesas,sdhi-r8a7791" - SDHI IP on R8A7791 SoC
-		"renesas,sdhi-r8a7792" - SDHI IP on R8A7792 SoC
-		"renesas,sdhi-r8a7793" - SDHI IP on R8A7793 SoC
-		"renesas,sdhi-r8a7794" - SDHI IP on R8A7794 SoC
-		"renesas,sdhi-r8a7795" - SDHI IP on R8A7795 SoC
-		"renesas,sdhi-r8a7796" - SDHI IP on R8A7796 SoC
-		"renesas,sdhi-r8a77965" - SDHI IP on R8A77965 SoC
-		"renesas,sdhi-r8a77970" - SDHI IP on R8A77970 SoC
-		"renesas,sdhi-r8a77980" - SDHI IP on R8A77980 SoC
-		"renesas,sdhi-r8a77990" - SDHI IP on R8A77990 SoC
-		"renesas,sdhi-r8a77995" - SDHI IP on R8A77995 SoC
-		"renesas,sdhi-shmobile" - a generic sh-mobile SDHI controller
-		"renesas,rcar-gen1-sdhi" - a generic R-Car Gen1 SDHI controller
-		"renesas,rcar-gen2-sdhi" - a generic R-Car Gen2 and RZ/G1 SDHI
-					   (not SDHI/MMC) controller
-		"renesas,rcar-gen3-sdhi" - a generic R-Car Gen3 or RZ/G2
-					   SDHI controller
-
-
-		When compatible with the generic version, nodes must list
-		the SoC-specific version corresponding to the platform
-		first followed by the generic version.
-
-- clocks: Most controllers only have 1 clock source per channel. However, on
-	  some variations of this controller, the internal card detection
-	  logic that exists in this controller is sectioned off to be run by a
-	  separate second clock source to allow the main core clock to be turned
-	  off to save power.
-	  If 2 clocks are specified by the hardware, you must name them as
-	  "core" and "cd". If the controller only has 1 clock, naming is not
-	  required.
-	  Devices which have more than 1 clock are listed below:
-	  2: R7S72100, R7S9210
-
-Optional properties:
-- pinctrl-names: should be "default", "state_uhs"
-- pinctrl-0: should contain default/high speed pin ctrl
-- pinctrl-1: should contain uhs mode pin ctrl
-
-Example: R8A7790 (R-Car H2) SDHI controller nodes
-
-	sdhi0: sd@ee100000 {
-		compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
-		reg = <0 0xee100000 0 0x328>;
-		interrupts = <GIC_SPI 165 IRQ_TYPE_LEVEL_HIGH>;
-		clocks = <&cpg CPG_MOD 314>;
-		dmas = <&dmac0 0xcd>, <&dmac0 0xce>,
-		       <&dmac1 0xcd>, <&dmac1 0xce>;
-		dma-names = "tx", "rx", "tx", "rx";
-		max-frequency = <195000000>;
-		power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
-		resets = <&cpg 314>;
-	};
-
-	sdhi1: sd@ee120000 {
-		compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
-		reg = <0 0xee120000 0 0x328>;
-		interrupts = <GIC_SPI 166 IRQ_TYPE_LEVEL_HIGH>;
-		clocks = <&cpg CPG_MOD 313>;
-		dmas = <&dmac0 0xc9>, <&dmac0 0xca>,
-		       <&dmac1 0xc9>, <&dmac1 0xca>;
-		dma-names = "tx", "rx", "tx", "rx";
-		max-frequency = <195000000>;
-		power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
-		resets = <&cpg 313>;
-	};
-
-	sdhi2: sd@ee140000 {
-		compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
-		reg = <0 0xee140000 0 0x100>;
-		interrupts = <GIC_SPI 167 IRQ_TYPE_LEVEL_HIGH>;
-		clocks = <&cpg CPG_MOD 312>;
-		dmas = <&dmac0 0xc1>, <&dmac0 0xc2>,
-		       <&dmac1 0xc1>, <&dmac1 0xc2>;
-		dma-names = "tx", "rx", "tx", "rx";
-		max-frequency = <97500000>;
-		power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
-		resets = <&cpg 312>;
-	};
-
-	sdhi3: sd@ee160000 {
-		compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
-		reg = <0 0xee160000 0 0x100>;
-		interrupts = <GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>;
-		clocks = <&cpg CPG_MOD 311>;
-		dmas = <&dmac0 0xd3>, <&dmac0 0xd4>,
-		       <&dmac1 0xd3>, <&dmac1 0xd4>;
-		dma-names = "tx", "rx", "tx", "rx";
-		max-frequency = <97500000>;
-		power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
-		resets = <&cpg 311>;
-	};
diff --git a/Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml b/Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml
index fbd4da3684fc..b5b3cf5b1ac2 100644
--- a/Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml
+++ b/Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml
@@ -55,9 +55,9 @@ patternProperties:
   "^pinctrl-[0-9]+$": true
 
   "^nand@[a-f0-9]+$":
+    type: object
     properties:
       reg:
-        maxItems: 1
         minimum: 0
         maximum: 7
 
diff --git a/Documentation/devicetree/bindings/mtd/amlogic,meson-nand.txt b/Documentation/devicetree/bindings/mtd/amlogic,meson-nand.txt
index 3983c11e062c..5794ab1147c1 100644
--- a/Documentation/devicetree/bindings/mtd/amlogic,meson-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/amlogic,meson-nand.txt
@@ -24,7 +24,7 @@ Optional children nodes:
 Children nodes represent the available nand chips.
 
 Other properties:
-see Documentation/devicetree/bindings/mtd/nand.txt for generic bindings.
+see Documentation/devicetree/bindings/mtd/nand-controller.yaml for generic bindings.
 
 Example demonstrate on AXG SoC:
 
diff --git a/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt b/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
index bcda1dfc4bac..82156dc8f304 100644
--- a/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
+++ b/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
@@ -28,6 +28,7 @@ Required properties:
                          brcm,brcmnand-v7.0
                          brcm,brcmnand-v7.1
                          brcm,brcmnand-v7.2
+                         brcm,brcmnand-v7.3
                          brcm,brcmnand
 - reg              : the register start and length for NAND register region.
                      (optional) Flash DMA register range (if present)
@@ -101,12 +102,12 @@ Required properties:
                               number (e.g., 0, 1, 2, etc.)
 - #address-cells            : see partition.txt
 - #size-cells               : see partition.txt
-- nand-ecc-strength         : see nand.txt
-- nand-ecc-step-size        : must be 512 or 1024. See nand.txt
 
 Optional properties:
+- nand-ecc-strength         : see nand-controller.yaml
+- nand-ecc-step-size        : must be 512 or 1024. See nand-controller.yaml
 - nand-on-flash-bbt         : boolean, to enable the on-flash BBT for this
-                              chip-select. See nand.txt
+                              chip-select. See nand-controller.yaml
 - brcm,nand-oob-sector-size : integer, to denote the spare area sector size
                               expected for the ECC layout in use. This size, in
                               addition to the strength and step-size,
diff --git a/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
index 4345c3a6f530..945be7d5b236 100644
--- a/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
+++ b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
@@ -35,6 +35,9 @@ custom properties:
 		  (qspi_n_ss_out).
 - cdns,tslch-ns : Delay in nanoseconds between setting qspi_n_ss_out low
                   and first bit transfer.
+- resets	: Must contain an entry for each entry in reset-names.
+		  See ../reset/reset.txt for details.
+- reset-names	: Must include either "qspi" and/or "qspi-ocp".
 
 Example:
 
@@ -50,6 +53,8 @@ Example:
 		cdns,fifo-depth = <128>;
 		cdns,fifo-width = <4>;
 		cdns,trigger-address = <0x00000000>;
+		resets = <&rst QSPI_RESET>, <&rst QSPI_OCP_RESET>;
+		reset-names = "qspi", "qspi-ocp";
 
 		flash0: n25q00@0 {
 			...
diff --git a/Documentation/devicetree/bindings/mtd/cypress,hyperflash.txt b/Documentation/devicetree/bindings/mtd/cypress,hyperflash.txt
new file mode 100644
index 000000000000..ad42f4db32f1
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/cypress,hyperflash.txt
@@ -0,0 +1,13 @@
+Bindings for HyperFlash NOR flash chips compliant with Cypress HyperBus
+specification and supports Cypress CFI specification 1.5 command set.
+
+Required properties:
+- compatible : "cypress,hyperflash", "cfi-flash" for HyperFlash NOR chips
+- reg : Address of flash's memory map
+
+Example:
+
+	flash@0 {
+		compatible = "cypress,hyperflash", "cfi-flash";
+		reg = <0x0 0x4000000>;
+	};
diff --git a/Documentation/devicetree/bindings/mtd/denali-nand.txt b/Documentation/devicetree/bindings/mtd/denali-nand.txt
index b14b6751c2f3..b32aed1db46d 100644
--- a/Documentation/devicetree/bindings/mtd/denali-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/denali-nand.txt
@@ -22,16 +22,16 @@ Sub-nodes:
       select is connected.
 
   Optional properties:
-    - nand-ecc-step-size: see nand.txt for details.
+    - nand-ecc-step-size: see nand-controller.yaml for details.
       If present, the value must be
         512        for "altr,socfpga-denali-nand"
         1024       for "socionext,uniphier-denali-nand-v5a"
         1024       for "socionext,uniphier-denali-nand-v5b"
-    - nand-ecc-strength: see nand.txt for details. Valid values are:
+    - nand-ecc-strength: see nand-controller.yaml for details. Valid values are:
         8, 15      for "altr,socfpga-denali-nand"
         8, 16, 24  for "socionext,uniphier-denali-nand-v5a"
         8, 16      for "socionext,uniphier-denali-nand-v5b"
-    - nand-ecc-maximize: see nand.txt for details
+    - nand-ecc-maximize: see nand-controller.yaml for details
 
 The chip nodes may optionally contain sub-nodes describing partitions of the
 address space. See partition.txt for more detail.
diff --git a/Documentation/devicetree/bindings/mtd/fsmc-nand.txt b/Documentation/devicetree/bindings/mtd/fsmc-nand.txt
index 32636eb77304..6762d3c4d5a4 100644
--- a/Documentation/devicetree/bindings/mtd/fsmc-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/fsmc-nand.txt
@@ -30,9 +30,9 @@ Optional properties:
                  command is asserted. Zero means one cycle, 255 means 256
                  cycles.
 - bank: default NAND bank to use (0-3 are valid, 0 is the default).
-- nand-ecc-mode      : see nand.txt
-- nand-ecc-strength  : see nand.txt
-- nand-ecc-step-size : see nand.txt
+- nand-ecc-mode      : see nand-controller.yaml
+- nand-ecc-strength  : see nand-controller.yaml
+- nand-ecc-step-size : see nand-controller.yaml
 
 Can support 1-bit HW ECC (default) or if stronger correction is required,
 software-based BCH.
diff --git a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt
index c059ab74ed88..44919d48d241 100644
--- a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt
@@ -8,7 +8,7 @@ explained in a separate documents - please refer to
 Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
 
 For NAND specific properties such as ECC modes or bus width, please refer to
-Documentation/devicetree/bindings/mtd/nand.txt
+Documentation/devicetree/bindings/mtd/nand-controller.yaml
 
 
 Required properties:
diff --git a/Documentation/devicetree/bindings/mtd/hisi504-nand.txt b/Documentation/devicetree/bindings/mtd/hisi504-nand.txt
index 2e35f0662912..8963983ae7cb 100644
--- a/Documentation/devicetree/bindings/mtd/hisi504-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/hisi504-nand.txt
@@ -7,7 +7,7 @@ Required properties:
                        NAND controller's registers. The second contains base
                        physical address and size of NAND controller's buffer.
 - interrupts:          Interrupt number for nfc.
-- nand-bus-width:      See nand.txt.
+- nand-bus-width:      See nand-controller.yaml.
 - nand-ecc-mode:       Support none and hw ecc mode.
 - #address-cells:      Partition address, should be set 1.
 - #size-cells:         Partition size, should be set 1.
diff --git a/Documentation/devicetree/bindings/mtd/marvell-nand.txt b/Documentation/devicetree/bindings/mtd/marvell-nand.txt
index e0c790706b9b..a2d9a0f2b683 100644
--- a/Documentation/devicetree/bindings/mtd/marvell-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/marvell-nand.txt
@@ -36,29 +36,29 @@ Children nodes represent the available NAND chips.
 
 Required properties:
 - reg: shall contain the native Chip Select ids (0-3).
-- nand-rb: see nand.txt (0-1).
+- nand-rb: see nand-controller.yaml (0-1).
 
 Optional properties:
 - marvell,nand-keep-config: orders the driver not to take the timings
   from the core and leaving them completely untouched. Bootloader
   timings will then be used.
 - label: MTD name.
-- nand-on-flash-bbt: see nand.txt.
-- nand-ecc-mode: see nand.txt. Will use hardware ECC if not specified.
-- nand-ecc-algo: see nand.txt. This property is essentially useful when
+- nand-on-flash-bbt: see nand-controller.yaml.
+- nand-ecc-mode: see nand-controller.yaml. Will use hardware ECC if not specified.
+- nand-ecc-algo: see nand-controller.yaml. This property is essentially useful when
   not using hardware ECC. Howerver, it may be added when using hardware
   ECC for clarification but will be ignored by the driver because ECC
   mode is chosen depending on the page size and the strength required by
   the NAND chip. This value may be overwritten with nand-ecc-strength
   property.
-- nand-ecc-strength: see nand.txt.
-- nand-ecc-step-size: see nand.txt. Marvell's NAND flash controller does
+- nand-ecc-strength: see nand-controller.yaml.
+- nand-ecc-step-size: see nand-controller.yaml. Marvell's NAND flash controller does
   use fixed strength (1-bit for Hamming, 16-bit for BCH), so the actual
   step size will shrink or grow in order to fit the required strength.
   Step sizes are not completely random for all and follow certain
   patterns described in AN-379, "Marvell SoC NFC ECC".
 
-See Documentation/devicetree/bindings/mtd/nand.txt for more details on
+See Documentation/devicetree/bindings/mtd/nand-controller.yaml for more details on
 generic bindings.
 
 
diff --git a/Documentation/devicetree/bindings/mtd/mxc-nand.txt b/Documentation/devicetree/bindings/mtd/mxc-nand.txt
index b5833d11c7be..2857c628fba4 100644
--- a/Documentation/devicetree/bindings/mtd/mxc-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/mxc-nand.txt
@@ -4,9 +4,9 @@ Required properties:
 - compatible: "fsl,imxXX-nand"
 - reg: address range of the nfc block
 - interrupts: irq to be used
-- nand-bus-width: see nand.txt
-- nand-ecc-mode: see nand.txt
-- nand-on-flash-bbt: see nand.txt
+- nand-bus-width: see nand-controller.yaml
+- nand-ecc-mode: see nand-controller.yaml
+- nand-on-flash-bbt: see nand-controller.yaml
 
 Example:
 
diff --git a/Documentation/devicetree/bindings/mtd/nand-controller.yaml b/Documentation/devicetree/bindings/mtd/nand-controller.yaml
index 199ba5ac2a06..d261b7096c69 100644
--- a/Documentation/devicetree/bindings/mtd/nand-controller.yaml
+++ b/Documentation/devicetree/bindings/mtd/nand-controller.yaml
@@ -40,6 +40,7 @@ properties:
 
 patternProperties:
   "^nand@[a-f0-9]$":
+    type: object
     properties:
       reg:
         description:
diff --git a/Documentation/devicetree/bindings/mtd/nvidia-tegra20-nand.txt b/Documentation/devicetree/bindings/mtd/nvidia-tegra20-nand.txt
index b2f2ca12f9e6..e737e5beb7bf 100644
--- a/Documentation/devicetree/bindings/mtd/nvidia-tegra20-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/nvidia-tegra20-nand.txt
@@ -26,14 +26,14 @@ Optional children node properties:
 		 "hw" is supported.
 - nand-ecc-algo: string, algorithm of NAND ECC.
 		 Supported values with "hw" ECC mode are: "rs", "bch".
-- nand-bus-width : See nand.txt
-- nand-on-flash-bbt: See nand.txt
+- nand-bus-width : See nand-controller.yaml
+- nand-on-flash-bbt: See nand-controller.yaml
 - nand-ecc-strength: integer representing the number of bits to correct
 		     per ECC step (always 512). Supported strength using HW ECC
 		     modes are:
 		     - RS: 4, 6, 8
 		     - BCH: 4, 8, 14, 16
-- nand-ecc-maximize: See nand.txt
+- nand-ecc-maximize: See nand-controller.yaml
 - nand-is-boot-medium: Makes sure only ECC strengths supported by the boot ROM
 		       are chosen.
 - wp-gpios: GPIO specifier for the write protect pin.
diff --git a/Documentation/devicetree/bindings/mtd/oxnas-nand.txt b/Documentation/devicetree/bindings/mtd/oxnas-nand.txt
index 56d5c19da41d..2ba07fc8b79c 100644
--- a/Documentation/devicetree/bindings/mtd/oxnas-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/oxnas-nand.txt
@@ -1,6 +1,6 @@
 * Oxford Semiconductor OXNAS NAND Controller
 
-Please refer to nand.txt for generic information regarding MTD NAND bindings.
+Please refer to nand-controller.yaml for generic information regarding MTD NAND bindings.
 
 Required properties:
  - compatible: "oxsemi,ox820-nand"
diff --git a/Documentation/devicetree/bindings/mtd/qcom_nandc.txt b/Documentation/devicetree/bindings/mtd/qcom_nandc.txt
index 1123cc6d56ef..5c2fba4b30fe 100644
--- a/Documentation/devicetree/bindings/mtd/qcom_nandc.txt
+++ b/Documentation/devicetree/bindings/mtd/qcom_nandc.txt
@@ -47,8 +47,8 @@ Required properties:
 - #size-cells:		see partition.txt
 
 Optional properties:
-- nand-bus-width:	see nand.txt
-- nand-ecc-strength:	see nand.txt. If not specified, then ECC strength will
+- nand-bus-width:	see nand-controller.yaml
+- nand-ecc-strength:	see nand-controller.yaml. If not specified, then ECC strength will
 			be used according to chip requirement and available
 			OOB size.
 
diff --git a/Documentation/devicetree/bindings/mtd/samsung-s3c2410.txt b/Documentation/devicetree/bindings/mtd/samsung-s3c2410.txt
index 0040eb8895e0..09815c40fc8a 100644
--- a/Documentation/devicetree/bindings/mtd/samsung-s3c2410.txt
+++ b/Documentation/devicetree/bindings/mtd/samsung-s3c2410.txt
@@ -6,7 +6,7 @@ Required properties:
 	"samsung,s3c2412-nand"
 	"samsung,s3c2440-nand"
 - reg : register's location and length.
-- #address-cells, #size-cells : see nand.txt
+- #address-cells, #size-cells : see nand-controller.yaml
 - clocks : phandle to the nand controller clock
 - clock-names : must contain "nand"
 
@@ -14,8 +14,8 @@ Optional child nodes:
 Child nodes representing the available nand chips.
 
 Optional child properties:
-- nand-ecc-mode : see nand.txt
-- nand-on-flash-bbt : see nand.txt
+- nand-ecc-mode : see nand-controller.yaml
+- nand-on-flash-bbt : see nand-controller.yaml
 
 Each child device node may optionally contain a 'partitions' sub-node,
 which further contains sub-nodes describing the flash partition mapping.
diff --git a/Documentation/devicetree/bindings/mtd/stm32-fmc2-nand.txt b/Documentation/devicetree/bindings/mtd/stm32-fmc2-nand.txt
index ad2bef826582..e55895e8dae4 100644
--- a/Documentation/devicetree/bindings/mtd/stm32-fmc2-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/stm32-fmc2-nand.txt
@@ -24,9 +24,9 @@ Required properties:
 - reg: describes the CS lines assigned to the NAND device.
 
 Optional properties:
-- nand-on-flash-bbt: see nand.txt
-- nand-ecc-strength: see nand.txt
-- nand-ecc-step-size: see nand.txt
+- nand-on-flash-bbt: see nand-controller.yaml
+- nand-ecc-strength: see nand-controller.yaml
+- nand-ecc-step-size: see nand-controller.yaml
 
 The following ECC strength and step size are currently supported:
  - nand-ecc-strength = <1>, nand-ecc-step-size = <512> (Hamming)
diff --git a/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt b/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt
deleted file mode 100644
index ddd18c135148..000000000000
--- a/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-* STMicroelectronics Quad Serial Peripheral Interface(QuadSPI)
-
-Required properties:
-- compatible: should be "st,stm32f469-qspi"
-- reg: the first contains the register location and length.
-       the second contains the memory mapping address and length
-- reg-names: should contain the reg names "qspi" "qspi_mm"
-- interrupts: should contain the interrupt for the device
-- clocks: the phandle of the clock needed by the QSPI controller
-- A pinctrl must be defined to set pins in mode of operation for QSPI transfer
-
-Optional properties:
-- resets: must contain the phandle to the reset controller.
-
-A spi flash must be a child of the nor_flash node and could have some
-properties. Also see jedec,spi-nor.txt.
-
-Required properties:
-- reg: chip-Select number (QSPI controller may connect 2 nor flashes)
-- spi-max-frequency: max frequency of spi bus
-
-Optional property:
-- spi-rx-bus-width: see ../spi/spi-bus.txt for the description
-
-Example:
-
-qspi: spi@a0001000 {
-	compatible = "st,stm32f469-qspi";
-	reg = <0xa0001000 0x1000>, <0x90000000 0x10000000>;
-	reg-names = "qspi", "qspi_mm";
-	interrupts = <91>;
-	resets = <&rcc STM32F4_AHB3_RESET(QSPI)>;
-	clocks = <&rcc 0 STM32F4_AHB3_CLOCK(QSPI)>;
-	pinctrl-names = "default";
-	pinctrl-0 = <&pinctrl_qspi0>;
-
-	flash@0 {
-		reg = <0>;
-		spi-rx-bus-width = <4>;
-		spi-max-frequency = <108000000>;
-		...
-	};
-};
diff --git a/Documentation/devicetree/bindings/mtd/tango-nand.txt b/Documentation/devicetree/bindings/mtd/tango-nand.txt
index cd1bf2ac9055..91c8420241af 100644
--- a/Documentation/devicetree/bindings/mtd/tango-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/tango-nand.txt
@@ -11,7 +11,7 @@ Required properties:
 - #size-cells: <0>
 
 Children nodes represent the available NAND chips.
-See Documentation/devicetree/bindings/mtd/nand.txt for generic bindings.
+See Documentation/devicetree/bindings/mtd/nand-controller.yaml for generic bindings.
 
 Example:
 
diff --git a/Documentation/devicetree/bindings/mtd/ti,am654-hbmc.txt b/Documentation/devicetree/bindings/mtd/ti,am654-hbmc.txt
new file mode 100644
index 000000000000..faa81c2e5da6
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/ti,am654-hbmc.txt
@@ -0,0 +1,51 @@
+Bindings for HyperBus Memory Controller (HBMC) on TI's K3 family of SoCs
+
+Required properties:
+- compatible : "ti,am654-hbmc" for AM654 SoC
+- reg : Two entries:
+	First entry pointed to the register space of HBMC controller
+	Second entry pointing to the memory map region dedicated for
+	MMIO access to attached flash devices
+- ranges : Address translation from offset within CS to allocated MMIO
+	   space in SoC
+
+Optional properties:
+- mux-controls : phandle to the multiplexer that controls selection of
+		 HBMC vs OSPI inside Flash SubSystem (FSS). Default is OSPI,
+		 if property is absent.
+		 See Documentation/devicetree/bindings/mux/reg-mux.txt
+		 for mmio-mux binding details
+
+Example:
+
+	system-controller@47000000 {
+		compatible = "syscon", "simple-mfd";
+		reg = <0x0 0x47000000 0x0 0x100>;
+		#address-cells = <2>;
+		#size-cells = <2>;
+		ranges;
+
+		hbmc_mux: multiplexer {
+			compatible = "mmio-mux";
+			#mux-control-cells = <1>;
+			mux-reg-masks = <0x4 0x2>; /* 0: reg 0x4, bit 1 */
+		};
+	};
+
+	hbmc: hyperbus@47034000 {
+		compatible = "ti,am654-hbmc";
+		reg = <0x0 0x47034000 0x0 0x100>,
+			<0x5 0x00000000 0x1 0x0000000>;
+		power-domains = <&k3_pds 55>;
+		#address-cells = <2>;
+		#size-cells = <1>;
+		ranges = <0x0 0x0 0x5 0x00000000 0x4000000>, /* CS0 - 64MB */
+			 <0x1 0x0 0x5 0x04000000 0x4000000>; /* CS1 - 64MB */
+		mux-controls = <&hbmc_mux 0>;
+
+		/* Slave flash node */
+		flash@0,0 {
+			compatible = "cypress,hyperflash", "cfi-flash";
+			reg = <0x0 0x0 0x4000000>;
+		};
+	};
diff --git a/Documentation/devicetree/bindings/mtd/vf610-nfc.txt b/Documentation/devicetree/bindings/mtd/vf610-nfc.txt
index c96eeb65f450..7db5e6e609df 100644
--- a/Documentation/devicetree/bindings/mtd/vf610-nfc.txt
+++ b/Documentation/devicetree/bindings/mtd/vf610-nfc.txt
@@ -25,14 +25,14 @@ only handle one NAND chip.
 
 Required properties:
 - compatible: Should be set to "fsl,vf610-nfc-cs".
-- nand-bus-width: see nand.txt
-- nand-ecc-mode: see nand.txt
+- nand-bus-width: see nand-controller.yaml
+- nand-ecc-mode: see nand-controller.yaml
 
 Required properties for hardware ECC:
-- nand-ecc-strength: supported strengths are 24 and 32 bit (see nand.txt)
+- nand-ecc-strength: supported strengths are 24 and 32 bit (see nand-controller.yaml)
 - nand-ecc-step-size: step size equals page size, currently only 2k pages are
     supported
-- nand-on-flash-bbt: see nand.txt
+- nand-on-flash-bbt: see nand-controller.yaml
 
 Example:
 
diff --git a/Documentation/devicetree/bindings/mux/mmio-mux.txt b/Documentation/devicetree/bindings/mux/mmio-mux.txt
deleted file mode 100644
index a9bfb4d8b6ac..000000000000
--- a/Documentation/devicetree/bindings/mux/mmio-mux.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-MMIO register bitfield-based multiplexer controller bindings
-
-Define register bitfields to be used to control multiplexers. The parent
-device tree node must be a syscon node to provide register access.
-
-Required properties:
-- compatible : "mmio-mux"
-- #mux-control-cells : <1>
-- mux-reg-masks : an array of register offset and pre-shifted bitfield mask
-                  pairs, each describing a single mux control.
-* Standard mux-controller bindings as decribed in mux-controller.txt
-
-Optional properties:
-- idle-states : if present, the state the muxes will have when idle. The
-		special state MUX_IDLE_AS_IS is the default.
-
-The multiplexer state of each multiplexer is defined as the value of the
-bitfield described by the corresponding register offset and bitfield mask pair
-in the mux-reg-masks array, accessed through the parent syscon.
-
-Example:
-
-	syscon {
-		compatible = "syscon";
-
-		mux: mux-controller {
-			compatible = "mmio-mux";
-			#mux-control-cells = <1>;
-
-			mux-reg-masks = <0x3 0x30>, /* 0: reg 0x3, bits 5:4 */
-					<0x3 0x40>, /* 1: reg 0x3, bit 6 */
-			idle-states = <MUX_IDLE_AS_IS>, <0>;
-		};
-	};
-
-	video-mux {
-		compatible = "video-mux";
-		mux-controls = <&mux 0>;
-
-		ports {
-			/* inputs 0..3 */
-			port@0 {
-				reg = <0>;
-			};
-			port@1 {
-				reg = <1>;
-			};
-			port@2 {
-				reg = <2>;
-			};
-			port@3 {
-				reg = <3>;
-			};
-
-			/* output */
-			port@4 {
-				reg = <4>;
-			};
-		};
-	};
diff --git a/Documentation/devicetree/bindings/mux/reg-mux.txt b/Documentation/devicetree/bindings/mux/reg-mux.txt
new file mode 100644
index 000000000000..4afd7ba73d60
--- /dev/null
+++ b/Documentation/devicetree/bindings/mux/reg-mux.txt
@@ -0,0 +1,129 @@
+Generic register bitfield-based multiplexer controller bindings
+
+Define register bitfields to be used to control multiplexers. The parent
+device tree node must be a device node to provide register r/w access.
+
+Required properties:
+- compatible : should be one of
+	"reg-mux" : if parent device of mux controller is not syscon device
+	"mmio-mux" : if parent device of mux controller is syscon device
+- #mux-control-cells : <1>
+- mux-reg-masks : an array of register offset and pre-shifted bitfield mask
+                  pairs, each describing a single mux control.
+* Standard mux-controller bindings as decribed in mux-controller.txt
+
+Optional properties:
+- idle-states : if present, the state the muxes will have when idle. The
+		special state MUX_IDLE_AS_IS is the default.
+
+The multiplexer state of each multiplexer is defined as the value of the
+bitfield described by the corresponding register offset and bitfield mask
+pair in the mux-reg-masks array.
+
+Example 1:
+The parent device of mux controller is not a syscon device.
+
+&i2c0 {
+	fpga@66 { // fpga connected to i2c
+		compatible = "fsl,lx2160aqds-fpga", "fsl,fpga-qixis-i2c",
+			     "simple-mfd";
+		reg = <0x66>;
+
+		mux: mux-controller {
+			compatible = "reg-mux";
+			#mux-control-cells = <1>;
+			mux-reg-masks = <0x54 0xf8>, /* 0: reg 0x54, bits 7:3 */
+					<0x54 0x07>; /* 1: reg 0x54, bits 2:0 */
+		};
+	};
+};
+
+mdio-mux-1 {
+	compatible = "mdio-mux-multiplexer";
+	mux-controls = <&mux 0>;
+	mdio-parent-bus = <&emdio1>;
+	#address-cells = <1>;
+	#size-cells = <0>;
+
+	mdio@0 {
+		reg = <0x0>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+	};
+
+	mdio@8 {
+		reg = <0x8>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+	};
+
+	..
+	..
+};
+
+mdio-mux-2 {
+	compatible = "mdio-mux-multiplexer";
+	mux-controls = <&mux 1>;
+	mdio-parent-bus = <&emdio2>;
+	#address-cells = <1>;
+	#size-cells = <0>;
+
+	mdio@0 {
+		reg = <0x0>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+	};
+
+	mdio@1 {
+		reg = <0x1>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+	};
+
+	..
+	..
+};
+
+Example 2:
+The parent device of mux controller is syscon device.
+
+syscon {
+	compatible = "syscon";
+
+	mux: mux-controller {
+		compatible = "mmio-mux";
+		#mux-control-cells = <1>;
+
+		mux-reg-masks = <0x3 0x30>, /* 0: reg 0x3, bits 5:4 */
+				<0x3 0x40>, /* 1: reg 0x3, bit 6 */
+		idle-states = <MUX_IDLE_AS_IS>, <0>;
+	};
+};
+
+video-mux {
+	compatible = "video-mux";
+	mux-controls = <&mux 0>;
+	#address-cells = <1>;
+	#size-cells = <0>;
+
+	ports {
+		/* inputs 0..3 */
+		port@0 {
+			reg = <0>;
+		};
+		port@1 {
+			reg = <1>;
+		};
+		port@2 {
+			reg = <2>;
+		};
+		port@3 {
+			reg = <3>;
+		};
+
+		/* output */
+		port@4 {
+			reg = <4>;
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml
new file mode 100644
index 000000000000..792196bf4abd
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/allwinner,sun4i-a10-emac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 EMAC Ethernet Controller Device Tree Bindings
+
+allOf:
+  - $ref: "ethernet-controller.yaml#"
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  compatible:
+    const: allwinner,sun4i-a10-emac
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+  allwinner,sram:
+    description: Phandle to the device SRAM
+    $ref: /schemas/types.yaml#/definitions/phandle-array
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - phy-handle
+  - allwinner,sram
+
+examples:
+  - |
+    emac: ethernet@1c0b000 {
+        compatible = "allwinner,sun4i-a10-emac";
+        reg = <0x01c0b000 0x1000>;
+        interrupts = <55>;
+        clocks = <&ahb_gates 17>;
+        phy-handle = <&phy0>;
+        allwinner,sram = <&emac_sram 1>;
+    };
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml
new file mode 100644
index 000000000000..df24d9d969f7
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/allwinner,sun4i-a10-mdio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 MDIO Controller Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+allOf:
+  - $ref: "mdio.yaml#"
+
+# Select every compatible, including the deprecated ones. This way, we
+# will be able to report a warning when we have that compatible, since
+# we will validate the node thanks to the select, but won't report it
+# as a valid value in the compatible property description
+select:
+  properties:
+    compatible:
+      enum:
+        - allwinner,sun4i-a10-mdio
+
+        # Deprecated
+        - allwinner,sun4i-mdio
+
+  required:
+    - compatible
+
+properties:
+  "#address-cells":
+    const: 1
+
+  "#size-cells":
+    const: 0
+
+  compatible:
+    const: allwinner,sun4i-a10-mdio
+
+  reg:
+    maxItems: 1
+
+  phy-supply:
+    description: PHY regulator
+
+required:
+  - compatible
+  - reg
+
+examples:
+  - |
+    mdio@1c0b080 {
+        compatible = "allwinner,sun4i-a10-mdio";
+        reg = <0x01c0b080 0x14>;
+        #address-cells = <1>;
+        #size-cells = <0>;
+        phy-supply = <&reg_emac_3v3>;
+
+        phy0: ethernet-phy@0 {
+            reg = <0>;
+        };
+    };
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt b/Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt
deleted file mode 100644
index e98118aef5f6..000000000000
--- a/Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-* Allwinner EMAC ethernet controller
-
-Required properties:
-- compatible: should be "allwinner,sun4i-a10-emac" (Deprecated:
-              "allwinner,sun4i-emac")
-- reg: address and length of the register set for the device.
-- interrupts: interrupt for the device
-- phy: see ethernet.txt file in the same directory.
-- clocks: A phandle to the reference clock for this device
-
-Example:
-
-emac: ethernet@1c0b000 {
-       compatible = "allwinner,sun4i-a10-emac";
-       reg = <0x01c0b000 0x1000>;
-       interrupts = <55>;
-       clocks = <&ahb_gates 17>;
-       phy = <&phy0>;
-};
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt b/Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt
deleted file mode 100644
index ab5b8613b0ef..000000000000
--- a/Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-* Allwinner A10 MDIO Ethernet Controller interface
-
-Required properties:
-- compatible: should be "allwinner,sun4i-a10-mdio"
-              (Deprecated: "allwinner,sun4i-mdio").
-- reg: address and length of the register set for the device.
-
-Optional properties:
-- phy-supply: phandle to a regulator if the PHY needs one
-
-Example at the SoC level:
-mdio@1c0b080 {
-	compatible = "allwinner,sun4i-a10-mdio";
-	reg = <0x01c0b080 0x14>;
-	#address-cells = <1>;
-	#size-cells = <0>;
-};
-
-And at the board level:
-
-mdio@1c0b080 {
-	phy-supply = <&reg_emac_3v3>;
-
-	phy0: ethernet-phy@0 {
-		reg = <0>;
-	};
-};
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt b/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt
deleted file mode 100644
index 8b3f953656e3..000000000000
--- a/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-* Allwinner GMAC ethernet controller
-
-This device is a platform glue layer for stmmac.
-Please see stmmac.txt for the other unchanged properties.
-
-Required properties:
- - compatible:  Should be "allwinner,sun7i-a20-gmac"
- - clocks: Should contain the GMAC main clock, and tx clock
-   The tx clock type should be "allwinner,sun7i-a20-gmac-clk"
- - clock-names: Should contain the clock names "stmmaceth",
-   and "allwinner_gmac_tx"
-
-Optional properties:
-- phy-supply: phandle to a regulator if the PHY needs one
-
-Examples:
-
-	gmac: ethernet@1c50000 {
-		compatible = "allwinner,sun7i-a20-gmac";
-		reg = <0x01c50000 0x10000>,
-		      <0x01c20164 0x4>;
-		interrupts = <0 85 1>;
-		interrupt-names = "macirq";
-		clocks = <&ahb_gates 49>, <&gmac_tx>;
-		clock-names = "stmmaceth", "allwinner_gmac_tx";
-		phy-mode = "mii";
-	};
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.yaml b/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.yaml
new file mode 100644
index 000000000000..06b1cc8bea14
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.yaml
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/allwinner,sun7i-a20-gmac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A20 GMAC Device Tree Bindings
+
+allOf:
+  - $ref: "snps,dwmac.yaml#"
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  compatible:
+    const: allwinner,sun7i-a20-gmac
+
+  interrupts:
+    maxItems: 1
+
+  interrupt-names:
+    const: macirq
+
+  clocks:
+    items:
+      - description: GMAC main clock
+      - description: TX clock
+
+  clock-names:
+    items:
+      - const: stmmaceth
+      - const: allwinner_gmac_tx
+
+  phy-supply:
+    description:
+      PHY regulator
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - interrupt-names
+  - clocks
+  - clock-names
+  - phy-mode
+
+examples:
+  - |
+    gmac: ethernet@1c50000 {
+        compatible = "allwinner,sun7i-a20-gmac";
+        reg = <0x01c50000 0x10000>;
+        interrupts = <0 85 1>;
+        interrupt-names = "macirq";
+        clocks = <&ahb_gates 49>, <&gmac_tx>;
+        clock-names = "stmmaceth", "allwinner_gmac_tx";
+        phy-mode = "mii";
+    };
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml b/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml
new file mode 100644
index 000000000000..d4084c149768
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml
@@ -0,0 +1,321 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/allwinner,sun8i-a83t-gmac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A83t EMAC Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  compatible:
+    oneOf:
+      - const: allwinner,sun8i-a83t-emac
+      - const: allwinner,sun8i-h3-emac
+      - const: allwinner,sun8i-r40-emac
+      - const: allwinner,sun8i-v3s-emac
+      - const: allwinner,sun50i-a64-emac
+      - items:
+        - const: allwinner,sun50i-h6-emac
+        - const: allwinner,sun50i-a64-emac
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  interrupt-names:
+    const: macirq
+
+  clocks:
+    maxItems: 1
+
+  clock-names:
+    const: stmmaceth
+
+  syscon:
+    $ref: /schemas/types.yaml#definitions/phandle
+    description:
+      Phandle to the device containing the EMAC or GMAC clock
+      register
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - interrupt-names
+  - clocks
+  - clock-names
+  - resets
+  - reset-names
+  - phy-handle
+  - phy-mode
+  - syscon
+
+allOf:
+  - $ref: "snps,dwmac.yaml#"
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - allwinner,sun8i-a83t-emac
+              - allwinner,sun8i-h3-emac
+              - allwinner,sun8i-v3s-emac
+              - allwinner,sun50i-a64-emac
+
+    then:
+      properties:
+        allwinner,tx-delay-ps:
+          default: 0
+          minimum: 0
+          maximum: 700
+          multipleOf: 100
+          description:
+            External RGMII PHY TX clock delay chain value in ps.
+
+        allwinner,rx-delay-ps:
+          default: 0
+          minimum: 0
+          maximum: 3100
+          multipleOf: 100
+          description:
+            External RGMII PHY TX clock delay chain value in ps.
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - allwinner,sun8i-r40-emac
+
+    then:
+      properties:
+        allwinner,rx-delay-ps:
+          default: 0
+          minimum: 0
+          maximum: 700
+          multipleOf: 100
+          description:
+            External RGMII PHY TX clock delay chain value in ps.
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - allwinner,sun8i-h3-emac
+              - allwinner,sun8i-v3s-emac
+
+    then:
+      properties:
+        allwinner,leds-active-low:
+          $ref: /schemas/types.yaml#definitions/flag
+          description:
+            EPHY LEDs are active low.
+
+        mdio-mux:
+          type: object
+
+          properties:
+            compatible:
+              const: allwinner,sun8i-h3-mdio-mux
+
+            mdio-parent-bus:
+              $ref: /schemas/types.yaml#definitions/phandle
+              description:
+                Phandle to EMAC MDIO.
+
+            mdio@1:
+              type: object
+              description: Internal MDIO Bus
+
+              properties:
+                "#address-cells":
+                  const: 1
+
+                "#size-cells":
+                  const: 0
+
+                compatible:
+                  const: allwinner,sun8i-h3-mdio-internal
+
+                reg:
+                  const: 1
+
+              patternProperties:
+                "^ethernet-phy@[0-9a-f]$":
+                  type: object
+                  description:
+                    Integrated PHY node
+
+                  properties:
+                    clocks:
+                      maxItems: 1
+
+                    resets:
+                      maxItems: 1
+
+                  required:
+                    - clocks
+                    - resets
+
+
+            mdio@2:
+              type: object
+              description: External MDIO Bus (H3 only)
+
+              properties:
+                "#address-cells":
+                  const: 1
+
+                "#size-cells":
+                  const: 0
+
+                reg:
+                  const: 2
+
+          required:
+            - compatible
+            - mdio-parent-bus
+            - mdio@1
+
+examples:
+  - |
+    ethernet@1c0b000 {
+        compatible = "allwinner,sun8i-h3-emac";
+        syscon = <&syscon>;
+        reg = <0x01c0b000 0x104>;
+        interrupts = <0 82 1>;
+        interrupt-names = "macirq";
+        resets = <&ccu 12>;
+        reset-names = "stmmaceth";
+        clocks = <&ccu 27>;
+        clock-names = "stmmaceth";
+
+        phy-handle = <&int_mii_phy>;
+        phy-mode = "mii";
+        allwinner,leds-active-low;
+
+        mdio1: mdio {
+            #address-cells = <1>;
+            #size-cells = <0>;
+            compatible = "snps,dwmac-mdio";
+        };
+
+        mdio-mux {
+            compatible = "allwinner,sun8i-h3-mdio-mux";
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            mdio-parent-bus = <&mdio1>;
+
+            int_mii_phy: mdio@1 {
+                compatible = "allwinner,sun8i-h3-mdio-internal";
+                reg = <1>;
+                #address-cells = <1>;
+                #size-cells = <0>;
+
+                ethernet-phy@1 {
+                    reg = <1>;
+                    clocks = <&ccu 67>;
+                    resets = <&ccu 39>;
+                    phy-is-integrated;
+                };
+            };
+
+            mdio@2 {
+                reg = <2>;
+                #address-cells = <1>;
+                #size-cells = <0>;
+            };
+        };
+    };
+
+  - |
+    ethernet@1c0b000 {
+        compatible = "allwinner,sun8i-h3-emac";
+        syscon = <&syscon>;
+        reg = <0x01c0b000 0x104>;
+        interrupts = <0 82 1>;
+        interrupt-names = "macirq";
+        resets = <&ccu 12>;
+        reset-names = "stmmaceth";
+        clocks = <&ccu 27>;
+        clock-names = "stmmaceth";
+
+        phy-handle = <&ext_rgmii_phy>;
+        phy-mode = "rgmii";
+        allwinner,leds-active-low;
+
+        mdio2: mdio {
+            #address-cells = <1>;
+            #size-cells = <0>;
+            compatible = "snps,dwmac-mdio";
+        };
+
+        mdio-mux {
+            compatible = "allwinner,sun8i-h3-mdio-mux";
+            #address-cells = <1>;
+            #size-cells = <0>;
+            mdio-parent-bus = <&mdio2>;
+
+            mdio@1 {
+                compatible = "allwinner,sun8i-h3-mdio-internal";
+                reg = <1>;
+                #address-cells = <1>;
+                #size-cells = <0>;
+
+                ethernet-phy@1 {
+                    reg = <1>;
+                    clocks = <&ccu 67>;
+                    resets = <&ccu 39>;
+                };
+            };
+
+            mdio@2 {
+                reg = <2>;
+                #address-cells = <1>;
+                #size-cells = <0>;
+
+                ext_rgmii_phy: ethernet-phy@1 {
+                    reg = <1>;
+                };
+            };
+        };
+    };
+
+  - |
+    ethernet@1c0b000 {
+        compatible = "allwinner,sun8i-a83t-emac";
+        syscon = <&syscon>;
+        reg = <0x01c0b000 0x104>;
+        interrupts = <0 82 1>;
+        interrupt-names = "macirq";
+        resets = <&ccu 13>;
+        reset-names = "stmmaceth";
+        clocks = <&ccu 27>;
+        clock-names = "stmmaceth";
+        phy-handle = <&ext_rgmii_phy1>;
+        phy-mode = "rgmii";
+
+        mdio {
+            compatible = "snps,dwmac-mdio";
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            ext_rgmii_phy1: ethernet-phy@1 {
+                reg = <1>;
+            };
+        };
+    };
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt b/Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt
index 188c8bd4eb67..5a0111d4de58 100644
--- a/Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt
+++ b/Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt
@@ -4,6 +4,7 @@ Required properties:
  - compatible: Should be one of the following:
    - "microchip,mcp2510" for MCP2510.
    - "microchip,mcp2515" for MCP2515.
+   - "microchip,mcp25625" for MCP25625.
  - reg: SPI chip select.
  - clocks: The clock feeding the CAN controller.
  - interrupts: Should contain IRQ line for the CAN controller.
diff --git a/Documentation/devicetree/bindings/net/can/rcar_can.txt b/Documentation/devicetree/bindings/net/can/rcar_can.txt
index 9936b9ee67c3..b463e1268ac4 100644
--- a/Documentation/devicetree/bindings/net/can/rcar_can.txt
+++ b/Documentation/devicetree/bindings/net/can/rcar_can.txt
@@ -6,6 +6,7 @@ Required properties:
 	      "renesas,can-r8a7744" if CAN controller is a part of R8A7744 SoC.
 	      "renesas,can-r8a7745" if CAN controller is a part of R8A7745 SoC.
 	      "renesas,can-r8a774a1" if CAN controller is a part of R8A774A1 SoC.
+	      "renesas,can-r8a774c0" if CAN controller is a part of R8A774C0 SoC.
 	      "renesas,can-r8a7778" if CAN controller is a part of R8A7778 SoC.
 	      "renesas,can-r8a7779" if CAN controller is a part of R8A7779 SoC.
 	      "renesas,can-r8a7790" if CAN controller is a part of R8A7790 SoC.
@@ -27,13 +28,8 @@ Required properties:
 
 - reg: physical base address and size of the R-Car CAN register map.
 - interrupts: interrupt specifier for the sole interrupt.
-- clocks: phandles and clock specifiers for 2 CAN clock inputs for RZ/G2
-	  devices.
-	  phandles and clock specifiers for 3 CAN clock inputs for every other
-	  SoC.
-- clock-names: 2 clock input name strings for RZ/G2: "clkp1", "can_clk".
-	       3 clock input name strings for every other SoC: "clkp1", "clkp2",
-	       "can_clk".
+- clocks: phandles and clock specifiers for 3 CAN clock inputs.
+- clock-names: 3 clock input name strings: "clkp1", "clkp2", and "can_clk".
 - pinctrl-0: pin control group to be used for this controller.
 - pinctrl-names: must be "default".
 
@@ -49,8 +45,7 @@ using the below properties:
 Optional properties:
 - renesas,can-clock-select: R-Car CAN Clock Source Select. Valid values are:
 			    <0x0> (default) : Peripheral clock (clkp1)
-			    <0x1> : Peripheral clock (clkp2) (not supported by
-				    RZ/G2 devices)
+			    <0x1> : Peripheral clock (clkp2)
 			    <0x3> : External input clock
 
 Example
diff --git a/Documentation/devicetree/bindings/net/can/rcar_canfd.txt b/Documentation/devicetree/bindings/net/can/rcar_canfd.txt
index ac71daa46195..32f051f6d338 100644
--- a/Documentation/devicetree/bindings/net/can/rcar_canfd.txt
+++ b/Documentation/devicetree/bindings/net/can/rcar_canfd.txt
@@ -3,11 +3,14 @@ Renesas R-Car CAN FD controller Device Tree Bindings
 
 Required properties:
 - compatible: Must contain one or more of the following:
-  - "renesas,rcar-gen3-canfd" for R-Car Gen3 compatible controller.
+  - "renesas,rcar-gen3-canfd" for R-Car Gen3 and RZ/G2 compatible controllers.
+  - "renesas,r8a774c0-canfd" for R8A774C0 (RZ/G2E) compatible controller.
   - "renesas,r8a7795-canfd" for R8A7795 (R-Car H3) compatible controller.
   - "renesas,r8a7796-canfd" for R8A7796 (R-Car M3-W) compatible controller.
+  - "renesas,r8a77965-canfd" for R8A77965 (R-Car M3-N) compatible controller.
   - "renesas,r8a77970-canfd" for R8A77970 (R-Car V3M) compatible controller.
   - "renesas,r8a77980-canfd" for R8A77980 (R-Car V3H) compatible controller.
+  - "renesas,r8a77990-canfd" for R8A77990 (R-Car E3) compatible controller.
 
   When compatible with the generic version, nodes must list the
   SoC-specific version corresponding to the platform first, followed by the
@@ -26,12 +29,13 @@ The name of the child nodes are "channel0" and "channel1" respectively. Each
 child node supports the "status" property only, which is used to
 enable/disable the respective channel.
 
-Required properties for "renesas,r8a7795-canfd" and "renesas,r8a7796-canfd"
+Required properties for "renesas,r8a774c0-canfd", "renesas,r8a7795-canfd",
+"renesas,r8a7796-canfd", "renesas,r8a77965-canfd", and "renesas,r8a77990-canfd"
 compatible:
-In R8A7795 and R8A7796 SoCs, canfd clock is a div6 clock and can be used by both
-CAN and CAN FD controller at the same time. It needs to be scaled to maximum
-frequency if any of these controllers use it. This is done using the below
-properties:
+In R8A774C0, R8A7795, R8A7796, R8A77965, and R8A77990 SoCs, canfd clock is a
+div6 clock and can be used by both CAN and CAN FD controller at the same time.
+It needs to be scaled to maximum frequency if any of these controllers use it.
+This is done using the below properties:
 
 - assigned-clocks: phandle of canfd clock.
 - assigned-clock-rates: maximum frequency of this clock.
diff --git a/Documentation/devicetree/bindings/net/dsa/ksz.txt b/Documentation/devicetree/bindings/net/dsa/ksz.txt
index e7db7268fd0f..4ac21cef370e 100644
--- a/Documentation/devicetree/bindings/net/dsa/ksz.txt
+++ b/Documentation/devicetree/bindings/net/dsa/ksz.txt
@@ -16,6 +16,8 @@ Required properties:
 Optional properties:
 
 - reset-gpios		: Should be a gpio specifier for a reset line
+- microchip,synclko-125 : Set if the output SYNCLKO frequency should be set to
+			  125MHz instead of 25MHz.
 
 See Documentation/devicetree/bindings/net/dsa/dsa.txt for a list of additional
 required and optional properties.
diff --git a/Documentation/devicetree/bindings/net/dsa/marvell.txt b/Documentation/devicetree/bindings/net/dsa/marvell.txt
index feb007af13cb..6f9538974bb9 100644
--- a/Documentation/devicetree/bindings/net/dsa/marvell.txt
+++ b/Documentation/devicetree/bindings/net/dsa/marvell.txt
@@ -21,10 +21,13 @@ which is at a different MDIO base address in different switch families.
 			  6341, 6350, 6351, 6352
 - "marvell,mv88e6190"	: Switch has base address 0x00. Use with models:
 			  6190, 6190X, 6191, 6290, 6390, 6390X
+- "marvell,mv88e6250"	: Switch has base address 0x08 or 0x18. Use with model:
+			  6250
 
 Required properties:
-- compatible		: Should be one of "marvell,mv88e6085" or
-			  "marvell,mv88e6190" as indicated above
+- compatible		: Should be one of "marvell,mv88e6085",
+			  "marvell,mv88e6190" or "marvell,mv88e6250" as
+			  indicated above
 - reg			: Address on the MII bus for the switch.
 
 Optional properties:
diff --git a/Documentation/devicetree/bindings/net/dsa/qca8k.txt b/Documentation/devicetree/bindings/net/dsa/qca8k.txt
index 93a7469e70d4..ccbc6d89325d 100644
--- a/Documentation/devicetree/bindings/net/dsa/qca8k.txt
+++ b/Documentation/devicetree/bindings/net/dsa/qca8k.txt
@@ -9,6 +9,10 @@ Required properties:
 - #size-cells: must be 0
 - #address-cells: must be 1
 
+Optional properties:
+
+- reset-gpios: GPIO to be used to reset the whole device
+
 Subnodes:
 
 The integrated switch subnode should be specified according to the binding
@@ -66,6 +70,7 @@ for the external mdio-bus configuration:
 			#address-cells = <1>;
 			#size-cells = <0>;
 
+			reset-gpios = <&gpio 42 GPIO_ACTIVE_LOW>;
 			reg = <0x10>;
 
 			ports {
@@ -123,6 +128,7 @@ for the internal master mdio-bus configuration:
 			#address-cells = <1>;
 			#size-cells = <0>;
 
+			reset-gpios = <&gpio 42 GPIO_ACTIVE_LOW>;
 			reg = <0x10>;
 
 			ports {
diff --git a/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt b/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
index ed4710c40641..bbf4a13f6d75 100644
--- a/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
+++ b/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
@@ -2,8 +2,8 @@ Vitesse VSC73xx Switches
 ========================
 
 This defines device tree bindings for the Vitesse VSC73xx switch chips.
-The Vitesse company has been acquired by Microsemi and Microsemi in turn
-acquired by Microchip but retains this vendor branding.
+The Vitesse company has been acquired by Microsemi and Microsemi has
+been acquired Microchip but retains this vendor branding.
 
 The currently supported switch chips are:
 Vitesse VSC7385 SparX-G5 5+1-port Integrated Gigabit Ethernet Switch
@@ -11,8 +11,14 @@ Vitesse VSC7388 SparX-G8 8-port Integrated Gigabit Ethernet Switch
 Vitesse VSC7395 SparX-G5e 5+1-port Integrated Gigabit Ethernet Switch
 Vitesse VSC7398 SparX-G8e 8-port Integrated Gigabit Ethernet Switch
 
-The device tree node is an SPI device so it must reside inside a SPI bus
-device tree node, see spi/spi-bus.txt
+This switch could have two different management interface.
+
+If SPI interface is used, the device tree node is an SPI device so it must
+reside inside a SPI bus device tree node, see spi/spi-bus.txt
+
+When the chip is connected to a parallel memory bus and work in memory-mapped
+I/O mode, a platform device is used to represent the vsc73xx. In this case it
+must reside inside a platform bus device tree node.
 
 Required properties:
 
@@ -38,6 +44,7 @@ and subnodes of DSA switches.
 
 Examples:
 
+SPI:
 switch@0 {
 	compatible = "vitesse,vsc7395";
 	reg = <0>;
@@ -79,3 +86,46 @@ switch@0 {
 		};
 	};
 };
+
+Platform:
+switch@2,0 {
+	#address-cells = <1>;
+	#size-cells = <1>;
+	compatible = "vitesse,vsc7385";
+	reg = <0x2 0x0 0x20000>;
+	reset-gpios = <&gpio0 12 GPIO_ACTIVE_LOW>;
+
+	ports {
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		port@0 {
+			reg = <0>;
+			label = "lan1";
+		};
+		port@1 {
+			reg = <1>;
+			label = "lan2";
+		};
+		port@2 {
+			reg = <2>;
+			label = "lan3";
+		};
+		port@3 {
+			reg = <3>;
+			label = "lan4";
+		};
+		vsc: port@6 {
+			reg = <6>;
+			label = "cpu";
+			ethernet = <&enet0>;
+			phy-mode = "rgmii";
+			fixed-link {
+				speed = <1000>;
+				full-duplex;
+				pause;
+			};
+		};
+	};
+
+};
diff --git a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
deleted file mode 100644
index 54c66d0611cb..000000000000
--- a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
+++ /dev/null
@@ -1,201 +0,0 @@
-* Allwinner sun8i GMAC ethernet controller
-
-This device is a platform glue layer for stmmac.
-Please see stmmac.txt for the other unchanged properties.
-
-Required properties:
-- compatible: must be one of the following string:
-		"allwinner,sun8i-a83t-emac"
-		"allwinner,sun8i-h3-emac"
-		"allwinner,sun8i-r40-gmac"
-		"allwinner,sun8i-v3s-emac"
-		"allwinner,sun50i-a64-emac"
-		"allwinner,sun50i-h6-emac", "allwinner-sun50i-a64-emac"
-- reg: address and length of the register for the device.
-- interrupts: interrupt for the device
-- interrupt-names: must be "macirq"
-- clocks: A phandle to the reference clock for this device
-- clock-names: must be "stmmaceth"
-- resets: A phandle to the reset control for this device
-- reset-names: must be "stmmaceth"
-- phy-mode: See ethernet.txt
-- phy-handle: See ethernet.txt
-- syscon: A phandle to the device containing the EMAC or GMAC clock register
-
-Optional properties:
-- allwinner,tx-delay-ps: TX clock delay chain value in ps.
-			 Range is 0-700. Default is 0.
-			 Unavailable for allwinner,sun8i-r40-gmac
-- allwinner,rx-delay-ps: RX clock delay chain value in ps.
-			 Range is 0-3100. Default is 0.
-			 Range is 0-700 for allwinner,sun8i-r40-gmac
-Both delay properties need to be a multiple of 100. They control the
-clock delay for external RGMII PHY. They do not apply to the internal
-PHY or external non-RGMII PHYs.
-
-Optional properties for the following compatibles:
-  - "allwinner,sun8i-h3-emac",
-  - "allwinner,sun8i-v3s-emac":
-- allwinner,leds-active-low: EPHY LEDs are active low
-
-Required child node of emac:
-- mdio bus node: should be named mdio with compatible "snps,dwmac-mdio"
-
-Required properties of the mdio node:
-- #address-cells: shall be 1
-- #size-cells: shall be 0
-
-The device node referenced by "phy" or "phy-handle" must be a child node
-of the mdio node. See phy.txt for the generic PHY bindings.
-
-The following compatibles require that the emac node have a mdio-mux child
-node called "mdio-mux":
-  - "allwinner,sun8i-h3-emac"
-  - "allwinner,sun8i-v3s-emac":
-Required properties for the mdio-mux node:
-  - compatible = "allwinner,sun8i-h3-mdio-mux"
-  - mdio-parent-bus: a phandle to EMAC mdio
-  - one child mdio for the integrated mdio with the compatible
-    "allwinner,sun8i-h3-mdio-internal"
-  - one child mdio for the external mdio if present (V3s have none)
-Required properties for the mdio-mux children node:
-  - reg: 1 for internal MDIO bus, 2 for external MDIO bus
-
-The following compatibles require a PHY node representing the integrated
-PHY, under the integrated MDIO bus node if an mdio-mux node is used:
-  - "allwinner,sun8i-h3-emac",
-  - "allwinner,sun8i-v3s-emac":
-
-Additional information regarding generic multiplexer properties can be found
-at Documentation/devicetree/bindings/net/mdio-mux.txt
-
-Required properties of the integrated phy node:
-- clocks: a phandle to the reference clock for the EPHY
-- resets: a phandle to the reset control for the EPHY
-- Must be a child of the integrated mdio
-
-Example with integrated PHY:
-emac: ethernet@1c0b000 {
-	compatible = "allwinner,sun8i-h3-emac";
-	syscon = <&syscon>;
-	reg = <0x01c0b000 0x104>;
-	interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
-	interrupt-names = "macirq";
-	resets = <&ccu RST_BUS_EMAC>;
-	reset-names = "stmmaceth";
-	clocks = <&ccu CLK_BUS_EMAC>;
-	clock-names = "stmmaceth";
-
-	phy-handle = <&int_mii_phy>;
-	phy-mode = "mii";
-	allwinner,leds-active-low;
-
-	mdio: mdio {
-		#address-cells = <1>;
-		#size-cells = <0>;
-		compatible = "snps,dwmac-mdio";
-	};
-
-	mdio-mux {
-		compatible = "mdio-mux", "allwinner,sun8i-h3-mdio-mux";
-		#address-cells = <1>;
-		#size-cells = <0>;
-
-		mdio-parent-bus = <&mdio>;
-
-		int_mdio: mdio@1 {
-			compatible = "allwinner,sun8i-h3-mdio-internal";
-			reg = <1>;
-			#address-cells = <1>;
-			#size-cells = <0>;
-			int_mii_phy: ethernet-phy@1 {
-				reg = <1>;
-				clocks = <&ccu CLK_BUS_EPHY>;
-				resets = <&ccu RST_BUS_EPHY>;
-				phy-is-integrated;
-			};
-		};
-		ext_mdio: mdio@2 {
-			reg = <2>;
-			#address-cells = <1>;
-			#size-cells = <0>;
-		};
-	};
-};
-
-Example with external PHY:
-emac: ethernet@1c0b000 {
-	compatible = "allwinner,sun8i-h3-emac";
-	syscon = <&syscon>;
-	reg = <0x01c0b000 0x104>;
-	interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
-	interrupt-names = "macirq";
-	resets = <&ccu RST_BUS_EMAC>;
-	reset-names = "stmmaceth";
-	clocks = <&ccu CLK_BUS_EMAC>;
-	clock-names = "stmmaceth";
-
-	phy-handle = <&ext_rgmii_phy>;
-	phy-mode = "rgmii";
-	allwinner,leds-active-low;
-
-	mdio: mdio {
-		#address-cells = <1>;
-		#size-cells = <0>;
-		compatible = "snps,dwmac-mdio";
-	};
-
-	mdio-mux {
-		compatible = "allwinner,sun8i-h3-mdio-mux";
-		#address-cells = <1>;
-		#size-cells = <0>;
-
-		mdio-parent-bus = <&mdio>;
-
-		int_mdio: mdio@1 {
-			compatible = "allwinner,sun8i-h3-mdio-internal";
-			reg = <1>;
-			#address-cells = <1>;
-			#size-cells = <0>;
-			int_mii_phy: ethernet-phy@1 {
-				reg = <1>;
-				clocks = <&ccu CLK_BUS_EPHY>;
-				resets = <&ccu RST_BUS_EPHY>;
-			};
-		};
-		ext_mdio: mdio@2 {
-			reg = <2>;
-			#address-cells = <1>;
-			#size-cells = <0>;
-			ext_rgmii_phy: ethernet-phy@1 {
-				reg = <1>;
-			};
-		}:
-	};
-};
-
-Example with SoC without integrated PHY
-
-emac: ethernet@1c0b000 {
-	compatible = "allwinner,sun8i-a83t-emac";
-	syscon = <&syscon>;
-	reg = <0x01c0b000 0x104>;
-	interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
-	interrupt-names = "macirq";
-	resets = <&ccu RST_BUS_EMAC>;
-	reset-names = "stmmaceth";
-	clocks = <&ccu CLK_BUS_EMAC>;
-	clock-names = "stmmaceth";
-
-	phy-handle = <&ext_rgmii_phy>;
-	phy-mode = "rgmii";
-
-	mdio: mdio {
-		compatible = "snps,dwmac-mdio";
-		#address-cells = <1>;
-		#size-cells = <0>;
-		ext_rgmii_phy: ethernet-phy@1 {
-			reg = <1>;
-		};
-	};
-};
diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
new file mode 100644
index 000000000000..0e7c31794ae6
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
@@ -0,0 +1,206 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/ethernet-controller.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ethernet Controller Generic Binding
+
+maintainers:
+  - David S. Miller <davem@davemloft.net>
+
+properties:
+  $nodename:
+    pattern: "^ethernet(@.*)?$"
+
+  local-mac-address:
+    allOf:
+      - $ref: /schemas/types.yaml#definitions/uint8-array
+      - items:
+          - minItems: 6
+            maxItems: 6
+    description:
+      Specifies the MAC address that was assigned to the network device.
+
+  mac-address:
+    allOf:
+      - $ref: /schemas/types.yaml#definitions/uint8-array
+      - items:
+          - minItems: 6
+            maxItems: 6
+    description:
+      Specifies the MAC address that was last used by the boot
+      program; should be used in cases where the MAC address assigned
+      to the device by the boot program is different from the
+      local-mac-address property.
+
+  max-frame-size:
+    $ref: /schemas/types.yaml#definitions/uint32
+    description:
+      Maximum transfer unit (IEEE defined MTU), rather than the
+      maximum frame size (there\'s contradiction in the Devicetree
+      Specification).
+
+  max-speed:
+    $ref: /schemas/types.yaml#definitions/uint32
+    description:
+      Specifies maximum speed in Mbit/s supported by the device.
+
+  nvmem-cells:
+    maxItems: 1
+    description:
+      Reference to an nvmem node for the MAC address
+
+  nvmem-cells-names:
+    const: mac-address
+
+  phy-connection-type:
+    description:
+      Operation mode of the PHY interface
+    enum:
+      # There is not a standard bus between the MAC and the PHY,
+      # something proprietary is being used to embed the PHY in the
+      # MAC.
+      - internal
+      - mii
+      - gmii
+      - sgmii
+      - qsgmii
+      - tbi
+      - rev-mii
+      - rmii
+
+      # RX and TX delays are added by the MAC when required
+      - rgmii
+
+      # RGMII with internal RX and TX delays provided by the PHY,
+      # the MAC should not add the RX or TX delays in this case
+      - rgmii-id
+
+      # RGMII with internal RX delay provided by the PHY, the MAC
+      # should not add an RX delay in this case
+      - rgmii-rxid
+
+      # RGMII with internal TX delay provided by the PHY, the MAC
+      # should not add an TX delay in this case
+      - rgmii-txid
+      - rtbi
+      - smii
+      - xgmii
+      - trgmii
+      - 1000base-x
+      - 2500base-x
+      - rxaui
+      - xaui
+
+      # 10GBASE-KR, XFI, SFI
+      - 10gbase-kr
+      - usxgmii
+
+  phy-mode:
+    $ref: "#/properties/phy-connection-type"
+
+  phy-handle:
+    $ref: /schemas/types.yaml#definitions/phandle
+    description:
+      Specifies a reference to a node representing a PHY device.
+
+  phy:
+    $ref: "#/properties/phy-handle"
+    deprecated: true
+
+  phy-device:
+    $ref: "#/properties/phy-handle"
+    deprecated: true
+
+  rx-fifo-depth:
+    $ref: /schemas/types.yaml#definitions/uint32
+    description:
+      The size of the controller\'s receive fifo in bytes. This is used
+      for components that can have configurable receive fifo sizes,
+      and is useful for determining certain configuration settings
+      such as flow control thresholds.
+
+  tx-fifo-depth:
+    $ref: /schemas/types.yaml#definitions/uint32
+    description:
+      The size of the controller\'s transmit fifo in bytes. This
+      is used for components that can have configurable fifo sizes.
+
+  managed:
+    allOf:
+      - $ref: /schemas/types.yaml#definitions/string
+      - default: auto
+        enum:
+          - auto
+          - in-band-status
+    description:
+      Specifies the PHY management type. If auto is set and fixed-link
+      is not specified, it uses MDIO for management.
+
+  fixed-link:
+    allOf:
+      - if:
+          type: array
+        then:
+          deprecated: true
+          minItems: 1
+          maxItems: 1
+          items:
+            items:
+              - minimum: 0
+                maximum: 31
+                description:
+                  Emulated PHY ID, choose any but unique to the all
+                  specified fixed-links
+
+              - enum: [0, 1]
+                description:
+                  Duplex configuration. 0 for half duplex or 1 for
+                  full duplex
+
+              - enum: [10, 100, 1000]
+                description:
+                  Link speed in Mbits/sec.
+
+              - enum: [0, 1]
+                description:
+                  Pause configuration. 0 for no pause, 1 for pause
+
+              - enum: [0, 1]
+                description:
+                  Asymmetric pause configuration. 0 for no asymmetric
+                  pause, 1 for asymmetric pause
+
+
+      - if:
+          type: object
+        then:
+          properties:
+            speed:
+              allOf:
+                - $ref: /schemas/types.yaml#definitions/uint32
+                - enum: [10, 100, 1000]
+              description:
+                Link speed.
+
+            full-duplex:
+              $ref: /schemas/types.yaml#definitions/flag
+              description:
+                Indicates that full-duplex is used. When absent, half
+                duplex is assumed.
+
+            asym-pause:
+              $ref: /schemas/types.yaml#definitions/flag
+              description:
+                Indicates that asym_pause should be enabled.
+
+            link-gpios:
+              maxItems: 1
+              description:
+                GPIO to determine if the link is up
+
+          required:
+            - speed
+
+...
diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
new file mode 100644
index 000000000000..f70f18ff821f
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/ethernet-phy.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ethernet PHY Generic Binding
+
+maintainers:
+  - Andrew Lunn <andrew@lunn.ch>
+  - Florian Fainelli <f.fainelli@gmail.com>
+  - Heiner Kallweit <hkallweit1@gmail.com>
+
+# The dt-schema tools will generate a select statement first by using
+# the compatible, and second by using the node name if any. In our
+# case, the node name is the one we want to match on, while the
+# compatible is optional.
+select:
+  properties:
+    $nodename:
+      pattern: "^ethernet-phy(@[a-f0-9]+)?$"
+
+  required:
+    - $nodename
+
+properties:
+  $nodename:
+    pattern: "^ethernet-phy(@[a-f0-9]+)?$"
+
+  compatible:
+    oneOf:
+      - const: ethernet-phy-ieee802.3-c22
+        description: PHYs that implement IEEE802.3 clause 22
+      - const: ethernet-phy-ieee802.3-c45
+        description: PHYs that implement IEEE802.3 clause 45
+      - pattern: "^ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$"
+        description:
+          If the PHY reports an incorrect ID (or none at all) then the
+          compatible list may contain an entry with the correct PHY ID
+          in the above form.
+          The first group of digits is the 16 bit Phy Identifier 1
+          register, this is the chip vendor OUI bits 3:18. The
+          second group of digits is the Phy Identifier 2 register,
+          this is the chip vendor OUI bits 19:24, followed by 10
+          bits of a vendor specific ID.
+      - items:
+          - pattern: "^ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$"
+          - const: ethernet-phy-ieee802.3-c45
+
+  reg:
+    minimum: 0
+    maximum: 31
+    description:
+      The ID number for the PHY.
+
+  interrupts:
+    maxItems: 1
+
+  max-speed:
+    enum:
+      - 10
+      - 100
+      - 1000
+      - 2500
+      - 5000
+      - 10000
+      - 20000
+      - 25000
+      - 40000
+      - 50000
+      - 56000
+      - 100000
+      - 200000
+    description:
+      Maximum PHY supported speed in Mbits / seconds.
+
+  broken-turn-around:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      If set, indicates the PHY device does not correctly release
+      the turn around line low at the end of a MDIO transaction.
+
+  enet-phy-lane-swap:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      If set, indicates the PHY will swap the TX/RX lanes to
+      compensate for the board being designed with the lanes
+      swapped.
+
+  eee-broken-100tx:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Mark the corresponding energy efficient ethernet mode as
+      broken and request the ethernet to stop advertising it.
+
+  eee-broken-1000t:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Mark the corresponding energy efficient ethernet mode as
+      broken and request the ethernet to stop advertising it.
+
+  eee-broken-10gt:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Mark the corresponding energy efficient ethernet mode as
+      broken and request the ethernet to stop advertising it.
+
+  eee-broken-1000kx:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Mark the corresponding energy efficient ethernet mode as
+      broken and request the ethernet to stop advertising it.
+
+  eee-broken-10gkx4:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Mark the corresponding energy efficient ethernet mode as
+      broken and request the ethernet to stop advertising it.
+
+  eee-broken-10gkr:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Mark the corresponding energy efficient ethernet mode as
+      broken and request the ethernet to stop advertising it.
+
+  phy-is-integrated:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      If set, indicates that the PHY is integrated into the same
+      physical package as the Ethernet MAC. If needed, muxers
+      should be configured to ensure the integrated PHY is
+      used. The absence of this property indicates the muxers
+      should be configured so that the external PHY is used.
+
+  resets:
+    maxItems: 1
+
+  reset-names:
+    const: phy
+
+  reset-gpios:
+    maxItems: 1
+    description:
+      The GPIO phandle and specifier for the PHY reset signal.
+
+  reset-assert-us:
+    description:
+      Delay after the reset was asserted in microseconds. If this
+      property is missing the delay will be skipped.
+
+  reset-deassert-us:
+    description:
+      Delay after the reset was deasserted in microseconds. If
+      this property is missing the delay will be skipped.
+
+required:
+  - reg
+
+examples:
+  - |
+    ethernet {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        ethernet-phy@0 {
+            compatible = "ethernet-phy-id0141.0e90", "ethernet-phy-ieee802.3-c45";
+            interrupt-parent = <&PIC>;
+            interrupts = <35 1>;
+            reg = <0>;
+
+            resets = <&rst 8>;
+            reset-names = "phy";
+            reset-gpios = <&gpio1 4 1>;
+            reset-assert-us = <1000>;
+            reset-deassert-us = <2000>;
+        };
+    };
diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt
index e88c3641d613..5df413d01be2 100644
--- a/Documentation/devicetree/bindings/net/ethernet.txt
+++ b/Documentation/devicetree/bindings/net/ethernet.txt
@@ -1,67 +1 @@
-The following properties are common to the Ethernet controllers:
-
-NOTE: All 'phy*' properties documented below are Ethernet specific. For the
-generic PHY 'phys' property, see
-Documentation/devicetree/bindings/phy/phy-bindings.txt.
-
-- mac-address: array of 6 bytes, specifies the MAC address that was last used by
-  the boot program; should be used in cases where the MAC address assigned to
-  the device by the boot program is different from the "local-mac-address"
-  property;
-- local-mac-address: array of 6 bytes, specifies the MAC address that was
-  assigned to the network device;
-- nvmem-cells: phandle, reference to an nvmem node for the MAC address
-- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used
-- max-speed: number, specifies maximum speed in Mbit/s supported by the device;
-- max-frame-size: number, maximum transfer unit (IEEE defined MTU), rather than
-  the maximum frame size (there's contradiction in the Devicetree
-  Specification).
-- phy-mode: string, operation mode of the PHY interface. This is now a de-facto
-  standard property; supported values are:
-  * "internal" (Internal means there is not a standard bus between the MAC and
-     the PHY, something proprietary is being used to embed the PHY in the MAC.)
-  * "mii"
-  * "gmii"
-  * "sgmii"
-  * "qsgmii"
-  * "tbi"
-  * "rev-mii"
-  * "rmii"
-  * "rgmii" (RX and TX delays are added by the MAC when required)
-  * "rgmii-id" (RGMII with internal RX and TX delays provided by the PHY, the
-     MAC should not add the RX or TX delays in this case)
-  * "rgmii-rxid" (RGMII with internal RX delay provided by the PHY, the MAC
-     should not add an RX delay in this case)
-  * "rgmii-txid" (RGMII with internal TX delay provided by the PHY, the MAC
-     should not add an TX delay in this case)
-  * "rtbi"
-  * "smii"
-  * "xgmii"
-  * "trgmii"
-  * "1000base-x",
-  * "2500base-x",
-  * "rxaui"
-  * "xaui"
-  * "10gbase-kr" (10GBASE-KR, XFI, SFI)
-- phy-connection-type: the same as "phy-mode" property but described in the
-  Devicetree Specification;
-- phy-handle: phandle, specifies a reference to a node representing a PHY
-  device; this property is described in the Devicetree Specification and so
-  preferred;
-- phy: the same as "phy-handle" property, not recommended for new bindings.
-- phy-device: the same as "phy-handle" property, not recommended for new
-  bindings.
-- rx-fifo-depth: the size of the controller's receive fifo in bytes. This
-  is used for components that can have configurable receive fifo sizes,
-  and is useful for determining certain configuration settings such as
-  flow control thresholds.
-- tx-fifo-depth: the size of the controller's transmit fifo in bytes. This
-  is used for components that can have configurable fifo sizes.
-- managed: string, specifies the PHY management type. Supported values are:
-  "auto", "in-band-status". "auto" is the default, it usess MDIO for
-  management if fixed-link is not specified.
-
-Child nodes of the Ethernet controller are typically the individual PHY devices
-connected via the MDIO bus (sometimes the MDIO bus controller is separate).
-They are described in the phy.txt file in this same directory.
-For non-MDIO PHY management see fixed-link.txt.
+This file has moved to ethernet-controller.yaml.
diff --git a/Documentation/devicetree/bindings/net/fixed-link.txt b/Documentation/devicetree/bindings/net/fixed-link.txt
index ec5d889fe3d8..5df413d01be2 100644
--- a/Documentation/devicetree/bindings/net/fixed-link.txt
+++ b/Documentation/devicetree/bindings/net/fixed-link.txt
@@ -1,54 +1 @@
-Fixed link Device Tree binding
-------------------------------
-
-Some Ethernet MACs have a "fixed link", and are not connected to a
-normal MDIO-managed PHY device. For those situations, a Device Tree
-binding allows to describe a "fixed link".
-
-Such a fixed link situation is described by creating a 'fixed-link'
-sub-node of the Ethernet MAC device node, with the following
-properties:
-
-* 'speed' (integer, mandatory), to indicate the link speed. Accepted
-  values are 10, 100 and 1000
-* 'full-duplex' (boolean, optional), to indicate that full duplex is
-  used. When absent, half duplex is assumed.
-* 'pause' (boolean, optional), to indicate that pause should be
-  enabled.
-* 'asym-pause' (boolean, optional), to indicate that asym_pause should
-  be enabled.
-* 'link-gpios' ('gpio-list', optional), to indicate if a gpio can be read
-  to determine if the link is up.
-
-Old, deprecated 'fixed-link' binding:
-
-* A 'fixed-link' property in the Ethernet MAC node, with 5 cells, of the
-  form <a b c d e> with the following accepted values:
-  - a: emulated PHY ID, choose any but but unique to the all specified
-    fixed-links, from 0 to 31
-  - b: duplex configuration: 0 for half duplex, 1 for full duplex
-  - c: link speed in Mbits/sec, accepted values are: 10, 100 and 1000
-  - d: pause configuration: 0 for no pause, 1 for pause
-  - e: asymmetric pause configuration: 0 for no asymmetric pause, 1 for
-    asymmetric pause
-
-Examples:
-
-ethernet@0 {
-	...
-	fixed-link {
-	      speed = <1000>;
-	      full-duplex;
-	};
-	...
-};
-
-ethernet@1 {
-	...
-	fixed-link {
-	      speed = <1000>;
-	      pause;
-	      link-gpios = <&gpio0 12 GPIO_ACTIVE_HIGH>;
-	};
-	...
-};
+This file has moved to ethernet-controller.yaml.
diff --git a/Documentation/devicetree/bindings/net/fsl-enetc.txt b/Documentation/devicetree/bindings/net/fsl-enetc.txt
index c812e25ae90f..25fc687419db 100644
--- a/Documentation/devicetree/bindings/net/fsl-enetc.txt
+++ b/Documentation/devicetree/bindings/net/fsl-enetc.txt
@@ -16,8 +16,8 @@ Required properties:
 In this case, the ENETC node should include a "mdio" sub-node
 that in turn should contain the "ethernet-phy" node describing the
 external phy.  Below properties are required, their bindings
-already defined in ethernet.txt or phy.txt, under
-Documentation/devicetree/bindings/net/*.
+already defined in Documentation/devicetree/bindings/net/ethernet.txt or
+Documentation/devicetree/bindings/net/phy.txt.
 
 Required:
 
@@ -51,8 +51,7 @@ Example:
 connection:
 
 In this case, the ENETC port node defines a fixed link connection,
-as specified by "fixed-link.txt", under
-Documentation/devicetree/bindings/net/*.
+as specified by Documentation/devicetree/bindings/net/fixed-link.txt.
 
 Required:
 
diff --git a/Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt b/Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt
index d1df8a00e1f3..464c0dafc617 100644
--- a/Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt
+++ b/Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt
@@ -10,6 +10,7 @@ Required properties:
 	phandle, specifies a reference to the syscon ppe node
 	port, port number connected to the controller
 	channel, recv channel start from channel * number (RX_DESC_NUM)
+	group, field in the pkg desc, in general, it is the same as the port.
 - phy-mode: see ethernet.txt [1].
 
 Optional properties:
@@ -66,7 +67,7 @@ Example:
 		reg = <0x28b0000 0x10000>;
 		interrupts = <0 413 4>;
 		phy-mode = "mii";
-		port-handle = <&ppe 31 0>;
+		port-handle = <&ppe 31 0 31>;
 	};
 
 	ge0: ethernet@2800000 {
@@ -74,7 +75,7 @@ Example:
 		reg = <0x2800000 0x10000>;
 		interrupts = <0 402 4>;
 		phy-mode = "sgmii";
-		port-handle = <&ppe 0 1>;
+		port-handle = <&ppe 0 1 0>;
 		phy-handle = <&phy0>;
 	};
 
@@ -83,6 +84,6 @@ Example:
 		reg = <0x2880000 0x10000>;
 		interrupts = <0 410 4>;
 		phy-mode = "sgmii";
-		port-handle = <&ppe 8 2>;
+		port-handle = <&ppe 8 2 8>;
 		phy-handle = <&phy1>;
 	};
diff --git a/Documentation/devicetree/bindings/net/keystone-netcp.txt b/Documentation/devicetree/bindings/net/keystone-netcp.txt
index 6262c2f293b0..24f11e042f8d 100644
--- a/Documentation/devicetree/bindings/net/keystone-netcp.txt
+++ b/Documentation/devicetree/bindings/net/keystone-netcp.txt
@@ -104,6 +104,23 @@ Required properties:
 			- 10Gb mac<->mac forced mode : 11
 ----phy-handle:	phandle to PHY device
 
+- cpts:		sub-node time synchronization (CPTS) submodule configuration
+-- clocks:	CPTS reference clock. Should point on cpts-refclk-mux clock.
+-- clock-names: should be "cpts"
+-- cpts-refclk-mux: multiplexer clock definition sub-node for CPTS reference (RFTCLK) clock
+--- #clock-cells: should be 0
+--- clocks:	list of CPTS reference (RFTCLK) clock's parents as defined in Data manual
+--- ti,mux-tbl: array of multiplexer indexes as defined in Data manual
+--- assigned-clocks: should point on cpts-refclk-mux clock
+--- assigned-clock-parents: should point on required RFTCLK clock parent to be selected
+-- cpts_clock_mult: (optional) Numerator to convert input clock ticks
+		into nanoseconds
+-- cpts_clock_shift: (optional) Denominator to convert input clock ticks into
+		nanoseconds.
+		Mult and shift will be calculated basing on CPTS
+		rftclk frequency if both cpts_clock_shift and
+		cpts_clock_mult properties are not provided.
+
 Optional properties:
 - enable-ale:	NetCP driver keeps the address learning feature in the ethernet
 		switch module disabled. This attribute is to enable the address
@@ -168,6 +185,23 @@ netcp: netcp@2000000 {
 			tx-queue = <648>;
 			tx-channel = <8>;
 
+			cpts {
+				clocks = <&cpts_refclk_mux>;
+				clock-names = "cpts";
+
+				cpts_refclk_mux: cpts-refclk-mux {
+					#clock-cells = <0>;
+					clocks = <&chipclk12>, <&chipclk13>,
+						 <&timi0>, <&timi1>,
+						 <&tsipclka>, <&tsrefclk>,
+						 <&tsipclkb>;
+					ti,mux-tbl = <0x0>, <0x1>, <0x2>,
+						<0x3>, <0x4>, <0x8>, <0xC>;
+					assigned-clocks = <&cpts_refclk_mux>;
+					assigned-clock-parents = <&chipclk12>;
+				};
+			};
+
 			interfaces {
 				gbe0: interface-0 {
 					slave-port = <0>;
@@ -219,3 +253,13 @@ netcp: netcp@2000000 {
 		};
 	};
 };
+
+CPTS board configuration - select external CPTS RFTCLK:
+
+&tsrefclk{
+	clock-frequency = <500000000>;
+};
+
+&cpts_refclk_mux {
+	assigned-clock-parents = <&tsrefclk>;
+};
diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt
index 9c5e94482b5f..63c73fafe26d 100644
--- a/Documentation/devicetree/bindings/net/macb.txt
+++ b/Documentation/devicetree/bindings/net/macb.txt
@@ -15,8 +15,11 @@ Required properties:
   Use "atmel,sama5d4-gem" for the GEM IP (10/100) available on Atmel sama5d4 SoCs.
   Use "cdns,zynq-gem" Xilinx Zynq-7xxx SoC.
   Use "cdns,zynqmp-gem" for Zynq Ultrascale+ MPSoC.
+  Use "sifive,fu540-macb" for SiFive FU540-C000 SoC.
   Or the generic form: "cdns,emac".
 - reg: Address and length of the register set for the device
+	For "sifive,fu540-macb", second range is required to specify the
+	address and length of the registers for GEMGXL Management block.
 - interrupts: Should contain macb interrupt
 - phy-mode: See ethernet.txt file in the same directory.
 - clock-names: Tuple listing input clock names.
diff --git a/Documentation/devicetree/bindings/net/marvell-bluetooth.txt b/Documentation/devicetree/bindings/net/marvell-bluetooth.txt
new file mode 100644
index 000000000000..0e2842296032
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/marvell-bluetooth.txt
@@ -0,0 +1,25 @@
+Marvell Bluetooth Chips
+-----------------------
+
+This documents the binding structure and common properties for serial
+attached Marvell Bluetooth devices. The following chips are included in
+this binding:
+
+* Marvell 88W8897 Bluetooth devices
+
+Required properties:
+ - compatible: should be:
+    "mrvl,88w8897"
+
+Optional properties:
+None so far
+
+Example:
+
+&serial0 {
+	compatible = "ns16550a";
+	...
+	bluetooth {
+		compatible = "mrvl,88w8897";
+	};
+};
diff --git a/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt b/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
index 42cd81090a2c..3f3cfc1d8d4d 100644
--- a/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
+++ b/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
@@ -16,7 +16,7 @@ Required properties:
 
 Optional properties:
 - interrupts: interrupt line number for the SMI error/done interrupt
-- clocks: phandle for up to three required clocks for the MDIO instance
+- clocks: phandle for up to four required clocks for the MDIO instance
 
 The child nodes of the MDIO driver are the individual PHY devices
 connected to this MDIO bus. They must have a "reg" property given the
diff --git a/Documentation/devicetree/bindings/net/mdio.txt b/Documentation/devicetree/bindings/net/mdio.txt
index e3e1603f256c..cf8a0105488e 100644
--- a/Documentation/devicetree/bindings/net/mdio.txt
+++ b/Documentation/devicetree/bindings/net/mdio.txt
@@ -1,37 +1 @@
-Common MDIO bus properties.
-
-These are generic properties that can apply to any MDIO bus.
-
-Optional properties:
-- reset-gpios: One GPIO that control the RESET lines of all PHYs on that MDIO
-  bus.
-- reset-delay-us: RESET pulse width in microseconds.
-
-A list of child nodes, one per device on the bus is expected. These
-should follow the generic phy.txt, or a device specific binding document.
-
-The 'reset-delay-us' indicates the RESET signal pulse width in microseconds and
-applies to all PHY devices. It must therefore be appropriately determined based
-on all PHY requirements (maximum value of all per-PHY RESET pulse widths).
-
-Example :
-This example shows these optional properties, plus other properties
-required for the TI Davinci MDIO driver.
-
-	davinci_mdio: ethernet@5c030000 {
-		compatible = "ti,davinci_mdio";
-		reg = <0x5c030000 0x1000>;
-		#address-cells = <1>;
-		#size-cells = <0>;
-
-		reset-gpios = <&gpio2 5 GPIO_ACTIVE_LOW>;
-		reset-delay-us = <2>;
-
-		ethphy0: ethernet-phy@1 {
-			reg = <1>;
-		};
-
-		ethphy1: ethernet-phy@3 {
-			reg = <3>;
-		};
-	};
+This file has moved to mdio.yaml.
diff --git a/Documentation/devicetree/bindings/net/mdio.yaml b/Documentation/devicetree/bindings/net/mdio.yaml
new file mode 100644
index 000000000000..5d08d2ffd4eb
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/mdio.yaml
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/mdio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MDIO Bus Generic Binding
+
+maintainers:
+  - Andrew Lunn <andrew@lunn.ch>
+  - Florian Fainelli <f.fainelli@gmail.com>
+  - Heiner Kallweit <hkallweit1@gmail.com>
+
+description:
+  These are generic properties that can apply to any MDIO bus. Any
+  MDIO bus must have a list of child nodes, one per device on the
+  bus. These should follow the generic ethernet-phy.yaml document, or
+  a device specific binding document.
+
+properties:
+  $nodename:
+    pattern: "^mdio(@.*)?"
+
+  "#address-cells":
+    const: 1
+
+  "#size-cells":
+    const: 0
+
+  reset-gpios:
+    maxItems: 1
+    description:
+      The phandle and specifier for the GPIO that controls the RESET
+      lines of all PHYs on that MDIO bus.
+
+  reset-delay-us:
+    description:
+      RESET pulse width in microseconds. It applies to all PHY devices
+      and must therefore be appropriately determined based on all PHY
+      requirements (maximum value of all per-PHY RESET pulse widths).
+
+patternProperties:
+  "^ethernet-phy@[0-9a-f]+$":
+    type: object
+
+    properties:
+      reg:
+        minimum: 0
+        maximum: 31
+        description:
+          The ID number for the PHY.
+
+    required:
+      - reg
+
+examples:
+  - |
+    davinci_mdio: mdio@5c030000 {
+        compatible = "ti,davinci_mdio";
+        reg = <0x5c030000 0x1000>;
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        reset-gpios = <&gpio2 5 1>;
+        reset-delay-us = <2>;
+
+        ethphy0: ethernet-phy@1 {
+            reg = <1>;
+        };
+
+        ethphy1: ethernet-phy@3 {
+            reg = <3>;
+        };
+    };
diff --git a/Documentation/devicetree/bindings/net/mediatek-bluetooth.txt b/Documentation/devicetree/bindings/net/mediatek-bluetooth.txt
index 41a7dcc80f5b..112011c51d5e 100644
--- a/Documentation/devicetree/bindings/net/mediatek-bluetooth.txt
+++ b/Documentation/devicetree/bindings/net/mediatek-bluetooth.txt
@@ -50,16 +50,33 @@ Required properties:
 		  "mediatek,mt7663u-bluetooth": for MT7663U device
 		  "mediatek,mt7668u-bluetooth": for MT7668U device
 - vcc-supply:	Main voltage regulator
+
+If the pin controller on the platform can support both pinmux and GPIO
+control such as the most of MediaTek platform. Please use below properties.
+
 - pinctrl-names: Should be "default", "runtime"
 - pinctrl-0: Should contain UART RXD low when the device is powered up to
 	     enter proper bootstrap mode.
 - pinctrl-1: Should contain UART mode pin ctrl
 
+Else, the pin controller on the platform only can support pinmux control and
+the GPIO control still has to rely on the dedicated GPIO controller such as
+a legacy MediaTek SoC, MT7621. Please use the below properties.
+
+- boot-gpios:	GPIO same to the pin as UART RXD and used to keep LOW when
+		the device is powered up to enter proper bootstrap mode when
+- pinctrl-names: Should be "default"
+- pinctrl-0: Should contain UART mode pin ctrl
+
 Optional properties:
 
 - reset-gpios:	GPIO used to reset the device whose initial state keeps low,
 		if the GPIO is missing, then board-level design should be
 		guaranteed.
+- clocks:	Should be the clock specifiers corresponding to the entry in
+		clock-names property. If the clock is missing, then board-level
+		design should be guaranteed.
+- clock-names:	Should contain "osc" entry for the external oscillator.
 - current-speed:  Current baud rate of the device whose defaults to 921600
 
 Example:
diff --git a/Documentation/devicetree/bindings/net/mediatek-net.txt b/Documentation/devicetree/bindings/net/mediatek-net.txt
index 503f2b9194e2..770ff98d4524 100644
--- a/Documentation/devicetree/bindings/net/mediatek-net.txt
+++ b/Documentation/devicetree/bindings/net/mediatek-net.txt
@@ -11,6 +11,7 @@ Required properties:
 		"mediatek,mt2701-eth": for MT2701 SoC
 		"mediatek,mt7623-eth", "mediatek,mt2701-eth": for MT7623 SoC
 		"mediatek,mt7622-eth": for MT7622 SoC
+		"mediatek,mt7629-eth": for MT7629 SoC
 - reg: Address and length of the register set for the device
 - interrupts: Should contain the three frame engines interrupts in numeric
 	order. These are fe_int0, fe_int1 and fe_int2.
@@ -19,14 +20,23 @@ Required properties:
 	"ethif", "esw", "gp2", "gp1" : For MT2701 and MT7623 SoC
         "ethif", "esw", "gp0", "gp1", "gp2", "sgmii_tx250m", "sgmii_rx250m",
 	"sgmii_cdr_ref", "sgmii_cdr_fb", "sgmii_ck", "eth2pll" : For MT7622 SoC
+	"ethif", "sgmiitop", "esw", "gp0", "gp1", "gp2", "fe", "sgmii_tx250m",
+	"sgmii_rx250m", "sgmii_cdr_ref", "sgmii_cdr_fb", "sgmii2_tx250m",
+	"sgmii2_rx250m", "sgmii2_cdr_ref", "sgmii2_cdr_fb", "sgmii_ck",
+	"eth2pll" : For MT7629 SoC.
 - power-domains: phandle to the power domain that the ethernet is part of
 - resets: Should contain phandles to the ethsys reset signals
 - reset-names: Should contain the names of reset signal listed in the resets
 		property
 		These are "fe", "gmac" and "ppe"
 - mediatek,ethsys: phandle to the syscon node that handles the port setup
-- mediatek,sgmiisys: phandle to the syscon node that handles the SGMII setup
-	which is required for those SoCs equipped with SGMII such as MT7622 SoC.
+- mediatek,infracfg: phandle to the syscon node that handles the path from
+	GMAC to PHY variants, which is required for MT7629 SoC.
+- mediatek,sgmiisys: a list of phandles to the syscon node that handles the
+	SGMII setup which is required for those SoCs equipped with SGMII such
+	as MT7622 and MT7629 SoC. And MT7622 have only one set of SGMII shared
+	by GMAC1 and GMAC2; MT7629 have two independent sets of SGMII directed
+	to GMAC1 and GMAC2, respectively.
 - mediatek,pctl: phandle to the syscon node that handles the ports slew rate
 	and driver current: only for MT2701 and MT7623 SoC
 
diff --git a/Documentation/devicetree/bindings/net/phy.txt b/Documentation/devicetree/bindings/net/phy.txt
index 9b9e5b1765dd..2399ee60caed 100644
--- a/Documentation/devicetree/bindings/net/phy.txt
+++ b/Documentation/devicetree/bindings/net/phy.txt
@@ -1,79 +1 @@
-PHY nodes
-
-Required properties:
-
- - interrupts : interrupt specifier for the sole interrupt.
- - reg : The ID number for the phy, usually a small integer
-
-Optional Properties:
-
-- compatible: Compatible list, may contain
-  "ethernet-phy-ieee802.3-c22" or "ethernet-phy-ieee802.3-c45" for
-  PHYs that implement IEEE802.3 clause 22 or IEEE802.3 clause 45
-  specifications. If neither of these are specified, the default is to
-  assume clause 22.
-
-  If the PHY reports an incorrect ID (or none at all) then the
-  "compatible" list may contain an entry with the correct PHY ID in the
-  form: "ethernet-phy-idAAAA.BBBB" where
-     AAAA - The value of the 16 bit Phy Identifier 1 register as
-            4 hex digits. This is the chip vendor OUI bits 3:18
-     BBBB - The value of the 16 bit Phy Identifier 2 register as
-            4 hex digits. This is the chip vendor OUI bits 19:24,
-            followed by 10 bits of a vendor specific ID.
-
-  The compatible list should not contain other values than those
-  listed here.
-
-- max-speed: Maximum PHY supported speed (10, 100, 1000...)
-
-- broken-turn-around: If set, indicates the PHY device does not correctly
-  release the turn around line low at the end of a MDIO transaction.
-
-- enet-phy-lane-swap: If set, indicates the PHY will swap the TX/RX lanes to
-  compensate for the board being designed with the lanes swapped.
-
-- enet-phy-lane-no-swap: If set, indicates that PHY will disable swap of the
-  TX/RX lanes. This property allows the PHY to work correcly after e.g. wrong
-  bootstrap configuration caused by issues in PCB layout design.
-
-- eee-broken-100tx:
-- eee-broken-1000t:
-- eee-broken-10gt:
-- eee-broken-1000kx:
-- eee-broken-10gkx4:
-- eee-broken-10gkr:
-  Mark the corresponding energy efficient ethernet mode as broken and
-  request the ethernet to stop advertising it.
-
-- phy-is-integrated: If set, indicates that the PHY is integrated into the same
-  physical package as the Ethernet MAC. If needed, muxers should be configured
-  to ensure the integrated PHY is used. The absence of this property indicates
-  the muxers should be configured so that the external PHY is used.
-
-- resets: The reset-controller phandle and specifier for the PHY reset signal.
-
-- reset-names: Must be "phy" for the PHY reset signal.
-
-- reset-gpios: The GPIO phandle and specifier for the PHY reset signal.
-
-- reset-assert-us: Delay after the reset was asserted in microseconds.
-  If this property is missing the delay will be skipped.
-
-- reset-deassert-us: Delay after the reset was deasserted in microseconds.
-  If this property is missing the delay will be skipped.
-
-Example:
-
-ethernet-phy@0 {
-	compatible = "ethernet-phy-id0141.0e90", "ethernet-phy-ieee802.3-c22";
-	interrupt-parent = <&PIC>;
-	interrupts = <35 IRQ_TYPE_EDGE_RISING>;
-	reg = <0>;
-
-	resets = <&rst 8>;
-	reset-names = "phy";
-	reset-gpios = <&gpio1 4 GPIO_ACTIVE_LOW>;
-	reset-assert-us = <1000>;
-	reset-deassert-us = <2000>;
-};
+This file has moved to ethernet-phy.yaml.
diff --git a/Documentation/devicetree/bindings/net/qca,ar71xx.txt b/Documentation/devicetree/bindings/net/qca,ar71xx.txt
new file mode 100644
index 000000000000..2a33e71ba72b
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/qca,ar71xx.txt
@@ -0,0 +1,45 @@
+Required properties:
+- compatible:	Should be "qca,<soc>-eth". Currently support compatibles are:
+		qca,ar7100-eth - Atheros AR7100
+		qca,ar7240-eth - Atheros AR7240
+		qca,ar7241-eth - Atheros AR7241
+		qca,ar7242-eth - Atheros AR7242
+		qca,ar9130-eth - Atheros AR9130
+		qca,ar9330-eth - Atheros AR9330
+		qca,ar9340-eth - Atheros AR9340
+		qca,qca9530-eth - Qualcomm Atheros QCA9530
+		qca,qca9550-eth - Qualcomm Atheros QCA9550
+		qca,qca9560-eth - Qualcomm Atheros QCA9560
+
+- reg : Address and length of the register set for the device
+- interrupts : Should contain eth interrupt
+- phy-mode : See ethernet.txt file in the same directory
+- clocks: the clock used by the core
+- clock-names: the names of the clock listed in the clocks property. These are
+	"eth" and "mdio".
+- resets: Should contain phandles to the reset signals
+- reset-names: Should contain the names of reset signal listed in the resets
+		property. These are "mac" and "mdio"
+
+Optional properties:
+- phy-handle : phandle to the PHY device connected to this device.
+- fixed-link : Assume a fixed link. See fixed-link.txt in the same directory.
+  Use instead of phy-handle.
+
+Optional subnodes:
+- mdio : specifies the mdio bus, used as a container for phy nodes
+  according to phy.txt in the same directory
+
+Example:
+
+ethernet@1a000000 {
+	compatible = "qca,ar9330-eth";
+	reg = <0x1a000000 0x200>;
+	interrupts = <5>;
+	resets = <&rst 13>, <&rst 23>;
+	reset-names = "mac", "mdio";
+	clocks = <&pll ATH79_CLK_AHB>, <&pll ATH79_CLK_MDIO>;
+	clock-names = "eth", "mdio";
+
+	phy-mode = "gmii";
+};
diff --git a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt
index 7ef6118abd3d..68b67d9db63a 100644
--- a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt
+++ b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt
@@ -17,6 +17,7 @@ Optional properties for compatible string qcom,qca6174-bt:
 
  - enable-gpios: gpio specifier used to enable chip
  - clocks: clock provided to the controller (SUSCLK_32KHZ)
+ - firmware-name: specify the name of nvm firmware to load
 
 Required properties for compatible string qcom,wcn399x-bt:
 
@@ -28,6 +29,7 @@ Required properties for compatible string qcom,wcn399x-bt:
 Optional properties for compatible string qcom,wcn399x-bt:
 
  - max-speed: see Documentation/devicetree/bindings/serial/slave-device.txt
+ - firmware-name: specify the name of nvm firmware to load
 
 Examples:
 
@@ -40,6 +42,7 @@ serial@7570000 {
 
 		enable-gpios = <&pm8994_gpios 19 GPIO_ACTIVE_HIGH>;
 		clocks = <&divclk4>;
+		firmware-name = "nvm_00440302.bin";
 	};
 };
 
@@ -52,5 +55,6 @@ serial@898000 {
 		vddrf-supply = <&vreg_l17a_1p3>;
 		vddch0-supply = <&vreg_l25a_3p3>;
 		max-speed = <3200000>;
+		firmware-name = "crnv21.bin";
 	};
 };
diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
new file mode 100644
index 000000000000..76fea2be66ac
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
@@ -0,0 +1,411 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/snps,dwmac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Synopsys DesignWare MAC Device Tree Bindings
+
+maintainers:
+  - Alexandre Torgue <alexandre.torgue@st.com>
+  - Giuseppe Cavallaro <peppe.cavallaro@st.com>
+  - Jose Abreu <joabreu@synopsys.com>
+
+# Select every compatible, including the deprecated ones. This way, we
+# will be able to report a warning when we have that compatible, since
+# we will validate the node thanks to the select, but won't report it
+# as a valid value in the compatible property description
+select:
+  properties:
+    compatible:
+      contains:
+        enum:
+          - snps,dwmac
+          - snps,dwmac-3.50a
+          - snps,dwmac-3.610
+          - snps,dwmac-3.70a
+          - snps,dwmac-3.710
+          - snps,dwmac-4.00
+          - snps,dwmac-4.10a
+          - snps,dwxgmac
+          - snps,dwxgmac-2.10
+
+          # Deprecated
+          - st,spear600-gmac
+
+  required:
+    - compatible
+
+properties:
+
+  # We need to include all the compatibles from schemas that will
+  # include that schemas, otherwise compatible won't validate for
+  # those.
+  compatible:
+    contains:
+      enum:
+        - allwinner,sun7i-a20-gmac
+        - allwinner,sun8i-a83t-emac
+        - allwinner,sun8i-h3-emac
+        - allwinner,sun8i-r40-emac
+        - allwinner,sun8i-v3s-emac
+        - allwinner,sun50i-a64-emac
+        - snps,dwmac
+        - snps,dwmac-3.50a
+        - snps,dwmac-3.610
+        - snps,dwmac-3.70a
+        - snps,dwmac-3.710
+        - snps,dwmac-4.00
+        - snps,dwmac-4.10a
+        - snps,dwxgmac
+        - snps,dwxgmac-2.10
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    minItems: 1
+    maxItems: 3
+    items:
+      - description: Combined signal for various interrupt events
+      - description: The interrupt to manage the remote wake-up packet detection
+      - description: The interrupt that occurs when Rx exits the LPI state
+
+  interrupt-names:
+    minItems: 1
+    maxItems: 3
+    items:
+      - const: macirq
+      - const: eth_wake_irq
+      - const: eth_lpi
+
+  clocks:
+    minItems: 1
+    maxItems: 3
+    items:
+      - description: GMAC main clock
+      - description: Peripheral registers interface clock
+      - description:
+          PTP reference clock. This clock is used for programming the
+          Timestamp Addend Register. If not passed then the system
+          clock will be used and this is fine on some platforms.
+
+  clock-names:
+    additionalItems: true
+    contains:
+      enum:
+        - stmmaceth
+        - pclk
+        - ptp_ref
+
+  resets:
+    maxItems: 1
+    description:
+      MAC Reset signal.
+
+  reset-names:
+    const: stmmaceth
+
+  snps,axi-config:
+    $ref: /schemas/types.yaml#definitions/phandle
+    description:
+      AXI BUS Mode parameters. Phandle to a node that can contain the
+      following properties
+        * snps,lpi_en, enable Low Power Interface
+        * snps,xit_frm, unlock on WoL
+        * snps,wr_osr_lmt, max write outstanding req. limit
+        * snps,rd_osr_lmt, max read outstanding req. limit
+        * snps,kbbe, do not cross 1KiB boundary.
+        * snps,blen, this is a vector of supported burst length.
+        * snps,fb, fixed-burst
+        * snps,mb, mixed-burst
+        * snps,rb, rebuild INCRx Burst
+
+  snps,mtl-rx-config:
+    $ref: /schemas/types.yaml#definitions/phandle
+    description:
+      Multiple RX Queues parameters. Phandle to a node that can
+      contain the following properties
+        * snps,rx-queues-to-use, number of RX queues to be used in the
+          driver
+        * Choose one of these RX scheduling algorithms
+          * snps,rx-sched-sp, Strict priority
+          * snps,rx-sched-wsp, Weighted Strict priority
+        * For each RX queue
+          * Choose one of these modes
+            * snps,dcb-algorithm, Queue to be enabled as DCB
+            * snps,avb-algorithm, Queue to be enabled as AVB
+          * snps,map-to-dma-channel, Channel to map
+          * Specifiy specific packet routing
+            * snps,route-avcp, AV Untagged Control packets
+            * snps,route-ptp, PTP Packets
+            * snps,route-dcbcp, DCB Control Packets
+            * snps,route-up, Untagged Packets
+            * snps,route-multi-broad, Multicast & Broadcast Packets
+          * snps,priority, RX queue priority (Range 0x0 to 0xF)
+
+  snps,mtl-tx-config:
+    $ref: /schemas/types.yaml#definitions/phandle
+    description:
+      Multiple TX Queues parameters. Phandle to a node that can
+      contain the following properties
+        * snps,tx-queues-to-use, number of TX queues to be used in the
+          driver
+        * Choose one of these TX scheduling algorithms
+          * snps,tx-sched-wrr, Weighted Round Robin
+          * snps,tx-sched-wfq, Weighted Fair Queuing
+          * snps,tx-sched-dwrr, Deficit Weighted Round Robin
+          * snps,tx-sched-sp, Strict priority
+        * For each TX queue
+          * snps,weight, TX queue weight (if using a DCB weight
+            algorithm)
+          * Choose one of these modes
+            * snps,dcb-algorithm, TX queue will be working in DCB
+            * snps,avb-algorithm, TX queue will be working in AVB
+              [Attention] Queue 0 is reserved for legacy traffic
+                          and so no AVB is available in this queue.
+          * Configure Credit Base Shaper (if AVB Mode selected)
+            * snps,send_slope, enable Low Power Interface
+            * snps,idle_slope, unlock on WoL
+            * snps,high_credit, max write outstanding req. limit
+            * snps,low_credit, max read outstanding req. limit
+          * snps,priority, TX queue priority (Range 0x0 to 0xF)
+
+  snps,reset-gpio:
+    deprecated: true
+    maxItems: 1
+    description:
+      PHY Reset GPIO
+
+  snps,reset-active-low:
+    deprecated: true
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Indicates that the PHY Reset is active low
+
+  snps,reset-delays-us:
+    deprecated: true
+    allOf:
+      - $ref: /schemas/types.yaml#definitions/uint32-array
+      - minItems: 3
+        maxItems: 3
+    description:
+      Triplet of delays. The 1st cell is reset pre-delay in micro
+      seconds. The 2nd cell is reset pulse in micro seconds. The 3rd
+      cell is reset post-delay in micro seconds.
+
+  snps,aal:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Use Address-Aligned Beats
+
+  snps,fixed-burst:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Program the DMA to use the fixed burst mode
+
+  snps,mixed-burst:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Program the DMA to use the mixed burst mode
+
+  snps,force_thresh_dma_mode:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Force DMA to use the threshold mode for both tx and rx
+
+  snps,force_sf_dma_mode:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Force DMA to use the Store and Forward mode for both tx and
+      rx. This flag is ignored if force_thresh_dma_mode is set.
+
+  snps,en-tx-lpi-clockgating:
+    $ref: /schemas/types.yaml#definitions/flag
+    description:
+      Enable gating of the MAC TX clock during TX low-power mode
+
+  snps,multicast-filter-bins:
+    $ref: /schemas/types.yaml#definitions/uint32
+    description:
+      Number of multicast filter hash bins supported by this device
+      instance
+
+  snps,perfect-filter-entries:
+    $ref: /schemas/types.yaml#definitions/uint32
+    description:
+      Number of perfect filter entries supported by this device
+      instance
+
+  snps,ps-speed:
+    $ref: /schemas/types.yaml#definitions/uint32
+    description:
+      Port selection speed that can be passed to the core when PCS
+      is supported. For example, this is used in case of SGMII and
+      MAC2MAC connection.
+
+  mdio:
+    type: object
+    description:
+      Creates and registers an MDIO bus.
+
+    properties:
+      compatible:
+        const: snps,dwmac-mdio
+
+    required:
+      - compatible
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - interrupt-names
+  - phy-mode
+
+dependencies:
+  snps,reset-active-low: ["snps,reset-gpio"]
+  snps,reset-delay-us: ["snps,reset-gpio"]
+
+allOf:
+  - $ref: "ethernet-controller.yaml#"
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - allwinner,sun7i-a20-gmac
+              - allwinner,sun8i-a83t-emac
+              - allwinner,sun8i-h3-emac
+              - allwinner,sun8i-r40-emac
+              - allwinner,sun8i-v3s-emac
+              - allwinner,sun50i-a64-emac
+              - snps,dwxgmac
+              - snps,dwxgmac-2.10
+              - st,spear600-gmac
+
+    then:
+      properties:
+        snps,pbl:
+          allOf:
+            - $ref: /schemas/types.yaml#definitions/uint32
+            - enum: [2, 4, 8]
+          description:
+            Programmable Burst Length (tx and rx)
+
+        snps,txpbl:
+          allOf:
+            - $ref: /schemas/types.yaml#definitions/uint32
+            - enum: [2, 4, 8]
+          description:
+            Tx Programmable Burst Length. If set, DMA tx will use this
+            value rather than snps,pbl.
+
+        snps,rxpbl:
+          allOf:
+            - $ref: /schemas/types.yaml#definitions/uint32
+            - enum: [2, 4, 8]
+          description:
+            Rx Programmable Burst Length. If set, DMA rx will use this
+            value rather than snps,pbl.
+
+        snps,no-pbl-x8:
+          $ref: /schemas/types.yaml#definitions/flag
+          description:
+            Don\'t multiply the pbl/txpbl/rxpbl values by 8. For core
+            rev < 3.50, don\'t multiply the values by 4.
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - allwinner,sun7i-a20-gmac
+              - allwinner,sun8i-a83t-emac
+              - allwinner,sun8i-h3-emac
+              - allwinner,sun8i-r40-emac
+              - allwinner,sun8i-v3s-emac
+              - allwinner,sun50i-a64-emac
+              - snps,dwmac-4.00
+              - snps,dwmac-4.10a
+              - snps,dwxgmac
+              - snps,dwxgmac-2.10
+              - st,spear600-gmac
+
+    then:
+        snps,tso:
+          $ref: /schemas/types.yaml#definitions/flag
+          description:
+            Enables the TSO feature otherwise it will be managed by
+            MAC HW capability register.
+
+examples:
+  - |
+    stmmac_axi_setup: stmmac-axi-config {
+        snps,wr_osr_lmt = <0xf>;
+        snps,rd_osr_lmt = <0xf>;
+        snps,blen = <256 128 64 32 0 0 0>;
+    };
+
+    mtl_rx_setup: rx-queues-config {
+        snps,rx-queues-to-use = <1>;
+        snps,rx-sched-sp;
+        queue0 {
+            snps,dcb-algorithm;
+            snps,map-to-dma-channel = <0x0>;
+            snps,priority = <0x0>;
+        };
+    };
+
+    mtl_tx_setup: tx-queues-config {
+        snps,tx-queues-to-use = <2>;
+        snps,tx-sched-wrr;
+        queue0 {
+            snps,weight = <0x10>;
+            snps,dcb-algorithm;
+            snps,priority = <0x0>;
+        };
+
+        queue1 {
+            snps,avb-algorithm;
+            snps,send_slope = <0x1000>;
+            snps,idle_slope = <0x1000>;
+            snps,high_credit = <0x3E800>;
+            snps,low_credit = <0xFFC18000>;
+            snps,priority = <0x1>;
+        };
+    };
+
+    gmac0: ethernet@e0800000 {
+        compatible = "snps,dwxgmac-2.10", "snps,dwxgmac";
+        reg = <0xe0800000 0x8000>;
+        interrupt-parent = <&vic1>;
+        interrupts = <24 23 22>;
+        interrupt-names = "macirq", "eth_wake_irq", "eth_lpi";
+        mac-address = [000000000000]; /* Filled in by U-Boot */
+        max-frame-size = <3800>;
+        phy-mode = "gmii";
+        snps,multicast-filter-bins = <256>;
+        snps,perfect-filter-entries = <128>;
+        rx-fifo-depth = <16384>;
+        tx-fifo-depth = <16384>;
+        clocks = <&clock>;
+        clock-names = "stmmaceth";
+        snps,axi-config = <&stmmac_axi_setup>;
+        snps,mtl-rx-config = <&mtl_rx_setup>;
+        snps,mtl-tx-config = <&mtl_tx_setup>;
+        mdio0 {
+            #address-cells = <1>;
+            #size-cells = <0>;
+            compatible = "snps,dwmac-mdio";
+            phy1: ethernet-phy@0 {
+                reg = <0>;
+            };
+        };
+    };
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/net/socfpga-dwmac.txt b/Documentation/devicetree/bindings/net/socfpga-dwmac.txt
index 17d6819669c8..612a8e8abc88 100644
--- a/Documentation/devicetree/bindings/net/socfpga-dwmac.txt
+++ b/Documentation/devicetree/bindings/net/socfpga-dwmac.txt
@@ -6,11 +6,17 @@ present in Documentation/devicetree/bindings/net/stmmac.txt.
 The device node has additional properties:
 
 Required properties:
- - compatible	: Should contain "altr,socfpga-stmmac" along with
-		  "snps,dwmac" and any applicable more detailed
+ - compatible	: For Cyclone5/Arria5 SoCs it should contain
+		  "altr,socfpga-stmmac". For Arria10/Agilex/Stratix10 SoCs
+		  "altr,socfpga-stmmac-a10-s10".
+		  Along with "snps,dwmac" and any applicable more detailed
 		  designware version numbers documented in stmmac.txt
  - altr,sysmgr-syscon : Should be the phandle to the system manager node that
    encompasses the glue register, the register offset, and the register shift.
+   On Cyclone5/Arria5, the register shift represents the PHY mode bits, while
+   on the Arria10/Stratix10/Agilex platforms, the register shift represents
+   bit for each emac to enable/disable signals from the FPGA fabric to the
+   EMAC modules.
  - altr,f2h_ptp_ref_clk use f2h_ptp_ref_clk instead of default eosc1 clock
    for ptp ref clk. This affects all emacs as the clock is common.
 
diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index cb694062afff..7d48782767cb 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -1,178 +1 @@
-* STMicroelectronics 10/100/1000/2500/10000 Ethernet (GMAC/XGMAC)
-
-Required properties:
-- compatible: Should be "snps,dwmac-<ip_version>", "snps,dwmac" or
-	"snps,dwxgmac-<ip_version>", "snps,dwxgmac".
-	For backwards compatibility: "st,spear600-gmac" is also supported.
-- reg: Address and length of the register set for the device
-- interrupts: Should contain the STMMAC interrupts
-- interrupt-names: Should contain a list of interrupt names corresponding to
-	the interrupts in the interrupts property, if available.
-	Valid interrupt names are:
-  - "macirq" (combined signal for various interrupt events)
-  - "eth_wake_irq" (the interrupt to manage the remote wake-up packet detection)
-  - "eth_lpi" (the interrupt that occurs when Rx exits the LPI state)
-- phy-mode: See ethernet.txt file in the same directory.
-- snps,reset-gpio 	gpio number for phy reset.
-- snps,reset-active-low boolean flag to indicate if phy reset is active low.
-- snps,reset-delays-us  is triplet of delays
-	The 1st cell is reset pre-delay in micro seconds.
-	The 2nd cell is reset pulse in micro seconds.
-	The 3rd cell is reset post-delay in micro seconds.
-
-Optional properties:
-- resets: Should contain a phandle to the STMMAC reset signal, if any
-- reset-names: Should contain the reset signal name "stmmaceth", if a
-	reset phandle is given
-- max-frame-size: See ethernet.txt file in the same directory
-- clocks: If present, the first clock should be the GMAC main clock and
-  the second clock should be peripheral's register interface clock. Further
-  clocks may be specified in derived bindings.
-- clock-names: One name for each entry in the clocks property, the
-  first one should be "stmmaceth" and the second one should be "pclk".
-- ptp_ref: this is the PTP reference clock; in case of the PTP is available
-  this clock is used for programming the Timestamp Addend Register. If not
-  passed then the system clock will be used and this is fine on some
-  platforms.
-- tx-fifo-depth: See ethernet.txt file in the same directory
-- rx-fifo-depth: See ethernet.txt file in the same directory
-- snps,pbl		Programmable Burst Length (tx and rx)
-- snps,txpbl		Tx Programmable Burst Length. Only for GMAC and newer.
-			If set, DMA tx will use this value rather than snps,pbl.
-- snps,rxpbl		Rx Programmable Burst Length. Only for GMAC and newer.
-			If set, DMA rx will use this value rather than snps,pbl.
-- snps,no-pbl-x8	Don't multiply the pbl/txpbl/rxpbl values by 8.
-			For core rev < 3.50, don't multiply the values by 4.
-- snps,aal		Address-Aligned Beats
-- snps,fixed-burst	Program the DMA to use the fixed burst mode
-- snps,mixed-burst	Program the DMA to use the mixed burst mode
-- snps,force_thresh_dma_mode	Force DMA to use the threshold mode for
-				both tx and rx
-- snps,force_sf_dma_mode	Force DMA to use the Store and Forward
-				mode for both tx and rx. This flag is
-				ignored if force_thresh_dma_mode is set.
-- snps,en-tx-lpi-clockgating	Enable gating of the MAC TX clock during
-				TX low-power mode
-- snps,multicast-filter-bins:	Number of multicast filter hash bins
-				supported by this device instance
-- snps,perfect-filter-entries:	Number of perfect filter entries supported
-				by this device instance
-- snps,ps-speed: port selection speed that can be passed to the core when
-		 PCS is supported. For example, this is used in case of SGMII
-		 and MAC2MAC connection.
-- snps,tso: this enables the TSO feature otherwise it will be managed by
-		 MAC HW capability register. Only for GMAC4 and newer.
-- AXI BUS Mode parameters: below the list of all the parameters to program the
-			   AXI register inside the DMA module:
-	- snps,lpi_en: enable Low Power Interface
-	- snps,xit_frm: unlock on WoL
-	- snps,wr_osr_lmt: max write outstanding req. limit
-	- snps,rd_osr_lmt: max read outstanding req. limit
-	- snps,kbbe: do not cross 1KiB boundary.
-	- snps,blen: this is a vector of supported burst length.
-	- snps,fb: fixed-burst
-	- snps,mb: mixed-burst
-	- snps,rb: rebuild INCRx Burst
-- mdio: with compatible = "snps,dwmac-mdio", create and register mdio bus.
-- Multiple RX Queues parameters: below the list of all the parameters to
-				 configure the multiple RX queues:
-	- snps,rx-queues-to-use: number of RX queues to be used in the driver
-	- Choose one of these RX scheduling algorithms:
-		- snps,rx-sched-sp: Strict priority
-		- snps,rx-sched-wsp: Weighted Strict priority
-	- For each RX queue
-		- Choose one of these modes:
-			- snps,dcb-algorithm: Queue to be enabled as DCB
-			- snps,avb-algorithm: Queue to be enabled as AVB
-		- snps,map-to-dma-channel: Channel to map
-		- Specifiy specific packet routing:
-			- snps,route-avcp: AV Untagged Control packets
-			- snps,route-ptp: PTP Packets
-			- snps,route-dcbcp: DCB Control Packets
-			- snps,route-up: Untagged Packets
-			- snps,route-multi-broad: Multicast & Broadcast Packets
-		- snps,priority: RX queue priority (Range: 0x0 to 0xF)
-- Multiple TX Queues parameters: below the list of all the parameters to
-				 configure the multiple TX queues:
-	- snps,tx-queues-to-use: number of TX queues to be used in the driver
-	- Choose one of these TX scheduling algorithms:
-		- snps,tx-sched-wrr: Weighted Round Robin
-		- snps,tx-sched-wfq: Weighted Fair Queuing
-		- snps,tx-sched-dwrr: Deficit Weighted Round Robin
-		- snps,tx-sched-sp: Strict priority
-	- For each TX queue
-		- snps,weight: TX queue weight (if using a DCB weight algorithm)
-		- Choose one of these modes:
-			- snps,dcb-algorithm: TX queue will be working in DCB
-			- snps,avb-algorithm: TX queue will be working in AVB
-			  [Attention] Queue 0 is reserved for legacy traffic
-			  and so no AVB is available in this queue.
-		- Configure Credit Base Shaper (if AVB Mode selected):
-			- snps,send_slope: enable Low Power Interface
-			- snps,idle_slope: unlock on WoL
-			- snps,high_credit: max write outstanding req. limit
-			- snps,low_credit: max read outstanding req. limit
-		- snps,priority: TX queue priority (Range: 0x0 to 0xF)
-Examples:
-
-	stmmac_axi_setup: stmmac-axi-config {
-		snps,wr_osr_lmt = <0xf>;
-		snps,rd_osr_lmt = <0xf>;
-		snps,blen = <256 128 64 32 0 0 0>;
-	};
-
-	mtl_rx_setup: rx-queues-config {
-		snps,rx-queues-to-use = <1>;
-		snps,rx-sched-sp;
-		queue0 {
-			snps,dcb-algorithm;
-			snps,map-to-dma-channel = <0x0>;
-			snps,priority = <0x0>;
-		};
-	};
-
-	mtl_tx_setup: tx-queues-config {
-		snps,tx-queues-to-use = <2>;
-		snps,tx-sched-wrr;
-		queue0 {
-			snps,weight = <0x10>;
-			snps,dcb-algorithm;
-			snps,priority = <0x0>;
-		};
-
-		queue1 {
-			snps,avb-algorithm;
-			snps,send_slope = <0x1000>;
-			snps,idle_slope = <0x1000>;
-			snps,high_credit = <0x3E800>;
-			snps,low_credit = <0xFFC18000>;
-			snps,priority = <0x1>;
-		};
-	};
-
-	gmac0: ethernet@e0800000 {
-		compatible = "st,spear600-gmac";
-		reg = <0xe0800000 0x8000>;
-		interrupt-parent = <&vic1>;
-		interrupts = <24 23 22>;
-		interrupt-names = "macirq", "eth_wake_irq", "eth_lpi";
-		mac-address = [000000000000]; /* Filled in by U-Boot */
-		max-frame-size = <3800>;
-		phy-mode = "gmii";
-		snps,multicast-filter-bins = <256>;
-		snps,perfect-filter-entries = <128>;
-		rx-fifo-depth = <16384>;
-		tx-fifo-depth = <16384>;
-		clocks = <&clock>;
-		clock-names = "stmmaceth";
-		snps,axi-config = <&stmmac_axi_setup>;
-		mdio0 {
-			#address-cells = <1>;
-			#size-cells = <0>;
-			compatible = "snps,dwmac-mdio";
-			phy1: ethernet-phy@0 {
-			};
-		};
-		snps,mtl-rx-config = <&mtl_rx_setup>;
-		snps,mtl-tx-config = <&mtl_tx_setup>;
-	};
+This file has moved to snps,dwmac.yaml.
diff --git a/Documentation/devicetree/bindings/net/ti,dp83867.txt b/Documentation/devicetree/bindings/net/ti,dp83867.txt
index 9ef9338aaee1..db6aa3f2215b 100644
--- a/Documentation/devicetree/bindings/net/ti,dp83867.txt
+++ b/Documentation/devicetree/bindings/net/ti,dp83867.txt
@@ -11,6 +11,14 @@ Required properties:
 	- ti,fifo-depth - Transmitt FIFO depth- see dt-bindings/net/ti-dp83867.h
 		for applicable values
 
+Note: If the interface type is PHY_INTERFACE_MODE_RGMII the TX/RX clock delays
+      will be left at their default values, as set by the PHY's pin strapping.
+      The default strapping will use a delay of 2.00 ns.  Thus
+      PHY_INTERFACE_MODE_RGMII, by default, does not behave as RGMII with no
+      internal delay, but as PHY_INTERFACE_MODE_RGMII_ID.  The device tree
+      should use "rgmii-id" if internal delays are desired as this may be
+      changed in future to cause "rgmii" mode to disable delays.
+
 Optional property:
 	- ti,min-output-impedance - MAC Interface Impedance control to set
 				    the programmable output impedance to
@@ -25,8 +33,10 @@ Optional property:
 				    software needs to take when this pin is
 				    strapped in these modes. See data manual
 				    for details.
-	- ti,clk-output-sel - Muxing option for CLK_OUT pin - see dt-bindings/net/ti-dp83867.h
-				    for applicable values.
+	- ti,clk-output-sel - Muxing option for CLK_OUT pin.  See dt-bindings/net/ti-dp83867.h
+			      for applicable values.  The CLK_OUT pin can also
+			      be disabled by this property.  When omitted, the
+			      PHY's default will be left as is.
 
 Note: ti,min-output-impedance and ti,max-output-impedance are mutually
       exclusive. When both properties are present ti,max-output-impedance
diff --git a/Documentation/devicetree/bindings/net/wiznet,w5x00.txt b/Documentation/devicetree/bindings/net/wiznet,w5x00.txt
new file mode 100644
index 000000000000..e9665798c4be
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/wiznet,w5x00.txt
@@ -0,0 +1,50 @@
+* Wiznet w5x00
+
+This is a standalone 10/100 MBit Ethernet controller with SPI interface.
+
+For each device connected to a SPI bus, define a child node within
+the SPI master node.
+
+Required properties:
+- compatible: Should be one of the following strings:
+	      "wiznet,w5100"
+	      "wiznet,w5200"
+	      "wiznet,w5500"
+- reg: Specify the SPI chip select the chip is wired to.
+- interrupts: Specify the interrupt index within the interrupt controller (referred
+              to above in interrupt-parent) and interrupt type. w5x00 natively
+              generates falling edge interrupts, however, additional board logic
+              might invert the signal.
+- pinctrl-names: List of assigned state names, see pinctrl binding documentation.
+- pinctrl-0: List of phandles to configure the GPIO pin used as interrupt line,
+             see also generic and your platform specific pinctrl binding
+             documentation.
+
+Optional properties:
+- spi-max-frequency: Maximum frequency of the SPI bus when accessing the w5500.
+  According to the w5500 datasheet, the chip allows a maximum of 80 MHz, however,
+  board designs may need to limit this value.
+- local-mac-address: See ethernet.txt in the same directory.
+
+
+Example (for Raspberry Pi with pin control stuff for GPIO irq):
+
+&spi {
+	ethernet@0: w5500@0 {
+		compatible = "wiznet,w5500";
+		reg = <0>;
+		pinctrl-names = "default";
+		pinctrl-0 = <&eth1_pins>;
+		interrupt-parent = <&gpio>;
+		interrupts = <25 IRQ_TYPE_EDGE_FALLING>;
+		spi-max-frequency = <30000000>;
+	};
+};
+
+&gpio {
+	eth1_pins: eth1_pins {
+		brcm,pins = <25>;
+		brcm,function = <0>; /* in */
+		brcm,pull = <0>; /* none */
+	};
+};
diff --git a/Documentation/devicetree/bindings/net/xilinx_axienet.txt b/Documentation/devicetree/bindings/net/xilinx_axienet.txt
index 38f9ec076743..7360617cdedb 100644
--- a/Documentation/devicetree/bindings/net/xilinx_axienet.txt
+++ b/Documentation/devicetree/bindings/net/xilinx_axienet.txt
@@ -17,8 +17,15 @@ For more details about mdio please refer phy.txt file in the same directory.
 Required properties:
 - compatible	: Must be one of "xlnx,axi-ethernet-1.00.a",
 		  "xlnx,axi-ethernet-1.01.a", "xlnx,axi-ethernet-2.01.a"
-- reg		: Address and length of the IO space.
-- interrupts	: Should be a list of two interrupt, TX and RX.
+- reg		: Address and length of the IO space, as well as the address
+                  and length of the AXI DMA controller IO space, unless
+                  axistream-connected is specified, in which case the reg
+                  attribute of the node referenced by it is used.
+- interrupts	: Should be a list of 2 or 3 interrupts: TX DMA, RX DMA,
+		  and optionally Ethernet core. If axistream-connected is
+		  specified, the TX/RX DMA interrupts should be on that node
+		  instead, and only the Ethernet core interrupt is optionally
+		  specified here.
 - phy-handle	: Should point to the external phy device.
 		  See ethernet.txt file in the same directory.
 - xlnx,rxmem	: Set to allocated memory buffer for Rx/Tx in the hardware
@@ -31,15 +38,29 @@ Optional properties:
 		  1 to enable partial TX checksum offload,
 		  2 to enable full TX checksum offload
 - xlnx,rxcsum	: Same values as xlnx,txcsum but for RX checksum offload
+- clocks	: AXI bus clock for the device. Refer to common clock bindings.
+		  Used to calculate MDIO clock divisor. If not specified, it is
+		  auto-detected from the CPU clock (but only on platforms where
+		  this is possible). New device trees should specify this - the
+		  auto detection is only for backward compatibility.
+- axistream-connected: Reference to another node which contains the resources
+		       for the AXI DMA controller used by this device.
+		       If this is specified, the DMA-related resources from that
+		       device (DMA registers and DMA TX/RX interrupts) rather
+		       than this one will be used.
+ - mdio		: Child node for MDIO bus. Must be defined if PHY access is
+		  required through the core's MDIO interface (i.e. always,
+		  unless the PHY is accessed through a different bus).
 
 Example:
 	axi_ethernet_eth: ethernet@40c00000 {
 		compatible = "xlnx,axi-ethernet-1.00.a";
 		device_type = "network";
 		interrupt-parent = <&microblaze_0_axi_intc>;
-		interrupts = <2 0>;
+		interrupts = <2 0 1>;
+		clocks = <&axi_clk>;
 		phy-mode = "mii";
-		reg = <0x40c00000 0x40000>;
+		reg = <0x40c00000 0x40000 0x50c00000 0x40000>;
 		xlnx,rxcsum = <0x2>;
 		xlnx,rxmem = <0x800>;
 		xlnx,txcsum = <0x2>;
diff --git a/Documentation/devicetree/bindings/nvmem/allwinner,sun4i-a10-sid.yaml b/Documentation/devicetree/bindings/nvmem/allwinner,sun4i-a10-sid.yaml
new file mode 100644
index 000000000000..c9efd6e2c134
--- /dev/null
+++ b/Documentation/devicetree/bindings/nvmem/allwinner,sun4i-a10-sid.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/nvmem/allwinner,sun4i-a10-sid.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 Security ID Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+allOf:
+  - $ref: "nvmem.yaml#"
+
+properties:
+  compatible:
+    enum:
+      - allwinner,sun4i-a10-sid
+      - allwinner,sun7i-a20-sid
+      - allwinner,sun8i-a83t-sid
+      - allwinner,sun8i-h3-sid
+      - allwinner,sun50i-a64-sid
+      - allwinner,sun50i-h5-sid
+      - allwinner,sun50i-h6-sid
+
+  reg:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+examples:
+  - |
+    sid@1c23800 {
+        compatible = "allwinner,sun4i-a10-sid";
+        reg = <0x01c23800 0x10>;
+    };
+
+  - |
+    sid@1c23800 {
+        compatible = "allwinner,sun7i-a20-sid";
+        reg = <0x01c23800 0x200>;
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt b/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt
deleted file mode 100644
index cfb18b4ef8f7..000000000000
--- a/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-Allwinner sunxi-sid
-
-Required properties:
-- compatible: Should be one of the following:
-  "allwinner,sun4i-a10-sid"
-  "allwinner,sun7i-a20-sid"
-  "allwinner,sun8i-a83t-sid"
-  "allwinner,sun8i-h3-sid"
-  "allwinner,sun50i-a64-sid"
-  "allwinner,sun50i-h5-sid"
-  "allwinner,sun50i-h6-sid"
-
-- reg: Should contain registers location and length
-
-= Data cells =
-Are child nodes of sunxi-sid, bindings of which as described in
-bindings/nvmem/nvmem.txt
-
-Example for sun4i:
-	sid@1c23800 {
-		compatible = "allwinner,sun4i-a10-sid";
-		reg = <0x01c23800 0x10>
-	};
-
-Example for sun7i:
-	sid@1c23800 {
-		compatible = "allwinner,sun7i-a20-sid";
-		reg = <0x01c23800 0x200>
-	};
diff --git a/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt b/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt
index 68f7d6fdd140..96ffd06d2ca8 100644
--- a/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt
+++ b/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt
@@ -15,6 +15,7 @@ Required properties:
 	"fsl,imx6sll-ocotp" (i.MX6SLL),
 	"fsl,imx7ulp-ocotp" (i.MX7ULP),
 	"fsl,imx8mq-ocotp" (i.MX8MQ),
+	"fsl,imx8mm-ocotp" (i.MX8MM),
 	followed by "syscon".
 - #address-cells : Should be 1
 - #size-cells : Should be 1
diff --git a/Documentation/devicetree/bindings/pci/83xx-512x-pci.txt b/Documentation/devicetree/bindings/pci/83xx-512x-pci.txt
index b9165b72473c..3abeecf4983f 100644
--- a/Documentation/devicetree/bindings/pci/83xx-512x-pci.txt
+++ b/Documentation/devicetree/bindings/pci/83xx-512x-pci.txt
@@ -9,7 +9,6 @@ Freescale 83xx and 512x SOCs include the same PCI bridge core.
 
 Example (MPC8313ERDB)
 	pci0: pci@e0008500 {
-		cell-index = <1>;
 		interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
 		interrupt-map = <
 				/* IDSEL 0x0E -mini PCI */
diff --git a/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt b/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt
index 12b18f82d441..efa2c8b9b85a 100644
--- a/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt
@@ -3,7 +3,7 @@ Amlogic Meson AXG DWC PCIE SoC controller
 Amlogic Meson PCIe host controller is based on the Synopsys DesignWare PCI core.
 It shares common functions with the PCIe DesignWare core driver and
 inherits common properties defined in
-Documentation/devicetree/bindings/pci/designware-pci.txt.
+Documentation/devicetree/bindings/pci/designware-pcie.txt.
 
 Additional properties are described here:
 
diff --git a/Documentation/devicetree/bindings/pci/mobiveil-pcie.txt b/Documentation/devicetree/bindings/pci/mobiveil-pcie.txt
index a618d4787dd7..64156993e052 100644
--- a/Documentation/devicetree/bindings/pci/mobiveil-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/mobiveil-pcie.txt
@@ -10,8 +10,10 @@ Required properties:
 	interrupt source. The value must be 1.
 - compatible: Should contain "mbvl,gpex40-pcie"
 - reg: Should contain PCIe registers location and length
+	Mandatory:
 	"config_axi_slave": PCIe controller registers
 	"csr_axi_slave"	  : Bridge config registers
+	Optional:
 	"gpio_slave"	  : GPIO registers to control slot power
 	"apb_csr"	  : MSI registers
 
diff --git a/Documentation/devicetree/bindings/pci/nvidia,tegra20-pcie.txt b/Documentation/devicetree/bindings/pci/nvidia,tegra20-pcie.txt
index 145a4f04194f..7939bca47861 100644
--- a/Documentation/devicetree/bindings/pci/nvidia,tegra20-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/nvidia,tegra20-pcie.txt
@@ -65,6 +65,14 @@ Required properties:
   - afi
   - pcie_x
 
+Optional properties:
+- pinctrl-names: A list of pinctrl state names. Must contain the following
+  entries:
+  - "default": active state, puts PCIe I/O out of deep power down state
+  - "idle": puts PCIe I/O into deep power down state
+- pinctrl-0: phandle for the default/active state of pin configurations.
+- pinctrl-1: phandle for the idle state of pin configurations.
+
 Required properties on Tegra124 and later (deprecated):
 - phys: Must contain an entry for each entry in phy-names.
 - phy-names: Must include the following entries:
diff --git a/Documentation/devicetree/bindings/pci/pci.txt b/Documentation/devicetree/bindings/pci/pci.txt
index 92c01db610df..2a5d91024059 100644
--- a/Documentation/devicetree/bindings/pci/pci.txt
+++ b/Documentation/devicetree/bindings/pci/pci.txt
@@ -24,6 +24,9 @@ driver implementation may support the following properties:
    unsupported link speed, for instance, trying to do training for
    unsupported link speed, etc.  Must be '4' for gen4, '3' for gen3, '2'
    for gen2, and '1' for gen1. Any other values are invalid.
+- reset-gpios:
+   If present this property specifies PERST# GPIO. Host drivers can parse the
+   GPIO and apply fundamental reset to endpoints.
 
 PCI-PCI Bridge properties
 -------------------------
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.txt b/Documentation/devicetree/bindings/pci/qcom,pcie.txt
index 1fd703bd73e0..ada80b01bf0c 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie.txt
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie.txt
@@ -10,6 +10,7 @@
 			- "qcom,pcie-msm8996" for msm8996 or apq8096
 			- "qcom,pcie-ipq4019" for ipq4019
 			- "qcom,pcie-ipq8074" for ipq8074
+			- "qcom,pcie-qcs404" for qcs404
 
 - reg:
 	Usage: required
@@ -116,6 +117,15 @@
 			- "ahb"		AHB clock
 			- "aux"		Auxiliary clock
 
+- clock-names:
+	Usage: required for qcs404
+	Value type: <stringlist>
+	Definition: Should contain the following entries
+			- "iface"	AHB clock
+			- "aux"		Auxiliary clock
+			- "master_bus"	AXI Master clock
+			- "slave_bus"	AXI Slave clock
+
 - resets:
 	Usage: required
 	Value type: <prop-encoded-array>
@@ -167,6 +177,17 @@
 			- "ahb"			AHB Reset
 			- "axi_m_sticky"	AXI Master Sticky reset
 
+- reset-names:
+	Usage: required for qcs404
+	Value type: <stringlist>
+	Definition: Should contain the following entries
+			- "axi_m"		AXI Master reset
+			- "axi_s"		AXI Slave reset
+			- "axi_m_sticky"	AXI Master Sticky reset
+			- "pipe_sticky"		PIPE sticky reset
+			- "pwr"			PWR reset
+			- "ahb"			AHB reset
+
 - power-domains:
 	Usage: required for apq8084 and msm8996/apq8096
 	Value type: <prop-encoded-array>
@@ -195,12 +216,12 @@
 	Definition: A phandle to the PCIe endpoint power supply
 
 - phys:
-	Usage: required for apq8084
+	Usage: required for apq8084 and qcs404
 	Value type: <phandle>
 	Definition: List of phandle(s) as listed in phy-names property
 
 - phy-names:
-	Usage: required for apq8084
+	Usage: required for apq8084 and qcs404
 	Value type: <stringlist>
 	Definition: Should contain "pciephy"
 
diff --git a/Documentation/devicetree/bindings/pci/rcar-pci.txt b/Documentation/devicetree/bindings/pci/rcar-pci.txt
index 6904882a0e94..45bba9f88a51 100644
--- a/Documentation/devicetree/bindings/pci/rcar-pci.txt
+++ b/Documentation/devicetree/bindings/pci/rcar-pci.txt
@@ -3,6 +3,7 @@
 Required properties:
 compatible: "renesas,pcie-r8a7743" for the R8A7743 SoC;
 	    "renesas,pcie-r8a7744" for the R8A7744 SoC;
+	    "renesas,pcie-r8a774a1" for the R8A774A1 SoC;
 	    "renesas,pcie-r8a774c0" for the R8A774C0 SoC;
 	    "renesas,pcie-r8a7779" for the R8A7779 SoC;
 	    "renesas,pcie-r8a7790" for the R8A7790 SoC;
diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt
new file mode 100644
index 000000000000..d77e3f26f9e6
--- /dev/null
+++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt
@@ -0,0 +1,21 @@
+* Freescale(NXP) IMX8 DDR performance monitor
+
+Required properties:
+
+- compatible: should be one of:
+	"fsl,imx8-ddr-pmu"
+	"fsl,imx8m-ddr-pmu"
+
+- reg: physical address and size
+
+- interrupts: single interrupt
+	generated by the control block
+
+Example:
+
+	ddr-pmu@5c020000 {
+		compatible = "fsl,imx8-ddr-pmu";
+		reg = <0x5c020000 0x10000>;
+		interrupt-parent = <&gic>;
+		interrupts = <GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>;
+	};
diff --git a/Documentation/devicetree/bindings/phy/allwinner,sun6i-a31-mipi-dphy.yaml b/Documentation/devicetree/bindings/phy/allwinner,sun6i-a31-mipi-dphy.yaml
new file mode 100644
index 000000000000..250f9d5aabdf
--- /dev/null
+++ b/Documentation/devicetree/bindings/phy/allwinner,sun6i-a31-mipi-dphy.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/allwinner,sun6i-a31-mipi-dphy.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A31 MIPI D-PHY Controller Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  "#phy-cells":
+    const: 0
+
+  compatible:
+    const: allwinner,sun6i-a31-mipi-dphy
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    items:
+      - description: Bus Clock
+      - description: Module Clock
+
+  clock-names:
+    items:
+      - const: bus
+      - const: mod
+
+  resets:
+    maxItems: 1
+
+required:
+  - "#phy-cells"
+  - compatible
+  - reg
+  - clocks
+  - clock-names
+  - resets
+
+additionalProperties: false
+
+examples:
+  - |
+    dphy0: d-phy@1ca1000 {
+        compatible = "allwinner,sun6i-a31-mipi-dphy";
+        reg = <0x01ca1000 0x1000>;
+        clocks = <&ccu 23>, <&ccu 97>;
+        clock-names = "bus", "mod";
+        resets = <&ccu 4>;
+        #phy-cells = <0>;
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/phy/mixel,mipi-dsi-phy.txt b/Documentation/devicetree/bindings/phy/mixel,mipi-dsi-phy.txt
new file mode 100644
index 000000000000..9b23407233c0
--- /dev/null
+++ b/Documentation/devicetree/bindings/phy/mixel,mipi-dsi-phy.txt
@@ -0,0 +1,29 @@
+Mixel DSI PHY for i.MX8
+
+The Mixel MIPI-DSI PHY IP block is e.g. found on i.MX8 platforms (along the
+MIPI-DSI IP from Northwest Logic). It represents the physical layer for the
+electrical signals for DSI.
+
+Required properties:
+- compatible: Must be:
+  - "fsl,imx8mq-mipi-dphy"
+- clocks: Must contain an entry for each entry in clock-names.
+- clock-names: Must contain the following entries:
+  - "phy_ref": phandle and specifier referring to the DPHY ref clock
+- reg: the register range of the PHY controller
+- #phy-cells: number of cells in PHY, as defined in
+  Documentation/devicetree/bindings/phy/phy-bindings.txt
+  this must be <0>
+
+Optional properties:
+- power-domains: phandle to power domain
+
+Example:
+	dphy: dphy@30a0030 {
+		compatible = "fsl,imx8mq-mipi-dphy";
+		clocks = <&clk IMX8MQ_CLK_DSI_PHY_REF>;
+		clock-names = "phy_ref";
+		reg = <0x30a00300 0x100>;
+		power-domains = <&pd_mipi0>;
+		#phy-cells = <0>;
+        };
diff --git a/Documentation/devicetree/bindings/phy/mxs-usb-phy.txt b/Documentation/devicetree/bindings/phy/mxs-usb-phy.txt
index 6ac98b3b5f57..c9f5c0caf8a9 100644
--- a/Documentation/devicetree/bindings/phy/mxs-usb-phy.txt
+++ b/Documentation/devicetree/bindings/phy/mxs-usb-phy.txt
@@ -7,6 +7,7 @@ Required properties:
 	* "fsl,imx6sl-usbphy" for imx6sl
 	* "fsl,vf610-usbphy" for Vybrid vf610
 	* "fsl,imx6sx-usbphy" for imx6sx
+	* "fsl,imx7ulp-usbphy" for imx7ulp
   "fsl,imx23-usbphy" is still a fallback for other strings
 - reg: Should contain registers location and length
 - interrupts: Should contain phy interrupt
@@ -23,7 +24,7 @@ Optional properties:
   the 17.78mA TX reference current. Default: 100
 
 Example:
-usbphy1: usbphy@20c9000 {
+usbphy1: usb-phy@20c9000 {
 	compatible = "fsl,imx6q-usbphy", "fsl,imx23-usbphy";
 	reg = <0x020c9000 0x1000>;
 	interrupts = <0 44 0x04>;
diff --git a/Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt b/Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt
index daedb15f322e..9fb682e47c29 100644
--- a/Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt
+++ b/Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt
@@ -42,6 +42,18 @@ Required properties:
 - reset-names: Must include the following entries:
   - "padctl"
 
+For Tegra124:
+- avdd-pll-utmip-supply: UTMI PLL power supply. Must supply 1.8 V.
+- avdd-pll-erefe-supply: PLLE reference PLL power supply. Must supply 1.05 V.
+- avdd-pex-pll-supply: PCIe/USB3 PLL power supply. Must supply 1.05 V.
+- hvdd-pex-pll-e-supply: High-voltage PLLE power supply. Must supply 3.3 V.
+
+For Tegra210:
+- avdd-pll-utmip-supply: UTMI PLL power supply. Must supply 1.8 V.
+- avdd-pll-uerefe-supply: PLLE reference PLL power supply. Must supply 1.05 V.
+- dvdd-pex-pll-supply: PCIe/USB3 PLL power supply. Must supply 1.05 V.
+- hvdd-pex-pll-e-supply: High-voltage PLLE power supply. Must supply 1.8 V.
+
 For Tegra186:
 - avdd-pll-erefeut-supply: UPHY brick and reference clock as well as UTMI PHY
   power supply. Must supply 1.8 V.
diff --git a/Documentation/devicetree/bindings/phy/phy-bindings.txt b/Documentation/devicetree/bindings/phy/phy-bindings.txt
index a403b81d0679..c4eb38902533 100644
--- a/Documentation/devicetree/bindings/phy/phy-bindings.txt
+++ b/Documentation/devicetree/bindings/phy/phy-bindings.txt
@@ -1,5 +1,5 @@
 This document explains only the device tree data binding. For general
-information about PHY subsystem refer to Documentation/phy.txt
+information about PHY subsystem refer to Documentation/driver-api/phy/phy.rst
 
 PHY device node
 ===============
diff --git a/Documentation/devicetree/bindings/phy/phy-pxa-usb.txt b/Documentation/devicetree/bindings/phy/phy-pxa-usb.txt
new file mode 100644
index 000000000000..d80e36a77ec5
--- /dev/null
+++ b/Documentation/devicetree/bindings/phy/phy-pxa-usb.txt
@@ -0,0 +1,18 @@
+Marvell PXA USB PHY
+-------------------
+
+Required properties:
+- compatible: one of: "marvell,mmp2-usb-phy", "marvell,pxa910-usb-phy",
+	"marvell,pxa168-usb-phy",
+- #phy-cells: must be 0
+
+Example:
+	usb-phy: usbphy@d4207000 {
+		compatible = "marvell,mmp2-usb-phy";
+		reg = <0xd4207000 0x40>;
+		#phy-cells = <0>;
+		status = "okay";
+	};
+
+This document explains the device tree binding. For general
+information about PHY subsystem refer to Documentation/driver-api/phy/phy.rst
diff --git a/Documentation/devicetree/bindings/phy/qcom-pcie2-phy.txt b/Documentation/devicetree/bindings/phy/qcom-pcie2-phy.txt
new file mode 100644
index 000000000000..30064253f290
--- /dev/null
+++ b/Documentation/devicetree/bindings/phy/qcom-pcie2-phy.txt
@@ -0,0 +1,42 @@
+Qualcomm PCIe2 PHY controller
+=============================
+
+The Qualcomm PCIe2 PHY is a Synopsys based phy found in a number of Qualcomm
+platforms.
+
+Required properties:
+ - compatible: compatible list, should be:
+	       "qcom,qcs404-pcie2-phy", "qcom,pcie2-phy"
+
+ - reg: offset and length of the PHY register set.
+ - #phy-cells: must be 0.
+
+ - clocks: a clock-specifier pair for the "pipe" clock
+
+ - vdda-vp-supply: phandle to low voltage regulator
+ - vdda-vph-supply: phandle to high voltage regulator
+
+ - resets: reset-specifier pairs for the "phy" and "pipe" resets
+ - reset-names: list of resets, should contain:
+		"phy" and "pipe"
+
+ - clock-output-names: name of the outgoing clock signal from the PHY PLL
+ - #clock-cells: must be 0
+
+Example:
+ phy@7786000 {
+	compatible = "qcom,qcs404-pcie2-phy", "qcom,pcie2-phy";
+	reg = <0x07786000 0xb8>;
+
+	clocks = <&gcc GCC_PCIE_0_PIPE_CLK>;
+	resets = <&gcc GCC_PCIEPHY_0_PHY_BCR>,
+	         <&gcc GCC_PCIE_0_PIPE_ARES>;
+	reset-names = "phy", "pipe";
+
+	vdda-vp-supply = <&vreg_l3_1p05>;
+	vdda-vph-supply = <&vreg_l5_1p8>;
+
+	clock-output-names = "pcie_0_pipe_clk";
+	#clock-cells = <0>;
+	#phy-cells = <0>;
+ };
diff --git a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt
index d46188f450bf..503a8cfb3184 100644
--- a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt
+++ b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt
@@ -1,10 +1,12 @@
 * Renesas R-Car generation 3 USB 2.0 PHY
 
 This file provides information on what the device node for the R-Car generation
-3, RZ/G1C and RZ/G2 USB 2.0 PHY contain.
+3, RZ/G1C, RZ/G2 and RZ/A2 USB 2.0 PHY contain.
 
 Required properties:
-- compatible: "renesas,usb2-phy-r8a77470" if the device is a part of an R8A77470
+- compatible: "renesas,usb2-phy-r7s9210" if the device is a part of an R7S9210
+	      SoC.
+	      "renesas,usb2-phy-r8a77470" if the device is a part of an R8A77470
 	      SoC.
 	      "renesas,usb2-phy-r8a774a1" if the device is a part of an R8A774A1
 	      SoC.
@@ -20,8 +22,8 @@ Required properties:
 	      R8A77990 SoC.
 	      "renesas,usb2-phy-r8a77995" if the device is a part of an
 	      R8A77995 SoC.
-	      "renesas,rcar-gen3-usb2-phy" for a generic R-Car Gen3 or RZ/G2
-	      compatible device.
+	      "renesas,rcar-gen3-usb2-phy" for a generic R-Car Gen3, RZ/G2 or
+	      RZ/A2 compatible device.
 
 	      When compatible with the generic version, nodes must list the
 	      SoC-specific version corresponding to the platform first
@@ -46,6 +48,9 @@ channel as USB OTG:
 	       regulator will be managed during the PHY power on/off sequence.
 - renesas,no-otg-pins: boolean, specify when a board does not provide proper
 		       otg pins.
+- dr_mode: string, indicates the working mode for the PHY. Can be "host",
+           "peripheral", or "otg". Should be set if otg controller is not used.
+
 
 Example (R-Car H3):
 
diff --git a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt
index cf96b7c20e4d..328585c6da58 100644
--- a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt
@@ -24,6 +24,8 @@ Required properties:
   "allwinner,sun8i-h3-pinctrl"
   "allwinner,sun8i-h3-r-pinctrl"
   "allwinner,sun8i-r40-pinctrl"
+  "allwinner,sun8i-v3-pinctrl"
+  "allwinner,sun8i-v3s-pinctrl"
   "allwinner,sun50i-a64-pinctrl"
   "allwinner,sun50i-a64-r-pinctrl"
   "allwinner,sun50i-h5-pinctrl"
diff --git a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml
new file mode 100644
index 000000000000..125599a2dc5e
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pinctrl/aspeed,ast2400-pinctrl.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ASPEED AST2400 Pin Controller
+
+maintainers:
+  - Andrew Jeffery <andrew@aj.id.au>
+
+description: |+
+  The pin controller node should be the child of a syscon node with the
+  required property:
+
+  - compatible:     Should be one of the following:
+                    "aspeed,ast2400-scu", "syscon", "simple-mfd"
+                    "aspeed,g4-scu", "syscon", "simple-mfd"
+
+  Refer to the the bindings described in
+  Documentation/devicetree/bindings/mfd/syscon.txt
+
+properties:
+  compatible:
+    enum:
+      - aspeed,ast2400-pinctrl
+      - aspeed,g4-pinctrl
+
+patternProperties:
+  '^.*$':
+    if:
+      type: object
+    then:
+      patternProperties:
+        "^function|groups$":
+          allOf:
+            - $ref: "/schemas/types.yaml#/definitions/string"
+            - enum: [ "ACPI", "ADC0", "ADC1", "ADC10", "ADC11", "ADC12", "ADC13",
+              "ADC14", "ADC15", "ADC2", "ADC3", "ADC4", "ADC5", "ADC6", "ADC7",
+              "ADC8", "ADC9", "BMCINT", "DDCCLK", "DDCDAT", "EXTRST", "FLACK",
+              "FLBUSY", "FLWP", "GPID", "GPID0", "GPID2", "GPID4", "GPID6",
+              "GPIE0", "GPIE2", "GPIE4", "GPIE6", "I2C10", "I2C11", "I2C12",
+              "I2C13", "I2C14", "I2C3", "I2C4", "I2C5", "I2C6", "I2C7", "I2C8",
+              "I2C9", "LPCPD", "LPCPME", "LPCRST", "LPCSMI", "MAC1LINK",
+              "MAC2LINK", "MDIO1", "MDIO2", "NCTS1", "NCTS2", "NCTS3", "NCTS4",
+              "NDCD1", "NDCD2", "NDCD3", "NDCD4", "NDSR1", "NDSR2", "NDSR3",
+              "NDSR4", "NDTR1", "NDTR2", "NDTR3", "NDTR4", "NDTS4", "NRI1",
+              "NRI2", "NRI3", "NRI4", "NRTS1", "NRTS2", "NRTS3", "OSCCLK",
+              "PWM0", "PWM1", "PWM2", "PWM3", "PWM4", "PWM5", "PWM6", "PWM7",
+              "RGMII1", "RGMII2", "RMII1", "RMII2", "ROM16", "ROM8", "ROMCS1",
+              "ROMCS2", "ROMCS3", "ROMCS4", "RXD1", "RXD2", "RXD3", "RXD4",
+              "SALT1", "SALT2", "SALT3", "SALT4", "SD1", "SD2", "SGPMCK",
+              "SGPMI", "SGPMLD", "SGPMO", "SGPSCK", "SGPSI0", "SGPSI1", "SGPSLD",
+              "SIOONCTRL", "SIOPBI", "SIOPBO", "SIOPWREQ", "SIOPWRGD", "SIOS3",
+              "SIOS5", "SIOSCI", "SPI1", "SPI1DEBUG", "SPI1PASSTHRU", "SPICS1",
+              "TIMER3", "TIMER4", "TIMER5", "TIMER6", "TIMER7", "TIMER8", "TXD1",
+              "TXD2", "TXD3", "TXD4", "UART6", "USB11D1", "USB11H2", "USB2D1",
+              "USB2H1", "USBCKI", "VGABIOS_ROM", "VGAHS", "VGAVS", "VPI18",
+              "VPI24", "VPI30", "VPO12", "VPO24", "WDTRST1", "WDTRST2" ]
+
+required:
+  - compatible
+
+examples:
+  - |
+    syscon: scu@1e6e2000 {
+        compatible = "aspeed,ast2400-scu", "syscon", "simple-mfd";
+        reg = <0x1e6e2000 0x1a8>;
+
+        pinctrl: pinctrl {
+            compatible = "aspeed,g4-pinctrl";
+
+            pinctrl_i2c3_default: i2c3_default {
+                function = "I2C3";
+                groups = "I2C3";
+            };
+
+            pinctrl_gpioh0_unbiased_default: gpioh0 {
+                pins = "A8";
+                bias-disable;
+            };
+        };
+    };
diff --git a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml
new file mode 100644
index 000000000000..3e6d85318577
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml
@@ -0,0 +1,133 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pinctrl/aspeed,ast2500-pinctrl.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ASPEED AST2500 Pin Controller
+
+maintainers:
+  - Andrew Jeffery <andrew@aj.id.au>
+
+description: |+
+  The pin controller node should be the child of a syscon node with the
+  required property:
+
+  - compatible: 	Should be one of the following:
+  			"aspeed,ast2500-scu", "syscon", "simple-mfd"
+  			"aspeed,g5-scu", "syscon", "simple-mfd"
+
+  Refer to the the bindings described in
+  Documentation/devicetree/bindings/mfd/syscon.txt
+
+properties:
+  compatible:
+    enum:
+      - aspeed,ast2500-pinctrl
+      - aspeed,g5-pinctrl
+  aspeed,external-nodes:
+    minItems: 2
+    maxItems: 2
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/phandle-array
+    description: |
+      A cell of phandles to external controller nodes:
+      0: compatible with "aspeed,ast2500-gfx", "syscon"
+      1: compatible with "aspeed,ast2500-lhc", "syscon"
+
+patternProperties:
+  '^.*$':
+    if:
+      type: object
+    then:
+      patternProperties:
+        "^function|groups$":
+          allOf:
+            - $ref: "/schemas/types.yaml#/definitions/string"
+            - enum: [ "ACPI", "ADC0", "ADC1", "ADC10", "ADC11", "ADC12", "ADC13",
+              "ADC14", "ADC15", "ADC2", "ADC3", "ADC4", "ADC5", "ADC6", "ADC7",
+              "ADC8", "ADC9", "BMCINT", "DDCCLK", "DDCDAT", "ESPI", "FWSPICS1",
+              "FWSPICS2", "GPID0", "GPID2", "GPID4", "GPID6", "GPIE0", "GPIE2",
+              "GPIE4", "GPIE6", "I2C10", "I2C11", "I2C12", "I2C13", "I2C14",
+              "I2C3", "I2C4", "I2C5", "I2C6", "I2C7", "I2C8", "I2C9", "LAD0",
+              "LAD1", "LAD2", "LAD3", "LCLK", "LFRAME", "LPCHC", "LPCPD",
+              "LPCPLUS", "LPCPME", "LPCRST", "LPCSMI", "LSIRQ", "MAC1LINK",
+              "MAC2LINK", "MDIO1", "MDIO2", "NCTS1", "NCTS2", "NCTS3", "NCTS4",
+              "NDCD1", "NDCD2", "NDCD3", "NDCD4", "NDSR1", "NDSR2", "NDSR3",
+              "NDSR4", "NDTR1", "NDTR2", "NDTR3", "NDTR4", "NRI1", "NRI2",
+              "NRI3", "NRI4", "NRTS1", "NRTS2", "NRTS3", "NRTS4", "OSCCLK",
+              "PEWAKE", "PNOR", "PWM0", "PWM1", "PWM2", "PWM3", "PWM4", "PWM5",
+              "PWM6", "PWM7", "RGMII1", "RGMII2", "RMII1", "RMII2", "RXD1",
+              "RXD2", "RXD3", "RXD4", "SALT1", "SALT10", "SALT11", "SALT12",
+              "SALT13", "SALT14", "SALT2", "SALT3", "SALT4", "SALT5", "SALT6",
+              "SALT7", "SALT8", "SALT9", "SCL1", "SCL2", "SD1", "SD2", "SDA1",
+              "SDA2", "SGPS1", "SGPS2", "SIOONCTRL", "SIOPBI", "SIOPBO",
+              "SIOPWREQ", "SIOPWRGD", "SIOS3", "SIOS5", "SIOSCI", "SPI1",
+              "SPI1CS1", "SPI1DEBUG", "SPI1PASSTHRU", "SPI2CK", "SPI2CS0",
+              "SPI2CS1", "SPI2MISO", "SPI2MOSI", "TIMER3", "TIMER4", "TIMER5",
+              "TIMER6", "TIMER7", "TIMER8", "TXD1", "TXD2", "TXD3", "TXD4",
+              "UART6", "USB11BHID", "USB2AD", "USB2AH", "USB2BD", "USB2BH",
+              "USBCKI", "VGABIOSROM", "VGAHS", "VGAVS", "VPI24", "VPO",
+              "WDTRST1", "WDTRST2", ]
+
+required:
+  - compatible
+  - aspeed,external-nodes
+
+examples:
+  - |
+    apb {
+        compatible = "simple-bus";
+        #address-cells = <1>;
+        #size-cells = <1>;
+        ranges;
+
+        syscon: scu@1e6e2000 {
+            compatible = "aspeed,ast2500-scu", "syscon", "simple-mfd";
+            reg = <0x1e6e2000 0x1a8>;
+
+            pinctrl: pinctrl {
+                compatible = "aspeed,g5-pinctrl";
+                aspeed,external-nodes = <&gfx>, <&lhc>;
+
+                pinctrl_i2c3_default: i2c3_default {
+                    function = "I2C3";
+                    groups = "I2C3";
+                };
+
+                pinctrl_gpioh0_unbiased_default: gpioh0 {
+                    pins = "A18";
+                    bias-disable;
+                };
+            };
+        };
+
+        gfx: display@1e6e6000 {
+            compatible = "aspeed,ast2500-gfx", "syscon";
+            reg = <0x1e6e6000 0x1000>;
+        };
+    };
+
+    lpc: lpc@1e789000 {
+        compatible = "aspeed,ast2500-lpc", "simple-mfd";
+        reg = <0x1e789000 0x1000>;
+
+        #address-cells = <1>;
+        #size-cells = <1>;
+        ranges = <0x0 0x1e789000 0x1000>;
+
+        lpc_host: lpc-host@80 {
+            compatible = "aspeed,ast2500-lpc-host", "simple-mfd", "syscon";
+            reg = <0x80 0x1e0>;
+            reg-io-width = <4>;
+
+            #address-cells = <1>;
+            #size-cells = <1>;
+            ranges = <0x0 0x80 0x1e0>;
+
+            lhc: lhc@20 {
+                   compatible = "aspeed,ast2500-lhc";
+                   reg = <0x20 0x24 0x48 0x8>;
+            };
+        };
+    };
diff --git a/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt
index ed34bb1ee81c..4980776122cc 100644
--- a/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt
@@ -14,7 +14,8 @@ phrase "pin configuration node".
 The pin configuration nodes act as a container for an arbitrary number of
 subnodes. Each of these subnodes represents some desired configuration for a
 pin, a group, or a list of pins or groups. This configuration for BM1880 SoC
-includes only pinmux as there is no pinconf support available in SoC.
+includes pinmux and various pin configuration parameters, such as pull-up,
+slew rate etc...
 
 Each configuration node can consist of multiple nodes describing the pinmux
 options. The name of each subnode is not important; all subnodes should be
@@ -84,10 +85,37 @@ Required Properties:
                   gpio66, gpio67, eth1, i2s0, i2s0_mclkin, i2s1, i2s1_mclkin,
                   spi0
 
+Optional Properties:
+
+- bias-disable:  No arguments. Disable pin bias.
+- bias-pull-down: No arguments. The specified pins should be configured as
+                  pull down.
+- bias-pull-up:   No arguments. The specified pins should be configured as
+                  pull up.
+- input-schmitt-enable: No arguments: Enable schmitt trigger for the specified
+                  pins
+- input-schmitt-disable: No arguments: Disable schmitt trigger for the specified
+                  pins
+- slew-rate:      Integer. Sets slew rate for the specified pins.
+                  Valid values are:
+                  <0>  - Slow
+                  <1>  - Fast
+- drive-strength: Integer. Selects the drive strength for the specified
+                  pins in mA.
+                  Valid values are:
+                  <4>
+                  <8>
+                  <12>
+                  <16>
+                  <20>
+                  <24>
+                  <28>
+                  <32>
+
 Example:
-        pinctrl: pinctrl@50 {
+        pinctrl: pinctrl@400 {
                 compatible = "bitmain,bm1880-pinctrl";
-                reg = <0x50 0x4B0>;
+                reg = <0x400 0x120>;
 
                 pinctrl_uart0_default: uart0-default {
                         pinmux {
diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt
index 3fac0a061bcc..ac6d614d74e0 100644
--- a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt
+++ b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt
@@ -5,6 +5,9 @@ controller, and pinmux/control device.
 
 Required properties:
 - compatible: "brcm,bcm2835-gpio"
+- compatible: should be one of:
+  "brcm,bcm2835-gpio" - BCM2835 compatible pinctrl
+  "brcm,bcm7211-gpio" - BCM7211 compatible pinctrl
 - reg: Should contain the physical address of the GPIO module's registers.
 - gpio-controller: Marks the device node as a GPIO controller.
 - #gpio-cells : Should be two. The first cell is the pin number and the
diff --git a/Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt
index 524a16fca666..e4e01c05cf83 100644
--- a/Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt
@@ -12,7 +12,7 @@ Required properties in sub-nodes:
 - fsl,pins: each entry consists of 6 integers and represents the mux and config
   setting for one pin.  The first 5 integers <mux_reg conf_reg input_reg mux_val
   input_val> are specified using a PIN_FUNC_ID macro, which can be found in
-  <dt-bindings/pinctrl/imx8mm-pinfunc.h>. The last integer CONFIG is
+  <arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h>. The last integer CONFIG is
   the pad setting value like pull-up on this pin.  Please refer to i.MX8M Mini
   Reference Manual for detailed CONFIG settings.
 
diff --git a/Documentation/devicetree/bindings/pinctrl/fsl,imx8mn-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/fsl,imx8mn-pinctrl.txt
new file mode 100644
index 000000000000..330716c971b9
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/fsl,imx8mn-pinctrl.txt
@@ -0,0 +1,39 @@
+* Freescale IMX8MN IOMUX Controller
+
+Please refer to fsl,imx-pinctrl.txt and pinctrl-bindings.txt in this directory
+for common binding part and usage.
+
+Required properties:
+- compatible: "fsl,imx8mn-iomuxc"
+- reg: should contain the base physical address and size of the iomuxc
+  registers.
+
+Required properties in sub-nodes:
+- fsl,pins: each entry consists of 6 integers and represents the mux and config
+  setting for one pin.  The first 5 integers <mux_reg conf_reg input_reg mux_val
+  input_val> are specified using a PIN_FUNC_ID macro, which can be found in
+  <arch/arm64/boot/dts/freescale/imx8mn-pinfunc.h>. The last integer CONFIG is
+  the pad setting value like pull-up on this pin. Please refer to i.MX8M Nano
+  Reference Manual for detailed CONFIG settings.
+
+Examples:
+
+&uart1 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_uart1>;
+};
+
+iomuxc: pinctrl@30330000 {
+        compatible = "fsl,imx8mn-iomuxc";
+        reg = <0x0 0x30330000 0x0 0x10000>;
+
+        pinctrl_uart1: uart1grp {
+                fsl,pins = <
+			MX8MN_IOMUXC_UART1_RXD_UART1_DCE_RX	0x140
+			MX8MN_IOMUXC_UART1_TXD_UART1_DCE_TX	0x140
+			MX8MN_IOMUXC_UART3_RXD_UART1_DCE_CTS_B	0x140
+			MX8MN_IOMUXC_UART3_TXD_UART1_DCE_RTS_B	0x140
+			MX8MN_IOMUXC_SD1_DATA4_GPIO2_IO6	0x19
+                >;
+        };
+};
diff --git a/Documentation/devicetree/bindings/pinctrl/marvell,kirkwood-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/marvell,kirkwood-pinctrl.txt
index 6c0ea155b708..2932f171ee85 100644
--- a/Documentation/devicetree/bindings/pinctrl/marvell,kirkwood-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/marvell,kirkwood-pinctrl.txt
@@ -6,8 +6,8 @@ part and usage.
 Required properties:
 - compatible: "marvell,88f6180-pinctrl",
               "marvell,88f6190-pinctrl", "marvell,88f6192-pinctrl",
-              "marvell,88f6281-pinctrl", "marvell,88f6282-pinctrl"
-              "marvell,98dx4122-pinctrl"
+              "marvell,88f6281-pinctrl", "marvell,88f6282-pinctrl",
+              "marvell,98dx4122-pinctrl", "marvell,98dx1135-pinctrl"
 - reg: register specifier of MPP registers
 
 This driver supports all kirkwood variants, i.e. 88f6180, 88f619x, and 88f628x.
@@ -317,3 +317,43 @@ mpp44         44       gpio
 mpp45         45       gpio
 mpp49         49       gpio
 
+* Marvell Poncat2 98dx1135
+
+name          pins     functions
+================================================================================
+
+mpp0          0        gpio, nand(io2), spi(cs)
+mpp1          1        gpo, nand(io3), spi(mosi)
+mpp2          2        gpo, nand(io4), spi(sck)
+mpp3          3        gpo, nand(io5), spi(miso)
+mpp4          4        gpio, nand(io6), uart0(rxd)
+mpp5          5        gpo, nand(io7), uart0(txd)
+mpp6          6        sysrst(out)
+mpp7          7        gpo, spi(cs)
+mpp8          8        gpio, twsi0(sda), uart1(rts)
+mpp9          9        gpio, twsi(sck), uart1(cts)
+mpp10         10       gpo, uart0(txd)
+mpp11         11       gpio, uart0(rxd)
+mpp13         13       gpio, uart1(txd)
+mpp14         14       gpio, uart1(rxd)
+mpp15         15       gpio, uart0(rts)
+mpp16         16       gpio, uart0(cts)
+mpp17         17       gpio, nand(cle)
+mpp18         18       gpo, nand(io0)
+mpp19         19       gpo, nand(io1)
+mpp20         20       gpio
+mpp21         21       gpio
+mpp22         22       gpio
+mpp23         23       gpio
+mpp24         24       gpio
+mpp25         25       gpio
+mpp26         26       gpio
+mpp27         27       gpio
+mpp28         28       gpio, nand(ren)
+mpp29         29       gpio, nand(wen)
+mpp30         30       gpio
+mpp31         31       gpio
+mpp32         32       gpio
+mpp33         33       gpio
+mpp34         34       gpio, nand(ale)
+mpp35         35       gpio, nand(cen)
diff --git a/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt
index a47dd990a8d3..10dc4f7176ca 100644
--- a/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt
@@ -47,9 +47,19 @@ Required properties for pinmux nodes are:
 Required properties for configuration nodes:
  - pins: a list of pin names
 
-Configuration nodes support the generic properties "bias-disable",
-"bias-pull-up" and "bias-pull-down", described in file
-pinctrl-bindings.txt
+Configuration nodes support the following generic properties, as
+described in file pinctrl-bindings.txt:
+ - "bias-disable"
+ - "bias-pull-up"
+ - "bias-pull-down"
+ - "output-enable"
+ - "output-disable"
+ - "output-low"
+ - "output-high"
+
+Optional properties :
+ - drive-strength-microamp: Drive strength for the specified pins in uA.
+			    This property is only valid for G12A and newer.
 
 === Example ===
 
diff --git a/Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt
index 29b72e303ebf..51efd2085113 100644
--- a/Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt
@@ -5,7 +5,7 @@ Please refer to pinctrl-bindings.txt, ../gpio/gpio.txt, and
 pin controller, GPIO, and interrupt bindings.
 
 PIC32 'pin configuration node' is a node of a group of pins which can be
-used for a specific device or function. This node represents configuraions of
+used for a specific device or function. This node represents configurations of
 pins, optional function, and optional mux related configuration.
 
 Required properties for pin controller node:
diff --git a/Documentation/devicetree/bindings/pinctrl/nuvoton,npcm7xx-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/nuvoton,npcm7xx-pinctrl.txt
index 83f4bbac94bb..a1264cc8660d 100644
--- a/Documentation/devicetree/bindings/pinctrl/nuvoton,npcm7xx-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/nuvoton,npcm7xx-pinctrl.txt
@@ -213,4 +213,4 @@ pinctrl: pinctrl@f0800000 {
 		groups = "clkreq";
 		function = "clkreq";
 	};
-};
-\ No newline at end of file
+};
diff --git a/Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.txt b/Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.txt
new file mode 100644
index 000000000000..8763f448c376
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.txt
@@ -0,0 +1,107 @@
+NVIDIA Tegra194 pinmux controller
+
+Required properties:
+- compatible: "nvidia,tegra194-pinmux"
+- reg: Should contain a list of base address and size pairs for:
+  - first entry: The APB_MISC_GP_*_PADCTRL registers (pad control)
+  - second entry: The PINMUX_AUX_* registers (pinmux)
+
+Please refer to pinctrl-bindings.txt in this directory for details of the
+common pinctrl bindings used by client devices, including the meaning of the
+phrase "pin configuration node".
+
+Tegra's pin configuration nodes act as a container for an arbitrary number of
+subnodes. Each of these subnodes represents some desired configuration for a
+pin, a group, or a list of pins or groups. This configuration can include the
+mux function to select on those pin(s)/group(s), and various pin configuration
+parameters, such as pull-up, tristate, drive strength, etc.
+
+See the TRM to determine which properties and values apply to each pin/group.
+Macro values for property values are defined in
+include/dt-binding/pinctrl/pinctrl-tegra.h.
+
+Required subnode-properties:
+- nvidia,pins : An array of strings. Each string contains the name of a pin or
+    group. Valid values for these names are listed below.
+
+Optional subnode-properties:
+- nvidia,function: A string containing the name of the function to mux to the
+    pin or group.
+- nvidia,pull: Integer, representing the pull-down/up to apply to the pin.
+    0: none, 1: down, 2: up.
+- nvidia,tristate: Integer.
+    0: drive, 1: tristate.
+- nvidia,enable-input: Integer. Enable the pin's input path.
+    enable :TEGRA_PIN_ENABLE and
+    disable or output only: TEGRA_PIN_DISABLE.
+- nvidia,open-drain: Integer.
+    enable: TEGRA_PIN_ENABLE.
+    disable: TEGRA_PIN_DISABLE.
+- nvidia,lock: Integer. Lock the pin configuration against further changes
+    until reset.
+    enable: TEGRA_PIN_ENABLE.
+    disable: TEGRA_PIN_DISABLE.
+- nvidia,io-hv: Integer. Select high-voltage receivers.
+    normal: TEGRA_PIN_DISABLE
+    high: TEGRA_PIN_ENABLE
+- nvidia,schmitt: Integer. Enables Schmitt Trigger on the input.
+    normal: TEGRA_PIN_DISABLE
+    high: TEGRA_PIN_ENABLE
+- nvidia,drive-type: Integer. Valid range 0...3.
+- nvidia,pull-down-strength: Integer. Controls drive strength. 0 is weakest.
+    The range of valid values depends on the pingroup. See "CAL_DRVDN" in the
+    Tegra TRM.
+- nvidia,pull-up-strength: Integer. Controls drive strength. 0 is weakest.
+    The range of valid values depends on the pingroup. See "CAL_DRVUP" in the
+    Tegra TRM.
+
+Valid values for pin and group names (nvidia,pin) are:
+
+    These correspond to Tegra PADCTL_* (pinmux) registers.
+
+  Mux groups:
+
+    These correspond to Tegra PADCTL_* (pinmux) registers. Any property
+    that exists in those registers may be set for the following pin names.
+
+    pex_l5_clkreq_n_pgg0, pex_l5_rst_n_pgg1
+
+  Drive groups:
+
+    These registers controls a single pin for which a mux group exists.
+    See the list above for the pin name to use when configuring the pinmux.
+
+    pex_l5_clkreq_n_pgg0, pex_l5_rst_n_pgg1
+
+Valid values for nvidia,functions are:
+
+    pe5
+
+Power Domain:
+    pex_l5_clkreq_n_pgg0 and pex_l5_rst_n_pgg1 are part of PCIE C5 power
+    partition. Client devices must enable this partition before accessing
+    these pins here.
+
+
+Example:
+
+		tegra_pinctrl: pinmux: pinmux@2430000 {
+			compatible = "nvidia,tegra194-pinmux";
+			reg = <0x2430000 0x17000
+			       0xc300000 0x4000>;
+
+			pinctrl-names = "pex_rst";
+			pinctrl-0 = <&pex_rst_c5_out_state>;
+
+			pex_rst_c5_out_state: pex_rst_c5_out {
+				pex_rst {
+					nvidia,pins = "pex_l5_rst_n_pgg1";
+					nvidia,schmitt = <TEGRA_PIN_DISABLE>;
+					nvidia,lpdr = <TEGRA_PIN_ENABLE>;
+					nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+					nvidia,io-high-voltage = <TEGRA_PIN_ENABLE>;
+					nvidia,tristate = <TEGRA_PIN_DISABLE>;
+					nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+				};
+			};
+		};
diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt
deleted file mode 100644
index 3b7266c7c438..000000000000
--- a/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt
+++ /dev/null
@@ -1,172 +0,0 @@
-======================
-Aspeed Pin Controllers
-======================
-
-The Aspeed SoCs vary in functionality inside a generation but have a common mux
-device register layout.
-
-Required properties for g4:
-- compatible : 			Should be one of the following:
-				"aspeed,ast2400-pinctrl"
-				"aspeed,g4-pinctrl"
-
-Required properties for g5:
-- compatible : 			Should be one of the following:
-				"aspeed,ast2500-pinctrl"
-				"aspeed,g5-pinctrl"
-
-- aspeed,external-nodes:	A cell of phandles to external controller nodes:
-				0: compatible with "aspeed,ast2500-gfx", "syscon"
-				1: compatible with "aspeed,ast2500-lhc", "syscon"
-
-The pin controller node should be the child of a syscon node with the required
-property:
-
-- compatible : 		Should be one of the following:
-			"aspeed,ast2400-scu", "syscon", "simple-mfd"
-			"aspeed,g4-scu", "syscon", "simple-mfd"
-			"aspeed,ast2500-scu", "syscon", "simple-mfd"
-			"aspeed,g5-scu", "syscon", "simple-mfd"
-
-Refer to the the bindings described in
-Documentation/devicetree/bindings/mfd/syscon.txt
-
-Subnode Format
-==============
-
-The required properties of pinmux child nodes are:
-- function: the mux function to select
-- groups  : the list of groups to select with this function
-
-Required properties of pinconf child nodes are:
-- groups: A list of groups to select (either this or "pins" must be
-          specified)
-- pins  : A list of ball names as strings, eg "D14" (either this or "groups"
-          must be specified)
-
-Optional properties of pinconf child nodes are:
-- bias-disable  : disable any pin bias
-- bias-pull-down: pull down the pin
-- drive-strength: sink or source at most X mA
-
-Definitions are as specified in
-Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt, with any
-further limitations as described above.
-
-For pinmux, each mux function has only one associated pin group. Each group is
-named by its function. The following values for the function and groups
-properties are supported:
-
-aspeed,ast2400-pinctrl, aspeed,g4-pinctrl:
-
-ACPI ADC0 ADC1 ADC10 ADC11 ADC12 ADC13 ADC14 ADC15 ADC2 ADC3 ADC4 ADC5 ADC6
-ADC7 ADC8 ADC9 BMCINT DDCCLK DDCDAT EXTRST FLACK FLBUSY FLWP GPID GPID0 GPID2
-GPID4 GPID6 GPIE0 GPIE2 GPIE4 GPIE6 I2C10 I2C11 I2C12 I2C13 I2C14 I2C3 I2C4
-I2C5 I2C6 I2C7 I2C8 I2C9 LPCPD LPCPME LPCRST LPCSMI MAC1LINK MAC2LINK MDIO1
-MDIO2 NCTS1 NCTS2 NCTS3 NCTS4 NDCD1 NDCD2 NDCD3 NDCD4 NDSR1 NDSR2 NDSR3 NDSR4
-NDTR1 NDTR2 NDTR3 NDTR4 NDTS4 NRI1 NRI2 NRI3 NRI4 NRTS1 NRTS2 NRTS3 OSCCLK PWM0
-PWM1 PWM2 PWM3 PWM4 PWM5 PWM6 PWM7 RGMII1 RGMII2 RMII1 RMII2 ROM16 ROM8 ROMCS1
-ROMCS2 ROMCS3 ROMCS4 RXD1 RXD2 RXD3 RXD4 SALT1 SALT2 SALT3 SALT4 SD1 SD2 SGPMCK
-SGPMI SGPMLD SGPMO SGPSCK SGPSI0 SGPSI1 SGPSLD SIOONCTRL SIOPBI SIOPBO SIOPWREQ
-SIOPWRGD SIOS3 SIOS5 SIOSCI SPI1 SPI1DEBUG SPI1PASSTHRU SPICS1 TIMER3 TIMER4
-TIMER5 TIMER6 TIMER7 TIMER8 TXD1 TXD2 TXD3 TXD4 UART6 USB11D1 USB11H2 USB2D1
-USB2H1 USBCKI VGABIOS_ROM VGAHS VGAVS VPI18 VPI24 VPI30 VPO12 VPO24 WDTRST1
-WDTRST2
-
-aspeed,ast2500-pinctrl, aspeed,g5-pinctrl:
-
-ACPI ADC0 ADC1 ADC10 ADC11 ADC12 ADC13 ADC14 ADC15 ADC2 ADC3 ADC4 ADC5 ADC6
-ADC7 ADC8 ADC9 BMCINT DDCCLK DDCDAT ESPI FWSPICS1 FWSPICS2 GPID0 GPID2 GPID4
-GPID6 GPIE0 GPIE2 GPIE4 GPIE6 I2C10 I2C11 I2C12 I2C13 I2C14 I2C3 I2C4 I2C5 I2C6
-I2C7 I2C8 I2C9 LAD0 LAD1 LAD2 LAD3 LCLK LFRAME LPCHC LPCPD LPCPLUS LPCPME
-LPCRST LPCSMI LSIRQ MAC1LINK MAC2LINK MDIO1 MDIO2 NCTS1 NCTS2 NCTS3 NCTS4 NDCD1
-NDCD2 NDCD3 NDCD4 NDSR1 NDSR2 NDSR3 NDSR4 NDTR1 NDTR2 NDTR3 NDTR4 NRI1 NRI2
-NRI3 NRI4 NRTS1 NRTS2 NRTS3 NRTS4 OSCCLK PEWAKE PNOR PWM0 PWM1 PWM2 PWM3 PWM4
-PWM5 PWM6 PWM7 RGMII1 RGMII2 RMII1 RMII2 RXD1 RXD2 RXD3 RXD4 SALT1 SALT10
-SALT11 SALT12 SALT13 SALT14 SALT2 SALT3 SALT4 SALT5 SALT6 SALT7 SALT8 SALT9
-SCL1 SCL2 SD1 SD2 SDA1 SDA2 SGPS1 SGPS2 SIOONCTRL SIOPBI SIOPBO SIOPWREQ
-SIOPWRGD SIOS3 SIOS5 SIOSCI SPI1 SPI1CS1 SPI1DEBUG SPI1PASSTHRU SPI2CK SPI2CS0
-SPI2CS1 SPI2MISO SPI2MOSI TIMER3 TIMER4 TIMER5 TIMER6 TIMER7 TIMER8 TXD1 TXD2
-TXD3 TXD4 UART6 USB11BHID USB2AD USB2AH USB2BD USB2BH USBCKI VGABIOSROM VGAHS
-VGAVS VPI24 VPO WDTRST1 WDTRST2
-
-Examples
-========
-
-g4 Example
-----------
-
-syscon: scu@1e6e2000 {
-	compatible = "aspeed,ast2400-scu", "syscon", "simple-mfd";
-	reg = <0x1e6e2000 0x1a8>;
-
-	pinctrl: pinctrl {
-		compatible = "aspeed,g4-pinctrl";
-
-		pinctrl_i2c3_default: i2c3_default {
-			function = "I2C3";
-			groups = "I2C3";
-		};
-
-		pinctrl_gpioh0_unbiased_default: gpioh0 {
-			pins = "A8";
-			bias-disable;
-		};
-	};
-};
-
-g5 Example
-----------
-
-ahb {
-	apb {
-		syscon: scu@1e6e2000 {
-			compatible = "aspeed,ast2500-scu", "syscon", "simple-mfd";
-			reg = <0x1e6e2000 0x1a8>;
-
-			pinctrl: pinctrl {
-				compatible = "aspeed,g5-pinctrl";
-				aspeed,external-nodes = <&gfx &lhc>;
-
-				pinctrl_i2c3_default: i2c3_default {
-					function = "I2C3";
-					groups = "I2C3";
-				};
-
-				pinctrl_gpioh0_unbiased_default: gpioh0 {
-					pins = "A18";
-					bias-disable;
-				};
-			};
-		};
-
-		gfx: display@1e6e6000 {
-			compatible = "aspeed,ast2500-gfx", "syscon";
-			reg = <0x1e6e6000 0x1000>;
-		};
-	};
-
-	lpc: lpc@1e789000 {
-		compatible = "aspeed,ast2500-lpc", "simple-mfd";
-		reg = <0x1e789000 0x1000>;
-
-		#address-cells = <1>;
-		#size-cells = <1>;
-		ranges = <0x0 0x1e789000 0x1000>;
-
-		lpc_host: lpc-host@80 {
-			compatible = "aspeed,ast2500-lpc-host", "simple-mfd", "syscon";
-			reg = <0x80 0x1e0>;
-			reg-io-width = <4>;
-
-			#address-cells = <1>;
-			#size-cells = <1>;
-			ranges = <0x0 0x80 0x1e0>;
-
-			lhc: lhc@20 {
-			       compatible = "aspeed,ast2500-lhc";
-			       reg = <0x20 0x24 0x48 0x8>;
-			};
-		};
-	};
-};
diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
index cef2b5855d60..fcd37e93ed4d 100644
--- a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
+++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
@@ -258,6 +258,7 @@ drive-push-pull		- drive actively high and low
 drive-open-drain	- drive with open drain
 drive-open-source	- drive with open source
 drive-strength		- sink or source at most X mA
+drive-strength-microamp	- sink or source at most X uA
 input-enable		- enable input on pin (no effect on output, such as
 			  enabling an input buffer)
 input-disable		- disable input on pin (no effect on output, such as
@@ -326,6 +327,8 @@ arguments are described below.
 
 - drive-strength takes as argument the target strength in mA.
 
+- drive-strength-microamp takes as argument the target strength in uA.
+
 - input-debounce takes the debounce time in usec as argument
   or 0 to disable debouncing
 
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt
index 68e93d5b7ede..c9782397ff14 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt
@@ -122,17 +122,17 @@ to specify in a pin configuration subnode:
 - bias-disable:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as no pull.
+	Definition: The specified pins should be configured as no pull.
 
 - bias-pull-down:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull down.
+	Definition: The specified pins should be configured as pull down.
 
 - bias-pull-up:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull up.
+	Definition: The specified pins should be configured as pull up.
 
 - output-high:
 	Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt
index 6dd72f8599e9..7b151894f5a0 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt
@@ -118,17 +118,17 @@ to specify in a pin configuration subnode:
 - bias-disable:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as no pull.
+	Definition: The specified pins should be configured as no pull.
 
 - bias-pull-down:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull down.
+	Definition: The specified pins should be configured as pull down.
 
 - bias-pull-up:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull up.
+	Definition: The specified pins should be configured as pull up.
 
 - output-high:
 	Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt
index 86ecdcfc4fb8..d46973968873 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt
@@ -97,17 +97,17 @@ to specify in a pin configuration subnode:
 - bias-disable:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as no pull.
+	Definition: The specified pins should be configured as no pull.
 
 - bias-pull-down:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull down.
+	Definition: The specified pins should be configured as pull down.
 
 - bias-pull-up:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull up.
+	Definition: The specified pins should be configured as pull up.
 
 - output-high:
 	Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt
index 195a7a0ef0cc..3354a63296d9 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt
@@ -130,17 +130,17 @@ to specify in a pin configuration subnode:
 - bias-disable:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as no pull.
+	Definition: The specified pins should be configured as no pull.
 
 - bias-pull-down:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull down.
+	Definition: The specified pins should be configured as pull down.
 
 - bias-pull-up:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull up.
+	Definition: The specified pins should be configured as pull up.
 
 - output-high:
 	Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt
index 5034eb6653c7..a7dd213c77c6 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt
@@ -124,17 +124,17 @@ to specify in a pin configuration subnode:
 - bias-disable:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as no pull.
+	Definition: The specified pins should be configured as no pull.
 
 - bias-pull-down:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull down.
+	Definition: The specified pins should be configured as pull down.
 
 - bias-pull-up:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull up.
+	Definition: The specified pins should be configured as pull up.
 
 - output-high:
 	Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
index f15443f6e78e..da52df6273bc 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
@@ -128,17 +128,17 @@ to specify in a pin configuration subnode:
 - bias-disable:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as no pull.
+	Definition: The specified pins should be configured as no pull.
 
 - bias-pull-down:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull down.
+	Definition: The specified pins should be configured as pull down.
 
 - bias-pull-up:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull up.
+	Definition: The specified pins should be configured as pull up.
 
 - output-high:
 	Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt
index fa97f609fe45..a56cb882830c 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt
@@ -149,17 +149,17 @@ to specify in a pin configuration subnode:
 - bias-disable:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as no pull.
+	Definition: The specified pins should be configured as no pull.
 
 - bias-pull-down:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull down.
+	Definition: The specified pins should be configured as pull down.
 
 - bias-pull-up:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull up.
+	Definition: The specified pins should be configured as pull up.
 
 - output-high:
 	Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt
index e70c79bbbc5b..cdec1eeb2799 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt
@@ -40,6 +40,14 @@ MSM8998 platform.
 	Definition: must be 2. Specifying the pin number and flags, as defined
 		    in <dt-bindings/gpio/gpio.h>
 
+- gpio-ranges:
+	Usage: required
+	Definition:  see ../gpio/gpio.txt
+
+- gpio-reserved-ranges:
+	Usage: optional
+	Definition: see ../gpio/gpio.txt
+
 Please refer to ../gpio/gpio.txt and ../interrupt-controller/interrupts.txt for
 a general description of GPIO and interrupt bindings.
 
@@ -135,17 +143,17 @@ to specify in a pin configuration subnode:
 - bias-disable:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as no pull.
+	Definition: The specified pins should be configured as no pull.
 
 - bias-pull-down:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull down.
+	Definition: The specified pins should be configured as pull down.
 
 - bias-pull-up:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull up.
+	Definition: The specified pins should be configured as pull up.
 
 - output-high:
 	Usage: optional
@@ -175,6 +183,8 @@ Example:
 		interrupts = <0 208 0>;
 		gpio-controller;
 		#gpio-cells = <2>;
+		gpio-ranges = <&tlmm 0 0 175>;
+		gpio-reserved-ranges = <0 4>, <81 4>;
 		interrupt-controller;
 		#interrupt-cells = <2>;
 
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt
index 2b8f77762edc..a50e74684195 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt
@@ -150,17 +150,17 @@ to specify in a pin configuration subnode:
 - bias-disable:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as no pull.
+	Definition: The specified pins should be configured as no pull.
 
 - bias-pull-down:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull down.
+	Definition: The specified pins should be configured as pull down.
 
 - bias-pull-up:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull up.
+	Definition: The specified pins should be configured as pull up.
 
 - output-high:
 	Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
index 769ca83bb40d..be034d329e10 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
@@ -142,17 +142,17 @@ to specify in a pin configuration subnode:
 - bias-disable:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as no pull.
+	Definition: The specified pins should be configured as no pull.
 
 - bias-pull-down:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull down.
+	Definition: The specified pins should be configured as pull down.
 
 - bias-pull-up:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull up.
+	Definition: The specified pins should be configured as pull up.
 
 - output-high:
 	Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt
index 665aadb5ea28..7462e3743c68 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt
@@ -79,7 +79,7 @@ to specify in a pin configuration subnode:
 		      gpio0-gpio149
 		        Supports mux, bias and drive-strength
 
-		      sdc2_clk, sdc2_cmd, sdc2_data
+		      sdc2_clk, sdc2_cmd, sdc2_data, ufs_reset
 		        Supports bias and drive-strength
 
 - function:
@@ -118,17 +118,17 @@ to specify in a pin configuration subnode:
 - bias-disable:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as no pull.
+	Definition: The specified pins should be configured as no pull.
 
 - bias-pull-down:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull down.
+	Definition: The specified pins should be configured as pull down.
 
 - bias-pull-up:
 	Usage: optional
 	Value type: <none>
-	Definition: The specified pins should be configued as pull up.
+	Definition: The specified pins should be configured as pull up.
 
 - output-high:
 	Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8150-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,sm8150-pinctrl.txt
new file mode 100644
index 000000000000..fa37733e5102
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8150-pinctrl.txt
@@ -0,0 +1,190 @@
+Qualcomm SM8150 TLMM block
+
+This binding describes the Top Level Mode Multiplexer block found in the
+QCS404 platform.
+
+- compatible:
+	Usage: required
+	Value type: <string>
+	Definition: must be "qcom,sm8150-pinctrl"
+
+- reg:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: the base address and size of the north, south, west
+		    and east TLMM tiles.
+
+- reg-names:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Defintiion: names for the cells of reg, must contain "north", "south"
+		    "west" and "east".
+
+- interrupts:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: should specify the TLMM summary IRQ.
+
+- interrupt-controller:
+	Usage: required
+	Value type: <none>
+	Definition: identifies this node as an interrupt controller
+
+- #interrupt-cells:
+	Usage: required
+	Value type: <u32>
+	Definition: must be 2. Specifying the pin number and flags, as defined
+		    in <dt-bindings/interrupt-controller/irq.h>
+
+- gpio-controller:
+	Usage: required
+	Value type: <none>
+	Definition: identifies this node as a gpio controller
+
+- #gpio-cells:
+	Usage: required
+	Value type: <u32>
+	Definition: must be 2. Specifying the pin number and flags, as defined
+		    in <dt-bindings/gpio/gpio.h>
+
+- gpio-ranges:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition:  see ../gpio/gpio.txt
+
+- gpio-reserved-ranges:
+	Usage: optional
+	Value type: <prop-encoded-array>
+	Definition: see ../gpio/gpio.txt
+
+Please refer to ../gpio/gpio.txt and ../interrupt-controller/interrupts.txt for
+a general description of GPIO and interrupt bindings.
+
+Please refer to pinctrl-bindings.txt in this directory for details of the
+common pinctrl bindings used by client devices, including the meaning of the
+phrase "pin configuration node".
+
+The pin configuration nodes act as a container for an arbitrary number of
+subnodes. Each of these subnodes represents some desired configuration for a
+pin, a group, or a list of pins or groups. This configuration can include the
+mux function to select on those pin(s)/group(s), and various pin configuration
+parameters, such as pull-up, drive strength, etc.
+
+
+PIN CONFIGURATION NODES:
+
+The name of each subnode is not important; all subnodes should be enumerated
+and processed purely based on their content.
+
+Each subnode only affects those parameters that are explicitly listed. In
+other words, a subnode that lists a mux function but no pin configuration
+parameters implies no information about any pin configuration parameters.
+Similarly, a pin subnode that describes a pullup parameter implies no
+information about e.g. the mux function.
+
+
+The following generic properties as defined in pinctrl-bindings.txt are valid
+to specify in a pin configuration subnode:
+
+- pins:
+	Usage: required
+	Value type: <string-array>
+	Definition: List of gpio pins affected by the properties specified in
+		    this subnode.
+
+		    Valid pins are:
+		      gpio0-gpio149
+		        Supports mux, bias and drive-strength
+
+		      sdc1_clk, sdc1_cmd, sdc1_data sdc2_clk, sdc2_cmd,
+		      sdc2_data sdc1_rclk
+		        Supports bias and drive-strength
+
+		      ufs_reset
+		        Supports bias and drive-strength
+
+- function:
+	Usage: required
+	Value type: <string>
+	Definition: Specify the alternative function to be configured for the
+		    specified pins. Functions are only valid for gpio pins.
+		    Valid values are:
+
+		    adsp_ext, agera_pll, aoss_cti, ddr_pxi2, atest_char,
+		    atest_char0, atest_char1, atest_char2, atest_char3,
+		    audio_ref, atest_usb1, atest_usb2, atest_usb10,
+		    atest_usb11, atest_usb12, atest_usb13, atest_usb20,
+		    atest_usb21, atest_usb22, atest_usb2, atest_usb23,
+		    btfm_slimbus, cam_mclk, cci_async, cci_i2c, cci_timer0,
+		    cci_timer1, cci_timer2, cci_timer3, cci_timer4,
+		    cri_trng, cri_trng0, cri_trng1, dbg_out, ddr_bist,
+		    ddr_pxi0, ddr_pxi1, ddr_pxi3, edp_hot, edp_lcd,
+		    emac_phy, emac_pps, gcc_gp1, gcc_gp2, gcc_gp3, gpio,
+		    hs1_mi2s, hs2_mi2s, hs3_mi2s, jitter_bist,
+		    lpass_slimbus, mdp_vsync, mdp_vsync0, mdp_vsync1,
+		    mdp_vsync2, mdp_vsync3, mss_lte, m_voc, nav_pps,
+		    pa_indicator, pci_e0, phase_flag, pll_bypassnl,
+		    pll_bist, pci_e1, pll_reset, pri_mi2s, pri_mi2s_ws,
+		    prng_rosc, qdss, qdss_cti, qlink_request, qlink_enable,
+		    qspi0, qspi1, qspi2, qspi3, qspi_clk, qspi_cs, qua_mi2s,
+		    qup0, qup1, qup2, qup3, qup4, qup5, qup6, qup7, qup8,
+		    qup9, qup10, qup11, qup12, qup13, qup14, qup15, qup16,
+		    qup17, qup18, qup19, qup_l4, qup_l5, qup_l6, rgmii,
+		    sdc4, sd_write, sec_mi2s, spkr_i2s, sp_cmu, ter_mi2s,
+		    tgu_ch0, tgu_ch1, tgu_ch2, tgu_ch3, tsense_pwm1,
+		    tsense_pwm2, tsif1, tsif2, uim1, uim2, uim_batt,
+		    usb2phy_ac, usb_phy, vfr_1, vsense_trigger, wlan1_adc0,
+		    wlan1_adc1, wlan2_adc0, wlan2_adc1, wmss_reset
+
+- bias-disable:
+	Usage: optional
+	Value type: <none>
+	Definition: The specified pins should be configued as no pull.
+
+- bias-pull-down:
+	Usage: optional
+	Value type: <none>
+	Definition: The specified pins should be configued as pull down.
+
+- bias-pull-up:
+	Usage: optional
+	Value type: <none>
+	Definition: The specified pins should be configued as pull up.
+
+- output-high:
+	Usage: optional
+	Value type: <none>
+	Definition: The specified pins are configured in output mode, driven
+		    high.
+		    Not valid for sdc pins.
+
+- output-low:
+	Usage: optional
+	Value type: <none>
+	Definition: The specified pins are configured in output mode, driven
+		    low.
+		    Not valid for sdc pins.
+
+- drive-strength:
+	Usage: optional
+	Value type: <u32>
+	Definition: Selects the drive strength for the specified pins, in mA.
+		    Valid values are: 2, 4, 6, 8, 10, 12, 14 and 16
+
+Example:
+
+	tlmm: pinctrl@3000000 {
+		compatible = "qcom,sm8150-pinctrl";
+		reg = <0x03100000 0x300000>,
+		      <0x03500000 0x300000>,
+		      <0x03900000 0x300000>,
+		      <0x03D00000 0x300000>;
+		reg-names = "west", "east", "north", "south";
+		interrupts = <GIC_SPI 208 IRQ_TYPE_LEVEL_HIGH>;
+		gpio-controller;
+		#gpio-cells = <2>;
+		gpio-ranges = <&tlmm 0 0 175>;
+		gpio-reserved-ranges = <0 4>, <126 4>;
+		interrupt-controller;
+		#interrupt-cells = <2>;
+	};
diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt
deleted file mode 100644
index 00169255e48c..000000000000
--- a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt
+++ /dev/null
@@ -1,208 +0,0 @@
-* STM32 GPIO and Pin Mux/Config controller
-
-STMicroelectronics's STM32 MCUs intregrate a GPIO and Pin mux/config hardware
-controller. It controls the input/output settings on the available pins and
-also provides ability to multiplex and configure the output of various on-chip
-controllers onto these pads.
-
-Pin controller node:
-Required properies:
- - compatible: value should be one of the following:
-   "st,stm32f429-pinctrl"
-   "st,stm32f469-pinctrl"
-   "st,stm32f746-pinctrl"
-   "st,stm32f769-pinctrl"
-   "st,stm32h743-pinctrl"
-   "st,stm32mp157-pinctrl"
-   "st,stm32mp157-z-pinctrl"
- - #address-cells: The value of this property must be 1
- - #size-cells	: The value of this property must be 1
- - ranges	: defines mapping between pin controller node (parent) to
-   gpio-bank node (children).
- - pins-are-numbered: Specify the subnodes are using numbered pinmux to
-   specify pins.
-
-GPIO controller/bank node:
-Required properties:
- - gpio-controller : Indicates this device is a GPIO controller
- - #gpio-cells	  : Should be two.
-			The first cell is the pin number
-			The second one is the polarity:
-				- 0 for active high
-				- 1 for active low
- - reg		  : The gpio address range, relative to the pinctrl range
- - clocks	  : clock that drives this bank
- - st,bank-name	  : Should be a name string for this bank as specified in
-   the datasheet
-
-Optional properties:
- - reset:	  : Reference to the reset controller
- - st,syscfg: Should be phandle/offset/mask.
-	-The phandle to the syscon node which includes IRQ mux selection register.
-	-The offset of the IRQ mux selection register
-	-The field mask of IRQ mux, needed if different of 0xf.
- - gpio-ranges: Define a dedicated mapping between a pin-controller and
-   a gpio controller. Format is <&phandle a b c> with:
-	-(phandle): phandle of pin-controller.
-	-(a): gpio base offset in range.
-	-(b): pin base offset in range.
-	-(c): gpio count in range
-   This entry has to be used either if there are holes inside a bank:
-	GPIOB0/B1/B2/B14/B15 (see example 2)
-   or if banks are not contiguous:
-	GPIOA/B/C/E...
-   NOTE: If "gpio-ranges" is used for a gpio controller, all gpio-controller
-   have to use a "gpio-ranges" entry.
-   More details in Documentation/devicetree/bindings/gpio/gpio.txt.
- - st,bank-ioport: should correspond to the EXTI IOport selection (EXTI line
-   used to select GPIOs as interrupts).
- - hwlocks: reference to a phandle of a hardware spinlock provider node.
- - st,package: Indicates the SOC package used.
-   More details in include/dt-bindings/pinctrl/stm32-pinfunc.h
-
-Example 1:
-#include <dt-bindings/pinctrl/stm32f429-pinfunc.h>
-...
-
-	pin-controller {
-		#address-cells = <1>;
-		#size-cells = <1>;
-		compatible = "st,stm32f429-pinctrl";
-		ranges = <0 0x40020000 0x3000>;
-		pins-are-numbered;
-
-		gpioa: gpio@40020000 {
-			gpio-controller;
-			#gpio-cells = <2>;
-			reg = <0x0 0x400>;
-			resets = <&reset_ahb1 0>;
-			st,bank-name = "GPIOA";
-		};
-		...
-		pin-functions nodes follow...
-	};
-
-Example 2:
-#include <dt-bindings/pinctrl/stm32f429-pinfunc.h>
-...
-
-	pinctrl: pin-controller {
-		#address-cells = <1>;
-		#size-cells = <1>;
-		compatible = "st,stm32f429-pinctrl";
-		ranges = <0 0x40020000 0x3000>;
-		pins-are-numbered;
-
-		gpioa: gpio@40020000 {
-			gpio-controller;
-			#gpio-cells = <2>;
-			reg = <0x0 0x400>;
-			resets = <&reset_ahb1 0>;
-			st,bank-name = "GPIOA";
-			gpio-ranges = <&pinctrl 0 0 16>;
-		};
-
-		gpiob: gpio@40020400 {
-			gpio-controller;
-			#gpio-cells = <2>;
-			reg = <0x0 0x400>;
-			resets = <&reset_ahb1 0>;
-			st,bank-name = "GPIOB";
-			ngpios = 4;
-			gpio-ranges = <&pinctrl 0 16 3>,
-				      <&pinctrl 14 30 2>;
-		};
-
-
-		...
-		pin-functions nodes follow...
-	};
-
-
-Contents of function subnode node:
-----------------------------------
-Subnode format
-A pinctrl node should contain at least one subnode representing the
-pinctrl group available on the machine. Each subnode will list the
-pins it needs, and how they should be configured, with regard to muxer
-configuration, pullups, drive, output high/low and output speed.
-
-    node {
-	pinmux = <PIN_NUMBER_PINMUX>;
-	GENERIC_PINCONFIG;
-    };
-
-Required properties:
-- pinmux: integer array, represents gpio pin number and mux setting.
-  Supported pin number and mux varies for different SoCs, and are defined in
-  dt-bindings/pinctrl/<soc>-pinfunc.h directly.
-  These defines are calculated as:
-    ((port * 16 + line) << 8) | function
-  With:
-    - port: The gpio port index (PA = 0, PB = 1, ..., PK = 11)
-    - line: The line offset within the port (PA0 = 0, PA1 = 1, ..., PA15 = 15)
-    - function: The function number, can be:
-      * 0 : GPIO
-      * 1 : Alternate Function 0
-      * 2 : Alternate Function 1
-      * 3 : Alternate Function 2
-      * ...
-      * 16 : Alternate Function 15
-      * 17 : Analog
-
-  To simplify the usage, macro is available to generate "pinmux" field.
-  This macro is available here:
-    - include/dt-bindings/pinctrl/stm32-pinfunc.h
-
-  Some examples of using macro:
-    /* GPIO A9 set as alernate function 2 */
-    ... {
-		pinmux = <STM32_PINMUX('A', 9, AF2)>;
-    };
-    /* GPIO A9 set as GPIO  */
-    ... {
-		pinmux = <STM32_PINMUX('A', 9, GPIO)>;
-    };
-    /* GPIO A9 set as analog */
-    ... {
-		pinmux = <STM32_PINMUX('A', 9, ANALOG)>;
-    };
-
-Optional properties:
-- GENERIC_PINCONFIG: is the generic pinconfig options to use.
-  Available options are:
-   - bias-disable,
-   - bias-pull-down,
-   - bias-pull-up,
-   - drive-push-pull,
-   - drive-open-drain,
-   - output-low
-   - output-high
-   - slew-rate = <x>, with x being:
-       < 0 > : Low speed
-       < 1 > : Medium speed
-       < 2 > : Fast speed
-       < 3 > : High speed
-
-Example:
-
-pin-controller {
-...
-	usart1_pins_a: usart1@0 {
-		pins1 {
-			pinmux = <STM32_PINMUX('A', 9, AF7)>;
-			bias-disable;
-			drive-push-pull;
-			slew-rate = <0>;
-		};
-		pins2 {
-			pinmux = <STM32_PINMUX('A', 10, AF7)>;
-			bias-disable;
-		};
-	};
-};
-
-&usart1 {
-	pinctrl-0 = <&usart1_pins_a>;
-	pinctrl-names = "default";
-};
diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml
new file mode 100644
index 000000000000..91d3e78b3395
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml
@@ -0,0 +1,271 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+# Copyright (C) STMicroelectronics 2019.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pinctrl/st,stm32-pinctrl.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: STM32 GPIO and Pin Mux/Config controller
+
+maintainers:
+  - Alexandre TORGUE <alexandre.torgue@st.com>
+
+description: |
+  STMicroelectronics's STM32 MCUs intregrate a GPIO and Pin mux/config hardware
+  controller. It controls the input/output settings on the available pins and
+  also provides ability to multiplex and configure the output of various
+  on-chip controllers onto these pads.
+
+properties:
+  compatible:
+    enum:
+      - st,stm32f429-pinctrl
+      - st,stm32f469-pinctrl
+      - st,stm32f746-pinctrl
+      - st,stm32f769-pinctrl
+      - st,stm32h743-pinctrl
+      - st,stm32mp157-pinctrl
+      - st,stm32mp157-z-pinctrl
+
+  '#address-cells':
+    const: 1
+  '#size-cells':
+    const: 1
+
+  ranges: true
+  pins-are-numbered: true
+  hwlocks: true
+
+  st,syscfg:
+    $ref: "/schemas/types.yaml#/definitions/phandle-array"
+    description: Should be phandle/offset/mask
+    items:
+      - description: Phandle to the syscon node which includes IRQ mux selection.
+      - description: The offset of the IRQ mux selection register.
+      - description: The field mask of IRQ mux, needed if different of 0xf.
+
+  st,package:
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32
+      - enum: [1, 2, 4, 8]
+    description:
+     Indicates the SOC package used.
+     More details in include/dt-bindings/pinctrl/stm32-pinfunc.h
+
+
+patternProperties:
+  '^gpio@[0-9a-f]*$':
+    type: object
+    properties:
+      gpio-controller: true
+      '#gpio-cells':
+        const: 2
+
+      reg:
+        maxItems: 1
+      clocks:
+        maxItems: 1
+      reset:
+        minItems: 1
+        maxItems: 1
+      gpio-ranges:
+        minItems: 1
+        maxItems: 16
+      ngpios:
+        description:
+          Number of available gpios in a bank.
+        minimum: 1
+        maximum: 16
+
+      st,bank-name:
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/string"
+          - enum:
+            - GPIOA
+            - GPIOB
+            - GPIOC
+            - GPIOD
+            - GPIOE
+            - GPIOF
+            - GPIOG
+            - GPIOH
+            - GPIOI
+            - GPIOJ
+            - GPIOK
+            - GPIOZ
+        description:
+          Should be a name string for this bank as specified in the datasheet.
+
+      st,bank-ioport:
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - minimum: 0
+          - maximum: 11
+
+        description:
+          Should correspond to the EXTI IOport selection (EXTI line used
+          to select GPIOs as interrupts).
+
+    required:
+      - gpio-controller
+      - '#gpio-cells'
+      - reg
+      - clocks
+      - st,bank-name
+
+  '-[0-9]*$':
+    type: object
+    patternProperties:
+      '^pins':
+        type: object
+        description: |
+          A pinctrl node should contain at least one subnode representing the
+          pinctrl group available on the machine. Each subnode will list the
+          pins it needs, and how they should be configured, with regard to muxer
+          configuration, pullups, drive, output high/low and output speed.
+        properties:
+          pinmux:
+            allOf:
+              - $ref: "/schemas/types.yaml#/definitions/uint32-array"
+            description: |
+              Integer array, represents gpio pin number and mux setting.
+              Supported pin number and mux varies for different SoCs, and are
+              defined in dt-bindings/pinctrl/<soc>-pinfunc.h directly.
+              These defines are calculated as: ((port * 16 + line) << 8) | function
+              With:
+              - port: The gpio port index (PA = 0, PB = 1, ..., PK = 11)
+              - line: The line offset within the port (PA0 = 0, PA1 = 1, ..., PA15 = 15)
+              - function: The function number, can be:
+              * 0 : GPIO
+              * 1 : Alternate Function 0
+              * 2 : Alternate Function 1
+              * 3 : Alternate Function 2
+              * ...
+              * 16 : Alternate Function 15
+              * 17 : Analog
+              To simplify the usage, macro is available to generate "pinmux" field.
+              This macro is available here:
+                - include/dt-bindings/pinctrl/stm32-pinfunc.h
+              Some examples of using macro:
+               /* GPIO A9 set as alernate function 2 */
+               ... {
+                          pinmux = <STM32_PINMUX('A', 9, AF2)>;
+               };
+               /* GPIO A9 set as GPIO  */
+               ... {
+                          pinmux = <STM32_PINMUX('A', 9, GPIO)>;
+               };
+               /* GPIO A9 set as analog */
+               ... {
+                          pinmux = <STM32_PINMUX('A', 9, ANALOG)>;
+               };
+
+          bias-disable:
+            type: boolean
+          bias-pull-down:
+            type: boolean
+          bias-pull-up:
+            type: boolean
+          drive-push-pull:
+            type: boolean
+          drive-open-drain:
+            type: boolean
+          output-low:
+            type: boolean
+          output-high:
+            type: boolean
+          slew-rate:
+            description: |
+              0: Low speed
+              1: Medium speed
+              2: Fast speed
+              3: High speed
+            allOf:
+              - $ref: /schemas/types.yaml#/definitions/uint32
+              - enum: [0, 1, 2, 3]
+
+        required:
+          - pinmux
+
+required:
+  - compatible
+  - '#address-cells'
+  - '#size-cells'
+  - ranges
+  - pins-are-numbered
+
+examples:
+  - |
+    #include <dt-bindings/pinctrl/stm32-pinfunc.h>
+    #include <dt-bindings/mfd/stm32f4-rcc.h>
+    //Example 1
+      pinctrl@40020000 {
+              #address-cells = <1>;
+              #size-cells = <1>;
+              compatible = "st,stm32f429-pinctrl";
+              ranges = <0 0x40020000 0x3000>;
+              pins-are-numbered;
+
+              gpioa: gpio@0 {
+                      gpio-controller;
+                      #gpio-cells = <2>;
+                      reg = <0x0 0x400>;
+                      resets = <&reset_ahb1 0>;
+                      clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOA)>;
+                      st,bank-name = "GPIOA";
+              };
+       };
+
+    //Example 2 (using gpio-ranges)
+      pinctrl@50020000 {
+              #address-cells = <1>;
+              #size-cells = <1>;
+              compatible = "st,stm32f429-pinctrl";
+              ranges = <0 0x50020000 0x3000>;
+              pins-are-numbered;
+
+              gpiob: gpio@1000 {
+                      gpio-controller;
+                      #gpio-cells = <2>;
+                      reg = <0x1000 0x400>;
+                      resets = <&reset_ahb1 0>;
+                      clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOB)>;
+                      st,bank-name = "GPIOB";
+                      gpio-ranges = <&pinctrl 0 0 16>;
+              };
+
+              gpioc: gpio@2000 {
+                      gpio-controller;
+                      #gpio-cells = <2>;
+                      reg = <0x2000 0x400>;
+                      resets = <&reset_ahb1 0>;
+                      clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOC)>;
+                      st,bank-name = "GPIOC";
+                      ngpios = <5>;
+                      gpio-ranges = <&pinctrl 0 16 3>,
+                                    <&pinctrl 14 30 2>;
+              };
+      };
+
+    //Example 3 pin groups
+      pinctrl@60020000 {
+        usart1_pins_a: usart1-0 {
+                pins1 {
+                        pinmux = <STM32_PINMUX('A', 9, AF7)>;
+                        bias-disable;
+                        drive-push-pull;
+                        slew-rate = <0>;
+                };
+                pins2 {
+                        pinmux = <STM32_PINMUX('A', 10, AF7)>;
+                        bias-disable;
+                };
+        };
+    };
+
+    usart1 {
+                pinctrl-0 = <&usart1_pins_a>;
+                pinctrl-names = "default";
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/power/qcom,rpmpd.txt b/Documentation/devicetree/bindings/power/qcom,rpmpd.txt
index 980e5413d18f..eb35b22f9e23 100644
--- a/Documentation/devicetree/bindings/power/qcom,rpmpd.txt
+++ b/Documentation/devicetree/bindings/power/qcom,rpmpd.txt
@@ -6,6 +6,8 @@ which then translates it into a corresponding voltage on a rail
 Required Properties:
  - compatible: Should be one of the following
 	* qcom,msm8996-rpmpd: RPM Power domain for the msm8996 family of SoC
+	* qcom,msm8998-rpmpd: RPM Power domain for the msm8998 family of SoC
+	* qcom,qcs404-rpmpd: RPM Power domain for the qcs404 family of SoC
 	* qcom,sdm845-rpmhpd: RPMh Power domain for the sdm845 family of SoC
  - #power-domain-cells: number of cells in Power domain specifier
 	must be 1.
diff --git a/Documentation/devicetree/bindings/power/reset/nvmem-reboot-mode.txt b/Documentation/devicetree/bindings/power/reset/nvmem-reboot-mode.txt
new file mode 100644
index 000000000000..752d6126d5da
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/reset/nvmem-reboot-mode.txt
@@ -0,0 +1,26 @@
+NVMEM reboot mode driver
+
+This driver gets reboot mode magic value from reboot-mode driver
+and stores it in a NVMEM cell named "reboot-mode". Then the bootloader
+can read it and take different action according to the magic
+value stored.
+
+Required properties:
+- compatible: should be "nvmem-reboot-mode".
+- nvmem-cells: A phandle to the reboot mode provided by a nvmem device.
+- nvmem-cell-names: Should be "reboot-mode".
+
+The rest of the properties should follow the generic reboot-mode description
+found in reboot-mode.txt
+
+Example:
+	reboot-mode {
+		compatible = "nvmem-reboot-mode";
+		nvmem-cells = <&reboot_mode>;
+		nvmem-cell-names = "reboot-mode";
+
+		mode-normal     = <0xAAAA5501>;
+		mode-bootloader = <0xBBBB5500>;
+		mode-recovery   = <0xCCCC5502>;
+		mode-test       = <0xDDDD5503>;
+	};
diff --git a/Documentation/devicetree/bindings/power/reset/qcom,pon.txt b/Documentation/devicetree/bindings/power/reset/qcom,pon.txt
index 5705f575862d..0c0dc3a1e693 100644
--- a/Documentation/devicetree/bindings/power/reset/qcom,pon.txt
+++ b/Documentation/devicetree/bindings/power/reset/qcom,pon.txt
@@ -9,6 +9,7 @@ Required Properties:
 -compatible: Must be one of:
 	"qcom,pm8916-pon"
 	"qcom,pms405-pon"
+	"qcom,pm8998-pon"
 
 -reg: Specifies the physical address of the pon register
 
diff --git a/Documentation/devicetree/bindings/property-units.txt b/Documentation/devicetree/bindings/property-units.txt
index bfd33734faca..e9b8360b3288 100644
--- a/Documentation/devicetree/bindings/property-units.txt
+++ b/Documentation/devicetree/bindings/property-units.txt
@@ -12,32 +12,32 @@ unit prefixes.
 Time/Frequency
 ----------------------------------------
 -mhz		: megahertz
--hz		: Hertz (preferred)
--sec		: seconds
--ms		: milliseconds
--us		: microseconds
--ns		: nanoseconds
+-hz		: hertz (preferred)
+-sec		: second
+-ms		: millisecond
+-us		: microsecond
+-ns		: nanosecond
 
 Distance
 ----------------------------------------
--mm		: millimeters
+-mm		: millimeter
 
 Electricity
 ----------------------------------------
--microamp	: micro amps
--microamp-hours : micro amp-hours
--ohms		: Ohms
--micro-ohms	: micro Ohms
--microwatt-hours: micro Watt-hours
--microvolt	: micro volts
--picofarads	: picofarads
--femtofarads	: femtofarads
+-microamp	: microampere
+-microamp-hours : microampere hour
+-ohms		: ohm
+-micro-ohms	: microohm
+-microwatt-hours: microwatt hour
+-microvolt	: microvolt
+-picofarads	: picofarad
+-femtofarads	: femtofarad
 
 Temperature
 ----------------------------------------
--celsius	: Degrees Celsius
--millicelsius	: Degreee milli-Celsius
+-celsius	: degree Celsius
+-millicelsius	: millidegree Celsius
 
 Pressure
 ----------------------------------------
--kpascal	: kiloPascal
+-kpascal	: kilopascal
diff --git a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
index 454c937076a2..d48f9eb3636e 100644
--- a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
+++ b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
@@ -4,6 +4,8 @@ General Properties:
 
   - compatible   Should be "fsl,etsec-ptp" for eTSEC
                  Should be "fsl,fman-ptp-timer" for DPAA FMan
+                 Should be "fsl,dpaa2-ptp" for DPAA2
+                 Should be "fsl,enetc-ptp" for ENETC
   - reg          Offset and length of the register set for the device
   - interrupts   There should be at least two interrupts. Some devices
                  have as many as four PTP related interrupts.
diff --git a/Documentation/devicetree/bindings/pwm/allwinner,sun4i-a10-pwm.yaml b/Documentation/devicetree/bindings/pwm/allwinner,sun4i-a10-pwm.yaml
new file mode 100644
index 000000000000..0ac52f83a58c
--- /dev/null
+++ b/Documentation/devicetree/bindings/pwm/allwinner,sun4i-a10-pwm.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pwm/allwinner,sun4i-a10-pwm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 PWM Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  "#pwm-cells":
+    const: 3
+
+  compatible:
+    oneOf:
+      - const: allwinner,sun4i-a10-pwm
+      - const: allwinner,sun5i-a10s-pwm
+      - const: allwinner,sun5i-a13-pwm
+      - const: allwinner,sun7i-a20-pwm
+      - const: allwinner,sun8i-h3-pwm
+      - items:
+          - const: allwinner,sun8i-a83t-pwm
+          - const: allwinner,sun8i-h3-pwm
+      - items:
+          - const: allwinner,sun50i-a64-pwm
+          - const: allwinner,sun5i-a13-pwm
+      - items:
+          - const: allwinner,sun50i-h5-pwm
+          - const: allwinner,sun5i-a13-pwm
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+required:
+  - "#pwm-cells"
+  - compatible
+  - reg
+  - clocks
+
+additionalProperties: false
+
+examples:
+  - |
+    pwm: pwm@1c20e00 {
+        compatible = "allwinner,sun7i-a20-pwm";
+        reg = <0x01c20e00 0xc>;
+        clocks = <&osc24M>;
+        #pwm-cells = <3>;
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/pwm/ingenic,jz47xx-pwm.txt b/Documentation/devicetree/bindings/pwm/ingenic,jz47xx-pwm.txt
index 7d9d3f90641b..493bec80d59b 100644
--- a/Documentation/devicetree/bindings/pwm/ingenic,jz47xx-pwm.txt
+++ b/Documentation/devicetree/bindings/pwm/ingenic,jz47xx-pwm.txt
@@ -2,10 +2,7 @@ Ingenic JZ47xx PWM Controller
 =============================
 
 Required properties:
-- compatible: One of:
-  * "ingenic,jz4740-pwm"
-  * "ingenic,jz4770-pwm"
-  * "ingenic,jz4780-pwm"
+- compatible: Should be "ingenic,jz4740-pwm"
 - #pwm-cells: Should be 3. See pwm.txt in this directory for a description
   of the cells format.
 - clocks : phandle to the external clock.
diff --git a/Documentation/devicetree/bindings/pwm/pwm-sifive.txt b/Documentation/devicetree/bindings/pwm/pwm-sifive.txt
new file mode 100644
index 000000000000..36447e3c9378
--- /dev/null
+++ b/Documentation/devicetree/bindings/pwm/pwm-sifive.txt
@@ -0,0 +1,33 @@
+SiFive PWM controller
+
+Unlike most other PWM controllers, the SiFive PWM controller currently only
+supports one period for all channels in the PWM. All PWMs need to run at
+the same period. The period also has significant restrictions on the values
+it can achieve, which the driver rounds to the nearest achievable period.
+PWM RTL that corresponds to the IP block version numbers can be found
+here:
+
+https://github.com/sifive/sifive-blocks/tree/master/src/main/scala/devices/pwm
+
+Required properties:
+- compatible: Should be "sifive,<chip>-pwm" and "sifive,pwm<version>".
+  Supported compatible strings are: "sifive,fu540-c000-pwm" for the SiFive
+  PWM v0 as integrated onto the SiFive FU540 chip, and "sifive,pwm0" for the
+  SiFive PWM v0 IP block with no chip integration tweaks.
+  Please refer to sifive-blocks-ip-versioning.txt for details.
+- reg: physical base address and length of the controller's registers
+- clocks: Should contain a clock identifier for the PWM's parent clock.
+- #pwm-cells: Should be 3. See pwm.txt in this directory
+  for a description of the cell format.
+- interrupts: one interrupt per PWM channel
+
+Examples:
+
+pwm:  pwm@10020000 {
+	compatible = "sifive,fu540-c000-pwm", "sifive,pwm0";
+	reg = <0x0 0x10020000 0x0 0x1000>;
+	clocks = <&tlclk>;
+	interrupt-parent = <&plic>;
+	interrupts = <42 43 44 45>;
+	#pwm-cells = <3>;
+};
diff --git a/Documentation/devicetree/bindings/pwm/pwm-stm32-lp.txt b/Documentation/devicetree/bindings/pwm/pwm-stm32-lp.txt
index bd23302e84be..6521bc44a74e 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-stm32-lp.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-stm32-lp.txt
@@ -11,8 +11,10 @@ Required parameters:
 			bindings defined in pwm.txt.
 
 Optional properties:
-- pinctrl-names: 	Set to "default".
-- pinctrl-0: 		Phandle pointing to pin configuration node for PWM.
+- pinctrl-names: 	Set to "default". An additional "sleep" state can be
+			defined to set pins in sleep state when in low power.
+- pinctrl-n: 		Phandle(s) pointing to pin configuration node for PWM,
+			respectively for "default" and "sleep" states.
 
 Example:
 	timer@40002400 {
@@ -21,7 +23,8 @@ Example:
 		pwm {
 			compatible = "st,stm32-pwm-lp";
 			#pwm-cells = <3>;
-			pinctrl-names = "default";
+			pinctrl-names = "default", "sleep";
 			pinctrl-0 = <&lppwm1_pins>;
+			pinctrl-1 = <&lppwm1_sleep_pins>;
 		};
 	};
diff --git a/Documentation/devicetree/bindings/pwm/pwm-stm32.txt b/Documentation/devicetree/bindings/pwm/pwm-stm32.txt
index 3e6d55018d7a..a8690bfa5e1f 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-stm32.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-stm32.txt
@@ -8,6 +8,8 @@ Required parameters:
 - pinctrl-names: 	Set to "default".
 - pinctrl-0: 		List of phandles pointing to pin configuration nodes for PWM module.
 			For Pinctrl properties see ../pinctrl/pinctrl-bindings.txt
+- #pwm-cells:		Should be set to 3. This PWM chip uses the default 3 cells
+			bindings defined in pwm.txt.
 
 Optional parameters:
 - st,breakinput:	One or two <index level filter> to describe break input configurations.
@@ -28,6 +30,7 @@ Example:
 
 		pwm {
 			compatible = "st,stm32-pwm";
+			#pwm-cells = <3>;
 			pinctrl-0	= <&pwm1_pins>;
 			pinctrl-names	= "default";
 			st,breakinput = <0 1 5>;
diff --git a/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt b/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt
deleted file mode 100644
index 2a1affbff45e..000000000000
--- a/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-Allwinner sun4i and sun7i SoC PWM controller
-
-Required properties:
-  - compatible: should be one of:
-    - "allwinner,sun4i-a10-pwm"
-    - "allwinner,sun5i-a10s-pwm"
-    - "allwinner,sun5i-a13-pwm"
-    - "allwinner,sun7i-a20-pwm"
-    - "allwinner,sun8i-h3-pwm"
-    - "allwinner,sun50i-a64-pwm", "allwinner,sun5i-a13-pwm"
-    - "allwinner,sun50i-h5-pwm", "allwinner,sun5i-a13-pwm"
-  - reg: physical base address and length of the controller's registers
-  - #pwm-cells: should be 3. See pwm.txt in this directory for a description of
-    the cells format.
-  - clocks: From common clock binding, handle to the parent clock.
-
-Example:
-
-	pwm: pwm@1c20e00 {
-		compatible = "allwinner,sun7i-a20-pwm";
-		reg = <0x01c20e00 0xc>;
-		clocks = <&osc24M>;
-		#pwm-cells = <3>;
-	};
diff --git a/Documentation/devicetree/bindings/regulator/arizona-regulator.txt b/Documentation/devicetree/bindings/regulator/arizona-regulator.txt
index 443564d7784f..69bf41949b01 100644
--- a/Documentation/devicetree/bindings/regulator/arizona-regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/arizona-regulator.txt
@@ -5,7 +5,8 @@ of analogue I/O.
 
 This document lists regulator specific bindings, see the primary binding
 document:
-  ../mfd/arizona.txt
+  For Wolfson Microelectronic Arizona codecs: ../mfd/arizona.txt
+  For Cirrus Logic Madera codecs: ../mfd/madera.txt
 
 Optional properties:
   - wlf,ldoena : GPIO specifier for the GPIO controlling LDOENA
diff --git a/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml b/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml
index d289c2f7455a..a650b457085d 100644
--- a/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml
+++ b/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml
@@ -12,10 +12,13 @@ maintainers:
 
 description:
   Any property defined as part of the core regulator binding, defined in
-  regulator.txt, can also be used. However a fixed voltage regulator is
+  regulator.yaml, can also be used. However a fixed voltage regulator is
   expected to have the regulator-min-microvolt and regulator-max-microvolt
   to be the same.
 
+allOf:
+  - $ref: "regulator.yaml#"
+
 properties:
   compatible:
     const: regulator-fixed
diff --git a/Documentation/devicetree/bindings/regulator/gpio-regulator.txt b/Documentation/devicetree/bindings/regulator/gpio-regulator.txt
deleted file mode 100644
index dd25e73b5d79..000000000000
--- a/Documentation/devicetree/bindings/regulator/gpio-regulator.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-GPIO controlled regulators
-
-Required properties:
-- compatible		: Must be "regulator-gpio".
-- regulator-name	: Defined in regulator.txt as optional, but required
-			  here.
-- gpios			: Array of one or more GPIO pins used to select the
-			  regulator voltage/current listed in "states".
-- states		: Selection of available voltages/currents provided by
-			  this regulator and matching GPIO configurations to
-			  achieve them. If there are no states in the "states"
-			  array, use a fixed regulator instead.
-
-Optional properties:
-- enable-gpios		: GPIO used to enable/disable the regulator.
-			  Warning, the GPIO phandle flags are ignored and the
-			  GPIO polarity is controlled solely by the presence
-			  of "enable-active-high" DT property. This is due to
-			  compatibility with old DTs.
-- enable-active-high	: Polarity of "enable-gpio" GPIO is active HIGH.
-			  Default is active LOW.
-- gpios-states		: On operating systems, that don't support reading back
-			  gpio values in output mode (most notably linux), this
-			  array provides the state of GPIO pins set when
-			  requesting them from the gpio controller. Systems,
-			  that are capable of preserving state when requesting
-			  the lines, are free to ignore this property.
-			  0: LOW, 1: HIGH. Default is LOW if nothing else
-			  is specified.
-- startup-delay-us	: Startup time in microseconds.
-- regulator-type	: Specifies what is being regulated, must be either
-			  "voltage" or "current", defaults to voltage.
-
-Any property defined as part of the core regulator binding defined in
-regulator.txt can also be used.
-
-Example:
-
-	mmciv: gpio-regulator {
-		compatible = "regulator-gpio";
-
-		regulator-name = "mmci-gpio-supply";
-		regulator-min-microvolt = <1800000>;
-		regulator-max-microvolt = <2600000>;
-		regulator-boot-on;
-
-		enable-gpios = <&gpio0 23 0x4>;
-		gpios = <&gpio0 24 0x4
-			 &gpio0 25 0x4>;
-		states = <1800000 0x3
-			  2200000 0x2
-			  2600000 0x1
-			  2900000 0x0>;
-
-		startup-delay-us = <100000>;
-		enable-active-high;
-	};
diff --git a/Documentation/devicetree/bindings/regulator/gpio-regulator.yaml b/Documentation/devicetree/bindings/regulator/gpio-regulator.yaml
new file mode 100644
index 000000000000..9d3b28417fb6
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/gpio-regulator.yaml
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/regulator/gpio-regulator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: GPIO controlled regulators
+
+maintainers:
+  - Liam Girdwood <lgirdwood@gmail.com>
+  - Mark Brown <broonie@kernel.org>
+
+description:
+  Any property defined as part of the core regulator binding, defined in
+  regulator.txt, can also be used.
+
+allOf:
+  - $ref: "regulator.yaml#"
+
+properties:
+  compatible:
+    const: regulator-gpio
+
+  regulator-name: true
+
+  enable-gpios:
+    description: GPIO to use to enable/disable the regulator.
+      Warning, the GPIO phandle flags are ignored and the GPIO polarity is
+      controlled solely by the presence of "enable-active-high" DT property.
+      This is due to compatibility with old DTs.
+    maxItems: 1
+
+  gpios:
+    description: Array of one or more GPIO pins used to select the regulator
+      voltage/current listed in "states".
+    minItems: 1
+    maxItems: 8  # Should be enough...
+
+  gpios-states:
+    description: |
+      On operating systems, that don't support reading back gpio values in
+      output mode (most notably linux), this array provides the state of GPIO
+      pins set when requesting them from the gpio controller. Systems, that are
+      capable of preserving state when requesting the lines, are free to ignore
+      this property.
+        0: LOW
+        1: HIGH
+      Default is LOW if nothing else is specified.
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32-array
+      - maxItems: 8
+        items:
+          enum: [ 0, 1 ]
+          default: 0
+
+  states:
+    description: Selection of available voltages/currents provided by this
+      regulator and matching GPIO configurations to achieve them. If there are
+      no states in the "states" array, use a fixed regulator instead.
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32-matrix
+      - maxItems: 8
+        items:
+          items:
+            - description: Voltage in microvolts
+            - description: GPIO group state value
+
+  startup-delay-us:
+    description: startup time in microseconds
+
+  enable-active-high:
+    description: Polarity of "enable-gpio" GPIO is active HIGH. Default is
+      active LOW.
+    type: boolean
+
+  gpio-open-drain:
+    description:
+      GPIO is open drain type. If this property is missing then default
+      assumption is false.
+    type: boolean
+
+  regulator-type:
+    description: Specifies what is being regulated.
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/string
+      - enum:
+          - voltage
+          - current
+        default: voltage
+
+required:
+  - compatible
+  - regulator-name
+  - gpios
+  - states
+
+examples:
+  - |
+    gpio-regulator {
+      compatible = "regulator-gpio";
+
+      regulator-name = "mmci-gpio-supply";
+      regulator-min-microvolt = <1800000>;
+      regulator-max-microvolt = <2600000>;
+      regulator-boot-on;
+
+      enable-gpios = <&gpio0 23 0x4>;
+      gpios = <&gpio0 24 0x4
+        &gpio0 25 0x4>;
+      states = <1800000 0x3>,
+        <2200000 0x2>,
+        <2600000 0x1>,
+        <2900000 0x0>;
+
+      startup-delay-us = <100000>;
+      enable-active-high;
+    };
+...
diff --git a/Documentation/devicetree/bindings/regulator/max8660.txt b/Documentation/devicetree/bindings/regulator/max8660.txt
deleted file mode 100644
index 8ba994d8a142..000000000000
--- a/Documentation/devicetree/bindings/regulator/max8660.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-Maxim MAX8660 voltage regulator
-
-Required properties:
-- compatible: must be one of "maxim,max8660", "maxim,max8661"
-- reg: I2C slave address, usually 0x34
-- any required generic properties defined in regulator.txt
-
-Example:
-
-	i2c_master {
-		max8660@34 {
-			compatible = "maxim,max8660";
-			reg = <0x34>;
-
-			regulators {
-				regulator@0 {
-					regulator-compatible= "V3(DCDC)";
-					regulator-min-microvolt = <725000>;
-					regulator-max-microvolt = <1800000>;
-				};
-
-				regulator@1 {
-					regulator-compatible= "V4(DCDC)";
-					regulator-min-microvolt = <725000>;
-					regulator-max-microvolt = <1800000>;
-				};
-
-				regulator@2 {
-					regulator-compatible= "V5(LDO)";
-					regulator-min-microvolt = <1700000>;
-					regulator-max-microvolt = <2000000>;
-				};
-
-				regulator@3 {
-					regulator-compatible= "V6(LDO)";
-					regulator-min-microvolt = <1800000>;
-					regulator-max-microvolt = <3300000>;
-				};
-
-				regulator@4 {
-					regulator-compatible= "V7(LDO)";
-					regulator-min-microvolt = <1800000>;
-					regulator-max-microvolt = <3300000>;
-				};
-			};
-		};
-	};
diff --git a/Documentation/devicetree/bindings/regulator/max8660.yaml b/Documentation/devicetree/bindings/regulator/max8660.yaml
new file mode 100644
index 000000000000..9c038698f880
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/max8660.yaml
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/regulator/max8660.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Maxim MAX8660 voltage regulator
+
+maintainers:
+  - Daniel Mack <zonque@gmail.com>
+
+properties:
+  $nodename:
+    pattern: "pmic@[0-9a-f]{1,2}"
+  compatible:
+    enum:
+      - maxim,max8660
+      - maxim,max8661
+
+  reg:
+    maxItems: 1
+
+  regulators:
+    type: object
+
+    patternProperties:
+      "regulator-.+":
+        $ref: "regulator.yaml#"
+
+    additionalProperties: false
+
+additionalProperties: false
+
+examples:
+  - |
+    i2c {
+      #address-cells = <1>;
+      #size-cells = <0>;
+
+      pmic@34 {
+        compatible = "maxim,max8660";
+        reg = <0x34>;
+
+        regulators {
+          regulator-V3 {
+            regulator-compatible= "V3(DCDC)";
+            regulator-min-microvolt = <725000>;
+            regulator-max-microvolt = <1800000>;
+          };
+
+          regulator-V4 {
+            regulator-compatible= "V4(DCDC)";
+            regulator-min-microvolt = <725000>;
+            regulator-max-microvolt = <1800000>;
+          };
+
+          regulator-V5 {
+            regulator-compatible= "V5(LDO)";
+            regulator-min-microvolt = <1700000>;
+            regulator-max-microvolt = <2000000>;
+          };
+
+          regulator-V6 {
+            regulator-compatible= "V6(LDO)";
+            regulator-min-microvolt = <1800000>;
+            regulator-max-microvolt = <3300000>;
+          };
+
+          regulator-V7 {
+            regulator-compatible= "V7(LDO)";
+            regulator-min-microvolt = <1800000>;
+            regulator-max-microvolt = <3300000>;
+          };
+        };
+      };
+    };
+...
diff --git a/Documentation/devicetree/bindings/regulator/pv88060.txt b/Documentation/devicetree/bindings/regulator/pv88060.txt
index 10a6dadc008e..6a7c8a92fdb0 100644
--- a/Documentation/devicetree/bindings/regulator/pv88060.txt
+++ b/Documentation/devicetree/bindings/regulator/pv88060.txt
@@ -121,4 +121,4 @@ Example
 				regulator-max-microvolt = <5000000>;
 			};
 		};
-	};
-\ No newline at end of file
+	};
diff --git a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt
index 7ef2dbe48e8a..14d2eee96b3d 100644
--- a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt
@@ -97,7 +97,7 @@ Second Level Nodes - Regulators
 		    sent for this regulator including those which are for a
 		    strictly lower power state.
 
-Other properties defined in Documentation/devicetree/bindings/regulator.txt
+Other properties defined in Documentation/devicetree/bindings/regulator/regulator.txt
 may also be used.  regulator-initial-mode and regulator-allowed-modes may be
 specified for VRM regulators using mode values from
 include/dt-bindings/regulator/qcom,rpmh-regulator.h.  regulator-allow-bypass
diff --git a/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt b/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt
index 406f2e570c50..430b8622bda1 100644
--- a/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt
@@ -4,11 +4,13 @@ Qualcomm SPMI Regulators
 	Usage: required
 	Value type: <string>
 	Definition: must be one of:
+			"qcom,pm8005-regulators"
 			"qcom,pm8841-regulators"
 			"qcom,pm8916-regulators"
 			"qcom,pm8941-regulators"
 			"qcom,pm8994-regulators"
 			"qcom,pmi8994-regulators"
+			"qcom,pms405-regulators"
 
 - interrupts:
 	Usage: optional
@@ -110,6 +112,23 @@ Qualcomm SPMI Regulators
 	Definition: Reference to regulator supplying the input pin, as
 		    described in the data sheet.
 
+- vdd_l1_l2-supply:
+- vdd_l3_l8-supply:
+- vdd_l4-supply:
+- vdd_l5_l6-supply:
+- vdd_l10_l11_l12_l13-supply:
+- vdd_l7-supply:
+- vdd_l9-supply:
+- vdd_s1-supply:
+- vdd_s2-supply:
+- vdd_s3-supply:
+- vdd_s4-supply:
+- vdd_s5-supply
+	Usage: optional (pms405 only)
+	Value type: <phandle>
+	Definition: Reference to regulator supplying the input pin, as
+		    described in the data sheet.
+
 - qcom,saw-reg:
 	Usage: optional
 	Value type: <phandle>
@@ -120,6 +139,9 @@ The regulator node houses sub-nodes for each regulator within the device. Each
 sub-node is identified using the node's name, with valid values listed for each
 of the PMICs below.
 
+pm8005:
+	s1, s2, s3, s4
+
 pm8841:
 	s1, s2, s3, s4, s5, s6, s7, s8
 
diff --git a/Documentation/devicetree/bindings/regulator/regulator.txt b/Documentation/devicetree/bindings/regulator/regulator.txt
index 0a3f087d5844..487ccd8370b3 100644
--- a/Documentation/devicetree/bindings/regulator/regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/regulator.txt
@@ -1,139 +1 @@
-Voltage/Current Regulators
-
-Optional properties:
-- regulator-name: A string used as a descriptive name for regulator outputs
-- regulator-min-microvolt: smallest voltage consumers may set
-- regulator-max-microvolt: largest voltage consumers may set
-- regulator-microvolt-offset: Offset applied to voltages to compensate for voltage drops
-- regulator-min-microamp: smallest current consumers may set
-- regulator-max-microamp: largest current consumers may set
-- regulator-input-current-limit-microamp: maximum input current regulator allows
-- regulator-always-on: boolean, regulator should never be disabled
-- regulator-boot-on: bootloader/firmware enabled regulator
-- regulator-allow-bypass: allow the regulator to go into bypass mode
-- regulator-allow-set-load: allow the regulator performance level to be configured
-- <name>-supply: phandle to the parent supply/regulator node
-- regulator-ramp-delay: ramp delay for regulator(in uV/us)
-  For hardware which supports disabling ramp rate, it should be explicitly
-  initialised to zero (regulator-ramp-delay = <0>) for disabling ramp delay.
-- regulator-enable-ramp-delay: The time taken, in microseconds, for the supply
-  rail to reach the target voltage, plus/minus whatever tolerance the board
-  design requires. This property describes the total system ramp time
-  required due to the combination of internal ramping of the regulator itself,
-  and board design issues such as trace capacitance and load on the supply.
-- regulator-settling-time-us: Settling time, in microseconds, for voltage
-  change if regulator have the constant time for any level voltage change.
-  This is useful when regulator have exponential voltage change.
-- regulator-settling-time-up-us: Settling time, in microseconds, for voltage
-  increase if the regulator needs a constant time to settle after voltage
-  increases of any level. This is useful for regulators with exponential
-  voltage changes.
-- regulator-settling-time-down-us: Settling time, in microseconds, for voltage
-  decrease if the regulator needs a constant time to settle after voltage
-  decreases of any level. This is useful for regulators with exponential
-  voltage changes.
-- regulator-soft-start: Enable soft start so that voltage ramps slowly
-- regulator-state-standby sub-root node for Standby mode
-  : equivalent with standby Linux sleep state, which provides energy savings
-  with a relatively quick transition back time.
-- regulator-state-mem sub-root node for Suspend-to-RAM mode
-  : suspend to memory, the device goes to sleep, but all data stored in memory,
-  only some external interrupt can wake the device.
-- regulator-state-disk sub-root node for Suspend-to-DISK mode
-  : suspend to disk, this state operates similarly to Suspend-to-RAM,
-  but includes a final step of writing memory contents to disk.
-- regulator-state-[mem/disk/standby] node has following common properties:
-	- regulator-on-in-suspend: regulator should be on in suspend state.
-	- regulator-off-in-suspend: regulator should be off in suspend state.
-	- regulator-suspend-min-microvolt: minimum voltage may be set in
-	  suspend state.
-	- regulator-suspend-max-microvolt: maximum voltage may be set in
-	  suspend state.
-	- regulator-suspend-microvolt: the default voltage which regulator
-	  would be set in suspend. This property is now deprecated, instead
-	  setting voltage for suspend mode via the API which regulator
-	  driver provides is recommended.
-	- regulator-changeable-in-suspend: whether the default voltage and
-	  the regulator on/off in suspend can be changed in runtime.
-	- regulator-mode: operating mode in the given suspend state.
-	  The set of possible operating modes depends on the capabilities of
-	  every hardware so the valid modes are documented on each regulator
-	  device tree binding document.
-- regulator-initial-mode: initial operating mode. The set of possible operating
-  modes depends on the capabilities of every hardware so each device binding
-  documentation explains which values the regulator supports.
-- regulator-allowed-modes: list of operating modes that software is allowed to
-  configure for the regulator at run-time.  Elements may be specified in any
-  order.  The set of possible operating modes depends on the capabilities of
-  every hardware so each device binding document explains which values the
-  regulator supports.
-- regulator-system-load: Load in uA present on regulator that is not captured by
-  any consumer request.
-- regulator-pull-down: Enable pull down resistor when the regulator is disabled.
-- regulator-over-current-protection: Enable over current protection.
-- regulator-active-discharge: tristate, enable/disable active discharge of
-  regulators. The values are:
-	0: Disable active discharge.
-	1: Enable active discharge.
-	Absence of this property will leave configuration to default.
-- regulator-coupled-with: Regulators with which the regulator
-  is coupled. The linkage is 2-way - all coupled regulators should be linked
-  with each other. A regulator should not be coupled with its supplier.
-- regulator-coupled-max-spread: Array of maximum spread between voltages of
-  coupled regulators in microvolts, each value in the array relates to the
-  corresponding couple specified by the regulator-coupled-with property.
-- regulator-max-step-microvolt: Maximum difference between current and target
-  voltages that can be changed safely in a single step.
-
-Deprecated properties:
-- regulator-compatible: If a regulator chip contains multiple
-  regulators, and if the chip's binding contains a child node that
-  describes each regulator, then this property indicates which regulator
-  this child node is intended to configure. If this property is missing,
-  the node's name will be used instead.
-
-Example:
-
-	xyzreg: regulator@0 {
-		regulator-min-microvolt = <1000000>;
-		regulator-max-microvolt = <2500000>;
-		regulator-always-on;
-		vin-supply = <&vin>;
-
-		regulator-state-mem {
-			regulator-on-in-suspend;
-		};
-	};
-
-Regulator Consumers:
-Consumer nodes can reference one or more of its supplies/
-regulators using the below bindings.
-
-- <name>-supply: phandle to the regulator node
-
-These are the same bindings that a regulator in the above
-example used to reference its own supply, in which case
-its just seen as a special case of a regulator being a
-consumer itself.
-
-Example of a consumer device node (mmc) referencing two
-regulators (twl_reg1 and twl_reg2),
-
-	twl_reg1: regulator@0 {
-		...
-		...
-		...
-	};
-
-	twl_reg2: regulator@1 {
-		...
-		...
-		...
-	};
-
-	mmc: mmc@0 {
-		...
-		...
-		vmmc-supply = <&twl_reg1>;
-		vmmcaux-supply = <&twl_reg2>;
-	};
+This file has moved to regulator.yaml.
diff --git a/Documentation/devicetree/bindings/regulator/regulator.yaml b/Documentation/devicetree/bindings/regulator/regulator.yaml
new file mode 100644
index 000000000000..02c3043ce419
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/regulator.yaml
@@ -0,0 +1,200 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/regulator/regulator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Voltage/Current Regulators
+
+maintainers:
+  - Liam Girdwood <lgirdwood@gmail.com>
+  - Mark Brown <broonie@kernel.org>
+
+properties:
+  regulator-name:
+    description: A string used as a descriptive name for regulator outputs
+    $ref: "/schemas/types.yaml#/definitions/string"
+
+  regulator-min-microvolt:
+    description: smallest voltage consumers may set
+
+  regulator-max-microvolt:
+    description: largest voltage consumers may set
+
+  regulator-microvolt-offset:
+    description: Offset applied to voltages to compensate for voltage drops
+
+  regulator-min-microamp:
+    description: smallest current consumers may set
+
+  regulator-max-microamp:
+    description: largest current consumers may set
+
+  regulator-input-current-limit-microamp:
+    description: maximum input current regulator allows
+
+  regulator-always-on:
+    description: boolean, regulator should never be disabled
+    type: boolean
+
+  regulator-boot-on:
+    description: bootloader/firmware enabled regulator
+    type: boolean
+
+  regulator-allow-bypass:
+    description: allow the regulator to go into bypass mode
+    type: boolean
+
+  regulator-allow-set-load:
+    description: allow the regulator performance level to be configured
+    type: boolean
+
+  regulator-ramp-delay:
+    description: ramp delay for regulator(in uV/us) For hardware which supports
+      disabling ramp rate, it should be explicitly initialised to zero (regulator-ramp-delay
+      = <0>) for disabling ramp delay.
+    $ref: "/schemas/types.yaml#/definitions/uint32"
+
+  regulator-enable-ramp-delay:
+    description: The time taken, in microseconds, for the supply rail to
+      reach the target voltage, plus/minus whatever tolerance the board
+      design requires. This property describes the total system ramp time
+      required due to the combination of internal ramping of the regulator
+      itself, and board design issues such as trace capacitance and load
+      on the supply.
+    $ref: "/schemas/types.yaml#/definitions/uint32"
+
+  regulator-settling-time-us:
+    description: Settling time, in microseconds, for voltage change if regulator
+      have the constant time for any level voltage change. This is useful
+      when regulator have exponential voltage change.
+
+  regulator-settling-time-up-us:
+    description: Settling time, in microseconds, for voltage increase if
+      the regulator needs a constant time to settle after voltage increases
+      of any level. This is useful for regulators with exponential voltage
+      changes.
+
+  regulator-settling-time-down-us:
+    description: Settling time, in microseconds, for voltage decrease if
+      the regulator needs a constant time to settle after voltage decreases
+      of any level. This is useful for regulators with exponential voltage
+      changes.
+
+  regulator-soft-start:
+    description: Enable soft start so that voltage ramps slowly
+    type: boolean
+
+  regulator-initial-mode:
+    description: initial operating mode. The set of possible operating modes
+      depends on the capabilities of every hardware so each device binding
+      documentation explains which values the regulator supports.
+    $ref: "/schemas/types.yaml#/definitions/uint32"
+
+  regulator-allowed-modes:
+    description: list of operating modes that software is allowed to configure
+      for the regulator at run-time.  Elements may be specified in any order.
+      The set of possible operating modes depends on the capabilities of
+      every hardware so each device binding document explains which values
+      the regulator supports.
+    $ref: "/schemas/types.yaml#/definitions/uint32-array"
+
+  regulator-system-load:
+    description: Load in uA present on regulator that is not captured by
+      any consumer request.
+    $ref: "/schemas/types.yaml#/definitions/uint32"
+
+  regulator-pull-down:
+    description: Enable pull down resistor when the regulator is disabled.
+    type: boolean
+
+  regulator-over-current-protection:
+    description: Enable over current protection.
+    type: boolean
+
+  regulator-active-discharge:
+    description: |
+      tristate, enable/disable active discharge of regulators. The values are:
+      0: Disable active discharge.
+      1: Enable active discharge.
+      Absence of this property will leave configuration to default.
+    allOf:
+      - $ref: "/schemas/types.yaml#/definitions/uint32"
+      - enum: [ 0, 1 ]
+
+  regulator-coupled-with:
+    description: Regulators with which the regulator is coupled. The linkage
+      is 2-way - all coupled regulators should be linked with each other.
+      A regulator should not be coupled with its supplier.
+    $ref: "/schemas/types.yaml#/definitions/phandle-array"
+
+  regulator-coupled-max-spread:
+    description: Array of maximum spread between voltages of coupled regulators
+      in microvolts, each value in the array relates to the corresponding
+      couple specified by the regulator-coupled-with property.
+    $ref: "/schemas/types.yaml#/definitions/uint32"
+
+  regulator-max-step-microvolt:
+    description: Maximum difference between current and target voltages
+      that can be changed safely in a single step.
+
+patternProperties:
+  ".*-supply$":
+    description: Input supply phandle(s) for this node
+
+  regulator-state-(standby|mem|disk):
+    type: object
+    description:
+      sub-nodes for regulator state in Standby, Suspend-to-RAM, and
+      Suspend-to-DISK modes. Equivalent with standby, mem, and disk Linux
+      sleep states.
+
+    properties:
+      regulator-on-in-suspend:
+        description: regulator should be on in suspend state.
+        type: boolean
+
+      regulator-off-in-suspend:
+        description: regulator should be off in suspend state.
+        type: boolean
+
+      regulator-suspend-min-microvolt:
+        description: minimum voltage may be set in suspend state.
+
+      regulator-suspend-max-microvolt:
+        description: maximum voltage may be set in suspend state.
+
+      regulator-suspend-microvolt:
+        description: the default voltage which regulator would be set in
+          suspend. This property is now deprecated, instead setting voltage
+          for suspend mode via the API which regulator driver provides is
+          recommended.
+
+      regulator-changeable-in-suspend:
+        description: whether the default voltage and the regulator on/off
+          in suspend can be changed in runtime.
+        type: boolean
+
+      regulator-mode:
+        description: operating mode in the given suspend state. The set
+          of possible operating modes depends on the capabilities of every
+          hardware so the valid modes are documented on each regulator device
+          tree binding document.
+        $ref: "/schemas/types.yaml#/definitions/uint32"
+
+    additionalProperties: false
+
+examples:
+  - |
+    xyzreg: regulator@0 {
+      regulator-min-microvolt = <1000000>;
+      regulator-max-microvolt = <2500000>;
+      regulator-always-on;
+      vin-supply = <&vin>;
+
+      regulator-state-mem {
+        regulator-on-in-suspend;
+      };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/regulator/slg51000.txt b/Documentation/devicetree/bindings/regulator/slg51000.txt
new file mode 100644
index 000000000000..aa0733e49b90
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/slg51000.txt
@@ -0,0 +1,88 @@
+* Dialog Semiconductor SLG51000 Voltage Regulator
+
+Required properties:
+- compatible : Should be "dlg,slg51000" for SLG51000
+- reg : Specifies the I2C slave address.
+- xxx-supply: Input voltage supply regulator for ldo3 to ldo7.
+  These entries are required if regulators are enabled for a device.
+  An absence of these properties can cause the regulator registration to fail.
+  If some of input supply is powered through battery or always-on supply then
+  also it is required to have these parameters with proper node handle of always
+  on power supply.
+    vin3-supply: Input supply for ldo3
+    vin4-supply: Input supply for ldo4
+    vin5-supply: Input supply for ldo5
+    vin6-supply: Input supply for ldo6
+    vin7-supply: Input supply for ldo7
+
+Optional properties:
+- interrupt-parent : Specifies the reference to the interrupt controller.
+- interrupts : IRQ line information.
+- dlg,cs-gpios : Specify a valid GPIO for chip select
+
+Sub-nodes:
+- regulators : This node defines the settings for the regulators.
+  The content of the sub-node is defined by the standard binding
+  for regulators; see regulator.txt.
+
+  The SLG51000 regulators are bound using their names listed below:
+    ldo1
+    ldo2
+    ldo3
+    ldo4
+    ldo5
+    ldo6
+    ldo7
+
+Optional properties for regulators:
+- enable-gpios : Specify a valid GPIO for platform control of the regulator.
+
+Example:
+	pmic: slg51000@75 {
+		compatible = "dlg,slg51000";
+		reg = <0x75>;
+
+		regulators {
+			ldo1 {
+			        regulator-name = "ldo1";
+			        regulator-min-microvolt = <2400000>;
+			        regulator-max-microvolt = <3300000>;
+			};
+
+			ldo2 {
+			        regulator-name = "ldo2";
+			        regulator-min-microvolt = <2400000>;
+			        regulator-max-microvolt = <3300000>;
+			};
+
+			ldo3 {
+			        regulator-name = "ldo3";
+			        regulator-min-microvolt = <1200000>;
+			        regulator-max-microvolt = <3750000>;
+			};
+
+			ldo4 {
+			        regulator-name = "ldo4";
+			        regulator-min-microvolt = <1200000>;
+			        regulator-max-microvolt = <3750000>;
+			};
+
+			ldo5 {
+			        regulator-name = "ldo5";
+			        regulator-min-microvolt = <500000>;
+			        regulator-max-microvolt = <1200000>;
+			};
+
+			ldo6 {
+			        regulator-name = "ldo6";
+			        regulator-min-microvolt = <500000>;
+			        regulator-max-microvolt = <1200000>;
+			};
+
+			ldo7 {
+			        regulator-name = "ldo7";
+			        regulator-min-microvolt = <1200000>;
+			        regulator-max-microvolt = <3750000>;
+			};
+		};
+	};
diff --git a/Documentation/devicetree/bindings/regulator/st,stm32-booster.txt b/Documentation/devicetree/bindings/regulator/st,stm32-booster.txt
new file mode 100644
index 000000000000..479ad4c8758e
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/st,stm32-booster.txt
@@ -0,0 +1,18 @@
+STM32 BOOSTER - Booster for ADC analog input switches
+
+Some STM32 devices embed a 3.3V booster supplied by Vdda, that can be used
+to supply ADC analog input switches.
+
+Required properties:
+- compatible: Should be one of:
+  "st,stm32h7-booster"
+  "st,stm32mp1-booster"
+- st,syscfg: Phandle to system configuration controller.
+- vdda-supply: Phandle to the vdda input analog voltage.
+
+Example:
+	booster: regulator-booster {
+		compatible = "st,stm32mp1-booster";
+		st,syscfg = <&syscfg>;
+		vdda-supply = <&vdda>;
+	};
diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt b/Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt
deleted file mode 100644
index 66af2c30944f..000000000000
--- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt
+++ /dev/null
@@ -1,125 +0,0 @@
-Qualcomm Technology Inc. ADSP Peripheral Image Loader
-
-This document defines the binding for a component that loads and boots firmware
-on the Qualcomm Technology Inc. ADSP Hexagon core.
-
-- compatible:
-	Usage: required
-	Value type: <string>
-	Definition: must be one of:
-		    "qcom,sdm845-adsp-pil"
-
-- reg:
-	Usage: required
-	Value type: <prop-encoded-array>
-	Definition: must specify the base address and size of the qdsp6ss register
-
-- interrupts-extended:
-	Usage: required
-	Value type: <prop-encoded-array>
-	Definition: must list the watchdog, fatal IRQs ready, handover and
-		    stop-ack IRQs
-
-- interrupt-names:
-	Usage: required
-	Value type: <stringlist>
-	Definition: must be "wdog", "fatal", "ready", "handover", "stop-ack"
-
-- clocks:
-	Usage: required
-	Value type: <prop-encoded-array>
-	Definition:  List of 8 phandle and clock specifier pairs for the adsp.
-
-- clock-names:
-	Usage: required
-	Value type: <stringlist>
-	Definition: List of clock input name strings sorted in the same
-		    order as the clocks property. Definition must have
-		    "xo", "sway_cbcr", "lpass_ahbs_aon_cbcr",
-		    "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", "qdsp6ss_sleep"
-		    and "qdsp6ss_core".
-
-- power-domains:
-	Usage: required
-	Value type: <phandle>
-	Definition: reference to cx power domain node.
-
-- resets:
-	Usage: required
-	Value type: <phandle>
-	Definition: reference to the list of 2 reset-controller for the adsp.
-
-- reset-names:
-        Usage: required
-        Value type: <stringlist>
-        Definition: must be "pdc_sync" and "cc_lpass"
-
-- qcom,halt-regs:
-	Usage: required
-	Value type: <prop-encoded-array>
-	Definition: a phandle reference to a syscon representing TCSR followed
-			by the offset within syscon for lpass halt register.
-
-- memory-region:
-	Usage: required
-	Value type: <phandle>
-	Definition: reference to the reserved-memory for the ADSP
-
-- qcom,smem-states:
-	Usage: required
-	Value type: <phandle>
-	Definition: reference to the smem state for requesting the ADSP to
-		    shut down
-
-- qcom,smem-state-names:
-	Usage: required
-	Value type: <stringlist>
-	Definition: must be "stop"
-
-
-= SUBNODES
-The adsp node may have an subnode named "glink-edge" that describes the
-communication edge, channels and devices related to the ADSP.
-See ../soc/qcom/qcom,glink.txt for details on how to describe these.
-
-= EXAMPLE
-The following example describes the resources needed to boot control the
-ADSP, as it is found on SDM845 boards.
-
-	remoteproc@17300000 {
-		compatible = "qcom,sdm845-adsp-pil";
-		reg = <0x17300000 0x40c>;
-
-		interrupts-extended = <&intc GIC_SPI 162 IRQ_TYPE_EDGE_RISING>,
-			<&adsp_smp2p_in 0 IRQ_TYPE_EDGE_RISING>,
-			<&adsp_smp2p_in 1 IRQ_TYPE_EDGE_RISING>,
-			<&adsp_smp2p_in 2 IRQ_TYPE_EDGE_RISING>,
-			<&adsp_smp2p_in 3 IRQ_TYPE_EDGE_RISING>;
-		interrupt-names = "wdog", "fatal", "ready",
-			"handover", "stop-ack";
-
-		clocks = <&rpmhcc RPMH_CXO_CLK>,
-			<&gcc GCC_LPASS_SWAY_CLK>,
-			<&lpasscc LPASS_Q6SS_AHBS_AON_CLK>,
-			<&lpasscc LPASS_Q6SS_AHBM_AON_CLK>,
-			<&lpasscc LPASS_QDSP6SS_XO_CLK>,
-			<&lpasscc LPASS_QDSP6SS_SLEEP_CLK>,
-			<&lpasscc LPASS_QDSP6SS_CORE_CLK>;
-		clock-names = "xo", "sway_cbcr",
-			"lpass_ahbs_aon_cbcr",
-			"lpass_ahbm_aon_cbcr", "qdsp6ss_xo",
-			"qdsp6ss_sleep", "qdsp6ss_core";
-
-		power-domains = <&rpmhpd SDM845_CX>;
-
-		resets = <&pdc_reset PDC_AUDIO_SYNC_RESET>,
-			 <&aoss_reset AOSS_CC_LPASS_RESTART>;
-		reset-names = "pdc_sync", "cc_lpass";
-
-		qcom,halt-regs = <&tcsr_mutex_regs 0x22000>;
-
-		memory-region = <&pil_adsp_mem>;
-
-		qcom,smem-states = <&adsp_smp2p_out 0>;
-		qcom,smem-state-names = "stop";
-	};
diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt b/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt
new file mode 100644
index 000000000000..1337a3d93d35
--- /dev/null
+++ b/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt
@@ -0,0 +1,140 @@
+Qualcomm Technology Inc. Hexagon v56 Peripheral Image Loader
+
+This document defines the binding for a component that loads and boots firmware
+on the Qualcomm Technology Inc. Hexagon v56 core.
+
+- compatible:
+	Usage: required
+	Value type: <string>
+	Definition: must be one of:
+		    "qcom,qcs404-cdsp-pil",
+		    "qcom,sdm845-adsp-pil"
+
+- reg:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: must specify the base address and size of the qdsp6ss register
+
+- interrupts-extended:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: must list the watchdog, fatal IRQs ready, handover and
+		    stop-ack IRQs
+
+- interrupt-names:
+	Usage: required
+	Value type: <stringlist>
+	Definition: must be "wdog", "fatal", "ready", "handover", "stop-ack"
+
+- clocks:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition:  List of phandles and clock specifier pairs for the Hexagon,
+		     per clock-names below.
+
+- clock-names:
+	Usage: required for SDM845 ADSP
+	Value type: <stringlist>
+	Definition: List of clock input name strings sorted in the same
+		    order as the clocks property. Definition must have
+		    "xo", "sway_cbcr", "lpass_ahbs_aon_cbcr",
+		    "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", "qdsp6ss_sleep"
+		    and "qdsp6ss_core".
+
+- clock-names:
+	Usage: required for QCS404 CDSP
+	Value type: <stringlist>
+	Definition: List of clock input name strings sorted in the same
+		    order as the clocks property. Definition must have
+		    "xo", "sway", "tbu", "bimc", "ahb_aon", "q6ss_slave",
+		    "q6ss_master", "q6_axim".
+
+- power-domains:
+	Usage: required
+	Value type: <phandle>
+	Definition: reference to cx power domain node.
+
+- resets:
+	Usage: required
+	Value type: <phandle>
+	Definition: reference to the list of resets for the Hexagon.
+
+- reset-names:
+        Usage: required for SDM845 ADSP
+        Value type: <stringlist>
+        Definition: must be "pdc_sync" and "cc_lpass"
+
+- reset-names:
+        Usage: required for QCS404 CDSP
+        Value type: <stringlist>
+        Definition: must be "restart"
+
+- qcom,halt-regs:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: a phandle reference to a syscon representing TCSR followed
+		    by the offset within syscon for Hexagon halt register.
+
+- memory-region:
+	Usage: required
+	Value type: <phandle>
+	Definition: reference to the reserved-memory for the firmware
+
+- qcom,smem-states:
+	Usage: required
+	Value type: <phandle>
+	Definition: reference to the smem state for requesting the Hexagon to
+		    shut down
+
+- qcom,smem-state-names:
+	Usage: required
+	Value type: <stringlist>
+	Definition: must be "stop"
+
+
+= SUBNODES
+The adsp node may have an subnode named "glink-edge" that describes the
+communication edge, channels and devices related to the Hexagon.
+See ../soc/qcom/qcom,glink.txt for details on how to describe these.
+
+= EXAMPLE
+The following example describes the resources needed to boot control the
+ADSP, as it is found on SDM845 boards.
+
+	remoteproc@17300000 {
+		compatible = "qcom,sdm845-adsp-pil";
+		reg = <0x17300000 0x40c>;
+
+		interrupts-extended = <&intc GIC_SPI 162 IRQ_TYPE_EDGE_RISING>,
+			<&adsp_smp2p_in 0 IRQ_TYPE_EDGE_RISING>,
+			<&adsp_smp2p_in 1 IRQ_TYPE_EDGE_RISING>,
+			<&adsp_smp2p_in 2 IRQ_TYPE_EDGE_RISING>,
+			<&adsp_smp2p_in 3 IRQ_TYPE_EDGE_RISING>;
+		interrupt-names = "wdog", "fatal", "ready",
+			"handover", "stop-ack";
+
+		clocks = <&rpmhcc RPMH_CXO_CLK>,
+			<&gcc GCC_LPASS_SWAY_CLK>,
+			<&lpasscc LPASS_Q6SS_AHBS_AON_CLK>,
+			<&lpasscc LPASS_Q6SS_AHBM_AON_CLK>,
+			<&lpasscc LPASS_QDSP6SS_XO_CLK>,
+			<&lpasscc LPASS_QDSP6SS_SLEEP_CLK>,
+			<&lpasscc LPASS_QDSP6SS_CORE_CLK>;
+		clock-names = "xo", "sway_cbcr",
+			"lpass_ahbs_aon_cbcr",
+			"lpass_ahbm_aon_cbcr", "qdsp6ss_xo",
+			"qdsp6ss_sleep", "qdsp6ss_core";
+
+		power-domains = <&rpmhpd SDM845_CX>;
+
+		resets = <&pdc_reset PDC_AUDIO_SYNC_RESET>,
+			 <&aoss_reset AOSS_CC_LPASS_RESTART>;
+		reset-names = "pdc_sync", "cc_lpass";
+
+		qcom,halt-regs = <&tcsr_mutex_regs 0x22000>;
+
+		memory-region = <&pil_adsp_mem>;
+
+		qcom,smem-states = <&adsp_smp2p_out 0>;
+		qcom,smem-state-names = "stop";
+	};
diff --git a/Documentation/devicetree/bindings/remoteproc/stm32-rproc.txt b/Documentation/devicetree/bindings/remoteproc/stm32-rproc.txt
new file mode 100644
index 000000000000..5fa915a4b736
--- /dev/null
+++ b/Documentation/devicetree/bindings/remoteproc/stm32-rproc.txt
@@ -0,0 +1,63 @@
+STMicroelectronics STM32 Remoteproc
+-----------------------------------
+This document defines the binding for the remoteproc component that loads and
+boots firmwares on the ST32MP family chipset.
+
+Required properties:
+- compatible:	Must be "st,stm32mp1-m4"
+- reg:		Address ranges of the RETRAM and MCU SRAM memories used by the
+		remote processor.
+- resets:	Reference to a reset controller asserting the remote processor.
+- st,syscfg-holdboot: Reference to the system configuration which holds the
+		remote processor reset hold boot
+	1st cell: phandle of syscon block
+	2nd cell: register offset containing the hold boot setting
+	3rd cell: register bitmask for the hold boot field
+- st,syscfg-tz: Reference to the system configuration which holds the RCC trust
+		zone mode
+	1st cell: phandle to syscon block
+	2nd cell: register offset containing the RCC trust zone mode setting
+	3rd cell: register bitmask for the RCC trust zone mode bit
+
+Optional properties:
+- interrupts:	Should contain the watchdog interrupt
+- mboxes:	This property is required only if the rpmsg/virtio functionality
+		is used. List of phandle and mailbox channel specifiers:
+		- a channel (a) used to communicate through virtqueues with the
+		  remote proc.
+		  Bi-directional channel:
+		      - from local to remote = send message
+		      - from remote to local = send message ack
+		- a channel (b) working the opposite direction of channel (a)
+		- a channel (c) used by the local proc to notify the remote proc
+		  that it is about to be shut down.
+		  Unidirectional channel:
+		      - from local to remote, where ACK from the remote means
+		        that it is ready for shutdown
+- mbox-names:	This property is required if the mboxes property is used.
+		- must be "vq0" for channel (a)
+		- must be "vq1" for channel (b)
+		- must be "shutdown" for channel (c)
+- memory-region: List of phandles to the reserved memory regions associated with
+		the remoteproc device. This is variable and describes the
+		memories shared with the remote processor (eg: remoteproc
+		firmware and carveouts, rpmsg vrings, ...).
+		(see ../reserved-memory/reserved-memory.txt)
+- st,syscfg-pdds: Reference to the system configuration which holds the remote
+		processor deep sleep setting
+	1st cell: phandle to syscon block
+	2nd cell: register offset containing the deep sleep setting
+	3rd cell: register bitmask for the deep sleep bit
+- st,auto-boot:	If defined, when remoteproc is probed, it loads the default
+		firmware and starts the remote processor.
+
+Example:
+	m4_rproc: m4@10000000 {
+		compatible = "st,stm32mp1-m4";
+		reg = <0x10000000 0x40000>,
+		      <0x30000000 0x40000>,
+		      <0x38000000 0x10000>;
+		resets = <&rcc MCU_R>;
+		st,syscfg-holdboot = <&rcc 0x10C 0x1>;
+		st,syscfg-tz = <&rcc 0x000 0x1>;
+	};
diff --git a/Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt b/Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt
new file mode 100644
index 000000000000..a6f8455ae6c4
--- /dev/null
+++ b/Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt
@@ -0,0 +1,18 @@
+Bitmain BM1880 SoC Reset Controller
+===================================
+
+Please also refer to reset.txt in this directory for common reset
+controller binding usage.
+
+Required properties:
+- compatible:   Should be "bitmain,bm1880-reset"
+- reg:          Offset and length of reset controller space in SCTRL.
+- #reset-cells: Must be 1.
+
+Example:
+
+        rst: reset-controller@c00 {
+                compatible = "bitmain,bm1880-reset";
+                reg = <0xc00 0x8>;
+                #reset-cells = <1>;
+        };
diff --git a/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt b/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt
index 2ecf33815d18..13e095182db4 100644
--- a/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt
+++ b/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt
@@ -45,6 +45,6 @@ Example:
         };
 
 
-For list of all valid reset indicies see
+For list of all valid reset indices see
 <dt-bindings/reset/imx7-reset.h> for i.MX7 and
 <dt-bindings/reset/imx8mq-reset.h> for i.MX8MQ
diff --git a/Documentation/devicetree/bindings/riscv/cpus.yaml b/Documentation/devicetree/bindings/riscv/cpus.yaml
new file mode 100644
index 000000000000..c899111aa5e3
--- /dev/null
+++ b/Documentation/devicetree/bindings/riscv/cpus.yaml
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: (GPL-2.0 OR MIT)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/riscv/cpus.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: RISC-V bindings for 'cpus' DT nodes
+
+maintainers:
+  - Paul Walmsley <paul.walmsley@sifive.com>
+  - Palmer Dabbelt <palmer@sifive.com>
+
+properties:
+  compatible:
+    items:
+      - enum:
+          - sifive,rocket0
+          - sifive,e5
+          - sifive,e51
+          - sifive,u54-mc
+          - sifive,u54
+          - sifive,u5
+      - const: riscv
+    description:
+      Identifies that the hart uses the RISC-V instruction set
+      and identifies the type of the hart.
+
+  mmu-type:
+    allOf:
+      - $ref: "/schemas/types.yaml#/definitions/string"
+      - enum:
+          - riscv,sv32
+          - riscv,sv39
+          - riscv,sv48
+    description:
+      Identifies the MMU address translation mode used on this
+      hart.  These values originate from the RISC-V Privileged
+      Specification document, available from
+      https://riscv.org/specifications/
+
+  riscv,isa:
+    allOf:
+      - $ref: "/schemas/types.yaml#/definitions/string"
+      - enum:
+          - rv64imac
+          - rv64imafdc
+    description:
+      Identifies the specific RISC-V instruction set architecture
+      supported by the hart.  These are documented in the RISC-V
+      User-Level ISA document, available from
+      https://riscv.org/specifications/
+
+  timebase-frequency:
+    type: integer
+    minimum: 1
+    description:
+      Specifies the clock frequency of the system timer in Hz.
+      This value is common to all harts on a single system image.
+
+  interrupt-controller:
+    type: object
+    description: Describes the CPU's local interrupt controller
+
+    properties:
+      '#interrupt-cells':
+        const: 1
+
+      compatible:
+        const: riscv,cpu-intc
+
+      interrupt-controller: true
+
+    required:
+      - '#interrupt-cells'
+      - compatible
+      - interrupt-controller
+
+required:
+  - riscv,isa
+  - timebase-frequency
+  - interrupt-controller
+
+examples:
+  - |
+    // Example 1: SiFive Freedom U540G Development Kit
+    cpus {
+        #address-cells = <1>;
+        #size-cells = <0>;
+        timebase-frequency = <1000000>;
+        cpu@0 {
+                clock-frequency = <0>;
+                compatible = "sifive,rocket0", "riscv";
+                device_type = "cpu";
+                i-cache-block-size = <64>;
+                i-cache-sets = <128>;
+                i-cache-size = <16384>;
+                reg = <0>;
+                riscv,isa = "rv64imac";
+                cpu_intc0: interrupt-controller {
+                        #interrupt-cells = <1>;
+                        compatible = "riscv,cpu-intc";
+                        interrupt-controller;
+                };
+        };
+        cpu@1 {
+                clock-frequency = <0>;
+                compatible = "sifive,rocket0", "riscv";
+                d-cache-block-size = <64>;
+                d-cache-sets = <64>;
+                d-cache-size = <32768>;
+                d-tlb-sets = <1>;
+                d-tlb-size = <32>;
+                device_type = "cpu";
+                i-cache-block-size = <64>;
+                i-cache-sets = <64>;
+                i-cache-size = <32768>;
+                i-tlb-sets = <1>;
+                i-tlb-size = <32>;
+                mmu-type = "riscv,sv39";
+                reg = <1>;
+                riscv,isa = "rv64imafdc";
+                tlb-split;
+                cpu_intc1: interrupt-controller {
+                        #interrupt-cells = <1>;
+                        compatible = "riscv,cpu-intc";
+                        interrupt-controller;
+                };
+        };
+    };
+
+  - |
+    // Example 2: Spike ISA Simulator with 1 Hart
+    cpus {
+        #address-cells = <1>;
+        #size-cells = <0>;
+        cpu@0 {
+                device_type = "cpu";
+                reg = <0>;
+                compatible = "riscv";
+                riscv,isa = "rv64imafdc";
+                mmu-type = "riscv,sv48";
+                interrupt-controller {
+                        #interrupt-cells = <1>;
+                        interrupt-controller;
+                        compatible = "riscv,cpu-intc";
+                };
+        };
+    };
+...
diff --git a/Documentation/devicetree/bindings/riscv/sifive.yaml b/Documentation/devicetree/bindings/riscv/sifive.yaml
new file mode 100644
index 000000000000..9d17dc2f3f84
--- /dev/null
+++ b/Documentation/devicetree/bindings/riscv/sifive.yaml
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: (GPL-2.0 OR MIT)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/riscv/sifive.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: SiFive SoC-based boards
+
+maintainers:
+  - Paul Walmsley <paul.walmsley@sifive.com>
+  - Palmer Dabbelt <palmer@sifive.com>
+
+description:
+  SiFive SoC-based boards
+
+properties:
+  $nodename:
+    const: '/'
+  compatible:
+    items:
+      - enum:
+          - sifive,freedom-unleashed-a00
+      - const: sifive,fu540-c000
+      - const: sifive,fu540
+...
diff --git a/Documentation/devicetree/bindings/rng/brcm,iproc-rng200.txt b/Documentation/devicetree/bindings/rng/brcm,iproc-rng200.txt
index 0014da9145af..c223e54452da 100644
--- a/Documentation/devicetree/bindings/rng/brcm,iproc-rng200.txt
+++ b/Documentation/devicetree/bindings/rng/brcm,iproc-rng200.txt
@@ -2,6 +2,7 @@ HWRNG support for the iproc-rng200 driver
 
 Required properties:
 - compatible : Must be one of:
+	       "brcm,bcm7211-rng200"
 	       "brcm,bcm7278-rng200"
 	       "brcm,iproc-rng200"
 - reg : base address and size of control register block
diff --git a/Documentation/devicetree/bindings/rtc/allwinner,sun4i-a10-rtc.yaml b/Documentation/devicetree/bindings/rtc/allwinner,sun4i-a10-rtc.yaml
new file mode 100644
index 000000000000..46d69c32b89b
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/allwinner,sun4i-a10-rtc.yaml
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/rtc/allwinner,sun4i-a10-rtc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 RTC Device Tree Bindings
+
+allOf:
+  - $ref: "rtc.yaml#"
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  compatible:
+    enum:
+      - allwinner,sun4i-a10-rtc
+      - allwinner,sun7i-a20-rtc
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+additionalProperties: false
+
+examples:
+  - |
+    rtc: rtc@1c20d00 {
+        compatible = "allwinner,sun4i-a10-rtc";
+        reg = <0x01c20d00 0x20>;
+        interrupts = <24>;
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml b/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml
new file mode 100644
index 000000000000..924622f39c44
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/rtc/allwinner,sun6i-a31-rtc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A31 RTC Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  "#clock-cells":
+    const: 1
+
+  compatible:
+    oneOf:
+      - const: allwinner,sun6i-a31-rtc
+      - const: allwinner,sun8i-a23-rtc
+      - const: allwinner,sun8i-h3-rtc
+      - const: allwinner,sun8i-r40-rtc
+      - const: allwinner,sun8i-v3-rtc
+      - const: allwinner,sun50i-h5-rtc
+      - items:
+          - const: allwinner,sun50i-a64-rtc
+          - const: allwinner,sun8i-h3-rtc
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    minItems: 1
+    maxItems: 2
+    items:
+      - description: RTC Alarm 0
+      - description: RTC Alarm 1
+
+  clocks:
+    maxItems: 1
+
+  clock-output-names:
+    minItems: 1
+    maxItems: 3
+    description:
+      The RTC provides up to three clocks
+        - the Low Frequency Oscillator or LOSC, at index 0,
+        - the Low Frequency Oscillator External output (X32KFOUT in
+          the datasheet), at index 1,
+        - the Internal Oscillator, at index 2.
+
+allOf:
+  - $ref: "rtc.yaml#"
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: allwinner,sun6i-a31-rtc
+
+    then:
+      properties:
+        clock-output-names:
+          minItems: 1
+          maxItems: 1
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - allwinner,sun8i-a23-rtc
+              - allwinner,sun8i-r40-rtc
+              - allwinner,sun8i-v3-rtc
+
+    then:
+      properties:
+        clock-output-names:
+          minItems: 2
+          maxItems: 2
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - allwinner,sun8i-h3-rtc
+              - allwinner,sun50i-h5-rtc
+
+    then:
+      properties:
+        clock-output-names:
+          minItems: 3
+          maxItems: 3
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: allwinner,sun8i-r40-rtc
+
+    then:
+      properties:
+        interrupts:
+          minItems: 1
+          maxItems: 1
+
+    else:
+      properties:
+        interrupts:
+          minItems: 2
+          maxItems: 2
+
+required:
+  - "#clock-cells"
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - clock-output-names
+
+additionalProperties: false
+
+examples:
+  - |
+    rtc: rtc@1f00000 {
+        compatible = "allwinner,sun6i-a31-rtc";
+        reg = <0x01f00000 0x400>;
+        interrupts = <0 40 4>, <0 41 4>;
+        clock-output-names = "osc32k";
+        clocks = <&ext_osc32k>;
+        #clock-cells = <1>;
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/rtc/rtc.txt b/Documentation/devicetree/bindings/rtc/rtc.txt
index a97fc6a9a75e..b8d36fce5e2d 100644
--- a/Documentation/devicetree/bindings/rtc/rtc.txt
+++ b/Documentation/devicetree/bindings/rtc/rtc.txt
@@ -1,72 +1 @@
-Generic device tree bindings for Real Time Clock devices
-========================================================
-
-This document describes generic bindings which can be used to describe Real Time
-Clock devices in a device tree.
-
-Required properties
--------------------
-
-- compatible : name of RTC device following generic names recommended practice.
-
-For other required properties e.g. to describe register sets,
-clocks, etc. check the binding documentation of the specific driver.
-
-Optional properties
--------------------
-
-- start-year : if provided, the default hardware range supported by the RTC is
-               shifted so the first usable year is the specified one.
-
-The following properties may not be supported by all drivers. However, if a
-driver wants to support one of the below features, it should adapt the bindings
-below.
-- trickle-resistor-ohms :   Selected resistor for trickle charger. Should be given
-                            if trickle charger should be enabled
-- trickle-diode-disable :   Do not use internal trickle charger diode Should be
-                            given if internal trickle charger diode should be
-                            disabled
-- wakeup-source :           Enables wake up of host system on alarm
-- quartz-load-femtofarads : The capacitive load of the quartz(x-tal),
-                            expressed in femto Farad (fF).
-                            The default value shall be listed (if optional),
-                            and likewise all valid values.
-
-Trivial RTCs
-------------
-
-This is a list of trivial RTC devices that have simple device tree
-bindings, consisting only of a compatible field, an address and
-possibly an interrupt line.
-
-
-Compatible		Vendor / Chip
-==========		=============
-abracon,abb5zes3	AB-RTCMC-32.768kHz-B5ZE-S3: Real Time Clock/Calendar Module with I2C Interface
-abracon,abeoz9		AB-RTCMC-32.768kHz-EOZ9: Real Time Clock/Calendar Module with I2C Interface
-dallas,ds1374		I2C, 32-Bit Binary Counter Watchdog RTC with Trickle Charger and Reset Input/Output
-dallas,ds1672		Dallas DS1672 Real-time Clock
-dallas,ds3232		Extremely Accurate I²C RTC with Integrated Crystal and SRAM
-epson,rx8010		I2C-BUS INTERFACE REAL TIME CLOCK MODULE
-epson,rx8571		I2C-BUS INTERFACE REAL TIME CLOCK MODULE with Battery Backed RAM
-epson,rx8581		I2C-BUS INTERFACE REAL TIME CLOCK MODULE
-emmicro,em3027		EM Microelectronic EM3027 Real-time Clock
-isil,isl1208		Intersil ISL1208 Low Power RTC with Battery Backed SRAM
-isil,isl1218		Intersil ISL1218 Low Power RTC with Battery Backed SRAM
-isil,isl12022		Intersil ISL12022 Real-time Clock
-microcrystal,rv3028	Real Time Clock Module with I2C-Bus
-microcrystal,rv3029	Real Time Clock Module with I2C-Bus
-microcrystal,rv8523	Real Time Clock
-nxp,pcf2127		Real-time clock
-nxp,pcf2129		Real-time clock
-nxp,pcf8563		Real-time clock/calendar
-pericom,pt7c4338	Real-time Clock Module
-ricoh,r2025sd		I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-ricoh,r2221tl		I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-ricoh,rs5c372a		I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-ricoh,rs5c372b		I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-ricoh,rv5c386		I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-ricoh,rv5c387a		I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-sii,s35390a		2-wire CMOS real-time clock
-whwave,sd3078		I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-xircom,x1205		Xircom X1205 I2C RTC
+This file has been moved to rtc.yaml.
diff --git a/Documentation/devicetree/bindings/rtc/rtc.yaml b/Documentation/devicetree/bindings/rtc/rtc.yaml
new file mode 100644
index 000000000000..ee237b2ed66a
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/rtc.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/rtc/rtc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: RTC Generic Binding
+
+maintainers:
+  - Alexandre Belloni <alexandre.belloni@bootlin.com>
+
+description: |
+  This document describes generic bindings which can be used to
+  describe Real Time Clock devices in a device tree.
+
+properties:
+  $nodename:
+    pattern: "^rtc(@.*|-[0-9a-f])*$"
+
+  quartz-load-femtofarads:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      The capacitive load of the quartz(x-tal), expressed in femto
+      Farad (fF). The default value shall be listed (if optional),
+      and likewise all valid values.
+
+  start-year:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      If provided, the default hardware range supported by the RTC is
+      shifted so the first usable year is the specified one.
+
+  trickle-diode-disable:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      Do not use internal trickle charger diode. Should be given if
+      internal trickle charger diode should be disabled.
+
+  trickle-resistor-ohms:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      Selected resistor for trickle charger. Should be given
+      if trickle charger should be enabled.
+
+  wakeup-source:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      Enables wake up of host system on alarm.
+
+...
diff --git a/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt b/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt
deleted file mode 100644
index 6b732c41392b..000000000000
--- a/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-* sun6i Real Time Clock
-
-RTC controller for the Allwinner A31
-
-Required properties:
-- compatible	: Should be one of the following combinations:
-		    - "allwinner,sun6i-a31-rtc"
-		    - "allwinner,sun8i-a23-rtc"
-		    - "allwinner,sun8i-h3-rtc"
-		    - "allwinner,sun8i-r40-rtc", "allwinner,sun8i-h3-rtc"
-		    - "allwinner,sun8i-v3-rtc"
-		    - "allwinner,sun50i-a64-rtc", "allwinner,sun8i-h3-rtc"
-		    - "allwinner,sun50i-h5-rtc"
-
-		  Where there are two or more compatible strings, this
-		  denotes the hardware covered by the most specific one
-		  is backward-compatible with the latter ones, and the
-		  implementation for the latter ones can be used, albeit
-		  with reduced functionality.
-
-- reg		: physical base address of the controller and length of
-		  memory mapped region.
-- interrupts	: IRQ lines for the RTC alarm 0 and alarm 1, in that order.
-
-Required properties for new device trees
-- clocks	: phandle to the 32kHz external oscillator
-- clock-output-names : names of up to three clock outputs. See below.
-- #clock-cells  : must be equal to 1.
-
-The RTC provides the following clocks at the given indices:
-- 0: LOSC
-- 1: LOSC external output, known as X32KFOUT in the datasheet.
-     This clock is not available on the A31 and is deprecated for old
-     device trees still using the "allwinner,sun6i-a31-rtc" compatible.
-- 2: InternalOSC, or internal RC oscillator (A64/H3/H5 only)
-
-Example:
-
-rtc: rtc@1f00000 {
-	compatible = "allwinner,sun6i-a31-rtc";
-	reg = <0x01f00000 0x400>;
-	interrupts = <0 40 4>, <0 41 4>;
-	clock-output-names = "osc32k";
-	clocks = <&ext_osc32k>;
-	#clock-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt b/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt
deleted file mode 100644
index 4a8d79c1cf08..000000000000
--- a/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-* sun4i/sun7i Real Time Clock
-
-RTC controller for the Allwinner A10/A20
-
-Required properties:
-- compatible : Should be "allwinner,sun4i-a10-rtc" or "allwinner,sun7i-a20-rtc"
-- reg: physical base address of the controller and length of memory mapped
-  region.
-- interrupts: IRQ line for the RTC.
-
-Example:
-
-rtc: rtc@1c20d00 {
-	compatible = "allwinner,sun4i-a10-rtc";
-	reg = <0x01c20d00 0x20>;
-	interrupts = <24>;
-};
diff --git a/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml b/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml
new file mode 100644
index 000000000000..0c12ce9a9b45
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/rtc/trivial-rtc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Trivial RTCs
+
+maintainers:
+  - Alexandre Belloni <alexandre.belloni@bootlin.com>
+
+description: |
+  This is a list of trivial RTC devices that have simple device tree
+  bindings, consisting only of a compatible field, an address and
+  possibly an interrupt line.
+
+allOf:
+  - $ref: "rtc.yaml#"
+
+properties:
+  compatible:
+    enum:
+      # AB-RTCMC-32.768kHz-B5ZE-S3: Real Time Clock/Calendar Module with I2C Interface
+      - abracon,abb5zes3
+      # AB-RTCMC-32.768kHz-EOZ9: Real Time Clock/Calendar Module with I2C Interface
+      - abracon,abeoz9
+      # I2C, 32-Bit Binary Counter Watchdog RTC with Trickle Charger and Reset Input/Output
+      - dallas,ds1374
+      # Dallas DS1672 Real-time Clock
+      - dallas,ds1672
+      # Extremely Accurate I²C RTC with Integrated Crystal and SRAM
+      - dallas,ds3232
+      # I2C-BUS INTERFACE REAL TIME CLOCK MODULE
+      - epson,rx8010
+      # I2C-BUS INTERFACE REAL TIME CLOCK MODULE with Battery Backed RAM
+      - epson,rx8571
+      # I2C-BUS INTERFACE REAL TIME CLOCK MODULE
+      - epson,rx8581
+      # Intersil ISL1208 Low Power RTC with Battery Backed SRAM
+      - isil,isl1208
+      # Intersil ISL1218 Low Power RTC with Battery Backed SRAM
+      - isil,isl1218
+      # Intersil ISL12022 Real-time Clock
+      - isil,isl12022
+      # Real Time Clock Module with I2C-Bus
+      - microcrystal,rv3028
+      # Real Time Clock Module with I2C-Bus
+      - microcrystal,rv3029
+      # Real Time Clock
+      - microcrystal,rv8523
+      # Real-time clock
+      - nxp,pcf2127
+      # Real-time clock
+      - nxp,pcf2129
+      # Real-time clock/calendar
+      - nxp,pcf8563
+      # Real-time Clock Module
+      - pericom,pt7c4338
+      # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+      - ricoh,r2025sd
+      # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+      - ricoh,r2221tl
+      # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+      - ricoh,rs5c372a
+      # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+      - ricoh,rs5c372b
+      # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+      - ricoh,rv5c386
+      # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+      - ricoh,rv5c387a
+      # 2-wire CMOS real-time clock
+      - sii,s35390a
+      # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+      - whwave,sd3078
+      # Xircom X1205 I2C RTC
+      - xircom,x1205
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  start-year: true
+
+required:
+  - compatible
+  - reg
+
+additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/serial/8250.txt b/Documentation/devicetree/bindings/serial/8250.txt
index 3cba12f855b7..20d351f268ef 100644
--- a/Documentation/devicetree/bindings/serial/8250.txt
+++ b/Documentation/devicetree/bindings/serial/8250.txt
@@ -53,6 +53,9 @@ Optional properties:
   programmable TX FIFO thresholds.
 - resets : phandle + reset specifier pairs
 - overrun-throttle-ms : how long to pause uart rx when input overrun is encountered.
+- {rts,cts,dtr,dsr,rng,dcd}-gpios: specify a GPIO for RTS/CTS/DTR/DSR/RI/DCD
+  line respectively. It will use specified GPIO instead of the peripheral
+  function pin for the UART feature. If unsure, don't specify this property.
 
 Note:
 * fsl,ns16550:
@@ -74,3 +77,19 @@ Example:
 		interrupts = <10>;
 		reg-shift = <2>;
 	};
+
+Example for OMAP UART using GPIO-based modem control signals:
+
+	uart4: serial@49042000 {
+		compatible = "ti,omap3-uart";
+		reg = <0x49042000 0x400>;
+		interrupts = <80>;
+		ti,hwmods = "uart4";
+		clock-frequency = <48000000>;
+		cts-gpios = <&gpio3 5 GPIO_ACTIVE_LOW>;
+		rts-gpios = <&gpio3 6 GPIO_ACTIVE_LOW>;
+		dtr-gpios = <&gpio1 12 GPIO_ACTIVE_LOW>;
+		dsr-gpios = <&gpio1 13 GPIO_ACTIVE_LOW>;
+		dcd-gpios = <&gpio1 14 GPIO_ACTIVE_LOW>;
+		rng-gpios = <&gpio1 15 GPIO_ACTIVE_LOW>;
+	};
diff --git a/Documentation/devicetree/bindings/serial/mtk-uart.txt b/Documentation/devicetree/bindings/serial/mtk-uart.txt
index c6b5262eb352..6fdffb735fb9 100644
--- a/Documentation/devicetree/bindings/serial/mtk-uart.txt
+++ b/Documentation/devicetree/bindings/serial/mtk-uart.txt
@@ -23,7 +23,12 @@ Required properties:
 
 - reg: The base address of the UART register bank.
 
-- interrupts: A single interrupt specifier.
+- interrupts:
+  index 0: an interrupt specifier for the UART controller itself
+  index 1: optional, an interrupt specifier with edge sensitivity on Rx pin to
+           support Rx in-band wake up. If one would like to use this feature,
+           one must create an addtional pinctrl to reconfigure Rx pin to normal
+           GPIO before suspend.
 
 - clocks : Must contain an entry for each entry in clock-names.
   See ../clocks/clock-bindings.txt for details.
@@ -39,7 +44,11 @@ Example:
 	uart0: serial@11006000 {
 		compatible = "mediatek,mt6589-uart", "mediatek,mt6577-uart";
 		reg = <0x11006000 0x400>;
-		interrupts = <GIC_SPI 51 IRQ_TYPE_LEVEL_LOW>;
+		interrupts = <GIC_SPI 51 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 52 IRQ_TYPE_EDGE_FALLING>;
 		clocks = <&uart_clk>, <&bus_clk>;
 		clock-names = "baud", "bus";
+		pinctrl-names = "default", "sleep";
+		pinctrl-0 = <&uart_pin>;
+		pinctrl-1 = <&uart_pin_sleep>;
 	};
diff --git a/Documentation/devicetree/bindings/serial/omap_serial.txt b/Documentation/devicetree/bindings/serial/omap_serial.txt
index 0a9b5444f4e6..dcba86b0a0d0 100644
--- a/Documentation/devicetree/bindings/serial/omap_serial.txt
+++ b/Documentation/devicetree/bindings/serial/omap_serial.txt
@@ -1,6 +1,7 @@
 OMAP UART controller
 
 Required properties:
+- compatible : should be "ti,j721e-uart", "ti,am654-uart" for J721E controllers
 - compatible : should be "ti,am654-uart" for AM654 controllers
 - compatible : should be "ti,omap2-uart" for OMAP2 controllers
 - compatible : should be "ti,omap3-uart" for OMAP3 controllers
diff --git a/Documentation/devicetree/bindings/serial/st,stm32-usart.txt b/Documentation/devicetree/bindings/serial/st,stm32-usart.txt
index 9d3efed55deb..a6b19485c9dc 100644
--- a/Documentation/devicetree/bindings/serial/st,stm32-usart.txt
+++ b/Documentation/devicetree/bindings/serial/st,stm32-usart.txt
@@ -13,6 +13,7 @@ Required properties:
 - clocks: The input clock of the USART instance
 
 Optional properties:
+- resets: Must contain the phandle to the reset controller.
 - pinctrl: The reference on the pins configuration
 - st,hw-flow-ctrl: bool flag to enable hardware flow control.
 - rs485-rts-delay, rs485-rx-during-tx, rs485-rts-active-low,
diff --git a/Documentation/devicetree/bindings/sifive/sifive-blocks-ip-versioning.txt b/Documentation/devicetree/bindings/sifive/sifive-blocks-ip-versioning.txt
new file mode 100644
index 000000000000..beaa3b64084e
--- /dev/null
+++ b/Documentation/devicetree/bindings/sifive/sifive-blocks-ip-versioning.txt
@@ -0,0 +1,38 @@
+DT compatible string versioning for SiFive open-source IP blocks
+
+This document describes the version specification for DT "compatible"
+strings for open-source SiFive IP blocks.  HDL for these IP blocks
+can be found in this public repository:
+
+https://github.com/sifive/sifive-blocks
+
+IP block-specific DT compatible strings are contained within the HDL,
+in the form "sifive,<ip-block-name><integer version number>".
+
+An example is "sifive,uart0" from:
+
+https://github.com/sifive/sifive-blocks/blob/v1.0/src/main/scala/devices/uart/UART.scala#L43
+
+Until these IP blocks (or IP integration) support version
+auto-discovery, the maintainers of these IP blocks intend to increment
+the suffixed number in the compatible string whenever the software
+interface to these IP blocks changes, or when the functionality of the
+underlying IP blocks changes in a way that software should be aware of.
+
+Driver developers can use compatible string "match" values such as
+"sifive,uart0" to indicate that their driver is compatible with the
+register interface and functionality associated with the relevant
+upstream sifive-blocks commits.  It is expected that most drivers will
+match on these IP block-specific compatible strings.
+
+DT data authors, when writing data for a particular SoC, should
+continue to specify an SoC-specific compatible string value, such as
+"sifive,fu540-c000-uart".  This way, if SoC-specific
+integration-specific bug fixes or workarounds are needed, the kernel
+or other system software can match on this string to apply them.  The
+IP block-specific compatible string (such as "sifive,uart0") should
+then be specified as a subsequent value.
+
+An example of this style:
+
+    compatible = "sifive,fu540-c000-uart", "sifive,uart0";
diff --git a/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt b/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt
index 436d2106e80d..e876f3ce54f6 100644
--- a/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt
+++ b/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt
@@ -2,8 +2,8 @@ Amlogic Canvas
 ================================
 
 A canvas is a collection of metadata that describes a pixel buffer.
-Those metadata include: width, height, phyaddr, wrapping, block mode
-and endianness.
+Those metadata include: width, height, phyaddr, wrapping and block mode.
+Starting with GXBB the endianness can also be described.
 
 Many IPs within Amlogic SoCs rely on canvas indexes to read/write pixel data
 rather than use the phy addresses directly. For instance, this is the case for
@@ -18,7 +18,11 @@ Video Lookup Table
 --------------------------
 
 Required properties:
-- compatible: "amlogic,canvas"
+- compatible: has to be one of:
+		- "amlogic,meson8-canvas", "amlogic,canvas" on Meson8
+		- "amlogic,meson8b-canvas", "amlogic,canvas" on Meson8b
+		- "amlogic,meson8m2-canvas", "amlogic,canvas" on Meson8m2
+		- "amlogic,canvas" on GXBB and newer
 - reg: Base physical address and size of the canvas registers.
 
 Example:
diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt
new file mode 100644
index 000000000000..954ffee0a9c4
--- /dev/null
+++ b/Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt
@@ -0,0 +1,81 @@
+Qualcomm Always-On Subsystem side channel binding
+
+This binding describes the hardware component responsible for side channel
+requests to the always-on subsystem (AOSS), used for certain power management
+requests that is not handled by the standard RPMh interface. Each client in the
+SoC has it's own block of message RAM and IRQ for communication with the AOSS.
+The protocol used to communicate in the message RAM is known as Qualcomm
+Messaging Protocol (QMP)
+
+The AOSS side channel exposes control over a set of resources, used to control
+a set of debug related clocks and to affect the low power state of resources
+related to the secondary subsystems. These resources are exposed as a set of
+power-domains.
+
+- compatible:
+	Usage: required
+	Value type: <string>
+	Definition: must be "qcom,sdm845-aoss-qmp"
+
+- reg:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: the base address and size of the message RAM for this
+		    client's communication with the AOSS
+
+- interrupts:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: should specify the AOSS message IRQ for this client
+
+- mboxes:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: reference to the mailbox representing the outgoing doorbell
+		    in APCS for this client, as described in mailbox/mailbox.txt
+
+- #clock-cells:
+	Usage: optional
+	Value type: <u32>
+	Definition: must be 0
+		    The single clock represents the QDSS clock.
+
+- #power-domain-cells:
+	Usage: optional
+	Value type: <u32>
+	Definition: must be 1
+		    The provided power-domains are:
+		    CDSP state (0), LPASS state (1), modem state (2), SLPI
+		    state (3), SPSS state (4) and Venus state (5).
+
+= SUBNODES
+The AOSS side channel also provides the controls for three cooling devices,
+these are expressed as subnodes of the QMP node. The name of the node is used
+to identify the resource and must therefor be "cx", "mx" or "ebi".
+
+- #cooling-cells:
+	Usage: optional
+	Value type: <u32>
+	Definition: must be 2
+
+= EXAMPLE
+
+The following example represents the AOSS side-channel message RAM and the
+mechanism exposing the power-domains, as found in SDM845.
+
+  aoss_qmp: qmp@c300000 {
+	  compatible = "qcom,sdm845-aoss-qmp";
+	  reg = <0x0c300000 0x100000>;
+	  interrupts = <GIC_SPI 389 IRQ_TYPE_EDGE_RISING>;
+	  mboxes = <&apss_shared 0>;
+
+	  #power-domain-cells = <1>;
+
+	  cx_cdev: cx {
+		#cooling-cells = <2>;
+	  };
+
+	  mx_cdev: mx {
+		#cooling-cells = <2>;
+	  };
+  };
diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,apr.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,apr.txt
index bcc612cc7423..db501269f47b 100644
--- a/Documentation/devicetree/bindings/soc/qcom/qcom,apr.txt
+++ b/Documentation/devicetree/bindings/soc/qcom/qcom,apr.txt
@@ -9,7 +9,7 @@ used for audio/voice services on the QDSP.
 	Value type: <stringlist>
 	Definition: must be "qcom,apr-v<VERSION-NUMBER>", example "qcom,apr-v2"
 
-- reg
+- qcom,apr-domain
 	Usage: required
 	Value type: <u32>
 	Definition: Destination processor ID.
@@ -49,9 +49,9 @@ by the individual bindings for the specific service
 The following example represents a QDSP based sound card on a MSM8996 device
 which uses apr as communication between Apps and QDSP.
 
-	apr@4 {
+	apr {
 		compatible = "qcom,apr-v2";
-		reg = <APR_DOMAIN_ADSP>;
+		qcom,apr-domain = <APR_DOMAIN_ADSP>;
 
 		q6core@3 {
 			compatible = "qcom,q6core";
diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt
index cf759e5f9b10..1214192847ac 100644
--- a/Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt
+++ b/Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt
@@ -21,6 +21,11 @@ edge.
 	Definition: should specify the IRQ used by the remote processor to
 		    signal this processor about communication related events
 
+- qcom,remote-pid:
+	Usage: required for glink-smem
+	Value type: <u32>
+	Definition: specifies the identifier of the remote endpoint of this edge
+
 - qcom,rpm-msg-ram:
 	Usage: required for glink-rpm
 	Value type: <prop-encoded-array>
diff --git a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml
new file mode 100644
index 000000000000..eb3992138eec
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: (GPL-2.0+ OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/sound/allwinner,sun4i-a10-i2s.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 I2S Controller Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  "#sound-dai-cells":
+    const: 0
+
+  compatible:
+    oneOf:
+      - const: allwinner,sun4i-a10-i2s
+      - const: allwinner,sun6i-a31-i2s
+      - const: allwinner,sun8i-a83t-i2s
+      - const: allwinner,sun8i-h3-i2s
+      - const: allwinner,sun50i-a64-codec-i2s
+      - items:
+          - const: allwinner,sun50i-a64-i2s
+          - const: allwinner,sun8i-h3-i2s
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    items:
+      - description: Bus Clock
+      - description: Module Clock
+
+  clock-names:
+    items:
+      - const: apb
+      - const: mod
+
+  # Even though it only applies to subschemas under the conditionals,
+  # not listing them here will trigger a warning because of the
+  # additionalsProperties set to false.
+  dmas: true
+  dma-names: true
+  resets:
+    maxItems: 1
+
+allOf:
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - allwinner,sun6i-a31-i2s
+              - allwinner,sun8i-a83t-i2s
+              - allwinner,sun8i-h3-i2s
+              - allwinner,sun50i-a64-codec-i2s
+
+    then:
+      required:
+        - resets
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: allwinner,sun8i-a83t-i2s
+
+    then:
+      properties:
+        dmas:
+          minItems: 1
+          maxItems: 2
+          items:
+            - description: RX DMA Channel
+            - description: TX DMA Channel
+          description:
+            Some controllers cannot receive but can only transmit
+            data. In such a case, the RX DMA channel is to be omitted.
+
+        dma-names:
+          oneOf:
+            - items:
+                - const: rx
+                - const: tx
+            - const: tx
+          description:
+            Some controllers cannot receive but can only transmit
+            data. In such a case, the RX name is to be omitted.
+
+    else:
+      properties:
+        dmas:
+          items:
+            - description: RX DMA Channel
+            - description: TX DMA Channel
+
+        dma-names:
+          items:
+            - const: rx
+            - const: tx
+
+required:
+  - "#sound-dai-cells"
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - clock-names
+  - dmas
+  - dma-names
+
+additionalProperties: false
+
+examples:
+  - |
+    i2s0: i2s@1c22400 {
+        #sound-dai-cells = <0>;
+        compatible = "allwinner,sun4i-a10-i2s";
+        reg = <0x01c22400 0x400>;
+        interrupts = <0 16 4>;
+        clocks = <&apb0_gates 3>, <&i2s0_clk>;
+        clock-names = "apb", "mod";
+        dmas = <&dma 0 3>, <&dma 0 3>;
+        dma-names = "rx", "tx";
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml
new file mode 100644
index 000000000000..e0284d8c3b63
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/sound/allwinner,sun4i-a10-spdif.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 S/PDIF Controller Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Liam Girdwood <lgirdwood@gmail.com>
+  - Mark Brown <broonie@kernel.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  "#sound-dai-cells":
+    const: 0
+
+  compatible:
+    oneOf:
+      - const: allwinner,sun4i-a10-spdif
+      - const: allwinner,sun6i-a31-spdif
+      - const: allwinner,sun8i-h3-spdif
+      - const: allwinner,sun50i-h6-spdif
+      - items:
+          - const: allwinner,sun8i-a83t-spdif
+          - const: allwinner,sun8i-h3-spdif
+      - items:
+          - const: allwinner,sun50i-a64-spdif
+          - const: allwinner,sun8i-h3-spdif
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    items:
+      - description: Bus Clock
+      - description: Module Clock
+
+  clock-names:
+    items:
+      - const: apb
+      - const: spdif
+
+  # Even though it only applies to subschemas under the conditionals,
+  # not listing them here will trigger a warning because of the
+  # additionalsProperties set to false.
+  dmas: true
+  dma-names: true
+  resets:
+    maxItems: 1
+
+allOf:
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - allwinner,sun6i-a31-spdif
+              - allwinner,sun8i-h3-spdif
+
+    then:
+      required:
+        - resets
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: allwinner,sun8i-h3-spdif
+
+    then:
+      properties:
+        dmas:
+          description: TX DMA Channel
+
+        dma-names:
+          const: tx
+
+    else:
+      properties:
+        dmas:
+          items:
+            - description: RX DMA Channel
+            - description: TX DMA Channel
+
+        dma-names:
+          items:
+            - const: rx
+            - const: tx
+
+required:
+  - "#sound-dai-cells"
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - clock-names
+  - dmas
+  - dma-names
+
+additionalProperties: false
+
+examples:
+  - |
+    spdif: spdif@1c21000 {
+        #sound-dai-cells = <0>;
+        compatible = "allwinner,sun4i-a10-spdif";
+        reg = <0x01c21000 0x40>;
+        interrupts = <13>;
+        clocks = <&apb0_gates 1>, <&spdif_clk>;
+        clock-names = "apb", "spdif";
+        dmas = <&dma 0 2>, <&dma 0 2>;
+        dma-names = "rx", "tx";
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/sound/amlogic,axg-tdm-formatters.txt b/Documentation/devicetree/bindings/sound/amlogic,axg-tdm-formatters.txt
index 3b94a715a0b9..8835a43edfbb 100644
--- a/Documentation/devicetree/bindings/sound/amlogic,axg-tdm-formatters.txt
+++ b/Documentation/devicetree/bindings/sound/amlogic,axg-tdm-formatters.txt
@@ -15,11 +15,15 @@ Required properties:
   * "lrclk"    : sample clock
   * "lrclk_sel": sample clock input multiplexer
 
-Example of TDMOUT_A on the A113 SoC:
+Optional property:
+- resets: phandle to the dedicated reset line of the tdm formatter.
+
+Example of TDMOUT_A on the S905X2 SoC:
 
 tdmout_a: audio-controller@500 {
 	compatible = "amlogic,axg-tdmout";
 	reg = <0x0 0x500 0x0 0x40>;
+	resets = <&clkc_audio AUD_RESET_TDMOUT_A>;
 	clocks = <&clkc_audio AUD_CLKID_TDMOUT_A>,
 		 <&clkc_audio AUD_CLKID_TDMOUT_A_SCLK>,
 		 <&clkc_audio AUD_CLKID_TDMOUT_A_SCLK_SEL>,
diff --git a/Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt b/Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt
new file mode 100644
index 000000000000..aa6c35570d31
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt
@@ -0,0 +1,55 @@
+* Amlogic HDMI Tx control glue
+
+Required properties:
+- compatible: "amlogic,g12a-tohdmitx"
+- reg: physical base address of the controller and length of memory
+       mapped region.
+- #sound-dai-cells: should be 1.
+
+Example on the S905X2 SoC:
+
+tohdmitx: audio-controller@744 {
+	compatible = "amlogic,g12a-tohdmitx";
+	reg = <0x0 0x744 0x0 0x4>;
+	#sound-dai-cells = <1>;
+};
+
+Example of an 'amlogic,axg-sound-card':
+
+sound {
+	compatible = "amlogic,axg-sound-card";
+
+[...]
+
+	dai-link-x {
+		sound-dai = <&tdmif_a>;
+		dai-format = "i2s";
+		dai-tdm-slot-tx-mask-0 = <1 1>;
+
+		codec-0 {
+			sound-dai = <&tohdmitx TOHDMITX_I2S_IN_A>;
+		};
+
+		codec-1 {
+			sound-dai = <&external_dac>;
+		};
+	};
+
+	dai-link-y {
+		sound-dai = <&tdmif_c>;
+		dai-format = "i2s";
+		dai-tdm-slot-tx-mask-0 = <1 1>;
+
+		codec {
+			sound-dai = <&tohdmitx TOHDMITX_I2S_IN_C>;
+		};
+	};
+
+	dai-link-z {
+		sound-dai = <&tohdmitx TOHDMITX_I2S_OUT>;
+
+		codec {
+			sound-dai = <&hdmi_tx>;
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/sound/cs42l73.txt b/Documentation/devicetree/bindings/sound/cs42l73.txt
index 80ae910dbf6c..47b868b5ab01 100644
--- a/Documentation/devicetree/bindings/sound/cs42l73.txt
+++ b/Documentation/devicetree/bindings/sound/cs42l73.txt
@@ -19,4 +19,4 @@ codec: cs42l73@4a {
 	reg = <0x4a>;
 	reset_gpio = <&gpio 10 0>;
 	chgfreq = <0x05>;
-};
-\ No newline at end of file
+};
diff --git a/Documentation/devicetree/bindings/sound/cs42xx8.txt b/Documentation/devicetree/bindings/sound/cs42xx8.txt
index 8619a156d038..bbfe39347c20 100644
--- a/Documentation/devicetree/bindings/sound/cs42xx8.txt
+++ b/Documentation/devicetree/bindings/sound/cs42xx8.txt
@@ -14,6 +14,11 @@ Required properties:
   - VA-supply, VD-supply, VLS-supply, VLC-supply: power supplies for the device,
     as covered in Documentation/devicetree/bindings/regulator/regulator.txt
 
+Optional properties:
+
+  - reset-gpios : a GPIO spec to define which pin is connected to the chip's
+    !RESET pin
+
 Example:
 
 cs42888: codec@48 {
@@ -25,4 +30,5 @@ cs42888: codec@48 {
 	VD-supply = <&reg_audio>;
 	VLS-supply = <&reg_audio>;
 	VLC-supply = <&reg_audio>;
+	reset-gpios = <&pca9557_b 1 GPIO_ACTIVE_LOW>;
 };
diff --git a/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt b/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt
index a58f79f5345c..c483dcec01f8 100644
--- a/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt
+++ b/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt
@@ -44,6 +44,9 @@ Optional properties:
   		 please refer to pinctrl-bindings.txt
 - fck_parent : Should contain a valid clock name which will be used as parent
 	       for the McASP fck
+- auxclk-fs-ratio: When McASP is bus master indicates the ratio between AUCLK
+		   and FS rate if applicable:
+		   AUCLK rate = auxclk-fs-ratio * FS rate
 
 Optional GPIO support:
 If any McASP pin need to be used as GPIO then the McASP node must have:
diff --git a/Documentation/devicetree/bindings/sound/madera.txt b/Documentation/devicetree/bindings/sound/madera.txt
new file mode 100644
index 000000000000..5e669ce552f4
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/madera.txt
@@ -0,0 +1,67 @@
+Cirrus Logic Madera class audio codecs
+
+This describes audio configuration bindings for these codecs.
+
+See also the core bindings for the parent MFD driver:
+See Documentation/devicetree/bindings/mfd/madera.txt
+
+and defines for values used in these bindings:
+include/dt-bindings/sound/madera.h
+
+These properties are all contained in the parent MFD node.
+
+Optional properties:
+  - cirrus,dmic-ref : Indicates how the MICBIAS pins have been externally
+    connected to DMICs on each input, one cell per input.
+    <IN1 IN2 IN3 ...>
+    A value of 0 indicates MICVDD and is the default, other values depend on the
+    codec:
+    For CS47L35 one of the CS47L35_DMIC_REF_xxx values
+    For all other codecs one of the MADERA_DMIC_REF_xxx values
+    Also see the datasheet for a description of the INn_DMIC_SUP field.
+
+  - cirrus,inmode : A list of input mode settings for each input. A maximum of
+    16 cells, with four cells per input in the order INnAL, INnAR INnBL INnBR.
+    For non-muxed inputs the first two cells for that input set the mode for
+    the left and right channel and the second two cells must be 0.
+    For muxed inputs the first two cells for that input set the mode of the
+    left and right A inputs and the second two cells set the mode of the left
+    and right B inputs.
+    Valid mode values are one of the MADERA_INMODE_xxx. If the array is shorter
+    than the number of inputs the unspecified inputs default to
+    MADERA_INMODE_DIFF.
+
+  - cirrus,out-mono : Mono bit for each output, maximum of six cells if the
+    array is shorter outputs will be set to stereo.
+
+  - cirrus,max-channels-clocked : Maximum number of channels that I2S clocks
+    will be generated for. Useful when clock master for systems where the I2S
+    bus has multiple data lines.
+    One cell for each AIF, use a value of zero for AIFs that should be handled
+    normally.
+
+  - cirrus,pdm-fmt : PDM speaker data format, must contain 2 cells
+    (OUT5 and OUT6). See the PDM_SPKn_FMT field in the datasheet for a
+    description of this value.
+    The second cell is ignored for codecs that do not have OUT6.
+
+  - cirrus,pdm-mute : PDM mute format, must contain 2 cells
+    (OUT5 and OUT6). See the PDM_SPKn_CTRL_1 register in the datasheet for a
+    description of this value.
+    The second cell is ignored for codecs that do not have OUT6.
+
+Example:
+
+cs47l35@0 {
+	compatible = "cirrus,cs47l35";
+
+	cirrus,dmic-ref = <0 0 CS47L35_DMIC_REF_MICBIAS1B 0>;
+	cirrus,inmode = <
+		MADERA_INMODE_DMIC MADERA_INMODE_DMIC /* IN1A digital */
+		MADERA_INMODE_SE   MADERA_INMODE_SE   /* IN1B single-ended */
+		MADERA_INMODE_DIFF MADERA_INMODE_DIFF /* IN2 differential */
+		0 0 	/* not used on this codec */
+	>;
+	cirrus,out-mono = <0 0 0 0 0 0>;
+	cirrus,max-channels-clocked = <2 0 0>;
+};
diff --git a/Documentation/devicetree/bindings/sound/max98357a.txt b/Documentation/devicetree/bindings/sound/max98357a.txt
index 28645a2ff885..4bce14ce806f 100644
--- a/Documentation/devicetree/bindings/sound/max98357a.txt
+++ b/Documentation/devicetree/bindings/sound/max98357a.txt
@@ -9,6 +9,10 @@ Optional properties:
 - sdmode-gpios : GPIO specifier for the chip's SD_MODE pin.
         If this option is not specified then driver does not manage
         the pin state (e.g. chip is always on).
+- sdmode-delay : specify delay time for SD_MODE pin.
+        If this option is specified, which means it's required i2s clocks
+        ready before SD_MODE is unmuted in order to avoid the speaker pop noise.
+        It's observed that 5ms is sufficient.
 
 Example:
 
diff --git a/Documentation/devicetree/bindings/sound/rt1011.txt b/Documentation/devicetree/bindings/sound/rt1011.txt
new file mode 100644
index 000000000000..35a23e60d679
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/rt1011.txt
@@ -0,0 +1,32 @@
+RT1011 Mono Class D Audio Amplifier
+
+This device supports I2C only.
+
+Required properties:
+
+- compatible : "realtek,rt1011".
+
+- reg : The I2C address of the device. This I2C address decide by
+        two input pins (ASEL1 and ASEL2).
+        -------------------------------------
+        |   ASEL2   |  ASEL1   |  Address   |
+        -------------------------------------
+        |     0     |    0     |   0x38     |
+        -------------------------------------
+        |     0     |    1     |   0x39     |
+        -------------------------------------
+        |     1     |    0     |   0x3a     |
+        -------------------------------------
+        |     1     |    1     |   0x3b     |
+        -------------------------------------
+
+Pins on the device (for linking into audio routes) for RT1011:
+
+  * SPO
+
+Example:
+
+rt1011: codec@38 {
+	compatible = "realtek,rt1011";
+	reg = <0x38>;
+};
diff --git a/Documentation/devicetree/bindings/sound/rt1308.txt b/Documentation/devicetree/bindings/sound/rt1308.txt
new file mode 100755
index 000000000000..2d46084afce4
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/rt1308.txt
@@ -0,0 +1,17 @@
+RT1308 audio Amplifier
+
+This device supports I2C only.
+
+Required properties:
+
+- compatible : "realtek,rt1308".
+
+- reg : The I2C address of the device.
+
+
+Example:
+
+rt1308: rt1308@10 {
+	compatible = "realtek,rt1308";
+	reg = <0x10>;
+};
diff --git a/Documentation/devicetree/bindings/sound/st,stm32-i2s.txt b/Documentation/devicetree/bindings/sound/st,stm32-i2s.txt
index 58c341300552..cbf24bcd1b8d 100644
--- a/Documentation/devicetree/bindings/sound/st,stm32-i2s.txt
+++ b/Documentation/devicetree/bindings/sound/st,stm32-i2s.txt
@@ -18,7 +18,7 @@ Required properties:
     See Documentation/devicetree/bindings/dma/stm32-dma.txt.
   - dma-names: Identifier for each DMA request line. Must be "tx" and "rx".
   - pinctrl-names: should contain only value "default"
-  - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt
+  - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml
 
 Optional properties:
   - resets: Reference to a reset controller asserting the reset controller
diff --git a/Documentation/devicetree/bindings/sound/st,stm32-sai.txt b/Documentation/devicetree/bindings/sound/st,stm32-sai.txt
index 3f4467ff0aa2..944743dd9212 100644
--- a/Documentation/devicetree/bindings/sound/st,stm32-sai.txt
+++ b/Documentation/devicetree/bindings/sound/st,stm32-sai.txt
@@ -41,7 +41,7 @@ SAI subnodes required properties:
 	"tx": if sai sub-block is configured as playback DAI
 	"rx": if sai sub-block is configured as capture DAI
   - pinctrl-names: should contain only value "default"
-  - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt
+  - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml
 
 SAI subnodes Optional properties:
   - st,sync: specify synchronization mode.
diff --git a/Documentation/devicetree/bindings/sound/sun4i-i2s.txt b/Documentation/devicetree/bindings/sound/sun4i-i2s.txt
deleted file mode 100644
index 61e71c1729e0..000000000000
--- a/Documentation/devicetree/bindings/sound/sun4i-i2s.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-* Allwinner A10 I2S controller
-
-The I2S bus (Inter-IC sound bus) is a serial link for digital
-audio data transfer between devices in the system.
-
-Required properties:
-
-- compatible: should be one of the following:
-   - "allwinner,sun4i-a10-i2s"
-   - "allwinner,sun6i-a31-i2s"
-   - "allwinner,sun8i-a83t-i2s"
-   - "allwinner,sun8i-h3-i2s"
-   - "allwinner,sun50i-a64-codec-i2s"
-- reg: physical base address of the controller and length of memory mapped
-  region.
-- interrupts: should contain the I2S interrupt.
-- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
-	Documentation/devicetree/bindings/dma/dma.txt
-- dma-names: should include "tx" and "rx".
-- clocks: a list of phandle + clock-specifer pairs, one for each entry in clock-names.
-- clock-names: should contain the following:
-   - "apb" : clock for the I2S bus interface
-   - "mod" : module clock for the I2S controller
-- #sound-dai-cells : Must be equal to 0
-
-Required properties for the following compatibles:
-	- "allwinner,sun6i-a31-i2s"
-	- "allwinner,sun8i-a83t-i2s"
-	- "allwinner,sun8i-h3-i2s"
-	- "allwinner,sun50i-a64-codec-i2s"
-- resets: phandle to the reset line for this codec
-
-Example:
-
-i2s0: i2s@1c22400 {
-	#sound-dai-cells = <0>;
-	compatible = "allwinner,sun4i-a10-i2s";
-	reg = <0x01c22400 0x400>;
-	interrupts = <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>;
-	clocks = <&apb0_gates 3>, <&i2s0_clk>;
-	clock-names = "apb", "mod";
-	dmas = <&dma SUN4I_DMA_NORMAL 3>,
-	       <&dma SUN4I_DMA_NORMAL 3>;
-	dma-names = "rx", "tx";
-};
diff --git a/Documentation/devicetree/bindings/sound/sunxi,sun4i-spdif.txt b/Documentation/devicetree/bindings/sound/sunxi,sun4i-spdif.txt
deleted file mode 100644
index 0c64a209c2e9..000000000000
--- a/Documentation/devicetree/bindings/sound/sunxi,sun4i-spdif.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-Allwinner Sony/Philips Digital Interface Format (S/PDIF) Controller
-
-The Allwinner S/PDIF audio block is a transceiver that allows the
-processor to receive and transmit digital audio via an coaxial cable or
-a fibre cable.
-For now only playback is supported.
-
-Required properties:
-
-  - compatible		: should be one of the following:
-    - "allwinner,sun4i-a10-spdif": for the Allwinner A10 SoC
-    - "allwinner,sun6i-a31-spdif": for the Allwinner A31 SoC
-    - "allwinner,sun8i-h3-spdif": for the Allwinner H3 SoC
-
-  - reg			: Offset and length of the register set for the device.
-
-  - interrupts		: Contains the spdif interrupt.
-
-  - dmas		: Generic dma devicetree binding as described in
-			  Documentation/devicetree/bindings/dma/dma.txt.
-
-  - dma-names		: Two dmas have to be defined, "tx" and "rx".
-
-  - clocks		: Contains an entry for each entry in clock-names.
-
-  - clock-names		: Includes the following entries:
-	"apb"		  clock for the spdif bus.
-	"spdif"		  clock for spdif controller.
-
-  - resets		: reset specifier for the ahb reset (A31 and newer only)
-
-Example:
-
-spdif: spdif@1c21000 {
-	compatible = "allwinner,sun4i-a10-spdif";
-	reg = <0x01c21000 0x40>;
-	interrupts = <13>;
-	clocks = <&apb0_gates 1>, <&spdif_clk>;
-	clock-names = "apb", "spdif";
-	dmas = <&dma 0 2>, <&dma 0 2>;
-	dma-names = "rx", "tx";
-};
diff --git a/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml b/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml
new file mode 100644
index 000000000000..6d1329c28170
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/allwinner,sun4i-a10-spi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 SPI Controller Device Tree Bindings
+
+allOf:
+  - $ref: "spi-controller.yaml"
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  "#address-cells": true
+  "#size-cells": true
+
+  compatible:
+    const: allwinner,sun4i-a10-spi
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    items:
+      - description: Bus Clock
+      - description: Module Clock
+
+  clock-names:
+    items:
+      - const: ahb
+      - const: mod
+
+  dmas:
+    items:
+      - description: RX DMA Channel
+      - description: TX DMA Channel
+
+  dma-names:
+    items:
+      - const: rx
+      - const: tx
+
+  num-cs: true
+
+patternProperties:
+  "^.*@[0-9a-f]+":
+    type: object
+    properties:
+      reg:
+        items:
+          minimum: 0
+          maximum: 4
+
+      spi-rx-bus-width:
+        const: 1
+
+      spi-tx-bus-width:
+        const: 1
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - clock-names
+
+additionalProperties: false
+
+examples:
+  - |
+    spi1: spi@1c06000 {
+        compatible = "allwinner,sun4i-a10-spi";
+        reg = <0x01c06000 0x1000>;
+        interrupts = <11>;
+        clocks = <&ahb_gates 21>, <&spi1_clk>;
+        clock-names = "ahb", "mod";
+        #address-cells = <1>;
+        #size-cells = <0>;
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml b/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml
new file mode 100644
index 000000000000..f36c46d236d7
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/allwinner,sun6i-a31-spi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A31 SPI Controller Device Tree Bindings
+
+allOf:
+  - $ref: "spi-controller.yaml"
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  "#address-cells": true
+  "#size-cells": true
+
+  compatible:
+    enum:
+      - allwinner,sun6i-a31-spi
+      - allwinner,sun8i-h3-spi
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    items:
+      - description: Bus Clock
+      - description: Module Clock
+
+  clock-names:
+    items:
+      - const: ahb
+      - const: mod
+
+  resets:
+    maxItems: 1
+
+  dmas:
+    items:
+      - description: RX DMA Channel
+      - description: TX DMA Channel
+
+  dma-names:
+    items:
+      - const: rx
+      - const: tx
+
+  num-cs: true
+
+patternProperties:
+  "^.*@[0-9a-f]+":
+    type: object
+    properties:
+      reg:
+        items:
+          minimum: 0
+          maximum: 4
+
+      spi-rx-bus-width:
+        const: 1
+
+      spi-tx-bus-width:
+        const: 1
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - clock-names
+
+additionalProperties: false
+
+examples:
+  - |
+    spi1: spi@1c69000 {
+        compatible = "allwinner,sun6i-a31-spi";
+        reg = <0x01c69000 0x1000>;
+        interrupts = <0 66 4>;
+        clocks = <&ahb1_gates 21>, <&spi1_clk>;
+        clock-names = "ahb", "mod";
+        resets = <&ahb1_rst 21>;
+        #address-cells = <1>;
+        #size-cells = <0>;
+    };
+
+  - |
+    spi0: spi@1c68000 {
+        compatible = "allwinner,sun8i-h3-spi";
+        reg = <0x01c68000 0x1000>;
+        interrupts = <0 65 4>;
+        clocks = <&ccu 30>, <&ccu 82>;
+        clock-names = "ahb", "mod";
+        dmas = <&dma 23>, <&dma 23>;
+        dma-names = "rx", "tx";
+        resets = <&ccu 15>;
+        #address-cells = <1>;
+        #size-cells = <0>;
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/spi/spi-bus.txt b/Documentation/devicetree/bindings/spi/spi-bus.txt
index 1f6e86f787ef..e07783505498 100644
--- a/Documentation/devicetree/bindings/spi/spi-bus.txt
+++ b/Documentation/devicetree/bindings/spi/spi-bus.txt
@@ -1,111 +1 @@
-SPI (Serial Peripheral Interface) busses
-
-SPI busses can be described with a node for the SPI controller device
-and a set of child nodes for each SPI slave on the bus.  The system's SPI
-controller may be described for use in SPI master mode or in SPI slave mode,
-but not for both at the same time.
-
-The SPI controller node requires the following properties:
-- compatible      - Name of SPI bus controller following generic names
-		    recommended practice.
-
-In master mode, the SPI controller node requires the following additional
-properties:
-- #address-cells  - number of cells required to define a chip select
-		address on the SPI bus.
-- #size-cells     - should be zero.
-
-In slave mode, the SPI controller node requires one additional property:
-- spi-slave       - Empty property.
-
-No other properties are required in the SPI bus node.  It is assumed
-that a driver for an SPI bus device will understand that it is an SPI bus.
-However, the binding does not attempt to define the specific method for
-assigning chip select numbers.  Since SPI chip select configuration is
-flexible and non-standardized, it is left out of this binding with the
-assumption that board specific platform code will be used to manage
-chip selects.  Individual drivers can define additional properties to
-support describing the chip select layout.
-
-Optional properties (master mode only):
-- cs-gpios	  - gpios chip select.
-- num-cs	  - total number of chipselects.
-
-If cs-gpios is used the number of chip selects will be increased automatically
-with max(cs-gpios > hw cs).
-
-So if for example the controller has 2 CS lines, and the cs-gpios
-property looks like this:
-
-cs-gpios = <&gpio1 0 0>, <0>, <&gpio1 1 0>, <&gpio1 2 0>;
-
-Then it should be configured so that num_chipselect = 4 with the
-following mapping:
-
-cs0 : &gpio1 0 0
-cs1 : native
-cs2 : &gpio1 1 0
-cs3 : &gpio1 2 0
-
-
-SPI slave nodes must be children of the SPI controller node.
-
-In master mode, one or more slave nodes (up to the number of chip selects) can
-be present.  Required properties are:
-- compatible      - Name of SPI device following generic names recommended
-		    practice.
-- reg             - Chip select address of device.
-- spi-max-frequency - Maximum SPI clocking speed of device in Hz.
-
-In slave mode, the (single) slave node is optional.
-If present, it must be called "slave".  Required properties are:
-- compatible      - Name of SPI device following generic names recommended
-		    practice.
-
-All slave nodes can contain the following optional properties:
-- spi-cpol        - Empty property indicating device requires inverse clock
-		    polarity (CPOL) mode.
-- spi-cpha        - Empty property indicating device requires shifted clock
-		    phase (CPHA) mode.
-- spi-cs-high     - Empty property indicating device requires chip select
-		    active high.
-- spi-3wire       - Empty property indicating device requires 3-wire mode.
-- spi-lsb-first   - Empty property indicating device requires LSB first mode.
-- spi-tx-bus-width - The bus width (number of data wires) that is used for MOSI.
-		    Defaults to 1 if not present.
-- spi-rx-bus-width - The bus width (number of data wires) that is used for MISO.
-		    Defaults to 1 if not present.
-- spi-rx-delay-us - Microsecond delay after a read transfer.
-- spi-tx-delay-us - Microsecond delay after a write transfer.
-
-Some SPI controllers and devices support Dual and Quad SPI transfer mode.
-It allows data in the SPI system to be transferred using 2 wires (DUAL) or 4
-wires (QUAD).
-Now the value that spi-tx-bus-width and spi-rx-bus-width can receive is
-only 1 (SINGLE), 2 (DUAL) and 4 (QUAD).
-Dual/Quad mode is not allowed when 3-wire mode is used.
-
-If a gpio chipselect is used for the SPI slave the gpio number will be passed
-via the SPI master node cs-gpios property.
-
-SPI example for an MPC5200 SPI bus:
-	spi@f00 {
-		#address-cells = <1>;
-		#size-cells = <0>;
-		compatible = "fsl,mpc5200b-spi","fsl,mpc5200-spi";
-		reg = <0xf00 0x20>;
-		interrupts = <2 13 0 2 14 0>;
-		interrupt-parent = <&mpc5200_pic>;
-
-		ethernet-switch@0 {
-			compatible = "micrel,ks8995m";
-			spi-max-frequency = <1000000>;
-			reg = <0>;
-		};
-
-		codec@1 {
-			compatible = "ti,tlv320aic26";
-			spi-max-frequency = <100000>;
-			reg = <1>;
-		};
-	};
+This file has moved to spi-controller.yaml.
diff --git a/Documentation/devicetree/bindings/spi/spi-controller.yaml b/Documentation/devicetree/bindings/spi/spi-controller.yaml
new file mode 100644
index 000000000000..876c0623f322
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/spi-controller.yaml
@@ -0,0 +1,161 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/spi-controller.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: SPI Controller Generic Binding
+
+maintainers:
+  - Mark Brown <broonie@kernel.org>
+
+description: |
+  SPI busses can be described with a node for the SPI controller device
+  and a set of child nodes for each SPI slave on the bus. The system SPI
+  controller may be described for use in SPI master mode or in SPI slave mode,
+  but not for both at the same time.
+
+properties:
+  $nodename:
+    pattern: "^spi(@.*|-[0-9a-f])*$"
+
+  "#address-cells":
+    const: 1
+
+  "#size-cells":
+    const: 0
+
+  cs-gpios:
+    description: |
+      GPIOs used as chip selects.
+      If that property is used, the number of chip selects will be
+      increased automatically with max(cs-gpios, hardware chip selects).
+
+      So if, for example, the controller has 2 CS lines, and the
+      cs-gpios looks like this
+        cs-gpios = <&gpio1 0 0>, <0>, <&gpio1 1 0>, <&gpio1 2 0>;
+
+      Then it should be configured so that num_chipselect = 4, with
+      the following mapping
+        cs0 : &gpio1 0 0
+        cs1 : native
+        cs2 : &gpio1 1 0
+        cs3 : &gpio1 2 0
+
+  num-cs:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      Total number of chip selects.
+
+  spi-slave:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      The SPI controller acts as a slave, instead of a master.
+
+patternProperties:
+  "^slave$":
+    type: object
+
+    properties:
+      compatible:
+        description:
+          Compatible of the SPI device.
+
+    required:
+      - compatible
+
+  "^.*@[0-9a-f]+$":
+    type: object
+
+    properties:
+      compatible:
+        description:
+          Compatible of the SPI device.
+
+      reg:
+        maxItems: 1
+        minimum: 0
+        maximum: 256
+        description:
+          Chip select used by the device.
+
+      spi-3wire:
+        $ref: /schemas/types.yaml#/definitions/flag
+        description:
+          The device requires 3-wire mode.
+
+      spi-cpha:
+        $ref: /schemas/types.yaml#/definitions/flag
+        description:
+          The device requires shifted clock phase (CPHA) mode.
+
+      spi-cpol:
+        $ref: /schemas/types.yaml#/definitions/flag
+        description:
+          The device requires inverse clock polarity (CPOL) mode.
+
+      spi-cs-high:
+        $ref: /schemas/types.yaml#/definitions/flag
+        description:
+          The device requires the chip select active high.
+
+      spi-lsb-first:
+        $ref: /schemas/types.yaml#/definitions/flag
+        description:
+          The device requires the LSB first mode.
+
+      spi-max-frequency:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        description:
+          Maximum SPI clocking speed of the device in Hz.
+
+      spi-rx-bus-width:
+        allOf:
+          - $ref: /schemas/types.yaml#/definitions/uint32
+          - enum: [ 1, 2, 4 ]
+          - default: 1
+        description:
+          Bus width to the SPI bus used for MISO.
+
+      spi-rx-delay-us:
+        description:
+          Delay, in microseconds, after a read transfer.
+
+      spi-tx-bus-width:
+        allOf:
+          - $ref: /schemas/types.yaml#/definitions/uint32
+          - enum: [ 1, 2, 4 ]
+          - default: 1
+        description:
+          Bus width to the SPI bus used for MOSI.
+
+      spi-tx-delay-us:
+        description:
+          Delay, in microseconds, after a write transfer.
+
+    required:
+      - compatible
+      - reg
+
+examples:
+  - |
+    spi@f00 {
+        #address-cells = <1>;
+        #size-cells = <0>;
+        compatible = "fsl,mpc5200b-spi","fsl,mpc5200-spi";
+        reg = <0xf00 0x20>;
+        interrupts = <2 13 0 2 14 0>;
+        interrupt-parent = <&mpc5200_pic>;
+
+        ethernet-switch@0 {
+            compatible = "micrel,ks8995m";
+            spi-max-frequency = <1000000>;
+            reg = <0>;
+        };
+
+        codec@1 {
+            compatible = "ti,tlv320aic26";
+            spi-max-frequency = <100000>;
+            reg = <1>;
+        };
+    };
diff --git a/Documentation/devicetree/bindings/spi/spi-gpio.txt b/Documentation/devicetree/bindings/spi/spi-gpio.txt
deleted file mode 100644
index 52db562f17a4..000000000000
--- a/Documentation/devicetree/bindings/spi/spi-gpio.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-SPI-GPIO devicetree bindings
-
-This represents a group of 3-n GPIO lines used for bit-banged SPI on dedicated
-GPIO lines.
-
-Required properties:
-
- - compatible: should be set to "spi-gpio"
- - #address-cells: should be set to <0x1>
- - ranges
- - sck-gpios: GPIO spec for the SCK line to use
- - miso-gpios: GPIO spec for the MISO line to use
- - mosi-gpios: GPIO spec for the MOSI line to use
- - cs-gpios: GPIOs to use for chipselect lines.
-             Not needed if num-chipselects = <0>.
- - num-chipselects: Number of chipselect lines. Should be <0> if a single device
-                    with no chip select is connected.
-
-Deprecated bindings:
-
-These legacy GPIO line bindings can alternatively be used to define the
-GPIO lines used, they should not be used in new device trees.
-
- - gpio-sck: GPIO spec for the SCK line to use
- - gpio-miso: GPIO spec for the MISO line to use
- - gpio-mosi: GPIO spec for the MOSI line to use
-
-Example:
-
-	spi {
-		compatible = "spi-gpio";
-		#address-cells = <0x1>;
-		ranges;
-
-		sck-gpios = <&gpio 95 0>;
-		miso-gpios = <&gpio 98 0>;
-		mosi-gpios = <&gpio 97 0>;
-		cs-gpios = <&gpio 125 0>;
-		num-chipselects = <1>;
-
-		/* clients */
-	};
-
diff --git a/Documentation/devicetree/bindings/spi/spi-gpio.yaml b/Documentation/devicetree/bindings/spi/spi-gpio.yaml
new file mode 100644
index 000000000000..55c4f1705f07
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/spi-gpio.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/spi-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: SPI-GPIO devicetree bindings
+
+maintainers:
+  - Rob Herring <robh@kernel.org>
+
+description:
+  This represents a group of 3-n GPIO lines used for bit-banged SPI on
+  dedicated GPIO lines.
+
+allOf:
+  - $ref: "/schemas/spi/spi-controller.yaml#"
+
+properties:
+  compatible:
+    const: spi-gpio
+
+  sck-gpios:
+    description: GPIO spec for the SCK line to use
+    maxItems: 1
+
+  miso-gpios:
+    description: GPIO spec for the MISO line to use
+    maxItems: 1
+
+  mosi-gpios:
+    description: GPIO spec for the MOSI line to use
+    maxItems: 1
+
+  cs-gpios:
+    description: GPIOs to use for chipselect lines.
+      Not needed if num-chipselects = <0>.
+    minItems: 1
+    maxItems: 1024
+
+  num-chipselects:
+    description: Number of chipselect lines. Should be <0> if a single device
+      with no chip select is connected.
+    $ref: "/schemas/types.yaml#/definitions/uint32"
+
+  # Deprecated properties
+  gpio-sck: false
+  gpio-miso: false
+  gpio-mosi: false
+
+required:
+  - compatible
+  - num-chipselects
+  - sck-gpios
+
+examples:
+  - |
+    spi {
+      compatible = "spi-gpio";
+      #address-cells = <0x1>;
+      #size-cells = <0x0>;
+
+      sck-gpios = <&gpio 95 0>;
+      miso-gpios = <&gpio 98 0>;
+      mosi-gpios = <&gpio 97 0>;
+      cs-gpios = <&gpio 125 0>;
+      num-chipselects = <1>;
+
+      /* clients */
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/spi/spi-pl022.yaml b/Documentation/devicetree/bindings/spi/spi-pl022.yaml
new file mode 100644
index 000000000000..dfb697c69341
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/spi-pl022.yaml
@@ -0,0 +1,165 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/spi-pl022.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM PL022 SPI controller
+
+maintainers:
+  - Linus Walleij <linus.walleij@linaro.org>
+
+allOf:
+  - $ref: "spi-controller.yaml#"
+
+# We need a select here so we don't match all nodes with 'arm,primecell'
+select:
+  properties:
+    compatible:
+      contains:
+        const: arm,pl022
+  required:
+    - compatible
+
+properties:
+  compatible:
+    items:
+      - const: arm,pl022
+      - const: arm,primecell
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    maxItems: 2
+
+  clock-names:
+    items:
+      - enum:
+          - SSPCLK
+          - sspclk
+      - const: apb_pclk
+
+  pl022,autosuspend-delay:
+    description: delay in ms following transfer completion before the
+      runtime power management system suspends the device. A setting of 0
+      indicates no delay and the device will be suspended immediately.
+    $ref: "/schemas/types.yaml#/definitions/uint32"
+
+  pl022,rt:
+    description: indicates the controller should run the message pump with realtime
+               priority to minimise the transfer latency on the bus (boolean)
+    type: boolean
+
+  dmas:
+    description:
+      Two or more DMA channel specifiers following the convention outlined
+      in bindings/dma/dma.txt
+    minItems: 2
+    maxItems: 32
+
+  dma-names:
+    description:
+      There must be at least one channel named "tx" for transmit and named "rx"
+      for receive.
+    minItems: 2
+    maxItems: 32
+    additionalItems: true
+    items:
+      - const: rx
+      - const: tx
+
+patternProperties:
+  "^[a-zA-Z][a-zA-Z0-9,+\\-._]{0,63}@[0-9a-f]+$":
+    type: object
+    # SPI slave nodes must be children of the SPI master node and can
+    # contain the following properties.
+    properties:
+      pl022,interface:
+        description: SPI interface type
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - enum:
+              - 0  # SPI
+              - 1  # Texas Instruments Synchronous Serial Frame Format
+              - 2  # Microwire (Half Duplex)
+
+      pl022,com-mode:
+        description: Specifies the transfer mode
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - enum:
+              - 0  # interrupt mode
+              - 1  # polling mode
+              - 2  # DMA mode
+            default: 1
+
+      pl022,rx-level-trig:
+        description: Rx FIFO watermark level
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - minimum: 0
+            maximum: 4
+
+      pl022,tx-level-trig:
+        description: Tx FIFO watermark level
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - minimum: 0
+            maximum: 4
+
+      pl022,ctrl-len:
+        description: Microwire interface - Control length
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - minimum: 0x03
+            maximum: 0x1f
+
+      pl022,wait-state:
+        description: Microwire interface - Wait state
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - enum: [ 0, 1 ]
+
+      pl022,duplex:
+        description: Microwire interface - Full/Half duplex
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - enum: [ 0, 1 ]
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+examples:
+  - |
+    spi@e0100000 {
+      compatible = "arm,pl022", "arm,primecell";
+      reg = <0xe0100000 0x1000>;
+      #address-cells = <1>;
+      #size-cells = <0>;
+      interrupts = <0 31 0x4>;
+      dmas = <&dma_controller 23 1>,
+        <&dma_controller 24 0>;
+      dma-names = "rx", "tx";
+
+      m25p80@1 {
+        compatible = "st,m25p80";
+        reg = <1>;
+        spi-max-frequency = <12000000>;
+        spi-cpol;
+        spi-cpha;
+        pl022,interface = <0>;
+        pl022,com-mode = <0x2>;
+        pl022,rx-level-trig = <0>;
+        pl022,tx-level-trig = <0>;
+        pl022,ctrl-len = <0x11>;
+        pl022,wait-state = <0>;
+        pl022,duplex = <0>;
+      };
+    };
+...
diff --git a/Documentation/devicetree/bindings/spi/spi-stm32-qspi.txt b/Documentation/devicetree/bindings/spi/spi-stm32-qspi.txt
index adeeb63e84b9..bfc038b9478d 100644
--- a/Documentation/devicetree/bindings/spi/spi-stm32-qspi.txt
+++ b/Documentation/devicetree/bindings/spi/spi-stm32-qspi.txt
@@ -19,8 +19,11 @@ Required properties:
 - reg: chip-Select number (QSPI controller may connect 2 flashes)
 - spi-max-frequency: max frequency of spi bus
 
-Optional property:
+Optional properties:
 - spi-rx-bus-width: see ./spi-bus.txt for the description
+- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
+Documentation/devicetree/bindings/dma/dma.txt.
+- dma-names: DMA request names should include "tx" and "rx" if present.
 
 Example:
 
diff --git a/Documentation/devicetree/bindings/spi/spi-sun4i.txt b/Documentation/devicetree/bindings/spi/spi-sun4i.txt
deleted file mode 100644
index c75d604a8290..000000000000
--- a/Documentation/devicetree/bindings/spi/spi-sun4i.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-Allwinner A10 SPI controller
-
-Required properties:
-- compatible: Should be "allwinner,sun4-a10-spi".
-- reg: Should contain register location and length.
-- interrupts: Should contain interrupt.
-- clocks: phandle to the clocks feeding the SPI controller. Two are
-          needed:
-  - "ahb": the gated AHB parent clock
-  - "mod": the parent module clock
-- clock-names: Must contain the clock names described just above
-
-Example:
-
-spi1: spi@1c06000 {
-	compatible = "allwinner,sun4i-a10-spi";
-	reg = <0x01c06000 0x1000>;
-	interrupts = <11>;
-	clocks = <&ahb_gates 21>, <&spi1_clk>;
-	clock-names = "ahb", "mod";
-	#address-cells = <1>;
-	#size-cells = <0>;
-};
diff --git a/Documentation/devicetree/bindings/spi/spi-sun6i.txt b/Documentation/devicetree/bindings/spi/spi-sun6i.txt
deleted file mode 100644
index 435a8e0731ac..000000000000
--- a/Documentation/devicetree/bindings/spi/spi-sun6i.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-Allwinner A31/H3 SPI controller
-
-Required properties:
-- compatible: Should be "allwinner,sun6i-a31-spi" or "allwinner,sun8i-h3-spi".
-- reg: Should contain register location and length.
-- interrupts: Should contain interrupt.
-- clocks: phandle to the clocks feeding the SPI controller. Two are
-          needed:
-  - "ahb": the gated AHB parent clock
-  - "mod": the parent module clock
-- clock-names: Must contain the clock names described just above
-- resets: phandle to the reset controller asserting this device in
-          reset
-
-Optional properties:
-- dmas: DMA specifiers for rx and tx dma. See the DMA client binding,
-	Documentation/devicetree/bindings/dma/dma.txt
-- dma-names: DMA request names should include "rx" and "tx" if present.
-
-Example:
-
-spi1: spi@1c69000 {
-	compatible = "allwinner,sun6i-a31-spi";
-	reg = <0x01c69000 0x1000>;
-	interrupts = <0 66 4>;
-	clocks = <&ahb1_gates 21>, <&spi1_clk>;
-	clock-names = "ahb", "mod";
-	resets = <&ahb1_rst 21>;
-};
-
-spi0: spi@1c68000 {
-	compatible = "allwinner,sun8i-h3-spi";
-	reg = <0x01c68000 0x1000>;
-	interrupts = <GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>;
-	clocks = <&ccu CLK_BUS_SPI0>, <&ccu CLK_SPI0>;
-	clock-names = "ahb", "mod";
-	dmas = <&dma 23>, <&dma 23>;
-	dma-names = "rx", "tx";
-	pinctrl-names = "default";
-	pinctrl-0 = <&spi0_pins>;
-	resets = <&ccu RST_BUS_SPI0>;
-	#address-cells = <1>;
-	#size-cells = <0>;
-};
diff --git a/Documentation/devicetree/bindings/spi/spi-synquacer.txt b/Documentation/devicetree/bindings/spi/spi-synquacer.txt
new file mode 100644
index 000000000000..291dfa692d0a
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/spi-synquacer.txt
@@ -0,0 +1,27 @@
+* Socionext Synquacer HS-SPI bindings
+
+Required Properties:
+- compatible: should be "socionext,synquacer-spi"
+- reg: physical base address of the controller and length of memory mapped
+       region.
+- interrupts: should contain the "spi_rx", "spi_tx" and "spi_fault" interrupts.
+- clocks: core clock iHCLK. Optional rate clock iPCLK (default is iHCLK)
+- clock-names: Shall be "iHCLK" and "iPCLK" respectively
+
+Optional Properties:
+- socionext,use-rtm: boolean, if required to use "retimed clock" for RX
+- socionext,set-aces: boolean, if same active clock edges field to be set.
+
+Example:
+
+	spi0: spi@ff110000 {
+		compatible = "socionext,synquacer-spi";
+		reg = <0xff110000 0x1000>;
+		interrupts = <GIC_SPI 160 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 161 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>;
+		clocks = <&clk_hsspi>;
+		clock-names = "iHCLK";
+		socionext,use-rtm;
+		socionext,set-aces;
+	};
diff --git a/Documentation/devicetree/bindings/spi/spi_pl022.txt b/Documentation/devicetree/bindings/spi/spi_pl022.txt
deleted file mode 100644
index 7638b4968ddb..000000000000
--- a/Documentation/devicetree/bindings/spi/spi_pl022.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-ARM PL022 SPI controller
-
-Required properties:
-- compatible : "arm,pl022", "arm,primecell"
-- reg : Offset and length of the register set for the device
-- interrupts : Should contain SPI controller interrupt
-- num-cs : total number of chipselects
-
-Optional properties:
-- cs-gpios : should specify GPIOs used for chipselects.
-  The gpios will be referred to as reg = <index> in the SPI child nodes.
-  If unspecified, a single SPI device without a chip select can be used.
-- pl022,autosuspend-delay : delay in ms following transfer completion before
-			    the runtime power management system suspends the
-			    device. A setting of 0 indicates no delay and the
-                            device will be suspended immediately
-- pl022,rt : indicates the controller should run the message pump with realtime
-             priority to minimise the transfer latency on the bus (boolean)
-- dmas : Two or more DMA channel specifiers following the convention outlined
-         in bindings/dma/dma.txt
-- dma-names: Names for the dma channels, if present. There must be at
-	     least one channel named "tx" for transmit and named "rx" for
-             receive.
-
-
-SPI slave nodes must be children of the SPI master node and can
-contain the following properties.
-
-- pl022,interface : interface type:
-	0: SPI
-	1: Texas Instruments Synchronous Serial Frame Format
-	2: Microwire (Half Duplex)
-- pl022,com-mode : specifies the transfer mode:
-	0: interrupt mode
-	1: polling mode (default mode if property not present)
-	2: DMA mode
-- pl022,rx-level-trig : Rx FIFO watermark level
-- pl022,tx-level-trig : Tx FIFO watermark level
-- pl022,ctrl-len : Microwire interface: Control length
-- pl022,wait-state : Microwire interface: Wait state
-- pl022,duplex : Microwire interface: Full/Half duplex
-
-
-Example:
-
-	spi@e0100000 {
-		compatible = "arm,pl022", "arm,primecell";
-		reg = <0xe0100000 0x1000>;
-		#address-cells = <1>;
-		#size-cells = <0>;
-		interrupts = <0 31 0x4>;
-		dmas = <&dma-controller 23 1>,
-			<&dma-controller 24 0>;
-		dma-names = "rx", "tx";
-
-		m25p80@1 {
-			compatible = "st,m25p80";
-			reg = <1>;
-			spi-max-frequency = <12000000>;
-			spi-cpol;
-			spi-cpha;
-			pl022,interface = <0>;
-			pl022,com-mode = <0x2>;
-			pl022,rx-level-trig = <0>;
-			pl022,tx-level-trig = <0>;
-			pl022,ctrl-len = <0x11>;
-			pl022,wait-state = <0>;
-			pl022,duplex = <0>;
-		};
-	};
diff --git a/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.txt b/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.txt
new file mode 100644
index 000000000000..d57659996d62
--- /dev/null
+++ b/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.txt
@@ -0,0 +1,25 @@
+NXP System Counter Module(sys_ctr)
+
+The system counter(sys_ctr) is a programmable system counter which provides
+a shared time base to Cortex A15, A7, A53, A73, etc. it is intended for use in
+applications where the counter is always powered and support multiple,
+unrelated clocks. The compare frame inside can be used for timer purpose.
+
+Required properties:
+
+- compatible :      should be "nxp,sysctr-timer"
+- reg :             Specifies the base physical address and size of the comapre
+                    frame and the counter control, read & compare.
+- interrupts :      should be the first compare frames' interrupt
+- clocks : 	    Specifies the counter clock.
+- clock-names: 	    Specifies the clock's name of this module
+
+Example:
+
+	system_counter: timer@306a0000 {
+		compatible = "nxp,sysctr-timer";
+		reg = <0x306a0000 0x20000>;/* system-counter-rd & compare */
+		clocks = <&clk_8m>;
+		clock-names = "per";
+		interrupts = <GIC_SPI 47 IRQ_TYPE_LEVEL_HIGH>;
+	};
diff --git a/Documentation/devicetree/bindings/timer/renesas,cmt.txt b/Documentation/devicetree/bindings/timer/renesas,cmt.txt
index c0594450e9ef..c5220bcd852b 100644
--- a/Documentation/devicetree/bindings/timer/renesas,cmt.txt
+++ b/Documentation/devicetree/bindings/timer/renesas,cmt.txt
@@ -42,12 +42,18 @@ Required Properties:
     - "renesas,r8a7793-cmt1" for the 48-bit CMT1 device included in r8a7793.
     - "renesas,r8a7794-cmt0" for the 32-bit CMT0 device included in r8a7794.
     - "renesas,r8a7794-cmt1" for the 48-bit CMT1 device included in r8a7794.
+    - "renesas,r8a7795-cmt0" for the 32-bit CMT0 device included in r8a7795.
+    - "renesas,r8a7795-cmt1" for the 48-bit CMT1 device included in r8a7795.
     - "renesas,r8a7796-cmt0" for the 32-bit CMT0 device included in r8a7796.
     - "renesas,r8a7796-cmt1" for the 48-bit CMT1 device included in r8a7796.
+    - "renesas,r8a77965-cmt0" for the 32-bit CMT0 device included in r8a77965.
+    - "renesas,r8a77965-cmt1" for the 48-bit CMT1 device included in r8a77965.
     - "renesas,r8a77970-cmt0" for the 32-bit CMT0 device included in r8a77970.
     - "renesas,r8a77970-cmt1" for the 48-bit CMT1 device included in r8a77970.
     - "renesas,r8a77980-cmt0" for the 32-bit CMT0 device included in r8a77980.
     - "renesas,r8a77980-cmt1" for the 48-bit CMT1 device included in r8a77980.
+    - "renesas,r8a77990-cmt0" for the 32-bit CMT0 device included in r8a77990.
+    - "renesas,r8a77990-cmt1" for the 48-bit CMT1 device included in r8a77990.
 
     - "renesas,rcar-gen2-cmt0" for 32-bit CMT0 devices included in R-Car Gen2
 		and RZ/G1.
diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml
index 747fd3f689dc..2e742d399e87 100644
--- a/Documentation/devicetree/bindings/trivial-devices.yaml
+++ b/Documentation/devicetree/bindings/trivial-devices.yaml
@@ -52,6 +52,10 @@ properties:
           - at,24c08
             # i2c trusted platform module (TPM)
           - atmel,at97sc3204t
+            # i2c h/w symmetric crypto module
+          - atmel,atsha204a
+            # i2c h/w elliptic curve crypto module
+          - atmel,atecc508a
             # CM32181: Ambient Light Sensor
           - capella,cm32181
             # CM3232: Ambient Light Sensor
diff --git a/Documentation/devicetree/bindings/usb/dwc2.txt b/Documentation/devicetree/bindings/usb/dwc2.txt
index 49eac0dc86b0..aafff3a6904d 100644
--- a/Documentation/devicetree/bindings/usb/dwc2.txt
+++ b/Documentation/devicetree/bindings/usb/dwc2.txt
@@ -42,6 +42,8 @@ Refer to phy/phy-bindings.txt for generic phy consumer properties
 - g-rx-fifo-size: size of rx fifo size in gadget mode.
 - g-np-tx-fifo-size: size of non-periodic tx fifo size in gadget mode.
 - g-tx-fifo-size: size of periodic tx fifo per endpoint (except ep0) in gadget mode.
+- snps,need-phy-for-wake: If present indicates that the phy needs to be left
+                          on for remote wakeup during suspend.
 - snps,reset-phy-on-wake: If present indicates that we need to reset the PHY when
                           we detect a wakeup.  This is due to a hardware errata.
 
@@ -58,4 +60,5 @@ Example:
 		clock-names = "otg";
 		phys = <&usbphy>;
 		phy-names = "usb2-phy";
+		snps,need-phy-for-wake;
         };
diff --git a/Documentation/devicetree/bindings/usb/dwc3.txt b/Documentation/devicetree/bindings/usb/dwc3.txt
index 8e5265e9f658..66780a47ad85 100644
--- a/Documentation/devicetree/bindings/usb/dwc3.txt
+++ b/Documentation/devicetree/bindings/usb/dwc3.txt
@@ -64,6 +64,8 @@ Optional properties:
  - snps,dis_u2_susphy_quirk: when set core will disable USB2 suspend phy.
  - snps,dis_enblslpm_quirk: when set clears the enblslpm in GUSB2PHYCFG,
 			disabling the suspend signal to the PHY.
+ - snps,dis-u1-entry-quirk: set if link entering into U1 needs to be disabled.
+ - snps,dis-u2-entry-quirk: set if link entering into U2 needs to be disabled.
  - snps,dis_rxdet_inp3_quirk: when set core will disable receiver detection
 			in PHY P3 power state.
  - snps,dis-u2-freeclk-exists-quirk: when set, clear the u2_freeclk_exists
diff --git a/Documentation/devicetree/bindings/usb/generic-ehci.yaml b/Documentation/devicetree/bindings/usb/generic-ehci.yaml
index d3b4f6415920..059f6ef1ad4a 100644
--- a/Documentation/devicetree/bindings/usb/generic-ehci.yaml
+++ b/Documentation/devicetree/bindings/usb/generic-ehci.yaml
@@ -74,7 +74,7 @@ additionalProperties: false
 
 examples:
   - |
-    ehci@e0000300 {
+    usb@e0000300 {
         compatible = "ibm,usb-ehci-440epx", "generic-ehci";
         interrupt-parent = <&UIC0>;
         interrupts = <0x1a 4>;
@@ -89,7 +89,6 @@ examples:
         interrupts = <39>;
         clocks = <&ahb_gates 1>;
         phys = <&usbphy 1>;
-        phy-names = "usb";
     };
 
 ...
diff --git a/Documentation/devicetree/bindings/usb/renesas_usb3.txt b/Documentation/devicetree/bindings/usb/renesas,usb3.txt
index 35039e720515..35039e720515 100644
--- a/Documentation/devicetree/bindings/usb/renesas_usb3.txt
+++ b/Documentation/devicetree/bindings/usb/renesas,usb3.txt
diff --git a/Documentation/devicetree/bindings/usb/renesas,usbhs.txt b/Documentation/devicetree/bindings/usb/renesas,usbhs.txt
new file mode 100644
index 000000000000..e39255ea6e4f
--- /dev/null
+++ b/Documentation/devicetree/bindings/usb/renesas,usbhs.txt
@@ -0,0 +1,57 @@
+Renesas Electronics USBHS driver
+
+Required properties:
+  - compatible: Must contain one or more of the following:
+
+	- "renesas,usbhs-r8a7743" for r8a7743 (RZ/G1M) compatible device
+	- "renesas,usbhs-r8a7744" for r8a7744 (RZ/G1N) compatible device
+	- "renesas,usbhs-r8a7745" for r8a7745 (RZ/G1E) compatible device
+	- "renesas,usbhs-r8a77470" for r8a77470 (RZ/G1C) compatible device
+	- "renesas,usbhs-r8a774a1" for r8a774a1 (RZ/G2M) compatible device
+	- "renesas,usbhs-r8a774c0" for r8a774c0 (RZ/G2E) compatible device
+	- "renesas,usbhs-r8a7790" for r8a7790 (R-Car H2) compatible device
+	- "renesas,usbhs-r8a7791" for r8a7791 (R-Car M2-W) compatible device
+	- "renesas,usbhs-r8a7792" for r8a7792 (R-Car V2H) compatible device
+	- "renesas,usbhs-r8a7793" for r8a7793 (R-Car M2-N) compatible device
+	- "renesas,usbhs-r8a7794" for r8a7794 (R-Car E2) compatible device
+	- "renesas,usbhs-r8a7795" for r8a7795 (R-Car H3) compatible device
+	- "renesas,usbhs-r8a7796" for r8a7796 (R-Car M3-W) compatible device
+	- "renesas,usbhs-r8a77965" for r8a77965 (R-Car M3-N) compatible device
+	- "renesas,usbhs-r8a77990" for r8a77990 (R-Car E3) compatible device
+	- "renesas,usbhs-r8a77995" for r8a77995 (R-Car D3) compatible device
+	- "renesas,usbhs-r7s72100" for r7s72100 (RZ/A1) compatible device
+	- "renesas,usbhs-r7s9210" for r7s9210 (RZ/A2) compatible device
+	- "renesas,rcar-gen2-usbhs" for R-Car Gen2 or RZ/G1 compatible devices
+	- "renesas,rcar-gen3-usbhs" for R-Car Gen3 or RZ/G2 compatible devices
+	- "renesas,rza1-usbhs" for RZ/A1 compatible device
+	- "renesas,rza2-usbhs" for RZ/A2 compatible device
+
+	When compatible with the generic version, nodes must list the
+	SoC-specific version corresponding to the platform first followed
+	by the generic version.
+
+  - reg: Base address and length of the register for the USBHS
+  - interrupts: Interrupt specifier for the USBHS
+  - clocks: A list of phandle + clock specifier pairs.
+	    - In case of "renesas,rcar-gen3-usbhs", two clocks are required.
+	      First clock should be peripheral and second one should be host.
+	    - In case of except above, one clock is required. First clock
+	      should be peripheral.
+
+Optional properties:
+  - renesas,buswait: Integer to use BUSWAIT register
+  - renesas,enable-gpio: A gpio specifier to check GPIO determining if USB
+			 function should be enabled
+  - phys: phandle + phy specifier pair
+  - phy-names: must be "usb"
+  - dmas: Must contain a list of references to DMA specifiers.
+  - dma-names : named "ch%d", where %d is the channel number ranging from zero
+                to the number of channels (DnFIFOs) minus one.
+
+Example:
+	usbhs: usb@e6590000 {
+		compatible = "renesas,usbhs-r8a7790", "renesas,rcar-gen2-usbhs";
+		reg = <0 0xe6590000 0 0x100>;
+		interrupts = <0 107 IRQ_TYPE_LEVEL_HIGH>;
+		clocks = <&mstp7_clks R8A7790_CLK_HSUSB>;
+	};
diff --git a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt b/Documentation/devicetree/bindings/usb/renesas_usbhs.txt
deleted file mode 100644
index b8acc2a994a8..000000000000
--- a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-Renesas Electronics USBHS driver
-
-Required properties:
-  - compatible: Must contain one or more of the following:
-
-	- "renesas,usbhs-r8a7743" for r8a7743 (RZ/G1M) compatible device
-	- "renesas,usbhs-r8a7744" for r8a7744 (RZ/G1N) compatible device
-	- "renesas,usbhs-r8a7745" for r8a7745 (RZ/G1E) compatible device
-	- "renesas,usbhs-r8a77470" for r8a77470 (RZ/G1C) compatible device
-	- "renesas,usbhs-r8a774a1" for r8a774a1 (RZ/G2M) compatible device
-	- "renesas,usbhs-r8a774c0" for r8a774c0 (RZ/G2E) compatible device
-	- "renesas,usbhs-r8a7790" for r8a7790 (R-Car H2) compatible device
-	- "renesas,usbhs-r8a7791" for r8a7791 (R-Car M2-W) compatible device
-	- "renesas,usbhs-r8a7792" for r8a7792 (R-Car V2H) compatible device
-	- "renesas,usbhs-r8a7793" for r8a7793 (R-Car M2-N) compatible device
-	- "renesas,usbhs-r8a7794" for r8a7794 (R-Car E2) compatible device
-	- "renesas,usbhs-r8a7795" for r8a7795 (R-Car H3) compatible device
-	- "renesas,usbhs-r8a7796" for r8a7796 (R-Car M3-W) compatible device
-	- "renesas,usbhs-r8a77965" for r8a77965 (R-Car M3-N) compatible device
-	- "renesas,usbhs-r8a77990" for r8a77990 (R-Car E3) compatible device
-	- "renesas,usbhs-r8a77995" for r8a77995 (R-Car D3) compatible device
-	- "renesas,usbhs-r7s72100" for r7s72100 (RZ/A1) compatible device
-	- "renesas,rcar-gen2-usbhs" for R-Car Gen2 or RZ/G1 compatible devices
-	- "renesas,rcar-gen3-usbhs" for R-Car Gen3 or RZ/G2 compatible devices
-	- "renesas,rza1-usbhs" for RZ/A1 compatible device
-
-	When compatible with the generic version, nodes must list the
-	SoC-specific version corresponding to the platform first followed
-	by the generic version.
-
-  - reg: Base address and length of the register for the USBHS
-  - interrupts: Interrupt specifier for the USBHS
-  - clocks: A list of phandle + clock specifier pairs.
-	    - In case of "renesas,rcar-gen3-usbhs", two clocks are required.
-	      First clock should be peripheral and second one should be host.
-	    - In case of except above, one clock is required. First clock
-	      should be peripheral.
-
-Optional properties:
-  - renesas,buswait: Integer to use BUSWAIT register
-  - renesas,enable-gpio: A gpio specifier to check GPIO determining if USB
-			 function should be enabled
-  - phys: phandle + phy specifier pair
-  - phy-names: must be "usb"
-  - dmas: Must contain a list of references to DMA specifiers.
-  - dma-names : named "ch%d", where %d is the channel number ranging from zero
-                to the number of channels (DnFIFOs) minus one.
-
-Example:
-	usbhs: usb@e6590000 {
-		compatible = "renesas,usbhs-r8a7790", "renesas,rcar-gen2-usbhs";
-		reg = <0 0xe6590000 0 0x100>;
-		interrupts = <0 107 IRQ_TYPE_LEVEL_HIGH>;
-		clocks = <&mstp7_clks R8A7790_CLK_HSUSB>;
-	};
diff --git a/Documentation/devicetree/bindings/usb/s3c2410-usb.txt b/Documentation/devicetree/bindings/usb/s3c2410-usb.txt
index e45b38ce2986..26c85afd0b53 100644
--- a/Documentation/devicetree/bindings/usb/s3c2410-usb.txt
+++ b/Documentation/devicetree/bindings/usb/s3c2410-usb.txt
@@ -4,7 +4,7 @@ OHCI
 
 Required properties:
  - compatible: should be "samsung,s3c2410-ohci" for USB host controller
- - reg: address and lenght of the controller memory mapped region
+ - reg: address and length of the controller memory mapped region
  - interrupts: interrupt number for the USB OHCI controller
  - clocks: Should reference the bus and host clocks
  - clock-names: Should contain two strings
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml
index 33a65a45e319..6992bbbbffab 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.yaml
+++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml
@@ -49,6 +49,8 @@ patternProperties:
     description: Aeroflex Gaisler AB
   "^al,.*":
     description: Annapurna Labs
+  "^allegro,.*":
+    description: Allegro DVT
   "^allo,.*":
     description: Allo.com
   "^allwinner,.*":
@@ -147,6 +149,8 @@ patternProperties:
     description: Broadcom Corporation
   "^buffalo,.*":
     description: Buffalo, Inc.
+  "^bur,.*":
+    description: B&R Industrial Automation GmbH
   "^bticino,.*":
     description: Bticino International
   "^calxeda,.*":
@@ -175,6 +179,8 @@ patternProperties:
     description: Common Hardware Reference Platform
   "^chunghwa,.*":
     description: Chunghwa Picture Tubes Ltd.
+  "^chuwi,.*":
+    description: Chuwi Innovation Ltd.
   "^ciaa,.*":
     description: Computadora Industrial Abierta Argentina
   "^cirrus,.*":
@@ -185,8 +191,12 @@ patternProperties:
     description: Chips&Media, Inc.
   "^cnxt,.*":
     description: Conexant Systems, Inc.
+  "^colorfly,.*":
+    description: Colorful GRP, Shenzhen Xueyushi Technology Ltd.
   "^compulab,.*":
     description: CompuLab Ltd.
+  "^corpro,.*":
+    description: Chengdu Corpro Technology Co., Ltd.
   "^cortina,.*":
     description: Cortina Systems, Inc.
   "^cosmic,.*":
@@ -199,6 +209,8 @@ patternProperties:
     description: Crystalfontz America, Inc.
   "^csky,.*":
     description: Hangzhou C-SKY Microsystems Co., Ltd
+  "^csq,.*":
+    description: Shenzen Chuangsiqi Technology Co.,Ltd.
   "^cubietech,.*":
     description: Cubietech, Ltd.
   "^cypress,.*":
@@ -219,6 +231,8 @@ patternProperties:
     description: Devantech, Ltd.
   "^dh,.*":
     description: DH electronics GmbH
+  "^difrnce,.*":
+    description: Shenzhen Yagu Electronic Technology Co., Ltd.
   "^digi,.*":
     description: Digi International Inc.
   "^digilent,.*":
@@ -241,6 +255,8 @@ patternProperties:
     description: DPTechnics
   "^dragino,.*":
     description: Dragino Technology Co., Limited
+  "^dserve,.*":
+    description: dServe Technology B.V.
   "^ea,.*":
     description: Embedded Artists AB
   "^ebs-systart,.*":
@@ -263,6 +279,8 @@ patternProperties:
     description: Emlid, Ltd.
   "^emmicro,.*":
     description: EM Microelectronic
+  "^empire-electronix,.*":
+    description: Empire Electronix
   "^emtrion,.*":
     description: emtrion GmbH
   "^endless,.*":
@@ -277,6 +295,8 @@ patternProperties:
     description: Ecole Polytechnique Fédérale de Lausanne
   "^epson,.*":
     description: Seiko Epson Corp.
+  "^esp,.*":
+    description: Espressif Systems Co. Ltd.
   "^est,.*":
     description: ESTeem Wireless Modems
   "^ettus,.*":
@@ -287,6 +307,8 @@ patternProperties:
     description: Everest Semiconductor Co. Ltd.
   "^everspin,.*":
     description: Everspin Technologies, Inc.
+  "^evervision,.*":
+    description: Evervision Electronics Co. Ltd.
   "^exar,.*":
     description: Exar Corporation
   "^excito,.*":
@@ -327,6 +349,8 @@ patternProperties:
     description: GE Fanuc Intelligent Platforms Embedded Systems, Inc.
   "^GEFanuc,.*":
     description: GE Fanuc Intelligent Platforms Embedded Systems, Inc.
+  "^gemei,.*":
+    description: Gemei Digital Technology Co., Ltd.
   "^geniatech,.*":
     description: Geniatech, Inc.
   "^giantec,.*":
@@ -371,12 +395,20 @@ patternProperties:
     description: Holt Integrated Circuits, Inc.
   "^honeywell,.*":
     description: Honeywell
+  "^hoperun,.*":
+    description: Jiangsu HopeRun Software Co., Ltd.
   "^hp,.*":
     description: Hewlett Packard
+  "^hsg,.*":
+    description: HannStar Display Co.
   "^holtek,.*":
     description: Holtek Semiconductor, Inc.
+  "^hugsun,.*":
+    description: Shenzhen Hugsun Technology Co. Ltd.
   "^hwacom,.*":
     description: HwaCom Systems Inc.
+  "^hyundai,.*":
+    description: Hyundai Technology
   "^i2se,.*":
     description: I2SE GmbH
   "^ibm,.*":
@@ -391,6 +423,10 @@ patternProperties:
     description: ILI Technology Corporation (ILITEK)
   "^img,.*":
     description: Imagination Technologies Ltd.
+  "^incircuit,.*":
+    description: In-Circuit GmbH
+  "^inet-tek,.*":
+    description: Shenzhen iNet Mobile Internet Technology Co., Ltd
   "^infineon,.*":
     description: Infineon Technologies
   "^inforce,.*":
@@ -425,6 +461,8 @@ patternProperties:
     description: Japan Display Inc.
   "^jedec,.*":
     description: JEDEC Solid State Technology Association
+  "^jesurun,.*":
+    description: Shenzhen Jesurun Electronics Business Dept.
   "^jianda,.*":
     description: Jiandangjing Technology Co., Ltd.
   "^karo,.*":
@@ -449,6 +487,8 @@ patternProperties:
     description: Rakuten Kobo Inc.
   "^koe,.*":
     description: Kaohsiung Opto-Electronics Inc.
+  "^kontron,.*":
+    description: Kontron S&T AG
   "^kosagi,.*":
     description: Sutajio Ko-Usagi PTE Ltd.
   "^kyo,.*":
@@ -457,6 +497,8 @@ patternProperties:
     description: LaCie
   "^laird,.*":
     description: Laird PLC
+  "^lamobo,.*":
+    description: Ketai Huajie Technology Co., Ltd.
   "^lantiq,.*":
     description: Lantiq Semiconductor
   "^lattice,.*":
@@ -475,6 +517,8 @@ patternProperties:
     description: Lichee Pi
   "^linaro,.*":
     description: Linaro Limited
+  "^linksprite,.*":
+    description: LinkSprite Technologies, Inc.
   "^linksys,.*":
     description: Belkin International, Inc. (Linksys)
   "^linux,.*":
@@ -491,6 +535,8 @@ patternProperties:
     description: Liebherr-Werk Nenzing GmbH
   "^macnica,.*":
     description: Macnica Americas
+  "^mapleboard,.*":
+    description: Mapleboard.org
   "^marvell,.*":
     description: Marvell Technology Group Ltd.
   "^maxbotix,.*":
@@ -531,6 +577,8 @@ patternProperties:
     description: Micron Technology Inc.
   "^mikroe,.*":
     description: MikroElektronika d.o.o.
+  "^miniand,.*":
+    description: Miniand Tech
   "^minix,.*":
     description: MINIX Technology Ltd.
   "^miramems,.*":
@@ -661,28 +709,38 @@ patternProperties:
     description: Picochip Ltd
   "^pine64,.*":
     description: Pine64
+  "^pineriver,.*":
+    description: Shenzhen PineRiver Designs Co., Ltd.
   "^pixcir,.*":
     description: PIXCIR MICROELECTRONICS Co., Ltd
   "^plantower,.*":
     description: Plantower Co., Ltd
   "^plathome,.*":
-    description: Plat'Home Co., Ltd.
+    description: Plat\'Home Co., Ltd.
   "^plda,.*":
     description: PLDA
   "^plx,.*":
     description: Broadcom Corporation (formerly PLX Technology)
   "^pni,.*":
     description: PNI Sensor Corporation
+  "^polaroid,.*":
+    description: Polaroid Corporation
   "^portwell,.*":
     description: Portwell Inc.
   "^poslab,.*":
     description: Poslab Technology Co., Ltd.
+  "^pov,.*":
+    description: Point of View International B.V.
   "^powervr,.*":
     description: PowerVR (deprecated, use img)
+  "^primux,.*":
+    description: Primux Trading, S.L.
   "^probox2,.*":
     description: PROBOX2 (by W2COMP Co., Ltd.)
   "^pulsedlight,.*":
     description: PulsedLight, Inc
+  "^purism,.*":
+    description: Purism, SPC
   "^qca,.*":
     description: Qualcomm Atheros, Inc.
   "^qcom,.*":
@@ -691,6 +749,8 @@ patternProperties:
     description: QEMU, a generic and open source machine emulator and virtualizer
   "^qi,.*":
     description: Qi Hardware
+  "^qihua,.*":
+    description: Chengdu Kaixuan Information Technology Co., Ltd.
   "^qiaodian,.*":
     description: QiaoDian XianShi Corporation
   "^qnap,.*":
@@ -713,6 +773,8 @@ patternProperties:
     description: Realtek Semiconductor Corp.
   "^renesas,.*":
     description: Renesas Electronics Corporation
+  "^rervision,.*":
+    description: Shenzhen Rervision Technology Co., Ltd.
   "^richtek,.*":
     description: Richtek Technology Corporation
   "^ricoh,.*":
@@ -781,8 +843,14 @@ patternProperties:
     description: Silergy Corp.
   "^siliconmitus,.*":
     description: Silicon Mitus, Inc.
-  "^simte,.*":
-    description: k
+  "^simtek,.*":
+    description: Cypress Semiconductor Corporation (Simtek Corporation)
+  "^sinlinx,.*":
+    description: Sinlinx Electronics Technology Co., LTD
+  "^sinovoip,.*":
+    description: SinoVoip Co., Ltd
+  "^sipeed,.*":
+    description: Shenzhen Sipeed Technology Co., Ltd.
   "^sirf,.*":
     description: SiRF Technology, Inc.
   "^sis,.*":
@@ -795,6 +863,8 @@ patternProperties:
     description: Standard Microsystems Corporation
   "^snps,.*":
     description: Synopsys, Inc.
+  "^sochip,.*":
+    description: Shenzhen SoChip Technology Co., Ltd.
   "^socionext,.*":
     description: Socionext Inc.
   "^solidrun,.*":
@@ -849,6 +919,8 @@ patternProperties:
     description: Shenzhen Techstar Electronics Co., Ltd.
   "^terasic,.*":
     description: Terasic Inc.
+  "^tfc,.*":
+    description: Three Five Corp
   "^thine,.*":
     description: THine Electronics, Inc.
   "^ti,.*":
@@ -901,6 +973,8 @@ patternProperties:
     description: United Radiant Technology Corporation
   "^usi,.*":
     description: Universal Scientific Industrial Co., Ltd.
+  "^utoo,.*":
+    description: Aigo Digital Technology Co., Ltd.
   "^v3,.*":
     description: V3 Semiconductor
   "^vamrs,.*":
@@ -923,6 +997,8 @@ patternProperties:
     description: Voipac Technologies s.r.o.
   "^vot,.*":
     description: Vision Optical Technology Co., Ltd.
+  "^vxt,.*":
+    description: VXT Ltd
   "^wd,.*":
     description: Western Digital Corp.
   "^wetek,.*":
@@ -937,10 +1013,14 @@ patternProperties:
     description: Winbond Electronics corp.
   "^winstar,.*":
     description: Winstar Display Corp.
+  "^wits,.*":
+    description: Shenzhen Merrii Technology Co., Ltd. (WITS)
   "^wlf,.*":
     description: Wolfson Microelectronics
   "^wm,.*":
     description: Wondermedia Technologies, Inc.
+  "^wobo,.*":
+    description: Wobo
   "^x-powers,.*":
     description: X-Powers
   "^xes,.*":
@@ -951,6 +1031,8 @@ patternProperties:
     description: Xilinx
   "^xunlong,.*":
     description: Shenzhen Xunlong Software CO.,Limited
+  "^yones-toptech,.*":
+    description: Yones Toptech Co., Ltd.
   "^ysoft,.*":
     description: Y Soft Corporation a.s.
   "^zarlink,.*":
@@ -968,7 +1050,7 @@ patternProperties:
 
   # Normal property name match without a comma
   # These should catch all node/property names without a prefix
-  "^[a-zA-Z0-9#][a-zA-Z0-9+\\-._@]{0,63}$": true
+  "^[a-zA-Z0-9#_][a-zA-Z0-9+\\-._@]{0,63}$": true
   "^[a-zA-Z0-9+\\-._]*@[0-9a-zA-Z,]*$": true
   "^#.*": true
 
diff --git a/Documentation/devicetree/bindings/virtio/iommu.txt b/Documentation/devicetree/bindings/virtio/iommu.txt
new file mode 100644
index 000000000000..2407fea0651c
--- /dev/null
+++ b/Documentation/devicetree/bindings/virtio/iommu.txt
@@ -0,0 +1,66 @@
+* virtio IOMMU PCI device
+
+When virtio-iommu uses the PCI transport, its programming interface is
+discovered dynamically by the PCI probing infrastructure. However the
+device tree statically describes the relation between IOMMU and DMA
+masters. Therefore, the PCI root complex that hosts the virtio-iommu
+contains a child node representing the IOMMU device explicitly.
+
+Required properties:
+
+- compatible:	Should be "virtio,pci-iommu"
+- reg:		PCI address of the IOMMU. As defined in the PCI Bus
+		Binding reference [1], the reg property is a five-cell
+		address encoded as (phys.hi phys.mid phys.lo size.hi
+		size.lo). phys.hi should contain the device's BDF as
+		0b00000000 bbbbbbbb dddddfff 00000000. The other cells
+		should be zero.
+- #iommu-cells:	Each platform DMA master managed by the IOMMU is assigned
+		an endpoint ID, described by the "iommus" property [2].
+		For virtio-iommu, #iommu-cells must be 1.
+
+Notes:
+
+- DMA from the IOMMU device isn't managed by another IOMMU. Therefore the
+  virtio-iommu node doesn't have an "iommus" property, and is omitted from
+  the iommu-map property of the root complex.
+
+Example:
+
+pcie@10000000 {
+	compatible = "pci-host-ecam-generic";
+	...
+
+	/* The IOMMU programming interface uses slot 00:01.0 */
+	iommu0: iommu@0008 {
+		compatible = "virtio,pci-iommu";
+		reg = <0x00000800 0 0 0 0>;
+		#iommu-cells = <1>;
+	};
+
+	/*
+	 * The IOMMU manages all functions in this PCI domain except
+	 * itself. Omit BDF 00:01.0.
+	 */
+	iommu-map = <0x0 &iommu0 0x0 0x8>
+		    <0x9 &iommu0 0x9 0xfff7>;
+};
+
+pcie@20000000 {
+	compatible = "pci-host-ecam-generic";
+	...
+	/*
+	 * The IOMMU also manages all functions from this domain,
+	 * with endpoint IDs 0x10000 - 0x1ffff
+	 */
+	iommu-map = <0x0 &iommu0 0x10000 0x10000>;
+};
+
+ethernet@fe001000 {
+	...
+	/* The IOMMU manages this platform device with endpoint ID 0x20000 */
+	iommus = <&iommu0 0x20000>;
+};
+
+[1] Documentation/devicetree/bindings/pci/pci.txt
+[2] Documentation/devicetree/bindings/iommu/iommu.txt
diff --git a/Documentation/devicetree/bindings/virtio/mmio.txt b/Documentation/devicetree/bindings/virtio/mmio.txt
index 5069c1b8e193..21af30fbb81f 100644
--- a/Documentation/devicetree/bindings/virtio/mmio.txt
+++ b/Documentation/devicetree/bindings/virtio/mmio.txt
@@ -8,10 +8,40 @@ Required properties:
 - reg:		control registers base address and size including configuration space
 - interrupts:	interrupt generated by the device
 
+Required properties for virtio-iommu:
+
+- #iommu-cells:	When the node corresponds to a virtio-iommu device, it is
+		linked to DMA masters using the "iommus" or "iommu-map"
+		properties [1][2]. #iommu-cells specifies the size of the
+		"iommus" property. For virtio-iommu #iommu-cells must be
+		1, each cell describing a single endpoint ID.
+
+Optional properties:
+
+- iommus:	If the device accesses memory through an IOMMU, it should
+		have an "iommus" property [1]. Since virtio-iommu itself
+		does not access memory through an IOMMU, the "virtio,mmio"
+		node cannot have both an "#iommu-cells" and an "iommus"
+		property.
+
 Example:
 
 	virtio_block@3000 {
 		compatible = "virtio,mmio";
 		reg = <0x3000 0x100>;
 		interrupts = <41>;
+
+		/* Device has endpoint ID 23 */
+		iommus = <&viommu 23>
 	}
+
+	viommu: iommu@3100 {
+		compatible = "virtio,mmio";
+		reg = <0x3100 0x100>;
+		interrupts = <42>;
+
+		#iommu-cells = <1>
+	}
+
+[1] Documentation/devicetree/bindings/iommu/iommu.txt
+[2] Documentation/devicetree/bindings/pci/pci-iommu.txt
diff --git a/Documentation/devicetree/bindings/watchdog/fsl-imx-sc-wdt.txt b/Documentation/devicetree/bindings/watchdog/fsl-imx-sc-wdt.txt
deleted file mode 100644
index 02b87e92ae68..000000000000
--- a/Documentation/devicetree/bindings/watchdog/fsl-imx-sc-wdt.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-* Freescale i.MX System Controller Watchdog
-
-i.MX system controller watchdog is for i.MX SoCs with system controller inside,
-the watchdog is managed by system controller, users can ONLY communicate with
-system controller from secure mode for watchdog operations, so Linux i.MX system
-controller watchdog driver will call ARM SMC API and trap into ARM-Trusted-Firmware
-for watchdog operations, ARM-Trusted-Firmware is running at secure EL3 mode and
-it will request system controller to execute the watchdog operation passed from
-Linux kernel.
-
-Required properties:
-- compatible:	Should be :
-		"fsl,imx8qxp-sc-wdt"
-		followed by "fsl,imx-sc-wdt";
-
-Optional properties:
-- timeout-sec : Contains the watchdog timeout in seconds.
-
-Examples:
-
-watchdog {
-	compatible = "fsl,imx8qxp-sc-wdt", "fsl,imx-sc-wdt";
-	timeout-sec = <60>;
-};
diff --git a/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt b/Documentation/devicetree/bindings/watchdog/renesas,wdt.txt
index 9f365c1a3399..9f365c1a3399 100644
--- a/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt
+++ b/Documentation/devicetree/bindings/watchdog/renesas,wdt.txt
diff --git a/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt b/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt
index 46055254e8dd..e65198d82a2b 100644
--- a/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt
+++ b/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt
@@ -6,6 +6,7 @@ Required properties:
 	"allwinner,sun4i-a10-wdt"
 	"allwinner,sun6i-a31-wdt"
 	"allwinner,sun50i-a64-wdt","allwinner,sun6i-a31-wdt"
+	"allwinner,sun50i-h6-wdt","allwinner,sun6i-a31-wdt"
 	"allwinner,suniv-f1c100s-wdt", "allwinner,sun4i-a10-wdt"
 - reg : Specifies base physical address and size of the registers.
 
diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt
index e86bd2f64117..4660ccee35a3 100644
--- a/Documentation/devicetree/booting-without-of.txt
+++ b/Documentation/devicetree/booting-without-of.txt
@@ -160,7 +160,7 @@ it with special cases.
    of the kernel image. That entry point supports two calling
    conventions.  A summary of the interface is described here.  A full
    description of the boot requirements is documented in
-   Documentation/arm/Booting
+   Documentation/arm/booting.rst
 
         a) ATAGS interface.  Minimal information is passed from firmware
         to the kernel with a tagged list of predefined parameters.
@@ -174,7 +174,7 @@ it with special cases.
         b) Entry with a flattened device-tree block.  Firmware loads the
         physical address of the flattened device tree block (dtb) into r2,
         r1 is not used, but it is considered good practice to use a valid
-        machine number as described in Documentation/arm/Booting.
+        machine number as described in Documentation/arm/booting.rst.
 
                 r0 : 0
 
@@ -277,7 +277,7 @@ it with special cases.
   the decompressor (the real mode entry point goes to the same  32bit
   entry point once it switched into protected mode). That entry point
   supports one calling convention which is documented in
-  Documentation/x86/boot.txt
+  Documentation/x86/boot.rst
   The physical pointer to the device-tree block (defined in chapter II)
   is passed via setup_data which requires at least boot protocol 2.09.
   The type filed is defined as
diff --git a/Documentation/doc-guide/conf.py b/Documentation/doc-guide/conf.py
deleted file mode 100644
index fd3731182d5a..000000000000
--- a/Documentation/doc-guide/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = 'Linux Kernel Documentation Guide'
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide',
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/doc-guide/kernel-doc.rst b/Documentation/doc-guide/kernel-doc.rst
index f96059767c8c..192c36af39e2 100644
--- a/Documentation/doc-guide/kernel-doc.rst
+++ b/Documentation/doc-guide/kernel-doc.rst
@@ -359,7 +359,7 @@ Domain`_ references.
   ``monospaced font``.
 
   Useful if you need to use special characters that would otherwise have some
-  meaning either by kernel-doc script of by reStructuredText.
+  meaning either by kernel-doc script or by reStructuredText.
 
   This is particularly useful if you need to use things like ``%ph`` inside
   a function description.
diff --git a/Documentation/doc-guide/sphinx.rst b/Documentation/doc-guide/sphinx.rst
index c039224b404e..f71ddd592aaa 100644
--- a/Documentation/doc-guide/sphinx.rst
+++ b/Documentation/doc-guide/sphinx.rst
@@ -27,8 +27,7 @@ Sphinx Install
 ==============
 
 The ReST markups currently used by the Documentation/ files are meant to be
-built with ``Sphinx`` version 1.3 or higher. If you desire to build
-PDF output, it is recommended to use version 1.4.6 or higher.
+built with ``Sphinx`` version 1.3 or higher.
 
 There's a script that checks for the Sphinx requirements. Please see
 :ref:`sphinx-pre-install` for further details.
@@ -56,13 +55,13 @@ or ``virtualenv``, depending on how your distribution packaged Python 3.
       those expressions are written using LaTeX notation. It needs texlive
       installed with amdfonts and amsmath in order to evaluate them.
 
-In summary, if you want to install Sphinx version 1.4.9, you should do::
+In summary, if you want to install Sphinx version 1.7.9, you should do::
 
-       $ virtualenv sphinx_1.4
-       $ . sphinx_1.4/bin/activate
-       (sphinx_1.4) $ pip install -r Documentation/sphinx/requirements.txt
+       $ virtualenv sphinx_1.7.9
+       $ . sphinx_1.7.9/bin/activate
+       (sphinx_1.7.9) $ pip install -r Documentation/sphinx/requirements.txt
 
-After running ``. sphinx_1.4/bin/activate``, the prompt will change,
+After running ``. sphinx_1.7.9/bin/activate``, the prompt will change,
 in order to indicate that you're using the new environment. If you
 open a new shell, you need to rerun this command to enter again at
 the virtual environment before building the documentation.
@@ -105,8 +104,8 @@ command line options for your distro::
 	You should run:
 
 		sudo dnf install -y texlive-luatex85
-		/usr/bin/virtualenv sphinx_1.4
-		. sphinx_1.4/bin/activate
+		/usr/bin/virtualenv sphinx_1.7.9
+		. sphinx_1.7.9/bin/activate
 		pip install -r Documentation/sphinx/requirements.txt
 
 	Can't build as 1 mandatory dependency is missing at ./scripts/sphinx-pre-install line 468.
@@ -218,7 +217,7 @@ Here are some specific guidelines for the kernel documentation:
   examples, etc.), use ``::`` for anything that doesn't really benefit
   from syntax highlighting, especially short snippets. Use
   ``.. code-block:: <language>`` for longer code blocks that benefit
-  from highlighting.
+  from highlighting. For a short snippet of code embedded in the text, use \`\`.
 
 
 the C domain
@@ -242,11 +241,14 @@ The C domain of the kernel-doc has some additional features. E.g. you can
 
 The func-name (e.g. ioctl) remains in the output but the ref-name changed from
 ``ioctl`` to ``VIDIOC_LOG_STATUS``. The index entry for this function is also
-changed to ``VIDIOC_LOG_STATUS`` and the function can now referenced by:
-
-.. code-block:: rst
-
-     :c:func:`VIDIOC_LOG_STATUS`
+changed to ``VIDIOC_LOG_STATUS``.
+
+Please note that there is no need to use ``c:func:`` to generate cross
+references to function documentation.  Due to some Sphinx extension magic,
+the documentation build system will automatically turn a reference to
+``function()`` into a cross reference if an index entry for the given
+function name exists.  If you see ``c:func:`` use in a kernel document,
+please feel free to remove it.
 
 
 list tables
diff --git a/Documentation/docutils.conf b/Documentation/docutils.conf
index 2830772264c8..f1a180b97dec 100644
--- a/Documentation/docutils.conf
+++ b/Documentation/docutils.conf
@@ -4,4 +4,4 @@
 # http://docutils.sourceforge.net/docs/user/config.html
 
 [general]
-halt_level: severe
-\ No newline at end of file
+halt_level: severe
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 5eba889ea84d..9f4392876099 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -30,6 +30,7 @@
 *.lzo
 *.mo
 *.moc
+*.mod
 *.mod.c
 *.o
 *.o.*
diff --git a/Documentation/driver-api/80211/conf.py b/Documentation/driver-api/80211/conf.py
deleted file mode 100644
index 4424b4b0b9c3..000000000000
--- a/Documentation/driver-api/80211/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = "Linux 802.11 Driver Developer's Guide"
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', '80211.tex', project,
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/driver-api/80211/mac80211-advanced.rst b/Documentation/driver-api/80211/mac80211-advanced.rst
index 70a89b2163c2..9f1c5bb7ac35 100644
--- a/Documentation/driver-api/80211/mac80211-advanced.rst
+++ b/Documentation/driver-api/80211/mac80211-advanced.rst
@@ -226,9 +226,6 @@ TBD
 .. kernel-doc:: include/net/mac80211.h
    :functions: ieee80211_tx_rate_control
 
-.. kernel-doc:: include/net/mac80211.h
-   :functions: rate_control_send_low
-
 TBD
 
 This part of the book describes mac80211 internals.
diff --git a/Documentation/driver-api/backlight/lp855x-driver.rst b/Documentation/driver-api/backlight/lp855x-driver.rst
new file mode 100644
index 000000000000..1e0b224fc397
--- /dev/null
+++ b/Documentation/driver-api/backlight/lp855x-driver.rst
@@ -0,0 +1,81 @@
+====================
+Kernel driver lp855x
+====================
+
+Backlight driver for LP855x ICs
+
+Supported chips:
+
+	Texas Instruments LP8550, LP8551, LP8552, LP8553, LP8555, LP8556 and
+	LP8557
+
+Author: Milo(Woogyom) Kim <milo.kim@ti.com>
+
+Description
+-----------
+
+* Brightness control
+
+  Brightness can be controlled by the pwm input or the i2c command.
+  The lp855x driver supports both cases.
+
+* Device attributes
+
+  1) bl_ctl_mode
+
+  Backlight control mode.
+
+  Value: pwm based or register based
+
+  2) chip_id
+
+  The lp855x chip id.
+
+  Value: lp8550/lp8551/lp8552/lp8553/lp8555/lp8556/lp8557
+
+Platform data for lp855x
+------------------------
+
+For supporting platform specific data, the lp855x platform data can be used.
+
+* name:
+	Backlight driver name. If it is not defined, default name is set.
+* device_control:
+	Value of DEVICE CONTROL register.
+* initial_brightness:
+	Initial value of backlight brightness.
+* period_ns:
+	Platform specific PWM period value. unit is nano.
+	Only valid when brightness is pwm input mode.
+* size_program:
+	Total size of lp855x_rom_data.
+* rom_data:
+	List of new eeprom/eprom registers.
+
+Examples
+========
+
+1) lp8552 platform data: i2c register mode with new eeprom data::
+
+    #define EEPROM_A5_ADDR	0xA5
+    #define EEPROM_A5_VAL	0x4f	/* EN_VSYNC=0 */
+
+    static struct lp855x_rom_data lp8552_eeprom_arr[] = {
+	{EEPROM_A5_ADDR, EEPROM_A5_VAL},
+    };
+
+    static struct lp855x_platform_data lp8552_pdata = {
+	.name = "lcd-bl",
+	.device_control = I2C_CONFIG(LP8552),
+	.initial_brightness = INITIAL_BRT,
+	.size_program = ARRAY_SIZE(lp8552_eeprom_arr),
+	.rom_data = lp8552_eeprom_arr,
+    };
+
+2) lp8556 platform data: pwm input mode with default rom data::
+
+    static struct lp855x_platform_data lp8556_pdata = {
+	.device_control = PWM_CONFIG(LP8556),
+	.initial_brightness = INITIAL_BRT,
+	.period_ns = 1000000,
+    };
diff --git a/Documentation/driver-api/basics.rst b/Documentation/driver-api/basics.rst
index e970fadf4d1a..1ba88c7b3984 100644
--- a/Documentation/driver-api/basics.rst
+++ b/Documentation/driver-api/basics.rst
@@ -115,9 +115,6 @@ Kernel utility functions
 .. kernel-doc:: kernel/rcu/tree.c
    :export:
 
-.. kernel-doc:: kernel/rcu/tree_plugin.h
-   :export:
-
 .. kernel-doc:: kernel/rcu/update.c
    :export:
 
diff --git a/Documentation/bt8xxgpio.txt b/Documentation/driver-api/bt8xxgpio.rst
index a845feb074de..a845feb074de 100644
--- a/Documentation/bt8xxgpio.txt
+++ b/Documentation/driver-api/bt8xxgpio.rst
diff --git a/Documentation/driver-api/clk.rst b/Documentation/driver-api/clk.rst
index 593cca5058b1..3cad45d14187 100644
--- a/Documentation/driver-api/clk.rst
+++ b/Documentation/driver-api/clk.rst
@@ -175,9 +175,9 @@ the following::
 To take advantage of your data you'll need to support valid operations
 for your clk::
 
-	struct clk_ops clk_foo_ops {
-		.enable		= &clk_foo_enable;
-		.disable	= &clk_foo_disable;
+	struct clk_ops clk_foo_ops = {
+		.enable		= &clk_foo_enable,
+		.disable	= &clk_foo_disable,
 	};
 
 Implement the above functions using container_of::
diff --git a/Documentation/driver-api/conf.py b/Documentation/driver-api/conf.py
deleted file mode 100644
index 202726d20088..000000000000
--- a/Documentation/driver-api/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = "The Linux driver implementer's API guide"
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'driver-api.tex', project,
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/driver-api/connector.rst b/Documentation/driver-api/connector.rst
new file mode 100644
index 000000000000..c100c7482289
--- /dev/null
+++ b/Documentation/driver-api/connector.rst
@@ -0,0 +1,156 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+================
+Kernel Connector
+================
+
+Kernel connector - new netlink based userspace <-> kernel space easy
+to use communication module.
+
+The Connector driver makes it easy to connect various agents using a
+netlink based network.  One must register a callback and an identifier.
+When the driver receives a special netlink message with the appropriate
+identifier, the appropriate callback will be called.
+
+From the userspace point of view it's quite straightforward:
+
+	- socket();
+	- bind();
+	- send();
+	- recv();
+
+But if kernelspace wants to use the full power of such connections, the
+driver writer must create special sockets, must know about struct sk_buff
+handling, etc...  The Connector driver allows any kernelspace agents to use
+netlink based networking for inter-process communication in a significantly
+easier way::
+
+  int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *));
+  void cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __group, int gfp_mask);
+  void cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, int gfp_mask);
+
+  struct cb_id
+  {
+	__u32			idx;
+	__u32			val;
+  };
+
+idx and val are unique identifiers which must be registered in the
+connector.h header for in-kernel usage.  `void (*callback) (void *)` is a
+callback function which will be called when a message with above idx.val
+is received by the connector core.  The argument for that function must
+be dereferenced to `struct cn_msg *`::
+
+  struct cn_msg
+  {
+	struct cb_id		id;
+
+	__u32			seq;
+	__u32			ack;
+
+	__u32			len;	/* Length of the following data */
+	__u8			data[0];
+  };
+
+Connector interfaces
+====================
+
+ .. kernel-doc:: include/linux/connector.h
+
+ Note:
+   When registering new callback user, connector core assigns
+   netlink group to the user which is equal to its id.idx.
+
+Protocol description
+====================
+
+The current framework offers a transport layer with fixed headers.  The
+recommended protocol which uses such a header is as following:
+
+msg->seq and msg->ack are used to determine message genealogy.  When
+someone sends a message, they use a locally unique sequence and random
+acknowledge number.  The sequence number may be copied into
+nlmsghdr->nlmsg_seq too.
+
+The sequence number is incremented with each message sent.
+
+If you expect a reply to the message, then the sequence number in the
+received message MUST be the same as in the original message, and the
+acknowledge number MUST be the same + 1.
+
+If we receive a message and its sequence number is not equal to one we
+are expecting, then it is a new message.  If we receive a message and
+its sequence number is the same as one we are expecting, but its
+acknowledge is not equal to the sequence number in the original
+message + 1, then it is a new message.
+
+Obviously, the protocol header contains the above id.
+
+The connector allows event notification in the following form: kernel
+driver or userspace process can ask connector to notify it when
+selected ids will be turned on or off (registered or unregistered its
+callback).  It is done by sending a special command to the connector
+driver (it also registers itself with id={-1, -1}).
+
+As example of this usage can be found in the cn_test.c module which
+uses the connector to request notification and to send messages.
+
+Reliability
+===========
+
+Netlink itself is not a reliable protocol.  That means that messages can
+be lost due to memory pressure or process' receiving queue overflowed,
+so caller is warned that it must be prepared.  That is why the struct
+cn_msg [main connector's message header] contains u32 seq and u32 ack
+fields.
+
+Userspace usage
+===============
+
+2.6.14 has a new netlink socket implementation, which by default does not
+allow people to send data to netlink groups other than 1.
+So, if you wish to use a netlink socket (for example using connector)
+with a different group number, the userspace application must subscribe to
+that group first.  It can be achieved by the following pseudocode::
+
+  s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+
+  l_local.nl_family = AF_NETLINK;
+  l_local.nl_groups = 12345;
+  l_local.nl_pid = 0;
+
+  if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
+	perror("bind");
+	close(s);
+	return -1;
+  }
+
+  {
+	int on = l_local.nl_groups;
+	setsockopt(s, 270, 1, &on, sizeof(on));
+  }
+
+Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket
+option.  To drop a multicast subscription, one should call the above socket
+option with the NETLINK_DROP_MEMBERSHIP parameter which is defined as 0.
+
+2.6.14 netlink code only allows to select a group which is less or equal to
+the maximum group number, which is used at netlink_kernel_create() time.
+In case of connector it is CN_NETLINK_USERS + 0xf, so if you want to use
+group number 12345, you must increment CN_NETLINK_USERS to that number.
+Additional 0xf numbers are allocated to be used by non-in-kernel users.
+
+Due to this limitation, group 0xffffffff does not work now, so one can
+not use add/remove connector's group notifications, but as far as I know,
+only cn_test.c test module used it.
+
+Some work in netlink area is still being done, so things can be changed in
+2.6.15 timeframe, if it will happen, documentation will be updated for that
+kernel.
+
+Code samples
+============
+
+Sample code for a connector test module and user space can be found
+in samples/connector/. To build this code, enable CONFIG_CONNECTOR
+and CONFIG_SAMPLES.
diff --git a/Documentation/driver-api/console.rst b/Documentation/driver-api/console.rst
new file mode 100644
index 000000000000..8394ad7747ac
--- /dev/null
+++ b/Documentation/driver-api/console.rst
@@ -0,0 +1,152 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Console Drivers
+===============
+
+The Linux kernel has 2 general types of console drivers.  The first type is
+assigned by the kernel to all the virtual consoles during the boot process.
+This type will be called 'system driver', and only one system driver is allowed
+to exist. The system driver is persistent and it can never be unloaded, though
+it may become inactive.
+
+The second type has to be explicitly loaded and unloaded. This will be called
+'modular driver' by this document. Multiple modular drivers can coexist at
+any time with each driver sharing the console with other drivers including
+the system driver. However, modular drivers cannot take over the console
+that is currently occupied by another modular driver. (Exception: Drivers that
+call do_take_over_console() will succeed in the takeover regardless of the type
+of driver occupying the consoles.) They can only take over the console that is
+occupied by the system driver. In the same token, if the modular driver is
+released by the console, the system driver will take over.
+
+Modular drivers, from the programmer's point of view, have to call::
+
+	 do_take_over_console() - load and bind driver to console layer
+	 give_up_console() - unload driver; it will only work if driver
+			     is fully unbound
+
+In newer kernels, the following are also available::
+
+	 do_register_con_driver()
+	 do_unregister_con_driver()
+
+If sysfs is enabled, the contents of /sys/class/vtconsole can be
+examined. This shows the console backends currently registered by the
+system which are named vtcon<n> where <n> is an integer from 0 to 15.
+Thus::
+
+       ls /sys/class/vtconsole
+       .  ..  vtcon0  vtcon1
+
+Each directory in /sys/class/vtconsole has 3 files::
+
+     ls /sys/class/vtconsole/vtcon0
+     .  ..  bind  name  uevent
+
+What do these files signify?
+
+     1. bind - this is a read/write file. It shows the status of the driver if
+        read, or acts to bind or unbind the driver to the virtual consoles
+        when written to. The possible values are:
+
+	0
+	  - means the driver is not bound and if echo'ed, commands the driver
+	    to unbind
+
+        1
+	  - means the driver is bound and if echo'ed, commands the driver to
+	    bind
+
+     2. name - read-only file. Shows the name of the driver in this format::
+
+	  cat /sys/class/vtconsole/vtcon0/name
+	  (S) VGA+
+
+	      '(S)' stands for a (S)ystem driver, i.e., it cannot be directly
+	      commanded to bind or unbind
+
+	      'VGA+' is the name of the driver
+
+	  cat /sys/class/vtconsole/vtcon1/name
+	  (M) frame buffer device
+
+	      In this case, '(M)' stands for a (M)odular driver, one that can be
+	      directly commanded to bind or unbind.
+
+     3. uevent - ignore this file
+
+When unbinding, the modular driver is detached first, and then the system
+driver takes over the consoles vacated by the driver. Binding, on the other
+hand, will bind the driver to the consoles that are currently occupied by a
+system driver.
+
+NOTE1:
+  Binding and unbinding must be selected in Kconfig. It's under::
+
+    Device Drivers ->
+	Character devices ->
+		Support for binding and unbinding console drivers
+
+NOTE2:
+  If any of the virtual consoles are in KD_GRAPHICS mode, then binding or
+  unbinding will not succeed. An example of an application that sets the
+  console to KD_GRAPHICS is X.
+
+How useful is this feature? This is very useful for console driver
+developers. By unbinding the driver from the console layer, one can unload the
+driver, make changes, recompile, reload and rebind the driver without any need
+for rebooting the kernel. For regular users who may want to switch from
+framebuffer console to VGA console and vice versa, this feature also makes
+this possible. (NOTE NOTE NOTE: Please read fbcon.txt under Documentation/fb
+for more details.)
+
+Notes for developers
+====================
+
+do_take_over_console() is now broken up into::
+
+     do_register_con_driver()
+     do_bind_con_driver() - private function
+
+give_up_console() is a wrapper to do_unregister_con_driver(), and a driver must
+be fully unbound for this call to succeed. con_is_bound() will check if the
+driver is bound or not.
+
+Guidelines for console driver writers
+=====================================
+
+In order for binding to and unbinding from the console to properly work,
+console drivers must follow these guidelines:
+
+1. All drivers, except system drivers, must call either do_register_con_driver()
+   or do_take_over_console(). do_register_con_driver() will just add the driver
+   to the console's internal list. It won't take over the
+   console. do_take_over_console(), as it name implies, will also take over (or
+   bind to) the console.
+
+2. All resources allocated during con->con_init() must be released in
+   con->con_deinit().
+
+3. All resources allocated in con->con_startup() must be released when the
+   driver, which was previously bound, becomes unbound.  The console layer
+   does not have a complementary call to con->con_startup() so it's up to the
+   driver to check when it's legal to release these resources. Calling
+   con_is_bound() in con->con_deinit() will help.  If the call returned
+   false(), then it's safe to release the resources.  This balance has to be
+   ensured because con->con_startup() can be called again when a request to
+   rebind the driver to the console arrives.
+
+4. Upon exit of the driver, ensure that the driver is totally unbound. If the
+   condition is satisfied, then the driver must call do_unregister_con_driver()
+   or give_up_console().
+
+5. do_unregister_con_driver() can also be called on conditions which make it
+   impossible for the driver to service console requests.  This can happen
+   with the framebuffer console that suddenly lost all of its drivers.
+
+The current crop of console drivers should still work correctly, but binding
+and unbinding them may cause problems. With minimal fixes, these drivers can
+be made to work correctly.
+
+Antonino Daplas <adaplas@pol.net>
diff --git a/Documentation/dcdbas.txt b/Documentation/driver-api/dcdbas.rst
index 309cc57a7c1c..309cc57a7c1c 100644
--- a/Documentation/dcdbas.txt
+++ b/Documentation/driver-api/dcdbas.rst
diff --git a/Documentation/dell_rbu.txt b/Documentation/driver-api/dell_rbu.rst
index 5d1ce7bcd04d..5d1ce7bcd04d 100644
--- a/Documentation/dell_rbu.txt
+++ b/Documentation/driver-api/dell_rbu.rst
diff --git a/Documentation/driver-api/dmaengine/dmatest.rst b/Documentation/driver-api/dmaengine/dmatest.rst
index e78d070bb468..ee268d445d38 100644
--- a/Documentation/driver-api/dmaengine/dmatest.rst
+++ b/Documentation/driver-api/dmaengine/dmatest.rst
@@ -44,7 +44,8 @@ Example of usage::
 
     dmatest.timeout=2000 dmatest.iterations=1 dmatest.channel=dma0chan0 dmatest.run=1
 
-Example of multi-channel test usage:
+Example of multi-channel test usage (new in the 5.0 kernel)::
+
     % modprobe dmatest
     % echo 2000 > /sys/module/dmatest/parameters/timeout
     % echo 1 > /sys/module/dmatest/parameters/iterations
@@ -53,15 +54,18 @@ Example of multi-channel test usage:
     % echo dma0chan2 > /sys/module/dmatest/parameters/channel
     % echo 1 > /sys/module/dmatest/parameters/run
 
-Note: the channel parameter should always be the last parameter set prior to
-running the test (setting run=1), this is because upon setting the channel
-parameter, that specific channel is requested using the dmaengine and a thread
-is created with the existing parameters. This thread is set as pending
-and will be executed once run is set to 1. Any parameters set after the thread
-is created are not applied.
+.. note::
+  For all tests, starting in the 5.0 kernel, either single- or multi-channel,
+  the channel parameter(s) must be set after all other parameters. It is at
+  that time that the existing parameter values are acquired for use by the
+  thread(s). All other parameters are shared. Therefore, if changes are made
+  to any of the other parameters, and an additional channel specified, the
+  (shared) parameters used for all threads will use the new values.
+  After the channels are specified, each thread is set as pending. All threads
+  begin execution when the run parameter is set to 1.
 
 .. hint::
-  available channel list could be extracted by running the following command::
+  A list of available channels can be found by running the following command::
 
     % ls -1 /sys/class/dma/
 
@@ -204,6 +208,7 @@ Releasing Channels
 Channels can be freed by setting run to 0.
 
 Example::
+
     % echo dma0chan1 > /sys/module/dmatest/parameters/channel
     dmatest: Added 1 threads using dma0chan1
     % cat /sys/class/dma/dma0chan1/in_use
diff --git a/Documentation/driver-api/driver-model/binding.rst b/Documentation/driver-api/driver-model/binding.rst
new file mode 100644
index 000000000000..7ea1d7a41e1d
--- /dev/null
+++ b/Documentation/driver-api/driver-model/binding.rst
@@ -0,0 +1,98 @@
+==============
+Driver Binding
+==============
+
+Driver binding is the process of associating a device with a device
+driver that can control it. Bus drivers have typically handled this
+because there have been bus-specific structures to represent the
+devices and the drivers. With generic device and device driver
+structures, most of the binding can take place using common code.
+
+
+Bus
+~~~
+
+The bus type structure contains a list of all devices that are on that bus
+type in the system. When device_register is called for a device, it is
+inserted into the end of this list. The bus object also contains a
+list of all drivers of that bus type. When driver_register is called
+for a driver, it is inserted at the end of this list. These are the
+two events which trigger driver binding.
+
+
+device_register
+~~~~~~~~~~~~~~~
+
+When a new device is added, the bus's list of drivers is iterated over
+to find one that supports it. In order to determine that, the device
+ID of the device must match one of the device IDs that the driver
+supports. The format and semantics for comparing IDs is bus-specific.
+Instead of trying to derive a complex state machine and matching
+algorithm, it is up to the bus driver to provide a callback to compare
+a device against the IDs of a driver. The bus returns 1 if a match was
+found; 0 otherwise.
+
+int match(struct device * dev, struct device_driver * drv);
+
+If a match is found, the device's driver field is set to the driver
+and the driver's probe callback is called. This gives the driver a
+chance to verify that it really does support the hardware, and that
+it's in a working state.
+
+Device Class
+~~~~~~~~~~~~
+
+Upon the successful completion of probe, the device is registered with
+the class to which it belongs. Device drivers belong to one and only one
+class, and that is set in the driver's devclass field.
+devclass_add_device is called to enumerate the device within the class
+and actually register it with the class, which happens with the
+class's register_dev callback.
+
+
+Driver
+~~~~~~
+
+When a driver is attached to a device, the device is inserted into the
+driver's list of devices.
+
+
+sysfs
+~~~~~
+
+A symlink is created in the bus's 'devices' directory that points to
+the device's directory in the physical hierarchy.
+
+A symlink is created in the driver's 'devices' directory that points
+to the device's directory in the physical hierarchy.
+
+A directory for the device is created in the class's directory. A
+symlink is created in that directory that points to the device's
+physical location in the sysfs tree.
+
+A symlink can be created (though this isn't done yet) in the device's
+physical directory to either its class directory, or the class's
+top-level directory. One can also be created to point to its driver's
+directory also.
+
+
+driver_register
+~~~~~~~~~~~~~~~
+
+The process is almost identical for when a new driver is added.
+The bus's list of devices is iterated over to find a match. Devices
+that already have a driver are skipped. All the devices are iterated
+over, to bind as many devices as possible to the driver.
+
+
+Removal
+~~~~~~~
+
+When a device is removed, the reference count for it will eventually
+go to 0. When it does, the remove callback of the driver is called. It
+is removed from the driver's list of devices and the reference count
+of the driver is decremented. All symlinks between the two are removed.
+
+When a driver is removed, the list of devices that it supports is
+iterated over, and the driver's remove callback is called for each
+one. The device is removed from that list and the symlinks removed.
diff --git a/Documentation/driver-api/driver-model/bus.rst b/Documentation/driver-api/driver-model/bus.rst
new file mode 100644
index 000000000000..016b15a6e8ea
--- /dev/null
+++ b/Documentation/driver-api/driver-model/bus.rst
@@ -0,0 +1,146 @@
+=========
+Bus Types
+=========
+
+Definition
+~~~~~~~~~~
+See the kerneldoc for the struct bus_type.
+
+int bus_register(struct bus_type * bus);
+
+
+Declaration
+~~~~~~~~~~~
+
+Each bus type in the kernel (PCI, USB, etc) should declare one static
+object of this type. They must initialize the name field, and may
+optionally initialize the match callback::
+
+   struct bus_type pci_bus_type = {
+          .name	= "pci",
+          .match	= pci_bus_match,
+   };
+
+The structure should be exported to drivers in a header file:
+
+extern struct bus_type pci_bus_type;
+
+
+Registration
+~~~~~~~~~~~~
+
+When a bus driver is initialized, it calls bus_register. This
+initializes the rest of the fields in the bus object and inserts it
+into a global list of bus types. Once the bus object is registered,
+the fields in it are usable by the bus driver.
+
+
+Callbacks
+~~~~~~~~~
+
+match(): Attaching Drivers to Devices
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The format of device ID structures and the semantics for comparing
+them are inherently bus-specific. Drivers typically declare an array
+of device IDs of devices they support that reside in a bus-specific
+driver structure.
+
+The purpose of the match callback is to give the bus an opportunity to
+determine if a particular driver supports a particular device by
+comparing the device IDs the driver supports with the device ID of a
+particular device, without sacrificing bus-specific functionality or
+type-safety.
+
+When a driver is registered with the bus, the bus's list of devices is
+iterated over, and the match callback is called for each device that
+does not have a driver associated with it.
+
+
+
+Device and Driver Lists
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The lists of devices and drivers are intended to replace the local
+lists that many buses keep. They are lists of struct devices and
+struct device_drivers, respectively. Bus drivers are free to use the
+lists as they please, but conversion to the bus-specific type may be
+necessary.
+
+The LDM core provides helper functions for iterating over each list::
+
+  int bus_for_each_dev(struct bus_type * bus, struct device * start,
+		       void * data,
+		       int (*fn)(struct device *, void *));
+
+  int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
+		       void * data, int (*fn)(struct device_driver *, void *));
+
+These helpers iterate over the respective list, and call the callback
+for each device or driver in the list. All list accesses are
+synchronized by taking the bus's lock (read currently). The reference
+count on each object in the list is incremented before the callback is
+called; it is decremented after the next object has been obtained. The
+lock is not held when calling the callback.
+
+
+sysfs
+~~~~~~~~
+There is a top-level directory named 'bus'.
+
+Each bus gets a directory in the bus directory, along with two default
+directories::
+
+	/sys/bus/pci/
+	|-- devices
+	`-- drivers
+
+Drivers registered with the bus get a directory in the bus's drivers
+directory::
+
+	/sys/bus/pci/
+	|-- devices
+	`-- drivers
+	    |-- Intel ICH
+	    |-- Intel ICH Joystick
+	    |-- agpgart
+	    `-- e100
+
+Each device that is discovered on a bus of that type gets a symlink in
+the bus's devices directory to the device's directory in the physical
+hierarchy::
+
+	/sys/bus/pci/
+	|-- devices
+	|   |-- 00:00.0 -> ../../../root/pci0/00:00.0
+	|   |-- 00:01.0 -> ../../../root/pci0/00:01.0
+	|   `-- 00:02.0 -> ../../../root/pci0/00:02.0
+	`-- drivers
+
+
+Exporting Attributes
+~~~~~~~~~~~~~~~~~~~~
+
+::
+
+  struct bus_attribute {
+	struct attribute	attr;
+	ssize_t (*show)(struct bus_type *, char * buf);
+	ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
+  };
+
+Bus drivers can export attributes using the BUS_ATTR_RW macro that works
+similarly to the DEVICE_ATTR_RW macro for devices. For example, a
+definition like this::
+
+	static BUS_ATTR_RW(debug);
+
+is equivalent to declaring::
+
+	static bus_attribute bus_attr_debug;
+
+This can then be used to add and remove the attribute from the bus's
+sysfs directory using::
+
+	int bus_create_file(struct bus_type *, struct bus_attribute *);
+	void bus_remove_file(struct bus_type *, struct bus_attribute *);
diff --git a/Documentation/driver-api/driver-model/class.rst b/Documentation/driver-api/driver-model/class.rst
new file mode 100644
index 000000000000..fff55b80e86a
--- /dev/null
+++ b/Documentation/driver-api/driver-model/class.rst
@@ -0,0 +1,149 @@
+==============
+Device Classes
+==============
+
+Introduction
+~~~~~~~~~~~~
+A device class describes a type of device, like an audio or network
+device. The following device classes have been identified:
+
+<Insert List of Device Classes Here>
+
+
+Each device class defines a set of semantics and a programming interface
+that devices of that class adhere to. Device drivers are the
+implementation of that programming interface for a particular device on
+a particular bus.
+
+Device classes are agnostic with respect to what bus a device resides
+on.
+
+
+Programming Interface
+~~~~~~~~~~~~~~~~~~~~~
+The device class structure looks like::
+
+
+  typedef int (*devclass_add)(struct device *);
+  typedef void (*devclass_remove)(struct device *);
+
+See the kerneldoc for the struct class.
+
+A typical device class definition would look like::
+
+  struct device_class input_devclass = {
+        .name		= "input",
+        .add_device	= input_add_device,
+	.remove_device	= input_remove_device,
+  };
+
+Each device class structure should be exported in a header file so it
+can be used by drivers, extensions and interfaces.
+
+Device classes are registered and unregistered with the core using::
+
+  int devclass_register(struct device_class * cls);
+  void devclass_unregister(struct device_class * cls);
+
+
+Devices
+~~~~~~~
+As devices are bound to drivers, they are added to the device class
+that the driver belongs to. Before the driver model core, this would
+typically happen during the driver's probe() callback, once the device
+has been initialized. It now happens after the probe() callback
+finishes from the core.
+
+The device is enumerated in the class. Each time a device is added to
+the class, the class's devnum field is incremented and assigned to the
+device. The field is never decremented, so if the device is removed
+from the class and re-added, it will receive a different enumerated
+value.
+
+The class is allowed to create a class-specific structure for the
+device and store it in the device's class_data pointer.
+
+There is no list of devices in the device class. Each driver has a
+list of devices that it supports. The device class has a list of
+drivers of that particular class. To access all of the devices in the
+class, iterate over the device lists of each driver in the class.
+
+
+Device Drivers
+~~~~~~~~~~~~~~
+Device drivers are added to device classes when they are registered
+with the core. A driver specifies the class it belongs to by setting
+the struct device_driver::devclass field.
+
+
+sysfs directory structure
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+There is a top-level sysfs directory named 'class'.
+
+Each class gets a directory in the class directory, along with two
+default subdirectories::
+
+        class/
+        `-- input
+            |-- devices
+            `-- drivers
+
+
+Drivers registered with the class get a symlink in the drivers/ directory
+that points to the driver's directory (under its bus directory)::
+
+   class/
+   `-- input
+       |-- devices
+       `-- drivers
+           `-- usb:usb_mouse -> ../../../bus/drivers/usb_mouse/
+
+
+Each device gets a symlink in the devices/ directory that points to the
+device's directory in the physical hierarchy::
+
+   class/
+   `-- input
+       |-- devices
+       |   `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/
+       `-- drivers
+
+
+Exporting Attributes
+~~~~~~~~~~~~~~~~~~~~
+
+::
+
+  struct devclass_attribute {
+        struct attribute        attr;
+        ssize_t (*show)(struct device_class *, char * buf, size_t count, loff_t off);
+        ssize_t (*store)(struct device_class *, const char * buf, size_t count, loff_t off);
+  };
+
+Class drivers can export attributes using the DEVCLASS_ATTR macro that works
+similarly to the DEVICE_ATTR macro for devices. For example, a definition
+like this::
+
+  static DEVCLASS_ATTR(debug,0644,show_debug,store_debug);
+
+is equivalent to declaring::
+
+  static devclass_attribute devclass_attr_debug;
+
+The bus driver can add and remove the attribute from the class's
+sysfs directory using::
+
+  int devclass_create_file(struct device_class *, struct devclass_attribute *);
+  void devclass_remove_file(struct device_class *, struct devclass_attribute *);
+
+In the example above, the file will be named 'debug' in placed in the
+class's directory in sysfs.
+
+
+Interfaces
+~~~~~~~~~~
+There may exist multiple mechanisms for accessing the same device of a
+particular class type. Device interfaces describe these mechanisms.
+
+When a device is added to a device class, the core attempts to add it
+to every interface that is registered with the device class.
diff --git a/Documentation/driver-api/driver-model/design-patterns.rst b/Documentation/driver-api/driver-model/design-patterns.rst
new file mode 100644
index 000000000000..41eb8f41f7dd
--- /dev/null
+++ b/Documentation/driver-api/driver-model/design-patterns.rst
@@ -0,0 +1,116 @@
+=============================
+Device Driver Design Patterns
+=============================
+
+This document describes a few common design patterns found in device drivers.
+It is likely that subsystem maintainers will ask driver developers to
+conform to these design patterns.
+
+1. State Container
+2. container_of()
+
+
+1. State Container
+~~~~~~~~~~~~~~~~~~
+
+While the kernel contains a few device drivers that assume that they will
+only be probed() once on a certain system (singletons), it is custom to assume
+that the device the driver binds to will appear in several instances. This
+means that the probe() function and all callbacks need to be reentrant.
+
+The most common way to achieve this is to use the state container design
+pattern. It usually has this form::
+
+  struct foo {
+      spinlock_t lock; /* Example member */
+      (...)
+  };
+
+  static int foo_probe(...)
+  {
+      struct foo *foo;
+
+      foo = devm_kzalloc(dev, sizeof(*foo), GFP_KERNEL);
+      if (!foo)
+          return -ENOMEM;
+      spin_lock_init(&foo->lock);
+      (...)
+  }
+
+This will create an instance of struct foo in memory every time probe() is
+called. This is our state container for this instance of the device driver.
+Of course it is then necessary to always pass this instance of the
+state around to all functions that need access to the state and its members.
+
+For example, if the driver is registering an interrupt handler, you would
+pass around a pointer to struct foo like this::
+
+  static irqreturn_t foo_handler(int irq, void *arg)
+  {
+      struct foo *foo = arg;
+      (...)
+  }
+
+  static int foo_probe(...)
+  {
+      struct foo *foo;
+
+      (...)
+      ret = request_irq(irq, foo_handler, 0, "foo", foo);
+  }
+
+This way you always get a pointer back to the correct instance of foo in
+your interrupt handler.
+
+
+2. container_of()
+~~~~~~~~~~~~~~~~~
+
+Continuing on the above example we add an offloaded work::
+
+  struct foo {
+      spinlock_t lock;
+      struct workqueue_struct *wq;
+      struct work_struct offload;
+      (...)
+  };
+
+  static void foo_work(struct work_struct *work)
+  {
+      struct foo *foo = container_of(work, struct foo, offload);
+
+      (...)
+  }
+
+  static irqreturn_t foo_handler(int irq, void *arg)
+  {
+      struct foo *foo = arg;
+
+      queue_work(foo->wq, &foo->offload);
+      (...)
+  }
+
+  static int foo_probe(...)
+  {
+      struct foo *foo;
+
+      foo->wq = create_singlethread_workqueue("foo-wq");
+      INIT_WORK(&foo->offload, foo_work);
+      (...)
+  }
+
+The design pattern is the same for an hrtimer or something similar that will
+return a single argument which is a pointer to a struct member in the
+callback.
+
+container_of() is a macro defined in <linux/kernel.h>
+
+What container_of() does is to obtain a pointer to the containing struct from
+a pointer to a member by a simple subtraction using the offsetof() macro from
+standard C, which allows something similar to object oriented behaviours.
+Notice that the contained member must not be a pointer, but an actual member
+for this to work.
+
+We can see here that we avoid having global pointers to our struct foo *
+instance this way, while still keeping the number of parameters passed to the
+work function to a single pointer.
diff --git a/Documentation/driver-api/driver-model/device.rst b/Documentation/driver-api/driver-model/device.rst
new file mode 100644
index 000000000000..2b868d49d349
--- /dev/null
+++ b/Documentation/driver-api/driver-model/device.rst
@@ -0,0 +1,109 @@
+==========================
+The Basic Device Structure
+==========================
+
+See the kerneldoc for the struct device.
+
+
+Programming Interface
+~~~~~~~~~~~~~~~~~~~~~
+The bus driver that discovers the device uses this to register the
+device with the core::
+
+  int device_register(struct device * dev);
+
+The bus should initialize the following fields:
+
+    - parent
+    - name
+    - bus_id
+    - bus
+
+A device is removed from the core when its reference count goes to
+0. The reference count can be adjusted using::
+
+  struct device * get_device(struct device * dev);
+  void put_device(struct device * dev);
+
+get_device() will return a pointer to the struct device passed to it
+if the reference is not already 0 (if it's in the process of being
+removed already).
+
+A driver can access the lock in the device structure using::
+
+  void lock_device(struct device * dev);
+  void unlock_device(struct device * dev);
+
+
+Attributes
+~~~~~~~~~~
+
+::
+
+  struct device_attribute {
+	struct attribute	attr;
+	ssize_t (*show)(struct device *dev, struct device_attribute *attr,
+			char *buf);
+	ssize_t (*store)(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t count);
+  };
+
+Attributes of devices can be exported by a device driver through sysfs.
+
+Please see Documentation/filesystems/sysfs.txt for more information
+on how sysfs works.
+
+As explained in Documentation/kobject.txt, device attributes must be
+created before the KOBJ_ADD uevent is generated. The only way to realize
+that is by defining an attribute group.
+
+Attributes are declared using a macro called DEVICE_ATTR::
+
+  #define DEVICE_ATTR(name,mode,show,store)
+
+Example:::
+
+  static DEVICE_ATTR(type, 0444, show_type, NULL);
+  static DEVICE_ATTR(power, 0644, show_power, store_power);
+
+This declares two structures of type struct device_attribute with respective
+names 'dev_attr_type' and 'dev_attr_power'. These two attributes can be
+organized as follows into a group::
+
+  static struct attribute *dev_attrs[] = {
+	&dev_attr_type.attr,
+	&dev_attr_power.attr,
+	NULL,
+  };
+
+  static struct attribute_group dev_attr_group = {
+	.attrs = dev_attrs,
+  };
+
+  static const struct attribute_group *dev_attr_groups[] = {
+	&dev_attr_group,
+	NULL,
+  };
+
+This array of groups can then be associated with a device by setting the
+group pointer in struct device before device_register() is invoked::
+
+        dev->groups = dev_attr_groups;
+        device_register(dev);
+
+The device_register() function will use the 'groups' pointer to create the
+device attributes and the device_unregister() function will use this pointer
+to remove the device attributes.
+
+Word of warning:  While the kernel allows device_create_file() and
+device_remove_file() to be called on a device at any time, userspace has
+strict expectations on when attributes get created.  When a new device is
+registered in the kernel, a uevent is generated to notify userspace (like
+udev) that a new device is available.  If attributes are added after the
+device is registered, then userspace won't get notified and userspace will
+not know about the new attributes.
+
+This is important for device driver that need to publish additional
+attributes for a device at driver probe time.  If the device driver simply
+calls device_create_file() on the device structure passed to it, then
+userspace will never be notified of the new attributes.
diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
new file mode 100644
index 000000000000..a100bef54952
--- /dev/null
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -0,0 +1,418 @@
+================================
+Devres - Managed Device Resource
+================================
+
+Tejun Heo	<teheo@suse.de>
+
+First draft	10 January 2007
+
+.. contents
+
+   1. Intro			: Huh? Devres?
+   2. Devres			: Devres in a nutshell
+   3. Devres Group		: Group devres'es and release them together
+   4. Details			: Life time rules, calling context, ...
+   5. Overhead			: How much do we have to pay for this?
+   6. List of managed interfaces: Currently implemented managed interfaces
+
+
+1. Intro
+--------
+
+devres came up while trying to convert libata to use iomap.  Each
+iomapped address should be kept and unmapped on driver detach.  For
+example, a plain SFF ATA controller (that is, good old PCI IDE) in
+native mode makes use of 5 PCI BARs and all of them should be
+maintained.
+
+As with many other device drivers, libata low level drivers have
+sufficient bugs in ->remove and ->probe failure path.  Well, yes,
+that's probably because libata low level driver developers are lazy
+bunch, but aren't all low level driver developers?  After spending a
+day fiddling with braindamaged hardware with no document or
+braindamaged document, if it's finally working, well, it's working.
+
+For one reason or another, low level drivers don't receive as much
+attention or testing as core code, and bugs on driver detach or
+initialization failure don't happen often enough to be noticeable.
+Init failure path is worse because it's much less travelled while
+needs to handle multiple entry points.
+
+So, many low level drivers end up leaking resources on driver detach
+and having half broken failure path implementation in ->probe() which
+would leak resources or even cause oops when failure occurs.  iomap
+adds more to this mix.  So do msi and msix.
+
+
+2. Devres
+---------
+
+devres is basically linked list of arbitrarily sized memory areas
+associated with a struct device.  Each devres entry is associated with
+a release function.  A devres can be released in several ways.  No
+matter what, all devres entries are released on driver detach.  On
+release, the associated release function is invoked and then the
+devres entry is freed.
+
+Managed interface is created for resources commonly used by device
+drivers using devres.  For example, coherent DMA memory is acquired
+using dma_alloc_coherent().  The managed version is called
+dmam_alloc_coherent().  It is identical to dma_alloc_coherent() except
+for the DMA memory allocated using it is managed and will be
+automatically released on driver detach.  Implementation looks like
+the following::
+
+  struct dma_devres {
+	size_t		size;
+	void		*vaddr;
+	dma_addr_t	dma_handle;
+  };
+
+  static void dmam_coherent_release(struct device *dev, void *res)
+  {
+	struct dma_devres *this = res;
+
+	dma_free_coherent(dev, this->size, this->vaddr, this->dma_handle);
+  }
+
+  dmam_alloc_coherent(dev, size, dma_handle, gfp)
+  {
+	struct dma_devres *dr;
+	void *vaddr;
+
+	dr = devres_alloc(dmam_coherent_release, sizeof(*dr), gfp);
+	...
+
+	/* alloc DMA memory as usual */
+	vaddr = dma_alloc_coherent(...);
+	...
+
+	/* record size, vaddr, dma_handle in dr */
+	dr->vaddr = vaddr;
+	...
+
+	devres_add(dev, dr);
+
+	return vaddr;
+  }
+
+If a driver uses dmam_alloc_coherent(), the area is guaranteed to be
+freed whether initialization fails half-way or the device gets
+detached.  If most resources are acquired using managed interface, a
+driver can have much simpler init and exit code.  Init path basically
+looks like the following::
+
+  my_init_one()
+  {
+	struct mydev *d;
+
+	d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	d->ring = dmam_alloc_coherent(...);
+	if (!d->ring)
+		return -ENOMEM;
+
+	if (check something)
+		return -EINVAL;
+	...
+
+	return register_to_upper_layer(d);
+  }
+
+And exit path::
+
+  my_remove_one()
+  {
+	unregister_from_upper_layer(d);
+	shutdown_my_hardware();
+  }
+
+As shown above, low level drivers can be simplified a lot by using
+devres.  Complexity is shifted from less maintained low level drivers
+to better maintained higher layer.  Also, as init failure path is
+shared with exit path, both can get more testing.
+
+Note though that when converting current calls or assignments to
+managed devm_* versions it is up to you to check if internal operations
+like allocating memory, have failed. Managed resources pertains to the
+freeing of these resources *only* - all other checks needed are still
+on you. In some cases this may mean introducing checks that were not
+necessary before moving to the managed devm_* calls.
+
+
+3. Devres group
+---------------
+
+Devres entries can be grouped using devres group.  When a group is
+released, all contained normal devres entries and properly nested
+groups are released.  One usage is to rollback series of acquired
+resources on failure.  For example::
+
+  if (!devres_open_group(dev, NULL, GFP_KERNEL))
+	return -ENOMEM;
+
+  acquire A;
+  if (failed)
+	goto err;
+
+  acquire B;
+  if (failed)
+	goto err;
+  ...
+
+  devres_remove_group(dev, NULL);
+  return 0;
+
+ err:
+  devres_release_group(dev, NULL);
+  return err_code;
+
+As resource acquisition failure usually means probe failure, constructs
+like above are usually useful in midlayer driver (e.g. libata core
+layer) where interface function shouldn't have side effect on failure.
+For LLDs, just returning error code suffices in most cases.
+
+Each group is identified by `void *id`.  It can either be explicitly
+specified by @id argument to devres_open_group() or automatically
+created by passing NULL as @id as in the above example.  In both
+cases, devres_open_group() returns the group's id.  The returned id
+can be passed to other devres functions to select the target group.
+If NULL is given to those functions, the latest open group is
+selected.
+
+For example, you can do something like the following::
+
+  int my_midlayer_create_something()
+  {
+	if (!devres_open_group(dev, my_midlayer_create_something, GFP_KERNEL))
+		return -ENOMEM;
+
+	...
+
+	devres_close_group(dev, my_midlayer_create_something);
+	return 0;
+  }
+
+  void my_midlayer_destroy_something()
+  {
+	devres_release_group(dev, my_midlayer_create_something);
+  }
+
+
+4. Details
+----------
+
+Lifetime of a devres entry begins on devres allocation and finishes
+when it is released or destroyed (removed and freed) - no reference
+counting.
+
+devres core guarantees atomicity to all basic devres operations and
+has support for single-instance devres types (atomic
+lookup-and-add-if-not-found).  Other than that, synchronizing
+concurrent accesses to allocated devres data is caller's
+responsibility.  This is usually non-issue because bus ops and
+resource allocations already do the job.
+
+For an example of single-instance devres type, read pcim_iomap_table()
+in lib/devres.c.
+
+All devres interface functions can be called without context if the
+right gfp mask is given.
+
+
+5. Overhead
+-----------
+
+Each devres bookkeeping info is allocated together with requested data
+area.  With debug option turned off, bookkeeping info occupies 16
+bytes on 32bit machines and 24 bytes on 64bit (three pointers rounded
+up to ull alignment).  If singly linked list is used, it can be
+reduced to two pointers (8 bytes on 32bit, 16 bytes on 64bit).
+
+Each devres group occupies 8 pointers.  It can be reduced to 6 if
+singly linked list is used.
+
+Memory space overhead on ahci controller with two ports is between 300
+and 400 bytes on 32bit machine after naive conversion (we can
+certainly invest a bit more effort into libata core layer).
+
+
+6. List of managed interfaces
+-----------------------------
+
+CLOCK
+  devm_clk_get()
+  devm_clk_get_optional()
+  devm_clk_put()
+  devm_clk_bulk_get()
+  devm_clk_bulk_get_all()
+  devm_clk_bulk_get_optional()
+  devm_get_clk_from_childl()
+  devm_clk_hw_register()
+  devm_of_clk_add_hw_provider()
+  devm_clk_hw_register_clkdev()
+
+DMA
+  dmaenginem_async_device_register()
+  dmam_alloc_coherent()
+  dmam_alloc_attrs()
+  dmam_free_coherent()
+  dmam_pool_create()
+  dmam_pool_destroy()
+
+DRM
+  devm_drm_dev_init()
+
+GPIO
+  devm_gpiod_get()
+  devm_gpiod_get_index()
+  devm_gpiod_get_index_optional()
+  devm_gpiod_get_optional()
+  devm_gpiod_put()
+  devm_gpiod_unhinge()
+  devm_gpiochip_add_data()
+  devm_gpio_request()
+  devm_gpio_request_one()
+  devm_gpio_free()
+
+I2C
+  devm_i2c_new_dummy_device()
+
+IIO
+  devm_iio_device_alloc()
+  devm_iio_device_free()
+  devm_iio_device_register()
+  devm_iio_device_unregister()
+  devm_iio_kfifo_allocate()
+  devm_iio_kfifo_free()
+  devm_iio_triggered_buffer_setup()
+  devm_iio_triggered_buffer_cleanup()
+  devm_iio_trigger_alloc()
+  devm_iio_trigger_free()
+  devm_iio_trigger_register()
+  devm_iio_trigger_unregister()
+  devm_iio_channel_get()
+  devm_iio_channel_release()
+  devm_iio_channel_get_all()
+  devm_iio_channel_release_all()
+
+INPUT
+  devm_input_allocate_device()
+
+IO region
+  devm_release_mem_region()
+  devm_release_region()
+  devm_release_resource()
+  devm_request_mem_region()
+  devm_request_region()
+  devm_request_resource()
+
+IOMAP
+  devm_ioport_map()
+  devm_ioport_unmap()
+  devm_ioremap()
+  devm_ioremap_nocache()
+  devm_ioremap_wc()
+  devm_ioremap_resource() : checks resource, requests memory region, ioremaps
+  devm_iounmap()
+  pcim_iomap()
+  pcim_iomap_regions()	: do request_region() and iomap() on multiple BARs
+  pcim_iomap_table()	: array of mapped addresses indexed by BAR
+  pcim_iounmap()
+
+IRQ
+  devm_free_irq()
+  devm_request_any_context_irq()
+  devm_request_irq()
+  devm_request_threaded_irq()
+  devm_irq_alloc_descs()
+  devm_irq_alloc_desc()
+  devm_irq_alloc_desc_at()
+  devm_irq_alloc_desc_from()
+  devm_irq_alloc_descs_from()
+  devm_irq_alloc_generic_chip()
+  devm_irq_setup_generic_chip()
+  devm_irq_sim_init()
+
+LED
+  devm_led_classdev_register()
+  devm_led_classdev_unregister()
+
+MDIO
+  devm_mdiobus_alloc()
+  devm_mdiobus_alloc_size()
+  devm_mdiobus_free()
+
+MEM
+  devm_free_pages()
+  devm_get_free_pages()
+  devm_kasprintf()
+  devm_kcalloc()
+  devm_kfree()
+  devm_kmalloc()
+  devm_kmalloc_array()
+  devm_kmemdup()
+  devm_kstrdup()
+  devm_kvasprintf()
+  devm_kzalloc()
+
+MFD
+  devm_mfd_add_devices()
+
+MUX
+  devm_mux_chip_alloc()
+  devm_mux_chip_register()
+  devm_mux_control_get()
+
+PER-CPU MEM
+  devm_alloc_percpu()
+  devm_free_percpu()
+
+PCI
+  devm_pci_alloc_host_bridge()  : managed PCI host bridge allocation
+  devm_pci_remap_cfgspace()	: ioremap PCI configuration space
+  devm_pci_remap_cfg_resource()	: ioremap PCI configuration space resource
+  pcim_enable_device()		: after success, all PCI ops become managed
+  pcim_pin_device()		: keep PCI device enabled after release
+
+PHY
+  devm_usb_get_phy()
+  devm_usb_put_phy()
+
+PINCTRL
+  devm_pinctrl_get()
+  devm_pinctrl_put()
+  devm_pinctrl_register()
+  devm_pinctrl_unregister()
+
+POWER
+  devm_reboot_mode_register()
+  devm_reboot_mode_unregister()
+
+PWM
+  devm_pwm_get()
+  devm_pwm_put()
+
+REGULATOR
+  devm_regulator_bulk_get()
+  devm_regulator_get()
+  devm_regulator_put()
+  devm_regulator_register()
+
+RESET
+  devm_reset_control_get()
+  devm_reset_controller_register()
+
+SERDEV
+  devm_serdev_device_open()
+
+SLAVE DMA ENGINE
+  devm_acpi_dma_controller_register()
+
+SPI
+  devm_spi_register_master()
+
+WATCHDOG
+  devm_watchdog_register_device()
diff --git a/Documentation/driver-api/driver-model/driver.rst b/Documentation/driver-api/driver-model/driver.rst
new file mode 100644
index 000000000000..11d281506a04
--- /dev/null
+++ b/Documentation/driver-api/driver-model/driver.rst
@@ -0,0 +1,223 @@
+==============
+Device Drivers
+==============
+
+See the kerneldoc for the struct device_driver.
+
+
+Allocation
+~~~~~~~~~~
+
+Device drivers are statically allocated structures. Though there may
+be multiple devices in a system that a driver supports, struct
+device_driver represents the driver as a whole (not a particular
+device instance).
+
+Initialization
+~~~~~~~~~~~~~~
+
+The driver must initialize at least the name and bus fields. It should
+also initialize the devclass field (when it arrives), so it may obtain
+the proper linkage internally. It should also initialize as many of
+the callbacks as possible, though each is optional.
+
+Declaration
+~~~~~~~~~~~
+
+As stated above, struct device_driver objects are statically
+allocated. Below is an example declaration of the eepro100
+driver. This declaration is hypothetical only; it relies on the driver
+being converted completely to the new model::
+
+  static struct device_driver eepro100_driver = {
+         .name		= "eepro100",
+         .bus		= &pci_bus_type,
+
+         .probe		= eepro100_probe,
+         .remove		= eepro100_remove,
+         .suspend		= eepro100_suspend,
+         .resume		= eepro100_resume,
+  };
+
+Most drivers will not be able to be converted completely to the new
+model because the bus they belong to has a bus-specific structure with
+bus-specific fields that cannot be generalized.
+
+The most common example of this are device ID structures. A driver
+typically defines an array of device IDs that it supports. The format
+of these structures and the semantics for comparing device IDs are
+completely bus-specific. Defining them as bus-specific entities would
+sacrifice type-safety, so we keep bus-specific structures around.
+
+Bus-specific drivers should include a generic struct device_driver in
+the definition of the bus-specific driver. Like this::
+
+  struct pci_driver {
+         const struct pci_device_id *id_table;
+         struct device_driver	  driver;
+  };
+
+A definition that included bus-specific fields would look like
+(using the eepro100 driver again)::
+
+  static struct pci_driver eepro100_driver = {
+         .id_table       = eepro100_pci_tbl,
+         .driver	       = {
+		.name		= "eepro100",
+		.bus		= &pci_bus_type,
+		.probe		= eepro100_probe,
+		.remove		= eepro100_remove,
+		.suspend	= eepro100_suspend,
+		.resume		= eepro100_resume,
+         },
+  };
+
+Some may find the syntax of embedded struct initialization awkward or
+even a bit ugly. So far, it's the best way we've found to do what we want...
+
+Registration
+~~~~~~~~~~~~
+
+::
+
+  int driver_register(struct device_driver *drv);
+
+The driver registers the structure on startup. For drivers that have
+no bus-specific fields (i.e. don't have a bus-specific driver
+structure), they would use driver_register and pass a pointer to their
+struct device_driver object.
+
+Most drivers, however, will have a bus-specific structure and will
+need to register with the bus using something like pci_driver_register.
+
+It is important that drivers register their driver structure as early as
+possible. Registration with the core initializes several fields in the
+struct device_driver object, including the reference count and the
+lock. These fields are assumed to be valid at all times and may be
+used by the device model core or the bus driver.
+
+
+Transition Bus Drivers
+~~~~~~~~~~~~~~~~~~~~~~
+
+By defining wrapper functions, the transition to the new model can be
+made easier. Drivers can ignore the generic structure altogether and
+let the bus wrapper fill in the fields. For the callbacks, the bus can
+define generic callbacks that forward the call to the bus-specific
+callbacks of the drivers.
+
+This solution is intended to be only temporary. In order to get class
+information in the driver, the drivers must be modified anyway. Since
+converting drivers to the new model should reduce some infrastructural
+complexity and code size, it is recommended that they are converted as
+class information is added.
+
+Access
+~~~~~~
+
+Once the object has been registered, it may access the common fields of
+the object, like the lock and the list of devices::
+
+  int driver_for_each_dev(struct device_driver *drv, void *data,
+			  int (*callback)(struct device *dev, void *data));
+
+The devices field is a list of all the devices that have been bound to
+the driver. The LDM core provides a helper function to operate on all
+the devices a driver controls. This helper locks the driver on each
+node access, and does proper reference counting on each device as it
+accesses it.
+
+
+sysfs
+~~~~~
+
+When a driver is registered, a sysfs directory is created in its
+bus's directory. In this directory, the driver can export an interface
+to userspace to control operation of the driver on a global basis;
+e.g. toggling debugging output in the driver.
+
+A future feature of this directory will be a 'devices' directory. This
+directory will contain symlinks to the directories of devices it
+supports.
+
+
+
+Callbacks
+~~~~~~~~~
+
+::
+
+	int	(*probe)	(struct device *dev);
+
+The probe() entry is called in task context, with the bus's rwsem locked
+and the driver partially bound to the device.  Drivers commonly use
+container_of() to convert "dev" to a bus-specific type, both in probe()
+and other routines.  That type often provides device resource data, such
+as pci_dev.resource[] or platform_device.resources, which is used in
+addition to dev->platform_data to initialize the driver.
+
+This callback holds the driver-specific logic to bind the driver to a
+given device.  That includes verifying that the device is present, that
+it's a version the driver can handle, that driver data structures can
+be allocated and initialized, and that any hardware can be initialized.
+Drivers often store a pointer to their state with dev_set_drvdata().
+When the driver has successfully bound itself to that device, then probe()
+returns zero and the driver model code will finish its part of binding
+the driver to that device.
+
+A driver's probe() may return a negative errno value to indicate that
+the driver did not bind to this device, in which case it should have
+released all resources it allocated::
+
+	int 	(*remove)	(struct device *dev);
+
+remove is called to unbind a driver from a device. This may be
+called if a device is physically removed from the system, if the
+driver module is being unloaded, during a reboot sequence, or
+in other cases.
+
+It is up to the driver to determine if the device is present or
+not. It should free any resources allocated specifically for the
+device; i.e. anything in the device's driver_data field.
+
+If the device is still present, it should quiesce the device and place
+it into a supported low-power state::
+
+	int	(*suspend)	(struct device *dev, pm_message_t state);
+
+suspend is called to put the device in a low power state::
+
+	int	(*resume)	(struct device *dev);
+
+Resume is used to bring a device back from a low power state.
+
+
+Attributes
+~~~~~~~~~~
+
+::
+
+  struct driver_attribute {
+          struct attribute        attr;
+          ssize_t (*show)(struct device_driver *driver, char *buf);
+          ssize_t (*store)(struct device_driver *, const char *buf, size_t count);
+  };
+
+Device drivers can export attributes via their sysfs directories.
+Drivers can declare attributes using a DRIVER_ATTR_RW and DRIVER_ATTR_RO
+macro that works identically to the DEVICE_ATTR_RW and DEVICE_ATTR_RO
+macros.
+
+Example::
+
+	DRIVER_ATTR_RW(debug);
+
+This is equivalent to declaring::
+
+	struct driver_attribute driver_attr_debug;
+
+This can then be used to add and remove the attribute from the
+driver's directory using::
+
+  int driver_create_file(struct device_driver *, const struct driver_attribute *);
+  void driver_remove_file(struct device_driver *, const struct driver_attribute *);
diff --git a/Documentation/driver-api/driver-model/index.rst b/Documentation/driver-api/driver-model/index.rst
new file mode 100644
index 000000000000..755016422269
--- /dev/null
+++ b/Documentation/driver-api/driver-model/index.rst
@@ -0,0 +1,24 @@
+============
+Driver Model
+============
+
+.. toctree::
+   :maxdepth: 1
+
+   binding
+   bus
+   class
+   design-patterns
+   device
+   devres
+   driver
+   overview
+   platform
+   porting
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/driver-api/driver-model/overview.rst b/Documentation/driver-api/driver-model/overview.rst
new file mode 100644
index 000000000000..d4d1e9b40e0c
--- /dev/null
+++ b/Documentation/driver-api/driver-model/overview.rst
@@ -0,0 +1,124 @@
+=============================
+The Linux Kernel Device Model
+=============================
+
+Patrick Mochel	<mochel@digitalimplant.org>
+
+Drafted 26 August 2002
+Updated 31 January 2006
+
+
+Overview
+~~~~~~~~
+
+The Linux Kernel Driver Model is a unification of all the disparate driver
+models that were previously used in the kernel. It is intended to augment the
+bus-specific drivers for bridges and devices by consolidating a set of data
+and operations into globally accessible data structures.
+
+Traditional driver models implemented some sort of tree-like structure
+(sometimes just a list) for the devices they control. There wasn't any
+uniformity across the different bus types.
+
+The current driver model provides a common, uniform data model for describing
+a bus and the devices that can appear under the bus. The unified bus
+model includes a set of common attributes which all busses carry, and a set
+of common callbacks, such as device discovery during bus probing, bus
+shutdown, bus power management, etc.
+
+The common device and bridge interface reflects the goals of the modern
+computer: namely the ability to do seamless device "plug and play", power
+management, and hot plug. In particular, the model dictated by Intel and
+Microsoft (namely ACPI) ensures that almost every device on almost any bus
+on an x86-compatible system can work within this paradigm.  Of course,
+not every bus is able to support all such operations, although most
+buses support most of those operations.
+
+
+Downstream Access
+~~~~~~~~~~~~~~~~~
+
+Common data fields have been moved out of individual bus layers into a common
+data structure. These fields must still be accessed by the bus layers,
+and sometimes by the device-specific drivers.
+
+Other bus layers are encouraged to do what has been done for the PCI layer.
+struct pci_dev now looks like this::
+
+  struct pci_dev {
+	...
+
+	struct device dev;     /* Generic device interface */
+	...
+  };
+
+Note first that the struct device dev within the struct pci_dev is
+statically allocated. This means only one allocation on device discovery.
+
+Note also that that struct device dev is not necessarily defined at the
+front of the pci_dev structure.  This is to make people think about what
+they're doing when switching between the bus driver and the global driver,
+and to discourage meaningless and incorrect casts between the two.
+
+The PCI bus layer freely accesses the fields of struct device. It knows about
+the structure of struct pci_dev, and it should know the structure of struct
+device. Individual PCI device drivers that have been converted to the current
+driver model generally do not and should not touch the fields of struct device,
+unless there is a compelling reason to do so.
+
+The above abstraction prevents unnecessary pain during transitional phases.
+If it were not done this way, then when a field was renamed or removed, every
+downstream driver would break.  On the other hand, if only the bus layer
+(and not the device layer) accesses the struct device, it is only the bus
+layer that needs to change.
+
+
+User Interface
+~~~~~~~~~~~~~~
+
+By virtue of having a complete hierarchical view of all the devices in the
+system, exporting a complete hierarchical view to userspace becomes relatively
+easy. This has been accomplished by implementing a special purpose virtual
+file system named sysfs.
+
+Almost all mainstream Linux distros mount this filesystem automatically; you
+can see some variation of the following in the output of the "mount" command::
+
+  $ mount
+  ...
+  none on /sys type sysfs (rw,noexec,nosuid,nodev)
+  ...
+  $
+
+The auto-mounting of sysfs is typically accomplished by an entry similar to
+the following in the /etc/fstab file::
+
+  none     	/sys	sysfs    defaults	  	0 0
+
+or something similar in the /lib/init/fstab file on Debian-based systems::
+
+  none            /sys    sysfs    nodev,noexec,nosuid    0 0
+
+If sysfs is not automatically mounted, you can always do it manually with::
+
+	# mount -t sysfs sysfs /sys
+
+Whenever a device is inserted into the tree, a directory is created for it.
+This directory may be populated at each layer of discovery - the global layer,
+the bus layer, or the device layer.
+
+The global layer currently creates two files - 'name' and 'power'. The
+former only reports the name of the device. The latter reports the
+current power state of the device. It will also be used to set the current
+power state.
+
+The bus layer may also create files for the devices it finds while probing the
+bus. For example, the PCI layer currently creates 'irq' and 'resource' files
+for each PCI device.
+
+A device-specific driver may also export files in its directory to expose
+device-specific data or tunable interfaces.
+
+More information about the sysfs directory layout can be found in
+the other documents in this directory and in the file
+Documentation/filesystems/sysfs.txt.
diff --git a/Documentation/driver-api/driver-model/platform.rst b/Documentation/driver-api/driver-model/platform.rst
new file mode 100644
index 000000000000..334dd4071ae4
--- /dev/null
+++ b/Documentation/driver-api/driver-model/platform.rst
@@ -0,0 +1,246 @@
+============================
+Platform Devices and Drivers
+============================
+
+See <linux/platform_device.h> for the driver model interface to the
+platform bus:  platform_device, and platform_driver.  This pseudo-bus
+is used to connect devices on busses with minimal infrastructure,
+like those used to integrate peripherals on many system-on-chip
+processors, or some "legacy" PC interconnects; as opposed to large
+formally specified ones like PCI or USB.
+
+
+Platform devices
+~~~~~~~~~~~~~~~~
+Platform devices are devices that typically appear as autonomous
+entities in the system. This includes legacy port-based devices and
+host bridges to peripheral buses, and most controllers integrated
+into system-on-chip platforms.  What they usually have in common
+is direct addressing from a CPU bus.  Rarely, a platform_device will
+be connected through a segment of some other kind of bus; but its
+registers will still be directly addressable.
+
+Platform devices are given a name, used in driver binding, and a
+list of resources such as addresses and IRQs::
+
+  struct platform_device {
+	const char	*name;
+	u32		id;
+	struct device	dev;
+	u32		num_resources;
+	struct resource	*resource;
+  };
+
+
+Platform drivers
+~~~~~~~~~~~~~~~~
+Platform drivers follow the standard driver model convention, where
+discovery/enumeration is handled outside the drivers, and drivers
+provide probe() and remove() methods.  They support power management
+and shutdown notifications using the standard conventions::
+
+  struct platform_driver {
+	int (*probe)(struct platform_device *);
+	int (*remove)(struct platform_device *);
+	void (*shutdown)(struct platform_device *);
+	int (*suspend)(struct platform_device *, pm_message_t state);
+	int (*suspend_late)(struct platform_device *, pm_message_t state);
+	int (*resume_early)(struct platform_device *);
+	int (*resume)(struct platform_device *);
+	struct device_driver driver;
+  };
+
+Note that probe() should in general verify that the specified device hardware
+actually exists; sometimes platform setup code can't be sure.  The probing
+can use device resources, including clocks, and device platform_data.
+
+Platform drivers register themselves the normal way::
+
+	int platform_driver_register(struct platform_driver *drv);
+
+Or, in common situations where the device is known not to be hot-pluggable,
+the probe() routine can live in an init section to reduce the driver's
+runtime memory footprint::
+
+	int platform_driver_probe(struct platform_driver *drv,
+			  int (*probe)(struct platform_device *))
+
+Kernel modules can be composed of several platform drivers. The platform core
+provides helpers to register and unregister an array of drivers::
+
+	int __platform_register_drivers(struct platform_driver * const *drivers,
+				      unsigned int count, struct module *owner);
+	void platform_unregister_drivers(struct platform_driver * const *drivers,
+					 unsigned int count);
+
+If one of the drivers fails to register, all drivers registered up to that
+point will be unregistered in reverse order. Note that there is a convenience
+macro that passes THIS_MODULE as owner parameter::
+
+	#define platform_register_drivers(drivers, count)
+
+
+Device Enumeration
+~~~~~~~~~~~~~~~~~~
+As a rule, platform specific (and often board-specific) setup code will
+register platform devices::
+
+	int platform_device_register(struct platform_device *pdev);
+
+	int platform_add_devices(struct platform_device **pdevs, int ndev);
+
+The general rule is to register only those devices that actually exist,
+but in some cases extra devices might be registered.  For example, a kernel
+might be configured to work with an external network adapter that might not
+be populated on all boards, or likewise to work with an integrated controller
+that some boards might not hook up to any peripherals.
+
+In some cases, boot firmware will export tables describing the devices
+that are populated on a given board.   Without such tables, often the
+only way for system setup code to set up the correct devices is to build
+a kernel for a specific target board.  Such board-specific kernels are
+common with embedded and custom systems development.
+
+In many cases, the memory and IRQ resources associated with the platform
+device are not enough to let the device's driver work.  Board setup code
+will often provide additional information using the device's platform_data
+field to hold additional information.
+
+Embedded systems frequently need one or more clocks for platform devices,
+which are normally kept off until they're actively needed (to save power).
+System setup also associates those clocks with the device, so that that
+calls to clk_get(&pdev->dev, clock_name) return them as needed.
+
+
+Legacy Drivers:  Device Probing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Some drivers are not fully converted to the driver model, because they take
+on a non-driver role:  the driver registers its platform device, rather than
+leaving that for system infrastructure.  Such drivers can't be hotplugged
+or coldplugged, since those mechanisms require device creation to be in a
+different system component than the driver.
+
+The only "good" reason for this is to handle older system designs which, like
+original IBM PCs, rely on error-prone "probe-the-hardware" models for hardware
+configuration.  Newer systems have largely abandoned that model, in favor of
+bus-level support for dynamic configuration (PCI, USB), or device tables
+provided by the boot firmware (e.g. PNPACPI on x86).  There are too many
+conflicting options about what might be where, and even educated guesses by
+an operating system will be wrong often enough to make trouble.
+
+This style of driver is discouraged.  If you're updating such a driver,
+please try to move the device enumeration to a more appropriate location,
+outside the driver.  This will usually be cleanup, since such drivers
+tend to already have "normal" modes, such as ones using device nodes that
+were created by PNP or by platform device setup.
+
+None the less, there are some APIs to support such legacy drivers.  Avoid
+using these calls except with such hotplug-deficient drivers::
+
+	struct platform_device *platform_device_alloc(
+			const char *name, int id);
+
+You can use platform_device_alloc() to dynamically allocate a device, which
+you will then initialize with resources and platform_device_register().
+A better solution is usually::
+
+	struct platform_device *platform_device_register_simple(
+			const char *name, int id,
+			struct resource *res, unsigned int nres);
+
+You can use platform_device_register_simple() as a one-step call to allocate
+and register a device.
+
+
+Device Naming and Driver Binding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The platform_device.dev.bus_id is the canonical name for the devices.
+It's built from two components:
+
+    * platform_device.name ... which is also used to for driver matching.
+
+    * platform_device.id ... the device instance number, or else "-1"
+      to indicate there's only one.
+
+These are concatenated, so name/id "serial"/0 indicates bus_id "serial.0", and
+"serial/3" indicates bus_id "serial.3"; both would use the platform_driver
+named "serial".  While "my_rtc"/-1 would be bus_id "my_rtc" (no instance id)
+and use the platform_driver called "my_rtc".
+
+Driver binding is performed automatically by the driver core, invoking
+driver probe() after finding a match between device and driver.  If the
+probe() succeeds, the driver and device are bound as usual.  There are
+three different ways to find such a match:
+
+    - Whenever a device is registered, the drivers for that bus are
+      checked for matches.  Platform devices should be registered very
+      early during system boot.
+
+    - When a driver is registered using platform_driver_register(), all
+      unbound devices on that bus are checked for matches.  Drivers
+      usually register later during booting, or by module loading.
+
+    - Registering a driver using platform_driver_probe() works just like
+      using platform_driver_register(), except that the driver won't
+      be probed later if another device registers.  (Which is OK, since
+      this interface is only for use with non-hotpluggable devices.)
+
+
+Early Platform Devices and Drivers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The early platform interfaces provide platform data to platform device
+drivers early on during the system boot. The code is built on top of the
+early_param() command line parsing and can be executed very early on.
+
+Example: "earlyprintk" class early serial console in 6 steps
+
+1. Registering early platform device data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code registers platform device data using the function
+early_platform_add_devices(). In the case of early serial console this
+should be hardware configuration for the serial port. Devices registered
+at this point will later on be matched against early platform drivers.
+
+2. Parsing kernel command line
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code calls parse_early_param() to parse the kernel
+command line. This will execute all matching early_param() callbacks.
+User specified early platform devices will be registered at this point.
+For the early serial console case the user can specify port on the
+kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
+the class string, "serial" is the name of the platform driver and
+0 is the platform device id. If the id is -1 then the dot and the
+id can be omitted.
+
+3. Installing early platform drivers belonging to a certain class
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code may optionally force registration of all early
+platform drivers belonging to a certain class using the function
+early_platform_driver_register_all(). User specified devices from
+step 2 have priority over these. This step is omitted by the serial
+driver example since the early serial driver code should be disabled
+unless the user has specified port on the kernel command line.
+
+4. Early platform driver registration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Compiled-in platform drivers making use of early_platform_init() are
+automatically registered during step 2 or 3. The serial driver example
+should use early_platform_init("earlyprintk", &platform_driver).
+
+5. Probing of early platform drivers belonging to a certain class
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code calls early_platform_driver_probe() to match
+registered early platform devices associated with a certain class with
+registered early platform drivers. Matched devices will get probed().
+This step can be executed at any point during the early boot. As soon
+as possible may be good for the serial port case.
+
+6. Inside the early platform driver probe()
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The driver code needs to take special care during early boot, especially
+when it comes to memory allocation and interrupt registration. The code
+in the probe() function can use is_early_platform_device() to check if
+it is called at early platform device or at the regular platform device
+time. The early serial driver performs register_console() at this point.
+
+For further information, see <linux/platform_device.h>.
diff --git a/Documentation/driver-api/driver-model/porting.rst b/Documentation/driver-api/driver-model/porting.rst
new file mode 100644
index 000000000000..931ea879af3f
--- /dev/null
+++ b/Documentation/driver-api/driver-model/porting.rst
@@ -0,0 +1,448 @@
+=======================================
+Porting Drivers to the New Driver Model
+=======================================
+
+Patrick Mochel
+
+7 January 2003
+
+
+Overview
+
+Please refer to `Documentation/driver-api/driver-model/*.rst` for definitions of
+various driver types and concepts.
+
+Most of the work of porting devices drivers to the new model happens
+at the bus driver layer. This was intentional, to minimize the
+negative effect on kernel drivers, and to allow a gradual transition
+of bus drivers.
+
+In a nutshell, the driver model consists of a set of objects that can
+be embedded in larger, bus-specific objects. Fields in these generic
+objects can replace fields in the bus-specific objects.
+
+The generic objects must be registered with the driver model core. By
+doing so, they will exported via the sysfs filesystem. sysfs can be
+mounted by doing::
+
+	# mount -t sysfs sysfs /sys
+
+
+
+The Process
+
+Step 0: Read include/linux/device.h for object and function definitions.
+
+Step 1: Registering the bus driver.
+
+
+- Define a struct bus_type for the bus driver::
+
+    struct bus_type pci_bus_type = {
+          .name           = "pci",
+    };
+
+
+- Register the bus type.
+
+  This should be done in the initialization function for the bus type,
+  which is usually the module_init(), or equivalent, function::
+
+    static int __init pci_driver_init(void)
+    {
+            return bus_register(&pci_bus_type);
+    }
+
+    subsys_initcall(pci_driver_init);
+
+
+  The bus type may be unregistered (if the bus driver may be compiled
+  as a module) by doing::
+
+     bus_unregister(&pci_bus_type);
+
+
+- Export the bus type for others to use.
+
+  Other code may wish to reference the bus type, so declare it in a
+  shared header file and export the symbol.
+
+From include/linux/pci.h::
+
+  extern struct bus_type pci_bus_type;
+
+
+From file the above code appears in::
+
+  EXPORT_SYMBOL(pci_bus_type);
+
+
+
+- This will cause the bus to show up in /sys/bus/pci/ with two
+  subdirectories: 'devices' and 'drivers'::
+
+    # tree -d /sys/bus/pci/
+    /sys/bus/pci/
+    |-- devices
+    `-- drivers
+
+
+
+Step 2: Registering Devices.
+
+struct device represents a single device. It mainly contains metadata
+describing the relationship the device has to other entities.
+
+
+- Embed a struct device in the bus-specific device type::
+
+
+    struct pci_dev {
+           ...
+           struct  device  dev;            /* Generic device interface */
+           ...
+    };
+
+  It is recommended that the generic device not be the first item in
+  the struct to discourage programmers from doing mindless casts
+  between the object types. Instead macros, or inline functions,
+  should be created to convert from the generic object type::
+
+
+    #define to_pci_dev(n) container_of(n, struct pci_dev, dev)
+
+    or
+
+    static inline struct pci_dev * to_pci_dev(struct kobject * kobj)
+    {
+	return container_of(n, struct pci_dev, dev);
+    }
+
+  This allows the compiler to verify type-safety of the operations
+  that are performed (which is Good).
+
+
+- Initialize the device on registration.
+
+  When devices are discovered or registered with the bus type, the
+  bus driver should initialize the generic device. The most important
+  things to initialize are the bus_id, parent, and bus fields.
+
+  The bus_id is an ASCII string that contains the device's address on
+  the bus. The format of this string is bus-specific. This is
+  necessary for representing devices in sysfs.
+
+  parent is the physical parent of the device. It is important that
+  the bus driver sets this field correctly.
+
+  The driver model maintains an ordered list of devices that it uses
+  for power management. This list must be in order to guarantee that
+  devices are shutdown before their physical parents, and vice versa.
+  The order of this list is determined by the parent of registered
+  devices.
+
+  Also, the location of the device's sysfs directory depends on a
+  device's parent. sysfs exports a directory structure that mirrors
+  the device hierarchy. Accurately setting the parent guarantees that
+  sysfs will accurately represent the hierarchy.
+
+  The device's bus field is a pointer to the bus type the device
+  belongs to. This should be set to the bus_type that was declared
+  and initialized before.
+
+  Optionally, the bus driver may set the device's name and release
+  fields.
+
+  The name field is an ASCII string describing the device, like
+
+     "ATI Technologies Inc Radeon QD"
+
+  The release field is a callback that the driver model core calls
+  when the device has been removed, and all references to it have
+  been released. More on this in a moment.
+
+
+- Register the device.
+
+  Once the generic device has been initialized, it can be registered
+  with the driver model core by doing::
+
+       device_register(&dev->dev);
+
+  It can later be unregistered by doing::
+
+       device_unregister(&dev->dev);
+
+  This should happen on buses that support hotpluggable devices.
+  If a bus driver unregisters a device, it should not immediately free
+  it. It should instead wait for the driver model core to call the
+  device's release method, then free the bus-specific object.
+  (There may be other code that is currently referencing the device
+  structure, and it would be rude to free the device while that is
+  happening).
+
+
+  When the device is registered, a directory in sysfs is created.
+  The PCI tree in sysfs looks like::
+
+    /sys/devices/pci0/
+    |-- 00:00.0
+    |-- 00:01.0
+    |   `-- 01:00.0
+    |-- 00:02.0
+    |   `-- 02:1f.0
+    |       `-- 03:00.0
+    |-- 00:1e.0
+    |   `-- 04:04.0
+    |-- 00:1f.0
+    |-- 00:1f.1
+    |   |-- ide0
+    |   |   |-- 0.0
+    |   |   `-- 0.1
+    |   `-- ide1
+    |       `-- 1.0
+    |-- 00:1f.2
+    |-- 00:1f.3
+    `-- 00:1f.5
+
+  Also, symlinks are created in the bus's 'devices' directory
+  that point to the device's directory in the physical hierarchy::
+
+    /sys/bus/pci/devices/
+    |-- 00:00.0 -> ../../../devices/pci0/00:00.0
+    |-- 00:01.0 -> ../../../devices/pci0/00:01.0
+    |-- 00:02.0 -> ../../../devices/pci0/00:02.0
+    |-- 00:1e.0 -> ../../../devices/pci0/00:1e.0
+    |-- 00:1f.0 -> ../../../devices/pci0/00:1f.0
+    |-- 00:1f.1 -> ../../../devices/pci0/00:1f.1
+    |-- 00:1f.2 -> ../../../devices/pci0/00:1f.2
+    |-- 00:1f.3 -> ../../../devices/pci0/00:1f.3
+    |-- 00:1f.5 -> ../../../devices/pci0/00:1f.5
+    |-- 01:00.0 -> ../../../devices/pci0/00:01.0/01:00.0
+    |-- 02:1f.0 -> ../../../devices/pci0/00:02.0/02:1f.0
+    |-- 03:00.0 -> ../../../devices/pci0/00:02.0/02:1f.0/03:00.0
+    `-- 04:04.0 -> ../../../devices/pci0/00:1e.0/04:04.0
+
+
+
+Step 3: Registering Drivers.
+
+struct device_driver is a simple driver structure that contains a set
+of operations that the driver model core may call.
+
+
+- Embed a struct device_driver in the bus-specific driver.
+
+  Just like with devices, do something like::
+
+    struct pci_driver {
+           ...
+           struct device_driver    driver;
+    };
+
+
+- Initialize the generic driver structure.
+
+  When the driver registers with the bus (e.g. doing pci_register_driver()),
+  initialize the necessary fields of the driver: the name and bus
+  fields.
+
+
+- Register the driver.
+
+  After the generic driver has been initialized, call::
+
+	driver_register(&drv->driver);
+
+  to register the driver with the core.
+
+  When the driver is unregistered from the bus, unregister it from the
+  core by doing::
+
+        driver_unregister(&drv->driver);
+
+  Note that this will block until all references to the driver have
+  gone away. Normally, there will not be any.
+
+
+- Sysfs representation.
+
+  Drivers are exported via sysfs in their bus's 'driver's directory.
+  For example::
+
+    /sys/bus/pci/drivers/
+    |-- 3c59x
+    |-- Ensoniq AudioPCI
+    |-- agpgart-amdk7
+    |-- e100
+    `-- serial
+
+
+Step 4: Define Generic Methods for Drivers.
+
+struct device_driver defines a set of operations that the driver model
+core calls. Most of these operations are probably similar to
+operations the bus already defines for drivers, but taking different
+parameters.
+
+It would be difficult and tedious to force every driver on a bus to
+simultaneously convert their drivers to generic format. Instead, the
+bus driver should define single instances of the generic methods that
+forward call to the bus-specific drivers. For instance::
+
+
+  static int pci_device_remove(struct device * dev)
+  {
+          struct pci_dev * pci_dev = to_pci_dev(dev);
+          struct pci_driver * drv = pci_dev->driver;
+
+          if (drv) {
+                  if (drv->remove)
+                          drv->remove(pci_dev);
+                  pci_dev->driver = NULL;
+          }
+          return 0;
+  }
+
+
+The generic driver should be initialized with these methods before it
+is registered::
+
+        /* initialize common driver fields */
+        drv->driver.name = drv->name;
+        drv->driver.bus = &pci_bus_type;
+        drv->driver.probe = pci_device_probe;
+        drv->driver.resume = pci_device_resume;
+        drv->driver.suspend = pci_device_suspend;
+        drv->driver.remove = pci_device_remove;
+
+        /* register with core */
+        driver_register(&drv->driver);
+
+
+Ideally, the bus should only initialize the fields if they are not
+already set. This allows the drivers to implement their own generic
+methods.
+
+
+Step 5: Support generic driver binding.
+
+The model assumes that a device or driver can be dynamically
+registered with the bus at any time. When registration happens,
+devices must be bound to a driver, or drivers must be bound to all
+devices that it supports.
+
+A driver typically contains a list of device IDs that it supports. The
+bus driver compares these IDs to the IDs of devices registered with it.
+The format of the device IDs, and the semantics for comparing them are
+bus-specific, so the generic model does attempt to generalize them.
+
+Instead, a bus may supply a method in struct bus_type that does the
+comparison::
+
+  int (*match)(struct device * dev, struct device_driver * drv);
+
+match should return positive value if the driver supports the device,
+and zero otherwise. It may also return error code (for example
+-EPROBE_DEFER) if determining that given driver supports the device is
+not possible.
+
+When a device is registered, the bus's list of drivers is iterated
+over. bus->match() is called for each one until a match is found.
+
+When a driver is registered, the bus's list of devices is iterated
+over. bus->match() is called for each device that is not already
+claimed by a driver.
+
+When a device is successfully bound to a driver, device->driver is
+set, the device is added to a per-driver list of devices, and a
+symlink is created in the driver's sysfs directory that points to the
+device's physical directory::
+
+  /sys/bus/pci/drivers/
+  |-- 3c59x
+  |   `-- 00:0b.0 -> ../../../../devices/pci0/00:0b.0
+  |-- Ensoniq AudioPCI
+  |-- agpgart-amdk7
+  |   `-- 00:00.0 -> ../../../../devices/pci0/00:00.0
+  |-- e100
+  |   `-- 00:0c.0 -> ../../../../devices/pci0/00:0c.0
+  `-- serial
+
+
+This driver binding should replace the existing driver binding
+mechanism the bus currently uses.
+
+
+Step 6: Supply a hotplug callback.
+
+Whenever a device is registered with the driver model core, the
+userspace program /sbin/hotplug is called to notify userspace.
+Users can define actions to perform when a device is inserted or
+removed.
+
+The driver model core passes several arguments to userspace via
+environment variables, including
+
+- ACTION: set to 'add' or 'remove'
+- DEVPATH: set to the device's physical path in sysfs.
+
+A bus driver may also supply additional parameters for userspace to
+consume. To do this, a bus must implement the 'hotplug' method in
+struct bus_type::
+
+     int (*hotplug) (struct device *dev, char **envp,
+                     int num_envp, char *buffer, int buffer_size);
+
+This is called immediately before /sbin/hotplug is executed.
+
+
+Step 7: Cleaning up the bus driver.
+
+The generic bus, device, and driver structures provide several fields
+that can replace those defined privately to the bus driver.
+
+- Device list.
+
+struct bus_type contains a list of all devices registered with the bus
+type. This includes all devices on all instances of that bus type.
+An internal list that the bus uses may be removed, in favor of using
+this one.
+
+The core provides an iterator to access these devices::
+
+  int bus_for_each_dev(struct bus_type * bus, struct device * start,
+                       void * data, int (*fn)(struct device *, void *));
+
+
+- Driver list.
+
+struct bus_type also contains a list of all drivers registered with
+it. An internal list of drivers that the bus driver maintains may
+be removed in favor of using the generic one.
+
+The drivers may be iterated over, like devices::
+
+  int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
+                       void * data, int (*fn)(struct device_driver *, void *));
+
+
+Please see drivers/base/bus.c for more information.
+
+
+- rwsem
+
+struct bus_type contains an rwsem that protects all core accesses to
+the device and driver lists. This can be used by the bus driver
+internally, and should be used when accessing the device or driver
+lists the bus maintains.
+
+
+- Device and driver fields.
+
+Some of the fields in struct device and struct device_driver duplicate
+fields in the bus-specific representations of these objects. Feel free
+to remove the bus-specific ones and favor the generic ones. Note
+though, that this will likely mean fixing up all the drivers that
+reference the bus-specific fields (though those should all be 1-line
+changes).
diff --git a/Documentation/driver-api/early-userspace/buffer-format.rst b/Documentation/driver-api/early-userspace/buffer-format.rst
new file mode 100644
index 000000000000..7f74e301fdf3
--- /dev/null
+++ b/Documentation/driver-api/early-userspace/buffer-format.rst
@@ -0,0 +1,119 @@
+=======================
+initramfs buffer format
+=======================
+
+Al Viro, H. Peter Anvin
+
+Last revision: 2002-01-13
+
+Starting with kernel 2.5.x, the old "initial ramdisk" protocol is
+getting {replaced/complemented} with the new "initial ramfs"
+(initramfs) protocol.  The initramfs contents is passed using the same
+memory buffer protocol used by the initrd protocol, but the contents
+is different.  The initramfs buffer contains an archive which is
+expanded into a ramfs filesystem; this document details the format of
+the initramfs buffer format.
+
+The initramfs buffer format is based around the "newc" or "crc" CPIO
+formats, and can be created with the cpio(1) utility.  The cpio
+archive can be compressed using gzip(1).  One valid version of an
+initramfs buffer is thus a single .cpio.gz file.
+
+The full format of the initramfs buffer is defined by the following
+grammar, where::
+
+	*	is used to indicate "0 or more occurrences of"
+	(|)	indicates alternatives
+	+	indicates concatenation
+	GZIP()	indicates the gzip(1) of the operand
+	ALGN(n)	means padding with null bytes to an n-byte boundary
+
+	initramfs  := ("\0" | cpio_archive | cpio_gzip_archive)*
+
+	cpio_gzip_archive := GZIP(cpio_archive)
+
+	cpio_archive := cpio_file* + (<nothing> | cpio_trailer)
+
+	cpio_file := ALGN(4) + cpio_header + filename + "\0" + ALGN(4) + data
+
+	cpio_trailer := ALGN(4) + cpio_header + "TRAILER!!!\0" + ALGN(4)
+
+
+In human terms, the initramfs buffer contains a collection of
+compressed and/or uncompressed cpio archives (in the "newc" or "crc"
+formats); arbitrary amounts zero bytes (for padding) can be added
+between members.
+
+The cpio "TRAILER!!!" entry (cpio end-of-archive) is optional, but is
+not ignored; see "handling of hard links" below.
+
+The structure of the cpio_header is as follows (all fields contain
+hexadecimal ASCII numbers fully padded with '0' on the left to the
+full width of the field, for example, the integer 4780 is represented
+by the ASCII string "000012ac"):
+
+============= ================== ==============================================
+Field name    Field size	 Meaning
+============= ================== ==============================================
+c_magic	      6 bytes		 The string "070701" or "070702"
+c_ino	      8 bytes		 File inode number
+c_mode	      8 bytes		 File mode and permissions
+c_uid	      8 bytes		 File uid
+c_gid	      8 bytes		 File gid
+c_nlink	      8 bytes		 Number of links
+c_mtime	      8 bytes		 Modification time
+c_filesize    8 bytes		 Size of data field
+c_maj	      8 bytes		 Major part of file device number
+c_min	      8 bytes		 Minor part of file device number
+c_rmaj	      8 bytes		 Major part of device node reference
+c_rmin	      8 bytes		 Minor part of device node reference
+c_namesize    8 bytes		 Length of filename, including final \0
+c_chksum      8 bytes		 Checksum of data field if c_magic is 070702;
+				 otherwise zero
+============= ================== ==============================================
+
+The c_mode field matches the contents of st_mode returned by stat(2)
+on Linux, and encodes the file type and file permissions.
+
+The c_filesize should be zero for any file which is not a regular file
+or symlink.
+
+The c_chksum field contains a simple 32-bit unsigned sum of all the
+bytes in the data field.  cpio(1) refers to this as "crc", which is
+clearly incorrect (a cyclic redundancy check is a different and
+significantly stronger integrity check), however, this is the
+algorithm used.
+
+If the filename is "TRAILER!!!" this is actually an end-of-archive
+marker; the c_filesize for an end-of-archive marker must be zero.
+
+
+Handling of hard links
+======================
+
+When a nondirectory with c_nlink > 1 is seen, the (c_maj,c_min,c_ino)
+tuple is looked up in a tuple buffer.  If not found, it is entered in
+the tuple buffer and the entry is created as usual; if found, a hard
+link rather than a second copy of the file is created.  It is not
+necessary (but permitted) to include a second copy of the file
+contents; if the file contents is not included, the c_filesize field
+should be set to zero to indicate no data section follows.  If data is
+present, the previous instance of the file is overwritten; this allows
+the data-carrying instance of a file to occur anywhere in the sequence
+(GNU cpio is reported to attach the data to the last instance of a
+file only.)
+
+c_filesize must not be zero for a symlink.
+
+When a "TRAILER!!!" end-of-archive marker is seen, the tuple buffer is
+reset.  This permits archives which are generated independently to be
+concatenated.
+
+To combine file data from different sources (without having to
+regenerate the (c_maj,c_min,c_ino) fields), therefore, either one of
+the following techniques can be used:
+
+a) Separate the different file data sources with a "TRAILER!!!"
+   end-of-archive marker, or
+
+b) Make sure c_nlink == 1 for all nondirectory entries.
diff --git a/Documentation/driver-api/early-userspace/early_userspace_support.rst b/Documentation/driver-api/early-userspace/early_userspace_support.rst
new file mode 100644
index 000000000000..3deefb34046b
--- /dev/null
+++ b/Documentation/driver-api/early-userspace/early_userspace_support.rst
@@ -0,0 +1,154 @@
+=======================
+Early userspace support
+=======================
+
+Last update: 2004-12-20 tlh
+
+
+"Early userspace" is a set of libraries and programs that provide
+various pieces of functionality that are important enough to be
+available while a Linux kernel is coming up, but that don't need to be
+run inside the kernel itself.
+
+It consists of several major infrastructure components:
+
+- gen_init_cpio, a program that builds a cpio-format archive
+  containing a root filesystem image.  This archive is compressed, and
+  the compressed image is linked into the kernel image.
+- initramfs, a chunk of code that unpacks the compressed cpio image
+  midway through the kernel boot process.
+- klibc, a userspace C library, currently packaged separately, that is
+  optimized for correctness and small size.
+
+The cpio file format used by initramfs is the "newc" (aka "cpio -H newc")
+format, and is documented in the file "buffer-format.txt".  There are
+two ways to add an early userspace image: specify an existing cpio
+archive to be used as the image or have the kernel build process build
+the image from specifications.
+
+CPIO ARCHIVE method
+-------------------
+
+You can create a cpio archive that contains the early userspace image.
+Your cpio archive should be specified in CONFIG_INITRAMFS_SOURCE and it
+will be used directly.  Only a single cpio file may be specified in
+CONFIG_INITRAMFS_SOURCE and directory and file names are not allowed in
+combination with a cpio archive.
+
+IMAGE BUILDING method
+---------------------
+
+The kernel build process can also build an early userspace image from
+source parts rather than supplying a cpio archive.  This method provides
+a way to create images with root-owned files even though the image was
+built by an unprivileged user.
+
+The image is specified as one or more sources in
+CONFIG_INITRAMFS_SOURCE.  Sources can be either directories or files -
+cpio archives are *not* allowed when building from sources.
+
+A source directory will have it and all of its contents packaged.  The
+specified directory name will be mapped to '/'.  When packaging a
+directory, limited user and group ID translation can be performed.
+INITRAMFS_ROOT_UID can be set to a user ID that needs to be mapped to
+user root (0).  INITRAMFS_ROOT_GID can be set to a group ID that needs
+to be mapped to group root (0).
+
+A source file must be directives in the format required by the
+usr/gen_init_cpio utility (run 'usr/gen_init_cpio -h' to get the
+file format).  The directives in the file will be passed directly to
+usr/gen_init_cpio.
+
+When a combination of directories and files are specified then the
+initramfs image will be an aggregate of all of them.  In this way a user
+can create a 'root-image' directory and install all files into it.
+Because device-special files cannot be created by a unprivileged user,
+special files can be listed in a 'root-files' file.  Both 'root-image'
+and 'root-files' can be listed in CONFIG_INITRAMFS_SOURCE and a complete
+early userspace image can be built by an unprivileged user.
+
+As a technical note, when directories and files are specified, the
+entire CONFIG_INITRAMFS_SOURCE is passed to
+usr/gen_initramfs_list.sh.  This means that CONFIG_INITRAMFS_SOURCE
+can really be interpreted as any legal argument to
+gen_initramfs_list.sh.  If a directory is specified as an argument then
+the contents are scanned, uid/gid translation is performed, and
+usr/gen_init_cpio file directives are output.  If a directory is
+specified as an argument to usr/gen_initramfs_list.sh then the
+contents of the file are simply copied to the output.  All of the output
+directives from directory scanning and file contents copying are
+processed by usr/gen_init_cpio.
+
+See also 'usr/gen_initramfs_list.sh -h'.
+
+Where's this all leading?
+=========================
+
+The klibc distribution contains some of the necessary software to make
+early userspace useful.  The klibc distribution is currently
+maintained separately from the kernel.
+
+You can obtain somewhat infrequent snapshots of klibc from
+https://www.kernel.org/pub/linux/libs/klibc/
+
+For active users, you are better off using the klibc git
+repository, at http://git.kernel.org/?p=libs/klibc/klibc.git
+
+The standalone klibc distribution currently provides three components,
+in addition to the klibc library:
+
+- ipconfig, a program that configures network interfaces.  It can
+  configure them statically, or use DHCP to obtain information
+  dynamically (aka "IP autoconfiguration").
+- nfsmount, a program that can mount an NFS filesystem.
+- kinit, the "glue" that uses ipconfig and nfsmount to replace the old
+  support for IP autoconfig, mount a filesystem over NFS, and continue
+  system boot using that filesystem as root.
+
+kinit is built as a single statically linked binary to save space.
+
+Eventually, several more chunks of kernel functionality will hopefully
+move to early userspace:
+
+- Almost all of init/do_mounts* (the beginning of this is already in
+  place)
+- ACPI table parsing
+- Insert unwieldy subsystem that doesn't really need to be in kernel
+  space here
+
+If kinit doesn't meet your current needs and you've got bytes to burn,
+the klibc distribution includes a small Bourne-compatible shell (ash)
+and a number of other utilities, so you can replace kinit and build
+custom initramfs images that meet your needs exactly.
+
+For questions and help, you can sign up for the early userspace
+mailing list at http://www.zytor.com/mailman/listinfo/klibc
+
+How does it work?
+=================
+
+The kernel has currently 3 ways to mount the root filesystem:
+
+a) all required device and filesystem drivers compiled into the kernel, no
+   initrd.  init/main.c:init() will call prepare_namespace() to mount the
+   final root filesystem, based on the root= option and optional init= to run
+   some other init binary than listed at the end of init/main.c:init().
+
+b) some device and filesystem drivers built as modules and stored in an
+   initrd.  The initrd must contain a binary '/linuxrc' which is supposed to
+   load these driver modules.  It is also possible to mount the final root
+   filesystem via linuxrc and use the pivot_root syscall.  The initrd is
+   mounted and executed via prepare_namespace().
+
+c) using initramfs.  The call to prepare_namespace() must be skipped.
+   This means that a binary must do all the work.  Said binary can be stored
+   into initramfs either via modifying usr/gen_init_cpio.c or via the new
+   initrd format, an cpio archive.  It must be called "/init".  This binary
+   is responsible to do all the things prepare_namespace() would do.
+
+   To maintain backwards compatibility, the /init binary will only run if it
+   comes via an initramfs cpio archive.  If this is not the case,
+   init/main.c:init() will run prepare_namespace() to mount the final root
+   and exec one of the predefined init binaries.
+
+Bryan O'Sullivan <bos@serpentine.com>
diff --git a/Documentation/driver-api/early-userspace/index.rst b/Documentation/driver-api/early-userspace/index.rst
new file mode 100644
index 000000000000..149c1822f06d
--- /dev/null
+++ b/Documentation/driver-api/early-userspace/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Early Userspace
+===============
+
+.. toctree::
+    :maxdepth: 1
+
+    early_userspace_support
+    buffer-format
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/driver-api/edid.rst b/Documentation/driver-api/edid.rst
new file mode 100644
index 000000000000..b1b5acd501ed
--- /dev/null
+++ b/Documentation/driver-api/edid.rst
@@ -0,0 +1,58 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+EDID
+====
+
+In the good old days when graphics parameters were configured explicitly
+in a file called xorg.conf, even broken hardware could be managed.
+
+Today, with the advent of Kernel Mode Setting, a graphics board is
+either correctly working because all components follow the standards -
+or the computer is unusable, because the screen remains dark after
+booting or it displays the wrong area. Cases when this happens are:
+- The graphics board does not recognize the monitor.
+- The graphics board is unable to detect any EDID data.
+- The graphics board incorrectly forwards EDID data to the driver.
+- The monitor sends no or bogus EDID data.
+- A KVM sends its own EDID data instead of querying the connected monitor.
+Adding the kernel parameter "nomodeset" helps in most cases, but causes
+restrictions later on.
+
+As a remedy for such situations, the kernel configuration item
+CONFIG_DRM_LOAD_EDID_FIRMWARE was introduced. It allows to provide an
+individually prepared or corrected EDID data set in the /lib/firmware
+directory from where it is loaded via the firmware interface. The code
+(see drivers/gpu/drm/drm_edid_load.c) contains built-in data sets for
+commonly used screen resolutions (800x600, 1024x768, 1280x1024, 1600x1200,
+1680x1050, 1920x1080) as binary blobs, but the kernel source tree does
+not contain code to create these data. In order to elucidate the origin
+of the built-in binary EDID blobs and to facilitate the creation of
+individual data for a specific misbehaving monitor, commented sources
+and a Makefile environment are given here.
+
+To create binary EDID and C source code files from the existing data
+material, simply type "make".
+
+If you want to create your own EDID file, copy the file 1024x768.S,
+replace the settings with your own data and add a new target to the
+Makefile. Please note that the EDID data structure expects the timing
+values in a different way as compared to the standard X11 format.
+
+X11:
+  HTimings:
+    hdisp hsyncstart hsyncend htotal
+  VTimings:
+    vdisp vsyncstart vsyncend vtotal
+
+EDID::
+
+  #define XPIX hdisp
+  #define XBLANK htotal-hdisp
+  #define XOFFSET hsyncstart-hdisp
+  #define XPULSE hsyncend-hsyncstart
+
+  #define YPIX vdisp
+  #define YBLANK vtotal-vdisp
+  #define YOFFSET vsyncstart-vdisp
+  #define YPULSE vsyncend-vsyncstart
diff --git a/Documentation/driver-api/eisa.rst b/Documentation/driver-api/eisa.rst
new file mode 100644
index 000000000000..c07565ba57da
--- /dev/null
+++ b/Documentation/driver-api/eisa.rst
@@ -0,0 +1,230 @@
+================
+EISA bus support
+================
+
+:Author: Marc Zyngier <maz@wild-wind.fr.eu.org>
+
+This document groups random notes about porting EISA drivers to the
+new EISA/sysfs API.
+
+Starting from version 2.5.59, the EISA bus is almost given the same
+status as other much more mainstream busses such as PCI or USB. This
+has been possible through sysfs, which defines a nice enough set of
+abstractions to manage busses, devices and drivers.
+
+Although the new API is quite simple to use, converting existing
+drivers to the new infrastructure is not an easy task (mostly because
+detection code is generally also used to probe ISA cards). Moreover,
+most EISA drivers are among the oldest Linux drivers so, as you can
+imagine, some dust has settled here over the years.
+
+The EISA infrastructure is made up of three parts:
+
+    - The bus code implements most of the generic code. It is shared
+      among all the architectures that the EISA code runs on. It
+      implements bus probing (detecting EISA cards available on the bus),
+      allocates I/O resources, allows fancy naming through sysfs, and
+      offers interfaces for driver to register.
+
+    - The bus root driver implements the glue between the bus hardware
+      and the generic bus code. It is responsible for discovering the
+      device implementing the bus, and setting it up to be latter probed
+      by the bus code. This can go from something as simple as reserving
+      an I/O region on x86, to the rather more complex, like the hppa
+      EISA code. This is the part to implement in order to have EISA
+      running on an "new" platform.
+
+    - The driver offers the bus a list of devices that it manages, and
+      implements the necessary callbacks to probe and release devices
+      whenever told to.
+
+Every function/structure below lives in <linux/eisa.h>, which depends
+heavily on <linux/device.h>.
+
+Bus root driver
+===============
+
+::
+
+	int eisa_root_register (struct eisa_root_device *root);
+
+The eisa_root_register function is used to declare a device as the
+root of an EISA bus. The eisa_root_device structure holds a reference
+to this device, as well as some parameters for probing purposes::
+
+	struct eisa_root_device {
+		struct device   *dev;	 /* Pointer to bridge device */
+		struct resource *res;
+		unsigned long    bus_base_addr;
+		int		 slots;  /* Max slot number */
+		int		 force_probe; /* Probe even when no slot 0 */
+		u64		 dma_mask; /* from bridge device */
+		int              bus_nr; /* Set by eisa_root_register */
+		struct resource  eisa_root_res;	/* ditto */
+	};
+
+============= ======================================================
+node          used for eisa_root_register internal purpose
+dev           pointer to the root device
+res           root device I/O resource
+bus_base_addr slot 0 address on this bus
+slots	      max slot number to probe
+force_probe   Probe even when slot 0 is empty (no EISA mainboard)
+dma_mask      Default DMA mask. Usually the bridge device dma_mask.
+bus_nr	      unique bus id, set by eisa_root_register
+============= ======================================================
+
+Driver
+======
+
+::
+
+	int eisa_driver_register (struct eisa_driver *edrv);
+	void eisa_driver_unregister (struct eisa_driver *edrv);
+
+Clear enough ?
+
+::
+
+	struct eisa_device_id {
+		char sig[EISA_SIG_LEN];
+		unsigned long driver_data;
+	};
+
+	struct eisa_driver {
+		const struct eisa_device_id *id_table;
+		struct device_driver         driver;
+	};
+
+=============== ====================================================
+id_table	an array of NULL terminated EISA id strings,
+		followed by an empty string. Each string can
+		optionally be paired with a driver-dependent value
+		(driver_data).
+
+driver		a generic driver, such as described in
+		Documentation/driver-api/driver-model/driver.rst. Only .name,
+		.probe and .remove members are mandatory.
+=============== ====================================================
+
+An example is the 3c59x driver::
+
+	static struct eisa_device_id vortex_eisa_ids[] = {
+		{ "TCM5920", EISA_3C592_OFFSET },
+		{ "TCM5970", EISA_3C597_OFFSET },
+		{ "" }
+	};
+
+	static struct eisa_driver vortex_eisa_driver = {
+		.id_table = vortex_eisa_ids,
+		.driver   = {
+			.name    = "3c59x",
+			.probe   = vortex_eisa_probe,
+			.remove  = vortex_eisa_remove
+		}
+	};
+
+Device
+======
+
+The sysfs framework calls .probe and .remove functions upon device
+discovery and removal (note that the .remove function is only called
+when driver is built as a module).
+
+Both functions are passed a pointer to a 'struct device', which is
+encapsulated in a 'struct eisa_device' described as follows::
+
+	struct eisa_device {
+		struct eisa_device_id id;
+		int                   slot;
+		int                   state;
+		unsigned long         base_addr;
+		struct resource       res[EISA_MAX_RESOURCES];
+		u64                   dma_mask;
+		struct device         dev; /* generic device */
+	};
+
+======== ============================================================
+id	 EISA id, as read from device. id.driver_data is set from the
+	 matching driver EISA id.
+slot	 slot number which the device was detected on
+state    set of flags indicating the state of the device. Current
+	 flags are EISA_CONFIG_ENABLED and EISA_CONFIG_FORCED.
+res	 set of four 256 bytes I/O regions allocated to this device
+dma_mask DMA mask set from the parent device.
+dev	 generic device (see Documentation/driver-api/driver-model/device.rst)
+======== ============================================================
+
+You can get the 'struct eisa_device' from 'struct device' using the
+'to_eisa_device' macro.
+
+Misc stuff
+==========
+
+::
+
+	void eisa_set_drvdata (struct eisa_device *edev, void *data);
+
+Stores data into the device's driver_data area.
+
+::
+
+	void *eisa_get_drvdata (struct eisa_device *edev):
+
+Gets the pointer previously stored into the device's driver_data area.
+
+::
+
+	int eisa_get_region_index (void *addr);
+
+Returns the region number (0 <= x < EISA_MAX_RESOURCES) of a given
+address.
+
+Kernel parameters
+=================
+
+eisa_bus.enable_dev
+	A comma-separated list of slots to be enabled, even if the firmware
+	set the card as disabled. The driver must be able to properly
+	initialize the device in such conditions.
+
+eisa_bus.disable_dev
+	A comma-separated list of slots to be enabled, even if the firmware
+	set the card as enabled. The driver won't be called to handle this
+	device.
+
+virtual_root.force_probe
+	Force the probing code to probe EISA slots even when it cannot find an
+	EISA compliant mainboard (nothing appears on slot 0). Defaults to 0
+	(don't force), and set to 1 (force probing) when either
+	CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set.
+
+Random notes
+============
+
+Converting an EISA driver to the new API mostly involves *deleting*
+code (since probing is now in the core EISA code). Unfortunately, most
+drivers share their probing routine between ISA, and EISA. Special
+care must be taken when ripping out the EISA code, so other busses
+won't suffer from these surgical strikes...
+
+You *must not* expect any EISA device to be detected when returning
+from eisa_driver_register, since the chances are that the bus has not
+yet been probed. In fact, that's what happens most of the time (the
+bus root driver usually kicks in rather late in the boot process).
+Unfortunately, most drivers are doing the probing by themselves, and
+expect to have explored the whole machine when they exit their probe
+routine.
+
+For example, switching your favorite EISA SCSI card to the "hotplug"
+model is "the right thing"(tm).
+
+Thanks
+======
+
+I'd like to thank the following people for their help:
+
+- Xavier Benigni for lending me a wonderful Alpha Jensen,
+- James Bottomley, Jeff Garzik for getting this stuff into the kernel,
+- Andries Brouwer for contributing numerous EISA ids,
+- Catrin Jones for coping with far too many machines at home.
diff --git a/Documentation/driver-api/firmware/other_interfaces.rst b/Documentation/driver-api/firmware/other_interfaces.rst
index a4ac54b5fd79..b81794e0cfbb 100644
--- a/Documentation/driver-api/firmware/other_interfaces.rst
+++ b/Documentation/driver-api/firmware/other_interfaces.rst
@@ -33,7 +33,7 @@ of the requests on to a secure monitor (EL3).
    :functions: stratix10_svc_client_msg
 
 .. kernel-doc:: include/linux/firmware/intel/stratix10-svc-client.h
-   :functions: stratix10_svc_command_reconfig_payload
+   :functions: stratix10_svc_command_config_type
 
 .. kernel-doc:: include/linux/firmware/intel/stratix10-svc-client.h
    :functions: stratix10_svc_cb_data
diff --git a/Documentation/driver-api/generic-counter.rst b/Documentation/driver-api/generic-counter.rst
index f51db893f595..8382f01a53e3 100644
--- a/Documentation/driver-api/generic-counter.rst
+++ b/Documentation/driver-api/generic-counter.rst
@@ -233,7 +233,7 @@ Userspace Interface
 Several sysfs attributes are generated by the Generic Counter interface,
 and reside under the /sys/bus/counter/devices/counterX directory, where
 counterX refers to the respective counter device. Please see
-Documentation/ABI/testing/sys-bus-counter-generic-sysfs for detailed
+Documentation/ABI/testing/sysfs-bus-counter for detailed
 information on each Generic Counter interface sysfs attribute.
 
 Through these sysfs attributes, programs and scripts may interact with
@@ -251,7 +251,7 @@ for defining a counter device.
 .. kernel-doc:: include/linux/counter.h
    :internal:
 
-.. kernel-doc:: drivers/counter/generic-counter.c
+.. kernel-doc:: drivers/counter/counter.c
    :export:
 
 Implementation
@@ -325,7 +325,7 @@ sysfs attributes, where Y is the unique ID of the respective Count:
 
 For a more detailed breakdown of the available Generic Counter interface
 sysfs attributes, please refer to the
-Documentation/ABI/testing/sys-bus-counter file.
+Documentation/ABI/testing/sysfs-bus-counter file.
 
 The Signals and Counts associated with the Counter device are registered
 to the system as well by the counter_register function. The
diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst
index b37f3f7b8926..ce91518bf9f4 100644
--- a/Documentation/driver-api/gpio/board.rst
+++ b/Documentation/driver-api/gpio/board.rst
@@ -101,7 +101,7 @@ with the help of _DSD (Device Specific Data), introduced in ACPI 5.1::
 	}
 
 For more information about the ACPI GPIO bindings see
-Documentation/acpi/gpio-properties.txt.
+Documentation/firmware-guide/acpi/gpio-properties.rst.
 
 Platform Data
 -------------
diff --git a/Documentation/driver-api/gpio/consumer.rst b/Documentation/driver-api/gpio/consumer.rst
index 5e4d8aa68913..423492d125b9 100644
--- a/Documentation/driver-api/gpio/consumer.rst
+++ b/Documentation/driver-api/gpio/consumer.rst
@@ -283,8 +283,6 @@ To summarize::
   gpiod_set_value(desc, 1);          default (active high)  high
   gpiod_set_value(desc, 0);          active low             high
   gpiod_set_value(desc, 1);          active low             low
-  gpiod_set_value(desc, 0);          default (active high)  low
-  gpiod_set_value(desc, 1);          default (active high)  high
   gpiod_set_value(desc, 0);          open drain             low
   gpiod_set_value(desc, 1);          open drain             high impedance
   gpiod_set_value(desc, 0);          open source            high impedance
@@ -366,7 +364,7 @@ accessed sequentially.
 The functions take three arguments:
 	* array_size	- the number of array elements
 	* desc_array	- an array of GPIO descriptors
-	* array_info	- optional information obtained from gpiod_array_get()
+	* array_info	- optional information obtained from gpiod_get_array()
 	* value_bitmap	- a bitmap to store the GPIOs' values (get) or
 			  a bitmap of values to assign to the GPIOs (set)
 
@@ -437,7 +435,7 @@ case, it will be handled by the GPIO subsystem automatically.  However, if the
 _DSD is not present, the mappings between GpioIo()/GpioInt() resources and GPIO
 connection IDs need to be provided by device drivers.
 
-For details refer to Documentation/acpi/gpio-properties.txt
+For details refer to Documentation/firmware-guide/acpi/gpio-properties.rst
 
 
 Interacting With the Legacy GPIO Subsystem
diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst
index 1ce7fcd0f989..921c71a3d683 100644
--- a/Documentation/driver-api/gpio/driver.rst
+++ b/Documentation/driver-api/gpio/driver.rst
@@ -235,7 +235,7 @@ means that a pull up or pull-down resistor is available on the output of the
 GPIO line, and this resistor is software controlled.
 
 In discrete designs, a pull-up or pull-down resistor is simply soldered on
-the circuit board. This is not something we deal or model in software. The
+the circuit board. This is not something we deal with or model in software. The
 most you will think about these lines is that they will very likely be
 configured as open drain or open source (see the section above).
 
@@ -292,18 +292,18 @@ We can divide GPIO irqchips in two broad categories:
 
 - HIERARCHICAL INTERRUPT CHIPS: this means that each GPIO line has a dedicated
   irq line to a parent interrupt controller one level up. There is no need
-  to inquire the GPIO hardware to figure out which line has figured, but it
-  may still be necessary to acknowledge the interrupt and set up the
-  configuration such as edge sensitivity.
+  to inquire the GPIO hardware to figure out which line has fired, but it
+  may still be necessary to acknowledge the interrupt and set up configuration
+  such as edge sensitivity.
 
 Realtime considerations: a realtime compliant GPIO driver should not use
 spinlock_t or any sleepable APIs (like PM runtime) as part of its irqchip
 implementation.
 
-- spinlock_t should be replaced with raw_spinlock_t [1].
+- spinlock_t should be replaced with raw_spinlock_t.[1]
 - If sleepable APIs have to be used, these can be done from the .irq_bus_lock()
   and .irq_bus_unlock() callbacks, as these are the only slowpath callbacks
-  on an irqchip. Create the callbacks if needed [2].
+  on an irqchip. Create the callbacks if needed.[2]
 
 
 Cascaded GPIO irqchips
@@ -361,7 +361,7 @@ Cascaded GPIO irqchips usually fall in one of three categories:
 
   Realtime considerations: this kind of handlers will be forced threaded on -RT,
   and as result the IRQ core will complain that generic_handle_irq() is called
-  with IRQ enabled and the same work around as for "CHAINED GPIO irqchips" can
+  with IRQ enabled and the same work-around as for "CHAINED GPIO irqchips" can
   be applied.
 
 - NESTED THREADED GPIO IRQCHIPS: these are off-chip GPIO expanders and any
@@ -399,7 +399,7 @@ symbol:
   will pass the struct gpio_chip* for the chip to all IRQ callbacks, so the
   callbacks need to embed the gpio_chip in its state container and obtain a
   pointer to the container using container_of().
-  (See Documentation/driver-model/design-patterns.txt)
+  (See Documentation/driver-api/driver-model/design-patterns.rst)
 
 - gpiochip_irqchip_add_nested(): adds a nested cascaded irqchip to a gpiochip,
   as discussed above regarding different types of cascaded irqchips. The
@@ -418,7 +418,7 @@ symbol:
 
 If there is a need to exclude certain GPIO lines from the IRQ domain handled by
 these helpers, we can set .irq.need_valid_mask of the gpiochip before
-[devm_]gpiochip_add_data() is called. This allocates an .irq.valid_mask with as
+``[devm_]gpiochip_add_data()`` is called. This allocates an .irq.valid_mask with as
 many bits set as there are GPIO lines in the chip, each bit representing line
 0..n-1. Drivers can exclude GPIO lines by clearing bits from this mask. The mask
 must be filled in before gpiochip_irqchip_add() or gpiochip_irqchip_add_nested()
diff --git a/Documentation/driver-api/iio/hw-consumer.rst b/Documentation/driver-api/iio/hw-consumer.rst
index e0fe0b98230e..819fb9edc005 100644
--- a/Documentation/driver-api/iio/hw-consumer.rst
+++ b/Documentation/driver-api/iio/hw-consumer.rst
@@ -45,7 +45,6 @@ A typical IIO HW consumer setup looks like this::
 
 More details
 ============
-.. kernel-doc:: include/linux/iio/hw-consumer.h
 .. kernel-doc:: drivers/iio/buffer/industrialio-hw-consumer.c
    :export:
 
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index d26308af6036..d12a80f386a6 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -14,8 +14,10 @@ available subsections can be seen below.
 .. toctree::
    :maxdepth: 2
 
+   driver-model/index
    basics
    infrastructure
+   early-userspace/index
    pm/index
    clk
    device-io
@@ -34,7 +36,9 @@ available subsections can be seen below.
    pci/index
    spi
    i2c
+   ipmb
    i3c/index
+   interconnect
    hsi
    edac
    scsi
@@ -42,8 +46,12 @@ available subsections can be seen below.
    target
    mtdnand
    miscellaneous
+   mei/index
+   mtd/index
+   mmc/index
+   nvdimm/index
    w1
-   rapidio
+   rapidio/index
    s390-drivers
    vme
    80211/index
@@ -51,13 +59,48 @@ available subsections can be seen below.
    firmware/index
    pinctl
    gpio/index
+   md/index
    misc_devices
+   nfc/index
    dmaengine/index
    slimbus
    soundwire/index
    fpga/index
    acpi/index
+   backlight/lp855x-driver.rst
+   bt8xxgpio
+   connector
+   console
+   dcdbas
+   dell_rbu
+   edid
+   eisa
+   isa
+   isapnp
    generic-counter
+   lightnvm-pblk
+   memory-devices/index
+   men-chameleon-bus
+   ntb
+   nvmem
+   parport-lowlevel
+   pps
+   ptp
+   phy/index
+   pti_intel_mid
+   pwm
+   rfkill
+   serial/index
+   sgi-ioc4
+   sm501
+   smsc_ece1099
+   switchtec
+   sync_file
+   vfio-mediated-device
+   vfio
+   xilinx/index
+   xillybus
+   zorro
 
 .. only::  subproject and html
 
diff --git a/Documentation/interconnect/interconnect.rst b/Documentation/driver-api/interconnect.rst
index b8107dcc4cd3..c3e004893796 100644
--- a/Documentation/interconnect/interconnect.rst
+++ b/Documentation/driver-api/interconnect.rst
@@ -89,6 +89,5 @@ Interconnect consumers
 
 Interconnect consumers are the clients which use the interconnect APIs to
 get paths between endpoints and set their bandwidth/latency/QoS requirements
-for these interconnect paths.
-
-.. kernel-doc:: include/linux/interconnect.h
+for these interconnect paths.  These interfaces are not currently
+documented.
diff --git a/Documentation/driver-api/ipmb.rst b/Documentation/driver-api/ipmb.rst
new file mode 100644
index 000000000000..7e2265144157
--- /dev/null
+++ b/Documentation/driver-api/ipmb.rst
@@ -0,0 +1,105 @@
+==============================
+IPMB Driver for a Satellite MC
+==============================
+
+The Intelligent Platform Management Bus or IPMB, is an
+I2C bus that provides a standardized interconnection between
+different boards within a chassis. This interconnection is
+between the baseboard management (BMC) and chassis electronics.
+IPMB is also associated with the messaging protocol through the
+IPMB bus.
+
+The devices using the IPMB are usually management
+controllers that perform management functions such as servicing
+the front panel interface, monitoring the baseboard,
+hot-swapping disk drivers in the system chassis, etc...
+
+When an IPMB is implemented in the system, the BMC serves as
+a controller to give system software access to the IPMB. The BMC
+sends IPMI requests to a device (usually a Satellite Management
+Controller or Satellite MC) via IPMB and the device
+sends a response back to the BMC.
+
+For more information on IPMB and the format of an IPMB message,
+refer to the IPMB and IPMI specifications.
+
+IPMB driver for Satellite MC
+----------------------------
+
+ipmb-dev-int - This is the driver needed on a Satellite MC to
+receive IPMB messages from a BMC and send a response back.
+This driver works with the I2C driver and a userspace
+program such as OpenIPMI:
+
+1) It is an I2C slave backend driver. So, it defines a callback
+   function to set the Satellite MC as an I2C slave.
+   This callback function handles the received IPMI requests.
+
+2) It defines the read and write functions to enable a user
+   space program (such as OpenIPMI) to communicate with the kernel.
+
+
+Load the IPMB driver
+--------------------
+
+The driver needs to be loaded at boot time or manually first.
+First, make sure you have the following in your config file:
+CONFIG_IPMB_DEVICE_INTERFACE=y
+
+1) If you want the driver to be loaded at boot time:
+
+a) Add this entry to your ACPI table, under the appropriate SMBus::
+
+     Device (SMB0) // Example SMBus host controller
+     {
+     Name (_HID, "<Vendor-Specific HID>") // Vendor-Specific HID
+     Name (_UID, 0) // Unique ID of particular host controller
+     :
+     :
+       Device (IPMB)
+       {
+         Name (_HID, "IPMB0001") // IPMB device interface
+         Name (_UID, 0) // Unique device identifier
+       }
+     }
+
+b) Example for device tree::
+
+     &i2c2 {
+            status = "okay";
+
+            ipmb@10 {
+                    compatible = "ipmb-dev";
+                    reg = <0x10>;
+            };
+     };
+
+2) Manually from Linux::
+
+     modprobe ipmb-dev-int
+
+
+Instantiate the device
+----------------------
+
+After loading the driver, you can instantiate the device as
+described in 'Documentation/i2c/instantiating-devices'.
+If you have multiple BMCs, each connected to your Satellite MC via
+a different I2C bus, you can instantiate a device for each of
+those BMCs.
+
+The name of the instantiated device contains the I2C bus number
+associated with it as follows::
+
+  BMC1 ------ IPMB/I2C bus 1 ---------|   /dev/ipmb-1
+				Satellite MC
+  BMC1 ------ IPMB/I2C bus 2 ---------|   /dev/ipmb-2
+
+For instance, you can instantiate the ipmb-dev-int device from
+user space at the 7 bit address 0x10 on bus 2::
+
+  # echo ipmb-dev 0x1010 > /sys/bus/i2c/devices/i2c-2/new_device
+
+This will create the device file /dev/ipmb-2, which can be accessed
+by the user space program. The device needs to be instantiated
+before running the user space program.
diff --git a/Documentation/isa.txt b/Documentation/driver-api/isa.rst
index def4a7b690b5..def4a7b690b5 100644
--- a/Documentation/isa.txt
+++ b/Documentation/driver-api/isa.rst
diff --git a/Documentation/isapnp.txt b/Documentation/driver-api/isapnp.rst
index 8d0840ac847b..8d0840ac847b 100644
--- a/Documentation/isapnp.txt
+++ b/Documentation/driver-api/isapnp.rst
diff --git a/Documentation/lightnvm/pblk.txt b/Documentation/driver-api/lightnvm-pblk.rst
index 1040ed1cec81..1040ed1cec81 100644
--- a/Documentation/lightnvm/pblk.txt
+++ b/Documentation/driver-api/lightnvm-pblk.rst
diff --git a/Documentation/driver-api/md/index.rst b/Documentation/driver-api/md/index.rst
new file mode 100644
index 000000000000..18f54a7d7d6e
--- /dev/null
+++ b/Documentation/driver-api/md/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+RAID
+====
+
+.. toctree::
+   :maxdepth: 1
+
+   md-cluster
+   raid5-cache
+   raid5-ppl
diff --git a/Documentation/driver-api/md/md-cluster.rst b/Documentation/driver-api/md/md-cluster.rst
new file mode 100644
index 000000000000..96eb52cec7eb
--- /dev/null
+++ b/Documentation/driver-api/md/md-cluster.rst
@@ -0,0 +1,385 @@
+==========
+MD Cluster
+==========
+
+The cluster MD is a shared-device RAID for a cluster, it supports
+two levels: raid1 and raid10 (limited support).
+
+
+1. On-disk format
+=================
+
+Separate write-intent-bitmaps are used for each cluster node.
+The bitmaps record all writes that may have been started on that node,
+and may not yet have finished. The on-disk layout is::
+
+  0                    4k                     8k                    12k
+  -------------------------------------------------------------------
+  | idle                | md super            | bm super [0] + bits |
+  | bm bits[0, contd]   | bm super[1] + bits  | bm bits[1, contd]   |
+  | bm super[2] + bits  | bm bits [2, contd]  | bm super[3] + bits  |
+  | bm bits [3, contd]  |                     |                     |
+
+During "normal" functioning we assume the filesystem ensures that only
+one node writes to any given block at a time, so a write request will
+
+ - set the appropriate bit (if not already set)
+ - commit the write to all mirrors
+ - schedule the bit to be cleared after a timeout.
+
+Reads are just handled normally. It is up to the filesystem to ensure
+one node doesn't read from a location where another node (or the same
+node) is writing.
+
+
+2. DLM Locks for management
+===========================
+
+There are three groups of locks for managing the device:
+
+2.1 Bitmap lock resource (bm_lockres)
+-------------------------------------
+
+ The bm_lockres protects individual node bitmaps. They are named in
+ the form bitmap000 for node 1, bitmap001 for node 2 and so on. When a
+ node joins the cluster, it acquires the lock in PW mode and it stays
+ so during the lifetime the node is part of the cluster. The lock
+ resource number is based on the slot number returned by the DLM
+ subsystem. Since DLM starts node count from one and bitmap slots
+ start from zero, one is subtracted from the DLM slot number to arrive
+ at the bitmap slot number.
+
+ The LVB of the bitmap lock for a particular node records the range
+ of sectors that are being re-synced by that node.  No other
+ node may write to those sectors.  This is used when a new nodes
+ joins the cluster.
+
+2.2 Message passing locks
+-------------------------
+
+ Each node has to communicate with other nodes when starting or ending
+ resync, and for metadata superblock updates.  This communication is
+ managed through three locks: "token", "message", and "ack", together
+ with the Lock Value Block (LVB) of one of the "message" lock.
+
+2.3 new-device management
+-------------------------
+
+ A single lock: "no-new-dev" is used to co-ordinate the addition of
+ new devices - this must be synchronized across the array.
+ Normally all nodes hold a concurrent-read lock on this device.
+
+3. Communication
+================
+
+ Messages can be broadcast to all nodes, and the sender waits for all
+ other nodes to acknowledge the message before proceeding.  Only one
+ message can be processed at a time.
+
+3.1 Message Types
+-----------------
+
+ There are six types of messages which are passed:
+
+3.1.1 METADATA_UPDATED
+^^^^^^^^^^^^^^^^^^^^^^
+
+   informs other nodes that the metadata has
+   been updated, and the node must re-read the md superblock. This is
+   performed synchronously. It is primarily used to signal device
+   failure.
+
+3.1.2 RESYNCING
+^^^^^^^^^^^^^^^
+   informs other nodes that a resync is initiated or
+   ended so that each node may suspend or resume the region.  Each
+   RESYNCING message identifies a range of the devices that the
+   sending node is about to resync. This overrides any previous
+   notification from that node: only one ranged can be resynced at a
+   time per-node.
+
+3.1.3 NEWDISK
+^^^^^^^^^^^^^
+
+   informs other nodes that a device is being added to
+   the array. Message contains an identifier for that device.  See
+   below for further details.
+
+3.1.4 REMOVE
+^^^^^^^^^^^^
+
+   A failed or spare device is being removed from the
+   array. The slot-number of the device is included in the message.
+
+ 3.1.5 RE_ADD:
+
+   A failed device is being re-activated - the assumption
+   is that it has been determined to be working again.
+
+ 3.1.6 BITMAP_NEEDS_SYNC:
+
+   If a node is stopped locally but the bitmap
+   isn't clean, then another node is informed to take the ownership of
+   resync.
+
+3.2 Communication mechanism
+---------------------------
+
+ The DLM LVB is used to communicate within nodes of the cluster. There
+ are three resources used for the purpose:
+
+3.2.1 token
+^^^^^^^^^^^
+   The resource which protects the entire communication
+   system. The node having the token resource is allowed to
+   communicate.
+
+3.2.2 message
+^^^^^^^^^^^^^
+   The lock resource which carries the data to communicate.
+
+3.2.3 ack
+^^^^^^^^^
+
+   The resource, acquiring which means the message has been
+   acknowledged by all nodes in the cluster. The BAST of the resource
+   is used to inform the receiving node that a node wants to
+   communicate.
+
+The algorithm is:
+
+ 1. receive status - all nodes have concurrent-reader lock on "ack"::
+
+	sender                         receiver                 receiver
+	"ack":CR                       "ack":CR                 "ack":CR
+
+ 2. sender get EX on "token",
+    sender get EX on "message"::
+
+	sender                        receiver                 receiver
+	"token":EX                    "ack":CR                 "ack":CR
+	"message":EX
+	"ack":CR
+
+    Sender checks that it still needs to send a message. Messages
+    received or other events that happened while waiting for the
+    "token" may have made this message inappropriate or redundant.
+
+ 3. sender writes LVB
+
+    sender down-convert "message" from EX to CW
+
+    sender try to get EX of "ack"
+
+    ::
+
+      [ wait until all receivers have *processed* the "message" ]
+
+                                       [ triggered by bast of "ack" ]
+                                       receiver get CR on "message"
+                                       receiver read LVB
+                                       receiver processes the message
+                                       [ wait finish ]
+                                       receiver releases "ack"
+                                       receiver tries to get PR on "message"
+
+     sender                         receiver                  receiver
+     "token":EX                     "message":CR              "message":CR
+     "message":CW
+     "ack":EX
+
+ 4. triggered by grant of EX on "ack" (indicating all receivers
+    have processed message)
+
+    sender down-converts "ack" from EX to CR
+
+    sender releases "message"
+
+    sender releases "token"
+
+    ::
+
+                                 receiver upconvert to PR on "message"
+                                 receiver get CR of "ack"
+                                 receiver release "message"
+
+     sender                      receiver                   receiver
+     "ack":CR                    "ack":CR                   "ack":CR
+
+
+4. Handling Failures
+====================
+
+4.1 Node Failure
+----------------
+
+ When a node fails, the DLM informs the cluster with the slot
+ number. The node starts a cluster recovery thread. The cluster
+ recovery thread:
+
+	- acquires the bitmap<number> lock of the failed node
+	- opens the bitmap
+	- reads the bitmap of the failed node
+	- copies the set bitmap to local node
+	- cleans the bitmap of the failed node
+	- releases bitmap<number> lock of the failed node
+	- initiates resync of the bitmap on the current node
+	  md_check_recovery is invoked within recover_bitmaps,
+	  then md_check_recovery -> metadata_update_start/finish,
+	  it will lock the communication by lock_comm.
+	  Which means when one node is resyncing it blocks all
+	  other nodes from writing anywhere on the array.
+
+ The resync process is the regular md resync. However, in a clustered
+ environment when a resync is performed, it needs to tell other nodes
+ of the areas which are suspended. Before a resync starts, the node
+ send out RESYNCING with the (lo,hi) range of the area which needs to
+ be suspended. Each node maintains a suspend_list, which contains the
+ list of ranges which are currently suspended. On receiving RESYNCING,
+ the node adds the range to the suspend_list. Similarly, when the node
+ performing resync finishes, it sends RESYNCING with an empty range to
+ other nodes and other nodes remove the corresponding entry from the
+ suspend_list.
+
+ A helper function, ->area_resyncing() can be used to check if a
+ particular I/O range should be suspended or not.
+
+4.2 Device Failure
+==================
+
+ Device failures are handled and communicated with the metadata update
+ routine.  When a node detects a device failure it does not allow
+ any further writes to that device until the failure has been
+ acknowledged by all other nodes.
+
+5. Adding a new Device
+----------------------
+
+ For adding a new device, it is necessary that all nodes "see" the new
+ device to be added. For this, the following algorithm is used:
+
+   1.  Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues
+       ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD)
+   2.  Node 1 sends a NEWDISK message with uuid and slot number
+   3.  Other nodes issue kobject_uevent_env with uuid and slot number
+       (Steps 4,5 could be a udev rule)
+   4.  In userspace, the node searches for the disk, perhaps
+       using blkid -t SUB_UUID=""
+   5.  Other nodes issue either of the following depending on whether
+       the disk was found:
+       ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and
+       disc.number set to slot number)
+       ioctl(CLUSTERED_DISK_NACK)
+   6.  Other nodes drop lock on "no-new-devs" (CR) if device is found
+   7.  Node 1 attempts EX lock on "no-new-dev"
+   8.  If node 1 gets the lock, it sends METADATA_UPDATED after
+       unmarking the disk as SpareLocal
+   9.  If not (get "no-new-dev" lock), it fails the operation and sends
+       METADATA_UPDATED.
+   10. Other nodes get the information whether a disk is added or not
+       by the following METADATA_UPDATED.
+
+6. Module interface
+===================
+
+ There are 17 call-backs which the md core can make to the cluster
+ module.  Understanding these can give a good overview of the whole
+ process.
+
+6.1 join(nodes) and leave()
+---------------------------
+
+ These are called when an array is started with a clustered bitmap,
+ and when the array is stopped.  join() ensures the cluster is
+ available and initializes the various resources.
+ Only the first 'nodes' nodes in the cluster can use the array.
+
+6.2 slot_number()
+-----------------
+
+ Reports the slot number advised by the cluster infrastructure.
+ Range is from 0 to nodes-1.
+
+6.3 resync_info_update()
+------------------------
+
+ This updates the resync range that is stored in the bitmap lock.
+ The starting point is updated as the resync progresses.  The
+ end point is always the end of the array.
+ It does *not* send a RESYNCING message.
+
+6.4 resync_start(), resync_finish()
+-----------------------------------
+
+ These are called when resync/recovery/reshape starts or stops.
+ They update the resyncing range in the bitmap lock and also
+ send a RESYNCING message.  resync_start reports the whole
+ array as resyncing, resync_finish reports none of it.
+
+ resync_finish() also sends a BITMAP_NEEDS_SYNC message which
+ allows some other node to take over.
+
+6.5 metadata_update_start(), metadata_update_finish(), metadata_update_cancel()
+-------------------------------------------------------------------------------
+
+ metadata_update_start is used to get exclusive access to
+ the metadata.  If a change is still needed once that access is
+ gained, metadata_update_finish() will send a METADATA_UPDATE
+ message to all other nodes, otherwise metadata_update_cancel()
+ can be used to release the lock.
+
+6.6 area_resyncing()
+--------------------
+
+ This combines two elements of functionality.
+
+ Firstly, it will check if any node is currently resyncing
+ anything in a given range of sectors.  If any resync is found,
+ then the caller will avoid writing or read-balancing in that
+ range.
+
+ Secondly, while node recovery is happening it reports that
+ all areas are resyncing for READ requests.  This avoids races
+ between the cluster-filesystem and the cluster-RAID handling
+ a node failure.
+
+6.7 add_new_disk_start(), add_new_disk_finish(), new_disk_ack()
+---------------------------------------------------------------
+
+ These are used to manage the new-disk protocol described above.
+ When a new device is added, add_new_disk_start() is called before
+ it is bound to the array and, if that succeeds, add_new_disk_finish()
+ is called the device is fully added.
+
+ When a device is added in acknowledgement to a previous
+ request, or when the device is declared "unavailable",
+ new_disk_ack() is called.
+
+6.8 remove_disk()
+-----------------
+
+ This is called when a spare or failed device is removed from
+ the array.  It causes a REMOVE message to be send to other nodes.
+
+6.9 gather_bitmaps()
+--------------------
+
+ This sends a RE_ADD message to all other nodes and then
+ gathers bitmap information from all bitmaps.  This combined
+ bitmap is then used to recovery the re-added device.
+
+6.10 lock_all_bitmaps() and unlock_all_bitmaps()
+------------------------------------------------
+
+ These are called when change bitmap to none. If a node plans
+ to clear the cluster raid's bitmap, it need to make sure no other
+ nodes are using the raid which is achieved by lock all bitmap
+ locks within the cluster, and also those locks are unlocked
+ accordingly.
+
+7. Unsupported features
+=======================
+
+There are somethings which are not supported by cluster MD yet.
+
+- change array_sectors.
diff --git a/Documentation/driver-api/md/raid5-cache.rst b/Documentation/driver-api/md/raid5-cache.rst
new file mode 100644
index 000000000000..d7a15f44a7c3
--- /dev/null
+++ b/Documentation/driver-api/md/raid5-cache.rst
@@ -0,0 +1,111 @@
+================
+RAID 4/5/6 cache
+================
+
+Raid 4/5/6 could include an extra disk for data cache besides normal RAID
+disks. The role of RAID disks isn't changed with the cache disk. The cache disk
+caches data to the RAID disks. The cache can be in write-through (supported
+since 4.4) or write-back mode (supported since 4.10). mdadm (supported since
+3.4) has a new option '--write-journal' to create array with cache. Please
+refer to mdadm manual for details. By default (RAID array starts), the cache is
+in write-through mode. A user can switch it to write-back mode by::
+
+	echo "write-back" > /sys/block/md0/md/journal_mode
+
+And switch it back to write-through mode by::
+
+	echo "write-through" > /sys/block/md0/md/journal_mode
+
+In both modes, all writes to the array will hit cache disk first. This means
+the cache disk must be fast and sustainable.
+
+write-through mode
+==================
+
+This mode mainly fixes the 'write hole' issue. For RAID 4/5/6 array, an unclean
+shutdown can cause data in some stripes to not be in consistent state, eg, data
+and parity don't match. The reason is that a stripe write involves several RAID
+disks and it's possible the writes don't hit all RAID disks yet before the
+unclean shutdown. We call an array degraded if it has inconsistent data. MD
+tries to resync the array to bring it back to normal state. But before the
+resync completes, any system crash will expose the chance of real data
+corruption in the RAID array. This problem is called 'write hole'.
+
+The write-through cache will cache all data on cache disk first. After the data
+is safe on the cache disk, the data will be flushed onto RAID disks. The
+two-step write will guarantee MD can recover correct data after unclean
+shutdown even the array is degraded. Thus the cache can close the 'write hole'.
+
+In write-through mode, MD reports IO completion to upper layer (usually
+filesystems) after the data is safe on RAID disks, so cache disk failure
+doesn't cause data loss. Of course cache disk failure means the array is
+exposed to 'write hole' again.
+
+In write-through mode, the cache disk isn't required to be big. Several
+hundreds megabytes are enough.
+
+write-back mode
+===============
+
+write-back mode fixes the 'write hole' issue too, since all write data is
+cached on cache disk. But the main goal of 'write-back' cache is to speed up
+write. If a write crosses all RAID disks of a stripe, we call it full-stripe
+write. For non-full-stripe writes, MD must read old data before the new parity
+can be calculated. These synchronous reads hurt write throughput. Some writes
+which are sequential but not dispatched in the same time will suffer from this
+overhead too. Write-back cache will aggregate the data and flush the data to
+RAID disks only after the data becomes a full stripe write. This will
+completely avoid the overhead, so it's very helpful for some workloads. A
+typical workload which does sequential write followed by fsync is an example.
+
+In write-back mode, MD reports IO completion to upper layer (usually
+filesystems) right after the data hits cache disk. The data is flushed to raid
+disks later after specific conditions met. So cache disk failure will cause
+data loss.
+
+In write-back mode, MD also caches data in memory. The memory cache includes
+the same data stored on cache disk, so a power loss doesn't cause data loss.
+The memory cache size has performance impact for the array. It's recommended
+the size is big. A user can configure the size by::
+
+	echo "2048" > /sys/block/md0/md/stripe_cache_size
+
+Too small cache disk will make the write aggregation less efficient in this
+mode depending on the workloads. It's recommended to use a cache disk with at
+least several gigabytes size in write-back mode.
+
+The implementation
+==================
+
+The write-through and write-back cache use the same disk format. The cache disk
+is organized as a simple write log. The log consists of 'meta data' and 'data'
+pairs. The meta data describes the data. It also includes checksum and sequence
+ID for recovery identification. Data can be IO data and parity data. Data is
+checksumed too. The checksum is stored in the meta data ahead of the data. The
+checksum is an optimization because MD can write meta and data freely without
+worry about the order. MD superblock has a field pointed to the valid meta data
+of log head.
+
+The log implementation is pretty straightforward. The difficult part is the
+order in which MD writes data to cache disk and RAID disks. Specifically, in
+write-through mode, MD calculates parity for IO data, writes both IO data and
+parity to the log, writes the data and parity to RAID disks after the data and
+parity is settled down in log and finally the IO is finished. Read just reads
+from raid disks as usual.
+
+In write-back mode, MD writes IO data to the log and reports IO completion. The
+data is also fully cached in memory at that time, which means read must query
+memory cache. If some conditions are met, MD will flush the data to RAID disks.
+MD will calculate parity for the data and write parity into the log. After this
+is finished, MD will write both data and parity into RAID disks, then MD can
+release the memory cache. The flush conditions could be stripe becomes a full
+stripe write, free cache disk space is low or free in-kernel memory cache space
+is low.
+
+After an unclean shutdown, MD does recovery. MD reads all meta data and data
+from the log. The sequence ID and checksum will help us detect corrupted meta
+data and data. If MD finds a stripe with data and valid parities (1 parity for
+raid4/5 and 2 for raid6), MD will write the data and parities to RAID disks. If
+parities are incompleted, they are discarded. If part of data is corrupted,
+they are discarded too. MD then loads valid data and writes them to RAID disks
+in normal way.
diff --git a/Documentation/driver-api/md/raid5-ppl.rst b/Documentation/driver-api/md/raid5-ppl.rst
new file mode 100644
index 000000000000..357e5515bc55
--- /dev/null
+++ b/Documentation/driver-api/md/raid5-ppl.rst
@@ -0,0 +1,47 @@
+==================
+Partial Parity Log
+==================
+
+Partial Parity Log (PPL) is a feature available for RAID5 arrays. The issue
+addressed by PPL is that after a dirty shutdown, parity of a particular stripe
+may become inconsistent with data on other member disks. If the array is also
+in degraded state, there is no way to recalculate parity, because one of the
+disks is missing. This can lead to silent data corruption when rebuilding the
+array or using it is as degraded - data calculated from parity for array blocks
+that have not been touched by a write request during the unclean shutdown can
+be incorrect. Such condition is known as the RAID5 Write Hole. Because of
+this, md by default does not allow starting a dirty degraded array.
+
+Partial parity for a write operation is the XOR of stripe data chunks not
+modified by this write. It is just enough data needed for recovering from the
+write hole. XORing partial parity with the modified chunks produces parity for
+the stripe, consistent with its state before the write operation, regardless of
+which chunk writes have completed. If one of the not modified data disks of
+this stripe is missing, this updated parity can be used to recover its
+contents. PPL recovery is also performed when starting an array after an
+unclean shutdown and all disks are available, eliminating the need to resync
+the array. Because of this, using write-intent bitmap and PPL together is not
+supported.
+
+When handling a write request PPL writes partial parity before new data and
+parity are dispatched to disks. PPL is a distributed log - it is stored on
+array member drives in the metadata area, on the parity drive of a particular
+stripe.  It does not require a dedicated journaling drive. Write performance is
+reduced by up to 30%-40% but it scales with the number of drives in the array
+and the journaling drive does not become a bottleneck or a single point of
+failure.
+
+Unlike raid5-cache, the other solution in md for closing the write hole, PPL is
+not a true journal. It does not protect from losing in-flight data, only from
+silent data corruption. If a dirty disk of a stripe is lost, no PPL recovery is
+performed for this stripe (parity is not updated). So it is possible to have
+arbitrary data in the written part of a stripe if that disk is lost. In such
+case the behavior is the same as in plain raid5.
+
+PPL is available for md version-1 metadata and external (specifically IMSM)
+metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl.
+
+There is a limitation of maximum 64 disks in the array for PPL. It allows to
+keep data structures and implementation simple. RAID5 arrays with so many disks
+are not likely due to high risk of multiple disks failure. Such restriction
+should not be a real life limitation.
diff --git a/Documentation/driver-api/mei/hdcp.rst b/Documentation/driver-api/mei/hdcp.rst
new file mode 100644
index 000000000000..e85a065b1cdc
--- /dev/null
+++ b/Documentation/driver-api/mei/hdcp.rst
@@ -0,0 +1,32 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+HDCP:
+=====
+
+ME FW as a security engine provides the capability for setting up
+HDCP2.2 protocol negotiation between the Intel graphics device and
+an HDC2.2 sink.
+
+ME FW prepares HDCP2.2 negotiation parameters, signs and encrypts them
+according the HDCP 2.2 spec. The Intel graphics sends the created blob
+to the HDCP2.2 sink.
+
+Similarly, the HDCP2.2 sink's response is transferred to ME FW
+for decryption and verification.
+
+Once all the steps of HDCP2.2 negotiation are completed,
+upon request ME FW will configure the port as authenticated and supply
+the HDCP encryption keys to Intel graphics hardware.
+
+
+mei_hdcp driver
+---------------
+.. kernel-doc:: drivers/misc/mei/hdcp/mei_hdcp.c
+    :doc: MEI_HDCP Client Driver
+
+mei_hdcp api
+------------
+
+.. kernel-doc:: drivers/misc/mei/hdcp/mei_hdcp.c
+    :functions:
+
diff --git a/Documentation/driver-api/mei/iamt.rst b/Documentation/driver-api/mei/iamt.rst
new file mode 100644
index 000000000000..6ef3e613684b
--- /dev/null
+++ b/Documentation/driver-api/mei/iamt.rst
@@ -0,0 +1,101 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Intel(R) Active Management Technology (Intel AMT)
+=================================================
+
+Prominent usage of the Intel ME Interface is to communicate with Intel(R)
+Active Management Technology (Intel AMT) implemented in firmware running on
+the Intel ME.
+
+Intel AMT provides the ability to manage a host remotely out-of-band (OOB)
+even when the operating system running on the host processor has crashed or
+is in a sleep state.
+
+Some examples of Intel AMT usage are:
+   - Monitoring hardware state and platform components
+   - Remote power off/on (useful for green computing or overnight IT
+     maintenance)
+   - OS updates
+   - Storage of useful platform information such as software assets
+   - Built-in hardware KVM
+   - Selective network isolation of Ethernet and IP protocol flows based
+     on policies set by a remote management console
+   - IDE device redirection from remote management console
+
+Intel AMT (OOB) communication is based on SOAP (deprecated
+starting with Release 6.0) over HTTP/S or WS-Management protocol over
+HTTP/S that are received from a remote management console application.
+
+For more information about Intel AMT:
+https://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide/default.htm
+
+
+Intel AMT Applications
+----------------------
+
+    1) Intel Local Management Service (Intel LMS)
+
+       Applications running locally on the platform communicate with Intel AMT Release
+       2.0 and later releases in the same way that network applications do via SOAP
+       over HTTP (deprecated starting with Release 6.0) or with WS-Management over
+       SOAP over HTTP. This means that some Intel AMT features can be accessed from a
+       local application using the same network interface as a remote application
+       communicating with Intel AMT over the network.
+
+       When a local application sends a message addressed to the local Intel AMT host
+       name, the Intel LMS, which listens for traffic directed to the host name,
+       intercepts the message and routes it to the Intel MEI.
+       For more information:
+       https://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide/default.htm
+       Under "About Intel AMT" => "Local Access"
+
+       For downloading Intel LMS:
+       https://github.com/intel/lms
+
+       The Intel LMS opens a connection using the Intel MEI driver to the Intel LMS
+       firmware feature using a defined GUID and then communicates with the feature
+       using a protocol called Intel AMT Port Forwarding Protocol (Intel APF protocol).
+       The protocol is used to maintain multiple sessions with Intel AMT from a
+       single application.
+
+       See the protocol specification in the Intel AMT Software Development Kit (SDK)
+       https://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide/default.htm
+       Under "SDK Resources" => "Intel(R) vPro(TM) Gateway (MPS)"
+       => "Information for Intel(R) vPro(TM) Gateway Developers"
+       => "Description of the Intel AMT Port Forwarding (APF) Protocol"
+
+    2) Intel AMT Remote configuration using a Local Agent
+
+       A Local Agent enables IT personnel to configure Intel AMT out-of-the-box
+       without requiring installing additional data to enable setup. The remote
+       configuration process may involve an ISV-developed remote configuration
+       agent that runs on the host.
+       For more information:
+       https://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide/default.htm
+       Under "Setup and Configuration of Intel AMT" =>
+       "SDK Tools Supporting Setup and Configuration" =>
+       "Using the Local Agent Sample"
+
+Intel AMT OS Health Watchdog
+----------------------------
+
+The Intel AMT Watchdog is an OS Health (Hang/Crash) watchdog.
+Whenever the OS hangs or crashes, Intel AMT will send an event
+to any subscriber to this event. This mechanism means that
+IT knows when a platform crashes even when there is a hard failure on the host.
+
+The Intel AMT Watchdog is composed of two parts:
+    1) Firmware feature - receives the heartbeats
+       and sends an event when the heartbeats stop.
+    2) Intel MEI iAMT watchdog driver - connects to the watchdog feature,
+       configures the watchdog and sends the heartbeats.
+
+The Intel iAMT watchdog MEI driver uses the kernel watchdog API to configure
+the Intel AMT Watchdog and to send heartbeats to it. The default timeout of the
+watchdog is 120 seconds.
+
+If the Intel AMT is not enabled in the firmware then the watchdog client won't enumerate
+on the me client bus and watchdog devices won't be exposed.
+
+---
+linux-mei@linux.intel.com
diff --git a/Documentation/driver-api/mei/index.rst b/Documentation/driver-api/mei/index.rst
new file mode 100644
index 000000000000..3a22b522ee78
--- /dev/null
+++ b/Documentation/driver-api/mei/index.rst
@@ -0,0 +1,23 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. include:: <isonum.txt>
+
+===================================================
+Intel(R) Management Engine Interface (Intel(R) MEI)
+===================================================
+
+**Copyright** |copy| 2019 Intel Corporation
+
+
+.. only:: html
+
+   .. class:: toc-title
+
+        Table of Contents
+
+.. toctree::
+   :maxdepth: 3
+
+   mei
+   mei-client-bus
+   iamt
diff --git a/Documentation/driver-api/mei/mei-client-bus.rst b/Documentation/driver-api/mei/mei-client-bus.rst
new file mode 100644
index 000000000000..f242b3f8d6aa
--- /dev/null
+++ b/Documentation/driver-api/mei/mei-client-bus.rst
@@ -0,0 +1,168 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================================
+Intel(R) Management Engine (ME) Client bus API
+==============================================
+
+
+Rationale
+=========
+
+The MEI character device is useful for dedicated applications to send and receive
+data to the many FW appliance found in Intel's ME from the user space.
+However, for some of the ME functionalities it makes sense to leverage existing software
+stack and expose them through existing kernel subsystems.
+
+In order to plug seamlessly into the kernel device driver model we add kernel virtual
+bus abstraction on top of the MEI driver. This allows implementing Linux kernel drivers
+for the various MEI features as a stand alone entities found in their respective subsystem.
+Existing device drivers can even potentially be re-used by adding an MEI CL bus layer to
+the existing code.
+
+
+MEI CL bus API
+==============
+
+A driver implementation for an MEI Client is very similar to any other existing bus
+based device drivers. The driver registers itself as an MEI CL bus driver through
+the ``struct mei_cl_driver`` structure defined in :file:`include/linux/mei_cl_bus.c`
+
+.. code-block:: C
+
+        struct mei_cl_driver {
+                struct device_driver driver;
+                const char *name;
+
+                const struct mei_cl_device_id *id_table;
+
+                int (*probe)(struct mei_cl_device *dev, const struct mei_cl_id *id);
+                int (*remove)(struct mei_cl_device *dev);
+        };
+
+
+
+The mei_cl_device_id structure defined in :file:`include/linux/mod_devicetable.h` allows a
+driver to bind itself against a device name.
+
+.. code-block:: C
+
+        struct mei_cl_device_id {
+                char name[MEI_CL_NAME_SIZE];
+                uuid_le uuid;
+                __u8    version;
+                kernel_ulong_t driver_info;
+        };
+
+To actually register a driver on the ME Client bus one must call the :c:func:`mei_cl_add_driver`
+API. This is typically called at module initialization time.
+
+Once the driver is registered and bound to the device, a driver will typically
+try to do some I/O on this bus and this should be done through the :c:func:`mei_cl_send`
+and :c:func:`mei_cl_recv` functions. More detailed information is in :ref:`api` section.
+
+In order for a driver to be notified about pending traffic or event, the driver
+should register a callback via :c:func:`mei_cl_devev_register_rx_cb` and
+:c:func:`mei_cldev_register_notify_cb` function respectively.
+
+.. _api:
+
+API:
+----
+.. kernel-doc:: drivers/misc/mei/bus.c
+    :export: drivers/misc/mei/bus.c
+
+
+
+Example
+=======
+
+As a theoretical example let's pretend the ME comes with a "contact" NFC IP.
+The driver init and exit routines for this device would look like:
+
+.. code-block:: C
+
+        #define CONTACT_DRIVER_NAME "contact"
+
+        static struct mei_cl_device_id contact_mei_cl_tbl[] = {
+                { CONTACT_DRIVER_NAME, },
+
+                /* required last entry */
+                { }
+        };
+        MODULE_DEVICE_TABLE(mei_cl, contact_mei_cl_tbl);
+
+        static struct mei_cl_driver contact_driver = {
+                .id_table = contact_mei_tbl,
+                .name = CONTACT_DRIVER_NAME,
+
+                .probe = contact_probe,
+                .remove = contact_remove,
+        };
+
+        static int contact_init(void)
+        {
+                int r;
+
+                r = mei_cl_driver_register(&contact_driver);
+                if (r) {
+                        pr_err(CONTACT_DRIVER_NAME ": driver registration failed\n");
+                        return r;
+                }
+
+                return 0;
+        }
+
+        static void __exit contact_exit(void)
+        {
+                mei_cl_driver_unregister(&contact_driver);
+        }
+
+        module_init(contact_init);
+        module_exit(contact_exit);
+
+And the driver's simplified probe routine would look like that:
+
+.. code-block:: C
+
+        int contact_probe(struct mei_cl_device *dev, struct mei_cl_device_id *id)
+        {
+                [...]
+                mei_cldev_enable(dev);
+
+                mei_cldev_register_rx_cb(dev, contact_rx_cb);
+
+                return 0;
+        }
+
+In the probe routine the driver first enable the MEI device and then registers
+an rx handler which is as close as it can get to registering a threaded IRQ handler.
+The handler implementation will typically call :c:func:`mei_cldev_recv` and then
+process received data.
+
+.. code-block:: C
+
+        #define MAX_PAYLOAD 128
+        #define HDR_SIZE 4
+        static void conntact_rx_cb(struct mei_cl_device *cldev)
+        {
+                struct contact *c = mei_cldev_get_drvdata(cldev);
+                unsigned char payload[MAX_PAYLOAD];
+                ssize_t payload_sz;
+
+                payload_sz = mei_cldev_recv(cldev, payload,  MAX_PAYLOAD)
+                if (reply_size < HDR_SIZE) {
+                        return;
+                }
+
+                c->process_rx(payload);
+
+        }
+
+MEI Client Bus Drivers
+======================
+
+.. toctree::
+   :maxdepth: 2
+
+   hdcp
+   nfc
diff --git a/Documentation/driver-api/mei/mei.rst b/Documentation/driver-api/mei/mei.rst
new file mode 100644
index 000000000000..c800d8e5f422
--- /dev/null
+++ b/Documentation/driver-api/mei/mei.rst
@@ -0,0 +1,176 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Introduction
+============
+
+The Intel Management Engine (Intel ME) is an isolated and protected computing
+resource (Co-processor) residing inside certain Intel chipsets. The Intel ME
+provides support for computer/IT management and security features.
+The actual feature set depends on the Intel chipset SKU.
+
+The Intel Management Engine Interface (Intel MEI, previously known as HECI)
+is the interface between the Host and Intel ME. This interface is exposed
+to the host as a PCI device, actually multiple PCI devices might be exposed.
+The Intel MEI Driver is in charge of the communication channel between
+a host application and the Intel ME features.
+
+Each Intel ME feature, or Intel ME Client is addressed by a unique GUID and
+each client has its own protocol. The protocol is message-based with a
+header and payload up to maximal number of bytes advertised by the client,
+upon connection.
+
+Intel MEI Driver
+================
+
+The driver exposes a character device with device nodes /dev/meiX.
+
+An application maintains communication with an Intel ME feature while
+/dev/meiX is open. The binding to a specific feature is performed by calling
+:c:macro:`MEI_CONNECT_CLIENT_IOCTL`, which passes the desired GUID.
+The number of instances of an Intel ME feature that can be opened
+at the same time depends on the Intel ME feature, but most of the
+features allow only a single instance.
+
+The driver is transparent to data that are passed between firmware feature
+and host application.
+
+Because some of the Intel ME features can change the system
+configuration, the driver by default allows only a privileged
+user to access it.
+
+The session is terminated calling :c:func:`close(int fd)`.
+
+A code snippet for an application communicating with Intel AMTHI client:
+
+.. code-block:: C
+
+	struct mei_connect_client_data data;
+	fd = open(MEI_DEVICE);
+
+	data.d.in_client_uuid = AMTHI_GUID;
+
+	ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &data);
+
+	printf("Ver=%d, MaxLen=%ld\n",
+	       data.d.in_client_uuid.protocol_version,
+	       data.d.in_client_uuid.max_msg_length);
+
+	[...]
+
+	write(fd, amthi_req_data, amthi_req_data_len);
+
+	[...]
+
+	read(fd, &amthi_res_data, amthi_res_data_len);
+
+	[...]
+	close(fd);
+
+
+User space API
+
+IOCTLs:
+=======
+
+The Intel MEI Driver supports the following IOCTL commands:
+
+IOCTL_MEI_CONNECT_CLIENT
+-------------------------
+Connect to firmware Feature/Client.
+
+.. code-block:: none
+
+	Usage:
+
+        struct mei_connect_client_data client_data;
+
+        ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &client_data);
+
+	Inputs:
+
+        struct mei_connect_client_data - contain the following
+	Input field:
+
+		in_client_uuid -	GUID of the FW Feature that needs
+					to connect to.
+         Outputs:
+		out_client_properties - Client Properties: MTU and Protocol Version.
+
+         Error returns:
+
+                ENOTTY  No such client (i.e. wrong GUID) or connection is not allowed.
+		EINVAL	Wrong IOCTL Number
+		ENODEV	Device or Connection is not initialized or ready.
+		ENOMEM	Unable to allocate memory to client internal data.
+		EFAULT	Fatal Error (e.g. Unable to access user input data)
+		EBUSY	Connection Already Open
+
+:Note:
+        max_msg_length (MTU) in client properties describes the maximum
+        data that can be sent or received. (e.g. if MTU=2K, can send
+        requests up to bytes 2k and received responses up to 2k bytes).
+
+
+IOCTL_MEI_NOTIFY_SET
+---------------------
+Enable or disable event notifications.
+
+
+.. code-block:: none
+
+	Usage:
+
+		uint32_t enable;
+
+		ioctl(fd, IOCTL_MEI_NOTIFY_SET, &enable);
+
+
+		uint32_t enable = 1;
+		or
+		uint32_t enable[disable] = 0;
+
+	Error returns:
+
+
+		EINVAL	Wrong IOCTL Number
+		ENODEV	Device  is not initialized or the client not connected
+		ENOMEM	Unable to allocate memory to client internal data.
+		EFAULT	Fatal Error (e.g. Unable to access user input data)
+		EOPNOTSUPP if the device doesn't support the feature
+
+:Note:
+	The client must be connected in order to enable notification events
+
+
+IOCTL_MEI_NOTIFY_GET
+--------------------
+Retrieve event
+
+.. code-block:: none
+
+	Usage:
+		uint32_t event;
+		ioctl(fd, IOCTL_MEI_NOTIFY_GET, &event);
+
+	Outputs:
+		1 - if an event is pending
+		0 - if there is no even pending
+
+	Error returns:
+		EINVAL	Wrong IOCTL Number
+		ENODEV	Device is not initialized or the client not connected
+		ENOMEM	Unable to allocate memory to client internal data.
+		EFAULT	Fatal Error (e.g. Unable to access user input data)
+		EOPNOTSUPP if the device doesn't support the feature
+
+:Note:
+	The client must be connected and event notification has to be enabled
+	in order to receive an event
+
+
+
+Supported Chipsets
+==================
+82X38/X48 Express and newer
+
+linux-mei@linux.intel.com
diff --git a/Documentation/driver-api/mei/nfc.rst b/Documentation/driver-api/mei/nfc.rst
new file mode 100644
index 000000000000..b5b6fc96f85e
--- /dev/null
+++ b/Documentation/driver-api/mei/nfc.rst
@@ -0,0 +1,28 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+MEI NFC
+-------
+
+Some Intel 8 and 9 Serieses chipsets supports NFC devices connected behind
+the Intel Management Engine controller.
+MEI client bus exposes the NFC chips as NFC phy devices and enables
+binding with Microread and NXP PN544 NFC device driver from the Linux NFC
+subsystem.
+
+.. kernel-render:: DOT
+   :alt: MEI NFC digraph
+   :caption: **MEI NFC** Stack
+
+   digraph NFC {
+    cl_nfc -> me_cl_nfc;
+    "drivers/nfc/mei_phy" -> cl_nfc [lhead=bus];
+    "drivers/nfc/microread/mei" -> cl_nfc;
+    "drivers/nfc/microread/mei" -> "drivers/nfc/mei_phy";
+    "drivers/nfc/pn544/mei" -> cl_nfc;
+    "drivers/nfc/pn544/mei" -> "drivers/nfc/mei_phy";
+    "net/nfc" -> "drivers/nfc/microread/mei";
+    "net/nfc" -> "drivers/nfc/pn544/mei";
+    "neard" -> "net/nfc";
+    cl_nfc [label="mei/bus(nfc)"];
+    me_cl_nfc [label="me fw (nfc)"];
+   }
diff --git a/Documentation/driver-api/memory-devices/index.rst b/Documentation/driver-api/memory-devices/index.rst
new file mode 100644
index 000000000000..28101458cda5
--- /dev/null
+++ b/Documentation/driver-api/memory-devices/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================
+Memory Controller drivers
+=========================
+
+.. toctree::
+    :maxdepth: 1
+
+    ti-emif
+    ti-gpmc
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/driver-api/memory-devices/ti-emif.rst b/Documentation/driver-api/memory-devices/ti-emif.rst
new file mode 100644
index 000000000000..dea2ad9bcd7e
--- /dev/null
+++ b/Documentation/driver-api/memory-devices/ti-emif.rst
@@ -0,0 +1,64 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============================
+TI EMIF SDRAM Controller Driver
+===============================
+
+Author
+======
+Aneesh V <aneesh@ti.com>
+
+Location
+========
+driver/memory/emif.c
+
+Supported SoCs:
+===============
+TI OMAP44xx
+TI OMAP54xx
+
+Menuconfig option:
+==================
+Device Drivers
+	Memory devices
+		Texas Instruments EMIF driver
+
+Description
+===========
+This driver is for the EMIF module available in Texas Instruments
+SoCs. EMIF is an SDRAM controller that, based on its revision,
+supports one or more of DDR2, DDR3, and LPDDR2 SDRAM protocols.
+This driver takes care of only LPDDR2 memories presently. The
+functions of the driver includes re-configuring AC timing
+parameters and other settings during frequency, voltage and
+temperature changes
+
+Platform Data (see include/linux/platform_data/emif_plat.h)
+===========================================================
+DDR device details and other board dependent and SoC dependent
+information can be passed through platform data (struct emif_platform_data)
+
+- DDR device details: 'struct ddr_device_info'
+- Device AC timings: 'struct lpddr2_timings' and 'struct lpddr2_min_tck'
+- Custom configurations: customizable policy options through
+  'struct emif_custom_configs'
+- IP revision
+- PHY type
+
+Interface to the external world
+===============================
+EMIF driver registers notifiers for voltage and frequency changes
+affecting EMIF and takes appropriate actions when these are invoked.
+
+- freq_pre_notify_handling()
+- freq_post_notify_handling()
+- volt_notify_handling()
+
+Debugfs
+=======
+The driver creates two debugfs entries per device.
+
+- regcache_dump : dump of register values calculated and saved for all
+  frequencies used so far.
+- mr4 : last polled value of MR4 register in the LPDDR2 device. MR4
+  indicates the current temperature level of the device.
diff --git a/Documentation/driver-api/memory-devices/ti-gpmc.rst b/Documentation/driver-api/memory-devices/ti-gpmc.rst
new file mode 100644
index 000000000000..33efcb81f080
--- /dev/null
+++ b/Documentation/driver-api/memory-devices/ti-gpmc.rst
@@ -0,0 +1,179 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================================
+GPMC (General Purpose Memory Controller)
+========================================
+
+GPMC is an unified memory controller dedicated to interfacing external
+memory devices like
+
+ * Asynchronous SRAM like memories and application specific integrated
+   circuit devices.
+ * Asynchronous, synchronous, and page mode burst NOR flash devices
+   NAND flash
+ * Pseudo-SRAM devices
+
+GPMC is found on Texas Instruments SoC's (OMAP based)
+IP details: http://www.ti.com/lit/pdf/spruh73 section 7.1
+
+
+GPMC generic timing calculation:
+================================
+
+GPMC has certain timings that has to be programmed for proper
+functioning of the peripheral, while peripheral has another set of
+timings. To have peripheral work with gpmc, peripheral timings has to
+be translated to the form gpmc can understand. The way it has to be
+translated depends on the connected peripheral. Also there is a
+dependency for certain gpmc timings on gpmc clock frequency. Hence a
+generic timing routine was developed to achieve above requirements.
+
+Generic routine provides a generic method to calculate gpmc timings
+from gpmc peripheral timings. struct gpmc_device_timings fields has to
+be updated with timings from the datasheet of the peripheral that is
+connected to gpmc. A few of the peripheral timings can be fed either
+in time or in cycles, provision to handle this scenario has been
+provided (refer struct gpmc_device_timings definition). It may so
+happen that timing as specified by peripheral datasheet is not present
+in timing structure, in this scenario, try to correlate peripheral
+timing to the one available. If that doesn't work, try to add a new
+field as required by peripheral, educate generic timing routine to
+handle it, make sure that it does not break any of the existing.
+Then there may be cases where peripheral datasheet doesn't mention
+certain fields of struct gpmc_device_timings, zero those entries.
+
+Generic timing routine has been verified to work properly on
+multiple onenand's and tusb6010 peripherals.
+
+A word of caution: generic timing routine has been developed based
+on understanding of gpmc timings, peripheral timings, available
+custom timing routines, a kind of reverse engineering without
+most of the datasheets & hardware (to be exact none of those supported
+in mainline having custom timing routine) and by simulation.
+
+gpmc timing dependency on peripheral timings:
+
+[<gpmc_timing>: <peripheral timing1>, <peripheral timing2> ...]
+
+1. common
+
+cs_on:
+	t_ceasu
+adv_on:
+	t_avdasu, t_ceavd
+
+2. sync common
+
+sync_clk:
+	clk
+page_burst_access:
+	t_bacc
+clk_activation:
+	t_ces, t_avds
+
+3. read async muxed
+
+adv_rd_off:
+	t_avdp_r
+oe_on:
+	t_oeasu, t_aavdh
+access:
+	t_iaa, t_oe, t_ce, t_aa
+rd_cycle:
+	t_rd_cycle, t_cez_r, t_oez
+
+4. read async non-muxed
+
+adv_rd_off:
+	t_avdp_r
+oe_on:
+	t_oeasu
+access:
+	t_iaa, t_oe, t_ce, t_aa
+rd_cycle:
+	t_rd_cycle, t_cez_r, t_oez
+
+5. read sync muxed
+
+adv_rd_off:
+	t_avdp_r, t_avdh
+oe_on:
+	t_oeasu, t_ach, cyc_aavdh_oe
+access:
+	t_iaa, cyc_iaa, cyc_oe
+rd_cycle:
+	t_cez_r, t_oez, t_ce_rdyz
+
+6. read sync non-muxed
+
+adv_rd_off:
+	t_avdp_r
+oe_on:
+	t_oeasu
+access:
+	t_iaa, cyc_iaa, cyc_oe
+rd_cycle:
+	t_cez_r, t_oez, t_ce_rdyz
+
+7. write async muxed
+
+adv_wr_off:
+	t_avdp_w
+we_on, wr_data_mux_bus:
+	t_weasu, t_aavdh, cyc_aavhd_we
+we_off:
+	t_wpl
+cs_wr_off:
+	t_wph
+wr_cycle:
+	t_cez_w, t_wr_cycle
+
+8. write async non-muxed
+
+adv_wr_off:
+	t_avdp_w
+we_on, wr_data_mux_bus:
+	t_weasu
+we_off:
+	t_wpl
+cs_wr_off:
+	t_wph
+wr_cycle:
+	t_cez_w, t_wr_cycle
+
+9. write sync muxed
+
+adv_wr_off:
+	t_avdp_w, t_avdh
+we_on, wr_data_mux_bus:
+	t_weasu, t_rdyo, t_aavdh, cyc_aavhd_we
+we_off:
+	t_wpl, cyc_wpl
+cs_wr_off:
+	t_wph
+wr_cycle:
+	t_cez_w, t_ce_rdyz
+
+10. write sync non-muxed
+
+adv_wr_off:
+	t_avdp_w
+we_on, wr_data_mux_bus:
+	t_weasu, t_rdyo
+we_off:
+	t_wpl, cyc_wpl
+cs_wr_off:
+	t_wph
+wr_cycle:
+	t_cez_w, t_ce_rdyz
+
+
+Note:
+  Many of gpmc timings are dependent on other gpmc timings (a few
+  gpmc timings purely dependent on other gpmc timings, a reason that
+  some of the gpmc timings are missing above), and it will result in
+  indirect dependency of peripheral timings to gpmc timings other than
+  mentioned above, refer timing routine for more details. To know what
+  these peripheral timings correspond to, please see explanations in
+  struct gpmc_device_timings definition. And for gpmc timings refer
+  IP details (link above).
diff --git a/Documentation/men-chameleon-bus.txt b/Documentation/driver-api/men-chameleon-bus.rst
index 1b1f048aa748..1b1f048aa748 100644
--- a/Documentation/men-chameleon-bus.txt
+++ b/Documentation/driver-api/men-chameleon-bus.rst
diff --git a/Documentation/driver-api/mmc/index.rst b/Documentation/driver-api/mmc/index.rst
new file mode 100644
index 000000000000..7339736ac774
--- /dev/null
+++ b/Documentation/driver-api/mmc/index.rst
@@ -0,0 +1,13 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+MMC/SD/SDIO card support
+========================
+
+.. toctree::
+   :maxdepth: 1
+
+   mmc-dev-attrs
+   mmc-dev-parts
+   mmc-async-req
+   mmc-tools
diff --git a/Documentation/driver-api/mmc/mmc-async-req.rst b/Documentation/driver-api/mmc/mmc-async-req.rst
new file mode 100644
index 000000000000..0f7197c9c3b5
--- /dev/null
+++ b/Documentation/driver-api/mmc/mmc-async-req.rst
@@ -0,0 +1,98 @@
+========================
+MMC Asynchronous Request
+========================
+
+Rationale
+=========
+
+How significant is the cache maintenance overhead?
+
+It depends. Fast eMMC and multiple cache levels with speculative cache
+pre-fetch makes the cache overhead relatively significant. If the DMA
+preparations for the next request are done in parallel with the current
+transfer, the DMA preparation overhead would not affect the MMC performance.
+
+The intention of non-blocking (asynchronous) MMC requests is to minimize the
+time between when an MMC request ends and another MMC request begins.
+
+Using mmc_wait_for_req(), the MMC controller is idle while dma_map_sg and
+dma_unmap_sg are processing. Using non-blocking MMC requests makes it
+possible to prepare the caches for next job in parallel with an active
+MMC request.
+
+MMC block driver
+================
+
+The mmc_blk_issue_rw_rq() in the MMC block driver is made non-blocking.
+
+The increase in throughput is proportional to the time it takes to
+prepare (major part of preparations are dma_map_sg() and dma_unmap_sg())
+a request and how fast the memory is. The faster the MMC/SD is the
+more significant the prepare request time becomes. Roughly the expected
+performance gain is 5% for large writes and 10% on large reads on a L2 cache
+platform. In power save mode, when clocks run on a lower frequency, the DMA
+preparation may cost even more. As long as these slower preparations are run
+in parallel with the transfer performance won't be affected.
+
+Details on measurements from IOZone and mmc_test
+================================================
+
+https://wiki.linaro.org/WorkingGroups/Kernel/Specs/StoragePerfMMC-async-req
+
+MMC core API extension
+======================
+
+There is one new public function mmc_start_req().
+
+It starts a new MMC command request for a host. The function isn't
+truly non-blocking. If there is an ongoing async request it waits
+for completion of that request and starts the new one and returns. It
+doesn't wait for the new request to complete. If there is no ongoing
+request it starts the new request and returns immediately.
+
+MMC host extensions
+===================
+
+There are two optional members in the mmc_host_ops -- pre_req() and
+post_req() -- that the host driver may implement in order to move work
+to before and after the actual mmc_host_ops.request() function is called.
+
+In the DMA case pre_req() may do dma_map_sg() and prepare the DMA
+descriptor, and post_req() runs the dma_unmap_sg().
+
+Optimize for the first request
+==============================
+
+The first request in a series of requests can't be prepared in parallel
+with the previous transfer, since there is no previous request.
+
+The argument is_first_req in pre_req() indicates that there is no previous
+request. The host driver may optimize for this scenario to minimize
+the performance loss. A way to optimize for this is to split the current
+request in two chunks, prepare the first chunk and start the request,
+and finally prepare the second chunk and start the transfer.
+
+Pseudocode to handle is_first_req scenario with minimal prepare overhead::
+
+  if (is_first_req && req->size > threshold)
+     /* start MMC transfer for the complete transfer size */
+     mmc_start_command(MMC_CMD_TRANSFER_FULL_SIZE);
+
+     /*
+      * Begin to prepare DMA while cmd is being processed by MMC.
+      * The first chunk of the request should take the same time
+      * to prepare as the "MMC process command time".
+      * If prepare time exceeds MMC cmd time
+      * the transfer is delayed, guesstimate max 4k as first chunk size.
+      */
+      prepare_1st_chunk_for_dma(req);
+      /* flush pending desc to the DMAC (dmaengine.h) */
+      dma_issue_pending(req->dma_desc);
+
+      prepare_2nd_chunk_for_dma(req);
+      /*
+       * The second issue_pending should be called before MMC runs out
+       * of the first chunk. If the MMC runs out of the first data chunk
+       * before this call, the transfer is delayed.
+       */
+      dma_issue_pending(req->dma_desc);
diff --git a/Documentation/driver-api/mmc/mmc-dev-attrs.rst b/Documentation/driver-api/mmc/mmc-dev-attrs.rst
new file mode 100644
index 000000000000..4f44b1b730d6
--- /dev/null
+++ b/Documentation/driver-api/mmc/mmc-dev-attrs.rst
@@ -0,0 +1,91 @@
+==================================
+SD and MMC Block Device Attributes
+==================================
+
+These attributes are defined for the block devices associated with the
+SD or MMC device.
+
+The following attributes are read/write.
+
+	========		===============================================
+	force_ro		Enforce read-only access even if write protect 					switch is off.
+	========		===============================================
+
+SD and MMC Device Attributes
+============================
+
+All attributes are read-only.
+
+	======================	===============================================
+	cid			Card Identification Register
+	csd			Card Specific Data Register
+	scr			SD Card Configuration Register (SD only)
+	date			Manufacturing Date (from CID Register)
+	fwrev			Firmware/Product Revision (from CID Register)
+				(SD and MMCv1 only)
+	hwrev			Hardware/Product Revision (from CID Register)
+				(SD and MMCv1 only)
+	manfid			Manufacturer ID (from CID Register)
+	name			Product Name (from CID Register)
+	oemid			OEM/Application ID (from CID Register)
+	prv			Product Revision (from CID Register)
+				(SD and MMCv4 only)
+	serial			Product Serial Number (from CID Register)
+	erase_size		Erase group size
+	preferred_erase_size	Preferred erase size
+	raw_rpmb_size_mult	RPMB partition size
+	rel_sectors		Reliable write sector count
+	ocr 			Operation Conditions Register
+	dsr			Driver Stage Register
+	cmdq_en			Command Queue enabled:
+
+					1 => enabled, 0 => not enabled
+	======================	===============================================
+
+Note on Erase Size and Preferred Erase Size:
+
+	"erase_size" is the  minimum size, in bytes, of an erase
+	operation.  For MMC, "erase_size" is the erase group size
+	reported by the card.  Note that "erase_size" does not apply
+	to trim or secure trim operations where the minimum size is
+	always one 512 byte sector.  For SD, "erase_size" is 512
+	if the card is block-addressed, 0 otherwise.
+
+	SD/MMC cards can erase an arbitrarily large area up to and
+	including the whole card.  When erasing a large area it may
+	be desirable to do it in smaller chunks for three reasons:
+
+	     1. A single erase command will make all other I/O on
+		the card wait.  This is not a problem if the whole card
+		is being erased, but erasing one partition will make
+		I/O for another partition on the same card wait for the
+		duration of the erase - which could be a several
+		minutes.
+	     2. To be able to inform the user of erase progress.
+	     3. The erase timeout becomes too large to be very
+		useful.  Because the erase timeout contains a margin
+		which is multiplied by the size of the erase area,
+		the value can end up being several minutes for large
+		areas.
+
+	"erase_size" is not the most efficient unit to erase
+	(especially for SD where it is just one sector),
+	hence "preferred_erase_size" provides a good chunk
+	size for erasing large areas.
+
+	For MMC, "preferred_erase_size" is the high-capacity
+	erase size if a card specifies one, otherwise it is
+	based on the capacity of the card.
+
+	For SD, "preferred_erase_size" is the allocation unit
+	size specified by the card.
+
+	"preferred_erase_size" is in bytes.
+
+Note on raw_rpmb_size_mult:
+
+	"raw_rpmb_size_mult" is a multiple of 128kB block.
+
+	RPMB size in byte is calculated by using the following equation:
+
+		RPMB partition size = 128kB x raw_rpmb_size_mult
diff --git a/Documentation/driver-api/mmc/mmc-dev-parts.rst b/Documentation/driver-api/mmc/mmc-dev-parts.rst
new file mode 100644
index 000000000000..995922f1f744
--- /dev/null
+++ b/Documentation/driver-api/mmc/mmc-dev-parts.rst
@@ -0,0 +1,41 @@
+============================
+SD and MMC Device Partitions
+============================
+
+Device partitions are additional logical block devices present on the
+SD/MMC device.
+
+As of this writing, MMC boot partitions as supported and exposed as
+/dev/mmcblkXboot0 and /dev/mmcblkXboot1, where X is the index of the
+parent /dev/mmcblkX.
+
+MMC Boot Partitions
+===================
+
+Read and write access is provided to the two MMC boot partitions. Due to
+the sensitive nature of the boot partition contents, which often store
+a bootloader or bootloader configuration tables crucial to booting the
+platform, write access is disabled by default to reduce the chance of
+accidental bricking.
+
+To enable write access to /dev/mmcblkXbootY, disable the forced read-only
+access with::
+
+	echo 0 > /sys/block/mmcblkXbootY/force_ro
+
+To re-enable read-only access::
+
+	echo 1 > /sys/block/mmcblkXbootY/force_ro
+
+The boot partitions can also be locked read only until the next power on,
+with::
+
+	echo 1 > /sys/block/mmcblkXbootY/ro_lock_until_next_power_on
+
+This is a feature of the card and not of the kernel. If the card does
+not support boot partition locking, the file will not exist. If the
+feature has been disabled on the card, the file will be read-only.
+
+The boot partitions can also be locked permanently, but this feature is
+not accessible through sysfs in order to avoid accidental or malicious
+bricking.
diff --git a/Documentation/driver-api/mmc/mmc-tools.rst b/Documentation/driver-api/mmc/mmc-tools.rst
new file mode 100644
index 000000000000..54406093768b
--- /dev/null
+++ b/Documentation/driver-api/mmc/mmc-tools.rst
@@ -0,0 +1,37 @@
+======================
+MMC tools introduction
+======================
+
+There is one MMC test tools called mmc-utils, which is maintained by Chris Ball,
+you can find it at the below public git repository:
+
+	http://git.kernel.org/cgit/linux/kernel/git/cjb/mmc-utils.git/
+
+Functions
+=========
+
+The mmc-utils tools can do the following:
+
+ - Print and parse extcsd data.
+ - Determine the eMMC writeprotect status.
+ - Set the eMMC writeprotect status.
+ - Set the eMMC data sector size to 4KB by disabling emulation.
+ - Create general purpose partition.
+ - Enable the enhanced user area.
+ - Enable write reliability per partition.
+ - Print the response to STATUS_SEND (CMD13).
+ - Enable the boot partition.
+ - Set Boot Bus Conditions.
+ - Enable the eMMC BKOPS feature.
+ - Permanently enable the eMMC H/W Reset feature.
+ - Permanently disable the eMMC H/W Reset feature.
+ - Send Sanitize command.
+ - Program authentication key for the device.
+ - Counter value for the rpmb device will be read to stdout.
+ - Read from rpmb device to output.
+ - Write to rpmb device from data file.
+ - Enable the eMMC cache feature.
+ - Disable the eMMC cache feature.
+ - Print and parse CID data.
+ - Print and parse CSD data.
+ - Print and parse SCR data.
diff --git a/Documentation/driver-api/mtd/index.rst b/Documentation/driver-api/mtd/index.rst
new file mode 100644
index 000000000000..436ba5a851d7
--- /dev/null
+++ b/Documentation/driver-api/mtd/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Memory Technology Device (MTD)
+==============================
+
+.. toctree::
+   :maxdepth: 1
+
+   intel-spi
+   nand_ecc
+   spi-nor
diff --git a/Documentation/driver-api/mtd/intel-spi.rst b/Documentation/driver-api/mtd/intel-spi.rst
new file mode 100644
index 000000000000..0e6d9cd5388d
--- /dev/null
+++ b/Documentation/driver-api/mtd/intel-spi.rst
@@ -0,0 +1,90 @@
+==============================
+Upgrading BIOS using intel-spi
+==============================
+
+Many Intel CPUs like Baytrail and Braswell include SPI serial flash host
+controller which is used to hold BIOS and other platform specific data.
+Since contents of the SPI serial flash is crucial for machine to function,
+it is typically protected by different hardware protection mechanisms to
+avoid accidental (or on purpose) overwrite of the content.
+
+Not all manufacturers protect the SPI serial flash, mainly because it
+allows upgrading the BIOS image directly from an OS.
+
+The intel-spi driver makes it possible to read and write the SPI serial
+flash, if certain protection bits are not set and locked. If it finds
+any of them set, the whole MTD device is made read-only to prevent
+partial overwrites. By default the driver exposes SPI serial flash
+contents as read-only but it can be changed from kernel command line,
+passing "intel-spi.writeable=1".
+
+Please keep in mind that overwriting the BIOS image on SPI serial flash
+might render the machine unbootable and requires special equipment like
+Dediprog to revive. You have been warned!
+
+Below are the steps how to upgrade MinnowBoard MAX BIOS directly from
+Linux.
+
+ 1) Download and extract the latest Minnowboard MAX BIOS SPI image
+    [1]. At the time writing this the latest image is v92.
+
+ 2) Install mtd-utils package [2]. We need this in order to erase the SPI
+    serial flash. Distros like Debian and Fedora have this prepackaged with
+    name "mtd-utils".
+
+ 3) Add "intel-spi.writeable=1" to the kernel command line and reboot
+    the board (you can also reload the driver passing "writeable=1" as
+    module parameter to modprobe).
+
+ 4) Once the board is up and running again, find the right MTD partition
+    (it is named as "BIOS")::
+
+	# cat /proc/mtd
+	dev:    size   erasesize  name
+	mtd0: 00800000 00001000 "BIOS"
+
+    So here it will be /dev/mtd0 but it may vary.
+
+ 5) Make backup of the existing image first::
+
+	# dd if=/dev/mtd0ro of=bios.bak
+	16384+0 records in
+	16384+0 records out
+	8388608 bytes (8.4 MB) copied, 10.0269 s, 837 kB/s
+
+ 6) Verify the backup:
+
+	# sha1sum /dev/mtd0ro bios.bak
+	fdbb011920572ca6c991377c4b418a0502668b73  /dev/mtd0ro
+	fdbb011920572ca6c991377c4b418a0502668b73  bios.bak
+
+    The SHA1 sums must match. Otherwise do not continue any further!
+
+ 7) Erase the SPI serial flash. After this step, do not reboot the
+    board! Otherwise it will not start anymore::
+
+	# flash_erase /dev/mtd0 0 0
+	Erasing 4 Kibyte @ 7ff000 -- 100 % complete
+
+ 8) Once completed without errors you can write the new BIOS image:
+
+    # dd if=MNW2MAX1.X64.0092.R01.1605221712.bin of=/dev/mtd0
+
+ 9) Verify that the new content of the SPI serial flash matches the new
+    BIOS image::
+
+	# sha1sum /dev/mtd0ro MNW2MAX1.X64.0092.R01.1605221712.bin
+	9b4df9e4be2057fceec3a5529ec3d950836c87a2  /dev/mtd0ro
+	9b4df9e4be2057fceec3a5529ec3d950836c87a2 MNW2MAX1.X64.0092.R01.1605221712.bin
+
+    The SHA1 sums should match.
+
+ 10) Now you can reboot your board and observe the new BIOS starting up
+     properly.
+
+References
+----------
+
+[1] https://firmware.intel.com/sites/default/files/MinnowBoard%2EMAX_%2EX64%2E92%2ER01%2Ezip
+
+[2] http://www.linux-mtd.infradead.org/
diff --git a/Documentation/driver-api/mtd/nand_ecc.rst b/Documentation/driver-api/mtd/nand_ecc.rst
new file mode 100644
index 000000000000..e8d3c53a5056
--- /dev/null
+++ b/Documentation/driver-api/mtd/nand_ecc.rst
@@ -0,0 +1,763 @@
+==========================
+NAND Error-correction Code
+==========================
+
+Introduction
+============
+
+Having looked at the linux mtd/nand driver and more specific at nand_ecc.c
+I felt there was room for optimisation. I bashed the code for a few hours
+performing tricks like table lookup removing superfluous code etc.
+After that the speed was increased by 35-40%.
+Still I was not too happy as I felt there was additional room for improvement.
+
+Bad! I was hooked.
+I decided to annotate my steps in this file. Perhaps it is useful to someone
+or someone learns something from it.
+
+
+The problem
+===========
+
+NAND flash (at least SLC one) typically has sectors of 256 bytes.
+However NAND flash is not extremely reliable so some error detection
+(and sometimes correction) is needed.
+
+This is done by means of a Hamming code. I'll try to explain it in
+laymans terms (and apologies to all the pro's in the field in case I do
+not use the right terminology, my coding theory class was almost 30
+years ago, and I must admit it was not one of my favourites).
+
+As I said before the ecc calculation is performed on sectors of 256
+bytes. This is done by calculating several parity bits over the rows and
+columns. The parity used is even parity which means that the parity bit = 1
+if the data over which the parity is calculated is 1 and the parity bit = 0
+if the data over which the parity is calculated is 0. So the total
+number of bits over the data over which the parity is calculated + the
+parity bit is even. (see wikipedia if you can't follow this).
+Parity is often calculated by means of an exclusive or operation,
+sometimes also referred to as xor. In C the operator for xor is ^
+
+Back to ecc.
+Let's give a small figure:
+
+=========  ==== ==== ==== ==== ==== ==== ==== ====   === === === === ====
+byte   0:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp0 rp2 rp4 ... rp14
+byte   1:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp1 rp2 rp4 ... rp14
+byte   2:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp0 rp3 rp4 ... rp14
+byte   3:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp1 rp3 rp4 ... rp14
+byte   4:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp0 rp2 rp5 ... rp14
+...
+byte 254:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp0 rp3 rp5 ... rp15
+byte 255:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp1 rp3 rp5 ... rp15
+           cp1  cp0  cp1  cp0  cp1  cp0  cp1  cp0
+           cp3  cp3  cp2  cp2  cp3  cp3  cp2  cp2
+           cp5  cp5  cp5  cp5  cp4  cp4  cp4  cp4
+=========  ==== ==== ==== ==== ==== ==== ==== ====   === === === === ====
+
+This figure represents a sector of 256 bytes.
+cp is my abbreviation for column parity, rp for row parity.
+
+Let's start to explain column parity.
+
+- cp0 is the parity that belongs to all bit0, bit2, bit4, bit6.
+
+  so the sum of all bit0, bit2, bit4 and bit6 values + cp0 itself is even.
+
+Similarly cp1 is the sum of all bit1, bit3, bit5 and bit7.
+
+- cp2 is the parity over bit0, bit1, bit4 and bit5
+- cp3 is the parity over bit2, bit3, bit6 and bit7.
+- cp4 is the parity over bit0, bit1, bit2 and bit3.
+- cp5 is the parity over bit4, bit5, bit6 and bit7.
+
+Note that each of cp0 .. cp5 is exactly one bit.
+
+Row parity actually works almost the same.
+
+- rp0 is the parity of all even bytes (0, 2, 4, 6, ... 252, 254)
+- rp1 is the parity of all odd bytes (1, 3, 5, 7, ..., 253, 255)
+- rp2 is the parity of all bytes 0, 1, 4, 5, 8, 9, ...
+  (so handle two bytes, then skip 2 bytes).
+- rp3 is covers the half rp2 does not cover (bytes 2, 3, 6, 7, 10, 11, ...)
+- for rp4 the rule is cover 4 bytes, skip 4 bytes, cover 4 bytes, skip 4 etc.
+
+  so rp4 calculates parity over bytes 0, 1, 2, 3, 8, 9, 10, 11, 16, ...)
+- and rp5 covers the other half, so bytes 4, 5, 6, 7, 12, 13, 14, 15, 20, ..
+
+The story now becomes quite boring. I guess you get the idea.
+
+- rp6 covers 8 bytes then skips 8 etc
+- rp7 skips 8 bytes then covers 8 etc
+- rp8 covers 16 bytes then skips 16 etc
+- rp9 skips 16 bytes then covers 16 etc
+- rp10 covers 32 bytes then skips 32 etc
+- rp11 skips 32 bytes then covers 32 etc
+- rp12 covers 64 bytes then skips 64 etc
+- rp13 skips 64 bytes then covers 64 etc
+- rp14 covers 128 bytes then skips 128
+- rp15 skips 128 bytes then covers 128
+
+In the end the parity bits are grouped together in three bytes as
+follows:
+
+=====  ===== ===== ===== ===== ===== ===== ===== =====
+ECC    Bit 7 Bit 6 Bit 5 Bit 4 Bit 3 Bit 2 Bit 1 Bit 0
+=====  ===== ===== ===== ===== ===== ===== ===== =====
+ECC 0   rp07  rp06  rp05  rp04  rp03  rp02  rp01  rp00
+ECC 1   rp15  rp14  rp13  rp12  rp11  rp10  rp09  rp08
+ECC 2   cp5   cp4   cp3   cp2   cp1   cp0      1     1
+=====  ===== ===== ===== ===== ===== ===== ===== =====
+
+I detected after writing this that ST application note AN1823
+(http://www.st.com/stonline/) gives a much
+nicer picture.(but they use line parity as term where I use row parity)
+Oh well, I'm graphically challenged, so suffer with me for a moment :-)
+
+And I could not reuse the ST picture anyway for copyright reasons.
+
+
+Attempt 0
+=========
+
+Implementing the parity calculation is pretty simple.
+In C pseudocode::
+
+  for (i = 0; i < 256; i++)
+  {
+    if (i & 0x01)
+       rp1 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1;
+    else
+       rp0 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp0;
+    if (i & 0x02)
+       rp3 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp3;
+    else
+       rp2 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp2;
+    if (i & 0x04)
+      rp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp5;
+    else
+      rp4 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp4;
+    if (i & 0x08)
+      rp7 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp7;
+    else
+      rp6 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp6;
+    if (i & 0x10)
+      rp9 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp9;
+    else
+      rp8 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp8;
+    if (i & 0x20)
+      rp11 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp11;
+    else
+      rp10 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp10;
+    if (i & 0x40)
+      rp13 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp13;
+    else
+      rp12 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp12;
+    if (i & 0x80)
+      rp15 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp15;
+    else
+      rp14 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp14;
+    cp0 = bit6 ^ bit4 ^ bit2 ^ bit0 ^ cp0;
+    cp1 = bit7 ^ bit5 ^ bit3 ^ bit1 ^ cp1;
+    cp2 = bit5 ^ bit4 ^ bit1 ^ bit0 ^ cp2;
+    cp3 = bit7 ^ bit6 ^ bit3 ^ bit2 ^ cp3
+    cp4 = bit3 ^ bit2 ^ bit1 ^ bit0 ^ cp4
+    cp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ cp5
+  }
+
+
+Analysis 0
+==========
+
+C does have bitwise operators but not really operators to do the above
+efficiently (and most hardware has no such instructions either).
+Therefore without implementing this it was clear that the code above was
+not going to bring me a Nobel prize :-)
+
+Fortunately the exclusive or operation is commutative, so we can combine
+the values in any order. So instead of calculating all the bits
+individually, let us try to rearrange things.
+For the column parity this is easy. We can just xor the bytes and in the
+end filter out the relevant bits. This is pretty nice as it will bring
+all cp calculation out of the for loop.
+
+Similarly we can first xor the bytes for the various rows.
+This leads to:
+
+
+Attempt 1
+=========
+
+::
+
+  const char parity[256] = {
+      0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+      1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+      1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+      0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+      1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+      0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+      0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+      1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+      1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+      0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+      0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+      1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+      0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+      1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+      1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+      0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0
+  };
+
+  void ecc1(const unsigned char *buf, unsigned char *code)
+  {
+      int i;
+      const unsigned char *bp = buf;
+      unsigned char cur;
+      unsigned char rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
+      unsigned char rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15;
+      unsigned char par;
+
+      par = 0;
+      rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0;
+      rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0;
+      rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0;
+      rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0;
+
+      for (i = 0; i < 256; i++)
+      {
+          cur = *bp++;
+          par ^= cur;
+          if (i & 0x01) rp1 ^= cur; else rp0 ^= cur;
+          if (i & 0x02) rp3 ^= cur; else rp2 ^= cur;
+          if (i & 0x04) rp5 ^= cur; else rp4 ^= cur;
+          if (i & 0x08) rp7 ^= cur; else rp6 ^= cur;
+          if (i & 0x10) rp9 ^= cur; else rp8 ^= cur;
+          if (i & 0x20) rp11 ^= cur; else rp10 ^= cur;
+          if (i & 0x40) rp13 ^= cur; else rp12 ^= cur;
+          if (i & 0x80) rp15 ^= cur; else rp14 ^= cur;
+      }
+      code[0] =
+          (parity[rp7] << 7) |
+          (parity[rp6] << 6) |
+          (parity[rp5] << 5) |
+          (parity[rp4] << 4) |
+          (parity[rp3] << 3) |
+          (parity[rp2] << 2) |
+          (parity[rp1] << 1) |
+          (parity[rp0]);
+      code[1] =
+          (parity[rp15] << 7) |
+          (parity[rp14] << 6) |
+          (parity[rp13] << 5) |
+          (parity[rp12] << 4) |
+          (parity[rp11] << 3) |
+          (parity[rp10] << 2) |
+          (parity[rp9]  << 1) |
+          (parity[rp8]);
+      code[2] =
+          (parity[par & 0xf0] << 7) |
+          (parity[par & 0x0f] << 6) |
+          (parity[par & 0xcc] << 5) |
+          (parity[par & 0x33] << 4) |
+          (parity[par & 0xaa] << 3) |
+          (parity[par & 0x55] << 2);
+      code[0] = ~code[0];
+      code[1] = ~code[1];
+      code[2] = ~code[2];
+  }
+
+Still pretty straightforward. The last three invert statements are there to
+give a checksum of 0xff 0xff 0xff for an empty flash. In an empty flash
+all data is 0xff, so the checksum then matches.
+
+I also introduced the parity lookup. I expected this to be the fastest
+way to calculate the parity, but I will investigate alternatives later
+on.
+
+
+Analysis 1
+==========
+
+The code works, but is not terribly efficient. On my system it took
+almost 4 times as much time as the linux driver code. But hey, if it was
+*that* easy this would have been done long before.
+No pain. no gain.
+
+Fortunately there is plenty of room for improvement.
+
+In step 1 we moved from bit-wise calculation to byte-wise calculation.
+However in C we can also use the unsigned long data type and virtually
+every modern microprocessor supports 32 bit operations, so why not try
+to write our code in such a way that we process data in 32 bit chunks.
+
+Of course this means some modification as the row parity is byte by
+byte. A quick analysis:
+for the column parity we use the par variable. When extending to 32 bits
+we can in the end easily calculate rp0 and rp1 from it.
+(because par now consists of 4 bytes, contributing to rp1, rp0, rp1, rp0
+respectively, from MSB to LSB)
+also rp2 and rp3 can be easily retrieved from par as rp3 covers the
+first two MSBs and rp2 covers the last two LSBs.
+
+Note that of course now the loop is executed only 64 times (256/4).
+And note that care must taken wrt byte ordering. The way bytes are
+ordered in a long is machine dependent, and might affect us.
+Anyway, if there is an issue: this code is developed on x86 (to be
+precise: a DELL PC with a D920 Intel CPU)
+
+And of course the performance might depend on alignment, but I expect
+that the I/O buffers in the nand driver are aligned properly (and
+otherwise that should be fixed to get maximum performance).
+
+Let's give it a try...
+
+
+Attempt 2
+=========
+
+::
+
+  extern const char parity[256];
+
+  void ecc2(const unsigned char *buf, unsigned char *code)
+  {
+      int i;
+      const unsigned long *bp = (unsigned long *)buf;
+      unsigned long cur;
+      unsigned long rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
+      unsigned long rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15;
+      unsigned long par;
+
+      par = 0;
+      rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0;
+      rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0;
+      rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0;
+      rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0;
+
+      for (i = 0; i < 64; i++)
+      {
+          cur = *bp++;
+          par ^= cur;
+          if (i & 0x01) rp5 ^= cur; else rp4 ^= cur;
+          if (i & 0x02) rp7 ^= cur; else rp6 ^= cur;
+          if (i & 0x04) rp9 ^= cur; else rp8 ^= cur;
+          if (i & 0x08) rp11 ^= cur; else rp10 ^= cur;
+          if (i & 0x10) rp13 ^= cur; else rp12 ^= cur;
+          if (i & 0x20) rp15 ^= cur; else rp14 ^= cur;
+      }
+      /*
+         we need to adapt the code generation for the fact that rp vars are now
+         long; also the column parity calculation needs to be changed.
+         we'll bring rp4 to 15 back to single byte entities by shifting and
+         xoring
+      */
+      rp4 ^= (rp4 >> 16); rp4 ^= (rp4 >> 8); rp4 &= 0xff;
+      rp5 ^= (rp5 >> 16); rp5 ^= (rp5 >> 8); rp5 &= 0xff;
+      rp6 ^= (rp6 >> 16); rp6 ^= (rp6 >> 8); rp6 &= 0xff;
+      rp7 ^= (rp7 >> 16); rp7 ^= (rp7 >> 8); rp7 &= 0xff;
+      rp8 ^= (rp8 >> 16); rp8 ^= (rp8 >> 8); rp8 &= 0xff;
+      rp9 ^= (rp9 >> 16); rp9 ^= (rp9 >> 8); rp9 &= 0xff;
+      rp10 ^= (rp10 >> 16); rp10 ^= (rp10 >> 8); rp10 &= 0xff;
+      rp11 ^= (rp11 >> 16); rp11 ^= (rp11 >> 8); rp11 &= 0xff;
+      rp12 ^= (rp12 >> 16); rp12 ^= (rp12 >> 8); rp12 &= 0xff;
+      rp13 ^= (rp13 >> 16); rp13 ^= (rp13 >> 8); rp13 &= 0xff;
+      rp14 ^= (rp14 >> 16); rp14 ^= (rp14 >> 8); rp14 &= 0xff;
+      rp15 ^= (rp15 >> 16); rp15 ^= (rp15 >> 8); rp15 &= 0xff;
+      rp3 = (par >> 16); rp3 ^= (rp3 >> 8); rp3 &= 0xff;
+      rp2 = par & 0xffff; rp2 ^= (rp2 >> 8); rp2 &= 0xff;
+      par ^= (par >> 16);
+      rp1 = (par >> 8); rp1 &= 0xff;
+      rp0 = (par & 0xff);
+      par ^= (par >> 8); par &= 0xff;
+
+      code[0] =
+          (parity[rp7] << 7) |
+          (parity[rp6] << 6) |
+          (parity[rp5] << 5) |
+          (parity[rp4] << 4) |
+          (parity[rp3] << 3) |
+          (parity[rp2] << 2) |
+          (parity[rp1] << 1) |
+          (parity[rp0]);
+      code[1] =
+          (parity[rp15] << 7) |
+          (parity[rp14] << 6) |
+          (parity[rp13] << 5) |
+          (parity[rp12] << 4) |
+          (parity[rp11] << 3) |
+          (parity[rp10] << 2) |
+          (parity[rp9]  << 1) |
+          (parity[rp8]);
+      code[2] =
+          (parity[par & 0xf0] << 7) |
+          (parity[par & 0x0f] << 6) |
+          (parity[par & 0xcc] << 5) |
+          (parity[par & 0x33] << 4) |
+          (parity[par & 0xaa] << 3) |
+          (parity[par & 0x55] << 2);
+      code[0] = ~code[0];
+      code[1] = ~code[1];
+      code[2] = ~code[2];
+  }
+
+The parity array is not shown any more. Note also that for these
+examples I kinda deviated from my regular programming style by allowing
+multiple statements on a line, not using { } in then and else blocks
+with only a single statement and by using operators like ^=
+
+
+Analysis 2
+==========
+
+The code (of course) works, and hurray: we are a little bit faster than
+the linux driver code (about 15%). But wait, don't cheer too quickly.
+There is more to be gained.
+If we look at e.g. rp14 and rp15 we see that we either xor our data with
+rp14 or with rp15. However we also have par which goes over all data.
+This means there is no need to calculate rp14 as it can be calculated from
+rp15 through rp14 = par ^ rp15, because par = rp14 ^ rp15;
+(or if desired we can avoid calculating rp15 and calculate it from
+rp14).  That is why some places refer to inverse parity.
+Of course the same thing holds for rp4/5, rp6/7, rp8/9, rp10/11 and rp12/13.
+Effectively this means we can eliminate the else clause from the if
+statements. Also we can optimise the calculation in the end a little bit
+by going from long to byte first. Actually we can even avoid the table
+lookups
+
+Attempt 3
+=========
+
+Odd replaced::
+
+          if (i & 0x01) rp5 ^= cur; else rp4 ^= cur;
+          if (i & 0x02) rp7 ^= cur; else rp6 ^= cur;
+          if (i & 0x04) rp9 ^= cur; else rp8 ^= cur;
+          if (i & 0x08) rp11 ^= cur; else rp10 ^= cur;
+          if (i & 0x10) rp13 ^= cur; else rp12 ^= cur;
+          if (i & 0x20) rp15 ^= cur; else rp14 ^= cur;
+
+with::
+
+          if (i & 0x01) rp5 ^= cur;
+          if (i & 0x02) rp7 ^= cur;
+          if (i & 0x04) rp9 ^= cur;
+          if (i & 0x08) rp11 ^= cur;
+          if (i & 0x10) rp13 ^= cur;
+          if (i & 0x20) rp15 ^= cur;
+
+and outside the loop added::
+
+          rp4  = par ^ rp5;
+          rp6  = par ^ rp7;
+          rp8  = par ^ rp9;
+          rp10  = par ^ rp11;
+          rp12  = par ^ rp13;
+          rp14  = par ^ rp15;
+
+And after that the code takes about 30% more time, although the number of
+statements is reduced. This is also reflected in the assembly code.
+
+
+Analysis 3
+==========
+
+Very weird. Guess it has to do with caching or instruction parallellism
+or so. I also tried on an eeePC (Celeron, clocked at 900 Mhz). Interesting
+observation was that this one is only 30% slower (according to time)
+executing the code as my 3Ghz D920 processor.
+
+Well, it was expected not to be easy so maybe instead move to a
+different track: let's move back to the code from attempt2 and do some
+loop unrolling. This will eliminate a few if statements. I'll try
+different amounts of unrolling to see what works best.
+
+
+Attempt 4
+=========
+
+Unrolled the loop 1, 2, 3 and 4 times.
+For 4 the code starts with::
+
+    for (i = 0; i < 4; i++)
+    {
+        cur = *bp++;
+        par ^= cur;
+        rp4 ^= cur;
+        rp6 ^= cur;
+        rp8 ^= cur;
+        rp10 ^= cur;
+        if (i & 0x1) rp13 ^= cur; else rp12 ^= cur;
+        if (i & 0x2) rp15 ^= cur; else rp14 ^= cur;
+        cur = *bp++;
+        par ^= cur;
+        rp5 ^= cur;
+        rp6 ^= cur;
+        ...
+
+
+Analysis 4
+==========
+
+Unrolling once gains about 15%
+
+Unrolling twice keeps the gain at about 15%
+
+Unrolling three times gives a gain of 30% compared to attempt 2.
+
+Unrolling four times gives a marginal improvement compared to unrolling
+three times.
+
+I decided to proceed with a four time unrolled loop anyway. It was my gut
+feeling that in the next steps I would obtain additional gain from it.
+
+The next step was triggered by the fact that par contains the xor of all
+bytes and rp4 and rp5 each contain the xor of half of the bytes.
+So in effect par = rp4 ^ rp5. But as xor is commutative we can also say
+that rp5 = par ^ rp4. So no need to keep both rp4 and rp5 around. We can
+eliminate rp5 (or rp4, but I already foresaw another optimisation).
+The same holds for rp6/7, rp8/9, rp10/11 rp12/13 and rp14/15.
+
+
+Attempt 5
+=========
+
+Effectively so all odd digit rp assignments in the loop were removed.
+This included the else clause of the if statements.
+Of course after the loop we need to correct things by adding code like::
+
+    rp5 = par ^ rp4;
+
+Also the initial assignments (rp5 = 0; etc) could be removed.
+Along the line I also removed the initialisation of rp0/1/2/3.
+
+
+Analysis 5
+==========
+
+Measurements showed this was a good move. The run-time roughly halved
+compared with attempt 4 with 4 times unrolled, and we only require 1/3rd
+of the processor time compared to the current code in the linux kernel.
+
+However, still I thought there was more. I didn't like all the if
+statements. Why not keep a running parity and only keep the last if
+statement. Time for yet another version!
+
+
+Attempt 6
+=========
+
+THe code within the for loop was changed to::
+
+    for (i = 0; i < 4; i++)
+    {
+        cur = *bp++; tmppar  = cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= tmppar;
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp8 ^= tmppar;
+
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp10 ^= tmppar;
+
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; rp8 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= cur; rp8 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp8 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp8 ^= cur;
+
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur;
+
+        par ^= tmppar;
+        if ((i & 0x1) == 0) rp12 ^= tmppar;
+        if ((i & 0x2) == 0) rp14 ^= tmppar;
+    }
+
+As you can see tmppar is used to accumulate the parity within a for
+iteration. In the last 3 statements is added to par and, if needed,
+to rp12 and rp14.
+
+While making the changes I also found that I could exploit that tmppar
+contains the running parity for this iteration. So instead of having:
+rp4 ^= cur; rp6 ^= cur;
+I removed the rp6 ^= cur; statement and did rp6 ^= tmppar; on next
+statement. A similar change was done for rp8 and rp10
+
+
+Analysis 6
+==========
+
+Measuring this code again showed big gain. When executing the original
+linux code 1 million times, this took about 1 second on my system.
+(using time to measure the performance). After this iteration I was back
+to 0.075 sec. Actually I had to decide to start measuring over 10
+million iterations in order not to lose too much accuracy. This one
+definitely seemed to be the jackpot!
+
+There is a little bit more room for improvement though. There are three
+places with statements::
+
+	rp4 ^= cur; rp6 ^= cur;
+
+It seems more efficient to also maintain a variable rp4_6 in the while
+loop; This eliminates 3 statements per loop. Of course after the loop we
+need to correct by adding::
+
+	rp4 ^= rp4_6;
+	rp6 ^= rp4_6
+
+Furthermore there are 4 sequential assignments to rp8. This can be
+encoded slightly more efficiently by saving tmppar before those 4 lines
+and later do rp8 = rp8 ^ tmppar ^ notrp8;
+(where notrp8 is the value of rp8 before those 4 lines).
+Again a use of the commutative property of xor.
+Time for a new test!
+
+
+Attempt 7
+=========
+
+The new code now looks like::
+
+    for (i = 0; i < 4; i++)
+    {
+        cur = *bp++; tmppar  = cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= tmppar;
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp8 ^= tmppar;
+
+        cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp10 ^= tmppar;
+
+        notrp8 = tmppar;
+        cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur;
+        rp8 = rp8 ^ tmppar ^ notrp8;
+
+        cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur;
+
+        par ^= tmppar;
+        if ((i & 0x1) == 0) rp12 ^= tmppar;
+        if ((i & 0x2) == 0) rp14 ^= tmppar;
+    }
+    rp4 ^= rp4_6;
+    rp6 ^= rp4_6;
+
+
+Not a big change, but every penny counts :-)
+
+
+Analysis 7
+==========
+
+Actually this made things worse. Not very much, but I don't want to move
+into the wrong direction. Maybe something to investigate later. Could
+have to do with caching again.
+
+Guess that is what there is to win within the loop. Maybe unrolling one
+more time will help. I'll keep the optimisations from 7 for now.
+
+
+Attempt 8
+=========
+
+Unrolled the loop one more time.
+
+
+Analysis 8
+==========
+
+This makes things worse. Let's stick with attempt 6 and continue from there.
+Although it seems that the code within the loop cannot be optimised
+further there is still room to optimize the generation of the ecc codes.
+We can simply calculate the total parity. If this is 0 then rp4 = rp5
+etc. If the parity is 1, then rp4 = !rp5;
+
+But if rp4 = rp5 we do not need rp5 etc. We can just write the even bits
+in the result byte and then do something like::
+
+    code[0] |= (code[0] << 1);
+
+Lets test this.
+
+
+Attempt 9
+=========
+
+Changed the code but again this slightly degrades performance. Tried all
+kind of other things, like having dedicated parity arrays to avoid the
+shift after parity[rp7] << 7; No gain.
+Change the lookup using the parity array by using shift operators (e.g.
+replace parity[rp7] << 7 with::
+
+	rp7 ^= (rp7 << 4);
+	rp7 ^= (rp7 << 2);
+	rp7 ^= (rp7 << 1);
+	rp7 &= 0x80;
+
+No gain.
+
+The only marginal change was inverting the parity bits, so we can remove
+the last three invert statements.
+
+Ah well, pity this does not deliver more. Then again 10 million
+iterations using the linux driver code takes between 13 and 13.5
+seconds, whereas my code now takes about 0.73 seconds for those 10
+million iterations. So basically I've improved the performance by a
+factor 18 on my system. Not that bad. Of course on different hardware
+you will get different results. No warranties!
+
+But of course there is no such thing as a free lunch. The codesize almost
+tripled (from 562 bytes to 1434 bytes). Then again, it is not that much.
+
+
+Correcting errors
+=================
+
+For correcting errors I again used the ST application note as a starter,
+but I also peeked at the existing code.
+
+The algorithm itself is pretty straightforward. Just xor the given and
+the calculated ecc. If all bytes are 0 there is no problem. If 11 bits
+are 1 we have one correctable bit error. If there is 1 bit 1, we have an
+error in the given ecc code.
+
+It proved to be fastest to do some table lookups. Performance gain
+introduced by this is about a factor 2 on my system when a repair had to
+be done, and 1% or so if no repair had to be done.
+
+Code size increased from 330 bytes to 686 bytes for this function.
+(gcc 4.2, -O3)
+
+
+Conclusion
+==========
+
+The gain when calculating the ecc is tremendous. Om my development hardware
+a speedup of a factor of 18 for ecc calculation was achieved. On a test on an
+embedded system with a MIPS core a factor 7 was obtained.
+
+On a test with a Linksys NSLU2 (ARMv5TE processor) the speedup was a factor
+5 (big endian mode, gcc 4.1.2, -O3)
+
+For correction not much gain could be obtained (as bitflips are rare). Then
+again there are also much less cycles spent there.
+
+It seems there is not much more gain possible in this, at least when
+programmed in C. Of course it might be possible to squeeze something more
+out of it with an assembler program, but due to pipeline behaviour etc
+this is very tricky (at least for intel hw).
+
+Author: Frans Meulenbroeks
+
+Copyright (C) 2008 Koninklijke Philips Electronics NV.
diff --git a/Documentation/driver-api/mtd/spi-nor.rst b/Documentation/driver-api/mtd/spi-nor.rst
new file mode 100644
index 000000000000..f5333e3bf486
--- /dev/null
+++ b/Documentation/driver-api/mtd/spi-nor.rst
@@ -0,0 +1,66 @@
+=================
+SPI NOR framework
+=================
+
+Part I - Why do we need this framework?
+---------------------------------------
+
+SPI bus controllers (drivers/spi/) only deal with streams of bytes; the bus
+controller operates agnostic of the specific device attached. However, some
+controllers (such as Freescale's QuadSPI controller) cannot easily handle
+arbitrary streams of bytes, but rather are designed specifically for SPI NOR.
+
+In particular, Freescale's QuadSPI controller must know the NOR commands to
+find the right LUT sequence. Unfortunately, the SPI subsystem has no notion of
+opcodes, addresses, or data payloads; a SPI controller simply knows to send or
+receive bytes (Tx and Rx). Therefore, we must define a new layering scheme under
+which the controller driver is aware of the opcodes, addressing, and other
+details of the SPI NOR protocol.
+
+Part II - How does the framework work?
+--------------------------------------
+
+This framework just adds a new layer between the MTD and the SPI bus driver.
+With this new layer, the SPI NOR controller driver does not depend on the
+m25p80 code anymore.
+
+Before this framework, the layer is like::
+
+                   MTD
+         ------------------------
+                  m25p80
+         ------------------------
+	       SPI bus driver
+         ------------------------
+	        SPI NOR chip
+
+   After this framework, the layer is like:
+                   MTD
+         ------------------------
+              SPI NOR framework
+         ------------------------
+                  m25p80
+         ------------------------
+	       SPI bus driver
+         ------------------------
+	       SPI NOR chip
+
+  With the SPI NOR controller driver (Freescale QuadSPI), it looks like:
+                   MTD
+         ------------------------
+              SPI NOR framework
+         ------------------------
+                fsl-quadSPI
+         ------------------------
+	       SPI NOR chip
+
+Part III - How can drivers use the framework?
+---------------------------------------------
+
+The main API is spi_nor_scan(). Before you call the hook, a driver should
+initialize the necessary fields for spi_nor{}. Please see
+drivers/mtd/spi-nor/spi-nor.c for detail. Please also refer to fsl-quadspi.c
+when you want to write a new driver for a SPI NOR controller.
+Another API is spi_nor_restore(), this is used to restore the status of SPI
+flash chip such as addressing mode. Call it whenever detach the driver from
+device or reboot the system.
diff --git a/Documentation/driver-api/nfc/index.rst b/Documentation/driver-api/nfc/index.rst
new file mode 100644
index 000000000000..b6e9eedbff29
--- /dev/null
+++ b/Documentation/driver-api/nfc/index.rst
@@ -0,0 +1,11 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+Near Field Communication
+========================
+
+.. toctree::
+   :maxdepth: 1
+
+   nfc-hci
+   nfc-pn544
diff --git a/Documentation/driver-api/nfc/nfc-hci.rst b/Documentation/driver-api/nfc/nfc-hci.rst
new file mode 100644
index 000000000000..eb8a1a14e919
--- /dev/null
+++ b/Documentation/driver-api/nfc/nfc-hci.rst
@@ -0,0 +1,311 @@
+========================
+HCI backend for NFC Core
+========================
+
+- Author: Eric Lapuyade, Samuel Ortiz
+- Contact: eric.lapuyade@intel.com, samuel.ortiz@intel.com
+
+General
+-------
+
+The HCI layer implements much of the ETSI TS 102 622 V10.2.0 specification. It
+enables easy writing of HCI-based NFC drivers. The HCI layer runs as an NFC Core
+backend, implementing an abstract nfc device and translating NFC Core API
+to HCI commands and events.
+
+HCI
+---
+
+HCI registers as an nfc device with NFC Core. Requests coming from userspace are
+routed through netlink sockets to NFC Core and then to HCI. From this point,
+they are translated in a sequence of HCI commands sent to the HCI layer in the
+host controller (the chip). Commands can be executed synchronously (the sending
+context blocks waiting for response) or asynchronously (the response is returned
+from HCI Rx context).
+HCI events can also be received from the host controller. They will be handled
+and a translation will be forwarded to NFC Core as needed. There are hooks to
+let the HCI driver handle proprietary events or override standard behavior.
+HCI uses 2 execution contexts:
+
+- one for executing commands : nfc_hci_msg_tx_work(). Only one command
+  can be executing at any given moment.
+- one for dispatching received events and commands : nfc_hci_msg_rx_work().
+
+HCI Session initialization
+--------------------------
+
+The Session initialization is an HCI standard which must unfortunately
+support proprietary gates. This is the reason why the driver will pass a list
+of proprietary gates that must be part of the session. HCI will ensure all
+those gates have pipes connected when the hci device is set up.
+In case the chip supports pre-opened gates and pseudo-static pipes, the driver
+can pass that information to HCI core.
+
+HCI Gates and Pipes
+-------------------
+
+A gate defines the 'port' where some service can be found. In order to access
+a service, one must create a pipe to that gate and open it. In this
+implementation, pipes are totally hidden. The public API only knows gates.
+This is consistent with the driver need to send commands to proprietary gates
+without knowing the pipe connected to it.
+
+Driver interface
+----------------
+
+A driver is generally written in two parts : the physical link management and
+the HCI management. This makes it easier to maintain a driver for a chip that
+can be connected using various phy (i2c, spi, ...)
+
+HCI Management
+--------------
+
+A driver would normally register itself with HCI and provide the following
+entry points::
+
+  struct nfc_hci_ops {
+	int (*open)(struct nfc_hci_dev *hdev);
+	void (*close)(struct nfc_hci_dev *hdev);
+	int (*hci_ready) (struct nfc_hci_dev *hdev);
+	int (*xmit) (struct nfc_hci_dev *hdev, struct sk_buff *skb);
+	int (*start_poll) (struct nfc_hci_dev *hdev,
+			   u32 im_protocols, u32 tm_protocols);
+	int (*dep_link_up)(struct nfc_hci_dev *hdev, struct nfc_target *target,
+			   u8 comm_mode, u8 *gb, size_t gb_len);
+	int (*dep_link_down)(struct nfc_hci_dev *hdev);
+	int (*target_from_gate) (struct nfc_hci_dev *hdev, u8 gate,
+				 struct nfc_target *target);
+	int (*complete_target_discovered) (struct nfc_hci_dev *hdev, u8 gate,
+					   struct nfc_target *target);
+	int (*im_transceive) (struct nfc_hci_dev *hdev,
+			      struct nfc_target *target, struct sk_buff *skb,
+			      data_exchange_cb_t cb, void *cb_context);
+	int (*tm_send)(struct nfc_hci_dev *hdev, struct sk_buff *skb);
+	int (*check_presence)(struct nfc_hci_dev *hdev,
+			      struct nfc_target *target);
+	int (*event_received)(struct nfc_hci_dev *hdev, u8 gate, u8 event,
+			      struct sk_buff *skb);
+  };
+
+- open() and close() shall turn the hardware on and off.
+- hci_ready() is an optional entry point that is called right after the hci
+  session has been set up. The driver can use it to do additional initialization
+  that must be performed using HCI commands.
+- xmit() shall simply write a frame to the physical link.
+- start_poll() is an optional entrypoint that shall set the hardware in polling
+  mode. This must be implemented only if the hardware uses proprietary gates or a
+  mechanism slightly different from the HCI standard.
+- dep_link_up() is called after a p2p target has been detected, to finish
+  the p2p connection setup with hardware parameters that need to be passed back
+  to nfc core.
+- dep_link_down() is called to bring the p2p link down.
+- target_from_gate() is an optional entrypoint to return the nfc protocols
+  corresponding to a proprietary gate.
+- complete_target_discovered() is an optional entry point to let the driver
+  perform additional proprietary processing necessary to auto activate the
+  discovered target.
+- im_transceive() must be implemented by the driver if proprietary HCI commands
+  are required to send data to the tag. Some tag types will require custom
+  commands, others can be written to using the standard HCI commands. The driver
+  can check the tag type and either do proprietary processing, or return 1 to ask
+  for standard processing. The data exchange command itself must be sent
+  asynchronously.
+- tm_send() is called to send data in the case of a p2p connection
+- check_presence() is an optional entry point that will be called regularly
+  by the core to check that an activated tag is still in the field. If this is
+  not implemented, the core will not be able to push tag_lost events to the user
+  space
+- event_received() is called to handle an event coming from the chip. Driver
+  can handle the event or return 1 to let HCI attempt standard processing.
+
+On the rx path, the driver is responsible to push incoming HCP frames to HCI
+using nfc_hci_recv_frame(). HCI will take care of re-aggregation and handling
+This must be done from a context that can sleep.
+
+PHY Management
+--------------
+
+The physical link (i2c, ...) management is defined by the following structure::
+
+  struct nfc_phy_ops {
+	int (*write)(void *dev_id, struct sk_buff *skb);
+	int (*enable)(void *dev_id);
+	void (*disable)(void *dev_id);
+  };
+
+enable():
+	turn the phy on (power on), make it ready to transfer data
+disable():
+	turn the phy off
+write():
+	Send a data frame to the chip. Note that to enable higher
+	layers such as an llc to store the frame for re-emission, this
+	function must not alter the skb. It must also not return a positive
+	result (return 0 for success, negative for failure).
+
+Data coming from the chip shall be sent directly to nfc_hci_recv_frame().
+
+LLC
+---
+
+Communication between the CPU and the chip often requires some link layer
+protocol. Those are isolated as modules managed by the HCI layer. There are
+currently two modules : nop (raw transfert) and shdlc.
+A new llc must implement the following functions::
+
+  struct nfc_llc_ops {
+	void *(*init) (struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv,
+		       rcv_to_hci_t rcv_to_hci, int tx_headroom,
+		       int tx_tailroom, int *rx_headroom, int *rx_tailroom,
+		       llc_failure_t llc_failure);
+	void (*deinit) (struct nfc_llc *llc);
+	int (*start) (struct nfc_llc *llc);
+	int (*stop) (struct nfc_llc *llc);
+	void (*rcv_from_drv) (struct nfc_llc *llc, struct sk_buff *skb);
+	int (*xmit_from_hci) (struct nfc_llc *llc, struct sk_buff *skb);
+  };
+
+init():
+	allocate and init your private storage
+deinit():
+	cleanup
+start():
+	establish the logical connection
+stop ():
+	terminate the logical connection
+rcv_from_drv():
+	handle data coming from the chip, going to HCI
+xmit_from_hci():
+	handle data sent by HCI, going to the chip
+
+The llc must be registered with nfc before it can be used. Do that by
+calling::
+
+	nfc_llc_register(const char *name, struct nfc_llc_ops *ops);
+
+Again, note that the llc does not handle the physical link. It is thus very
+easy to mix any physical link with any llc for a given chip driver.
+
+Included Drivers
+----------------
+
+An HCI based driver for an NXP PN544, connected through I2C bus, and using
+shdlc is included.
+
+Execution Contexts
+------------------
+
+The execution contexts are the following:
+- IRQ handler (IRQH):
+fast, cannot sleep. sends incoming frames to HCI where they are passed to
+the current llc. In case of shdlc, the frame is queued in shdlc rx queue.
+
+- SHDLC State Machine worker (SMW)
+
+  Only when llc_shdlc is used: handles shdlc rx & tx queues.
+
+  Dispatches HCI cmd responses.
+
+- HCI Tx Cmd worker (MSGTXWQ)
+
+  Serializes execution of HCI commands.
+
+  Completes execution in case of response timeout.
+
+- HCI Rx worker (MSGRXWQ)
+
+  Dispatches incoming HCI commands or events.
+
+- Syscall context from a userspace call (SYSCALL)
+
+  Any entrypoint in HCI called from NFC Core
+
+Workflow executing an HCI command (using shdlc)
+-----------------------------------------------
+
+Executing an HCI command can easily be performed synchronously using the
+following API::
+
+  int nfc_hci_send_cmd (struct nfc_hci_dev *hdev, u8 gate, u8 cmd,
+			const u8 *param, size_t param_len, struct sk_buff **skb)
+
+The API must be invoked from a context that can sleep. Most of the time, this
+will be the syscall context. skb will return the result that was received in
+the response.
+
+Internally, execution is asynchronous. So all this API does is to enqueue the
+HCI command, setup a local wait queue on stack, and wait_event() for completion.
+The wait is not interruptible because it is guaranteed that the command will
+complete after some short timeout anyway.
+
+MSGTXWQ context will then be scheduled and invoke nfc_hci_msg_tx_work().
+This function will dequeue the next pending command and send its HCP fragments
+to the lower layer which happens to be shdlc. It will then start a timer to be
+able to complete the command with a timeout error if no response arrive.
+
+SMW context gets scheduled and invokes nfc_shdlc_sm_work(). This function
+handles shdlc framing in and out. It uses the driver xmit to send frames and
+receives incoming frames in an skb queue filled from the driver IRQ handler.
+SHDLC I(nformation) frames payload are HCP fragments. They are aggregated to
+form complete HCI frames, which can be a response, command, or event.
+
+HCI Responses are dispatched immediately from this context to unblock
+waiting command execution. Response processing involves invoking the completion
+callback that was provided by nfc_hci_msg_tx_work() when it sent the command.
+The completion callback will then wake the syscall context.
+
+It is also possible to execute the command asynchronously using this API::
+
+  static int nfc_hci_execute_cmd_async(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
+				       const u8 *param, size_t param_len,
+				       data_exchange_cb_t cb, void *cb_context)
+
+The workflow is the same, except that the API call returns immediately, and
+the callback will be called with the result from the SMW context.
+
+Workflow receiving an HCI event or command
+------------------------------------------
+
+HCI commands or events are not dispatched from SMW context. Instead, they are
+queued to HCI rx_queue and will be dispatched from HCI rx worker
+context (MSGRXWQ). This is done this way to allow a cmd or event handler
+to also execute other commands (for example, handling the
+NFC_HCI_EVT_TARGET_DISCOVERED event from PN544 requires to issue an
+ANY_GET_PARAMETER to the reader A gate to get information on the target
+that was discovered).
+
+Typically, such an event will be propagated to NFC Core from MSGRXWQ context.
+
+Error management
+----------------
+
+Errors that occur synchronously with the execution of an NFC Core request are
+simply returned as the execution result of the request. These are easy.
+
+Errors that occur asynchronously (e.g. in a background protocol handling thread)
+must be reported such that upper layers don't stay ignorant that something
+went wrong below and know that expected events will probably never happen.
+Handling of these errors is done as follows:
+
+- driver (pn544) fails to deliver an incoming frame: it stores the error such
+  that any subsequent call to the driver will result in this error. Then it
+  calls the standard nfc_shdlc_recv_frame() with a NULL argument to report the
+  problem above. shdlc stores a EREMOTEIO sticky status, which will trigger
+  SMW to report above in turn.
+
+- SMW is basically a background thread to handle incoming and outgoing shdlc
+  frames. This thread will also check the shdlc sticky status and report to HCI
+  when it discovers it is not able to run anymore because of an unrecoverable
+  error that happened within shdlc or below. If the problem occurs during shdlc
+  connection, the error is reported through the connect completion.
+
+- HCI: if an internal HCI error happens (frame is lost), or HCI is reported an
+  error from a lower layer, HCI will either complete the currently executing
+  command with that error, or notify NFC Core directly if no command is
+  executing.
+
+- NFC Core: when NFC Core is notified of an error from below and polling is
+  active, it will send a tag discovered event with an empty tag list to the user
+  space to let it know that the poll operation will never be able to detect a
+  tag. If polling is not active and the error was sticky, lower levels will
+  return it at next invocation.
diff --git a/Documentation/driver-api/nfc/nfc-pn544.rst b/Documentation/driver-api/nfc/nfc-pn544.rst
new file mode 100644
index 000000000000..6b2d8aae0c4e
--- /dev/null
+++ b/Documentation/driver-api/nfc/nfc-pn544.rst
@@ -0,0 +1,34 @@
+============================================================================
+Kernel driver for the NXP Semiconductors PN544 Near Field Communication chip
+============================================================================
+
+
+General
+-------
+
+The PN544 is an integrated transmission module for contactless
+communication. The driver goes under drives/nfc/ and is compiled as a
+module named "pn544".
+
+Host Interfaces: I2C, SPI and HSU, this driver supports currently only I2C.
+
+Protocols
+---------
+
+In the normal (HCI) mode and in the firmware update mode read and
+write functions behave a bit differently because the message formats
+or the protocols are different.
+
+In the normal (HCI) mode the protocol used is derived from the ETSI
+HCI specification. The firmware is updated using a specific protocol,
+which is different from HCI.
+
+HCI messages consist of an eight bit header and the message body. The
+header contains the message length. Maximum size for an HCI message is
+33. In HCI mode sent messages are tested for a correct
+checksum. Firmware update messages have the length in the second (MSB)
+and third (LSB) bytes of the message. The maximum FW message length is
+1024 bytes.
+
+For the ETSI HCI specification see
+http://www.etsi.org/WebSite/Technologies/ProtocolSpecification.aspx
diff --git a/Documentation/driver-api/ntb.rst b/Documentation/driver-api/ntb.rst
new file mode 100644
index 000000000000..87d1372da879
--- /dev/null
+++ b/Documentation/driver-api/ntb.rst
@@ -0,0 +1,263 @@
+===========
+NTB Drivers
+===========
+
+NTB (Non-Transparent Bridge) is a type of PCI-Express bridge chip that connects
+the separate memory systems of two or more computers to the same PCI-Express
+fabric. Existing NTB hardware supports a common feature set: doorbell
+registers and memory translation windows, as well as non common features like
+scratchpad and message registers. Scratchpad registers are read-and-writable
+registers that are accessible from either side of the device, so that peers can
+exchange a small amount of information at a fixed address. Message registers can
+be utilized for the same purpose. Additionally they are provided with with
+special status bits to make sure the information isn't rewritten by another
+peer. Doorbell registers provide a way for peers to send interrupt events.
+Memory windows allow translated read and write access to the peer memory.
+
+NTB Core Driver (ntb)
+=====================
+
+The NTB core driver defines an api wrapping the common feature set, and allows
+clients interested in NTB features to discover NTB the devices supported by
+hardware drivers.  The term "client" is used here to mean an upper layer
+component making use of the NTB api.  The term "driver," or "hardware driver,"
+is used here to mean a driver for a specific vendor and model of NTB hardware.
+
+NTB Client Drivers
+==================
+
+NTB client drivers should register with the NTB core driver.  After
+registering, the client probe and remove functions will be called appropriately
+as ntb hardware, or hardware drivers, are inserted and removed.  The
+registration uses the Linux Device framework, so it should feel familiar to
+anyone who has written a pci driver.
+
+NTB Typical client driver implementation
+----------------------------------------
+
+Primary purpose of NTB is to share some peace of memory between at least two
+systems. So the NTB device features like Scratchpad/Message registers are
+mainly used to perform the proper memory window initialization. Typically
+there are two types of memory window interfaces supported by the NTB API:
+inbound translation configured on the local ntb port and outbound translation
+configured by the peer, on the peer ntb port. The first type is
+depicted on the next figure::
+
+ Inbound translation:
+
+ Memory:              Local NTB Port:      Peer NTB Port:      Peer MMIO:
+  ____________
+ | dma-mapped |-ntb_mw_set_trans(addr)  |
+ | memory     |        _v____________   |   ______________
+ | (addr)     |<======| MW xlat addr |<====| MW base addr |<== memory-mapped IO
+ |------------|       |--------------|  |  |--------------|
+
+So typical scenario of the first type memory window initialization looks:
+1) allocate a memory region, 2) put translated address to NTB config,
+3) somehow notify a peer device of performed initialization, 4) peer device
+maps corresponding outbound memory window so to have access to the shared
+memory region.
+
+The second type of interface, that implies the shared windows being
+initialized by a peer device, is depicted on the figure::
+
+ Outbound translation:
+
+ Memory:        Local NTB Port:    Peer NTB Port:      Peer MMIO:
+  ____________                      ______________
+ | dma-mapped |                |   | MW base addr |<== memory-mapped IO
+ | memory     |                |   |--------------|
+ | (addr)     |<===================| MW xlat addr |<-ntb_peer_mw_set_trans(addr)
+ |------------|                |   |--------------|
+
+Typical scenario of the second type interface initialization would be:
+1) allocate a memory region, 2) somehow deliver a translated address to a peer
+device, 3) peer puts the translated address to NTB config, 4) peer device maps
+outbound memory window so to have access to the shared memory region.
+
+As one can see the described scenarios can be combined in one portable
+algorithm.
+
+ Local device:
+  1) Allocate memory for a shared window
+  2) Initialize memory window by translated address of the allocated region
+     (it may fail if local memory window initialization is unsupported)
+  3) Send the translated address and memory window index to a peer device
+
+ Peer device:
+  1) Initialize memory window with retrieved address of the allocated
+     by another device memory region (it may fail if peer memory window
+     initialization is unsupported)
+  2) Map outbound memory window
+
+In accordance with this scenario, the NTB Memory Window API can be used as
+follows:
+
+ Local device:
+  1) ntb_mw_count(pidx) - retrieve number of memory ranges, which can
+     be allocated for memory windows between local device and peer device
+     of port with specified index.
+  2) ntb_get_align(pidx, midx) - retrieve parameters restricting the
+     shared memory region alignment and size. Then memory can be properly
+     allocated.
+  3) Allocate physically contiguous memory region in compliance with
+     restrictions retrieved in 2).
+  4) ntb_mw_set_trans(pidx, midx) - try to set translation address of
+     the memory window with specified index for the defined peer device
+     (it may fail if local translated address setting is not supported)
+  5) Send translated base address (usually together with memory window
+     number) to the peer device using, for instance, scratchpad or message
+     registers.
+
+ Peer device:
+  1) ntb_peer_mw_set_trans(pidx, midx) - try to set received from other
+     device (related to pidx) translated address for specified memory
+     window. It may fail if retrieved address, for instance, exceeds
+     maximum possible address or isn't properly aligned.
+  2) ntb_peer_mw_get_addr(widx) - retrieve MMIO address to map the memory
+     window so to have an access to the shared memory.
+
+Also it is worth to note, that method ntb_mw_count(pidx) should return the
+same value as ntb_peer_mw_count() on the peer with port index - pidx.
+
+NTB Transport Client (ntb\_transport) and NTB Netdev (ntb\_netdev)
+------------------------------------------------------------------
+
+The primary client for NTB is the Transport client, used in tandem with NTB
+Netdev.  These drivers function together to create a logical link to the peer,
+across the ntb, to exchange packets of network data.  The Transport client
+establishes a logical link to the peer, and creates queue pairs to exchange
+messages and data.  The NTB Netdev then creates an ethernet device using a
+Transport queue pair.  Network data is copied between socket buffers and the
+Transport queue pair buffer.  The Transport client may be used for other things
+besides Netdev, however no other applications have yet been written.
+
+NTB Ping Pong Test Client (ntb\_pingpong)
+-----------------------------------------
+
+The Ping Pong test client serves as a demonstration to exercise the doorbell
+and scratchpad registers of NTB hardware, and as an example simple NTB client.
+Ping Pong enables the link when started, waits for the NTB link to come up, and
+then proceeds to read and write the doorbell scratchpad registers of the NTB.
+The peers interrupt each other using a bit mask of doorbell bits, which is
+shifted by one in each round, to test the behavior of multiple doorbell bits
+and interrupt vectors.  The Ping Pong driver also reads the first local
+scratchpad, and writes the value plus one to the first peer scratchpad, each
+round before writing the peer doorbell register.
+
+Module Parameters:
+
+* unsafe - Some hardware has known issues with scratchpad and doorbell
+	registers.  By default, Ping Pong will not attempt to exercise such
+	hardware.  You may override this behavior at your own risk by setting
+	unsafe=1.
+* delay\_ms - Specify the delay between receiving a doorbell
+	interrupt event and setting the peer doorbell register for the next
+	round.
+* init\_db - Specify the doorbell bits to start new series of rounds.  A new
+	series begins once all the doorbell bits have been shifted out of
+	range.
+* dyndbg - It is suggested to specify dyndbg=+p when loading this module, and
+	then to observe debugging output on the console.
+
+NTB Tool Test Client (ntb\_tool)
+--------------------------------
+
+The Tool test client serves for debugging, primarily, ntb hardware and drivers.
+The Tool provides access through debugfs for reading, setting, and clearing the
+NTB doorbell, and reading and writing scratchpads.
+
+The Tool does not currently have any module parameters.
+
+Debugfs Files:
+
+* *debugfs*/ntb\_tool/*hw*/
+	A directory in debugfs will be created for each
+	NTB device probed by the tool.  This directory is shortened to *hw*
+	below.
+* *hw*/db
+	This file is used to read, set, and clear the local doorbell.  Not
+	all operations may be supported by all hardware.  To read the doorbell,
+	read the file.  To set the doorbell, write `s` followed by the bits to
+	set (eg: `echo 's 0x0101' > db`).  To clear the doorbell, write `c`
+	followed by the bits to clear.
+* *hw*/mask
+	This file is used to read, set, and clear the local doorbell mask.
+	See *db* for details.
+* *hw*/peer\_db
+	This file is used to read, set, and clear the peer doorbell.
+	See *db* for details.
+* *hw*/peer\_mask
+	This file is used to read, set, and clear the peer doorbell
+	mask.  See *db* for details.
+* *hw*/spad
+	This file is used to read and write local scratchpads.  To read
+	the values of all scratchpads, read the file.  To write values, write a
+	series of pairs of scratchpad number and value
+	(eg: `echo '4 0x123 7 0xabc' > spad`
+	# to set scratchpads `4` and `7` to `0x123` and `0xabc`, respectively).
+* *hw*/peer\_spad
+	This file is used to read and write peer scratchpads.  See
+	*spad* for details.
+
+NTB MSI Test Client (ntb\_msi\_test)
+------------------------------------
+
+The MSI test client serves to test and debug the MSI library which
+allows for passing MSI interrupts across NTB memory windows. The
+test client is interacted with through the debugfs filesystem:
+
+* *debugfs*/ntb\_tool/*hw*/
+	A directory in debugfs will be created for each
+	NTB device probed by the tool.  This directory is shortened to *hw*
+	below.
+* *hw*/port
+	This file describes the local port number
+* *hw*/irq*_occurrences
+	One occurrences file exists for each interrupt and, when read,
+	returns the number of times the interrupt has been triggered.
+* *hw*/peer*/port
+	This file describes the port number for each peer
+* *hw*/peer*/count
+	This file describes the number of interrupts that can be
+	triggered on each peer
+* *hw*/peer*/trigger
+	Writing an interrupt number (any number less than the value
+	specified in count) will trigger the interrupt on the
+	specified peer. That peer's interrupt's occurrence file
+	should be incremented.
+
+NTB Hardware Drivers
+====================
+
+NTB hardware drivers should register devices with the NTB core driver.  After
+registering, clients probe and remove functions will be called.
+
+NTB Intel Hardware Driver (ntb\_hw\_intel)
+------------------------------------------
+
+The Intel hardware driver supports NTB on Xeon and Atom CPUs.
+
+Module Parameters:
+
+* b2b\_mw\_idx
+	If the peer ntb is to be accessed via a memory window, then use
+	this memory window to access the peer ntb.  A value of zero or positive
+	starts from the first mw idx, and a negative value starts from the last
+	mw idx.  Both sides MUST set the same value here!  The default value is
+	`-1`.
+* b2b\_mw\_share
+	If the peer ntb is to be accessed via a memory window, and if
+	the memory window is large enough, still allow the client to use the
+	second half of the memory window for address translation to the peer.
+* xeon\_b2b\_usd\_bar2\_addr64
+	If using B2B topology on Xeon hardware, use
+	this 64 bit address on the bus between the NTB devices for the window
+	at BAR2, on the upstream side of the link.
+* xeon\_b2b\_usd\_bar4\_addr64 - See *xeon\_b2b\_bar2\_addr64*.
+* xeon\_b2b\_usd\_bar4\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
+* xeon\_b2b\_usd\_bar5\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
+* xeon\_b2b\_dsd\_bar2\_addr64 - See *xeon\_b2b\_bar2\_addr64*.
+* xeon\_b2b\_dsd\_bar4\_addr64 - See *xeon\_b2b\_bar2\_addr64*.
+* xeon\_b2b\_dsd\_bar4\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
+* xeon\_b2b\_dsd\_bar5\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
diff --git a/Documentation/driver-api/nvdimm/btt.rst b/Documentation/driver-api/nvdimm/btt.rst
new file mode 100644
index 000000000000..107395c042ae
--- /dev/null
+++ b/Documentation/driver-api/nvdimm/btt.rst
@@ -0,0 +1,285 @@
+=============================
+BTT - Block Translation Table
+=============================
+
+
+1. Introduction
+===============
+
+Persistent memory based storage is able to perform IO at byte (or more
+accurately, cache line) granularity. However, we often want to expose such
+storage as traditional block devices. The block drivers for persistent memory
+will do exactly this. However, they do not provide any atomicity guarantees.
+Traditional SSDs typically provide protection against torn sectors in hardware,
+using stored energy in capacitors to complete in-flight block writes, or perhaps
+in firmware. We don't have this luxury with persistent memory - if a write is in
+progress, and we experience a power failure, the block will contain a mix of old
+and new data. Applications may not be prepared to handle such a scenario.
+
+The Block Translation Table (BTT) provides atomic sector update semantics for
+persistent memory devices, so that applications that rely on sector writes not
+being torn can continue to do so. The BTT manifests itself as a stacked block
+device, and reserves a portion of the underlying storage for its metadata. At
+the heart of it, is an indirection table that re-maps all the blocks on the
+volume. It can be thought of as an extremely simple file system that only
+provides atomic sector updates.
+
+
+2. Static Layout
+================
+
+The underlying storage on which a BTT can be laid out is not limited in any way.
+The BTT, however, splits the available space into chunks of up to 512 GiB,
+called "Arenas".
+
+Each arena follows the same layout for its metadata, and all references in an
+arena are internal to it (with the exception of one field that points to the
+next arena). The following depicts the "On-disk" metadata layout::
+
+
+    Backing Store     +------->  Arena
+  +---------------+   |   +------------------+
+  |               |   |   | Arena info block |
+  |    Arena 0    +---+   |       4K         |
+  |     512G      |       +------------------+
+  |               |       |                  |
+  +---------------+       |                  |
+  |               |       |                  |
+  |    Arena 1    |       |   Data Blocks    |
+  |     512G      |       |                  |
+  |               |       |                  |
+  +---------------+       |                  |
+  |       .       |       |                  |
+  |       .       |       |                  |
+  |       .       |       |                  |
+  |               |       |                  |
+  |               |       |                  |
+  +---------------+       +------------------+
+                          |                  |
+                          |     BTT Map      |
+                          |                  |
+                          |                  |
+                          +------------------+
+                          |                  |
+                          |     BTT Flog     |
+                          |                  |
+                          +------------------+
+                          | Info block copy  |
+                          |       4K         |
+                          +------------------+
+
+
+3. Theory of Operation
+======================
+
+
+a. The BTT Map
+--------------
+
+The map is a simple lookup/indirection table that maps an LBA to an internal
+block. Each map entry is 32 bits. The two most significant bits are special
+flags, and the remaining form the internal block number.
+
+======== =============================================================
+Bit      Description
+======== =============================================================
+31 - 30	 Error and Zero flags - Used in the following way::
+
+	   == ==  ====================================================
+	   31 30  Description
+	   == ==  ====================================================
+	   0  0	  Initial state. Reads return zeroes; Premap = Postmap
+	   0  1	  Zero state: Reads return zeroes
+	   1  0	  Error state: Reads fail; Writes clear 'E' bit
+	   1  1	  Normal Block – has valid postmap
+	   == ==  ====================================================
+
+29 - 0	 Mappings to internal 'postmap' blocks
+======== =============================================================
+
+
+Some of the terminology that will be subsequently used:
+
+============	================================================================
+External LBA	LBA as made visible to upper layers.
+ABA		Arena Block Address - Block offset/number within an arena
+Premap ABA	The block offset into an arena, which was decided upon by range
+		checking the External LBA
+Postmap ABA	The block number in the "Data Blocks" area obtained after
+		indirection from the map
+nfree		The number of free blocks that are maintained at any given time.
+		This is the number of concurrent writes that can happen to the
+		arena.
+============	================================================================
+
+
+For example, after adding a BTT, we surface a disk of 1024G. We get a read for
+the external LBA at 768G. This falls into the second arena, and of the 512G
+worth of blocks that this arena contributes, this block is at 256G. Thus, the
+premap ABA is 256G. We now refer to the map, and find out the mapping for block
+'X' (256G) points to block 'Y', say '64'. Thus the postmap ABA is 64.
+
+
+b. The BTT Flog
+---------------
+
+The BTT provides sector atomicity by making every write an "allocating write",
+i.e. Every write goes to a "free" block. A running list of free blocks is
+maintained in the form of the BTT flog. 'Flog' is a combination of the words
+"free list" and "log". The flog contains 'nfree' entries, and an entry contains:
+
+========  =====================================================================
+lba       The premap ABA that is being written to
+old_map   The old postmap ABA - after 'this' write completes, this will be a
+	  free block.
+new_map   The new postmap ABA. The map will up updated to reflect this
+	  lba->postmap_aba mapping, but we log it here in case we have to
+	  recover.
+seq	  Sequence number to mark which of the 2 sections of this flog entry is
+	  valid/newest. It cycles between 01->10->11->01 (binary) under normal
+	  operation, with 00 indicating an uninitialized state.
+lba'	  alternate lba entry
+old_map'  alternate old postmap entry
+new_map'  alternate new postmap entry
+seq'	  alternate sequence number.
+========  =====================================================================
+
+Each of the above fields is 32-bit, making one entry 32 bytes. Entries are also
+padded to 64 bytes to avoid cache line sharing or aliasing. Flog updates are
+done such that for any entry being written, it:
+a. overwrites the 'old' section in the entry based on sequence numbers
+b. writes the 'new' section such that the sequence number is written last.
+
+
+c. The concept of lanes
+-----------------------
+
+While 'nfree' describes the number of concurrent IOs an arena can process
+concurrently, 'nlanes' is the number of IOs the BTT device as a whole can
+process::
+
+	nlanes = min(nfree, num_cpus)
+
+A lane number is obtained at the start of any IO, and is used for indexing into
+all the on-disk and in-memory data structures for the duration of the IO. If
+there are more CPUs than the max number of available lanes, than lanes are
+protected by spinlocks.
+
+
+d. In-memory data structure: Read Tracking Table (RTT)
+------------------------------------------------------
+
+Consider a case where we have two threads, one doing reads and the other,
+writes. We can hit a condition where the writer thread grabs a free block to do
+a new IO, but the (slow) reader thread is still reading from it. In other words,
+the reader consulted a map entry, and started reading the corresponding block. A
+writer started writing to the same external LBA, and finished the write updating
+the map for that external LBA to point to its new postmap ABA. At this point the
+internal, postmap block that the reader is (still) reading has been inserted
+into the list of free blocks. If another write comes in for the same LBA, it can
+grab this free block, and start writing to it, causing the reader to read
+incorrect data. To prevent this, we introduce the RTT.
+
+The RTT is a simple, per arena table with 'nfree' entries. Every reader inserts
+into rtt[lane_number], the postmap ABA it is reading, and clears it after the
+read is complete. Every writer thread, after grabbing a free block, checks the
+RTT for its presence. If the postmap free block is in the RTT, it waits till the
+reader clears the RTT entry, and only then starts writing to it.
+
+
+e. In-memory data structure: map locks
+--------------------------------------
+
+Consider a case where two writer threads are writing to the same LBA. There can
+be a race in the following sequence of steps::
+
+	free[lane] = map[premap_aba]
+	map[premap_aba] = postmap_aba
+
+Both threads can update their respective free[lane] with the same old, freed
+postmap_aba. This has made the layout inconsistent by losing a free entry, and
+at the same time, duplicating another free entry for two lanes.
+
+To solve this, we could have a single map lock (per arena) that has to be taken
+before performing the above sequence, but we feel that could be too contentious.
+Instead we use an array of (nfree) map_locks that is indexed by
+(premap_aba modulo nfree).
+
+
+f. Reconstruction from the Flog
+-------------------------------
+
+On startup, we analyze the BTT flog to create our list of free blocks. We walk
+through all the entries, and for each lane, of the set of two possible
+'sections', we always look at the most recent one only (based on the sequence
+number). The reconstruction rules/steps are simple:
+
+- Read map[log_entry.lba].
+- If log_entry.new matches the map entry, then log_entry.old is free.
+- If log_entry.new does not match the map entry, then log_entry.new is free.
+  (This case can only be caused by power-fails/unsafe shutdowns)
+
+
+g. Summarizing - Read and Write flows
+-------------------------------------
+
+Read:
+
+1.  Convert external LBA to arena number + pre-map ABA
+2.  Get a lane (and take lane_lock)
+3.  Read map to get the entry for this pre-map ABA
+4.  Enter post-map ABA into RTT[lane]
+5.  If TRIM flag set in map, return zeroes, and end IO (go to step 8)
+6.  If ERROR flag set in map, end IO with EIO (go to step 8)
+7.  Read data from this block
+8.  Remove post-map ABA entry from RTT[lane]
+9.  Release lane (and lane_lock)
+
+Write:
+
+1.  Convert external LBA to Arena number + pre-map ABA
+2.  Get a lane (and take lane_lock)
+3.  Use lane to index into in-memory free list and obtain a new block, next flog
+    index, next sequence number
+4.  Scan the RTT to check if free block is present, and spin/wait if it is.
+5.  Write data to this free block
+6.  Read map to get the existing post-map ABA entry for this pre-map ABA
+7.  Write flog entry: [premap_aba / old postmap_aba / new postmap_aba / seq_num]
+8.  Write new post-map ABA into map.
+9.  Write old post-map entry into the free list
+10. Calculate next sequence number and write into the free list entry
+11. Release lane (and lane_lock)
+
+
+4. Error Handling
+=================
+
+An arena would be in an error state if any of the metadata is corrupted
+irrecoverably, either due to a bug or a media error. The following conditions
+indicate an error:
+
+- Info block checksum does not match (and recovering from the copy also fails)
+- All internal available blocks are not uniquely and entirely addressed by the
+  sum of mapped blocks and free blocks (from the BTT flog).
+- Rebuilding free list from the flog reveals missing/duplicate/impossible
+  entries
+- A map entry is out of bounds
+
+If any of these error conditions are encountered, the arena is put into a read
+only state using a flag in the info block.
+
+
+5. Usage
+========
+
+The BTT can be set up on any disk (namespace) exposed by the libnvdimm subsystem
+(pmem, or blk mode). The easiest way to set up such a namespace is using the
+'ndctl' utility [1]:
+
+For example, the ndctl command line to setup a btt with a 4k sector size is::
+
+    ndctl create-namespace -f -e namespace0.0 -m sector -l 4k
+
+See ndctl create-namespace --help for more options.
+
+[1]: https://github.com/pmem/ndctl
diff --git a/Documentation/driver-api/nvdimm/index.rst b/Documentation/driver-api/nvdimm/index.rst
new file mode 100644
index 000000000000..a4f8f98aeb94
--- /dev/null
+++ b/Documentation/driver-api/nvdimm/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================
+Non-Volatile Memory Device (NVDIMM)
+===================================
+
+.. toctree::
+   :maxdepth: 1
+
+   nvdimm
+   btt
+   security
diff --git a/Documentation/driver-api/nvdimm/nvdimm.rst b/Documentation/driver-api/nvdimm/nvdimm.rst
new file mode 100644
index 000000000000..08f855cbb4e6
--- /dev/null
+++ b/Documentation/driver-api/nvdimm/nvdimm.rst
@@ -0,0 +1,887 @@
+===============================
+LIBNVDIMM: Non-Volatile Devices
+===============================
+
+libnvdimm - kernel / libndctl - userspace helper library
+
+linux-nvdimm@lists.01.org
+
+Version 13
+
+.. contents:
+
+	Glossary
+	Overview
+	    Supporting Documents
+	    Git Trees
+	LIBNVDIMM PMEM and BLK
+	Why BLK?
+	    PMEM vs BLK
+	        BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
+	Example NVDIMM Platform
+	LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
+	    LIBNDCTL: Context
+	        libndctl: instantiate a new library context example
+	    LIBNVDIMM/LIBNDCTL: Bus
+	        libnvdimm: control class device in /sys/class
+	        libnvdimm: bus
+	        libndctl: bus enumeration example
+	    LIBNVDIMM/LIBNDCTL: DIMM (NMEM)
+	        libnvdimm: DIMM (NMEM)
+	        libndctl: DIMM enumeration example
+	    LIBNVDIMM/LIBNDCTL: Region
+	        libnvdimm: region
+	        libndctl: region enumeration example
+	        Why Not Encode the Region Type into the Region Name?
+	        How Do I Determine the Major Type of a Region?
+	    LIBNVDIMM/LIBNDCTL: Namespace
+	        libnvdimm: namespace
+	        libndctl: namespace enumeration example
+	        libndctl: namespace creation example
+	        Why the Term "namespace"?
+	    LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
+	        libnvdimm: btt layout
+	        libndctl: btt creation example
+	Summary LIBNDCTL Diagram
+
+
+Glossary
+========
+
+PMEM:
+  A system-physical-address range where writes are persistent.  A
+  block device composed of PMEM is capable of DAX.  A PMEM address range
+  may span an interleave of several DIMMs.
+
+BLK:
+  A set of one or more programmable memory mapped apertures provided
+  by a DIMM to access its media.  This indirection precludes the
+  performance benefit of interleaving, but enables DIMM-bounded failure
+  modes.
+
+DPA:
+  DIMM Physical Address, is a DIMM-relative offset.  With one DIMM in
+  the system there would be a 1:1 system-physical-address:DPA association.
+  Once more DIMMs are added a memory controller interleave must be
+  decoded to determine the DPA associated with a given
+  system-physical-address.  BLK capacity always has a 1:1 relationship
+  with a single-DIMM's DPA range.
+
+DAX:
+  File system extensions to bypass the page cache and block layer to
+  mmap persistent memory, from a PMEM block device, directly into a
+  process address space.
+
+DSM:
+  Device Specific Method: ACPI method to to control specific
+  device - in this case the firmware.
+
+DCR:
+  NVDIMM Control Region Structure defined in ACPI 6 Section 5.2.25.5.
+  It defines a vendor-id, device-id, and interface format for a given DIMM.
+
+BTT:
+  Block Translation Table: Persistent memory is byte addressable.
+  Existing software may have an expectation that the power-fail-atomicity
+  of writes is at least one sector, 512 bytes.  The BTT is an indirection
+  table with atomic update semantics to front a PMEM/BLK block device
+  driver and present arbitrary atomic sector sizes.
+
+LABEL:
+  Metadata stored on a DIMM device that partitions and identifies
+  (persistently names) storage between PMEM and BLK.  It also partitions
+  BLK storage to host BTTs with different parameters per BLK-partition.
+  Note that traditional partition tables, GPT/MBR, are layered on top of a
+  BLK or PMEM device.
+
+
+Overview
+========
+
+The LIBNVDIMM subsystem provides support for three types of NVDIMMs, namely,
+PMEM, BLK, and NVDIMM devices that can simultaneously support both PMEM
+and BLK mode access.  These three modes of operation are described by
+the "NVDIMM Firmware Interface Table" (NFIT) in ACPI 6.  While the LIBNVDIMM
+implementation is generic and supports pre-NFIT platforms, it was guided
+by the superset of capabilities need to support this ACPI 6 definition
+for NVDIMM resources.  The bulk of the kernel implementation is in place
+to handle the case where DPA accessible via PMEM is aliased with DPA
+accessible via BLK.  When that occurs a LABEL is needed to reserve DPA
+for exclusive access via one mode a time.
+
+Supporting Documents
+--------------------
+
+ACPI 6:
+	http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
+NVDIMM Namespace:
+	http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf
+DSM Interface Example:
+	http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf
+Driver Writer's Guide:
+	http://pmem.io/documents/NVDIMM_Driver_Writers_Guide.pdf
+
+Git Trees
+---------
+
+LIBNVDIMM:
+	https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git
+LIBNDCTL:
+	https://github.com/pmem/ndctl.git
+PMEM:
+	https://github.com/01org/prd
+
+
+LIBNVDIMM PMEM and BLK
+======================
+
+Prior to the arrival of the NFIT, non-volatile memory was described to a
+system in various ad-hoc ways.  Usually only the bare minimum was
+provided, namely, a single system-physical-address range where writes
+are expected to be durable after a system power loss.  Now, the NFIT
+specification standardizes not only the description of PMEM, but also
+BLK and platform message-passing entry points for control and
+configuration.
+
+For each NVDIMM access method (PMEM, BLK), LIBNVDIMM provides a block
+device driver:
+
+    1. PMEM (nd_pmem.ko): Drives a system-physical-address range.  This
+       range is contiguous in system memory and may be interleaved (hardware
+       memory controller striped) across multiple DIMMs.  When interleaved the
+       platform may optionally provide details of which DIMMs are participating
+       in the interleave.
+
+       Note that while LIBNVDIMM describes system-physical-address ranges that may
+       alias with BLK access as ND_NAMESPACE_PMEM ranges and those without
+       alias as ND_NAMESPACE_IO ranges, to the nd_pmem driver there is no
+       distinction.  The different device-types are an implementation detail
+       that userspace can exploit to implement policies like "only interface
+       with address ranges from certain DIMMs".  It is worth noting that when
+       aliasing is present and a DIMM lacks a label, then no block device can
+       be created by default as userspace needs to do at least one allocation
+       of DPA to the PMEM range.  In contrast ND_NAMESPACE_IO ranges, once
+       registered, can be immediately attached to nd_pmem.
+
+    2. BLK (nd_blk.ko): This driver performs I/O using a set of platform
+       defined apertures.  A set of apertures will access just one DIMM.
+       Multiple windows (apertures) allow multiple concurrent accesses, much like
+       tagged-command-queuing, and would likely be used by different threads or
+       different CPUs.
+
+       The NFIT specification defines a standard format for a BLK-aperture, but
+       the spec also allows for vendor specific layouts, and non-NFIT BLK
+       implementations may have other designs for BLK I/O.  For this reason
+       "nd_blk" calls back into platform-specific code to perform the I/O.
+
+       One such implementation is defined in the "Driver Writer's Guide" and "DSM
+       Interface Example".
+
+
+Why BLK?
+========
+
+While PMEM provides direct byte-addressable CPU-load/store access to
+NVDIMM storage, it does not provide the best system RAS (recovery,
+availability, and serviceability) model.  An access to a corrupted
+system-physical-address address causes a CPU exception while an access
+to a corrupted address through an BLK-aperture causes that block window
+to raise an error status in a register.  The latter is more aligned with
+the standard error model that host-bus-adapter attached disks present.
+
+Also, if an administrator ever wants to replace a memory it is easier to
+service a system at DIMM module boundaries.  Compare this to PMEM where
+data could be interleaved in an opaque hardware specific manner across
+several DIMMs.
+
+PMEM vs BLK
+-----------
+
+BLK-apertures solve these RAS problems, but their presence is also the
+major contributing factor to the complexity of the ND subsystem.  They
+complicate the implementation because PMEM and BLK alias in DPA space.
+Any given DIMM's DPA-range may contribute to one or more
+system-physical-address sets of interleaved DIMMs, *and* may also be
+accessed in its entirety through its BLK-aperture.  Accessing a DPA
+through a system-physical-address while simultaneously accessing the
+same DPA through a BLK-aperture has undefined results.  For this reason,
+DIMMs with this dual interface configuration include a DSM function to
+store/retrieve a LABEL.  The LABEL effectively partitions the DPA-space
+into exclusive system-physical-address and BLK-aperture accessible
+regions.  For simplicity a DIMM is allowed a PMEM "region" per each
+interleave set in which it is a member.  The remaining DPA space can be
+carved into an arbitrary number of BLK devices with discontiguous
+extents.
+
+BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+One of the few
+reasons to allow multiple BLK namespaces per REGION is so that each
+BLK-namespace can be configured with a BTT with unique atomic sector
+sizes.  While a PMEM device can host a BTT the LABEL specification does
+not provide for a sector size to be specified for a PMEM namespace.
+
+This is due to the expectation that the primary usage model for PMEM is
+via DAX, and the BTT is incompatible with DAX.  However, for the cases
+where an application or filesystem still needs atomic sector update
+guarantees it can register a BTT on a PMEM device or partition.  See
+LIBNVDIMM/NDCTL: Block Translation Table "btt"
+
+
+Example NVDIMM Platform
+=======================
+
+For the remainder of this document the following diagram will be
+referenced for any example sysfs layouts::
+
+
+                               (a)               (b)           DIMM   BLK-REGION
+            +-------------------+--------+--------+--------+
+  +------+  |       pm0.0       | blk2.0 | pm1.0  | blk2.1 |    0      region2
+  | imc0 +--+- - - region0- - - +--------+        +--------+
+  +--+---+  |       pm0.0       | blk3.0 | pm1.0  | blk3.1 |    1      region3
+     |      +-------------------+--------v        v--------+
+  +--+---+                               |                 |
+  | cpu0 |                                     region1
+  +--+---+                               |                 |
+     |      +----------------------------^        ^--------+
+  +--+---+  |           blk4.0           | pm1.0  | blk4.0 |    2      region4
+  | imc1 +--+----------------------------|        +--------+
+  +------+  |           blk5.0           | pm1.0  | blk5.0 |    3      region5
+            +----------------------------+--------+--------+
+
+In this platform we have four DIMMs and two memory controllers in one
+socket.  Each unique interface (BLK or PMEM) to DPA space is identified
+by a region device with a dynamically assigned id (REGION0 - REGION5).
+
+    1. The first portion of DIMM0 and DIMM1 are interleaved as REGION0. A
+       single PMEM namespace is created in the REGION0-SPA-range that spans most
+       of DIMM0 and DIMM1 with a user-specified name of "pm0.0". Some of that
+       interleaved system-physical-address range is reclaimed as BLK-aperture
+       accessed space starting at DPA-offset (a) into each DIMM.  In that
+       reclaimed space we create two BLK-aperture "namespaces" from REGION2 and
+       REGION3 where "blk2.0" and "blk3.0" are just human readable names that
+       could be set to any user-desired name in the LABEL.
+
+    2. In the last portion of DIMM0 and DIMM1 we have an interleaved
+       system-physical-address range, REGION1, that spans those two DIMMs as
+       well as DIMM2 and DIMM3.  Some of REGION1 is allocated to a PMEM namespace
+       named "pm1.0", the rest is reclaimed in 4 BLK-aperture namespaces (for
+       each DIMM in the interleave set), "blk2.1", "blk3.1", "blk4.0", and
+       "blk5.0".
+
+    3. The portion of DIMM2 and DIMM3 that do not participate in the REGION1
+       interleaved system-physical-address range (i.e. the DPA address past
+       offset (b) are also included in the "blk4.0" and "blk5.0" namespaces.
+       Note, that this example shows that BLK-aperture namespaces don't need to
+       be contiguous in DPA-space.
+
+    This bus is provided by the kernel under the device
+    /sys/devices/platform/nfit_test.0 when CONFIG_NFIT_TEST is enabled and
+    the nfit_test.ko module is loaded.  This not only test LIBNVDIMM but the
+    acpi_nfit.ko driver as well.
+
+
+LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
+========================================================
+
+What follows is a description of the LIBNVDIMM sysfs layout and a
+corresponding object hierarchy diagram as viewed through the LIBNDCTL
+API.  The example sysfs paths and diagrams are relative to the Example
+NVDIMM Platform which is also the LIBNVDIMM bus used in the LIBNDCTL unit
+test.
+
+LIBNDCTL: Context
+-----------------
+
+Every API call in the LIBNDCTL library requires a context that holds the
+logging parameters and other library instance state.  The library is
+based on the libabc template:
+
+	https://git.kernel.org/cgit/linux/kernel/git/kay/libabc.git
+
+LIBNDCTL: instantiate a new library context example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+	struct ndctl_ctx *ctx;
+
+	if (ndctl_new(&ctx) == 0)
+		return ctx;
+	else
+		return NULL;
+
+LIBNVDIMM/LIBNDCTL: Bus
+-----------------------
+
+A bus has a 1:1 relationship with an NFIT.  The current expectation for
+ACPI based systems is that there is only ever one platform-global NFIT.
+That said, it is trivial to register multiple NFITs, the specification
+does not preclude it.  The infrastructure supports multiple busses and
+we use this capability to test multiple NFIT configurations in the unit
+test.
+
+LIBNVDIMM: control class device in /sys/class
+---------------------------------------------
+
+This character device accepts DSM messages to be passed to DIMM
+identified by its NFIT handle::
+
+	/sys/class/nd/ndctl0
+	|-- dev
+	|-- device -> ../../../ndbus0
+	|-- subsystem -> ../../../../../../../class/nd
+
+
+
+LIBNVDIMM: bus
+--------------
+
+::
+
+	struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
+	       struct nvdimm_bus_descriptor *nfit_desc);
+
+::
+
+	/sys/devices/platform/nfit_test.0/ndbus0
+	|-- commands
+	|-- nd
+	|-- nfit
+	|-- nmem0
+	|-- nmem1
+	|-- nmem2
+	|-- nmem3
+	|-- power
+	|-- provider
+	|-- region0
+	|-- region1
+	|-- region2
+	|-- region3
+	|-- region4
+	|-- region5
+	|-- uevent
+	`-- wait_probe
+
+LIBNDCTL: bus enumeration example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Find the bus handle that describes the bus from Example NVDIMM Platform::
+
+	static struct ndctl_bus *get_bus_by_provider(struct ndctl_ctx *ctx,
+			const char *provider)
+	{
+		struct ndctl_bus *bus;
+
+		ndctl_bus_foreach(ctx, bus)
+			if (strcmp(provider, ndctl_bus_get_provider(bus)) == 0)
+				return bus;
+
+		return NULL;
+	}
+
+	bus = get_bus_by_provider(ctx, "nfit_test.0");
+
+
+LIBNVDIMM/LIBNDCTL: DIMM (NMEM)
+-------------------------------
+
+The DIMM device provides a character device for sending commands to
+hardware, and it is a container for LABELs.  If the DIMM is defined by
+NFIT then an optional 'nfit' attribute sub-directory is available to add
+NFIT-specifics.
+
+Note that the kernel device name for "DIMMs" is "nmemX".  The NFIT
+describes these devices via "Memory Device to System Physical Address
+Range Mapping Structure", and there is no requirement that they actually
+be physical DIMMs, so we use a more generic name.
+
+LIBNVDIMM: DIMM (NMEM)
+^^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+	struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
+			const struct attribute_group **groups, unsigned long flags,
+			unsigned long *dsm_mask);
+
+::
+
+	/sys/devices/platform/nfit_test.0/ndbus0
+	|-- nmem0
+	|   |-- available_slots
+	|   |-- commands
+	|   |-- dev
+	|   |-- devtype
+	|   |-- driver -> ../../../../../bus/nd/drivers/nvdimm
+	|   |-- modalias
+	|   |-- nfit
+	|   |   |-- device
+	|   |   |-- format
+	|   |   |-- handle
+	|   |   |-- phys_id
+	|   |   |-- rev_id
+	|   |   |-- serial
+	|   |   `-- vendor
+	|   |-- state
+	|   |-- subsystem -> ../../../../../bus/nd
+	|   `-- uevent
+	|-- nmem1
+	[..]
+
+
+LIBNDCTL: DIMM enumeration example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Note, in this example we are assuming NFIT-defined DIMMs which are
+identified by an "nfit_handle" a 32-bit value where:
+
+   - Bit 3:0 DIMM number within the memory channel
+   - Bit 7:4 memory channel number
+   - Bit 11:8 memory controller ID
+   - Bit 15:12 socket ID (within scope of a Node controller if node
+     controller is present)
+   - Bit 27:16 Node Controller ID
+   - Bit 31:28 Reserved
+
+::
+
+	static struct ndctl_dimm *get_dimm_by_handle(struct ndctl_bus *bus,
+	       unsigned int handle)
+	{
+		struct ndctl_dimm *dimm;
+
+		ndctl_dimm_foreach(bus, dimm)
+			if (ndctl_dimm_get_handle(dimm) == handle)
+				return dimm;
+
+		return NULL;
+	}
+
+	#define DIMM_HANDLE(n, s, i, c, d) \
+		(((n & 0xfff) << 16) | ((s & 0xf) << 12) | ((i & 0xf) << 8) \
+		 | ((c & 0xf) << 4) | (d & 0xf))
+
+	dimm = get_dimm_by_handle(bus, DIMM_HANDLE(0, 0, 0, 0, 0));
+
+LIBNVDIMM/LIBNDCTL: Region
+--------------------------
+
+A generic REGION device is registered for each PMEM range or BLK-aperture
+set.  Per the example there are 6 regions: 2 PMEM and 4 BLK-aperture
+sets on the "nfit_test.0" bus.  The primary role of regions are to be a
+container of "mappings".  A mapping is a tuple of <DIMM,
+DPA-start-offset, length>.
+
+LIBNVDIMM provides a built-in driver for these REGION devices.  This driver
+is responsible for reconciling the aliased DPA mappings across all
+regions, parsing the LABEL, if present, and then emitting NAMESPACE
+devices with the resolved/exclusive DPA-boundaries for the nd_pmem or
+nd_blk device driver to consume.
+
+In addition to the generic attributes of "mapping"s, "interleave_ways"
+and "size" the REGION device also exports some convenience attributes.
+"nstype" indicates the integer type of namespace-device this region
+emits, "devtype" duplicates the DEVTYPE variable stored by udev at the
+'add' event, "modalias" duplicates the MODALIAS variable stored by udev
+at the 'add' event, and finally, the optional "spa_index" is provided in
+the case where the region is defined by a SPA.
+
+LIBNVDIMM: region::
+
+	struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
+			struct nd_region_desc *ndr_desc);
+	struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
+			struct nd_region_desc *ndr_desc);
+
+::
+
+	/sys/devices/platform/nfit_test.0/ndbus0
+	|-- region0
+	|   |-- available_size
+	|   |-- btt0
+	|   |-- btt_seed
+	|   |-- devtype
+	|   |-- driver -> ../../../../../bus/nd/drivers/nd_region
+	|   |-- init_namespaces
+	|   |-- mapping0
+	|   |-- mapping1
+	|   |-- mappings
+	|   |-- modalias
+	|   |-- namespace0.0
+	|   |-- namespace_seed
+	|   |-- numa_node
+	|   |-- nfit
+	|   |   `-- spa_index
+	|   |-- nstype
+	|   |-- set_cookie
+	|   |-- size
+	|   |-- subsystem -> ../../../../../bus/nd
+	|   `-- uevent
+	|-- region1
+	[..]
+
+LIBNDCTL: region enumeration example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Sample region retrieval routines based on NFIT-unique data like
+"spa_index" (interleave set id) for PMEM and "nfit_handle" (dimm id) for
+BLK::
+
+	static struct ndctl_region *get_pmem_region_by_spa_index(struct ndctl_bus *bus,
+			unsigned int spa_index)
+	{
+		struct ndctl_region *region;
+
+		ndctl_region_foreach(bus, region) {
+			if (ndctl_region_get_type(region) != ND_DEVICE_REGION_PMEM)
+				continue;
+			if (ndctl_region_get_spa_index(region) == spa_index)
+				return region;
+		}
+		return NULL;
+	}
+
+	static struct ndctl_region *get_blk_region_by_dimm_handle(struct ndctl_bus *bus,
+			unsigned int handle)
+	{
+		struct ndctl_region *region;
+
+		ndctl_region_foreach(bus, region) {
+			struct ndctl_mapping *map;
+
+			if (ndctl_region_get_type(region) != ND_DEVICE_REGION_BLOCK)
+				continue;
+			ndctl_mapping_foreach(region, map) {
+				struct ndctl_dimm *dimm = ndctl_mapping_get_dimm(map);
+
+				if (ndctl_dimm_get_handle(dimm) == handle)
+					return region;
+			}
+		}
+		return NULL;
+	}
+
+
+Why Not Encode the Region Type into the Region Name?
+----------------------------------------------------
+
+At first glance it seems since NFIT defines just PMEM and BLK interface
+types that we should simply name REGION devices with something derived
+from those type names.  However, the ND subsystem explicitly keeps the
+REGION name generic and expects userspace to always consider the
+region-attributes for four reasons:
+
+    1. There are already more than two REGION and "namespace" types.  For
+       PMEM there are two subtypes.  As mentioned previously we have PMEM where
+       the constituent DIMM devices are known and anonymous PMEM.  For BLK
+       regions the NFIT specification already anticipates vendor specific
+       implementations.  The exact distinction of what a region contains is in
+       the region-attributes not the region-name or the region-devtype.
+
+    2. A region with zero child-namespaces is a possible configuration.  For
+       example, the NFIT allows for a DCR to be published without a
+       corresponding BLK-aperture.  This equates to a DIMM that can only accept
+       control/configuration messages, but no i/o through a descendant block
+       device.  Again, this "type" is advertised in the attributes ('mappings'
+       == 0) and the name does not tell you much.
+
+    3. What if a third major interface type arises in the future?  Outside
+       of vendor specific implementations, it's not difficult to envision a
+       third class of interface type beyond BLK and PMEM.  With a generic name
+       for the REGION level of the device-hierarchy old userspace
+       implementations can still make sense of new kernel advertised
+       region-types.  Userspace can always rely on the generic region
+       attributes like "mappings", "size", etc and the expected child devices
+       named "namespace".  This generic format of the device-model hierarchy
+       allows the LIBNVDIMM and LIBNDCTL implementations to be more uniform and
+       future-proof.
+
+    4. There are more robust mechanisms for determining the major type of a
+       region than a device name.  See the next section, How Do I Determine the
+       Major Type of a Region?
+
+How Do I Determine the Major Type of a Region?
+----------------------------------------------
+
+Outside of the blanket recommendation of "use libndctl", or simply
+looking at the kernel header (/usr/include/linux/ndctl.h) to decode the
+"nstype" integer attribute, here are some other options.
+
+1. module alias lookup
+^^^^^^^^^^^^^^^^^^^^^^
+
+    The whole point of region/namespace device type differentiation is to
+    decide which block-device driver will attach to a given LIBNVDIMM namespace.
+    One can simply use the modalias to lookup the resulting module.  It's
+    important to note that this method is robust in the presence of a
+    vendor-specific driver down the road.  If a vendor-specific
+    implementation wants to supplant the standard nd_blk driver it can with
+    minimal impact to the rest of LIBNVDIMM.
+
+    In fact, a vendor may also want to have a vendor-specific region-driver
+    (outside of nd_region).  For example, if a vendor defined its own LABEL
+    format it would need its own region driver to parse that LABEL and emit
+    the resulting namespaces.  The output from module resolution is more
+    accurate than a region-name or region-devtype.
+
+2. udev
+^^^^^^^
+
+    The kernel "devtype" is registered in the udev database::
+
+	# udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region0
+	P: /devices/platform/nfit_test.0/ndbus0/region0
+	E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region0
+	E: DEVTYPE=nd_pmem
+	E: MODALIAS=nd:t2
+	E: SUBSYSTEM=nd
+
+	# udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region4
+	P: /devices/platform/nfit_test.0/ndbus0/region4
+	E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region4
+	E: DEVTYPE=nd_blk
+	E: MODALIAS=nd:t3
+	E: SUBSYSTEM=nd
+
+    ...and is available as a region attribute, but keep in mind that the
+    "devtype" does not indicate sub-type variations and scripts should
+    really be understanding the other attributes.
+
+3. type specific attributes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    As it currently stands a BLK-aperture region will never have a
+    "nfit/spa_index" attribute, but neither will a non-NFIT PMEM region.  A
+    BLK region with a "mappings" value of 0 is, as mentioned above, a DIMM
+    that does not allow I/O.  A PMEM region with a "mappings" value of zero
+    is a simple system-physical-address range.
+
+
+LIBNVDIMM/LIBNDCTL: Namespace
+-----------------------------
+
+A REGION, after resolving DPA aliasing and LABEL specified boundaries,
+surfaces one or more "namespace" devices.  The arrival of a "namespace"
+device currently triggers either the nd_blk or nd_pmem driver to load
+and register a disk/block device.
+
+LIBNVDIMM: namespace
+^^^^^^^^^^^^^^^^^^^^
+
+Here is a sample layout from the three major types of NAMESPACE where
+namespace0.0 represents DIMM-info-backed PMEM (note that it has a 'uuid'
+attribute), namespace2.0 represents a BLK namespace (note it has a
+'sector_size' attribute) that, and namespace6.0 represents an anonymous
+PMEM namespace (note that has no 'uuid' attribute due to not support a
+LABEL)::
+
+	/sys/devices/platform/nfit_test.0/ndbus0/region0/namespace0.0
+	|-- alt_name
+	|-- devtype
+	|-- dpa_extents
+	|-- force_raw
+	|-- modalias
+	|-- numa_node
+	|-- resource
+	|-- size
+	|-- subsystem -> ../../../../../../bus/nd
+	|-- type
+	|-- uevent
+	`-- uuid
+	/sys/devices/platform/nfit_test.0/ndbus0/region2/namespace2.0
+	|-- alt_name
+	|-- devtype
+	|-- dpa_extents
+	|-- force_raw
+	|-- modalias
+	|-- numa_node
+	|-- sector_size
+	|-- size
+	|-- subsystem -> ../../../../../../bus/nd
+	|-- type
+	|-- uevent
+	`-- uuid
+	/sys/devices/platform/nfit_test.1/ndbus1/region6/namespace6.0
+	|-- block
+	|   `-- pmem0
+	|-- devtype
+	|-- driver -> ../../../../../../bus/nd/drivers/pmem
+	|-- force_raw
+	|-- modalias
+	|-- numa_node
+	|-- resource
+	|-- size
+	|-- subsystem -> ../../../../../../bus/nd
+	|-- type
+	`-- uevent
+
+LIBNDCTL: namespace enumeration example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Namespaces are indexed relative to their parent region, example below.
+These indexes are mostly static from boot to boot, but subsystem makes
+no guarantees in this regard.  For a static namespace identifier use its
+'uuid' attribute.
+
+::
+
+  static struct ndctl_namespace
+  *get_namespace_by_id(struct ndctl_region *region, unsigned int id)
+  {
+          struct ndctl_namespace *ndns;
+
+          ndctl_namespace_foreach(region, ndns)
+                  if (ndctl_namespace_get_id(ndns) == id)
+                          return ndns;
+
+          return NULL;
+  }
+
+LIBNDCTL: namespace creation example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Idle namespaces are automatically created by the kernel if a given
+region has enough available capacity to create a new namespace.
+Namespace instantiation involves finding an idle namespace and
+configuring it.  For the most part the setting of namespace attributes
+can occur in any order, the only constraint is that 'uuid' must be set
+before 'size'.  This enables the kernel to track DPA allocations
+internally with a static identifier::
+
+  static int configure_namespace(struct ndctl_region *region,
+                  struct ndctl_namespace *ndns,
+                  struct namespace_parameters *parameters)
+  {
+          char devname[50];
+
+          snprintf(devname, sizeof(devname), "namespace%d.%d",
+                          ndctl_region_get_id(region), paramaters->id);
+
+          ndctl_namespace_set_alt_name(ndns, devname);
+          /* 'uuid' must be set prior to setting size! */
+          ndctl_namespace_set_uuid(ndns, paramaters->uuid);
+          ndctl_namespace_set_size(ndns, paramaters->size);
+          /* unlike pmem namespaces, blk namespaces have a sector size */
+          if (parameters->lbasize)
+                  ndctl_namespace_set_sector_size(ndns, parameters->lbasize);
+          ndctl_namespace_enable(ndns);
+  }
+
+
+Why the Term "namespace"?
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    1. Why not "volume" for instance?  "volume" ran the risk of confusing
+       ND (libnvdimm subsystem) to a volume manager like device-mapper.
+
+    2. The term originated to describe the sub-devices that can be created
+       within a NVME controller (see the nvme specification:
+       http://www.nvmexpress.org/specifications/), and NFIT namespaces are
+       meant to parallel the capabilities and configurability of
+       NVME-namespaces.
+
+
+LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
+-------------------------------------------------
+
+A BTT (design document: http://pmem.io/2014/09/23/btt.html) is a stacked
+block device driver that fronts either the whole block device or a
+partition of a block device emitted by either a PMEM or BLK NAMESPACE.
+
+LIBNVDIMM: btt layout
+^^^^^^^^^^^^^^^^^^^^^
+
+Every region will start out with at least one BTT device which is the
+seed device.  To activate it set the "namespace", "uuid", and
+"sector_size" attributes and then bind the device to the nd_pmem or
+nd_blk driver depending on the region type::
+
+	/sys/devices/platform/nfit_test.1/ndbus0/region0/btt0/
+	|-- namespace
+	|-- delete
+	|-- devtype
+	|-- modalias
+	|-- numa_node
+	|-- sector_size
+	|-- subsystem -> ../../../../../bus/nd
+	|-- uevent
+	`-- uuid
+
+LIBNDCTL: btt creation example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Similar to namespaces an idle BTT device is automatically created per
+region.  Each time this "seed" btt device is configured and enabled a new
+seed is created.  Creating a BTT configuration involves two steps of
+finding and idle BTT and assigning it to consume a PMEM or BLK namespace::
+
+	static struct ndctl_btt *get_idle_btt(struct ndctl_region *region)
+	{
+		struct ndctl_btt *btt;
+
+		ndctl_btt_foreach(region, btt)
+			if (!ndctl_btt_is_enabled(btt)
+					&& !ndctl_btt_is_configured(btt))
+				return btt;
+
+		return NULL;
+	}
+
+	static int configure_btt(struct ndctl_region *region,
+			struct btt_parameters *parameters)
+	{
+		btt = get_idle_btt(region);
+
+		ndctl_btt_set_uuid(btt, parameters->uuid);
+		ndctl_btt_set_sector_size(btt, parameters->sector_size);
+		ndctl_btt_set_namespace(btt, parameters->ndns);
+		/* turn off raw mode device */
+		ndctl_namespace_disable(parameters->ndns);
+		/* turn on btt access */
+		ndctl_btt_enable(btt);
+	}
+
+Once instantiated a new inactive btt seed device will appear underneath
+the region.
+
+Once a "namespace" is removed from a BTT that instance of the BTT device
+will be deleted or otherwise reset to default values.  This deletion is
+only at the device model level.  In order to destroy a BTT the "info
+block" needs to be destroyed.  Note, that to destroy a BTT the media
+needs to be written in raw mode.  By default, the kernel will autodetect
+the presence of a BTT and disable raw mode.  This autodetect behavior
+can be suppressed by enabling raw mode for the namespace via the
+ndctl_namespace_set_raw_mode() API.
+
+
+Summary LIBNDCTL Diagram
+------------------------
+
+For the given example above, here is the view of the objects as seen by the
+LIBNDCTL API::
+
+              +---+
+              |CTX|    +---------+   +--------------+  +---------------+
+              +-+-+  +-> REGION0 +---> NAMESPACE0.0 +--> PMEM8 "pm0.0" |
+                |    | +---------+   +--------------+  +---------------+
+  +-------+     |    | +---------+   +--------------+  +---------------+
+  | DIMM0 <-+   |    +-> REGION1 +---> NAMESPACE1.0 +--> PMEM6 "pm1.0" |
+  +-------+ |   |    | +---------+   +--------------+  +---------------+
+  | DIMM1 <-+ +-v--+ | +---------+   +--------------+  +---------------+
+  +-------+ +-+BUS0+---> REGION2 +-+-> NAMESPACE2.0 +--> ND6  "blk2.0" |
+  | DIMM2 <-+ +----+ | +---------+ | +--------------+  +----------------------+
+  +-------+ |        |             +-> NAMESPACE2.1 +--> ND5  "blk2.1" | BTT2 |
+  | DIMM3 <-+        |               +--------------+  +----------------------+
+  +-------+          | +---------+   +--------------+  +---------------+
+                     +-> REGION3 +-+-> NAMESPACE3.0 +--> ND4  "blk3.0" |
+                     | +---------+ | +--------------+  +----------------------+
+                     |             +-> NAMESPACE3.1 +--> ND3  "blk3.1" | BTT1 |
+                     |               +--------------+  +----------------------+
+                     | +---------+   +--------------+  +---------------+
+                     +-> REGION4 +---> NAMESPACE4.0 +--> ND2  "blk4.0" |
+                     | +---------+   +--------------+  +---------------+
+                     | +---------+   +--------------+  +----------------------+
+                     +-> REGION5 +---> NAMESPACE5.0 +--> ND1  "blk5.0" | BTT0 |
+                       +---------+   +--------------+  +---------------+------+
diff --git a/Documentation/driver-api/nvdimm/security.rst b/Documentation/driver-api/nvdimm/security.rst
new file mode 100644
index 000000000000..ad9dea099b34
--- /dev/null
+++ b/Documentation/driver-api/nvdimm/security.rst
@@ -0,0 +1,143 @@
+===============
+NVDIMM Security
+===============
+
+1. Introduction
+---------------
+
+With the introduction of Intel Device Specific Methods (DSM) v1.8
+specification [1], security DSMs are introduced. The spec added the following
+security DSMs: "get security state", "set passphrase", "disable passphrase",
+"unlock unit", "freeze lock", "secure erase", and "overwrite". A security_ops
+data structure has been added to struct dimm in order to support the security
+operations and generic APIs are exposed to allow vendor neutral operations.
+
+2. Sysfs Interface
+------------------
+The "security" sysfs attribute is provided in the nvdimm sysfs directory. For
+example:
+/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/nmem0/security
+
+The "show" attribute of that attribute will display the security state for
+that DIMM. The following states are available: disabled, unlocked, locked,
+frozen, and overwrite. If security is not supported, the sysfs attribute
+will not be visible.
+
+The "store" attribute takes several commands when it is being written to
+in order to support some of the security functionalities:
+update <old_keyid> <new_keyid> - enable or update passphrase.
+disable <keyid> - disable enabled security and remove key.
+freeze - freeze changing of security states.
+erase <keyid> - delete existing user encryption key.
+overwrite <keyid> - wipe the entire nvdimm.
+master_update <keyid> <new_keyid> - enable or update master passphrase.
+master_erase <keyid> - delete existing user encryption key.
+
+3. Key Management
+-----------------
+
+The key is associated to the payload by the DIMM id. For example:
+# cat /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/nmem0/nfit/id
+8089-a2-1740-00000133
+The DIMM id would be provided along with the key payload (passphrase) to
+the kernel.
+
+The security keys are managed on the basis of a single key per DIMM. The
+key "passphrase" is expected to be 32bytes long. This is similar to the ATA
+security specification [2]. A key is initially acquired via the request_key()
+kernel API call during nvdimm unlock. It is up to the user to make sure that
+all the keys are in the kernel user keyring for unlock.
+
+A nvdimm encrypted-key of format enc32 has the description format of:
+nvdimm:<bus-provider-specific-unique-id>
+
+See file ``Documentation/security/keys/trusted-encrypted.rst`` for creating
+encrypted-keys of enc32 format. TPM usage with a master trusted key is
+preferred for sealing the encrypted-keys.
+
+4. Unlocking
+------------
+When the DIMMs are being enumerated by the kernel, the kernel will attempt to
+retrieve the key from the kernel user keyring. This is the only time
+a locked DIMM can be unlocked. Once unlocked, the DIMM will remain unlocked
+until reboot. Typically an entity (i.e. shell script) will inject all the
+relevant encrypted-keys into the kernel user keyring during the initramfs phase.
+This provides the unlock function access to all the related keys that contain
+the passphrase for the respective nvdimms.  It is also recommended that the
+keys are injected before libnvdimm is loaded by modprobe.
+
+5. Update
+---------
+When doing an update, it is expected that the existing key is removed from
+the kernel user keyring and reinjected as different (old) key. It's irrelevant
+what the key description is for the old key since we are only interested in the
+keyid when doing the update operation. It is also expected that the new key
+is injected with the description format described from earlier in this
+document.  The update command written to the sysfs attribute will be with
+the format:
+update <old keyid> <new keyid>
+
+If there is no old keyid due to a security enabling, then a 0 should be
+passed in.
+
+6. Freeze
+---------
+The freeze operation does not require any keys. The security config can be
+frozen by a user with root privelege.
+
+7. Disable
+----------
+The security disable command format is:
+disable <keyid>
+
+An key with the current passphrase payload that is tied to the nvdimm should be
+in the kernel user keyring.
+
+8. Secure Erase
+---------------
+The command format for doing a secure erase is:
+erase <keyid>
+
+An key with the current passphrase payload that is tied to the nvdimm should be
+in the kernel user keyring.
+
+9. Overwrite
+------------
+The command format for doing an overwrite is:
+overwrite <keyid>
+
+Overwrite can be done without a key if security is not enabled. A key serial
+of 0 can be passed in to indicate no key.
+
+The sysfs attribute "security" can be polled to wait on overwrite completion.
+Overwrite can last tens of minutes or more depending on nvdimm size.
+
+An encrypted-key with the current user passphrase that is tied to the nvdimm
+should be injected and its keyid should be passed in via sysfs.
+
+10. Master Update
+-----------------
+The command format for doing a master update is:
+update <old keyid> <new keyid>
+
+The operating mechanism for master update is identical to update except the
+master passphrase key is passed to the kernel. The master passphrase key
+is just another encrypted-key.
+
+This command is only available when security is disabled.
+
+11. Master Erase
+----------------
+The command format for doing a master erase is:
+master_erase <current keyid>
+
+This command has the same operating mechanism as erase except the master
+passphrase key is passed to the kernel. The master passphrase key is just
+another encrypted-key.
+
+This command is only available when the master security is enabled, indicated
+by the extended security status.
+
+[1]: http://pmem.io/documents/NVDIMM_DSM_Interface-V1.8.pdf
+
+[2]: http://www.t13.org/documents/UploadedDocuments/docs2006/e05179r4-ACS-SecurityClarifications.pdf
diff --git a/Documentation/driver-api/nvmem.rst b/Documentation/driver-api/nvmem.rst
new file mode 100644
index 000000000000..d9d958d5c824
--- /dev/null
+++ b/Documentation/driver-api/nvmem.rst
@@ -0,0 +1,189 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+NVMEM Subsystem
+===============
+
+ Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
+
+This document explains the NVMEM Framework along with the APIs provided,
+and how to use it.
+
+1. Introduction
+===============
+*NVMEM* is the abbreviation for Non Volatile Memory layer. It is used to
+retrieve configuration of SOC or Device specific data from non volatile
+memories like eeprom, efuses and so on.
+
+Before this framework existed, NVMEM drivers like eeprom were stored in
+drivers/misc, where they all had to duplicate pretty much the same code to
+register a sysfs file, allow in-kernel users to access the content of the
+devices they were driving, etc.
+
+This was also a problem as far as other in-kernel users were involved, since
+the solutions used were pretty much different from one driver to another, there
+was a rather big abstraction leak.
+
+This framework aims at solve these problems. It also introduces DT
+representation for consumer devices to go get the data they require (MAC
+Addresses, SoC/Revision ID, part numbers, and so on) from the NVMEMs. This
+framework is based on regmap, so that most of the abstraction available in
+regmap can be reused, across multiple types of buses.
+
+NVMEM Providers
++++++++++++++++
+
+NVMEM provider refers to an entity that implements methods to initialize, read
+and write the non-volatile memory.
+
+2. Registering/Unregistering the NVMEM provider
+===============================================
+
+A NVMEM provider can register with NVMEM core by supplying relevant
+nvmem configuration to nvmem_register(), on success core would return a valid
+nvmem_device pointer.
+
+nvmem_unregister(nvmem) is used to unregister a previously registered provider.
+
+For example, a simple qfprom case::
+
+  static struct nvmem_config econfig = {
+	.name = "qfprom",
+	.owner = THIS_MODULE,
+  };
+
+  static int qfprom_probe(struct platform_device *pdev)
+  {
+	...
+	econfig.dev = &pdev->dev;
+	nvmem = nvmem_register(&econfig);
+	...
+  }
+
+It is mandatory that the NVMEM provider has a regmap associated with its
+struct device. Failure to do would return error code from nvmem_register().
+
+Users of board files can define and register nvmem cells using the
+nvmem_cell_table struct::
+
+  static struct nvmem_cell_info foo_nvmem_cells[] = {
+	{
+		.name		= "macaddr",
+		.offset		= 0x7f00,
+		.bytes		= ETH_ALEN,
+	}
+  };
+
+  static struct nvmem_cell_table foo_nvmem_cell_table = {
+	.nvmem_name		= "i2c-eeprom",
+	.cells			= foo_nvmem_cells,
+	.ncells			= ARRAY_SIZE(foo_nvmem_cells),
+  };
+
+  nvmem_add_cell_table(&foo_nvmem_cell_table);
+
+Additionally it is possible to create nvmem cell lookup entries and register
+them with the nvmem framework from machine code as shown in the example below::
+
+  static struct nvmem_cell_lookup foo_nvmem_lookup = {
+	.nvmem_name		= "i2c-eeprom",
+	.cell_name		= "macaddr",
+	.dev_id			= "foo_mac.0",
+	.con_id			= "mac-address",
+  };
+
+  nvmem_add_cell_lookups(&foo_nvmem_lookup, 1);
+
+NVMEM Consumers
++++++++++++++++
+
+NVMEM consumers are the entities which make use of the NVMEM provider to
+read from and to NVMEM.
+
+3. NVMEM cell based consumer APIs
+=================================
+
+NVMEM cells are the data entries/fields in the NVMEM.
+The NVMEM framework provides 3 APIs to read/write NVMEM cells::
+
+  struct nvmem_cell *nvmem_cell_get(struct device *dev, const char *name);
+  struct nvmem_cell *devm_nvmem_cell_get(struct device *dev, const char *name);
+
+  void nvmem_cell_put(struct nvmem_cell *cell);
+  void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell);
+
+  void *nvmem_cell_read(struct nvmem_cell *cell, ssize_t *len);
+  int nvmem_cell_write(struct nvmem_cell *cell, void *buf, ssize_t len);
+
+`*nvmem_cell_get()` apis will get a reference to nvmem cell for a given id,
+and nvmem_cell_read/write() can then read or write to the cell.
+Once the usage of the cell is finished the consumer should call
+`*nvmem_cell_put()` to free all the allocation memory for the cell.
+
+4. Direct NVMEM device based consumer APIs
+==========================================
+
+In some instances it is necessary to directly read/write the NVMEM.
+To facilitate such consumers NVMEM framework provides below apis::
+
+  struct nvmem_device *nvmem_device_get(struct device *dev, const char *name);
+  struct nvmem_device *devm_nvmem_device_get(struct device *dev,
+					   const char *name);
+  void nvmem_device_put(struct nvmem_device *nvmem);
+  int nvmem_device_read(struct nvmem_device *nvmem, unsigned int offset,
+		      size_t bytes, void *buf);
+  int nvmem_device_write(struct nvmem_device *nvmem, unsigned int offset,
+		       size_t bytes, void *buf);
+  int nvmem_device_cell_read(struct nvmem_device *nvmem,
+			   struct nvmem_cell_info *info, void *buf);
+  int nvmem_device_cell_write(struct nvmem_device *nvmem,
+			    struct nvmem_cell_info *info, void *buf);
+
+Before the consumers can read/write NVMEM directly, it should get hold
+of nvmem_controller from one of the `*nvmem_device_get()` api.
+
+The difference between these apis and cell based apis is that these apis always
+take nvmem_device as parameter.
+
+5. Releasing a reference to the NVMEM
+=====================================
+
+When a consumer no longer needs the NVMEM, it has to release the reference
+to the NVMEM it has obtained using the APIs mentioned in the above section.
+The NVMEM framework provides 2 APIs to release a reference to the NVMEM::
+
+  void nvmem_cell_put(struct nvmem_cell *cell);
+  void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell);
+  void nvmem_device_put(struct nvmem_device *nvmem);
+  void devm_nvmem_device_put(struct device *dev, struct nvmem_device *nvmem);
+
+Both these APIs are used to release a reference to the NVMEM and
+devm_nvmem_cell_put and devm_nvmem_device_put destroys the devres associated
+with this NVMEM.
+
+Userspace
++++++++++
+
+6. Userspace binary interface
+==============================
+
+Userspace can read/write the raw NVMEM file located at::
+
+	/sys/bus/nvmem/devices/*/nvmem
+
+ex::
+
+  hexdump /sys/bus/nvmem/devices/qfprom0/nvmem
+
+  0000000 0000 0000 0000 0000 0000 0000 0000 0000
+  *
+  00000a0 db10 2240 0000 e000 0c00 0c00 0000 0c00
+  0000000 0000 0000 0000 0000 0000 0000 0000 0000
+  ...
+  *
+  0001000
+
+7. DeviceTree Binding
+=====================
+
+See Documentation/devicetree/bindings/nvmem/nvmem.txt
diff --git a/Documentation/parport-lowlevel.txt b/Documentation/driver-api/parport-lowlevel.rst
index 0633d70ffda7..0633d70ffda7 100644
--- a/Documentation/parport-lowlevel.txt
+++ b/Documentation/driver-api/parport-lowlevel.rst
diff --git a/Documentation/driver-api/phy/index.rst b/Documentation/driver-api/phy/index.rst
new file mode 100644
index 000000000000..69ba1216de72
--- /dev/null
+++ b/Documentation/driver-api/phy/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+Generic PHY Framework
+=====================
+
+.. toctree::
+
+   phy
+   samsung-usb2
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
+
diff --git a/Documentation/driver-api/phy/phy.rst b/Documentation/driver-api/phy/phy.rst
new file mode 100644
index 000000000000..8fc1ce0bb905
--- /dev/null
+++ b/Documentation/driver-api/phy/phy.rst
@@ -0,0 +1,197 @@
+=============
+PHY subsystem
+=============
+
+:Author: Kishon Vijay Abraham I <kishon@ti.com>
+
+This document explains the Generic PHY Framework along with the APIs provided,
+and how-to-use.
+
+Introduction
+============
+
+*PHY* is the abbreviation for physical layer. It is used to connect a device
+to the physical medium e.g., the USB controller has a PHY to provide functions
+such as serialization, de-serialization, encoding, decoding and is responsible
+for obtaining the required data transmission rate. Note that some USB
+controllers have PHY functionality embedded into it and others use an external
+PHY. Other peripherals that use PHY include Wireless LAN, Ethernet,
+SATA etc.
+
+The intention of creating this framework is to bring the PHY drivers spread
+all over the Linux kernel to drivers/phy to increase code re-use and for
+better code maintainability.
+
+This framework will be of use only to devices that use external PHY (PHY
+functionality is not embedded within the controller).
+
+Registering/Unregistering the PHY provider
+==========================================
+
+PHY provider refers to an entity that implements one or more PHY instances.
+For the simple case where the PHY provider implements only a single instance of
+the PHY, the framework provides its own implementation of of_xlate in
+of_phy_simple_xlate. If the PHY provider implements multiple instances, it
+should provide its own implementation of of_xlate. of_xlate is used only for
+dt boot case.
+
+::
+
+	#define of_phy_provider_register(dev, xlate)    \
+		__of_phy_provider_register((dev), NULL, THIS_MODULE, (xlate))
+
+	#define devm_of_phy_provider_register(dev, xlate)       \
+		__devm_of_phy_provider_register((dev), NULL, THIS_MODULE,
+						(xlate))
+
+of_phy_provider_register and devm_of_phy_provider_register macros can be used to
+register the phy_provider and it takes device and of_xlate as
+arguments. For the dt boot case, all PHY providers should use one of the above
+2 macros to register the PHY provider.
+
+Often the device tree nodes associated with a PHY provider will contain a set
+of children that each represent a single PHY. Some bindings may nest the child
+nodes within extra levels for context and extensibility, in which case the low
+level of_phy_provider_register_full() and devm_of_phy_provider_register_full()
+macros can be used to override the node containing the children.
+
+::
+
+	#define of_phy_provider_register_full(dev, children, xlate) \
+		__of_phy_provider_register(dev, children, THIS_MODULE, xlate)
+
+	#define devm_of_phy_provider_register_full(dev, children, xlate) \
+		__devm_of_phy_provider_register_full(dev, children,
+						     THIS_MODULE, xlate)
+
+	void devm_of_phy_provider_unregister(struct device *dev,
+		struct phy_provider *phy_provider);
+	void of_phy_provider_unregister(struct phy_provider *phy_provider);
+
+devm_of_phy_provider_unregister and of_phy_provider_unregister can be used to
+unregister the PHY.
+
+Creating the PHY
+================
+
+The PHY driver should create the PHY in order for other peripheral controllers
+to make use of it. The PHY framework provides 2 APIs to create the PHY.
+
+::
+
+	struct phy *phy_create(struct device *dev, struct device_node *node,
+			       const struct phy_ops *ops);
+	struct phy *devm_phy_create(struct device *dev,
+				    struct device_node *node,
+				    const struct phy_ops *ops);
+
+The PHY drivers can use one of the above 2 APIs to create the PHY by passing
+the device pointer and phy ops.
+phy_ops is a set of function pointers for performing PHY operations such as
+init, exit, power_on and power_off.
+
+Inorder to dereference the private data (in phy_ops), the phy provider driver
+can use phy_set_drvdata() after creating the PHY and use phy_get_drvdata() in
+phy_ops to get back the private data.
+
+4. Getting a reference to the PHY
+
+Before the controller can make use of the PHY, it has to get a reference to
+it. This framework provides the following APIs to get a reference to the PHY.
+
+::
+
+	struct phy *phy_get(struct device *dev, const char *string);
+	struct phy *phy_optional_get(struct device *dev, const char *string);
+	struct phy *devm_phy_get(struct device *dev, const char *string);
+	struct phy *devm_phy_optional_get(struct device *dev,
+					  const char *string);
+	struct phy *devm_of_phy_get_by_index(struct device *dev,
+					     struct device_node *np,
+					     int index);
+
+phy_get, phy_optional_get, devm_phy_get and devm_phy_optional_get can
+be used to get the PHY. In the case of dt boot, the string arguments
+should contain the phy name as given in the dt data and in the case of
+non-dt boot, it should contain the label of the PHY.  The two
+devm_phy_get associates the device with the PHY using devres on
+successful PHY get. On driver detach, release function is invoked on
+the devres data and devres data is freed. phy_optional_get and
+devm_phy_optional_get should be used when the phy is optional. These
+two functions will never return -ENODEV, but instead returns NULL when
+the phy cannot be found.Some generic drivers, such as ehci, may use multiple
+phys and for such drivers referencing phy(s) by name(s) does not make sense. In
+this case, devm_of_phy_get_by_index can be used to get a phy reference based on
+the index.
+
+It should be noted that NULL is a valid phy reference. All phy
+consumer calls on the NULL phy become NOPs. That is the release calls,
+the phy_init() and phy_exit() calls, and phy_power_on() and
+phy_power_off() calls are all NOP when applied to a NULL phy. The NULL
+phy is useful in devices for handling optional phy devices.
+
+Releasing a reference to the PHY
+================================
+
+When the controller no longer needs the PHY, it has to release the reference
+to the PHY it has obtained using the APIs mentioned in the above section. The
+PHY framework provides 2 APIs to release a reference to the PHY.
+
+::
+
+	void phy_put(struct phy *phy);
+	void devm_phy_put(struct device *dev, struct phy *phy);
+
+Both these APIs are used to release a reference to the PHY and devm_phy_put
+destroys the devres associated with this PHY.
+
+Destroying the PHY
+==================
+
+When the driver that created the PHY is unloaded, it should destroy the PHY it
+created using one of the following 2 APIs::
+
+	void phy_destroy(struct phy *phy);
+	void devm_phy_destroy(struct device *dev, struct phy *phy);
+
+Both these APIs destroy the PHY and devm_phy_destroy destroys the devres
+associated with this PHY.
+
+PM Runtime
+==========
+
+This subsystem is pm runtime enabled. So while creating the PHY,
+pm_runtime_enable of the phy device created by this subsystem is called and
+while destroying the PHY, pm_runtime_disable is called. Note that the phy
+device created by this subsystem will be a child of the device that calls
+phy_create (PHY provider device).
+
+So pm_runtime_get_sync of the phy_device created by this subsystem will invoke
+pm_runtime_get_sync of PHY provider device because of parent-child relationship.
+It should also be noted that phy_power_on and phy_power_off performs
+phy_pm_runtime_get_sync and phy_pm_runtime_put respectively.
+There are exported APIs like phy_pm_runtime_get, phy_pm_runtime_get_sync,
+phy_pm_runtime_put, phy_pm_runtime_put_sync, phy_pm_runtime_allow and
+phy_pm_runtime_forbid for performing PM operations.
+
+PHY Mappings
+============
+
+In order to get reference to a PHY without help from DeviceTree, the framework
+offers lookups which can be compared to clkdev that allow clk structures to be
+bound to devices. A lookup can be made during runtime when a handle to the
+struct phy already exists.
+
+The framework offers the following API for registering and unregistering the
+lookups::
+
+	int phy_create_lookup(struct phy *phy, const char *con_id,
+			      const char *dev_id);
+	void phy_remove_lookup(struct phy *phy, const char *con_id,
+			       const char *dev_id);
+
+DeviceTree Binding
+==================
+
+The documentation for PHY dt binding can be found @
+Documentation/devicetree/bindings/phy/phy-bindings.txt
diff --git a/Documentation/driver-api/phy/samsung-usb2.rst b/Documentation/driver-api/phy/samsung-usb2.rst
new file mode 100644
index 000000000000..c48c8b9797b9
--- /dev/null
+++ b/Documentation/driver-api/phy/samsung-usb2.rst
@@ -0,0 +1,137 @@
+====================================
+Samsung USB 2.0 PHY adaptation layer
+====================================
+
+1. Description
+--------------
+
+The architecture of the USB 2.0 PHY module in Samsung SoCs is similar
+among many SoCs. In spite of the similarities it proved difficult to
+create a one driver that would fit all these PHY controllers. Often
+the differences were minor and were found in particular bits of the
+registers of the PHY. In some rare cases the order of register writes or
+the PHY powering up process had to be altered. This adaptation layer is
+a compromise between having separate drivers and having a single driver
+with added support for many special cases.
+
+2. Files description
+--------------------
+
+- phy-samsung-usb2.c
+   This is the main file of the adaptation layer. This file contains
+   the probe function and provides two callbacks to the Generic PHY
+   Framework. This two callbacks are used to power on and power off the
+   phy. They carry out the common work that has to be done on all version
+   of the PHY module. Depending on which SoC was chosen they execute SoC
+   specific callbacks. The specific SoC version is selected by choosing
+   the appropriate compatible string. In addition, this file contains
+   struct of_device_id definitions for particular SoCs.
+
+- phy-samsung-usb2.h
+   This is the include file. It declares the structures used by this
+   driver. In addition it should contain extern declarations for
+   structures that describe particular SoCs.
+
+3. Supporting SoCs
+------------------
+
+To support a new SoC a new file should be added to the drivers/phy
+directory. Each SoC's configuration is stored in an instance of the
+struct samsung_usb2_phy_config::
+
+  struct samsung_usb2_phy_config {
+	const struct samsung_usb2_common_phy *phys;
+	int (*rate_to_clk)(unsigned long, u32 *);
+	unsigned int num_phys;
+	bool has_mode_switch;
+  };
+
+The num_phys is the number of phys handled by the driver. `*phys` is an
+array that contains the configuration for each phy. The has_mode_switch
+property is a boolean flag that determines whether the SoC has USB host
+and device on a single pair of pins. If so, a special register has to
+be modified to change the internal routing of these pins between a USB
+device or host module.
+
+For example the configuration for Exynos 4210 is following::
+
+  const struct samsung_usb2_phy_config exynos4210_usb2_phy_config = {
+	.has_mode_switch        = 0,
+	.num_phys		= EXYNOS4210_NUM_PHYS,
+	.phys			= exynos4210_phys,
+	.rate_to_clk		= exynos4210_rate_to_clk,
+  }
+
+- `int (*rate_to_clk)(unsigned long, u32 *)`
+
+	The rate_to_clk callback is to convert the rate of the clock
+	used as the reference clock for the PHY module to the value
+	that should be written in the hardware register.
+
+The exynos4210_phys configuration array is as follows::
+
+  static const struct samsung_usb2_common_phy exynos4210_phys[] = {
+	{
+		.label		= "device",
+		.id		= EXYNOS4210_DEVICE,
+		.power_on	= exynos4210_power_on,
+		.power_off	= exynos4210_power_off,
+	},
+	{
+		.label		= "host",
+		.id		= EXYNOS4210_HOST,
+		.power_on	= exynos4210_power_on,
+		.power_off	= exynos4210_power_off,
+	},
+	{
+		.label		= "hsic0",
+		.id		= EXYNOS4210_HSIC0,
+		.power_on	= exynos4210_power_on,
+		.power_off	= exynos4210_power_off,
+	},
+	{
+		.label		= "hsic1",
+		.id		= EXYNOS4210_HSIC1,
+		.power_on	= exynos4210_power_on,
+		.power_off	= exynos4210_power_off,
+	},
+	{},
+  };
+
+- `int (*power_on)(struct samsung_usb2_phy_instance *);`
+  `int (*power_off)(struct samsung_usb2_phy_instance *);`
+
+	These two callbacks are used to power on and power off the phy
+	by modifying appropriate registers.
+
+Final change to the driver is adding appropriate compatible value to the
+phy-samsung-usb2.c file. In case of Exynos 4210 the following lines were
+added to the struct of_device_id samsung_usb2_phy_of_match[] array::
+
+  #ifdef CONFIG_PHY_EXYNOS4210_USB2
+	{
+		.compatible = "samsung,exynos4210-usb2-phy",
+		.data = &exynos4210_usb2_phy_config,
+	},
+  #endif
+
+To add further flexibility to the driver the Kconfig file enables to
+include support for selected SoCs in the compiled driver. The Kconfig
+entry for Exynos 4210 is following::
+
+  config PHY_EXYNOS4210_USB2
+	bool "Support for Exynos 4210"
+	depends on PHY_SAMSUNG_USB2
+	depends on CPU_EXYNOS4210
+	help
+	  Enable USB PHY support for Exynos 4210. This option requires that
+	  Samsung USB 2.0 PHY driver is enabled and means that support for this
+	  particular SoC is compiled in the driver. In case of Exynos 4210 four
+	  phys are available - device, host, HSCI0 and HSCI1.
+
+The newly created file that supports the new SoC has to be also added to the
+Makefile. In case of Exynos 4210 the added line is following::
+
+  obj-$(CONFIG_PHY_EXYNOS4210_USB2)       += phy-exynos4210-usb2.o
+
+After completing these steps the support for the new SoC should be ready.
diff --git a/Documentation/driver-api/pm/conf.py b/Documentation/driver-api/pm/conf.py
deleted file mode 100644
index a89fac11272f..000000000000
--- a/Documentation/driver-api/pm/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = "Device Power Management"
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'pm.tex', project,
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst
index 30835683616a..f66c7b9126ea 100644
--- a/Documentation/driver-api/pm/devices.rst
+++ b/Documentation/driver-api/pm/devices.rst
@@ -225,7 +225,7 @@ system-wide transition to a sleep state even though its :c:member:`runtime_auto`
 flag is clear.
 
 For more information about the runtime power management framework, refer to
-:file:`Documentation/power/runtime_pm.txt`.
+:file:`Documentation/power/runtime_pm.rst`.
 
 
 Calling Drivers to Enter and Leave System Sleep States
@@ -728,7 +728,7 @@ it into account in any way.
 
 Devices may be defined as IRQ-safe which indicates to the PM core that their
 runtime PM callbacks may be invoked with disabled interrupts (see
-:file:`Documentation/power/runtime_pm.txt` for more information).  If an
+:file:`Documentation/power/runtime_pm.rst` for more information).  If an
 IRQ-safe device belongs to a PM domain, the runtime PM of the domain will be
 disallowed, unless the domain itself is defined as IRQ-safe. However, it
 makes sense to define a PM domain as IRQ-safe only if all the devices in it
@@ -795,7 +795,7 @@ so on) and the final state of the device must reflect the "active" runtime PM
 status in that case.
 
 During system-wide resume from a sleep state it's easiest to put devices into
-the full-power state, as explained in :file:`Documentation/power/runtime_pm.txt`.
+the full-power state, as explained in :file:`Documentation/power/runtime_pm.rst`.
 [Refer to that document for more information regarding this particular issue as
 well as for information on the device runtime power management framework in
 general.]
diff --git a/Documentation/driver-api/pps.rst b/Documentation/driver-api/pps.rst
new file mode 100644
index 000000000000..2d6b99766ee8
--- /dev/null
+++ b/Documentation/driver-api/pps.rst
@@ -0,0 +1,242 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+PPS - Pulse Per Second
+======================
+
+Copyright (C) 2007 Rodolfo Giometti <giometti@enneenne.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+
+
+Overview
+--------
+
+LinuxPPS provides a programming interface (API) to define in the
+system several PPS sources.
+
+PPS means "pulse per second" and a PPS source is just a device which
+provides a high precision signal each second so that an application
+can use it to adjust system clock time.
+
+A PPS source can be connected to a serial port (usually to the Data
+Carrier Detect pin) or to a parallel port (ACK-pin) or to a special
+CPU's GPIOs (this is the common case in embedded systems) but in each
+case when a new pulse arrives the system must apply to it a timestamp
+and record it for userland.
+
+Common use is the combination of the NTPD as userland program, with a
+GPS receiver as PPS source, to obtain a wallclock-time with
+sub-millisecond synchronisation to UTC.
+
+
+RFC considerations
+------------------
+
+While implementing a PPS API as RFC 2783 defines and using an embedded
+CPU GPIO-Pin as physical link to the signal, I encountered a deeper
+problem:
+
+   At startup it needs a file descriptor as argument for the function
+   time_pps_create().
+
+This implies that the source has a /dev/... entry. This assumption is
+OK for the serial and parallel port, where you can do something
+useful besides(!) the gathering of timestamps as it is the central
+task for a PPS API. But this assumption does not work for a single
+purpose GPIO line. In this case even basic file-related functionality
+(like read() and write()) makes no sense at all and should not be a
+precondition for the use of a PPS API.
+
+The problem can be simply solved if you consider that a PPS source is
+not always connected with a GPS data source.
+
+So your programs should check if the GPS data source (the serial port
+for instance) is a PPS source too, and if not they should provide the
+possibility to open another device as PPS source.
+
+In LinuxPPS the PPS sources are simply char devices usually mapped
+into files /dev/pps0, /dev/pps1, etc.
+
+
+PPS with USB to serial devices
+------------------------------
+
+It is possible to grab the PPS from an USB to serial device. However,
+you should take into account the latencies and jitter introduced by
+the USB stack. Users have reported clock instability around +-1ms when
+synchronized with PPS through USB. With USB 2.0, jitter may decrease
+down to the order of 125 microseconds.
+
+This may be suitable for time server synchronization with NTP because
+of its undersampling and algorithms.
+
+If your device doesn't report PPS, you can check that the feature is
+supported by its driver. Most of the time, you only need to add a call
+to usb_serial_handle_dcd_change after checking the DCD status (see
+ch341 and pl2303 examples).
+
+
+Coding example
+--------------
+
+To register a PPS source into the kernel you should define a struct
+pps_source_info as follows::
+
+    static struct pps_source_info pps_ktimer_info = {
+	    .name         = "ktimer",
+	    .path         = "",
+	    .mode         = PPS_CAPTUREASSERT | PPS_OFFSETASSERT |
+			    PPS_ECHOASSERT |
+			    PPS_CANWAIT | PPS_TSFMT_TSPEC,
+	    .echo         = pps_ktimer_echo,
+	    .owner        = THIS_MODULE,
+    };
+
+and then calling the function pps_register_source() in your
+initialization routine as follows::
+
+    source = pps_register_source(&pps_ktimer_info,
+			PPS_CAPTUREASSERT | PPS_OFFSETASSERT);
+
+The pps_register_source() prototype is::
+
+  int pps_register_source(struct pps_source_info *info, int default_params)
+
+where "info" is a pointer to a structure that describes a particular
+PPS source, "default_params" tells the system what the initial default
+parameters for the device should be (it is obvious that these parameters
+must be a subset of ones defined in the struct
+pps_source_info which describe the capabilities of the driver).
+
+Once you have registered a new PPS source into the system you can
+signal an assert event (for example in the interrupt handler routine)
+just using::
+
+    pps_event(source, &ts, PPS_CAPTUREASSERT, ptr)
+
+where "ts" is the event's timestamp.
+
+The same function may also run the defined echo function
+(pps_ktimer_echo(), passing to it the "ptr" pointer) if the user
+asked for that... etc..
+
+Please see the file drivers/pps/clients/pps-ktimer.c for example code.
+
+
+SYSFS support
+-------------
+
+If the SYSFS filesystem is enabled in the kernel it provides a new class::
+
+   $ ls /sys/class/pps/
+   pps0/  pps1/  pps2/
+
+Every directory is the ID of a PPS sources defined in the system and
+inside you find several files::
+
+   $ ls -F /sys/class/pps/pps0/
+   assert     dev        mode       path       subsystem@
+   clear      echo       name       power/     uevent
+
+
+Inside each "assert" and "clear" file you can find the timestamp and a
+sequence number::
+
+   $ cat /sys/class/pps/pps0/assert
+   1170026870.983207967#8
+
+Where before the "#" is the timestamp in seconds; after it is the
+sequence number. Other files are:
+
+ * echo: reports if the PPS source has an echo function or not;
+
+ * mode: reports available PPS functioning modes;
+
+ * name: reports the PPS source's name;
+
+ * path: reports the PPS source's device path, that is the device the
+   PPS source is connected to (if it exists).
+
+
+Testing the PPS support
+-----------------------
+
+In order to test the PPS support even without specific hardware you can use
+the pps-ktimer driver (see the client subsection in the PPS configuration menu)
+and the userland tools available in your distribution's pps-tools package,
+http://linuxpps.org , or https://github.com/redlab-i/pps-tools.
+
+Once you have enabled the compilation of pps-ktimer just modprobe it (if
+not statically compiled)::
+
+   # modprobe pps-ktimer
+
+and the run ppstest as follow::
+
+   $ ./ppstest /dev/pps1
+   trying PPS source "/dev/pps1"
+   found PPS source "/dev/pps1"
+   ok, found 1 source(s), now start fetching data...
+   source 0 - assert 1186592699.388832443, sequence: 364 - clear  0.000000000, sequence: 0
+   source 0 - assert 1186592700.388931295, sequence: 365 - clear  0.000000000, sequence: 0
+   source 0 - assert 1186592701.389032765, sequence: 366 - clear  0.000000000, sequence: 0
+
+Please note that to compile userland programs, you need the file timepps.h.
+This is available in the pps-tools repository mentioned above.
+
+
+Generators
+----------
+
+Sometimes one needs to be able not only to catch PPS signals but to produce
+them also. For example, running a distributed simulation, which requires
+computers' clock to be synchronized very tightly. One way to do this is to
+invent some complicated hardware solutions but it may be neither necessary
+nor affordable. The cheap way is to load a PPS generator on one of the
+computers (master) and PPS clients on others (slaves), and use very simple
+cables to deliver signals using parallel ports, for example.
+
+Parallel port cable pinout::
+
+	pin	name	master      slave
+	1	STROBE	  *------     *
+	2	D0	  *     |     *
+	3	D1	  *     |     *
+	4	D2	  *     |     *
+	5	D3	  *     |     *
+	6	D4	  *     |     *
+	7	D5	  *     |     *
+	8	D6	  *     |     *
+	9	D7	  *     |     *
+	10	ACK	  *     ------*
+	11	BUSY	  *           *
+	12	PE	  *           *
+	13	SEL	  *           *
+	14	AUTOFD	  *           *
+	15	ERROR	  *           *
+	16	INIT	  *           *
+	17	SELIN	  *           *
+	18-25	GND	  *-----------*
+
+Please note that parallel port interrupt occurs only on high->low transition,
+so it is used for PPS assert edge. PPS clear edge can be determined only
+using polling in the interrupt handler which actually can be done way more
+precisely because interrupt handling delays can be quite big and random. So
+current parport PPS generator implementation (pps_gen_parport module) is
+geared towards using the clear edge for time synchronization.
+
+Clear edge polling is done with disabled interrupts so it's better to select
+delay between assert and clear edge as small as possible to reduce system
+latencies. But if it is too small slave won't be able to capture clear edge
+transition. The default of 30us should be good enough in most situations.
+The delay can be selected using 'delay' pps_gen_parport module parameter.
diff --git a/Documentation/driver-api/pti_intel_mid.rst b/Documentation/driver-api/pti_intel_mid.rst
new file mode 100644
index 000000000000..20f1cff42d5f
--- /dev/null
+++ b/Documentation/driver-api/pti_intel_mid.rst
@@ -0,0 +1,106 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Intel MID PTI
+=============
+
+The Intel MID PTI project is HW implemented in Intel Atom
+system-on-a-chip designs based on the Parallel Trace
+Interface for MIPI P1149.7 cJTAG standard.  The kernel solution
+for this platform involves the following files::
+
+	./include/linux/pti.h
+	./drivers/.../n_tracesink.h
+	./drivers/.../n_tracerouter.c
+	./drivers/.../n_tracesink.c
+	./drivers/.../pti.c
+
+pti.c is the driver that enables various debugging features
+popular on platforms from certain mobile manufacturers.
+n_tracerouter.c and n_tracesink.c allow extra system information to
+be collected and routed to the pti driver, such as trace
+debugging data from a modem.  Although n_tracerouter
+and n_tracesink are a part of the complete PTI solution,
+these two line disciplines can work separately from
+pti.c and route any data stream from one /dev/tty node
+to another /dev/tty node via kernel-space.  This provides
+a stable, reliable connection that will not break unless
+the user-space application shuts down (plus avoids
+kernel->user->kernel context switch overheads of routing
+data).
+
+An example debugging usage for this driver system:
+
+  * Hook /dev/ttyPTI0 to syslogd.  Opening this port will also start
+    a console device to further capture debugging messages to PTI.
+  * Hook /dev/ttyPTI1 to modem debugging data to write to PTI HW.
+    This is where n_tracerouter and n_tracesink are used.
+  * Hook /dev/pti to a user-level debugging application for writing
+    to PTI HW.
+  * `Use mipi_` Kernel Driver API in other device drivers for
+    debugging to PTI by first requesting a PTI write address via
+    mipi_request_masterchannel(1).
+
+Below is example pseudo-code on how a 'privileged' application
+can hook up n_tracerouter and n_tracesink to any tty on
+a system.  'Privileged' means the application has enough
+privileges to successfully manipulate the ldisc drivers
+but is not just blindly executing as 'root'. Keep in mind
+the use of ioctl(,TIOCSETD,) is not specific to the n_tracerouter
+and n_tracesink line discpline drivers but is a generic
+operation for a program to use a line discpline driver
+on a tty port other than the default n_tty::
+
+  /////////// To hook up n_tracerouter and n_tracesink /////////
+
+  // Note that n_tracerouter depends on n_tracesink.
+  #include <errno.h>
+  #define ONE_TTY "/dev/ttyOne"
+  #define TWO_TTY "/dev/ttyTwo"
+
+  // needed global to hand onto ldisc connection
+  static int g_fd_source = -1;
+  static int g_fd_sink  = -1;
+
+  // these two vars used to grab LDISC values from loaded ldisc drivers
+  // in OS.  Look at /proc/tty/ldiscs to get the right numbers from
+  // the ldiscs loaded in the system.
+  int source_ldisc_num, sink_ldisc_num = -1;
+  int retval;
+
+  g_fd_source = open(ONE_TTY, O_RDWR); // must be R/W
+  g_fd_sink   = open(TWO_TTY, O_RDWR); // must be R/W
+
+  if (g_fd_source <= 0) || (g_fd_sink <= 0) {
+     // doubt you'll want to use these exact error lines of code
+     printf("Error on open(). errno: %d\n",errno);
+     return errno;
+  }
+
+  retval = ioctl(g_fd_sink, TIOCSETD, &sink_ldisc_num);
+  if (retval < 0) {
+     printf("Error on ioctl().  errno: %d\n", errno);
+     return errno;
+  }
+
+  retval = ioctl(g_fd_source, TIOCSETD, &source_ldisc_num);
+  if (retval < 0) {
+     printf("Error on ioctl().  errno: %d\n", errno);
+     return errno;
+  }
+
+  /////////// To disconnect n_tracerouter and n_tracesink ////////
+
+  // First make sure data through the ldiscs has stopped.
+
+  // Second, disconnect ldiscs.  This provides a
+  // little cleaner shutdown on tty stack.
+  sink_ldisc_num = 0;
+  source_ldisc_num = 0;
+  ioctl(g_fd_uart, TIOCSETD, &sink_ldisc_num);
+  ioctl(g_fd_gadget, TIOCSETD, &source_ldisc_num);
+
+  // Three, program closes connection, and cleanup:
+  close(g_fd_uart);
+  close(g_fd_gadget);
+  g_fd_uart = g_fd_gadget = NULL;
diff --git a/Documentation/driver-api/ptp.rst b/Documentation/driver-api/ptp.rst
new file mode 100644
index 000000000000..a15192e32347
--- /dev/null
+++ b/Documentation/driver-api/ptp.rst
@@ -0,0 +1,96 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================================
+PTP hardware clock infrastructure for Linux
+===========================================
+
+  This patch set introduces support for IEEE 1588 PTP clocks in
+  Linux. Together with the SO_TIMESTAMPING socket options, this
+  presents a standardized method for developing PTP user space
+  programs, synchronizing Linux with external clocks, and using the
+  ancillary features of PTP hardware clocks.
+
+  A new class driver exports a kernel interface for specific clock
+  drivers and a user space interface. The infrastructure supports a
+  complete set of PTP hardware clock functionality.
+
+  + Basic clock operations
+    - Set time
+    - Get time
+    - Shift the clock by a given offset atomically
+    - Adjust clock frequency
+
+  + Ancillary clock features
+    - Time stamp external events
+    - Period output signals configurable from user space
+    - Synchronization of the Linux system time via the PPS subsystem
+
+PTP hardware clock kernel API
+=============================
+
+   A PTP clock driver registers itself with the class driver. The
+   class driver handles all of the dealings with user space. The
+   author of a clock driver need only implement the details of
+   programming the clock hardware. The clock driver notifies the class
+   driver of asynchronous events (alarms and external time stamps) via
+   a simple message passing interface.
+
+   The class driver supports multiple PTP clock drivers. In normal use
+   cases, only one PTP clock is needed. However, for testing and
+   development, it can be useful to have more than one clock in a
+   single system, in order to allow performance comparisons.
+
+PTP hardware clock user space API
+=================================
+
+   The class driver also creates a character device for each
+   registered clock. User space can use an open file descriptor from
+   the character device as a POSIX clock id and may call
+   clock_gettime, clock_settime, and clock_adjtime.  These calls
+   implement the basic clock operations.
+
+   User space programs may control the clock using standardized
+   ioctls. A program may query, enable, configure, and disable the
+   ancillary clock features. User space can receive time stamped
+   events via blocking read() and poll().
+
+Writing clock drivers
+=====================
+
+   Clock drivers include include/linux/ptp_clock_kernel.h and register
+   themselves by presenting a 'struct ptp_clock_info' to the
+   registration method. Clock drivers must implement all of the
+   functions in the interface. If a clock does not offer a particular
+   ancillary feature, then the driver should just return -EOPNOTSUPP
+   from those functions.
+
+   Drivers must ensure that all of the methods in interface are
+   reentrant. Since most hardware implementations treat the time value
+   as a 64 bit integer accessed as two 32 bit registers, drivers
+   should use spin_lock_irqsave/spin_unlock_irqrestore to protect
+   against concurrent access. This locking cannot be accomplished in
+   class driver, since the lock may also be needed by the clock
+   driver's interrupt service routine.
+
+Supported hardware
+==================
+
+   * Freescale eTSEC gianfar
+
+     - 2 Time stamp external triggers, programmable polarity (opt. interrupt)
+     - 2 Alarm registers (optional interrupt)
+     - 3 Periodic signals (optional interrupt)
+
+   * National DP83640
+
+     - 6 GPIOs programmable as inputs or outputs
+     - 6 GPIOs with dedicated functions (LED/JTAG/clock) can also be
+       used as general inputs or outputs
+     - GPIO inputs can time stamp external triggers
+     - GPIO outputs can produce periodic signals
+     - 1 interrupt pin
+
+   * Intel IXP465
+
+     - Auxiliary Slave/Master Mode Snapshot (optional interrupt)
+     - Target Time (optional interrupt)
diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst
new file mode 100644
index 000000000000..ab62f1bb0366
--- /dev/null
+++ b/Documentation/driver-api/pwm.rst
@@ -0,0 +1,165 @@
+======================================
+Pulse Width Modulation (PWM) interface
+======================================
+
+This provides an overview about the Linux PWM interface
+
+PWMs are commonly used for controlling LEDs, fans or vibrators in
+cell phones. PWMs with a fixed purpose have no need implementing
+the Linux PWM API (although they could). However, PWMs are often
+found as discrete devices on SoCs which have no fixed purpose. It's
+up to the board designer to connect them to LEDs or fans. To provide
+this kind of flexibility the generic PWM API exists.
+
+Identifying PWMs
+----------------
+
+Users of the legacy PWM API use unique IDs to refer to PWM devices.
+
+Instead of referring to a PWM device via its unique ID, board setup code
+should instead register a static mapping that can be used to match PWM
+consumers to providers, as given in the following example::
+
+	static struct pwm_lookup board_pwm_lookup[] = {
+		PWM_LOOKUP("tegra-pwm", 0, "pwm-backlight", NULL,
+			   50000, PWM_POLARITY_NORMAL),
+	};
+
+	static void __init board_init(void)
+	{
+		...
+		pwm_add_table(board_pwm_lookup, ARRAY_SIZE(board_pwm_lookup));
+		...
+	}
+
+Using PWMs
+----------
+
+Legacy users can request a PWM device using pwm_request() and free it
+after usage with pwm_free().
+
+New users should use the pwm_get() function and pass to it the consumer
+device or a consumer name. pwm_put() is used to free the PWM device. Managed
+variants of these functions, devm_pwm_get() and devm_pwm_put(), also exist.
+
+After being requested, a PWM has to be configured using::
+
+	int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state);
+
+This API controls both the PWM period/duty_cycle config and the
+enable/disable state.
+
+The pwm_config(), pwm_enable() and pwm_disable() functions are just wrappers
+around pwm_apply_state() and should not be used if the user wants to change
+several parameter at once. For example, if you see pwm_config() and
+pwm_{enable,disable}() calls in the same function, this probably means you
+should switch to pwm_apply_state().
+
+The PWM user API also allows one to query the PWM state with pwm_get_state().
+
+In addition to the PWM state, the PWM API also exposes PWM arguments, which
+are the reference PWM config one should use on this PWM.
+PWM arguments are usually platform-specific and allows the PWM user to only
+care about dutycycle relatively to the full period (like, duty = 50% of the
+period). struct pwm_args contains 2 fields (period and polarity) and should
+be used to set the initial PWM config (usually done in the probe function
+of the PWM user). PWM arguments are retrieved with pwm_get_args().
+
+All consumers should really be reconfiguring the PWM upon resume as
+appropriate. This is the only way to ensure that everything is resumed in
+the proper order.
+
+Using PWMs with the sysfs interface
+-----------------------------------
+
+If CONFIG_SYSFS is enabled in your kernel configuration a simple sysfs
+interface is provided to use the PWMs from userspace. It is exposed at
+/sys/class/pwm/. Each probed PWM controller/chip will be exported as
+pwmchipN, where N is the base of the PWM chip. Inside the directory you
+will find:
+
+  npwm
+    The number of PWM channels this chip supports (read-only).
+
+  export
+    Exports a PWM channel for use with sysfs (write-only).
+
+  unexport
+   Unexports a PWM channel from sysfs (write-only).
+
+The PWM channels are numbered using a per-chip index from 0 to npwm-1.
+
+When a PWM channel is exported a pwmX directory will be created in the
+pwmchipN directory it is associated with, where X is the number of the
+channel that was exported. The following properties will then be available:
+
+  period
+    The total period of the PWM signal (read/write).
+    Value is in nanoseconds and is the sum of the active and inactive
+    time of the PWM.
+
+  duty_cycle
+    The active time of the PWM signal (read/write).
+    Value is in nanoseconds and must be less than the period.
+
+  polarity
+    Changes the polarity of the PWM signal (read/write).
+    Writes to this property only work if the PWM chip supports changing
+    the polarity. The polarity can only be changed if the PWM is not
+    enabled. Value is the string "normal" or "inversed".
+
+  enable
+    Enable/disable the PWM signal (read/write).
+
+	- 0 - disabled
+	- 1 - enabled
+
+Implementing a PWM driver
+-------------------------
+
+Currently there are two ways to implement pwm drivers. Traditionally
+there only has been the barebone API meaning that each driver has
+to implement the pwm_*() functions itself. This means that it's impossible
+to have multiple PWM drivers in the system. For this reason it's mandatory
+for new drivers to use the generic PWM framework.
+
+A new PWM controller/chip can be added using pwmchip_add() and removed
+again with pwmchip_remove(). pwmchip_add() takes a filled in struct
+pwm_chip as argument which provides a description of the PWM chip, the
+number of PWM devices provided by the chip and the chip-specific
+implementation of the supported PWM operations to the framework.
+
+When implementing polarity support in a PWM driver, make sure to respect the
+signal conventions in the PWM framework. By definition, normal polarity
+characterizes a signal starts high for the duration of the duty cycle and
+goes low for the remainder of the period. Conversely, a signal with inversed
+polarity starts low for the duration of the duty cycle and goes high for the
+remainder of the period.
+
+Drivers are encouraged to implement ->apply() instead of the legacy
+->enable(), ->disable() and ->config() methods. Doing that should provide
+atomicity in the PWM config workflow, which is required when the PWM controls
+a critical device (like a regulator).
+
+The implementation of ->get_state() (a method used to retrieve initial PWM
+state) is also encouraged for the same reason: letting the PWM user know
+about the current PWM state would allow him to avoid glitches.
+
+Drivers should not implement any power management. In other words,
+consumers should implement it as described in the "Using PWMs" section.
+
+Locking
+-------
+
+The PWM core list manipulations are protected by a mutex, so pwm_request()
+and pwm_free() may not be called from an atomic context. Currently the
+PWM core does not enforce any locking to pwm_enable(), pwm_disable() and
+pwm_config(), so the calling context is currently driver specific. This
+is an issue derived from the former barebone API and should be fixed soon.
+
+Helpers
+-------
+
+Currently a PWM can only be configured with period_ns and duty_ns. For several
+use cases freq_hz and duty_percent might be better. Instead of calculating
+this in your driver please consider adding appropriate helpers to the framework.
diff --git a/Documentation/driver-api/rapidio/index.rst b/Documentation/driver-api/rapidio/index.rst
new file mode 100644
index 000000000000..a41b4242d16f
--- /dev/null
+++ b/Documentation/driver-api/rapidio/index.rst
@@ -0,0 +1,15 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================
+The Linux RapidIO Subsystem
+===========================
+
+.. toctree::
+   :maxdepth: 1
+
+   rapidio
+   sysfs
+
+   tsi721
+   mport_cdev
+   rio_cm
diff --git a/Documentation/driver-api/rapidio/mport_cdev.rst b/Documentation/driver-api/rapidio/mport_cdev.rst
new file mode 100644
index 000000000000..df77a7f7be7d
--- /dev/null
+++ b/Documentation/driver-api/rapidio/mport_cdev.rst
@@ -0,0 +1,110 @@
+==================================================================
+RapidIO subsystem mport character device driver (rio_mport_cdev.c)
+==================================================================
+
+1. Overview
+===========
+
+This device driver is the result of collaboration within the RapidIO.org
+Software Task Group (STG) between Texas Instruments, Freescale,
+Prodrive Technologies, Nokia Networks, BAE and IDT.  Additional input was
+received from other members of RapidIO.org. The objective was to create a
+character mode driver interface which exposes the capabilities of RapidIO
+devices directly to applications, in a manner that allows the numerous and
+varied RapidIO implementations to interoperate.
+
+This driver (MPORT_CDEV) provides access to basic RapidIO subsystem operations
+for user-space applications. Most of RapidIO operations are supported through
+'ioctl' system calls.
+
+When loaded this device driver creates filesystem nodes named rio_mportX in /dev
+directory for each registered RapidIO mport device. 'X' in the node name matches
+to unique port ID assigned to each local mport device.
+
+Using available set of ioctl commands user-space applications can perform
+following RapidIO bus and subsystem operations:
+
+- Reads and writes from/to configuration registers of mport devices
+  (RIO_MPORT_MAINT_READ_LOCAL/RIO_MPORT_MAINT_WRITE_LOCAL)
+- Reads and writes from/to configuration registers of remote RapidIO devices.
+  This operations are defined as RapidIO Maintenance reads/writes in RIO spec.
+  (RIO_MPORT_MAINT_READ_REMOTE/RIO_MPORT_MAINT_WRITE_REMOTE)
+- Set RapidIO Destination ID for mport devices (RIO_MPORT_MAINT_HDID_SET)
+- Set RapidIO Component Tag for mport devices (RIO_MPORT_MAINT_COMPTAG_SET)
+- Query logical index of mport devices (RIO_MPORT_MAINT_PORT_IDX_GET)
+- Query capabilities and RapidIO link configuration of mport devices
+  (RIO_MPORT_GET_PROPERTIES)
+- Enable/Disable reporting of RapidIO doorbell events to user-space applications
+  (RIO_ENABLE_DOORBELL_RANGE/RIO_DISABLE_DOORBELL_RANGE)
+- Enable/Disable reporting of RIO port-write events to user-space applications
+  (RIO_ENABLE_PORTWRITE_RANGE/RIO_DISABLE_PORTWRITE_RANGE)
+- Query/Control type of events reported through this driver: doorbells,
+  port-writes or both (RIO_SET_EVENT_MASK/RIO_GET_EVENT_MASK)
+- Configure/Map mport's outbound requests window(s) for specific size,
+  RapidIO destination ID, hopcount and request type
+  (RIO_MAP_OUTBOUND/RIO_UNMAP_OUTBOUND)
+- Configure/Map mport's inbound requests window(s) for specific size,
+  RapidIO base address and local memory base address
+  (RIO_MAP_INBOUND/RIO_UNMAP_INBOUND)
+- Allocate/Free contiguous DMA coherent memory buffer for DMA data transfers
+  to/from remote RapidIO devices (RIO_ALLOC_DMA/RIO_FREE_DMA)
+- Initiate DMA data transfers to/from remote RapidIO devices (RIO_TRANSFER).
+  Supports blocking, asynchronous and posted (a.k.a 'fire-and-forget') data
+  transfer modes.
+- Check/Wait for completion of asynchronous DMA data transfer
+  (RIO_WAIT_FOR_ASYNC)
+- Manage device objects supported by RapidIO subsystem (RIO_DEV_ADD/RIO_DEV_DEL).
+  This allows implementation of various RapidIO fabric enumeration algorithms
+  as user-space applications while using remaining functionality provided by
+  kernel RapidIO subsystem.
+
+2. Hardware Compatibility
+=========================
+
+This device driver uses standard interfaces defined by kernel RapidIO subsystem
+and therefore it can be used with any mport device driver registered by RapidIO
+subsystem with limitations set by available mport implementation.
+
+At this moment the most common limitation is availability of RapidIO-specific
+DMA engine framework for specific mport device. Users should verify available
+functionality of their platform when planning to use this driver:
+
+- IDT Tsi721 PCIe-to-RapidIO bridge device and its mport device driver are fully
+  compatible with this driver.
+- Freescale SoCs 'fsl_rio' mport driver does not have implementation for RapidIO
+  specific DMA engine support and therefore DMA data transfers mport_cdev driver
+  are not available.
+
+3. Module parameters
+====================
+
+- 'dma_timeout'
+      - DMA transfer completion timeout (in msec, default value 3000).
+        This parameter set a maximum completion wait time for SYNC mode DMA
+        transfer requests and for RIO_WAIT_FOR_ASYNC ioctl requests.
+
+- 'dbg_level'
+      - This parameter allows to control amount of debug information
+        generated by this device driver. This parameter is formed by set of
+        bit masks that correspond to the specific functional blocks.
+        For mask definitions see 'drivers/rapidio/devices/rio_mport_cdev.c'
+        This parameter can be changed dynamically.
+        Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
+
+4. Known problems
+=================
+
+  None.
+
+5. User-space Applications and API
+==================================
+
+API library and applications that use this device driver are available from
+RapidIO.org.
+
+6. TODO List
+============
+
+- Add support for sending/receiving "raw" RapidIO messaging packets.
+- Add memory mapped DMA data transfers as an option when RapidIO-specific DMA
+  is not available.
diff --git a/Documentation/driver-api/rapidio/rapidio.rst b/Documentation/driver-api/rapidio/rapidio.rst
new file mode 100644
index 000000000000..fb8942d3ba85
--- /dev/null
+++ b/Documentation/driver-api/rapidio/rapidio.rst
@@ -0,0 +1,362 @@
+============
+Introduction
+============
+
+The RapidIO standard is a packet-based fabric interconnect standard designed for
+use in embedded systems. Development of the RapidIO standard is directed by the
+RapidIO Trade Association (RTA). The current version of the RapidIO specification
+is publicly available for download from the RTA web-site [1].
+
+This document describes the basics of the Linux RapidIO subsystem and provides
+information on its major components.
+
+1 Overview
+==========
+
+Because the RapidIO subsystem follows the Linux device model it is integrated
+into the kernel similarly to other buses by defining RapidIO-specific device and
+bus types and registering them within the device model.
+
+The Linux RapidIO subsystem is architecture independent and therefore defines
+architecture-specific interfaces that provide support for common RapidIO
+subsystem operations.
+
+2. Core Components
+==================
+
+A typical RapidIO network is a combination of endpoints and switches.
+Each of these components is represented in the subsystem by an associated data
+structure. The core logical components of the RapidIO subsystem are defined
+in include/linux/rio.h file.
+
+2.1 Master Port
+---------------
+
+A master port (or mport) is a RapidIO interface controller that is local to the
+processor executing the Linux code. A master port generates and receives RapidIO
+packets (transactions). In the RapidIO subsystem each master port is represented
+by a rio_mport data structure. This structure contains master port specific
+resources such as mailboxes and doorbells. The rio_mport also includes a unique
+host device ID that is valid when a master port is configured as an enumerating
+host.
+
+RapidIO master ports are serviced by subsystem specific mport device drivers
+that provide functionality defined for this subsystem. To provide a hardware
+independent interface for RapidIO subsystem operations, rio_mport structure
+includes rio_ops data structure which contains pointers to hardware specific
+implementations of RapidIO functions.
+
+2.2 Device
+----------
+
+A RapidIO device is any endpoint (other than mport) or switch in the network.
+All devices are presented in the RapidIO subsystem by corresponding rio_dev data
+structure. Devices form one global device list and per-network device lists
+(depending on number of available mports and networks).
+
+2.3 Switch
+----------
+
+A RapidIO switch is a special class of device that routes packets between its
+ports towards their final destination. The packet destination port within a
+switch is defined by an internal routing table. A switch is presented in the
+RapidIO subsystem by rio_dev data structure expanded by additional rio_switch
+data structure, which contains switch specific information such as copy of the
+routing table and pointers to switch specific functions.
+
+The RapidIO subsystem defines the format and initialization method for subsystem
+specific switch drivers that are designed to provide hardware-specific
+implementation of common switch management routines.
+
+2.4 Network
+-----------
+
+A RapidIO network is a combination of interconnected endpoint and switch devices.
+Each RapidIO network known to the system is represented by corresponding rio_net
+data structure. This structure includes lists of all devices and local master
+ports that form the same network. It also contains a pointer to the default
+master port that is used to communicate with devices within the network.
+
+2.5 Device Drivers
+------------------
+
+RapidIO device-specific drivers follow Linux Kernel Driver Model and are
+intended to support specific RapidIO devices attached to the RapidIO network.
+
+2.6 Subsystem Interfaces
+------------------------
+
+RapidIO interconnect specification defines features that may be used to provide
+one or more common service layers for all participating RapidIO devices. These
+common services may act separately from device-specific drivers or be used by
+device-specific drivers. Example of such service provider is the RIONET driver
+which implements Ethernet-over-RapidIO interface. Because only one driver can be
+registered for a device, all common RapidIO services have to be registered as
+subsystem interfaces. This allows to have multiple common services attached to
+the same device without blocking attachment of a device-specific driver.
+
+3. Subsystem Initialization
+===========================
+
+In order to initialize the RapidIO subsystem, a platform must initialize and
+register at least one master port within the RapidIO network. To register mport
+within the subsystem controller driver's initialization code calls function
+rio_register_mport() for each available master port.
+
+After all active master ports are registered with a RapidIO subsystem,
+an enumeration and/or discovery routine may be called automatically or
+by user-space command.
+
+RapidIO subsystem can be configured to be built as a statically linked or
+modular component of the kernel (see details below).
+
+4. Enumeration and Discovery
+============================
+
+4.1 Overview
+------------
+
+RapidIO subsystem configuration options allow users to build enumeration and
+discovery methods as statically linked components or loadable modules.
+An enumeration/discovery method implementation and available input parameters
+define how any given method can be attached to available RapidIO mports:
+simply to all available mports OR individually to the specified mport device.
+
+Depending on selected enumeration/discovery build configuration, there are
+several methods to initiate an enumeration and/or discovery process:
+
+  (a) Statically linked enumeration and discovery process can be started
+  automatically during kernel initialization time using corresponding module
+  parameters. This was the original method used since introduction of RapidIO
+  subsystem. Now this method relies on enumerator module parameter which is
+  'rio-scan.scan' for existing basic enumeration/discovery method.
+  When automatic start of enumeration/discovery is used a user has to ensure
+  that all discovering endpoints are started before the enumerating endpoint
+  and are waiting for enumeration to be completed.
+  Configuration option CONFIG_RAPIDIO_DISC_TIMEOUT defines time that discovering
+  endpoint waits for enumeration to be completed. If the specified timeout
+  expires the discovery process is terminated without obtaining RapidIO network
+  information. NOTE: a timed out discovery process may be restarted later using
+  a user-space command as it is described below (if the given endpoint was
+  enumerated successfully).
+
+  (b) Statically linked enumeration and discovery process can be started by
+  a command from user space. This initiation method provides more flexibility
+  for a system startup compared to the option (a) above. After all participating
+  endpoints have been successfully booted, an enumeration process shall be
+  started first by issuing a user-space command, after an enumeration is
+  completed a discovery process can be started on all remaining endpoints.
+
+  (c) Modular enumeration and discovery process can be started by a command from
+  user space. After an enumeration/discovery module is loaded, a network scan
+  process can be started by issuing a user-space command.
+  Similar to the option (b) above, an enumerator has to be started first.
+
+  (d) Modular enumeration and discovery process can be started by a module
+  initialization routine. In this case an enumerating module shall be loaded
+  first.
+
+When a network scan process is started it calls an enumeration or discovery
+routine depending on the configured role of a master port: host or agent.
+
+Enumeration is performed by a master port if it is configured as a host port by
+assigning a host destination ID greater than or equal to zero. The host
+destination ID can be assigned to a master port using various methods depending
+on RapidIO subsystem build configuration:
+
+  (a) For a statically linked RapidIO subsystem core use command line parameter
+  "rapidio.hdid=" with a list of destination ID assignments in order of mport
+  device registration. For example, in a system with two RapidIO controllers
+  the command line parameter "rapidio.hdid=-1,7" will result in assignment of
+  the host destination ID=7 to the second RapidIO controller, while the first
+  one will be assigned destination ID=-1.
+
+  (b) If the RapidIO subsystem core is built as a loadable module, in addition
+  to the method shown above, the host destination ID(s) can be specified using
+  traditional methods of passing module parameter "hdid=" during its loading:
+
+  - from command line: "modprobe rapidio hdid=-1,7", or
+  - from modprobe configuration file using configuration command "options",
+    like in this example: "options rapidio hdid=-1,7". An example of modprobe
+    configuration file is provided in the section below.
+
+NOTES:
+  (i) if "hdid=" parameter is omitted all available mport will be assigned
+  destination ID = -1;
+
+  (ii) the "hdid=" parameter in systems with multiple mports can have
+  destination ID assignments omitted from the end of list (default = -1).
+
+If the host device ID for a specific master port is set to -1, the discovery
+process will be performed for it.
+
+The enumeration and discovery routines use RapidIO maintenance transactions
+to access the configuration space of devices.
+
+NOTE: If RapidIO switch-specific device drivers are built as loadable modules
+they must be loaded before enumeration/discovery process starts.
+This requirement is cased by the fact that enumeration/discovery methods invoke
+vendor-specific callbacks on early stages.
+
+4.2 Automatic Start of Enumeration and Discovery
+------------------------------------------------
+
+Automatic enumeration/discovery start method is applicable only to built-in
+enumeration/discovery RapidIO configuration selection. To enable automatic
+enumeration/discovery start by existing basic enumerator method set use boot
+command line parameter "rio-scan.scan=1".
+
+This configuration requires synchronized start of all RapidIO endpoints that
+form a network which will be enumerated/discovered. Discovering endpoints have
+to be started before an enumeration starts to ensure that all RapidIO
+controllers have been initialized and are ready to be discovered. Configuration
+parameter CONFIG_RAPIDIO_DISC_TIMEOUT defines time (in seconds) which
+a discovering endpoint will wait for enumeration to be completed.
+
+When automatic enumeration/discovery start is selected, basic method's
+initialization routine calls rio_init_mports() to perform enumeration or
+discovery for all known mport devices.
+
+Depending on RapidIO network size and configuration this automatic
+enumeration/discovery start method may be difficult to use due to the
+requirement for synchronized start of all endpoints.
+
+4.3 User-space Start of Enumeration and Discovery
+-------------------------------------------------
+
+User-space start of enumeration and discovery can be used with built-in and
+modular build configurations. For user-space controlled start RapidIO subsystem
+creates the sysfs write-only attribute file '/sys/bus/rapidio/scan'. To initiate
+an enumeration or discovery process on specific mport device, a user needs to
+write mport_ID (not RapidIO destination ID) into that file. The mport_ID is a
+sequential number (0 ... RIO_MAX_MPORTS) assigned during mport device
+registration. For example for machine with single RapidIO controller, mport_ID
+for that controller always will be 0.
+
+To initiate RapidIO enumeration/discovery on all available mports a user may
+write '-1' (or RIO_MPORT_ANY) into the scan attribute file.
+
+4.4 Basic Enumeration Method
+----------------------------
+
+This is an original enumeration/discovery method which is available since
+first release of RapidIO subsystem code. The enumeration process is
+implemented according to the enumeration algorithm outlined in the RapidIO
+Interconnect Specification: Annex I [1].
+
+This method can be configured as statically linked or loadable module.
+The method's single parameter "scan" allows to trigger the enumeration/discovery
+process from module initialization routine.
+
+This enumeration/discovery method can be started only once and does not support
+unloading if it is built as a module.
+
+The enumeration process traverses the network using a recursive depth-first
+algorithm. When a new device is found, the enumerator takes ownership of that
+device by writing into the Host Device ID Lock CSR. It does this to ensure that
+the enumerator has exclusive right to enumerate the device. If device ownership
+is successfully acquired, the enumerator allocates a new rio_dev structure and
+initializes it according to device capabilities.
+
+If the device is an endpoint, a unique device ID is assigned to it and its value
+is written into the device's Base Device ID CSR.
+
+If the device is a switch, the enumerator allocates an additional rio_switch
+structure to store switch specific information. Then the switch's vendor ID and
+device ID are queried against a table of known RapidIO switches. Each switch
+table entry contains a pointer to a switch-specific initialization routine that
+initializes pointers to the rest of switch specific operations, and performs
+hardware initialization if necessary. A RapidIO switch does not have a unique
+device ID; it relies on hopcount and routing for device ID of an attached
+endpoint if access to its configuration registers is required. If a switch (or
+chain of switches) does not have any endpoint (except enumerator) attached to
+it, a fake device ID will be assigned to configure a route to that switch.
+In the case of a chain of switches without endpoint, one fake device ID is used
+to configure a route through the entire chain and switches are differentiated by
+their hopcount value.
+
+For both endpoints and switches the enumerator writes a unique component tag
+into device's Component Tag CSR. That unique value is used by the error
+management notification mechanism to identify a device that is reporting an
+error management event.
+
+Enumeration beyond a switch is completed by iterating over each active egress
+port of that switch. For each active link, a route to a default device ID
+(0xFF for 8-bit systems and 0xFFFF for 16-bit systems) is temporarily written
+into the routing table. The algorithm recurs by calling itself with hopcount + 1
+and the default device ID in order to access the device on the active port.
+
+After the host has completed enumeration of the entire network it releases
+devices by clearing device ID locks (calls rio_clear_locks()). For each endpoint
+in the system, it sets the Discovered bit in the Port General Control CSR
+to indicate that enumeration is completed and agents are allowed to execute
+passive discovery of the network.
+
+The discovery process is performed by agents and is similar to the enumeration
+process that is described above. However, the discovery process is performed
+without changes to the existing routing because agents only gather information
+about RapidIO network structure and are building an internal map of discovered
+devices. This way each Linux-based component of the RapidIO subsystem has
+a complete view of the network. The discovery process can be performed
+simultaneously by several agents. After initializing its RapidIO master port
+each agent waits for enumeration completion by the host for the configured wait
+time period. If this wait time period expires before enumeration is completed,
+an agent skips RapidIO discovery and continues with remaining kernel
+initialization.
+
+4.5 Adding New Enumeration/Discovery Method
+-------------------------------------------
+
+RapidIO subsystem code organization allows addition of new enumeration/discovery
+methods as new configuration options without significant impact to the core
+RapidIO code.
+
+A new enumeration/discovery method has to be attached to one or more mport
+devices before an enumeration/discovery process can be started. Normally,
+method's module initialization routine calls rio_register_scan() to attach
+an enumerator to a specified mport device (or devices). The basic enumerator
+implementation demonstrates this process.
+
+4.6 Using Loadable RapidIO Switch Drivers
+-----------------------------------------
+
+In the case when RapidIO switch drivers are built as loadable modules a user
+must ensure that they are loaded before the enumeration/discovery starts.
+This process can be automated by specifying pre- or post- dependencies in the
+RapidIO-specific modprobe configuration file as shown in the example below.
+
+File /etc/modprobe.d/rapidio.conf::
+
+  # Configure RapidIO subsystem modules
+
+  # Set enumerator host destination ID (overrides kernel command line option)
+  options rapidio hdid=-1,2
+
+  # Load RapidIO switch drivers immediately after rapidio core module was loaded
+  softdep rapidio post: idt_gen2 idtcps tsi57x
+
+  # OR :
+
+  # Load RapidIO switch drivers just before rio-scan enumerator module is loaded
+  softdep rio-scan pre: idt_gen2 idtcps tsi57x
+
+  --------------------------
+
+NOTE:
+  In the example above, one of "softdep" commands must be removed or
+  commented out to keep required module loading sequence.
+
+5. References
+=============
+
+[1] RapidIO Trade Association. RapidIO Interconnect Specifications.
+    http://www.rapidio.org.
+
+[2] Rapidio TA. Technology Comparisons.
+    http://www.rapidio.org/education/technology_comparisons/
+
+[3] RapidIO support for Linux.
+    http://lwn.net/Articles/139118/
+
+[4] Matt Porter. RapidIO for Linux. Ottawa Linux Symposium, 2005
+    http://www.kernel.org/doc/ols/2005/ols2005v2-pages-43-56.pdf
diff --git a/Documentation/driver-api/rapidio/rio_cm.rst b/Documentation/driver-api/rapidio/rio_cm.rst
new file mode 100644
index 000000000000..5294430a7a74
--- /dev/null
+++ b/Documentation/driver-api/rapidio/rio_cm.rst
@@ -0,0 +1,135 @@
+==========================================================================
+RapidIO subsystem Channelized Messaging character device driver (rio_cm.c)
+==========================================================================
+
+
+1. Overview
+===========
+
+This device driver is the result of collaboration within the RapidIO.org
+Software Task Group (STG) between Texas Instruments, Prodrive Technologies,
+Nokia Networks, BAE and IDT.  Additional input was received from other members
+of RapidIO.org.
+
+The objective was to create a character mode driver interface which exposes
+messaging capabilities of RapidIO endpoint devices (mports) directly
+to applications, in a manner that allows the numerous and varied RapidIO
+implementations to interoperate.
+
+This driver (RIO_CM) provides to user-space applications shared access to
+RapidIO mailbox messaging resources.
+
+RapidIO specification (Part 2) defines that endpoint devices may have up to four
+messaging mailboxes in case of multi-packet message (up to 4KB) and
+up to 64 mailboxes if single-packet messages (up to 256 B) are used. In addition
+to protocol definition limitations, a particular hardware implementation can
+have reduced number of messaging mailboxes.  RapidIO aware applications must
+therefore share the messaging resources of a RapidIO endpoint.
+
+Main purpose of this device driver is to provide RapidIO mailbox messaging
+capability to large number of user-space processes by introducing socket-like
+operations using a single messaging mailbox.  This allows applications to
+use the limited RapidIO messaging hardware resources efficiently.
+
+Most of device driver's operations are supported through 'ioctl' system calls.
+
+When loaded this device driver creates a single file system node named rio_cm
+in /dev directory common for all registered RapidIO mport devices.
+
+Following ioctl commands are available to user-space applications:
+
+- RIO_CM_MPORT_GET_LIST:
+    Returns to caller list of local mport devices that
+    support messaging operations (number of entries up to RIO_MAX_MPORTS).
+    Each list entry is combination of mport's index in the system and RapidIO
+    destination ID assigned to the port.
+- RIO_CM_EP_GET_LIST_SIZE:
+    Returns number of messaging capable remote endpoints
+    in a RapidIO network associated with the specified mport device.
+- RIO_CM_EP_GET_LIST:
+    Returns list of RapidIO destination IDs for messaging
+    capable remote endpoints (peers) available in a RapidIO network associated
+    with the specified mport device.
+- RIO_CM_CHAN_CREATE:
+    Creates RapidIO message exchange channel data structure
+    with channel ID assigned automatically or as requested by a caller.
+- RIO_CM_CHAN_BIND:
+    Binds the specified channel data structure to the specified
+    mport device.
+- RIO_CM_CHAN_LISTEN:
+    Enables listening for connection requests on the specified
+    channel.
+- RIO_CM_CHAN_ACCEPT:
+    Accepts a connection request from peer on the specified
+    channel. If wait timeout for this request is specified by a caller it is
+    a blocking call. If timeout set to 0 this is non-blocking call - ioctl
+    handler checks for a pending connection request and if one is not available
+    exits with -EGAIN error status immediately.
+- RIO_CM_CHAN_CONNECT:
+    Sends a connection request to a remote peer/channel.
+- RIO_CM_CHAN_SEND:
+    Sends a data message through the specified channel.
+    The handler for this request assumes that message buffer specified by
+    a caller includes the reserved space for a packet header required by
+    this driver.
+- RIO_CM_CHAN_RECEIVE:
+    Receives a data message through a connected channel.
+    If the channel does not have an incoming message ready to return this ioctl
+    handler will wait for new message until timeout specified by a caller
+    expires. If timeout value is set to 0, ioctl handler uses a default value
+    defined by MAX_SCHEDULE_TIMEOUT.
+- RIO_CM_CHAN_CLOSE:
+    Closes a specified channel and frees associated buffers.
+    If the specified channel is in the CONNECTED state, sends close notification
+    to the remote peer.
+
+The ioctl command codes and corresponding data structures intended for use by
+user-space applications are defined in 'include/uapi/linux/rio_cm_cdev.h'.
+
+2. Hardware Compatibility
+=========================
+
+This device driver uses standard interfaces defined by kernel RapidIO subsystem
+and therefore it can be used with any mport device driver registered by RapidIO
+subsystem with limitations set by available mport HW implementation of messaging
+mailboxes.
+
+3. Module parameters
+====================
+
+- 'dbg_level'
+      - This parameter allows to control amount of debug information
+        generated by this device driver. This parameter is formed by set of
+        bit masks that correspond to the specific functional block.
+        For mask definitions see 'drivers/rapidio/devices/rio_cm.c'
+        This parameter can be changed dynamically.
+        Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
+
+- 'cmbox'
+      - Number of RapidIO mailbox to use (default value is 1).
+        This parameter allows to set messaging mailbox number that will be used
+        within entire RapidIO network. It can be used when default mailbox is
+        used by other device drivers or is not supported by some nodes in the
+        RapidIO network.
+
+- 'chstart'
+      - Start channel number for dynamic assignment. Default value - 256.
+        Allows to exclude channel numbers below this parameter from dynamic
+        allocation to avoid conflicts with software components that use
+        reserved predefined channel numbers.
+
+4. Known problems
+=================
+
+  None.
+
+5. User-space Applications and API Library
+==========================================
+
+Messaging API library and applications that use this device driver are available
+from RapidIO.org.
+
+6. TODO List
+============
+
+- Add support for system notification messages (reserved channel 0).
diff --git a/Documentation/driver-api/rapidio/sysfs.rst b/Documentation/driver-api/rapidio/sysfs.rst
new file mode 100644
index 000000000000..540f72683496
--- /dev/null
+++ b/Documentation/driver-api/rapidio/sysfs.rst
@@ -0,0 +1,7 @@
+=============
+Sysfs entries
+=============
+
+The RapidIO sysfs files have moved to:
+Documentation/ABI/testing/sysfs-bus-rapidio and
+Documentation/ABI/testing/sysfs-class-rapidio
diff --git a/Documentation/driver-api/rapidio/tsi721.rst b/Documentation/driver-api/rapidio/tsi721.rst
new file mode 100644
index 000000000000..42aea438cd20
--- /dev/null
+++ b/Documentation/driver-api/rapidio/tsi721.rst
@@ -0,0 +1,112 @@
+=========================================================================
+RapidIO subsystem mport driver for IDT Tsi721 PCI Express-to-SRIO bridge.
+=========================================================================
+
+1. Overview
+===========
+
+This driver implements all currently defined RapidIO mport callback functions.
+It supports maintenance read and write operations, inbound and outbound RapidIO
+doorbells, inbound maintenance port-writes and RapidIO messaging.
+
+To generate SRIO maintenance transactions this driver uses one of Tsi721 DMA
+channels. This mechanism provides access to larger range of hop counts and
+destination IDs without need for changes in outbound window translation.
+
+RapidIO messaging support uses dedicated messaging channels for each mailbox.
+For inbound messages this driver uses destination ID matching to forward messages
+into the corresponding message queue. Messaging callbacks are implemented to be
+fully compatible with RIONET driver (Ethernet over RapidIO messaging services).
+
+1. Module parameters:
+
+- 'dbg_level'
+      - This parameter allows to control amount of debug information
+        generated by this device driver. This parameter is formed by set of
+        This parameter can be changed bit masks that correspond to the specific
+        functional block.
+        For mask definitions see 'drivers/rapidio/devices/tsi721.h'
+        This parameter can be changed dynamically.
+        Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
+
+- 'dma_desc_per_channel'
+      - This parameter defines number of hardware buffer
+        descriptors allocated for each registered Tsi721 DMA channel.
+        Its default value is 128.
+
+- 'dma_txqueue_sz'
+      - DMA transactions queue size. Defines number of pending
+        transaction requests that can be accepted by each DMA channel.
+        Default value is 16.
+
+- 'dma_sel'
+      - DMA channel selection mask. Bitmask that defines which hardware
+        DMA channels (0 ... 6) will be registered with DmaEngine core.
+        If bit is set to 1, the corresponding DMA channel will be registered.
+        DMA channels not selected by this mask will not be used by this device
+        driver. Default value is 0x7f (use all channels).
+
+- 'pcie_mrrs'
+      - override value for PCIe Maximum Read Request Size (MRRS).
+        This parameter gives an ability to override MRRS value set during PCIe
+        configuration process. Tsi721 supports read request sizes up to 4096B.
+        Value for this parameter must be set as defined by PCIe specification:
+        0 = 128B, 1 = 256B, 2 = 512B, 3 = 1024B, 4 = 2048B and 5 = 4096B.
+        Default value is '-1' (= keep platform setting).
+
+- 'mbox_sel'
+      - RIO messaging MBOX selection mask. This is a bitmask that defines
+        messaging MBOXes are managed by this device driver. Mask bits 0 - 3
+        correspond to MBOX0 - MBOX3. MBOX is under driver's control if the
+        corresponding bit is set to '1'. Default value is 0x0f (= all).
+
+2. Known problems
+=================
+
+  None.
+
+3. DMA Engine Support
+=====================
+
+Tsi721 mport driver supports DMA data transfers between local system memory and
+remote RapidIO devices. This functionality is implemented according to SLAVE
+mode API defined by common Linux kernel DMA Engine framework.
+
+Depending on system requirements RapidIO DMA operations can be included/excluded
+by setting CONFIG_RAPIDIO_DMA_ENGINE option. Tsi721 miniport driver uses seven
+out of eight available BDMA channels to support DMA data transfers.
+One BDMA channel is reserved for generation of maintenance read/write requests.
+
+If Tsi721 mport driver have been built with RAPIDIO_DMA_ENGINE support included,
+this driver will accept DMA-specific module parameter:
+
+  "dma_desc_per_channel"
+			 - defines number of hardware buffer descriptors used by
+                           each BDMA channel of Tsi721 (by default - 128).
+
+4. Version History
+
+  =====   ====================================================================
+  1.1.0   DMA operations re-worked to support data scatter/gather lists larger
+          than hardware buffer descriptors ring.
+  1.0.0   Initial driver release.
+  =====   ====================================================================
+
+5.  License
+===========
+
+  Copyright(c) 2011 Integrated Device Technology, Inc. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the Free
+  Software Foundation; either version 2 of the License, or (at your option)
+  any later version.
+
+  This program is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+  more details.
+
+  You should have received a copy of the GNU General Public License along with
+  this program; if not, write to the Free Software Foundation, Inc.,
+  59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
diff --git a/Documentation/rfkill.txt b/Documentation/driver-api/rfkill.rst
index 7d3684e81df6..7d3684e81df6 100644
--- a/Documentation/rfkill.txt
+++ b/Documentation/driver-api/rfkill.rst
diff --git a/Documentation/driver-api/s390-drivers.rst b/Documentation/driver-api/s390-drivers.rst
index 30e6aa7e160b..5158577bc29b 100644
--- a/Documentation/driver-api/s390-drivers.rst
+++ b/Documentation/driver-api/s390-drivers.rst
@@ -27,7 +27,7 @@ not strictly considered I/O devices. They are considered here as well,
 although they are not the focus of this document.
 
 Some additional information can also be found in the kernel source under
-Documentation/s390/driver-model.txt.
+Documentation/s390/driver-model.rst.
 
 The css bus
 ===========
@@ -38,7 +38,7 @@ into several categories:
 * Standard I/O subchannels, for use by the system. They have a child
   device on the ccw bus and are described below.
 * I/O subchannels bound to the vfio-ccw driver. See
-  Documentation/s390/vfio-ccw.txt.
+  Documentation/s390/vfio-ccw.rst.
 * Message subchannels. No Linux driver currently exists.
 * CHSC subchannels (at most one). The chsc subchannel driver can be used
   to send asynchronous chsc commands.
diff --git a/Documentation/serial/cyclades_z.rst b/Documentation/driver-api/serial/cyclades_z.rst
index 532ff67e2f1c..532ff67e2f1c 100644
--- a/Documentation/serial/cyclades_z.rst
+++ b/Documentation/driver-api/serial/cyclades_z.rst
diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst
new file mode 100644
index 000000000000..31bd4e16fb1f
--- /dev/null
+++ b/Documentation/driver-api/serial/driver.rst
@@ -0,0 +1,549 @@
+====================
+Low Level Serial API
+====================
+
+
+This document is meant as a brief overview of some aspects of the new serial
+driver.  It is not complete, any questions you have should be directed to
+<rmk@arm.linux.org.uk>
+
+The reference implementation is contained within amba-pl011.c.
+
+
+
+Low Level Serial Hardware Driver
+--------------------------------
+
+The low level serial hardware driver is responsible for supplying port
+information (defined by uart_port) and a set of control methods (defined
+by uart_ops) to the core serial driver.  The low level driver is also
+responsible for handling interrupts for the port, and providing any
+console support.
+
+
+Console Support
+---------------
+
+The serial core provides a few helper functions.  This includes identifing
+the correct port structure (via uart_get_console) and decoding command line
+arguments (uart_parse_options).
+
+There is also a helper function (uart_console_write) which performs a
+character by character write, translating newlines to CRLF sequences.
+Driver writers are recommended to use this function rather than implementing
+their own version.
+
+
+Locking
+-------
+
+It is the responsibility of the low level hardware driver to perform the
+necessary locking using port->lock.  There are some exceptions (which
+are described in the uart_ops listing below.)
+
+There are two locks.  A per-port spinlock, and an overall semaphore.
+
+From the core driver perspective, the port->lock locks the following
+data::
+
+	port->mctrl
+	port->icount
+	port->state->xmit.head (circ_buf->head)
+	port->state->xmit.tail (circ_buf->tail)
+
+The low level driver is free to use this lock to provide any additional
+locking.
+
+The port_sem semaphore is used to protect against ports being added/
+removed or reconfigured at inappropriate times. Since v2.6.27, this
+semaphore has been the 'mutex' member of the tty_port struct, and
+commonly referred to as the port mutex.
+
+
+uart_ops
+--------
+
+The uart_ops structure is the main interface between serial_core and the
+hardware specific driver.  It contains all the methods to control the
+hardware.
+
+  tx_empty(port)
+	This function tests whether the transmitter fifo and shifter
+	for the port described by 'port' is empty.  If it is empty,
+	this function should return TIOCSER_TEMT, otherwise return 0.
+	If the port does not support this operation, then it should
+	return TIOCSER_TEMT.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+	This call must not sleep
+
+  set_mctrl(port, mctrl)
+	This function sets the modem control lines for port described
+	by 'port' to the state described by mctrl.  The relevant bits
+	of mctrl are:
+
+		- TIOCM_RTS	RTS signal.
+		- TIOCM_DTR	DTR signal.
+		- TIOCM_OUT1	OUT1 signal.
+		- TIOCM_OUT2	OUT2 signal.
+		- TIOCM_LOOP	Set the port into loopback mode.
+
+	If the appropriate bit is set, the signal should be driven
+	active.  If the bit is clear, the signal should be driven
+	inactive.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  get_mctrl(port)
+	Returns the current state of modem control inputs.  The state
+	of the outputs should not be returned, since the core keeps
+	track of their state.  The state information should include:
+
+		- TIOCM_CAR	state of DCD signal
+		- TIOCM_CTS	state of CTS signal
+		- TIOCM_DSR	state of DSR signal
+		- TIOCM_RI	state of RI signal
+
+	The bit is set if the signal is currently driven active.  If
+	the port does not support CTS, DCD or DSR, the driver should
+	indicate that the signal is permanently active.  If RI is
+	not available, the signal should not be indicated as active.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  stop_tx(port)
+	Stop transmitting characters.  This might be due to the CTS
+	line becoming inactive or the tty layer indicating we want
+	to stop transmission due to an XOFF character.
+
+	The driver should stop transmitting characters as soon as
+	possible.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  start_tx(port)
+	Start transmitting characters.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  throttle(port)
+	Notify the serial driver that input buffers for the line discipline are
+	close to full, and it should somehow signal that no more characters
+	should be sent to the serial port.
+	This will be called only if hardware assisted flow control is enabled.
+
+	Locking: serialized with .unthrottle() and termios modification by the
+	tty layer.
+
+  unthrottle(port)
+	Notify the serial driver that characters can now be sent to the serial
+	port without fear of overrunning the input buffers of the line
+	disciplines.
+
+	This will be called only if hardware assisted flow control is enabled.
+
+	Locking: serialized with .throttle() and termios modification by the
+	tty layer.
+
+  send_xchar(port,ch)
+	Transmit a high priority character, even if the port is stopped.
+	This is used to implement XON/XOFF flow control and tcflow().  If
+	the serial driver does not implement this function, the tty core
+	will append the character to the circular buffer and then call
+	start_tx() / stop_tx() to flush the data out.
+
+	Do not transmit if ch == '\0' (__DISABLED_CHAR).
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  stop_rx(port)
+	Stop receiving characters; the port is in the process of
+	being closed.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  enable_ms(port)
+	Enable the modem status interrupts.
+
+	This method may be called multiple times.  Modem status
+	interrupts should be disabled when the shutdown method is
+	called.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  break_ctl(port,ctl)
+	Control the transmission of a break signal.  If ctl is
+	nonzero, the break signal should be transmitted.  The signal
+	should be terminated when another call is made with a zero
+	ctl.
+
+	Locking: caller holds tty_port->mutex
+
+  startup(port)
+	Grab any interrupt resources and initialise any low level driver
+	state.  Enable the port for reception.  It should not activate
+	RTS nor DTR; this will be done via a separate call to set_mctrl.
+
+	This method will only be called when the port is initially opened.
+
+	Locking: port_sem taken.
+
+	Interrupts: globally disabled.
+
+  shutdown(port)
+	Disable the port, disable any break condition that may be in
+	effect, and free any interrupt resources.  It should not disable
+	RTS nor DTR; this will have already been done via a separate
+	call to set_mctrl.
+
+	Drivers must not access port->state once this call has completed.
+
+	This method will only be called when there are no more users of
+	this port.
+
+	Locking: port_sem taken.
+
+	Interrupts: caller dependent.
+
+  flush_buffer(port)
+	Flush any write buffers, reset any DMA state and stop any
+	ongoing DMA transfers.
+
+	This will be called whenever the port->state->xmit circular
+	buffer is cleared.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  set_termios(port,termios,oldtermios)
+	Change the port parameters, including word length, parity, stop
+	bits.  Update read_status_mask and ignore_status_mask to indicate
+	the types of events we are interested in receiving.  Relevant
+	termios->c_cflag bits are:
+
+		CSIZE
+			- word size
+		CSTOPB
+			- 2 stop bits
+		PARENB
+			- parity enable
+		PARODD
+			- odd parity (when PARENB is in force)
+		CREAD
+			- enable reception of characters (if not set,
+			  still receive characters from the port, but
+			  throw them away.
+		CRTSCTS
+			- if set, enable CTS status change reporting
+		CLOCAL
+			- if not set, enable modem status change
+			  reporting.
+
+	Relevant termios->c_iflag bits are:
+
+		INPCK
+			- enable frame and parity error events to be
+			  passed to the TTY layer.
+		BRKINT / PARMRK
+			- both of these enable break events to be
+			  passed to the TTY layer.
+
+		IGNPAR
+			- ignore parity and framing errors
+		IGNBRK
+			- ignore break errors,  If IGNPAR is also
+			  set, ignore overrun errors as well.
+
+	The interaction of the iflag bits is as follows (parity error
+	given as an example):
+
+	=============== ======= ======  =============================
+	Parity error	INPCK	IGNPAR
+	=============== ======= ======  =============================
+	n/a		0	n/a	character received, marked as
+					TTY_NORMAL
+	None		1	n/a	character received, marked as
+					TTY_NORMAL
+	Yes		1	0	character received, marked as
+					TTY_PARITY
+	Yes		1	1	character discarded
+	=============== ======= ======  =============================
+
+	Other flags may be used (eg, xon/xoff characters) if your
+	hardware supports hardware "soft" flow control.
+
+	Locking: caller holds tty_port->mutex
+
+	Interrupts: caller dependent.
+
+	This call must not sleep
+
+  set_ldisc(port,termios)
+	Notifier for discipline change. See Documentation/driver-api/serial/tty.rst.
+
+	Locking: caller holds tty_port->mutex
+
+  pm(port,state,oldstate)
+	Perform any power management related activities on the specified
+	port.  State indicates the new state (defined by
+	enum uart_pm_state), oldstate indicates the previous state.
+
+	This function should not be used to grab any resources.
+
+	This will be called when the port is initially opened and finally
+	closed, except when the port is also the system console.  This
+	will occur even if CONFIG_PM is not set.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  type(port)
+	Return a pointer to a string constant describing the specified
+	port, or return NULL, in which case the string 'unknown' is
+	substituted.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  release_port(port)
+	Release any memory and IO region resources currently in use by
+	the port.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  request_port(port)
+	Request any memory and IO region resources required by the port.
+	If any fail, no resources should be registered when this function
+	returns, and it should return -EBUSY on failure.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  config_port(port,type)
+	Perform any autoconfiguration steps required for the port.  `type`
+	contains a bit mask of the required configuration.  UART_CONFIG_TYPE
+	indicates that the port requires detection and identification.
+	port->type should be set to the type found, or PORT_UNKNOWN if
+	no port was detected.
+
+	UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
+	which should be probed using standard kernel autoprobing techniques.
+	This is not necessary on platforms where ports have interrupts
+	internally hard wired (eg, system on a chip implementations).
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  verify_port(port,serinfo)
+	Verify the new serial port information contained within serinfo is
+	suitable for this port type.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  ioctl(port,cmd,arg)
+	Perform any port specific IOCTLs.  IOCTL commands must be defined
+	using the standard numbering system found in <asm/ioctl.h>
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  poll_init(port)
+	Called by kgdb to perform the minimal hardware initialization needed
+	to support poll_put_char() and poll_get_char().  Unlike ->startup()
+	this should not request interrupts.
+
+	Locking: tty_mutex and tty_port->mutex taken.
+
+	Interrupts: n/a.
+
+  poll_put_char(port,ch)
+	Called by kgdb to write a single character directly to the serial
+	port.  It can and should block until there is space in the TX FIFO.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+	This call must not sleep
+
+  poll_get_char(port)
+	Called by kgdb to read a single character directly from the serial
+	port.  If data is available, it should be returned; otherwise
+	the function should return NO_POLL_CHAR immediately.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+	This call must not sleep
+
+Other functions
+---------------
+
+uart_update_timeout(port,cflag,baud)
+	Update the FIFO drain timeout, port->timeout, according to the
+	number of bits, parity, stop bits and baud rate.
+
+	Locking: caller is expected to take port->lock
+
+	Interrupts: n/a
+
+uart_get_baud_rate(port,termios,old,min,max)
+	Return the numeric baud rate for the specified termios, taking
+	account of the special 38400 baud "kludge".  The B0 baud rate
+	is mapped to 9600 baud.
+
+	If the baud rate is not within min..max, then if old is non-NULL,
+	the original baud rate will be tried.  If that exceeds the
+	min..max constraint, 9600 baud will be returned.  termios will
+	be updated to the baud rate in use.
+
+	Note: min..max must always allow 9600 baud to be selected.
+
+	Locking: caller dependent.
+
+	Interrupts: n/a
+
+uart_get_divisor(port,baud)
+	Return the divisor (baud_base / baud) for the specified baud
+	rate, appropriately rounded.
+
+	If 38400 baud and custom divisor is selected, return the
+	custom divisor instead.
+
+	Locking: caller dependent.
+
+	Interrupts: n/a
+
+uart_match_port(port1,port2)
+	This utility function can be used to determine whether two
+	uart_port structures describe the same port.
+
+	Locking: n/a
+
+	Interrupts: n/a
+
+uart_write_wakeup(port)
+	A driver is expected to call this function when the number of
+	characters in the transmit buffer have dropped below a threshold.
+
+	Locking: port->lock should be held.
+
+	Interrupts: n/a
+
+uart_register_driver(drv)
+	Register a uart driver with the core driver.  We in turn register
+	with the tty layer, and initialise the core driver per-port state.
+
+	drv->port should be NULL, and the per-port structures should be
+	registered using uart_add_one_port after this call has succeeded.
+
+	Locking: none
+
+	Interrupts: enabled
+
+uart_unregister_driver()
+	Remove all references to a driver from the core driver.  The low
+	level driver must have removed all its ports via the
+	uart_remove_one_port() if it registered them with uart_add_one_port().
+
+	Locking: none
+
+	Interrupts: enabled
+
+**uart_suspend_port()**
+
+**uart_resume_port()**
+
+**uart_add_one_port()**
+
+**uart_remove_one_port()**
+
+Other notes
+-----------
+
+It is intended some day to drop the 'unused' entries from uart_port, and
+allow low level drivers to register their own individual uart_port's with
+the core.  This will allow drivers to use uart_port as a pointer to a
+structure containing both the uart_port entry with their own extensions,
+thus::
+
+	struct my_port {
+		struct uart_port	port;
+		int			my_stuff;
+	};
+
+Modem control lines via GPIO
+----------------------------
+
+Some helpers are provided in order to set/get modem control lines via GPIO.
+
+mctrl_gpio_init(port, idx):
+	This will get the {cts,rts,...}-gpios from device tree if they are
+	present and request them, set direction etc, and return an
+	allocated structure. `devm_*` functions are used, so there's no need
+	to call mctrl_gpio_free().
+	As this sets up the irq handling make sure to not handle changes to the
+	gpio input lines in your driver, too.
+
+mctrl_gpio_free(dev, gpios):
+	This will free the requested gpios in mctrl_gpio_init().
+	As `devm_*` functions are used, there's generally no need to call
+	this function.
+
+mctrl_gpio_to_gpiod(gpios, gidx)
+	This returns the gpio_desc structure associated to the modem line
+	index.
+
+mctrl_gpio_set(gpios, mctrl):
+	This will sets the gpios according to the mctrl state.
+
+mctrl_gpio_get(gpios, mctrl):
+	This will update mctrl with the gpios values.
+
+mctrl_gpio_enable_ms(gpios):
+	Enables irqs and handling of changes to the ms lines.
+
+mctrl_gpio_disable_ms(gpios):
+	Disables irqs and handling of changes to the ms lines.
diff --git a/Documentation/driver-api/serial/index.rst b/Documentation/driver-api/serial/index.rst
new file mode 100644
index 000000000000..33ad10d05b26
--- /dev/null
+++ b/Documentation/driver-api/serial/index.rst
@@ -0,0 +1,32 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+Support for Serial devices
+==========================
+
+.. toctree::
+    :maxdepth: 1
+
+
+    driver
+    tty
+
+Serial drivers
+==============
+
+.. toctree::
+    :maxdepth: 1
+
+    cyclades_z
+    moxa-smartio
+    n_gsm
+    rocket
+    serial-iso7816
+    serial-rs485
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/serial/moxa-smartio.rst b/Documentation/driver-api/serial/moxa-smartio.rst
index 156100f17c3f..156100f17c3f 100644
--- a/Documentation/serial/moxa-smartio.rst
+++ b/Documentation/driver-api/serial/moxa-smartio.rst
diff --git a/Documentation/serial/n_gsm.rst b/Documentation/driver-api/serial/n_gsm.rst
index f3ad9fd26408..f3ad9fd26408 100644
--- a/Documentation/serial/n_gsm.rst
+++ b/Documentation/driver-api/serial/n_gsm.rst
diff --git a/Documentation/serial/rocket.rst b/Documentation/driver-api/serial/rocket.rst
index 23761eae4282..23761eae4282 100644
--- a/Documentation/serial/rocket.rst
+++ b/Documentation/driver-api/serial/rocket.rst
diff --git a/Documentation/serial/serial-iso7816.rst b/Documentation/driver-api/serial/serial-iso7816.rst
index d990143de0c6..d990143de0c6 100644
--- a/Documentation/serial/serial-iso7816.rst
+++ b/Documentation/driver-api/serial/serial-iso7816.rst
diff --git a/Documentation/serial/serial-rs485.rst b/Documentation/driver-api/serial/serial-rs485.rst
index 6bc824f948f9..6bc824f948f9 100644
--- a/Documentation/serial/serial-rs485.rst
+++ b/Documentation/driver-api/serial/serial-rs485.rst
diff --git a/Documentation/serial/tty.rst b/Documentation/driver-api/serial/tty.rst
index dd972caacf3e..dd972caacf3e 100644
--- a/Documentation/serial/tty.rst
+++ b/Documentation/driver-api/serial/tty.rst
diff --git a/Documentation/sgi-ioc4.txt b/Documentation/driver-api/sgi-ioc4.rst
index 72709222d3c0..72709222d3c0 100644
--- a/Documentation/sgi-ioc4.txt
+++ b/Documentation/driver-api/sgi-ioc4.rst
diff --git a/Documentation/SM501.txt b/Documentation/driver-api/sm501.rst
index 882507453ba4..882507453ba4 100644
--- a/Documentation/SM501.txt
+++ b/Documentation/driver-api/sm501.rst
diff --git a/Documentation/smsc_ece1099.txt b/Documentation/driver-api/smsc_ece1099.rst
index 079277421eaf..079277421eaf 100644
--- a/Documentation/smsc_ece1099.txt
+++ b/Documentation/driver-api/smsc_ece1099.rst
diff --git a/Documentation/driver-api/soundwire/locking.rst b/Documentation/driver-api/soundwire/locking.rst
index 253f73555255..3a7ffb3d87f3 100644
--- a/Documentation/driver-api/soundwire/locking.rst
+++ b/Documentation/driver-api/soundwire/locking.rst
@@ -44,7 +44,9 @@ Message transfer.
      b. Transfer message (Read/Write) to Slave1 or broadcast message on
         Bus in case of bank switch.
 
-     c. Release Message lock ::
+     c. Release Message lock
+
+     ::
 
 	+----------+                    +---------+
 	|          |                    |         |
diff --git a/Documentation/driver-api/switchtec.rst b/Documentation/driver-api/switchtec.rst
new file mode 100644
index 000000000000..7611fdc53e19
--- /dev/null
+++ b/Documentation/driver-api/switchtec.rst
@@ -0,0 +1,102 @@
+========================
+Linux Switchtec Support
+========================
+
+Microsemi's "Switchtec" line of PCI switch devices is already
+supported by the kernel with standard PCI switch drivers. However, the
+Switchtec device advertises a special management endpoint which
+enables some additional functionality. This includes:
+
+* Packet and Byte Counters
+* Firmware Upgrades
+* Event and Error logs
+* Querying port link status
+* Custom user firmware commands
+
+The switchtec kernel module implements this functionality.
+
+
+Interface
+=========
+
+The primary means of communicating with the Switchtec management firmware is
+through the Memory-mapped Remote Procedure Call (MRPC) interface.
+Commands are submitted to the interface with a 4-byte command
+identifier and up to 1KB of command specific data. The firmware will
+respond with a 4-byte return code and up to 1KB of command-specific
+data. The interface only processes a single command at a time.
+
+
+Userspace Interface
+===================
+
+The MRPC interface will be exposed to userspace through a simple char
+device: /dev/switchtec#, one for each management endpoint in the system.
+
+The char device has the following semantics:
+
+* A write must consist of at least 4 bytes and no more than 1028 bytes.
+  The first 4 bytes will be interpreted as the Command ID and the
+  remainder will be used as the input data. A write will send the
+  command to the firmware to begin processing.
+
+* Each write must be followed by exactly one read. Any double write will
+  produce an error and any read that doesn't follow a write will
+  produce an error.
+
+* A read will block until the firmware completes the command and return
+  the 4-byte Command Return Value plus up to 1024 bytes of output
+  data. (The length will be specified by the size parameter of the read
+  call -- reading less than 4 bytes will produce an error.)
+
+* The poll call will also be supported for userspace applications that
+  need to do other things while waiting for the command to complete.
+
+The following IOCTLs are also supported by the device:
+
+* SWITCHTEC_IOCTL_FLASH_INFO - Retrieve firmware length and number
+  of partitions in the device.
+
+* SWITCHTEC_IOCTL_FLASH_PART_INFO - Retrieve address and lengeth for
+  any specified partition in flash.
+
+* SWITCHTEC_IOCTL_EVENT_SUMMARY - Read a structure of bitmaps
+  indicating all uncleared events.
+
+* SWITCHTEC_IOCTL_EVENT_CTL - Get the current count, clear and set flags
+  for any event. This ioctl takes in a switchtec_ioctl_event_ctl struct
+  with the event_id, index and flags set (index being the partition or PFF
+  number for non-global events). It returns whether the event has
+  occurred, the number of times and any event specific data. The flags
+  can be used to clear the count or enable and disable actions to
+  happen when the event occurs.
+  By using the SWITCHTEC_IOCTL_EVENT_FLAG_EN_POLL flag,
+  you can set an event to trigger a poll command to return with
+  POLLPRI. In this way, userspace can wait for events to occur.
+
+* SWITCHTEC_IOCTL_PFF_TO_PORT and SWITCHTEC_IOCTL_PORT_TO_PFF convert
+  between PCI Function Framework number (used by the event system)
+  and Switchtec Logic Port ID and Partition number (which is more
+  user friendly).
+
+
+Non-Transparent Bridge (NTB) Driver
+===================================
+
+An NTB hardware driver is provided for the Switchtec hardware in
+ntb_hw_switchtec. Currently, it only supports switches configured with
+exactly 2 NT partitions and zero or more non-NT partitions. It also requires
+the following configuration settings:
+
+* Both NT partitions must be able to access each other's GAS spaces.
+  Thus, the bits in the GAS Access Vector under Management Settings
+  must be set to support this.
+* Kernel configuration MUST include support for NTB (CONFIG_NTB needs
+  to be set)
+
+NT EP BAR 2 will be dynamically configured as a Direct Window, and
+the configuration file does not need to configure it explicitly.
+
+Please refer to Documentation/driver-api/ntb.rst in Linux source tree for an overall
+understanding of the Linux NTB stack. ntb_hw_switchtec works as an NTB
+Hardware Driver in this stack.
diff --git a/Documentation/sync_file.txt b/Documentation/driver-api/sync_file.rst
index 496fb2c3b3e6..496fb2c3b3e6 100644
--- a/Documentation/sync_file.txt
+++ b/Documentation/driver-api/sync_file.rst
diff --git a/Documentation/driver-api/target.rst b/Documentation/driver-api/target.rst
index 4363611dd86d..620ec6173a93 100644
--- a/Documentation/driver-api/target.rst
+++ b/Documentation/driver-api/target.rst
@@ -10,8 +10,8 @@ TBD
 Target core device interfaces
 =============================
 
-.. kernel-doc:: drivers/target/target_core_device.c
-    :export:
+This section is blank because no kerneldoc comments have been added to
+drivers/target/target_core_device.c.
 
 Target core transport interfaces
 ================================
diff --git a/Documentation/driver-api/uio-howto.rst b/Documentation/driver-api/uio-howto.rst
index 25f50eace28b..8fecfa11d4ff 100644
--- a/Documentation/driver-api/uio-howto.rst
+++ b/Documentation/driver-api/uio-howto.rst
@@ -276,8 +276,8 @@ fields of ``struct uio_mem``:
 -  ``int memtype``: Required if the mapping is used. Set this to
    ``UIO_MEM_PHYS`` if you you have physical memory on your card to be
    mapped. Use ``UIO_MEM_LOGICAL`` for logical memory (e.g. allocated
-   with :c:func:`kmalloc()`). There's also ``UIO_MEM_VIRTUAL`` for
-   virtual memory.
+   with :c:func:`__get_free_pages()` but not kmalloc()). There's also
+   ``UIO_MEM_VIRTUAL`` for virtual memory.
 
 -  ``phys_addr_t addr``: Required if the mapping is used. Fill in the
    address of your memory block. This address is the one that appears in
diff --git a/Documentation/driver-api/usb/power-management.rst b/Documentation/driver-api/usb/power-management.rst
index 4a74cf6f2797..2525c3622cae 100644
--- a/Documentation/driver-api/usb/power-management.rst
+++ b/Documentation/driver-api/usb/power-management.rst
@@ -46,7 +46,7 @@ device is turned off while the system as a whole remains running, we
 call it a "dynamic suspend" (also known as a "runtime suspend" or
 "selective suspend").  This document concentrates mostly on how
 dynamic PM is implemented in the USB subsystem, although system PM is
-covered to some extent (see ``Documentation/power/*.txt`` for more
+covered to some extent (see ``Documentation/power/*.rst`` for more
 information about system PM).
 
 System PM support is present only if the kernel was built with
diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
new file mode 100644
index 000000000000..25eb7d5b834b
--- /dev/null
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -0,0 +1,414 @@
+.. include:: <isonum.txt>
+
+=====================
+VFIO Mediated devices
+=====================
+
+:Copyright: |copy| 2016, NVIDIA CORPORATION. All rights reserved.
+:Author: Neo Jia <cjia@nvidia.com>
+:Author: Kirti Wankhede <kwankhede@nvidia.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License version 2 as
+published by the Free Software Foundation.
+
+
+Virtual Function I/O (VFIO) Mediated devices[1]
+===============================================
+
+The number of use cases for virtualizing DMA devices that do not have built-in
+SR_IOV capability is increasing. Previously, to virtualize such devices,
+developers had to create their own management interfaces and APIs, and then
+integrate them with user space software. To simplify integration with user space
+software, we have identified common requirements and a unified management
+interface for such devices.
+
+The VFIO driver framework provides unified APIs for direct device access. It is
+an IOMMU/device-agnostic framework for exposing direct device access to user
+space in a secure, IOMMU-protected environment. This framework is used for
+multiple devices, such as GPUs, network adapters, and compute accelerators. With
+direct device access, virtual machines or user space applications have direct
+access to the physical device. This framework is reused for mediated devices.
+
+The mediated core driver provides a common interface for mediated device
+management that can be used by drivers of different devices. This module
+provides a generic interface to perform these operations:
+
+* Create and destroy a mediated device
+* Add a mediated device to and remove it from a mediated bus driver
+* Add a mediated device to and remove it from an IOMMU group
+
+The mediated core driver also provides an interface to register a bus driver.
+For example, the mediated VFIO mdev driver is designed for mediated devices and
+supports VFIO APIs. The mediated bus driver adds a mediated device to and
+removes it from a VFIO group.
+
+The following high-level block diagram shows the main components and interfaces
+in the VFIO mediated driver framework. The diagram shows NVIDIA, Intel, and IBM
+devices as examples, as these devices are the first devices to use this module::
+
+     +---------------+
+     |               |
+     | +-----------+ |  mdev_register_driver() +--------------+
+     | |           | +<------------------------+              |
+     | |  mdev     | |                         |              |
+     | |  bus      | +------------------------>+ vfio_mdev.ko |<-> VFIO user
+     | |  driver   | |     probe()/remove()    |              |    APIs
+     | |           | |                         +--------------+
+     | +-----------+ |
+     |               |
+     |  MDEV CORE    |
+     |   MODULE      |
+     |   mdev.ko     |
+     | +-----------+ |  mdev_register_device() +--------------+
+     | |           | +<------------------------+              |
+     | |           | |                         |  nvidia.ko   |<-> physical
+     | |           | +------------------------>+              |    device
+     | |           | |        callbacks        +--------------+
+     | | Physical  | |
+     | |  device   | |  mdev_register_device() +--------------+
+     | | interface | |<------------------------+              |
+     | |           | |                         |  i915.ko     |<-> physical
+     | |           | +------------------------>+              |    device
+     | |           | |        callbacks        +--------------+
+     | |           | |
+     | |           | |  mdev_register_device() +--------------+
+     | |           | +<------------------------+              |
+     | |           | |                         | ccw_device.ko|<-> physical
+     | |           | +------------------------>+              |    device
+     | |           | |        callbacks        +--------------+
+     | +-----------+ |
+     +---------------+
+
+
+Registration Interfaces
+=======================
+
+The mediated core driver provides the following types of registration
+interfaces:
+
+* Registration interface for a mediated bus driver
+* Physical device driver interface
+
+Registration Interface for a Mediated Bus Driver
+------------------------------------------------
+
+The registration interface for a mediated bus driver provides the following
+structure to represent a mediated device's driver::
+
+     /*
+      * struct mdev_driver [2] - Mediated device's driver
+      * @name: driver name
+      * @probe: called when new device created
+      * @remove: called when device removed
+      * @driver: device driver structure
+      */
+     struct mdev_driver {
+	     const char *name;
+	     int  (*probe)  (struct device *dev);
+	     void (*remove) (struct device *dev);
+	     struct device_driver    driver;
+     };
+
+A mediated bus driver for mdev should use this structure in the function calls
+to register and unregister itself with the core driver:
+
+* Register::
+
+    extern int  mdev_register_driver(struct mdev_driver *drv,
+				   struct module *owner);
+
+* Unregister::
+
+    extern void mdev_unregister_driver(struct mdev_driver *drv);
+
+The mediated bus driver is responsible for adding mediated devices to the VFIO
+group when devices are bound to the driver and removing mediated devices from
+the VFIO when devices are unbound from the driver.
+
+
+Physical Device Driver Interface
+--------------------------------
+
+The physical device driver interface provides the mdev_parent_ops[3] structure
+to define the APIs to manage work in the mediated core driver that is related
+to the physical device.
+
+The structures in the mdev_parent_ops structure are as follows:
+
+* dev_attr_groups: attributes of the parent device
+* mdev_attr_groups: attributes of the mediated device
+* supported_config: attributes to define supported configurations
+
+The functions in the mdev_parent_ops structure are as follows:
+
+* create: allocate basic resources in a driver for a mediated device
+* remove: free resources in a driver when a mediated device is destroyed
+
+(Note that mdev-core provides no implicit serialization of create/remove
+callbacks per mdev parent device, per mdev type, or any other categorization.
+Vendor drivers are expected to be fully asynchronous in this respect or
+provide their own internal resource protection.)
+
+The callbacks in the mdev_parent_ops structure are as follows:
+
+* open: open callback of mediated device
+* close: close callback of mediated device
+* ioctl: ioctl callback of mediated device
+* read : read emulation callback
+* write: write emulation callback
+* mmap: mmap emulation callback
+
+A driver should use the mdev_parent_ops structure in the function call to
+register itself with the mdev core driver::
+
+	extern int  mdev_register_device(struct device *dev,
+	                                 const struct mdev_parent_ops *ops);
+
+However, the mdev_parent_ops structure is not required in the function call
+that a driver should use to unregister itself with the mdev core driver::
+
+	extern void mdev_unregister_device(struct device *dev);
+
+
+Mediated Device Management Interface Through sysfs
+==================================================
+
+The management interface through sysfs enables user space software, such as
+libvirt, to query and configure mediated devices in a hardware-agnostic fashion.
+This management interface provides flexibility to the underlying physical
+device's driver to support features such as:
+
+* Mediated device hot plug
+* Multiple mediated devices in a single virtual machine
+* Multiple mediated devices from different physical devices
+
+Links in the mdev_bus Class Directory
+-------------------------------------
+The /sys/class/mdev_bus/ directory contains links to devices that are registered
+with the mdev core driver.
+
+Directories and files under the sysfs for Each Physical Device
+--------------------------------------------------------------
+
+::
+
+  |- [parent physical device]
+  |--- Vendor-specific-attributes [optional]
+  |--- [mdev_supported_types]
+  |     |--- [<type-id>]
+  |     |   |--- create
+  |     |   |--- name
+  |     |   |--- available_instances
+  |     |   |--- device_api
+  |     |   |--- description
+  |     |   |--- [devices]
+  |     |--- [<type-id>]
+  |     |   |--- create
+  |     |   |--- name
+  |     |   |--- available_instances
+  |     |   |--- device_api
+  |     |   |--- description
+  |     |   |--- [devices]
+  |     |--- [<type-id>]
+  |          |--- create
+  |          |--- name
+  |          |--- available_instances
+  |          |--- device_api
+  |          |--- description
+  |          |--- [devices]
+
+* [mdev_supported_types]
+
+  The list of currently supported mediated device types and their details.
+
+  [<type-id>], device_api, and available_instances are mandatory attributes
+  that should be provided by vendor driver.
+
+* [<type-id>]
+
+  The [<type-id>] name is created by adding the device driver string as a prefix
+  to the string provided by the vendor driver. This format of this name is as
+  follows::
+
+	sprintf(buf, "%s-%s", dev_driver_string(parent->dev), group->name);
+
+  (or using mdev_parent_dev(mdev) to arrive at the parent device outside
+  of the core mdev code)
+
+* device_api
+
+  This attribute should show which device API is being created, for example,
+  "vfio-pci" for a PCI device.
+
+* available_instances
+
+  This attribute should show the number of devices of type <type-id> that can be
+  created.
+
+* [device]
+
+  This directory contains links to the devices of type <type-id> that have been
+  created.
+
+* name
+
+  This attribute should show human readable name. This is optional attribute.
+
+* description
+
+  This attribute should show brief features/description of the type. This is
+  optional attribute.
+
+Directories and Files Under the sysfs for Each mdev Device
+----------------------------------------------------------
+
+::
+
+  |- [parent phy device]
+  |--- [$MDEV_UUID]
+         |--- remove
+         |--- mdev_type {link to its type}
+         |--- vendor-specific-attributes [optional]
+
+* remove (write only)
+
+Writing '1' to the 'remove' file destroys the mdev device. The vendor driver can
+fail the remove() callback if that device is active and the vendor driver
+doesn't support hot unplug.
+
+Example::
+
+	# echo 1 > /sys/bus/mdev/devices/$mdev_UUID/remove
+
+Mediated device Hot plug
+------------------------
+
+Mediated devices can be created and assigned at runtime. The procedure to hot
+plug a mediated device is the same as the procedure to hot plug a PCI device.
+
+Translation APIs for Mediated Devices
+=====================================
+
+The following APIs are provided for translating user pfn to host pfn in a VFIO
+driver::
+
+	extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
+				  int npage, int prot, unsigned long *phys_pfn);
+
+	extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn,
+				    int npage);
+
+These functions call back into the back-end IOMMU module by using the pin_pages
+and unpin_pages callbacks of the struct vfio_iommu_driver_ops[4]. Currently
+these callbacks are supported in the TYPE1 IOMMU module. To enable them for
+other IOMMU backend modules, such as PPC64 sPAPR module, they need to provide
+these two callback functions.
+
+Using the Sample Code
+=====================
+
+mtty.c in samples/vfio-mdev/ directory is a sample driver program to
+demonstrate how to use the mediated device framework.
+
+The sample driver creates an mdev device that simulates a serial port over a PCI
+card.
+
+1. Build and load the mtty.ko module.
+
+   This step creates a dummy device, /sys/devices/virtual/mtty/mtty/
+
+   Files in this device directory in sysfs are similar to the following::
+
+     # tree /sys/devices/virtual/mtty/mtty/
+        /sys/devices/virtual/mtty/mtty/
+        |-- mdev_supported_types
+        |   |-- mtty-1
+        |   |   |-- available_instances
+        |   |   |-- create
+        |   |   |-- device_api
+        |   |   |-- devices
+        |   |   `-- name
+        |   `-- mtty-2
+        |       |-- available_instances
+        |       |-- create
+        |       |-- device_api
+        |       |-- devices
+        |       `-- name
+        |-- mtty_dev
+        |   `-- sample_mtty_dev
+        |-- power
+        |   |-- autosuspend_delay_ms
+        |   |-- control
+        |   |-- runtime_active_time
+        |   |-- runtime_status
+        |   `-- runtime_suspended_time
+        |-- subsystem -> ../../../../class/mtty
+        `-- uevent
+
+2. Create a mediated device by using the dummy device that you created in the
+   previous step::
+
+     # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" >	\
+              /sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-2/create
+
+3. Add parameters to qemu-kvm::
+
+     -device vfio-pci,\
+      sysfsdev=/sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001
+
+4. Boot the VM.
+
+   In the Linux guest VM, with no hardware on the host, the device appears
+   as  follows::
+
+     # lspci -s 00:05.0 -xxvv
+     00:05.0 Serial controller: Device 4348:3253 (rev 10) (prog-if 02 [16550])
+             Subsystem: Device 4348:3253
+             Physical Slot: 5
+             Control: I/O+ Mem- BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr-
+     Stepping- SERR- FastB2B- DisINTx-
+             Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=medium >TAbort-
+     <TAbort- <MAbort- >SERR- <PERR- INTx-
+             Interrupt: pin A routed to IRQ 10
+             Region 0: I/O ports at c150 [size=8]
+             Region 1: I/O ports at c158 [size=8]
+             Kernel driver in use: serial
+     00: 48 43 53 32 01 00 00 02 10 02 00 07 00 00 00 00
+     10: 51 c1 00 00 59 c1 00 00 00 00 00 00 00 00 00 00
+     20: 00 00 00 00 00 00 00 00 00 00 00 00 48 43 53 32
+     30: 00 00 00 00 00 00 00 00 00 00 00 00 0a 01 00 00
+
+     In the Linux guest VM, dmesg output for the device is as follows:
+
+     serial 0000:00:05.0: PCI INT A -> Link[LNKA] -> GSI 10 (level, high) -> IRQ 10
+     0000:00:05.0: ttyS1 at I/O 0xc150 (irq = 10) is a 16550A
+     0000:00:05.0: ttyS2 at I/O 0xc158 (irq = 10) is a 16550A
+
+
+5. In the Linux guest VM, check the serial ports::
+
+     # setserial -g /dev/ttyS*
+     /dev/ttyS0, UART: 16550A, Port: 0x03f8, IRQ: 4
+     /dev/ttyS1, UART: 16550A, Port: 0xc150, IRQ: 10
+     /dev/ttyS2, UART: 16550A, Port: 0xc158, IRQ: 10
+
+6. Using minicom or any terminal emulation program, open port /dev/ttyS1 or
+   /dev/ttyS2 with hardware flow control disabled.
+
+7. Type data on the minicom terminal or send data to the terminal emulation
+   program and read the data.
+
+   Data is loop backed from hosts mtty driver.
+
+8. Destroy the mediated device that you created::
+
+     # echo 1 > /sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001/remove
+
+References
+==========
+
+1. See Documentation/driver-api/vfio.rst for more information on VFIO.
+2. struct mdev_driver in include/linux/mdev.h
+3. struct mdev_parent_ops in include/linux/mdev.h
+4. struct vfio_iommu_driver_ops in include/linux/vfio.h
diff --git a/Documentation/vfio.txt b/Documentation/driver-api/vfio.rst
index f1a4d3c3ba0b..f1a4d3c3ba0b 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/driver-api/vfio.rst
diff --git a/Documentation/driver-api/xilinx/eemi.rst b/Documentation/driver-api/xilinx/eemi.rst
new file mode 100644
index 000000000000..9dcbc6f18d75
--- /dev/null
+++ b/Documentation/driver-api/xilinx/eemi.rst
@@ -0,0 +1,67 @@
+====================================
+Xilinx Zynq MPSoC EEMI Documentation
+====================================
+
+Xilinx Zynq MPSoC Firmware Interface
+-------------------------------------
+The zynqmp-firmware node describes the interface to platform firmware.
+ZynqMP has an interface to communicate with secure firmware. Firmware
+driver provides an interface to firmware APIs. Interface APIs can be
+used by any driver to communicate with PMC(Platform Management Controller).
+
+Embedded Energy Management Interface (EEMI)
+----------------------------------------------
+The embedded energy management interface is used to allow software
+components running across different processing clusters on a chip or
+device to communicate with a power management controller (PMC) on a
+device to issue or respond to power management requests.
+
+EEMI ops is a structure containing all eemi APIs supported by Zynq MPSoC.
+The zynqmp-firmware driver maintain all EEMI APIs in zynqmp_eemi_ops
+structure. Any driver who want to communicate with PMC using EEMI APIs
+can call zynqmp_pm_get_eemi_ops().
+
+Example of EEMI ops::
+
+	/* zynqmp-firmware driver maintain all EEMI APIs */
+	struct zynqmp_eemi_ops {
+		int (*get_api_version)(u32 *version);
+		int (*query_data)(struct zynqmp_pm_query_data qdata, u32 *out);
+	};
+
+	static const struct zynqmp_eemi_ops eemi_ops = {
+		.get_api_version = zynqmp_pm_get_api_version,
+		.query_data = zynqmp_pm_query_data,
+	};
+
+Example of EEMI ops usage::
+
+	static const struct zynqmp_eemi_ops *eemi_ops;
+	u32 ret_payload[PAYLOAD_ARG_CNT];
+	int ret;
+
+	eemi_ops = zynqmp_pm_get_eemi_ops();
+	if (IS_ERR(eemi_ops))
+		return PTR_ERR(eemi_ops);
+
+	ret = eemi_ops->query_data(qdata, ret_payload);
+
+IOCTL
+------
+IOCTL API is for device control and configuration. It is not a system
+IOCTL but it is an EEMI API. This API can be used by master to control
+any device specific configuration. IOCTL definitions can be platform
+specific. This API also manage shared device configuration.
+
+The following IOCTL IDs are valid for device control:
+- IOCTL_SET_PLL_FRAC_MODE	8
+- IOCTL_GET_PLL_FRAC_MODE	9
+- IOCTL_SET_PLL_FRAC_DATA	10
+- IOCTL_GET_PLL_FRAC_DATA	11
+
+Refer EEMI API guide [0] for IOCTL specific parameters and other EEMI APIs.
+
+References
+----------
+[0] Embedded Energy Management Interface (EEMI) API guide:
+    https://www.xilinx.com/support/documentation/user_guides/ug1200-eemi-api.pdf
diff --git a/Documentation/driver-api/xilinx/index.rst b/Documentation/driver-api/xilinx/index.rst
new file mode 100644
index 000000000000..13f7589ed442
--- /dev/null
+++ b/Documentation/driver-api/xilinx/index.rst
@@ -0,0 +1,16 @@
+
+===========
+Xilinx FPGA
+===========
+
+.. toctree::
+    :maxdepth: 1
+
+    eemi
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/xillybus.txt b/Documentation/driver-api/xillybus.rst
index 2446ee303c09..2446ee303c09 100644
--- a/Documentation/xillybus.txt
+++ b/Documentation/driver-api/xillybus.rst
diff --git a/Documentation/zorro.txt b/Documentation/driver-api/zorro.rst
index 664072b017e3..664072b017e3 100644
--- a/Documentation/zorro.txt
+++ b/Documentation/driver-api/zorro.rst
diff --git a/Documentation/driver-model/binding.txt b/Documentation/driver-model/binding.txt
deleted file mode 100644
index abfc8e290d53..000000000000
--- a/Documentation/driver-model/binding.txt
+++ /dev/null
@@ -1,98 +0,0 @@
-
-Driver Binding
-
-Driver binding is the process of associating a device with a device
-driver that can control it. Bus drivers have typically handled this
-because there have been bus-specific structures to represent the
-devices and the drivers. With generic device and device driver
-structures, most of the binding can take place using common code.
-
-
-Bus
-~~~
-
-The bus type structure contains a list of all devices that are on that bus
-type in the system. When device_register is called for a device, it is
-inserted into the end of this list. The bus object also contains a
-list of all drivers of that bus type. When driver_register is called
-for a driver, it is inserted at the end of this list. These are the
-two events which trigger driver binding.
-
-
-device_register
-~~~~~~~~~~~~~~~
-
-When a new device is added, the bus's list of drivers is iterated over
-to find one that supports it. In order to determine that, the device
-ID of the device must match one of the device IDs that the driver
-supports. The format and semantics for comparing IDs is bus-specific. 
-Instead of trying to derive a complex state machine and matching
-algorithm, it is up to the bus driver to provide a callback to compare
-a device against the IDs of a driver. The bus returns 1 if a match was
-found; 0 otherwise.
-
-int match(struct device * dev, struct device_driver * drv);
-
-If a match is found, the device's driver field is set to the driver
-and the driver's probe callback is called. This gives the driver a
-chance to verify that it really does support the hardware, and that
-it's in a working state. 
-
-Device Class
-~~~~~~~~~~~~
-
-Upon the successful completion of probe, the device is registered with
-the class to which it belongs. Device drivers belong to one and only one
-class, and that is set in the driver's devclass field. 
-devclass_add_device is called to enumerate the device within the class
-and actually register it with the class, which happens with the
-class's register_dev callback.
-
-
-Driver
-~~~~~~
-
-When a driver is attached to a device, the device is inserted into the
-driver's list of devices. 
-
-
-sysfs
-~~~~~
-
-A symlink is created in the bus's 'devices' directory that points to
-the device's directory in the physical hierarchy.
-
-A symlink is created in the driver's 'devices' directory that points
-to the device's directory in the physical hierarchy.
-
-A directory for the device is created in the class's directory. A
-symlink is created in that directory that points to the device's
-physical location in the sysfs tree. 
-
-A symlink can be created (though this isn't done yet) in the device's
-physical directory to either its class directory, or the class's
-top-level directory. One can also be created to point to its driver's
-directory also. 
-
-
-driver_register
-~~~~~~~~~~~~~~~
-
-The process is almost identical for when a new driver is added. 
-The bus's list of devices is iterated over to find a match. Devices
-that already have a driver are skipped. All the devices are iterated
-over, to bind as many devices as possible to the driver.
-
-
-Removal
-~~~~~~~
-
-When a device is removed, the reference count for it will eventually
-go to 0. When it does, the remove callback of the driver is called. It
-is removed from the driver's list of devices and the reference count
-of the driver is decremented. All symlinks between the two are removed.
-
-When a driver is removed, the list of devices that it supports is
-iterated over, and the driver's remove callback is called for each
-one. The device is removed from that list and the symlinks removed. 
-
diff --git a/Documentation/driver-model/bus.txt b/Documentation/driver-model/bus.txt
deleted file mode 100644
index c247b488a567..000000000000
--- a/Documentation/driver-model/bus.txt
+++ /dev/null
@@ -1,143 +0,0 @@
-
-Bus Types 
-
-Definition
-~~~~~~~~~~
-See the kerneldoc for the struct bus_type.
-
-int bus_register(struct bus_type * bus);
-
-
-Declaration
-~~~~~~~~~~~
-
-Each bus type in the kernel (PCI, USB, etc) should declare one static
-object of this type. They must initialize the name field, and may
-optionally initialize the match callback.
-
-struct bus_type pci_bus_type = {
-       .name	= "pci",
-       .match	= pci_bus_match,
-};
-
-The structure should be exported to drivers in a header file:
-
-extern struct bus_type pci_bus_type;
-
-
-Registration
-~~~~~~~~~~~~
-
-When a bus driver is initialized, it calls bus_register. This
-initializes the rest of the fields in the bus object and inserts it
-into a global list of bus types. Once the bus object is registered, 
-the fields in it are usable by the bus driver. 
-
-
-Callbacks
-~~~~~~~~~
-
-match(): Attaching Drivers to Devices
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The format of device ID structures and the semantics for comparing
-them are inherently bus-specific. Drivers typically declare an array
-of device IDs of devices they support that reside in a bus-specific
-driver structure. 
-
-The purpose of the match callback is to give the bus an opportunity to
-determine if a particular driver supports a particular device by
-comparing the device IDs the driver supports with the device ID of a
-particular device, without sacrificing bus-specific functionality or
-type-safety. 
-
-When a driver is registered with the bus, the bus's list of devices is
-iterated over, and the match callback is called for each device that
-does not have a driver associated with it. 
-
-
-
-Device and Driver Lists
-~~~~~~~~~~~~~~~~~~~~~~~
-
-The lists of devices and drivers are intended to replace the local
-lists that many buses keep. They are lists of struct devices and
-struct device_drivers, respectively. Bus drivers are free to use the
-lists as they please, but conversion to the bus-specific type may be
-necessary. 
-
-The LDM core provides helper functions for iterating over each list.
-
-int bus_for_each_dev(struct bus_type * bus, struct device * start, void * data,
-		     int (*fn)(struct device *, void *));
-
-int bus_for_each_drv(struct bus_type * bus, struct device_driver * start, 
-		     void * data, int (*fn)(struct device_driver *, void *));
-
-These helpers iterate over the respective list, and call the callback
-for each device or driver in the list. All list accesses are
-synchronized by taking the bus's lock (read currently). The reference
-count on each object in the list is incremented before the callback is
-called; it is decremented after the next object has been obtained. The
-lock is not held when calling the callback. 
-
-
-sysfs
-~~~~~~~~
-There is a top-level directory named 'bus'.
-
-Each bus gets a directory in the bus directory, along with two default
-directories:
-
-	/sys/bus/pci/
-	|-- devices
-	`-- drivers
-
-Drivers registered with the bus get a directory in the bus's drivers
-directory:
-
-	/sys/bus/pci/
-	|-- devices
-	`-- drivers
-	    |-- Intel ICH
-	    |-- Intel ICH Joystick
-	    |-- agpgart
-	    `-- e100
-
-Each device that is discovered on a bus of that type gets a symlink in
-the bus's devices directory to the device's directory in the physical
-hierarchy:
-
-	/sys/bus/pci/
-	|-- devices
-	|   |-- 00:00.0 -> ../../../root/pci0/00:00.0
-	|   |-- 00:01.0 -> ../../../root/pci0/00:01.0
-	|   `-- 00:02.0 -> ../../../root/pci0/00:02.0
-	`-- drivers
-
-
-Exporting Attributes
-~~~~~~~~~~~~~~~~~~~~
-struct bus_attribute {
-	struct attribute	attr;
-	ssize_t (*show)(struct bus_type *, char * buf);
-	ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
-};
-
-Bus drivers can export attributes using the BUS_ATTR_RW macro that works
-similarly to the DEVICE_ATTR_RW macro for devices. For example, a
-definition like this:
-
-static BUS_ATTR_RW(debug);
-
-is equivalent to declaring:
-
-static bus_attribute bus_attr_debug;
-
-This can then be used to add and remove the attribute from the bus's
-sysfs directory using:
-
-int bus_create_file(struct bus_type *, struct bus_attribute *);
-void bus_remove_file(struct bus_type *, struct bus_attribute *);
-
-
diff --git a/Documentation/driver-model/class.txt b/Documentation/driver-model/class.txt
deleted file mode 100644
index 1fefc480a80b..000000000000
--- a/Documentation/driver-model/class.txt
+++ /dev/null
@@ -1,147 +0,0 @@
-
-Device Classes
-
-
-Introduction
-~~~~~~~~~~~~
-A device class describes a type of device, like an audio or network
-device. The following device classes have been identified:
-
-<Insert List of Device Classes Here>
-
-
-Each device class defines a set of semantics and a programming interface
-that devices of that class adhere to. Device drivers are the
-implementation of that programming interface for a particular device on
-a particular bus. 
-
-Device classes are agnostic with respect to what bus a device resides
-on. 
-
-
-Programming Interface
-~~~~~~~~~~~~~~~~~~~~~
-The device class structure looks like: 
-
-
-typedef int (*devclass_add)(struct device *);
-typedef void (*devclass_remove)(struct device *);
-
-See the kerneldoc for the struct class.
-
-A typical device class definition would look like: 
-
-struct device_class input_devclass = {
-        .name		= "input",
-        .add_device	= input_add_device,
-	.remove_device	= input_remove_device,
-};
-
-Each device class structure should be exported in a header file so it
-can be used by drivers, extensions and interfaces.
-
-Device classes are registered and unregistered with the core using: 
-
-int devclass_register(struct device_class * cls);
-void devclass_unregister(struct device_class * cls);
-
-
-Devices
-~~~~~~~
-As devices are bound to drivers, they are added to the device class
-that the driver belongs to. Before the driver model core, this would
-typically happen during the driver's probe() callback, once the device
-has been initialized. It now happens after the probe() callback
-finishes from the core. 
-
-The device is enumerated in the class. Each time a device is added to
-the class, the class's devnum field is incremented and assigned to the
-device. The field is never decremented, so if the device is removed
-from the class and re-added, it will receive a different enumerated
-value. 
-
-The class is allowed to create a class-specific structure for the
-device and store it in the device's class_data pointer. 
-
-There is no list of devices in the device class. Each driver has a
-list of devices that it supports. The device class has a list of
-drivers of that particular class. To access all of the devices in the
-class, iterate over the device lists of each driver in the class.
-
-
-Device Drivers
-~~~~~~~~~~~~~~
-Device drivers are added to device classes when they are registered
-with the core. A driver specifies the class it belongs to by setting
-the struct device_driver::devclass field. 
-
-
-sysfs directory structure
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-There is a top-level sysfs directory named 'class'. 
-
-Each class gets a directory in the class directory, along with two
-default subdirectories:
-
-        class/
-        `-- input
-            |-- devices
-            `-- drivers
-
-
-Drivers registered with the class get a symlink in the drivers/ directory 
-that points to the driver's directory (under its bus directory):
-
-   class/
-   `-- input
-       |-- devices
-       `-- drivers
-           `-- usb:usb_mouse -> ../../../bus/drivers/usb_mouse/
-
-
-Each device gets a symlink in the devices/ directory that points to the 
-device's directory in the physical hierarchy:
-
-   class/
-   `-- input
-       |-- devices
-       |   `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/
-       `-- drivers
-
-
-Exporting Attributes
-~~~~~~~~~~~~~~~~~~~~
-struct devclass_attribute {
-        struct attribute        attr;
-        ssize_t (*show)(struct device_class *, char * buf, size_t count, loff_t off);
-        ssize_t (*store)(struct device_class *, const char * buf, size_t count, loff_t off);
-};
-
-Class drivers can export attributes using the DEVCLASS_ATTR macro that works
-similarly to the DEVICE_ATTR macro for devices. For example, a definition 
-like this:
-
-static DEVCLASS_ATTR(debug,0644,show_debug,store_debug);
-
-is equivalent to declaring:
-
-static devclass_attribute devclass_attr_debug;
-
-The bus driver can add and remove the attribute from the class's
-sysfs directory using:
-
-int devclass_create_file(struct device_class *, struct devclass_attribute *);
-void devclass_remove_file(struct device_class *, struct devclass_attribute *);
-
-In the example above, the file will be named 'debug' in placed in the
-class's directory in sysfs. 
-
-
-Interfaces
-~~~~~~~~~~
-There may exist multiple mechanisms for accessing the same device of a
-particular class type. Device interfaces describe these mechanisms. 
-
-When a device is added to a device class, the core attempts to add it
-to every interface that is registered with the device class.
-
diff --git a/Documentation/driver-model/design-patterns.txt b/Documentation/driver-model/design-patterns.txt
deleted file mode 100644
index ba7b2df64904..000000000000
--- a/Documentation/driver-model/design-patterns.txt
+++ /dev/null
@@ -1,116 +0,0 @@
-
-Device Driver Design Patterns
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This document describes a few common design patterns found in device drivers.
-It is likely that subsystem maintainers will ask driver developers to
-conform to these design patterns.
-
-1. State Container
-2. container_of()
-
-
-1. State Container
-~~~~~~~~~~~~~~~~~~
-
-While the kernel contains a few device drivers that assume that they will
-only be probed() once on a certain system (singletons), it is custom to assume
-that the device the driver binds to will appear in several instances. This
-means that the probe() function and all callbacks need to be reentrant.
-
-The most common way to achieve this is to use the state container design
-pattern. It usually has this form:
-
-struct foo {
-    spinlock_t lock; /* Example member */
-    (...)
-};
-
-static int foo_probe(...)
-{
-    struct foo *foo;
-
-    foo = devm_kzalloc(dev, sizeof(*foo), GFP_KERNEL);
-    if (!foo)
-        return -ENOMEM;
-    spin_lock_init(&foo->lock);
-    (...)
-}
-
-This will create an instance of struct foo in memory every time probe() is
-called. This is our state container for this instance of the device driver.
-Of course it is then necessary to always pass this instance of the
-state around to all functions that need access to the state and its members.
-
-For example, if the driver is registering an interrupt handler, you would
-pass around a pointer to struct foo like this:
-
-static irqreturn_t foo_handler(int irq, void *arg)
-{
-    struct foo *foo = arg;
-    (...)
-}
-
-static int foo_probe(...)
-{
-    struct foo *foo;
-
-    (...)
-    ret = request_irq(irq, foo_handler, 0, "foo", foo);
-}
-
-This way you always get a pointer back to the correct instance of foo in
-your interrupt handler.
-
-
-2. container_of()
-~~~~~~~~~~~~~~~~~
-
-Continuing on the above example we add an offloaded work:
-
-struct foo {
-    spinlock_t lock;
-    struct workqueue_struct *wq;
-    struct work_struct offload;
-    (...)
-};
-
-static void foo_work(struct work_struct *work)
-{
-    struct foo *foo = container_of(work, struct foo, offload);
-
-    (...)
-}
-
-static irqreturn_t foo_handler(int irq, void *arg)
-{
-    struct foo *foo = arg;
-
-    queue_work(foo->wq, &foo->offload);
-    (...)
-}
-
-static int foo_probe(...)
-{
-    struct foo *foo;
-
-    foo->wq = create_singlethread_workqueue("foo-wq");
-    INIT_WORK(&foo->offload, foo_work);
-    (...)
-}
-
-The design pattern is the same for an hrtimer or something similar that will
-return a single argument which is a pointer to a struct member in the
-callback.
-
-container_of() is a macro defined in <linux/kernel.h>
-
-What container_of() does is to obtain a pointer to the containing struct from
-a pointer to a member by a simple subtraction using the offsetof() macro from
-standard C, which allows something similar to object oriented behaviours.
-Notice that the contained member must not be a pointer, but an actual member
-for this to work.
-
-We can see here that we avoid having global pointers to our struct foo *
-instance this way, while still keeping the number of parameters passed to the
-work function to a single pointer.
diff --git a/Documentation/driver-model/device.txt b/Documentation/driver-model/device.txt
deleted file mode 100644
index 2403eb856187..000000000000
--- a/Documentation/driver-model/device.txt
+++ /dev/null
@@ -1,106 +0,0 @@
-
-The Basic Device Structure
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-See the kerneldoc for the struct device.
-
-
-Programming Interface
-~~~~~~~~~~~~~~~~~~~~~
-The bus driver that discovers the device uses this to register the
-device with the core:
-
-int device_register(struct device * dev);
-
-The bus should initialize the following fields:
-
-    - parent
-    - name
-    - bus_id
-    - bus
-
-A device is removed from the core when its reference count goes to
-0. The reference count can be adjusted using:
-
-struct device * get_device(struct device * dev);
-void put_device(struct device * dev);
-
-get_device() will return a pointer to the struct device passed to it
-if the reference is not already 0 (if it's in the process of being
-removed already).
-
-A driver can access the lock in the device structure using: 
-
-void lock_device(struct device * dev);
-void unlock_device(struct device * dev);
-
-
-Attributes
-~~~~~~~~~~
-struct device_attribute {
-	struct attribute	attr;
-	ssize_t (*show)(struct device *dev, struct device_attribute *attr,
-			char *buf);
-	ssize_t (*store)(struct device *dev, struct device_attribute *attr,
-			 const char *buf, size_t count);
-};
-
-Attributes of devices can be exported by a device driver through sysfs.
-
-Please see Documentation/filesystems/sysfs.txt for more information
-on how sysfs works.
-
-As explained in Documentation/kobject.txt, device attributes must be
-created before the KOBJ_ADD uevent is generated. The only way to realize
-that is by defining an attribute group.
-
-Attributes are declared using a macro called DEVICE_ATTR:
-
-#define DEVICE_ATTR(name,mode,show,store)
-
-Example:
-
-static DEVICE_ATTR(type, 0444, show_type, NULL);
-static DEVICE_ATTR(power, 0644, show_power, store_power);
-
-This declares two structures of type struct device_attribute with respective
-names 'dev_attr_type' and 'dev_attr_power'. These two attributes can be
-organized as follows into a group:
-
-static struct attribute *dev_attrs[] = {
-	&dev_attr_type.attr,
-	&dev_attr_power.attr,
-	NULL,
-};
-
-static struct attribute_group dev_attr_group = {
-	.attrs = dev_attrs,
-};
-
-static const struct attribute_group *dev_attr_groups[] = {
-	&dev_attr_group,
-	NULL,
-};
-
-This array of groups can then be associated with a device by setting the
-group pointer in struct device before device_register() is invoked:
-
-      dev->groups = dev_attr_groups;
-      device_register(dev);
-
-The device_register() function will use the 'groups' pointer to create the
-device attributes and the device_unregister() function will use this pointer
-to remove the device attributes.
-
-Word of warning:  While the kernel allows device_create_file() and
-device_remove_file() to be called on a device at any time, userspace has
-strict expectations on when attributes get created.  When a new device is
-registered in the kernel, a uevent is generated to notify userspace (like
-udev) that a new device is available.  If attributes are added after the
-device is registered, then userspace won't get notified and userspace will
-not know about the new attributes.
-
-This is important for device driver that need to publish additional
-attributes for a device at driver probe time.  If the device driver simply
-calls device_create_file() on the device structure passed to it, then
-userspace will never be notified of the new attributes.
diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
deleted file mode 100644
index 69c7fa7f616c..000000000000
--- a/Documentation/driver-model/devres.txt
+++ /dev/null
@@ -1,412 +0,0 @@
-Devres - Managed Device Resource
-================================
-
-Tejun Heo	<teheo@suse.de>
-
-First draft	10 January 2007
-
-
-1. Intro			: Huh? Devres?
-2. Devres			: Devres in a nutshell
-3. Devres Group			: Group devres'es and release them together
-4. Details			: Life time rules, calling context, ...
-5. Overhead			: How much do we have to pay for this?
-6. List of managed interfaces	: Currently implemented managed interfaces
-
-
-  1. Intro
-  --------
-
-devres came up while trying to convert libata to use iomap.  Each
-iomapped address should be kept and unmapped on driver detach.  For
-example, a plain SFF ATA controller (that is, good old PCI IDE) in
-native mode makes use of 5 PCI BARs and all of them should be
-maintained.
-
-As with many other device drivers, libata low level drivers have
-sufficient bugs in ->remove and ->probe failure path.  Well, yes,
-that's probably because libata low level driver developers are lazy
-bunch, but aren't all low level driver developers?  After spending a
-day fiddling with braindamaged hardware with no document or
-braindamaged document, if it's finally working, well, it's working.
-
-For one reason or another, low level drivers don't receive as much
-attention or testing as core code, and bugs on driver detach or
-initialization failure don't happen often enough to be noticeable.
-Init failure path is worse because it's much less travelled while
-needs to handle multiple entry points.
-
-So, many low level drivers end up leaking resources on driver detach
-and having half broken failure path implementation in ->probe() which
-would leak resources or even cause oops when failure occurs.  iomap
-adds more to this mix.  So do msi and msix.
-
-
-  2. Devres
-  ---------
-
-devres is basically linked list of arbitrarily sized memory areas
-associated with a struct device.  Each devres entry is associated with
-a release function.  A devres can be released in several ways.  No
-matter what, all devres entries are released on driver detach.  On
-release, the associated release function is invoked and then the
-devres entry is freed.
-
-Managed interface is created for resources commonly used by device
-drivers using devres.  For example, coherent DMA memory is acquired
-using dma_alloc_coherent().  The managed version is called
-dmam_alloc_coherent().  It is identical to dma_alloc_coherent() except
-for the DMA memory allocated using it is managed and will be
-automatically released on driver detach.  Implementation looks like
-the following.
-
-  struct dma_devres {
-	size_t		size;
-	void		*vaddr;
-	dma_addr_t	dma_handle;
-  };
-
-  static void dmam_coherent_release(struct device *dev, void *res)
-  {
-	struct dma_devres *this = res;
-
-	dma_free_coherent(dev, this->size, this->vaddr, this->dma_handle);
-  }
-
-  dmam_alloc_coherent(dev, size, dma_handle, gfp)
-  {
-	struct dma_devres *dr;
-	void *vaddr;
-
-	dr = devres_alloc(dmam_coherent_release, sizeof(*dr), gfp);
-	...
-
-	/* alloc DMA memory as usual */
-	vaddr = dma_alloc_coherent(...);
-	...
-
-	/* record size, vaddr, dma_handle in dr */
-	dr->vaddr = vaddr;
-	...
-
-	devres_add(dev, dr);
-
-	return vaddr;
-  }
-
-If a driver uses dmam_alloc_coherent(), the area is guaranteed to be
-freed whether initialization fails half-way or the device gets
-detached.  If most resources are acquired using managed interface, a
-driver can have much simpler init and exit code.  Init path basically
-looks like the following.
-
-  my_init_one()
-  {
-	struct mydev *d;
-
-	d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL);
-	if (!d)
-		return -ENOMEM;
-
-	d->ring = dmam_alloc_coherent(...);
-	if (!d->ring)
-		return -ENOMEM;
-
-	if (check something)
-		return -EINVAL;
-	...
-
-	return register_to_upper_layer(d);
-  }
-
-And exit path,
-
-  my_remove_one()
-  {
-	unregister_from_upper_layer(d);
-	shutdown_my_hardware();
-  }
-
-As shown above, low level drivers can be simplified a lot by using
-devres.  Complexity is shifted from less maintained low level drivers
-to better maintained higher layer.  Also, as init failure path is
-shared with exit path, both can get more testing.
-
-Note though that when converting current calls or assignments to
-managed devm_* versions it is up to you to check if internal operations
-like allocating memory, have failed. Managed resources pertains to the
-freeing of these resources *only* - all other checks needed are still
-on you. In some cases this may mean introducing checks that were not
-necessary before moving to the managed devm_* calls.
-
-
-  3. Devres group
-  ---------------
-
-Devres entries can be grouped using devres group.  When a group is
-released, all contained normal devres entries and properly nested
-groups are released.  One usage is to rollback series of acquired
-resources on failure.  For example,
-
-  if (!devres_open_group(dev, NULL, GFP_KERNEL))
-	return -ENOMEM;
-
-  acquire A;
-  if (failed)
-	goto err;
-
-  acquire B;
-  if (failed)
-	goto err;
-  ...
-
-  devres_remove_group(dev, NULL);
-  return 0;
-
- err:
-  devres_release_group(dev, NULL);
-  return err_code;
-
-As resource acquisition failure usually means probe failure, constructs
-like above are usually useful in midlayer driver (e.g. libata core
-layer) where interface function shouldn't have side effect on failure.
-For LLDs, just returning error code suffices in most cases.
-
-Each group is identified by void *id.  It can either be explicitly
-specified by @id argument to devres_open_group() or automatically
-created by passing NULL as @id as in the above example.  In both
-cases, devres_open_group() returns the group's id.  The returned id
-can be passed to other devres functions to select the target group.
-If NULL is given to those functions, the latest open group is
-selected.
-
-For example, you can do something like the following.
-
-  int my_midlayer_create_something()
-  {
-	if (!devres_open_group(dev, my_midlayer_create_something, GFP_KERNEL))
-		return -ENOMEM;
-
-	...
-
-	devres_close_group(dev, my_midlayer_create_something);
-	return 0;
-  }
-
-  void my_midlayer_destroy_something()
-  {
-	devres_release_group(dev, my_midlayer_create_something);
-  }
-
-
-  4. Details
-  ----------
-
-Lifetime of a devres entry begins on devres allocation and finishes
-when it is released or destroyed (removed and freed) - no reference
-counting.
-
-devres core guarantees atomicity to all basic devres operations and
-has support for single-instance devres types (atomic
-lookup-and-add-if-not-found).  Other than that, synchronizing
-concurrent accesses to allocated devres data is caller's
-responsibility.  This is usually non-issue because bus ops and
-resource allocations already do the job.
-
-For an example of single-instance devres type, read pcim_iomap_table()
-in lib/devres.c.
-
-All devres interface functions can be called without context if the
-right gfp mask is given.
-
-
-  5. Overhead
-  -----------
-
-Each devres bookkeeping info is allocated together with requested data
-area.  With debug option turned off, bookkeeping info occupies 16
-bytes on 32bit machines and 24 bytes on 64bit (three pointers rounded
-up to ull alignment).  If singly linked list is used, it can be
-reduced to two pointers (8 bytes on 32bit, 16 bytes on 64bit).
-
-Each devres group occupies 8 pointers.  It can be reduced to 6 if
-singly linked list is used.
-
-Memory space overhead on ahci controller with two ports is between 300
-and 400 bytes on 32bit machine after naive conversion (we can
-certainly invest a bit more effort into libata core layer).
-
-
-  6. List of managed interfaces
-  -----------------------------
-
-CLOCK
-  devm_clk_get()
-  devm_clk_get_optional()
-  devm_clk_put()
-  devm_clk_hw_register()
-  devm_of_clk_add_hw_provider()
-  devm_clk_hw_register_clkdev()
-
-DMA
-  dmaenginem_async_device_register()
-  dmam_alloc_coherent()
-  dmam_alloc_attrs()
-  dmam_free_coherent()
-  dmam_pool_create()
-  dmam_pool_destroy()
-
-DRM
-  devm_drm_dev_init()
-
-GPIO
-  devm_gpiod_get()
-  devm_gpiod_get_index()
-  devm_gpiod_get_index_optional()
-  devm_gpiod_get_optional()
-  devm_gpiod_put()
-  devm_gpiod_unhinge()
-  devm_gpiochip_add_data()
-  devm_gpio_request()
-  devm_gpio_request_one()
-  devm_gpio_free()
-
-I2C
-  devm_i2c_new_dummy_device()
-
-IIO
-  devm_iio_device_alloc()
-  devm_iio_device_free()
-  devm_iio_device_register()
-  devm_iio_device_unregister()
-  devm_iio_kfifo_allocate()
-  devm_iio_kfifo_free()
-  devm_iio_triggered_buffer_setup()
-  devm_iio_triggered_buffer_cleanup()
-  devm_iio_trigger_alloc()
-  devm_iio_trigger_free()
-  devm_iio_trigger_register()
-  devm_iio_trigger_unregister()
-  devm_iio_channel_get()
-  devm_iio_channel_release()
-  devm_iio_channel_get_all()
-  devm_iio_channel_release_all()
-
-INPUT
-  devm_input_allocate_device()
-
-IO region
-  devm_release_mem_region()
-  devm_release_region()
-  devm_release_resource()
-  devm_request_mem_region()
-  devm_request_region()
-  devm_request_resource()
-
-IOMAP
-  devm_ioport_map()
-  devm_ioport_unmap()
-  devm_ioremap()
-  devm_ioremap_nocache()
-  devm_ioremap_wc()
-  devm_ioremap_resource() : checks resource, requests memory region, ioremaps
-  devm_iounmap()
-  pcim_iomap()
-  pcim_iomap_regions()	: do request_region() and iomap() on multiple BARs
-  pcim_iomap_table()	: array of mapped addresses indexed by BAR
-  pcim_iounmap()
-
-IRQ
-  devm_free_irq()
-  devm_request_any_context_irq()
-  devm_request_irq()
-  devm_request_threaded_irq()
-  devm_irq_alloc_descs()
-  devm_irq_alloc_desc()
-  devm_irq_alloc_desc_at()
-  devm_irq_alloc_desc_from()
-  devm_irq_alloc_descs_from()
-  devm_irq_alloc_generic_chip()
-  devm_irq_setup_generic_chip()
-  devm_irq_sim_init()
-
-LED
-  devm_led_classdev_register()
-  devm_led_classdev_unregister()
-
-MDIO
-  devm_mdiobus_alloc()
-  devm_mdiobus_alloc_size()
-  devm_mdiobus_free()
-
-MEM
-  devm_free_pages()
-  devm_get_free_pages()
-  devm_kasprintf()
-  devm_kcalloc()
-  devm_kfree()
-  devm_kmalloc()
-  devm_kmalloc_array()
-  devm_kmemdup()
-  devm_kstrdup()
-  devm_kvasprintf()
-  devm_kzalloc()
-
-MFD
-  devm_mfd_add_devices()
-
-MUX
-  devm_mux_chip_alloc()
-  devm_mux_chip_register()
-  devm_mux_control_get()
-
-PER-CPU MEM
-  devm_alloc_percpu()
-  devm_free_percpu()
-
-PCI
-  devm_pci_alloc_host_bridge()  : managed PCI host bridge allocation
-  devm_pci_remap_cfgspace()	: ioremap PCI configuration space
-  devm_pci_remap_cfg_resource()	: ioremap PCI configuration space resource
-  pcim_enable_device()		: after success, all PCI ops become managed
-  pcim_pin_device()		: keep PCI device enabled after release
-
-PHY
-  devm_usb_get_phy()
-  devm_usb_put_phy()
-
-PINCTRL
-  devm_pinctrl_get()
-  devm_pinctrl_put()
-  devm_pinctrl_register()
-  devm_pinctrl_unregister()
-
-POWER
-  devm_reboot_mode_register()
-  devm_reboot_mode_unregister()
-
-PWM
-  devm_pwm_get()
-  devm_pwm_put()
-
-REGULATOR
-  devm_regulator_bulk_get()
-  devm_regulator_get()
-  devm_regulator_put()
-  devm_regulator_register()
-
-RESET
-  devm_reset_control_get()
-  devm_reset_controller_register()
-
-SERDEV
-  devm_serdev_device_open()
-
-SLAVE DMA ENGINE
-  devm_acpi_dma_controller_register()
-
-SPI
-  devm_spi_register_master()
-
-WATCHDOG
-  devm_watchdog_register_device()
diff --git a/Documentation/driver-model/driver.txt b/Documentation/driver-model/driver.txt
deleted file mode 100644
index d661e6f7e6a0..000000000000
--- a/Documentation/driver-model/driver.txt
+++ /dev/null
@@ -1,215 +0,0 @@
-
-Device Drivers
-
-See the kerneldoc for the struct device_driver.
-
-
-Allocation
-~~~~~~~~~~
-
-Device drivers are statically allocated structures. Though there may
-be multiple devices in a system that a driver supports, struct
-device_driver represents the driver as a whole (not a particular
-device instance).
-
-Initialization
-~~~~~~~~~~~~~~
-
-The driver must initialize at least the name and bus fields. It should
-also initialize the devclass field (when it arrives), so it may obtain
-the proper linkage internally. It should also initialize as many of
-the callbacks as possible, though each is optional.
-
-Declaration
-~~~~~~~~~~~
-
-As stated above, struct device_driver objects are statically
-allocated. Below is an example declaration of the eepro100
-driver. This declaration is hypothetical only; it relies on the driver
-being converted completely to the new model. 
-
-static struct device_driver eepro100_driver = {
-       .name		= "eepro100",
-       .bus		= &pci_bus_type,
-       
-       .probe		= eepro100_probe,
-       .remove		= eepro100_remove,
-       .suspend		= eepro100_suspend,
-       .resume		= eepro100_resume,
-};
-
-Most drivers will not be able to be converted completely to the new
-model because the bus they belong to has a bus-specific structure with
-bus-specific fields that cannot be generalized. 
-
-The most common example of this are device ID structures. A driver
-typically defines an array of device IDs that it supports. The format
-of these structures and the semantics for comparing device IDs are
-completely bus-specific. Defining them as bus-specific entities would
-sacrifice type-safety, so we keep bus-specific structures around. 
-
-Bus-specific drivers should include a generic struct device_driver in
-the definition of the bus-specific driver. Like this:
-
-struct pci_driver {
-       const struct pci_device_id *id_table;
-       struct device_driver	  driver;
-};
-
-A definition that included bus-specific fields would look like
-(using the eepro100 driver again):
-
-static struct pci_driver eepro100_driver = {
-       .id_table       = eepro100_pci_tbl,
-       .driver	       = {
-		.name		= "eepro100",
-		.bus		= &pci_bus_type,
-		.probe		= eepro100_probe,
-		.remove		= eepro100_remove,
-		.suspend	= eepro100_suspend,
-		.resume		= eepro100_resume,
-       },
-};
-
-Some may find the syntax of embedded struct initialization awkward or
-even a bit ugly. So far, it's the best way we've found to do what we want...
-
-Registration
-~~~~~~~~~~~~
-
-int driver_register(struct device_driver * drv);
-
-The driver registers the structure on startup. For drivers that have
-no bus-specific fields (i.e. don't have a bus-specific driver
-structure), they would use driver_register and pass a pointer to their
-struct device_driver object. 
-
-Most drivers, however, will have a bus-specific structure and will
-need to register with the bus using something like pci_driver_register.
-
-It is important that drivers register their driver structure as early as
-possible. Registration with the core initializes several fields in the
-struct device_driver object, including the reference count and the
-lock. These fields are assumed to be valid at all times and may be
-used by the device model core or the bus driver.
-
-
-Transition Bus Drivers
-~~~~~~~~~~~~~~~~~~~~~~
-
-By defining wrapper functions, the transition to the new model can be
-made easier. Drivers can ignore the generic structure altogether and
-let the bus wrapper fill in the fields. For the callbacks, the bus can
-define generic callbacks that forward the call to the bus-specific
-callbacks of the drivers. 
-
-This solution is intended to be only temporary. In order to get class
-information in the driver, the drivers must be modified anyway. Since
-converting drivers to the new model should reduce some infrastructural
-complexity and code size, it is recommended that they are converted as
-class information is added.
-
-Access
-~~~~~~
-
-Once the object has been registered, it may access the common fields of
-the object, like the lock and the list of devices. 
-
-int driver_for_each_dev(struct device_driver * drv, void * data, 
-		        int (*callback)(struct device * dev, void * data));
-
-The devices field is a list of all the devices that have been bound to
-the driver. The LDM core provides a helper function to operate on all
-the devices a driver controls. This helper locks the driver on each
-node access, and does proper reference counting on each device as it
-accesses it. 
-
-
-sysfs
-~~~~~
-
-When a driver is registered, a sysfs directory is created in its
-bus's directory. In this directory, the driver can export an interface
-to userspace to control operation of the driver on a global basis;
-e.g. toggling debugging output in the driver.
-
-A future feature of this directory will be a 'devices' directory. This
-directory will contain symlinks to the directories of devices it
-supports.
-
-
-
-Callbacks
-~~~~~~~~~
-
-	int	(*probe)	(struct device * dev);
-
-The probe() entry is called in task context, with the bus's rwsem locked
-and the driver partially bound to the device.  Drivers commonly use
-container_of() to convert "dev" to a bus-specific type, both in probe()
-and other routines.  That type often provides device resource data, such
-as pci_dev.resource[] or platform_device.resources, which is used in
-addition to dev->platform_data to initialize the driver.
-
-This callback holds the driver-specific logic to bind the driver to a
-given device.  That includes verifying that the device is present, that
-it's a version the driver can handle, that driver data structures can
-be allocated and initialized, and that any hardware can be initialized.
-Drivers often store a pointer to their state with dev_set_drvdata().
-When the driver has successfully bound itself to that device, then probe()
-returns zero and the driver model code will finish its part of binding
-the driver to that device.
-
-A driver's probe() may return a negative errno value to indicate that
-the driver did not bind to this device, in which case it should have
-released all resources it allocated.
-
-	int 	(*remove)	(struct device * dev);
-
-remove is called to unbind a driver from a device. This may be
-called if a device is physically removed from the system, if the
-driver module is being unloaded, during a reboot sequence, or
-in other cases.
-
-It is up to the driver to determine if the device is present or
-not. It should free any resources allocated specifically for the
-device; i.e. anything in the device's driver_data field. 
-
-If the device is still present, it should quiesce the device and place
-it into a supported low-power state.
-
-	int	(*suspend)	(struct device * dev, pm_message_t state);
-
-suspend is called to put the device in a low power state.
-
-	int	(*resume)	(struct device * dev);
-
-Resume is used to bring a device back from a low power state.
-
-
-Attributes
-~~~~~~~~~~
-struct driver_attribute {
-        struct attribute        attr;
-        ssize_t (*show)(struct device_driver *driver, char *buf);
-        ssize_t (*store)(struct device_driver *, const char * buf, size_t count);
-};
-
-Device drivers can export attributes via their sysfs directories. 
-Drivers can declare attributes using a DRIVER_ATTR_RW and DRIVER_ATTR_RO
-macro that works identically to the DEVICE_ATTR_RW and DEVICE_ATTR_RO
-macros.
-
-Example:
-
-DRIVER_ATTR_RW(debug);
-
-This is equivalent to declaring:
-
-struct driver_attribute driver_attr_debug;
-
-This can then be used to add and remove the attribute from the
-driver's directory using:
-
-int driver_create_file(struct device_driver *, const struct driver_attribute *);
-void driver_remove_file(struct device_driver *, const struct driver_attribute *);
diff --git a/Documentation/driver-model/overview.txt b/Documentation/driver-model/overview.txt
deleted file mode 100644
index 6a8f9a8075d8..000000000000
--- a/Documentation/driver-model/overview.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-The Linux Kernel Device Model
-
-Patrick Mochel	<mochel@digitalimplant.org>
-
-Drafted 26 August 2002
-Updated 31 January 2006
-
-
-Overview
-~~~~~~~~
-
-The Linux Kernel Driver Model is a unification of all the disparate driver
-models that were previously used in the kernel. It is intended to augment the
-bus-specific drivers for bridges and devices by consolidating a set of data
-and operations into globally accessible data structures.
-
-Traditional driver models implemented some sort of tree-like structure
-(sometimes just a list) for the devices they control. There wasn't any
-uniformity across the different bus types.
-
-The current driver model provides a common, uniform data model for describing
-a bus and the devices that can appear under the bus. The unified bus
-model includes a set of common attributes which all busses carry, and a set
-of common callbacks, such as device discovery during bus probing, bus
-shutdown, bus power management, etc.
-
-The common device and bridge interface reflects the goals of the modern
-computer: namely the ability to do seamless device "plug and play", power
-management, and hot plug. In particular, the model dictated by Intel and
-Microsoft (namely ACPI) ensures that almost every device on almost any bus
-on an x86-compatible system can work within this paradigm.  Of course,
-not every bus is able to support all such operations, although most
-buses support most of those operations.
-
-
-Downstream Access
-~~~~~~~~~~~~~~~~~
-
-Common data fields have been moved out of individual bus layers into a common
-data structure. These fields must still be accessed by the bus layers,
-and sometimes by the device-specific drivers.
-
-Other bus layers are encouraged to do what has been done for the PCI layer.
-struct pci_dev now looks like this:
-
-struct pci_dev {
-	...
-
-	struct device dev;     /* Generic device interface */
-	...
-};
-
-Note first that the struct device dev within the struct pci_dev is
-statically allocated. This means only one allocation on device discovery.
-
-Note also that that struct device dev is not necessarily defined at the
-front of the pci_dev structure.  This is to make people think about what
-they're doing when switching between the bus driver and the global driver,
-and to discourage meaningless and incorrect casts between the two.
-
-The PCI bus layer freely accesses the fields of struct device. It knows about
-the structure of struct pci_dev, and it should know the structure of struct
-device. Individual PCI device drivers that have been converted to the current
-driver model generally do not and should not touch the fields of struct device,
-unless there is a compelling reason to do so.
-
-The above abstraction prevents unnecessary pain during transitional phases.
-If it were not done this way, then when a field was renamed or removed, every
-downstream driver would break.  On the other hand, if only the bus layer
-(and not the device layer) accesses the struct device, it is only the bus
-layer that needs to change.
-
-
-User Interface
-~~~~~~~~~~~~~~
-
-By virtue of having a complete hierarchical view of all the devices in the
-system, exporting a complete hierarchical view to userspace becomes relatively
-easy. This has been accomplished by implementing a special purpose virtual
-file system named sysfs.
-
-Almost all mainstream Linux distros mount this filesystem automatically; you
-can see some variation of the following in the output of the "mount" command:
-
-$ mount
-...
-none on /sys type sysfs (rw,noexec,nosuid,nodev)
-...
-$
-
-The auto-mounting of sysfs is typically accomplished by an entry similar to
-the following in the /etc/fstab file:
-
-none     	/sys	sysfs    defaults	  	0 0
-
-or something similar in the /lib/init/fstab file on Debian-based systems:
-
-none            /sys    sysfs    nodev,noexec,nosuid    0 0
-
-If sysfs is not automatically mounted, you can always do it manually with:
-
-# mount -t sysfs sysfs /sys
-
-Whenever a device is inserted into the tree, a directory is created for it.
-This directory may be populated at each layer of discovery - the global layer,
-the bus layer, or the device layer.
-
-The global layer currently creates two files - 'name' and 'power'. The
-former only reports the name of the device. The latter reports the
-current power state of the device. It will also be used to set the current
-power state. 
-
-The bus layer may also create files for the devices it finds while probing the
-bus. For example, the PCI layer currently creates 'irq' and 'resource' files
-for each PCI device.
-
-A device-specific driver may also export files in its directory to expose
-device-specific data or tunable interfaces.
-
-More information about the sysfs directory layout can be found in
-the other documents in this directory and in the file 
-Documentation/filesystems/sysfs.txt.
-
diff --git a/Documentation/driver-model/platform.txt b/Documentation/driver-model/platform.txt
deleted file mode 100644
index 9d9e47dfc013..000000000000
--- a/Documentation/driver-model/platform.txt
+++ /dev/null
@@ -1,244 +0,0 @@
-Platform Devices and Drivers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-See <linux/platform_device.h> for the driver model interface to the
-platform bus:  platform_device, and platform_driver.  This pseudo-bus
-is used to connect devices on busses with minimal infrastructure,
-like those used to integrate peripherals on many system-on-chip
-processors, or some "legacy" PC interconnects; as opposed to large
-formally specified ones like PCI or USB.
-
-
-Platform devices
-~~~~~~~~~~~~~~~~
-Platform devices are devices that typically appear as autonomous
-entities in the system. This includes legacy port-based devices and
-host bridges to peripheral buses, and most controllers integrated
-into system-on-chip platforms.  What they usually have in common
-is direct addressing from a CPU bus.  Rarely, a platform_device will
-be connected through a segment of some other kind of bus; but its
-registers will still be directly addressable.
-
-Platform devices are given a name, used in driver binding, and a
-list of resources such as addresses and IRQs.
-
-struct platform_device {
-	const char	*name;
-	u32		id;
-	struct device	dev;
-	u32		num_resources;
-	struct resource	*resource;
-};
-
-
-Platform drivers
-~~~~~~~~~~~~~~~~
-Platform drivers follow the standard driver model convention, where
-discovery/enumeration is handled outside the drivers, and drivers
-provide probe() and remove() methods.  They support power management
-and shutdown notifications using the standard conventions.
-
-struct platform_driver {
-	int (*probe)(struct platform_device *);
-	int (*remove)(struct platform_device *);
-	void (*shutdown)(struct platform_device *);
-	int (*suspend)(struct platform_device *, pm_message_t state);
-	int (*suspend_late)(struct platform_device *, pm_message_t state);
-	int (*resume_early)(struct platform_device *);
-	int (*resume)(struct platform_device *);
-	struct device_driver driver;
-};
-
-Note that probe() should in general verify that the specified device hardware
-actually exists; sometimes platform setup code can't be sure.  The probing
-can use device resources, including clocks, and device platform_data.
-
-Platform drivers register themselves the normal way:
-
-	int platform_driver_register(struct platform_driver *drv);
-
-Or, in common situations where the device is known not to be hot-pluggable,
-the probe() routine can live in an init section to reduce the driver's
-runtime memory footprint:
-
-	int platform_driver_probe(struct platform_driver *drv,
-			  int (*probe)(struct platform_device *))
-
-Kernel modules can be composed of several platform drivers. The platform core
-provides helpers to register and unregister an array of drivers:
-
-	int __platform_register_drivers(struct platform_driver * const *drivers,
-				      unsigned int count, struct module *owner);
-	void platform_unregister_drivers(struct platform_driver * const *drivers,
-					 unsigned int count);
-
-If one of the drivers fails to register, all drivers registered up to that
-point will be unregistered in reverse order. Note that there is a convenience
-macro that passes THIS_MODULE as owner parameter:
-
-	#define platform_register_drivers(drivers, count)
-
-
-Device Enumeration
-~~~~~~~~~~~~~~~~~~
-As a rule, platform specific (and often board-specific) setup code will
-register platform devices:
-
-	int platform_device_register(struct platform_device *pdev);
-
-	int platform_add_devices(struct platform_device **pdevs, int ndev);
-
-The general rule is to register only those devices that actually exist,
-but in some cases extra devices might be registered.  For example, a kernel
-might be configured to work with an external network adapter that might not
-be populated on all boards, or likewise to work with an integrated controller
-that some boards might not hook up to any peripherals.
-
-In some cases, boot firmware will export tables describing the devices
-that are populated on a given board.   Without such tables, often the
-only way for system setup code to set up the correct devices is to build
-a kernel for a specific target board.  Such board-specific kernels are
-common with embedded and custom systems development.
-
-In many cases, the memory and IRQ resources associated with the platform
-device are not enough to let the device's driver work.  Board setup code
-will often provide additional information using the device's platform_data
-field to hold additional information.
-
-Embedded systems frequently need one or more clocks for platform devices,
-which are normally kept off until they're actively needed (to save power).
-System setup also associates those clocks with the device, so that that
-calls to clk_get(&pdev->dev, clock_name) return them as needed.
-
-
-Legacy Drivers:  Device Probing
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Some drivers are not fully converted to the driver model, because they take
-on a non-driver role:  the driver registers its platform device, rather than
-leaving that for system infrastructure.  Such drivers can't be hotplugged
-or coldplugged, since those mechanisms require device creation to be in a
-different system component than the driver.
-
-The only "good" reason for this is to handle older system designs which, like
-original IBM PCs, rely on error-prone "probe-the-hardware" models for hardware
-configuration.  Newer systems have largely abandoned that model, in favor of
-bus-level support for dynamic configuration (PCI, USB), or device tables
-provided by the boot firmware (e.g. PNPACPI on x86).  There are too many
-conflicting options about what might be where, and even educated guesses by
-an operating system will be wrong often enough to make trouble.
-
-This style of driver is discouraged.  If you're updating such a driver,
-please try to move the device enumeration to a more appropriate location,
-outside the driver.  This will usually be cleanup, since such drivers
-tend to already have "normal" modes, such as ones using device nodes that
-were created by PNP or by platform device setup.
-
-None the less, there are some APIs to support such legacy drivers.  Avoid
-using these calls except with such hotplug-deficient drivers.
-
-	struct platform_device *platform_device_alloc(
-			const char *name, int id);
-
-You can use platform_device_alloc() to dynamically allocate a device, which
-you will then initialize with resources and platform_device_register().
-A better solution is usually:
-
-	struct platform_device *platform_device_register_simple(
-			const char *name, int id,
-			struct resource *res, unsigned int nres);
-
-You can use platform_device_register_simple() as a one-step call to allocate
-and register a device.
-
-
-Device Naming and Driver Binding
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The platform_device.dev.bus_id is the canonical name for the devices.
-It's built from two components:
-
-    * platform_device.name ... which is also used to for driver matching.
-
-    * platform_device.id ... the device instance number, or else "-1"
-      to indicate there's only one.
-
-These are concatenated, so name/id "serial"/0 indicates bus_id "serial.0", and
-"serial/3" indicates bus_id "serial.3"; both would use the platform_driver
-named "serial".  While "my_rtc"/-1 would be bus_id "my_rtc" (no instance id)
-and use the platform_driver called "my_rtc".
-
-Driver binding is performed automatically by the driver core, invoking
-driver probe() after finding a match between device and driver.  If the
-probe() succeeds, the driver and device are bound as usual.  There are
-three different ways to find such a match:
-
-    - Whenever a device is registered, the drivers for that bus are
-      checked for matches.  Platform devices should be registered very
-      early during system boot.
-
-    - When a driver is registered using platform_driver_register(), all
-      unbound devices on that bus are checked for matches.  Drivers
-      usually register later during booting, or by module loading.
-
-    - Registering a driver using platform_driver_probe() works just like
-      using platform_driver_register(), except that the driver won't
-      be probed later if another device registers.  (Which is OK, since
-      this interface is only for use with non-hotpluggable devices.)
-
-
-Early Platform Devices and Drivers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The early platform interfaces provide platform data to platform device
-drivers early on during the system boot. The code is built on top of the
-early_param() command line parsing and can be executed very early on.
-
-Example: "earlyprintk" class early serial console in 6 steps
-
-1. Registering early platform device data
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code registers platform device data using the function
-early_platform_add_devices(). In the case of early serial console this
-should be hardware configuration for the serial port. Devices registered
-at this point will later on be matched against early platform drivers.
-
-2. Parsing kernel command line
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code calls parse_early_param() to parse the kernel
-command line. This will execute all matching early_param() callbacks.
-User specified early platform devices will be registered at this point.
-For the early serial console case the user can specify port on the
-kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
-the class string, "serial" is the name of the platform driver and
-0 is the platform device id. If the id is -1 then the dot and the
-id can be omitted.
-
-3. Installing early platform drivers belonging to a certain class
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code may optionally force registration of all early
-platform drivers belonging to a certain class using the function
-early_platform_driver_register_all(). User specified devices from
-step 2 have priority over these. This step is omitted by the serial
-driver example since the early serial driver code should be disabled
-unless the user has specified port on the kernel command line.
-
-4. Early platform driver registration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Compiled-in platform drivers making use of early_platform_init() are
-automatically registered during step 2 or 3. The serial driver example
-should use early_platform_init("earlyprintk", &platform_driver).
-
-5. Probing of early platform drivers belonging to a certain class
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code calls early_platform_driver_probe() to match
-registered early platform devices associated with a certain class with
-registered early platform drivers. Matched devices will get probed().
-This step can be executed at any point during the early boot. As soon
-as possible may be good for the serial port case.
-
-6. Inside the early platform driver probe()
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The driver code needs to take special care during early boot, especially
-when it comes to memory allocation and interrupt registration. The code
-in the probe() function can use is_early_platform_device() to check if
-it is called at early platform device or at the regular platform device
-time. The early serial driver performs register_console() at this point.
-
-For further information, see <linux/platform_device.h>.
diff --git a/Documentation/driver-model/porting.txt b/Documentation/driver-model/porting.txt
deleted file mode 100644
index 453053f1661f..000000000000
--- a/Documentation/driver-model/porting.txt
+++ /dev/null
@@ -1,447 +0,0 @@
-
-Porting Drivers to the New Driver Model
-
-Patrick Mochel
-
-7 January 2003
-
-
-Overview
-
-Please refer to Documentation/driver-model/*.txt for definitions of
-various driver types and concepts. 
-
-Most of the work of porting devices drivers to the new model happens
-at the bus driver layer. This was intentional, to minimize the
-negative effect on kernel drivers, and to allow a gradual transition
-of bus drivers.
-
-In a nutshell, the driver model consists of a set of objects that can
-be embedded in larger, bus-specific objects. Fields in these generic
-objects can replace fields in the bus-specific objects. 
-
-The generic objects must be registered with the driver model core. By
-doing so, they will exported via the sysfs filesystem. sysfs can be
-mounted by doing 
-
-	# mount -t sysfs sysfs /sys
-
-
-
-The Process
-
-Step 0: Read include/linux/device.h for object and function definitions. 
-
-Step 1: Registering the bus driver. 
-
-
-- Define a struct bus_type for the bus driver.
-
-struct bus_type pci_bus_type = {
-        .name           = "pci",
-};
-
-
-- Register the bus type.
-  This should be done in the initialization function for the bus type,
-  which is usually the module_init(), or equivalent, function. 
-
-static int __init pci_driver_init(void)
-{
-        return bus_register(&pci_bus_type);
-}
-
-subsys_initcall(pci_driver_init);
-
-
-  The bus type may be unregistered (if the bus driver may be compiled
-  as a module) by doing:
-
-     bus_unregister(&pci_bus_type);
-
-
-- Export the bus type for others to use. 
-
-  Other code may wish to reference the bus type, so declare it in a 
-  shared header file and export the symbol.
-
-From include/linux/pci.h:
-
-extern struct bus_type pci_bus_type;
-
-
-From file the above code appears in:
-
-EXPORT_SYMBOL(pci_bus_type);
-
-
-
-- This will cause the bus to show up in /sys/bus/pci/ with two
-  subdirectories: 'devices' and 'drivers'.
-
-# tree -d /sys/bus/pci/
-/sys/bus/pci/
-|-- devices
-`-- drivers
-
-
-
-Step 2: Registering Devices. 
-
-struct device represents a single device. It mainly contains metadata
-describing the relationship the device has to other entities. 
-
-
-- Embed a struct device in the bus-specific device type. 
-
-
-struct pci_dev {
-       ...
-       struct  device  dev;            /* Generic device interface */
-       ...
-};
-
-  It is recommended that the generic device not be the first item in 
-  the struct to discourage programmers from doing mindless casts
-  between the object types. Instead macros, or inline functions,
-  should be created to convert from the generic object type.
-
-
-#define to_pci_dev(n) container_of(n, struct pci_dev, dev)
-
-or 
-
-static inline struct pci_dev * to_pci_dev(struct kobject * kobj)
-{
-	return container_of(n, struct pci_dev, dev);
-}
-
-  This allows the compiler to verify type-safety of the operations 
-  that are performed (which is Good).
-
-
-- Initialize the device on registration.
-
-  When devices are discovered or registered with the bus type, the 
-  bus driver should initialize the generic device. The most important
-  things to initialize are the bus_id, parent, and bus fields.
-
-  The bus_id is an ASCII string that contains the device's address on
-  the bus. The format of this string is bus-specific. This is
-  necessary for representing devices in sysfs. 
-
-  parent is the physical parent of the device. It is important that
-  the bus driver sets this field correctly. 
-
-  The driver model maintains an ordered list of devices that it uses
-  for power management. This list must be in order to guarantee that
-  devices are shutdown before their physical parents, and vice versa.
-  The order of this list is determined by the parent of registered
-  devices.
-
-  Also, the location of the device's sysfs directory depends on a
-  device's parent. sysfs exports a directory structure that mirrors 
-  the device hierarchy. Accurately setting the parent guarantees that
-  sysfs will accurately represent the hierarchy.
-
-  The device's bus field is a pointer to the bus type the device
-  belongs to. This should be set to the bus_type that was declared
-  and initialized before. 
-
-  Optionally, the bus driver may set the device's name and release
-  fields.
-
-  The name field is an ASCII string describing the device, like
-
-     "ATI Technologies Inc Radeon QD"
-
-  The release field is a callback that the driver model core calls 
-  when the device has been removed, and all references to it have 
-  been released. More on this in a moment.
-
-
-- Register the device. 
-
-  Once the generic device has been initialized, it can be registered
-  with the driver model core by doing:
-
-       device_register(&dev->dev);
-
-  It can later be unregistered by doing: 
-
-       device_unregister(&dev->dev);
-
-  This should happen on buses that support hotpluggable devices. 
-  If a bus driver unregisters a device, it should not immediately free
-  it. It should instead wait for the driver model core to call the 
-  device's release method, then free the bus-specific object. 
-  (There may be other code that is currently referencing the device
-  structure, and it would be rude to free the device while that is 
-  happening).
-
-
-  When the device is registered, a directory in sysfs is created. 
-  The PCI tree in sysfs looks like: 
-
-/sys/devices/pci0/
-|-- 00:00.0
-|-- 00:01.0
-|   `-- 01:00.0
-|-- 00:02.0
-|   `-- 02:1f.0
-|       `-- 03:00.0
-|-- 00:1e.0
-|   `-- 04:04.0
-|-- 00:1f.0
-|-- 00:1f.1
-|   |-- ide0
-|   |   |-- 0.0
-|   |   `-- 0.1
-|   `-- ide1
-|       `-- 1.0
-|-- 00:1f.2
-|-- 00:1f.3
-`-- 00:1f.5
-
-  Also, symlinks are created in the bus's 'devices' directory
-  that point to the device's directory in the physical hierarchy. 
-
-/sys/bus/pci/devices/
-|-- 00:00.0 -> ../../../devices/pci0/00:00.0
-|-- 00:01.0 -> ../../../devices/pci0/00:01.0
-|-- 00:02.0 -> ../../../devices/pci0/00:02.0
-|-- 00:1e.0 -> ../../../devices/pci0/00:1e.0
-|-- 00:1f.0 -> ../../../devices/pci0/00:1f.0
-|-- 00:1f.1 -> ../../../devices/pci0/00:1f.1
-|-- 00:1f.2 -> ../../../devices/pci0/00:1f.2
-|-- 00:1f.3 -> ../../../devices/pci0/00:1f.3
-|-- 00:1f.5 -> ../../../devices/pci0/00:1f.5
-|-- 01:00.0 -> ../../../devices/pci0/00:01.0/01:00.0
-|-- 02:1f.0 -> ../../../devices/pci0/00:02.0/02:1f.0
-|-- 03:00.0 -> ../../../devices/pci0/00:02.0/02:1f.0/03:00.0
-`-- 04:04.0 -> ../../../devices/pci0/00:1e.0/04:04.0
-
-
-
-Step 3: Registering Drivers.
-
-struct device_driver is a simple driver structure that contains a set
-of operations that the driver model core may call. 
-
-
-- Embed a struct device_driver in the bus-specific driver. 
-
-  Just like with devices, do something like:
-
-struct pci_driver {
-       ...
-       struct device_driver    driver;
-};
-
-
-- Initialize the generic driver structure. 
-
-  When the driver registers with the bus (e.g. doing pci_register_driver()),
-  initialize the necessary fields of the driver: the name and bus
-  fields. 
-
-
-- Register the driver.
-
-  After the generic driver has been initialized, call
-
-	driver_register(&drv->driver);
-
-  to register the driver with the core.
-
-  When the driver is unregistered from the bus, unregister it from the
-  core by doing:
-
-        driver_unregister(&drv->driver);
-
-  Note that this will block until all references to the driver have
-  gone away. Normally, there will not be any.
-
-
-- Sysfs representation.
-
-  Drivers are exported via sysfs in their bus's 'driver's directory. 
-  For example:
-
-/sys/bus/pci/drivers/
-|-- 3c59x
-|-- Ensoniq AudioPCI
-|-- agpgart-amdk7
-|-- e100
-`-- serial
-
-
-Step 4: Define Generic Methods for Drivers.
-
-struct device_driver defines a set of operations that the driver model
-core calls. Most of these operations are probably similar to
-operations the bus already defines for drivers, but taking different
-parameters. 
-
-It would be difficult and tedious to force every driver on a bus to
-simultaneously convert their drivers to generic format. Instead, the
-bus driver should define single instances of the generic methods that
-forward call to the bus-specific drivers. For instance: 
-
-
-static int pci_device_remove(struct device * dev)
-{
-        struct pci_dev * pci_dev = to_pci_dev(dev);
-        struct pci_driver * drv = pci_dev->driver;
-
-        if (drv) {
-                if (drv->remove)
-                        drv->remove(pci_dev);
-                pci_dev->driver = NULL;
-        }
-        return 0;
-}
-
-
-The generic driver should be initialized with these methods before it
-is registered. 
-
-        /* initialize common driver fields */
-        drv->driver.name = drv->name;
-        drv->driver.bus = &pci_bus_type;
-        drv->driver.probe = pci_device_probe;
-        drv->driver.resume = pci_device_resume;
-        drv->driver.suspend = pci_device_suspend;
-        drv->driver.remove = pci_device_remove;
-
-        /* register with core */
-        driver_register(&drv->driver);
-
-
-Ideally, the bus should only initialize the fields if they are not
-already set. This allows the drivers to implement their own generic
-methods. 
-
-
-Step 5: Support generic driver binding. 
-
-The model assumes that a device or driver can be dynamically
-registered with the bus at any time. When registration happens,
-devices must be bound to a driver, or drivers must be bound to all
-devices that it supports. 
-
-A driver typically contains a list of device IDs that it supports. The
-bus driver compares these IDs to the IDs of devices registered with it. 
-The format of the device IDs, and the semantics for comparing them are
-bus-specific, so the generic model does attempt to generalize them. 
-
-Instead, a bus may supply a method in struct bus_type that does the
-comparison: 
-
-  int (*match)(struct device * dev, struct device_driver * drv);
-
-match should return positive value if the driver supports the device,
-and zero otherwise. It may also return error code (for example
--EPROBE_DEFER) if determining that given driver supports the device is
-not possible.
-
-When a device is registered, the bus's list of drivers is iterated
-over. bus->match() is called for each one until a match is found. 
-
-When a driver is registered, the bus's list of devices is iterated
-over. bus->match() is called for each device that is not already
-claimed by a driver. 
-
-When a device is successfully bound to a driver, device->driver is
-set, the device is added to a per-driver list of devices, and a
-symlink is created in the driver's sysfs directory that points to the
-device's physical directory:
-
-/sys/bus/pci/drivers/
-|-- 3c59x
-|   `-- 00:0b.0 -> ../../../../devices/pci0/00:0b.0
-|-- Ensoniq AudioPCI
-|-- agpgart-amdk7
-|   `-- 00:00.0 -> ../../../../devices/pci0/00:00.0
-|-- e100
-|   `-- 00:0c.0 -> ../../../../devices/pci0/00:0c.0
-`-- serial
-
-
-This driver binding should replace the existing driver binding
-mechanism the bus currently uses. 
-
-
-Step 6: Supply a hotplug callback.
-
-Whenever a device is registered with the driver model core, the
-userspace program /sbin/hotplug is called to notify userspace. 
-Users can define actions to perform when a device is inserted or
-removed. 
-
-The driver model core passes several arguments to userspace via
-environment variables, including
-
-- ACTION: set to 'add' or 'remove'
-- DEVPATH: set to the device's physical path in sysfs. 
-
-A bus driver may also supply additional parameters for userspace to
-consume. To do this, a bus must implement the 'hotplug' method in
-struct bus_type:
-
-     int (*hotplug) (struct device *dev, char **envp, 
-                     int num_envp, char *buffer, int buffer_size);
-
-This is called immediately before /sbin/hotplug is executed. 
-
-
-Step 7: Cleaning up the bus driver.
-
-The generic bus, device, and driver structures provide several fields
-that can replace those defined privately to the bus driver. 
-
-- Device list.
-
-struct bus_type contains a list of all devices registered with the bus
-type. This includes all devices on all instances of that bus type.
-An internal list that the bus uses may be removed, in favor of using
-this one.
-
-The core provides an iterator to access these devices. 
-
-int bus_for_each_dev(struct bus_type * bus, struct device * start, 
-                     void * data, int (*fn)(struct device *, void *));
-
-
-- Driver list.
-
-struct bus_type also contains a list of all drivers registered with
-it. An internal list of drivers that the bus driver maintains may 
-be removed in favor of using the generic one. 
-
-The drivers may be iterated over, like devices: 
-
-int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
-                     void * data, int (*fn)(struct device_driver *, void *));
-
-
-Please see drivers/base/bus.c for more information.
-
-
-- rwsem 
-
-struct bus_type contains an rwsem that protects all core accesses to
-the device and driver lists. This can be used by the bus driver
-internally, and should be used when accessing the device or driver
-lists the bus maintains. 
-
-
-- Device and driver fields. 
-
-Some of the fields in struct device and struct device_driver duplicate
-fields in the bus-specific representations of these objects. Feel free
-to remove the bus-specific ones and favor the generic ones. Note
-though, that this will likely mean fixing up all the drivers that
-reference the bus-specific fields (though those should all be 1-line
-changes).
-
diff --git a/Documentation/early-userspace/README b/Documentation/early-userspace/README
deleted file mode 100644
index 955d667dc87e..000000000000
--- a/Documentation/early-userspace/README
+++ /dev/null
@@ -1,151 +0,0 @@
-Early userspace support
-=======================
-
-Last update: 2004-12-20 tlh
-
-
-"Early userspace" is a set of libraries and programs that provide
-various pieces of functionality that are important enough to be
-available while a Linux kernel is coming up, but that don't need to be
-run inside the kernel itself.
-
-It consists of several major infrastructure components:
-
-- gen_init_cpio, a program that builds a cpio-format archive
-  containing a root filesystem image.  This archive is compressed, and
-  the compressed image is linked into the kernel image.
-- initramfs, a chunk of code that unpacks the compressed cpio image
-  midway through the kernel boot process.
-- klibc, a userspace C library, currently packaged separately, that is
-  optimized for correctness and small size.
-
-The cpio file format used by initramfs is the "newc" (aka "cpio -H newc")
-format, and is documented in the file "buffer-format.txt".  There are
-two ways to add an early userspace image: specify an existing cpio
-archive to be used as the image or have the kernel build process build
-the image from specifications.
-
-CPIO ARCHIVE method
-
-You can create a cpio archive that contains the early userspace image.
-Your cpio archive should be specified in CONFIG_INITRAMFS_SOURCE and it
-will be used directly.  Only a single cpio file may be specified in
-CONFIG_INITRAMFS_SOURCE and directory and file names are not allowed in
-combination with a cpio archive.
-
-IMAGE BUILDING method
-
-The kernel build process can also build an early userspace image from
-source parts rather than supplying a cpio archive.  This method provides
-a way to create images with root-owned files even though the image was
-built by an unprivileged user.
-
-The image is specified as one or more sources in
-CONFIG_INITRAMFS_SOURCE.  Sources can be either directories or files -
-cpio archives are *not* allowed when building from sources.
-
-A source directory will have it and all of its contents packaged.  The
-specified directory name will be mapped to '/'.  When packaging a
-directory, limited user and group ID translation can be performed.
-INITRAMFS_ROOT_UID can be set to a user ID that needs to be mapped to
-user root (0).  INITRAMFS_ROOT_GID can be set to a group ID that needs
-to be mapped to group root (0).
-
-A source file must be directives in the format required by the
-usr/gen_init_cpio utility (run 'usr/gen_init_cpio -h' to get the
-file format).  The directives in the file will be passed directly to
-usr/gen_init_cpio.
-
-When a combination of directories and files are specified then the
-initramfs image will be an aggregate of all of them.  In this way a user
-can create a 'root-image' directory and install all files into it.
-Because device-special files cannot be created by a unprivileged user,
-special files can be listed in a 'root-files' file.  Both 'root-image'
-and 'root-files' can be listed in CONFIG_INITRAMFS_SOURCE and a complete
-early userspace image can be built by an unprivileged user.
-
-As a technical note, when directories and files are specified, the
-entire CONFIG_INITRAMFS_SOURCE is passed to
-usr/gen_initramfs_list.sh.  This means that CONFIG_INITRAMFS_SOURCE
-can really be interpreted as any legal argument to
-gen_initramfs_list.sh.  If a directory is specified as an argument then
-the contents are scanned, uid/gid translation is performed, and
-usr/gen_init_cpio file directives are output.  If a directory is
-specified as an argument to usr/gen_initramfs_list.sh then the
-contents of the file are simply copied to the output.  All of the output
-directives from directory scanning and file contents copying are
-processed by usr/gen_init_cpio.
-
-See also 'usr/gen_initramfs_list.sh -h'.
-
-Where's this all leading?
-=========================
-
-The klibc distribution contains some of the necessary software to make
-early userspace useful.  The klibc distribution is currently
-maintained separately from the kernel.
-
-You can obtain somewhat infrequent snapshots of klibc from
-https://www.kernel.org/pub/linux/libs/klibc/
-
-For active users, you are better off using the klibc git
-repository, at http://git.kernel.org/?p=libs/klibc/klibc.git
-
-The standalone klibc distribution currently provides three components,
-in addition to the klibc library:
-
-- ipconfig, a program that configures network interfaces.  It can
-  configure them statically, or use DHCP to obtain information
-  dynamically (aka "IP autoconfiguration").
-- nfsmount, a program that can mount an NFS filesystem.
-- kinit, the "glue" that uses ipconfig and nfsmount to replace the old
-  support for IP autoconfig, mount a filesystem over NFS, and continue
-  system boot using that filesystem as root.
-
-kinit is built as a single statically linked binary to save space.
-
-Eventually, several more chunks of kernel functionality will hopefully
-move to early userspace:
-
-- Almost all of init/do_mounts* (the beginning of this is already in
-  place)
-- ACPI table parsing
-- Insert unwieldy subsystem that doesn't really need to be in kernel
-  space here
-
-If kinit doesn't meet your current needs and you've got bytes to burn,
-the klibc distribution includes a small Bourne-compatible shell (ash)
-and a number of other utilities, so you can replace kinit and build
-custom initramfs images that meet your needs exactly.
-
-For questions and help, you can sign up for the early userspace
-mailing list at http://www.zytor.com/mailman/listinfo/klibc
-
-How does it work?
-=================
-
-The kernel has currently 3 ways to mount the root filesystem:
-
-a) all required device and filesystem drivers compiled into the kernel, no
-   initrd.  init/main.c:init() will call prepare_namespace() to mount the
-   final root filesystem, based on the root= option and optional init= to run
-   some other init binary than listed at the end of init/main.c:init().
-
-b) some device and filesystem drivers built as modules and stored in an
-   initrd.  The initrd must contain a binary '/linuxrc' which is supposed to
-   load these driver modules.  It is also possible to mount the final root
-   filesystem via linuxrc and use the pivot_root syscall.  The initrd is
-   mounted and executed via prepare_namespace().
-
-c) using initramfs.  The call to prepare_namespace() must be skipped.
-   This means that a binary must do all the work.  Said binary can be stored
-   into initramfs either via modifying usr/gen_init_cpio.c or via the new
-   initrd format, an cpio archive.  It must be called "/init".  This binary
-   is responsible to do all the things prepare_namespace() would do.
-
-   To maintain backwards compatibility, the /init binary will only run if it
-   comes via an initramfs cpio archive.  If this is not the case,
-   init/main.c:init() will run prepare_namespace() to mount the final root
-   and exec one of the predefined init binaries.
-
-Bryan O'Sullivan <bos@serpentine.com>
diff --git a/Documentation/early-userspace/buffer-format.txt b/Documentation/early-userspace/buffer-format.txt
deleted file mode 100644
index e1fd7f9dad16..000000000000
--- a/Documentation/early-userspace/buffer-format.txt
+++ /dev/null
@@ -1,112 +0,0 @@
-		       initramfs buffer format
-		       -----------------------
-
-		       Al Viro, H. Peter Anvin
-		      Last revision: 2002-01-13
-
-Starting with kernel 2.5.x, the old "initial ramdisk" protocol is
-getting {replaced/complemented} with the new "initial ramfs"
-(initramfs) protocol.  The initramfs contents is passed using the same
-memory buffer protocol used by the initrd protocol, but the contents
-is different.  The initramfs buffer contains an archive which is
-expanded into a ramfs filesystem; this document details the format of
-the initramfs buffer format.
-
-The initramfs buffer format is based around the "newc" or "crc" CPIO
-formats, and can be created with the cpio(1) utility.  The cpio
-archive can be compressed using gzip(1).  One valid version of an
-initramfs buffer is thus a single .cpio.gz file.
-
-The full format of the initramfs buffer is defined by the following
-grammar, where:
-	*	is used to indicate "0 or more occurrences of"
-	(|)	indicates alternatives
-	+	indicates concatenation
-	GZIP()	indicates the gzip(1) of the operand
-	ALGN(n)	means padding with null bytes to an n-byte boundary
-
-	initramfs  := ("\0" | cpio_archive | cpio_gzip_archive)*
-
-	cpio_gzip_archive := GZIP(cpio_archive)
-
-	cpio_archive := cpio_file* + (<nothing> | cpio_trailer)
-
-	cpio_file := ALGN(4) + cpio_header + filename + "\0" + ALGN(4) + data
-
-	cpio_trailer := ALGN(4) + cpio_header + "TRAILER!!!\0" + ALGN(4)
-
-
-In human terms, the initramfs buffer contains a collection of
-compressed and/or uncompressed cpio archives (in the "newc" or "crc"
-formats); arbitrary amounts zero bytes (for padding) can be added
-between members.
-
-The cpio "TRAILER!!!" entry (cpio end-of-archive) is optional, but is
-not ignored; see "handling of hard links" below.
-
-The structure of the cpio_header is as follows (all fields contain
-hexadecimal ASCII numbers fully padded with '0' on the left to the
-full width of the field, for example, the integer 4780 is represented
-by the ASCII string "000012ac"):
-
-Field name    Field size	 Meaning
-c_magic	      6 bytes		 The string "070701" or "070702"
-c_ino	      8 bytes		 File inode number
-c_mode	      8 bytes		 File mode and permissions
-c_uid	      8 bytes		 File uid
-c_gid	      8 bytes		 File gid
-c_nlink	      8 bytes		 Number of links
-c_mtime	      8 bytes		 Modification time
-c_filesize    8 bytes		 Size of data field
-c_maj	      8 bytes		 Major part of file device number
-c_min	      8 bytes		 Minor part of file device number
-c_rmaj	      8 bytes		 Major part of device node reference
-c_rmin	      8 bytes		 Minor part of device node reference
-c_namesize    8 bytes		 Length of filename, including final \0
-c_chksum      8 bytes		 Checksum of data field if c_magic is 070702;
-				 otherwise zero
-
-The c_mode field matches the contents of st_mode returned by stat(2)
-on Linux, and encodes the file type and file permissions.
-
-The c_filesize should be zero for any file which is not a regular file
-or symlink.
-
-The c_chksum field contains a simple 32-bit unsigned sum of all the
-bytes in the data field.  cpio(1) refers to this as "crc", which is
-clearly incorrect (a cyclic redundancy check is a different and
-significantly stronger integrity check), however, this is the
-algorithm used.
-
-If the filename is "TRAILER!!!" this is actually an end-of-archive
-marker; the c_filesize for an end-of-archive marker must be zero.
-
-
-*** Handling of hard links
-
-When a nondirectory with c_nlink > 1 is seen, the (c_maj,c_min,c_ino)
-tuple is looked up in a tuple buffer.  If not found, it is entered in
-the tuple buffer and the entry is created as usual; if found, a hard
-link rather than a second copy of the file is created.  It is not
-necessary (but permitted) to include a second copy of the file
-contents; if the file contents is not included, the c_filesize field
-should be set to zero to indicate no data section follows.  If data is
-present, the previous instance of the file is overwritten; this allows
-the data-carrying instance of a file to occur anywhere in the sequence
-(GNU cpio is reported to attach the data to the last instance of a
-file only.)
-
-c_filesize must not be zero for a symlink.
-
-When a "TRAILER!!!" end-of-archive marker is seen, the tuple buffer is
-reset.  This permits archives which are generated independently to be
-concatenated.
-
-To combine file data from different sources (without having to
-regenerate the (c_maj,c_min,c_ino) fields), therefore, either one of
-the following techniques can be used:
-
-a) Separate the different file data sources with a "TRAILER!!!"
-   end-of-archive marker, or
-
-b) Make sure c_nlink == 1 for all nondirectory entries.
diff --git a/Documentation/eisa.txt b/Documentation/eisa.txt
deleted file mode 100644
index 2806e5544e43..000000000000
--- a/Documentation/eisa.txt
+++ /dev/null
@@ -1,230 +0,0 @@
-================
-EISA bus support
-================
-
-:Author: Marc Zyngier <maz@wild-wind.fr.eu.org>
-
-This document groups random notes about porting EISA drivers to the
-new EISA/sysfs API.
-
-Starting from version 2.5.59, the EISA bus is almost given the same
-status as other much more mainstream busses such as PCI or USB. This
-has been possible through sysfs, which defines a nice enough set of
-abstractions to manage busses, devices and drivers.
-
-Although the new API is quite simple to use, converting existing
-drivers to the new infrastructure is not an easy task (mostly because
-detection code is generally also used to probe ISA cards). Moreover,
-most EISA drivers are among the oldest Linux drivers so, as you can
-imagine, some dust has settled here over the years.
-
-The EISA infrastructure is made up of three parts:
-
-    - The bus code implements most of the generic code. It is shared
-      among all the architectures that the EISA code runs on. It
-      implements bus probing (detecting EISA cards available on the bus),
-      allocates I/O resources, allows fancy naming through sysfs, and
-      offers interfaces for driver to register.
-
-    - The bus root driver implements the glue between the bus hardware
-      and the generic bus code. It is responsible for discovering the
-      device implementing the bus, and setting it up to be latter probed
-      by the bus code. This can go from something as simple as reserving
-      an I/O region on x86, to the rather more complex, like the hppa
-      EISA code. This is the part to implement in order to have EISA
-      running on an "new" platform.
-
-    - The driver offers the bus a list of devices that it manages, and
-      implements the necessary callbacks to probe and release devices
-      whenever told to.
-
-Every function/structure below lives in <linux/eisa.h>, which depends
-heavily on <linux/device.h>.
-
-Bus root driver
-===============
-
-::
-
-	int eisa_root_register (struct eisa_root_device *root);
-
-The eisa_root_register function is used to declare a device as the
-root of an EISA bus. The eisa_root_device structure holds a reference
-to this device, as well as some parameters for probing purposes::
-
-	struct eisa_root_device {
-		struct device   *dev;	 /* Pointer to bridge device */
-		struct resource *res;
-		unsigned long    bus_base_addr;
-		int		 slots;  /* Max slot number */
-		int		 force_probe; /* Probe even when no slot 0 */
-		u64		 dma_mask; /* from bridge device */
-		int              bus_nr; /* Set by eisa_root_register */
-		struct resource  eisa_root_res;	/* ditto */
-	};
-
-============= ======================================================
-node          used for eisa_root_register internal purpose
-dev           pointer to the root device
-res           root device I/O resource
-bus_base_addr slot 0 address on this bus
-slots	      max slot number to probe
-force_probe   Probe even when slot 0 is empty (no EISA mainboard)
-dma_mask      Default DMA mask. Usually the bridge device dma_mask.
-bus_nr	      unique bus id, set by eisa_root_register
-============= ======================================================
-
-Driver
-======
-
-::
-
-	int eisa_driver_register (struct eisa_driver *edrv);
-	void eisa_driver_unregister (struct eisa_driver *edrv);
-
-Clear enough ?
-
-::
-
-	struct eisa_device_id {
-		char sig[EISA_SIG_LEN];
-		unsigned long driver_data;
-	};
-
-	struct eisa_driver {
-		const struct eisa_device_id *id_table;
-		struct device_driver         driver;
-	};
-
-=============== ====================================================
-id_table	an array of NULL terminated EISA id strings,
-		followed by an empty string. Each string can
-		optionally be paired with a driver-dependent value
-		(driver_data).
-
-driver		a generic driver, such as described in
-		Documentation/driver-model/driver.txt. Only .name,
-		.probe and .remove members are mandatory.
-=============== ====================================================
-
-An example is the 3c59x driver::
-
-	static struct eisa_device_id vortex_eisa_ids[] = {
-		{ "TCM5920", EISA_3C592_OFFSET },
-		{ "TCM5970", EISA_3C597_OFFSET },
-		{ "" }
-	};
-
-	static struct eisa_driver vortex_eisa_driver = {
-		.id_table = vortex_eisa_ids,
-		.driver   = {
-			.name    = "3c59x",
-			.probe   = vortex_eisa_probe,
-			.remove  = vortex_eisa_remove
-		}
-	};
-
-Device
-======
-
-The sysfs framework calls .probe and .remove functions upon device
-discovery and removal (note that the .remove function is only called
-when driver is built as a module).
-
-Both functions are passed a pointer to a 'struct device', which is
-encapsulated in a 'struct eisa_device' described as follows::
-
-	struct eisa_device {
-		struct eisa_device_id id;
-		int                   slot;
-		int                   state;
-		unsigned long         base_addr;
-		struct resource       res[EISA_MAX_RESOURCES];
-		u64                   dma_mask;
-		struct device         dev; /* generic device */
-	};
-
-======== ============================================================
-id	 EISA id, as read from device. id.driver_data is set from the
-	 matching driver EISA id.
-slot	 slot number which the device was detected on
-state    set of flags indicating the state of the device. Current
-	 flags are EISA_CONFIG_ENABLED and EISA_CONFIG_FORCED.
-res	 set of four 256 bytes I/O regions allocated to this device
-dma_mask DMA mask set from the parent device.
-dev	 generic device (see Documentation/driver-model/device.txt)
-======== ============================================================
-
-You can get the 'struct eisa_device' from 'struct device' using the
-'to_eisa_device' macro.
-
-Misc stuff
-==========
-
-::
-
-	void eisa_set_drvdata (struct eisa_device *edev, void *data);
-
-Stores data into the device's driver_data area.
-
-::
-
-	void *eisa_get_drvdata (struct eisa_device *edev):
-
-Gets the pointer previously stored into the device's driver_data area.
-
-::
-
-	int eisa_get_region_index (void *addr);
-
-Returns the region number (0 <= x < EISA_MAX_RESOURCES) of a given
-address.
-
-Kernel parameters
-=================
-
-eisa_bus.enable_dev
-	A comma-separated list of slots to be enabled, even if the firmware
-	set the card as disabled. The driver must be able to properly
-	initialize the device in such conditions.
-
-eisa_bus.disable_dev
-	A comma-separated list of slots to be enabled, even if the firmware
-	set the card as enabled. The driver won't be called to handle this
-	device.
-
-virtual_root.force_probe
-	Force the probing code to probe EISA slots even when it cannot find an
-	EISA compliant mainboard (nothing appears on slot 0). Defaults to 0
-	(don't force), and set to 1 (force probing) when either
-	CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set.
-
-Random notes
-============
-
-Converting an EISA driver to the new API mostly involves *deleting*
-code (since probing is now in the core EISA code). Unfortunately, most
-drivers share their probing routine between ISA, and EISA. Special
-care must be taken when ripping out the EISA code, so other busses
-won't suffer from these surgical strikes...
-
-You *must not* expect any EISA device to be detected when returning
-from eisa_driver_register, since the chances are that the bus has not
-yet been probed. In fact, that's what happens most of the time (the
-bus root driver usually kicks in rather late in the boot process).
-Unfortunately, most drivers are doing the probing by themselves, and
-expect to have explored the whole machine when they exit their probe
-routine.
-
-For example, switching your favorite EISA SCSI card to the "hotplug"
-model is "the right thing"(tm).
-
-Thanks
-======
-
-I'd like to thank the following people for their help:
-
-- Xavier Benigni for lending me a wonderful Alpha Jensen,
-- James Bottomley, Jeff Garzik for getting this stuff into the kernel,
-- Andries Brouwer for contributing numerous EISA ids,
-- Catrin Jones for coping with far too many machines at home.
diff --git a/Documentation/extcon/intel-int3496.txt b/Documentation/extcon/intel-int3496.txt
deleted file mode 100644
index 8155dbc7fad3..000000000000
--- a/Documentation/extcon/intel-int3496.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-Intel INT3496 ACPI device extcon driver documentation
------------------------------------------------------
-
-The Intel INT3496 ACPI device extcon driver is a driver for ACPI
-devices with an acpi-id of INT3496, such as found for example on
-Intel Baytrail and Cherrytrail tablets.
-
-This ACPI device describes how the OS can read the id-pin of the devices'
-USB-otg port, as well as how it optionally can enable Vbus output on the
-otg port and how it can optionally control the muxing of the data pins
-between an USB host and an USB peripheral controller.
-
-The ACPI devices exposes this functionality by returning an array with up
-to 3 gpio descriptors from its ACPI _CRS (Current Resource Settings) call:
-
-Index 0: The input gpio for the id-pin, this is always present and valid
-Index 1: The output gpio for enabling Vbus output from the device to the otg
-         port, write 1 to enable the Vbus output (this gpio descriptor may
-         be absent or invalid)
-Index 2: The output gpio for muxing of the data pins between the USB host and
-         the USB peripheral controller, write 1 to mux to the peripheral
-         controller
-
-There is a mapping between indices and GPIO connection IDs as follows
-	id	index 0
-	vbus	index 1
-	mux	index 2
diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst
new file mode 100644
index 000000000000..f51bb21d20e4
--- /dev/null
+++ b/Documentation/fault-injection/fault-injection.rst
@@ -0,0 +1,446 @@
+===========================================
+Fault injection capabilities infrastructure
+===========================================
+
+See also drivers/md/md-faulty.c and "every_nth" module option for scsi_debug.
+
+
+Available fault injection capabilities
+--------------------------------------
+
+- failslab
+
+  injects slab allocation failures. (kmalloc(), kmem_cache_alloc(), ...)
+
+- fail_page_alloc
+
+  injects page allocation failures. (alloc_pages(), get_free_pages(), ...)
+
+- fail_futex
+
+  injects futex deadlock and uaddr fault errors.
+
+- fail_make_request
+
+  injects disk IO errors on devices permitted by setting
+  /sys/block/<device>/make-it-fail or
+  /sys/block/<device>/<partition>/make-it-fail. (generic_make_request())
+
+- fail_mmc_request
+
+  injects MMC data errors on devices permitted by setting
+  debugfs entries under /sys/kernel/debug/mmc0/fail_mmc_request
+
+- fail_function
+
+  injects error return on specific functions, which are marked by
+  ALLOW_ERROR_INJECTION() macro, by setting debugfs entries
+  under /sys/kernel/debug/fail_function. No boot option supported.
+
+- NVMe fault injection
+
+  inject NVMe status code and retry flag on devices permitted by setting
+  debugfs entries under /sys/kernel/debug/nvme*/fault_inject. The default
+  status code is NVME_SC_INVALID_OPCODE with no retry. The status code and
+  retry flag can be set via the debugfs.
+
+
+Configure fault-injection capabilities behavior
+-----------------------------------------------
+
+debugfs entries
+^^^^^^^^^^^^^^^
+
+fault-inject-debugfs kernel module provides some debugfs entries for runtime
+configuration of fault-injection capabilities.
+
+- /sys/kernel/debug/fail*/probability:
+
+	likelihood of failure injection, in percent.
+
+	Format: <percent>
+
+	Note that one-failure-per-hundred is a very high error rate
+	for some testcases.  Consider setting probability=100 and configure
+	/sys/kernel/debug/fail*/interval for such testcases.
+
+- /sys/kernel/debug/fail*/interval:
+
+	specifies the interval between failures, for calls to
+	should_fail() that pass all the other tests.
+
+	Note that if you enable this, by setting interval>1, you will
+	probably want to set probability=100.
+
+- /sys/kernel/debug/fail*/times:
+
+	specifies how many times failures may happen at most.
+	A value of -1 means "no limit".
+
+- /sys/kernel/debug/fail*/space:
+
+	specifies an initial resource "budget", decremented by "size"
+	on each call to should_fail(,size).  Failure injection is
+	suppressed until "space" reaches zero.
+
+- /sys/kernel/debug/fail*/verbose
+
+	Format: { 0 | 1 | 2 }
+
+	specifies the verbosity of the messages when failure is
+	injected.  '0' means no messages; '1' will print only a single
+	log line per failure; '2' will print a call trace too -- useful
+	to debug the problems revealed by fault injection.
+
+- /sys/kernel/debug/fail*/task-filter:
+
+	Format: { 'Y' | 'N' }
+
+	A value of 'N' disables filtering by process (default).
+	Any positive value limits failures to only processes indicated by
+	/proc/<pid>/make-it-fail==1.
+
+- /sys/kernel/debug/fail*/require-start,
+  /sys/kernel/debug/fail*/require-end,
+  /sys/kernel/debug/fail*/reject-start,
+  /sys/kernel/debug/fail*/reject-end:
+
+	specifies the range of virtual addresses tested during
+	stacktrace walking.  Failure is injected only if some caller
+	in the walked stacktrace lies within the required range, and
+	none lies within the rejected range.
+	Default required range is [0,ULONG_MAX) (whole of virtual address space).
+	Default rejected range is [0,0).
+
+- /sys/kernel/debug/fail*/stacktrace-depth:
+
+	specifies the maximum stacktrace depth walked during search
+	for a caller within [require-start,require-end) OR
+	[reject-start,reject-end).
+
+- /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem:
+
+	Format: { 'Y' | 'N' }
+
+	default is 'N', setting it to 'Y' won't inject failures into
+	highmem/user allocations.
+
+- /sys/kernel/debug/failslab/ignore-gfp-wait:
+- /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait:
+
+	Format: { 'Y' | 'N' }
+
+	default is 'N', setting it to 'Y' will inject failures
+	only into non-sleep allocations (GFP_ATOMIC allocations).
+
+- /sys/kernel/debug/fail_page_alloc/min-order:
+
+	specifies the minimum page allocation order to be injected
+	failures.
+
+- /sys/kernel/debug/fail_futex/ignore-private:
+
+	Format: { 'Y' | 'N' }
+
+	default is 'N', setting it to 'Y' will disable failure injections
+	when dealing with private (address space) futexes.
+
+- /sys/kernel/debug/fail_function/inject:
+
+	Format: { 'function-name' | '!function-name' | '' }
+
+	specifies the target function of error injection by name.
+	If the function name leads '!' prefix, given function is
+	removed from injection list. If nothing specified ('')
+	injection list is cleared.
+
+- /sys/kernel/debug/fail_function/injectable:
+
+	(read only) shows error injectable functions and what type of
+	error values can be specified. The error type will be one of
+	below;
+	- NULL:	retval must be 0.
+	- ERRNO: retval must be -1 to -MAX_ERRNO (-4096).
+	- ERR_NULL: retval must be 0 or -1 to -MAX_ERRNO (-4096).
+
+- /sys/kernel/debug/fail_function/<functiuon-name>/retval:
+
+	specifies the "error" return value to inject to the given
+	function for given function. This will be created when
+	user specifies new injection entry.
+
+Boot option
+^^^^^^^^^^^
+
+In order to inject faults while debugfs is not available (early boot time),
+use the boot option::
+
+	failslab=
+	fail_page_alloc=
+	fail_make_request=
+	fail_futex=
+	mmc_core.fail_request=<interval>,<probability>,<space>,<times>
+
+proc entries
+^^^^^^^^^^^^
+
+- /proc/<pid>/fail-nth,
+  /proc/self/task/<tid>/fail-nth:
+
+	Write to this file of integer N makes N-th call in the task fail.
+	Read from this file returns a integer value. A value of '0' indicates
+	that the fault setup with a previous write to this file was injected.
+	A positive integer N indicates that the fault wasn't yet injected.
+	Note that this file enables all types of faults (slab, futex, etc).
+	This setting takes precedence over all other generic debugfs settings
+	like probability, interval, times, etc. But per-capability settings
+	(e.g. fail_futex/ignore-private) take precedence over it.
+
+	This feature is intended for systematic testing of faults in a single
+	system call. See an example below.
+
+How to add new fault injection capability
+-----------------------------------------
+
+- #include <linux/fault-inject.h>
+
+- define the fault attributes
+
+  DECLARE_FAULT_ATTR(name);
+
+  Please see the definition of struct fault_attr in fault-inject.h
+  for details.
+
+- provide a way to configure fault attributes
+
+- boot option
+
+  If you need to enable the fault injection capability from boot time, you can
+  provide boot option to configure it. There is a helper function for it:
+
+	setup_fault_attr(attr, str);
+
+- debugfs entries
+
+  failslab, fail_page_alloc, and fail_make_request use this way.
+  Helper functions:
+
+	fault_create_debugfs_attr(name, parent, attr);
+
+- module parameters
+
+  If the scope of the fault injection capability is limited to a
+  single kernel module, it is better to provide module parameters to
+  configure the fault attributes.
+
+- add a hook to insert failures
+
+  Upon should_fail() returning true, client code should inject a failure:
+
+	should_fail(attr, size);
+
+Application Examples
+--------------------
+
+- Inject slab allocation failures into module init/exit code::
+
+    #!/bin/bash
+
+    FAILTYPE=failslab
+    echo Y > /sys/kernel/debug/$FAILTYPE/task-filter
+    echo 10 > /sys/kernel/debug/$FAILTYPE/probability
+    echo 100 > /sys/kernel/debug/$FAILTYPE/interval
+    echo -1 > /sys/kernel/debug/$FAILTYPE/times
+    echo 0 > /sys/kernel/debug/$FAILTYPE/space
+    echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
+    echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
+
+    faulty_system()
+    {
+	bash -c "echo 1 > /proc/self/make-it-fail && exec $*"
+    }
+
+    if [ $# -eq 0 ]
+    then
+	echo "Usage: $0 modulename [ modulename ... ]"
+	exit 1
+    fi
+
+    for m in $*
+    do
+	echo inserting $m...
+	faulty_system modprobe $m
+
+	echo removing $m...
+	faulty_system modprobe -r $m
+    done
+
+------------------------------------------------------------------------------
+
+- Inject page allocation failures only for a specific module::
+
+    #!/bin/bash
+
+    FAILTYPE=fail_page_alloc
+    module=$1
+
+    if [ -z $module ]
+    then
+	echo "Usage: $0 <modulename>"
+	exit 1
+    fi
+
+    modprobe $module
+
+    if [ ! -d /sys/module/$module/sections ]
+    then
+	echo Module $module is not loaded
+	exit 1
+    fi
+
+    cat /sys/module/$module/sections/.text > /sys/kernel/debug/$FAILTYPE/require-start
+    cat /sys/module/$module/sections/.data > /sys/kernel/debug/$FAILTYPE/require-end
+
+    echo N > /sys/kernel/debug/$FAILTYPE/task-filter
+    echo 10 > /sys/kernel/debug/$FAILTYPE/probability
+    echo 100 > /sys/kernel/debug/$FAILTYPE/interval
+    echo -1 > /sys/kernel/debug/$FAILTYPE/times
+    echo 0 > /sys/kernel/debug/$FAILTYPE/space
+    echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
+    echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
+    echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem
+    echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth
+
+    trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT
+
+    echo "Injecting errors into the module $module... (interrupt to stop)"
+    sleep 1000000
+
+------------------------------------------------------------------------------
+
+- Inject open_ctree error while btrfs mount::
+
+    #!/bin/bash
+
+    rm -f testfile.img
+    dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+    DEVICE=$(losetup --show -f testfile.img)
+    mkfs.btrfs -f $DEVICE
+    mkdir -p tmpmnt
+
+    FAILTYPE=fail_function
+    FAILFUNC=open_ctree
+    echo $FAILFUNC > /sys/kernel/debug/$FAILTYPE/inject
+    echo -12 > /sys/kernel/debug/$FAILTYPE/$FAILFUNC/retval
+    echo N > /sys/kernel/debug/$FAILTYPE/task-filter
+    echo 100 > /sys/kernel/debug/$FAILTYPE/probability
+    echo 0 > /sys/kernel/debug/$FAILTYPE/interval
+    echo -1 > /sys/kernel/debug/$FAILTYPE/times
+    echo 0 > /sys/kernel/debug/$FAILTYPE/space
+    echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
+
+    mount -t btrfs $DEVICE tmpmnt
+    if [ $? -ne 0 ]
+    then
+	echo "SUCCESS!"
+    else
+	echo "FAILED!"
+	umount tmpmnt
+    fi
+
+    echo > /sys/kernel/debug/$FAILTYPE/inject
+
+    rmdir tmpmnt
+    losetup -d $DEVICE
+    rm testfile.img
+
+
+Tool to run command with failslab or fail_page_alloc
+----------------------------------------------------
+In order to make it easier to accomplish the tasks mentioned above, we can use
+tools/testing/fault-injection/failcmd.sh.  Please run a command
+"./tools/testing/fault-injection/failcmd.sh --help" for more information and
+see the following examples.
+
+Examples:
+
+Run a command "make -C tools/testing/selftests/ run_tests" with injecting slab
+allocation failure::
+
+	# ./tools/testing/fault-injection/failcmd.sh \
+		-- make -C tools/testing/selftests/ run_tests
+
+Same as above except to specify 100 times failures at most instead of one time
+at most by default::
+
+	# ./tools/testing/fault-injection/failcmd.sh --times=100 \
+		-- make -C tools/testing/selftests/ run_tests
+
+Same as above except to inject page allocation failure instead of slab
+allocation failure::
+
+	# env FAILCMD_TYPE=fail_page_alloc \
+		./tools/testing/fault-injection/failcmd.sh --times=100 \
+		-- make -C tools/testing/selftests/ run_tests
+
+Systematic faults using fail-nth
+---------------------------------
+
+The following code systematically faults 0-th, 1-st, 2-nd and so on
+capabilities in the socketpair() system call::
+
+  #include <sys/types.h>
+  #include <sys/stat.h>
+  #include <sys/socket.h>
+  #include <sys/syscall.h>
+  #include <fcntl.h>
+  #include <unistd.h>
+  #include <string.h>
+  #include <stdlib.h>
+  #include <stdio.h>
+  #include <errno.h>
+
+  int main()
+  {
+	int i, err, res, fail_nth, fds[2];
+	char buf[128];
+
+	system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait");
+	sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid));
+	fail_nth = open(buf, O_RDWR);
+	for (i = 1;; i++) {
+		sprintf(buf, "%d", i);
+		write(fail_nth, buf, strlen(buf));
+		res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds);
+		err = errno;
+		pread(fail_nth, buf, sizeof(buf), 0);
+		if (res == 0) {
+			close(fds[0]);
+			close(fds[1]);
+		}
+		printf("%d-th fault %c: res=%d/%d\n", i, atoi(buf) ? 'N' : 'Y',
+			res, err);
+		if (atoi(buf))
+			break;
+	}
+	return 0;
+  }
+
+An example output::
+
+	1-th fault Y: res=-1/23
+	2-th fault Y: res=-1/23
+	3-th fault Y: res=-1/12
+	4-th fault Y: res=-1/12
+	5-th fault Y: res=-1/23
+	6-th fault Y: res=-1/23
+	7-th fault Y: res=-1/23
+	8-th fault Y: res=-1/12
+	9-th fault Y: res=-1/12
+	10-th fault Y: res=-1/12
+	11-th fault Y: res=-1/12
+	12-th fault Y: res=-1/12
+	13-th fault Y: res=-1/12
+	14-th fault Y: res=-1/12
+	15-th fault Y: res=-1/12
+	16-th fault N: res=0/12
diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt
deleted file mode 100644
index a17517a083c3..000000000000
--- a/Documentation/fault-injection/fault-injection.txt
+++ /dev/null
@@ -1,435 +0,0 @@
-Fault injection capabilities infrastructure
-===========================================
-
-See also drivers/md/md-faulty.c and "every_nth" module option for scsi_debug.
-
-
-Available fault injection capabilities
---------------------------------------
-
-o failslab
-
-  injects slab allocation failures. (kmalloc(), kmem_cache_alloc(), ...)
-
-o fail_page_alloc
-
-  injects page allocation failures. (alloc_pages(), get_free_pages(), ...)
-
-o fail_futex
-
-  injects futex deadlock and uaddr fault errors.
-
-o fail_make_request
-
-  injects disk IO errors on devices permitted by setting
-  /sys/block/<device>/make-it-fail or
-  /sys/block/<device>/<partition>/make-it-fail. (generic_make_request())
-
-o fail_mmc_request
-
-  injects MMC data errors on devices permitted by setting
-  debugfs entries under /sys/kernel/debug/mmc0/fail_mmc_request
-
-o fail_function
-
-  injects error return on specific functions, which are marked by
-  ALLOW_ERROR_INJECTION() macro, by setting debugfs entries
-  under /sys/kernel/debug/fail_function. No boot option supported.
-
-o NVMe fault injection
-
-  inject NVMe status code and retry flag on devices permitted by setting
-  debugfs entries under /sys/kernel/debug/nvme*/fault_inject. The default
-  status code is NVME_SC_INVALID_OPCODE with no retry. The status code and
-  retry flag can be set via the debugfs.
-
-
-Configure fault-injection capabilities behavior
------------------------------------------------
-
-o debugfs entries
-
-fault-inject-debugfs kernel module provides some debugfs entries for runtime
-configuration of fault-injection capabilities.
-
-- /sys/kernel/debug/fail*/probability:
-
-	likelihood of failure injection, in percent.
-	Format: <percent>
-
-	Note that one-failure-per-hundred is a very high error rate
-	for some testcases.  Consider setting probability=100 and configure
-	/sys/kernel/debug/fail*/interval for such testcases.
-
-- /sys/kernel/debug/fail*/interval:
-
-	specifies the interval between failures, for calls to
-	should_fail() that pass all the other tests.
-
-	Note that if you enable this, by setting interval>1, you will
-	probably want to set probability=100.
-
-- /sys/kernel/debug/fail*/times:
-
-	specifies how many times failures may happen at most.
-	A value of -1 means "no limit".
-
-- /sys/kernel/debug/fail*/space:
-
-	specifies an initial resource "budget", decremented by "size"
-	on each call to should_fail(,size).  Failure injection is
-	suppressed until "space" reaches zero.
-
-- /sys/kernel/debug/fail*/verbose
-
-	Format: { 0 | 1 | 2 }
-	specifies the verbosity of the messages when failure is
-	injected.  '0' means no messages; '1' will print only a single
-	log line per failure; '2' will print a call trace too -- useful
-	to debug the problems revealed by fault injection.
-
-- /sys/kernel/debug/fail*/task-filter:
-
-	Format: { 'Y' | 'N' }
-	A value of 'N' disables filtering by process (default).
-	Any positive value limits failures to only processes indicated by
-	/proc/<pid>/make-it-fail==1.
-
-- /sys/kernel/debug/fail*/require-start:
-- /sys/kernel/debug/fail*/require-end:
-- /sys/kernel/debug/fail*/reject-start:
-- /sys/kernel/debug/fail*/reject-end:
-
-	specifies the range of virtual addresses tested during
-	stacktrace walking.  Failure is injected only if some caller
-	in the walked stacktrace lies within the required range, and
-	none lies within the rejected range.
-	Default required range is [0,ULONG_MAX) (whole of virtual address space).
-	Default rejected range is [0,0).
-
-- /sys/kernel/debug/fail*/stacktrace-depth:
-
-	specifies the maximum stacktrace depth walked during search
-	for a caller within [require-start,require-end) OR
-	[reject-start,reject-end).
-
-- /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem:
-
-	Format: { 'Y' | 'N' }
-	default is 'N', setting it to 'Y' won't inject failures into
-	highmem/user allocations.
-
-- /sys/kernel/debug/failslab/ignore-gfp-wait:
-- /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait:
-
-	Format: { 'Y' | 'N' }
-	default is 'N', setting it to 'Y' will inject failures
-	only into non-sleep allocations (GFP_ATOMIC allocations).
-
-- /sys/kernel/debug/fail_page_alloc/min-order:
-
-	specifies the minimum page allocation order to be injected
-	failures.
-
-- /sys/kernel/debug/fail_futex/ignore-private:
-
-	Format: { 'Y' | 'N' }
-	default is 'N', setting it to 'Y' will disable failure injections
-	when dealing with private (address space) futexes.
-
-- /sys/kernel/debug/fail_function/inject:
-
-	Format: { 'function-name' | '!function-name' | '' }
-	specifies the target function of error injection by name.
-	If the function name leads '!' prefix, given function is
-	removed from injection list. If nothing specified ('')
-	injection list is cleared.
-
-- /sys/kernel/debug/fail_function/injectable:
-
-	(read only) shows error injectable functions and what type of
-	error values can be specified. The error type will be one of
-	below;
-	- NULL:	retval must be 0.
-	- ERRNO: retval must be -1 to -MAX_ERRNO (-4096).
-	- ERR_NULL: retval must be 0 or -1 to -MAX_ERRNO (-4096).
-
-- /sys/kernel/debug/fail_function/<functiuon-name>/retval:
-
-	specifies the "error" return value to inject to the given
-	function for given function. This will be created when
-	user specifies new injection entry.
-
-o Boot option
-
-In order to inject faults while debugfs is not available (early boot time),
-use the boot option:
-
-	failslab=
-	fail_page_alloc=
-	fail_make_request=
-	fail_futex=
-	mmc_core.fail_request=<interval>,<probability>,<space>,<times>
-
-o proc entries
-
-- /proc/<pid>/fail-nth:
-- /proc/self/task/<tid>/fail-nth:
-
-	Write to this file of integer N makes N-th call in the task fail.
-	Read from this file returns a integer value. A value of '0' indicates
-	that the fault setup with a previous write to this file was injected.
-	A positive integer N indicates that the fault wasn't yet injected.
-	Note that this file enables all types of faults (slab, futex, etc).
-	This setting takes precedence over all other generic debugfs settings
-	like probability, interval, times, etc. But per-capability settings
-	(e.g. fail_futex/ignore-private) take precedence over it.
-
-	This feature is intended for systematic testing of faults in a single
-	system call. See an example below.
-
-How to add new fault injection capability
------------------------------------------
-
-o #include <linux/fault-inject.h>
-
-o define the fault attributes
-
-  DECLARE_FAULT_ATTR(name);
-
-  Please see the definition of struct fault_attr in fault-inject.h
-  for details.
-
-o provide a way to configure fault attributes
-
-- boot option
-
-  If you need to enable the fault injection capability from boot time, you can
-  provide boot option to configure it. There is a helper function for it:
-
-	setup_fault_attr(attr, str);
-
-- debugfs entries
-
-  failslab, fail_page_alloc, and fail_make_request use this way.
-  Helper functions:
-
-	fault_create_debugfs_attr(name, parent, attr);
-
-- module parameters
-
-  If the scope of the fault injection capability is limited to a
-  single kernel module, it is better to provide module parameters to
-  configure the fault attributes.
-
-o add a hook to insert failures
-
-  Upon should_fail() returning true, client code should inject a failure.
-
-	should_fail(attr, size);
-
-Application Examples
---------------------
-
-o Inject slab allocation failures into module init/exit code
-
-#!/bin/bash
-
-FAILTYPE=failslab
-echo Y > /sys/kernel/debug/$FAILTYPE/task-filter
-echo 10 > /sys/kernel/debug/$FAILTYPE/probability
-echo 100 > /sys/kernel/debug/$FAILTYPE/interval
-echo -1 > /sys/kernel/debug/$FAILTYPE/times
-echo 0 > /sys/kernel/debug/$FAILTYPE/space
-echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
-echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
-
-faulty_system()
-{
-	bash -c "echo 1 > /proc/self/make-it-fail && exec $*"
-}
-
-if [ $# -eq 0 ]
-then
-	echo "Usage: $0 modulename [ modulename ... ]"
-	exit 1
-fi
-
-for m in $*
-do
-	echo inserting $m...
-	faulty_system modprobe $m
-
-	echo removing $m...
-	faulty_system modprobe -r $m
-done
-
-------------------------------------------------------------------------------
-
-o Inject page allocation failures only for a specific module
-
-#!/bin/bash
-
-FAILTYPE=fail_page_alloc
-module=$1
-
-if [ -z $module ]
-then
-	echo "Usage: $0 <modulename>"
-	exit 1
-fi
-
-modprobe $module
-
-if [ ! -d /sys/module/$module/sections ]
-then
-	echo Module $module is not loaded
-	exit 1
-fi
-
-cat /sys/module/$module/sections/.text > /sys/kernel/debug/$FAILTYPE/require-start
-cat /sys/module/$module/sections/.data > /sys/kernel/debug/$FAILTYPE/require-end
-
-echo N > /sys/kernel/debug/$FAILTYPE/task-filter
-echo 10 > /sys/kernel/debug/$FAILTYPE/probability
-echo 100 > /sys/kernel/debug/$FAILTYPE/interval
-echo -1 > /sys/kernel/debug/$FAILTYPE/times
-echo 0 > /sys/kernel/debug/$FAILTYPE/space
-echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
-echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
-echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem
-echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth
-
-trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT
-
-echo "Injecting errors into the module $module... (interrupt to stop)"
-sleep 1000000
-
-------------------------------------------------------------------------------
-
-o Inject open_ctree error while btrfs mount
-
-#!/bin/bash
-
-rm -f testfile.img
-dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
-DEVICE=$(losetup --show -f testfile.img)
-mkfs.btrfs -f $DEVICE
-mkdir -p tmpmnt
-
-FAILTYPE=fail_function
-FAILFUNC=open_ctree
-echo $FAILFUNC > /sys/kernel/debug/$FAILTYPE/inject
-echo -12 > /sys/kernel/debug/$FAILTYPE/$FAILFUNC/retval
-echo N > /sys/kernel/debug/$FAILTYPE/task-filter
-echo 100 > /sys/kernel/debug/$FAILTYPE/probability
-echo 0 > /sys/kernel/debug/$FAILTYPE/interval
-echo -1 > /sys/kernel/debug/$FAILTYPE/times
-echo 0 > /sys/kernel/debug/$FAILTYPE/space
-echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
-
-mount -t btrfs $DEVICE tmpmnt
-if [ $? -ne 0 ]
-then
-	echo "SUCCESS!"
-else
-	echo "FAILED!"
-	umount tmpmnt
-fi
-
-echo > /sys/kernel/debug/$FAILTYPE/inject
-
-rmdir tmpmnt
-losetup -d $DEVICE
-rm testfile.img
-
-
-Tool to run command with failslab or fail_page_alloc
-----------------------------------------------------
-In order to make it easier to accomplish the tasks mentioned above, we can use
-tools/testing/fault-injection/failcmd.sh.  Please run a command
-"./tools/testing/fault-injection/failcmd.sh --help" for more information and
-see the following examples.
-
-Examples:
-
-Run a command "make -C tools/testing/selftests/ run_tests" with injecting slab
-allocation failure.
-
-	# ./tools/testing/fault-injection/failcmd.sh \
-		-- make -C tools/testing/selftests/ run_tests
-
-Same as above except to specify 100 times failures at most instead of one time
-at most by default.
-
-	# ./tools/testing/fault-injection/failcmd.sh --times=100 \
-		-- make -C tools/testing/selftests/ run_tests
-
-Same as above except to inject page allocation failure instead of slab
-allocation failure.
-
-	# env FAILCMD_TYPE=fail_page_alloc \
-		./tools/testing/fault-injection/failcmd.sh --times=100 \
-                -- make -C tools/testing/selftests/ run_tests
-
-Systematic faults using fail-nth
----------------------------------
-
-The following code systematically faults 0-th, 1-st, 2-nd and so on
-capabilities in the socketpair() system call.
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/syscall.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <string.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <errno.h>
-
-int main()
-{
-	int i, err, res, fail_nth, fds[2];
-	char buf[128];
-
-	system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait");
-	sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid));
-	fail_nth = open(buf, O_RDWR);
-	for (i = 1;; i++) {
-		sprintf(buf, "%d", i);
-		write(fail_nth, buf, strlen(buf));
-		res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds);
-		err = errno;
-		pread(fail_nth, buf, sizeof(buf), 0);
-		if (res == 0) {
-			close(fds[0]);
-			close(fds[1]);
-		}
-		printf("%d-th fault %c: res=%d/%d\n", i, atoi(buf) ? 'N' : 'Y',
-			res, err);
-		if (atoi(buf))
-			break;
-	}
-	return 0;
-}
-
-An example output:
-
-1-th fault Y: res=-1/23
-2-th fault Y: res=-1/23
-3-th fault Y: res=-1/12
-4-th fault Y: res=-1/12
-5-th fault Y: res=-1/23
-6-th fault Y: res=-1/23
-7-th fault Y: res=-1/23
-8-th fault Y: res=-1/12
-9-th fault Y: res=-1/12
-10-th fault Y: res=-1/12
-11-th fault Y: res=-1/12
-12-th fault Y: res=-1/12
-13-th fault Y: res=-1/12
-14-th fault Y: res=-1/12
-15-th fault Y: res=-1/12
-16-th fault N: res=0/12
diff --git a/Documentation/fault-injection/index.rst b/Documentation/fault-injection/index.rst
new file mode 100644
index 000000000000..8408a8a91b34
--- /dev/null
+++ b/Documentation/fault-injection/index.rst
@@ -0,0 +1,20 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+fault-injection
+===============
+
+.. toctree::
+    :maxdepth: 1
+
+    fault-injection
+    notifier-error-inject
+    nvme-fault-injection
+    provoke-crashes
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/fault-injection/notifier-error-inject.rst b/Documentation/fault-injection/notifier-error-inject.rst
new file mode 100644
index 000000000000..1668b6e48d3a
--- /dev/null
+++ b/Documentation/fault-injection/notifier-error-inject.rst
@@ -0,0 +1,98 @@
+Notifier error injection
+========================
+
+Notifier error injection provides the ability to inject artificial errors to
+specified notifier chain callbacks. It is useful to test the error handling of
+notifier call chain failures which is rarely executed.  There are kernel
+modules that can be used to test the following notifiers.
+
+ * PM notifier
+ * Memory hotplug notifier
+ * powerpc pSeries reconfig notifier
+ * Netdevice notifier
+
+PM notifier error injection module
+----------------------------------
+This feature is controlled through debugfs interface
+
+  /sys/kernel/debug/notifier-error-inject/pm/actions/<notifier event>/error
+
+Possible PM notifier events to be failed are:
+
+ * PM_HIBERNATION_PREPARE
+ * PM_SUSPEND_PREPARE
+ * PM_RESTORE_PREPARE
+
+Example: Inject PM suspend error (-12 = -ENOMEM)::
+
+	# cd /sys/kernel/debug/notifier-error-inject/pm/
+	# echo -12 > actions/PM_SUSPEND_PREPARE/error
+	# echo mem > /sys/power/state
+	bash: echo: write error: Cannot allocate memory
+
+Memory hotplug notifier error injection module
+----------------------------------------------
+This feature is controlled through debugfs interface
+
+  /sys/kernel/debug/notifier-error-inject/memory/actions/<notifier event>/error
+
+Possible memory notifier events to be failed are:
+
+ * MEM_GOING_ONLINE
+ * MEM_GOING_OFFLINE
+
+Example: Inject memory hotplug offline error (-12 == -ENOMEM)::
+
+	# cd /sys/kernel/debug/notifier-error-inject/memory
+	# echo -12 > actions/MEM_GOING_OFFLINE/error
+	# echo offline > /sys/devices/system/memory/memoryXXX/state
+	bash: echo: write error: Cannot allocate memory
+
+powerpc pSeries reconfig notifier error injection module
+--------------------------------------------------------
+This feature is controlled through debugfs interface
+
+  /sys/kernel/debug/notifier-error-inject/pSeries-reconfig/actions/<notifier event>/error
+
+Possible pSeries reconfig notifier events to be failed are:
+
+ * PSERIES_RECONFIG_ADD
+ * PSERIES_RECONFIG_REMOVE
+ * PSERIES_DRCONF_MEM_ADD
+ * PSERIES_DRCONF_MEM_REMOVE
+
+Netdevice notifier error injection module
+----------------------------------------------
+This feature is controlled through debugfs interface
+
+  /sys/kernel/debug/notifier-error-inject/netdev/actions/<notifier event>/error
+
+Netdevice notifier events which can be failed are:
+
+ * NETDEV_REGISTER
+ * NETDEV_CHANGEMTU
+ * NETDEV_CHANGENAME
+ * NETDEV_PRE_UP
+ * NETDEV_PRE_TYPE_CHANGE
+ * NETDEV_POST_INIT
+ * NETDEV_PRECHANGEMTU
+ * NETDEV_PRECHANGEUPPER
+ * NETDEV_CHANGEUPPER
+
+Example: Inject netdevice mtu change error (-22 == -EINVAL)::
+
+	# cd /sys/kernel/debug/notifier-error-inject/netdev
+	# echo -22 > actions/NETDEV_CHANGEMTU/error
+	# ip link set eth0 mtu 1024
+	RTNETLINK answers: Invalid argument
+
+For more usage examples
+-----------------------
+There are tools/testing/selftests using the notifier error injection features
+for CPU and memory notifiers.
+
+ * tools/testing/selftests/cpu-hotplug/on-off-test.sh
+ * tools/testing/selftests/memory-hotplug/on-off-test.sh
+
+These scripts first do simple online and offline tests and then do fault
+injection tests if notifier error injection module is available.
diff --git a/Documentation/fault-injection/notifier-error-inject.txt b/Documentation/fault-injection/notifier-error-inject.txt
deleted file mode 100644
index e861d761de24..000000000000
--- a/Documentation/fault-injection/notifier-error-inject.txt
+++ /dev/null
@@ -1,94 +0,0 @@
-Notifier error injection
-========================
-
-Notifier error injection provides the ability to inject artificial errors to
-specified notifier chain callbacks. It is useful to test the error handling of
-notifier call chain failures which is rarely executed.  There are kernel
-modules that can be used to test the following notifiers.
-
- * PM notifier
- * Memory hotplug notifier
- * powerpc pSeries reconfig notifier
- * Netdevice notifier
-
-PM notifier error injection module
-----------------------------------
-This feature is controlled through debugfs interface
-/sys/kernel/debug/notifier-error-inject/pm/actions/<notifier event>/error
-
-Possible PM notifier events to be failed are:
-
- * PM_HIBERNATION_PREPARE
- * PM_SUSPEND_PREPARE
- * PM_RESTORE_PREPARE
-
-Example: Inject PM suspend error (-12 = -ENOMEM)
-
-	# cd /sys/kernel/debug/notifier-error-inject/pm/
-	# echo -12 > actions/PM_SUSPEND_PREPARE/error
-	# echo mem > /sys/power/state
-	bash: echo: write error: Cannot allocate memory
-
-Memory hotplug notifier error injection module
-----------------------------------------------
-This feature is controlled through debugfs interface
-/sys/kernel/debug/notifier-error-inject/memory/actions/<notifier event>/error
-
-Possible memory notifier events to be failed are:
-
- * MEM_GOING_ONLINE
- * MEM_GOING_OFFLINE
-
-Example: Inject memory hotplug offline error (-12 == -ENOMEM)
-
-	# cd /sys/kernel/debug/notifier-error-inject/memory
-	# echo -12 > actions/MEM_GOING_OFFLINE/error
-	# echo offline > /sys/devices/system/memory/memoryXXX/state
-	bash: echo: write error: Cannot allocate memory
-
-powerpc pSeries reconfig notifier error injection module
---------------------------------------------------------
-This feature is controlled through debugfs interface
-/sys/kernel/debug/notifier-error-inject/pSeries-reconfig/actions/<notifier event>/error
-
-Possible pSeries reconfig notifier events to be failed are:
-
- * PSERIES_RECONFIG_ADD
- * PSERIES_RECONFIG_REMOVE
- * PSERIES_DRCONF_MEM_ADD
- * PSERIES_DRCONF_MEM_REMOVE
-
-Netdevice notifier error injection module
-----------------------------------------------
-This feature is controlled through debugfs interface
-/sys/kernel/debug/notifier-error-inject/netdev/actions/<notifier event>/error
-
-Netdevice notifier events which can be failed are:
-
- * NETDEV_REGISTER
- * NETDEV_CHANGEMTU
- * NETDEV_CHANGENAME
- * NETDEV_PRE_UP
- * NETDEV_PRE_TYPE_CHANGE
- * NETDEV_POST_INIT
- * NETDEV_PRECHANGEMTU
- * NETDEV_PRECHANGEUPPER
- * NETDEV_CHANGEUPPER
-
-Example: Inject netdevice mtu change error (-22 == -EINVAL)
-
-	# cd /sys/kernel/debug/notifier-error-inject/netdev
-	# echo -22 > actions/NETDEV_CHANGEMTU/error
-	# ip link set eth0 mtu 1024
-	RTNETLINK answers: Invalid argument
-
-For more usage examples
------------------------
-There are tools/testing/selftests using the notifier error injection features
-for CPU and memory notifiers.
-
- * tools/testing/selftests/cpu-hotplug/on-off-test.sh
- * tools/testing/selftests/memory-hotplug/on-off-test.sh
-
-These scripts first do simple online and offline tests and then do fault
-injection tests if notifier error injection module is available.
diff --git a/Documentation/fault-injection/nvme-fault-injection.rst b/Documentation/fault-injection/nvme-fault-injection.rst
new file mode 100644
index 000000000000..cdb2e829228e
--- /dev/null
+++ b/Documentation/fault-injection/nvme-fault-injection.rst
@@ -0,0 +1,178 @@
+NVMe Fault Injection
+====================
+Linux's fault injection framework provides a systematic way to support
+error injection via debugfs in the /sys/kernel/debug directory. When
+enabled, the default NVME_SC_INVALID_OPCODE with no retry will be
+injected into the nvme_end_request. Users can change the default status
+code and no retry flag via the debugfs. The list of Generic Command
+Status can be found in include/linux/nvme.h
+
+Following examples show how to inject an error into the nvme.
+
+First, enable CONFIG_FAULT_INJECTION_DEBUG_FS kernel config,
+recompile the kernel. After booting up the kernel, do the
+following.
+
+Example 1: Inject default status code with no retry
+---------------------------------------------------
+
+::
+
+  mount /dev/nvme0n1 /mnt
+  echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times
+  echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability
+  cp a.file /mnt
+
+Expected Result::
+
+  cp: cannot stat ‘/mnt/a.file’: Input/output error
+
+Message from dmesg::
+
+  FAULT_INJECTION: forcing a failure.
+  name fault_inject, interval 1, probability 100, space 0, times 1
+  CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc8+ #2
+  Hardware name: innotek GmbH VirtualBox/VirtualBox,
+  BIOS VirtualBox 12/01/2006
+  Call Trace:
+    <IRQ>
+    dump_stack+0x5c/0x7d
+    should_fail+0x148/0x170
+    nvme_should_fail+0x2f/0x50 [nvme_core]
+    nvme_process_cq+0xe7/0x1d0 [nvme]
+    nvme_irq+0x1e/0x40 [nvme]
+    __handle_irq_event_percpu+0x3a/0x190
+    handle_irq_event_percpu+0x30/0x70
+    handle_irq_event+0x36/0x60
+    handle_fasteoi_irq+0x78/0x120
+    handle_irq+0xa7/0x130
+    ? tick_irq_enter+0xa8/0xc0
+    do_IRQ+0x43/0xc0
+    common_interrupt+0xa2/0xa2
+    </IRQ>
+  RIP: 0010:native_safe_halt+0x2/0x10
+  RSP: 0018:ffffffff82003e90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdd
+  RAX: ffffffff817a10c0 RBX: ffffffff82012480 RCX: 0000000000000000
+  RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
+  RBP: 0000000000000000 R08: 000000008e38ce64 R09: 0000000000000000
+  R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82012480
+  R13: ffffffff82012480 R14: 0000000000000000 R15: 0000000000000000
+    ? __sched_text_end+0x4/0x4
+    default_idle+0x18/0xf0
+    do_idle+0x150/0x1d0
+    cpu_startup_entry+0x6f/0x80
+    start_kernel+0x4c4/0x4e4
+    ? set_init_arg+0x55/0x55
+    secondary_startup_64+0xa5/0xb0
+    print_req_error: I/O error, dev nvme0n1, sector 9240
+  EXT4-fs error (device nvme0n1): ext4_find_entry:1436:
+  inode #2: comm cp: reading directory lblock 0
+
+Example 2: Inject default status code with retry
+------------------------------------------------
+
+::
+
+  mount /dev/nvme0n1 /mnt
+  echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times
+  echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability
+  echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/status
+  echo 0 > /sys/kernel/debug/nvme0n1/fault_inject/dont_retry
+
+  cp a.file /mnt
+
+Expected Result::
+
+  command success without error
+
+Message from dmesg::
+
+  FAULT_INJECTION: forcing a failure.
+  name fault_inject, interval 1, probability 100, space 0, times 1
+  CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc8+ #4
+  Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
+  Call Trace:
+    <IRQ>
+    dump_stack+0x5c/0x7d
+    should_fail+0x148/0x170
+    nvme_should_fail+0x30/0x60 [nvme_core]
+    nvme_loop_queue_response+0x84/0x110 [nvme_loop]
+    nvmet_req_complete+0x11/0x40 [nvmet]
+    nvmet_bio_done+0x28/0x40 [nvmet]
+    blk_update_request+0xb0/0x310
+    blk_mq_end_request+0x18/0x60
+    flush_smp_call_function_queue+0x3d/0xf0
+    smp_call_function_single_interrupt+0x2c/0xc0
+    call_function_single_interrupt+0xa2/0xb0
+    </IRQ>
+  RIP: 0010:native_safe_halt+0x2/0x10
+  RSP: 0018:ffffc9000068bec0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff04
+  RAX: ffffffff817a10c0 RBX: ffff88011a3c9680 RCX: 0000000000000000
+  RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
+  RBP: 0000000000000001 R08: 000000008e38c131 R09: 0000000000000000
+  R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011a3c9680
+  R13: ffff88011a3c9680 R14: 0000000000000000 R15: 0000000000000000
+    ? __sched_text_end+0x4/0x4
+    default_idle+0x18/0xf0
+    do_idle+0x150/0x1d0
+    cpu_startup_entry+0x6f/0x80
+    start_secondary+0x187/0x1e0
+    secondary_startup_64+0xa5/0xb0
+
+Example 3: Inject an error into the 10th admin command
+------------------------------------------------------
+
+::
+
+  echo 100 > /sys/kernel/debug/nvme0/fault_inject/probability
+  echo 10 > /sys/kernel/debug/nvme0/fault_inject/space
+  echo 1 > /sys/kernel/debug/nvme0/fault_inject/times
+  nvme reset /dev/nvme0
+
+Expected Result::
+
+  After NVMe controller reset, the reinitialization may or may not succeed.
+  It depends on which admin command is actually forced to fail.
+
+Message from dmesg::
+
+  nvme nvme0: resetting controller
+  FAULT_INJECTION: forcing a failure.
+  name fault_inject, interval 1, probability 100, space 1, times 1
+  CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.2.0-rc2+ #2
+  Hardware name: MSI MS-7A45/B150M MORTAR ARCTIC (MS-7A45), BIOS 1.50 04/25/2017
+  Call Trace:
+   <IRQ>
+   dump_stack+0x63/0x85
+   should_fail+0x14a/0x170
+   nvme_should_fail+0x38/0x80 [nvme_core]
+   nvme_irq+0x129/0x280 [nvme]
+   ? blk_mq_end_request+0xb3/0x120
+   __handle_irq_event_percpu+0x84/0x1a0
+   handle_irq_event_percpu+0x32/0x80
+   handle_irq_event+0x3b/0x60
+   handle_edge_irq+0x7f/0x1a0
+   handle_irq+0x20/0x30
+   do_IRQ+0x4e/0xe0
+   common_interrupt+0xf/0xf
+   </IRQ>
+  RIP: 0010:cpuidle_enter_state+0xc5/0x460
+  Code: ff e8 8f 5f 86 ff 80 7d c7 00 74 17 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 69 03 00 00 31 ff e8 62 aa 8c ff fb 66 0f 1f 44 00 00 <45> 85 ed 0f 88 37 03 00 00 4c 8b 45 d0 4c 2b 45 b8 48 ba cf f7 53
+  RSP: 0018:ffffffff88c03dd0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdc
+  RAX: ffff9dac25a2ac80 RBX: ffffffff88d53760 RCX: 000000000000001f
+  RDX: 0000000000000000 RSI: 000000002d958403 RDI: 0000000000000000
+  RBP: ffffffff88c03e18 R08: fffffff75e35ffb7 R09: 00000a49a56c0b48
+  R10: ffffffff88c03da0 R11: 0000000000001b0c R12: ffff9dac25a34d00
+  R13: 0000000000000006 R14: 0000000000000006 R15: ffffffff88d53760
+   cpuidle_enter+0x2e/0x40
+   call_cpuidle+0x23/0x40
+   do_idle+0x201/0x280
+   cpu_startup_entry+0x1d/0x20
+   rest_init+0xaa/0xb0
+   arch_call_rest_init+0xe/0x1b
+   start_kernel+0x51c/0x53b
+   x86_64_start_reservations+0x24/0x26
+   x86_64_start_kernel+0x74/0x77
+   secondary_startup_64+0xa4/0xb0
+  nvme nvme0: Could not set queue count (16385)
+  nvme nvme0: IO queues not created
diff --git a/Documentation/fault-injection/nvme-fault-injection.txt b/Documentation/fault-injection/nvme-fault-injection.txt
deleted file mode 100644
index 8fbf3bf60b62..000000000000
--- a/Documentation/fault-injection/nvme-fault-injection.txt
+++ /dev/null
@@ -1,116 +0,0 @@
-NVMe Fault Injection
-====================
-Linux's fault injection framework provides a systematic way to support
-error injection via debugfs in the /sys/kernel/debug directory. When
-enabled, the default NVME_SC_INVALID_OPCODE with no retry will be
-injected into the nvme_end_request. Users can change the default status
-code and no retry flag via the debugfs. The list of Generic Command
-Status can be found in include/linux/nvme.h
-
-Following examples show how to inject an error into the nvme.
-
-First, enable CONFIG_FAULT_INJECTION_DEBUG_FS kernel config,
-recompile the kernel. After booting up the kernel, do the
-following.
-
-Example 1: Inject default status code with no retry
----------------------------------------------------
-
-mount /dev/nvme0n1 /mnt
-echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times
-echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability
-cp a.file /mnt
-
-Expected Result:
-
-cp: cannot stat ‘/mnt/a.file’: Input/output error
-
-Message from dmesg:
-
-FAULT_INJECTION: forcing a failure.
-name fault_inject, interval 1, probability 100, space 0, times 1
-CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc8+ #2
-Hardware name: innotek GmbH VirtualBox/VirtualBox,
-BIOS VirtualBox 12/01/2006
-Call Trace:
-  <IRQ>
-  dump_stack+0x5c/0x7d
-  should_fail+0x148/0x170
-  nvme_should_fail+0x2f/0x50 [nvme_core]
-  nvme_process_cq+0xe7/0x1d0 [nvme]
-  nvme_irq+0x1e/0x40 [nvme]
-  __handle_irq_event_percpu+0x3a/0x190
-  handle_irq_event_percpu+0x30/0x70
-  handle_irq_event+0x36/0x60
-  handle_fasteoi_irq+0x78/0x120
-  handle_irq+0xa7/0x130
-  ? tick_irq_enter+0xa8/0xc0
-  do_IRQ+0x43/0xc0
-  common_interrupt+0xa2/0xa2
-  </IRQ>
-RIP: 0010:native_safe_halt+0x2/0x10
-RSP: 0018:ffffffff82003e90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdd
-RAX: ffffffff817a10c0 RBX: ffffffff82012480 RCX: 0000000000000000
-RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
-RBP: 0000000000000000 R08: 000000008e38ce64 R09: 0000000000000000
-R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82012480
-R13: ffffffff82012480 R14: 0000000000000000 R15: 0000000000000000
-  ? __sched_text_end+0x4/0x4
-  default_idle+0x18/0xf0
-  do_idle+0x150/0x1d0
-  cpu_startup_entry+0x6f/0x80
-  start_kernel+0x4c4/0x4e4
-  ? set_init_arg+0x55/0x55
-  secondary_startup_64+0xa5/0xb0
-  print_req_error: I/O error, dev nvme0n1, sector 9240
-EXT4-fs error (device nvme0n1): ext4_find_entry:1436:
-inode #2: comm cp: reading directory lblock 0
-
-Example 2: Inject default status code with retry
-------------------------------------------------
-
-mount /dev/nvme0n1 /mnt
-echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times
-echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability
-echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/status
-echo 0 > /sys/kernel/debug/nvme0n1/fault_inject/dont_retry
-
-cp a.file /mnt
-
-Expected Result:
-
-command success without error
-
-Message from dmesg:
-
-FAULT_INJECTION: forcing a failure.
-name fault_inject, interval 1, probability 100, space 0, times 1
-CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc8+ #4
-Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
-Call Trace:
-  <IRQ>
-  dump_stack+0x5c/0x7d
-  should_fail+0x148/0x170
-  nvme_should_fail+0x30/0x60 [nvme_core]
-  nvme_loop_queue_response+0x84/0x110 [nvme_loop]
-  nvmet_req_complete+0x11/0x40 [nvmet]
-  nvmet_bio_done+0x28/0x40 [nvmet]
-  blk_update_request+0xb0/0x310
-  blk_mq_end_request+0x18/0x60
-  flush_smp_call_function_queue+0x3d/0xf0
-  smp_call_function_single_interrupt+0x2c/0xc0
-  call_function_single_interrupt+0xa2/0xb0
-  </IRQ>
-RIP: 0010:native_safe_halt+0x2/0x10
-RSP: 0018:ffffc9000068bec0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff04
-RAX: ffffffff817a10c0 RBX: ffff88011a3c9680 RCX: 0000000000000000
-RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
-RBP: 0000000000000001 R08: 000000008e38c131 R09: 0000000000000000
-R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011a3c9680
-R13: ffff88011a3c9680 R14: 0000000000000000 R15: 0000000000000000
-  ? __sched_text_end+0x4/0x4
-  default_idle+0x18/0xf0
-  do_idle+0x150/0x1d0
-  cpu_startup_entry+0x6f/0x80
-  start_secondary+0x187/0x1e0
-  secondary_startup_64+0xa5/0xb0
diff --git a/Documentation/fault-injection/provoke-crashes.rst b/Documentation/fault-injection/provoke-crashes.rst
new file mode 100644
index 000000000000..9279a3e12278
--- /dev/null
+++ b/Documentation/fault-injection/provoke-crashes.rst
@@ -0,0 +1,48 @@
+===============
+Provoke crashes
+===============
+
+The lkdtm module provides an interface to crash or injure the kernel at
+predefined crashpoints to evaluate the reliability of crash dumps obtained
+using different dumping solutions. The module uses KPROBEs to instrument
+crashing points, but can also crash the kernel directly without KRPOBE
+support.
+
+
+You can provide the way either through module arguments when inserting
+the module, or through a debugfs interface.
+
+Usage::
+
+	insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<>
+			[cpoint_count={>0}]
+
+recur_count
+	Recursion level for the stack overflow test. Default is 10.
+
+cpoint_name
+	Crash point where the kernel is to be crashed. It can be
+	one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY,
+	FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD,
+	IDE_CORE_CP, DIRECT
+
+cpoint_type
+	Indicates the action to be taken on hitting the crash point.
+	It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW,
+	CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION,
+	WRITE_AFTER_FREE,
+
+cpoint_count
+	Indicates the number of times the crash point is to be hit
+	to trigger an action. The default is 10.
+
+You can also induce failures by mounting debugfs and writing the type to
+<mountpoint>/provoke-crash/<crashpoint>. E.g.::
+
+  mount -t debugfs debugfs /mnt
+  echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY
+
+
+A special file is `DIRECT` which will induce the crash directly without
+KPROBE instrumentation. This mode is the only one available when the module
+is built on a kernel without KPROBEs support.
diff --git a/Documentation/fault-injection/provoke-crashes.txt b/Documentation/fault-injection/provoke-crashes.txt
deleted file mode 100644
index 7a9d3d81525b..000000000000
--- a/Documentation/fault-injection/provoke-crashes.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-The lkdtm module provides an interface to crash or injure the kernel at
-predefined crashpoints to evaluate the reliability of crash dumps obtained
-using different dumping solutions. The module uses KPROBEs to instrument
-crashing points, but can also crash the kernel directly without KRPOBE
-support.
-
-
-You can provide the way either through module arguments when inserting
-the module, or through a debugfs interface.
-
-Usage: insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<>
-				[cpoint_count={>0}]
-
-  recur_count : Recursion level for the stack overflow test. Default is 10.
-
-  cpoint_name : Crash point where the kernel is to be crashed. It can be
-	 one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY,
-	 FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD,
-	 IDE_CORE_CP, DIRECT
-
-  cpoint_type : Indicates the action to be taken on hitting the crash point.
-     It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW,
-     CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION,
-     WRITE_AFTER_FREE,
-
-  cpoint_count : Indicates the number of times the crash point is to be hit
-    to trigger an action. The default is 10.
-
-You can also induce failures by mounting debugfs and writing the type to
-<mountpoint>/provoke-crash/<crashpoint>. E.g.,
-
-  mount -t debugfs debugfs /mnt
-  echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY
-
-
-A special file is `DIRECT' which will induce the crash directly without
-KPROBE instrumentation. This mode is the only one available when the module
-is built on a kernel without KPROBEs support.
diff --git a/Documentation/fb/api.rst b/Documentation/fb/api.rst
new file mode 100644
index 000000000000..79ec33dded74
--- /dev/null
+++ b/Documentation/fb/api.rst
@@ -0,0 +1,307 @@
+===========================
+The Frame Buffer Device API
+===========================
+
+Last revised: June 21, 2011
+
+
+0. Introduction
+---------------
+
+This document describes the frame buffer API used by applications to interact
+with frame buffer devices. In-kernel APIs between device drivers and the frame
+buffer core are not described.
+
+Due to a lack of documentation in the original frame buffer API, drivers
+behaviours differ in subtle (and not so subtle) ways. This document describes
+the recommended API implementation, but applications should be prepared to
+deal with different behaviours.
+
+
+1. Capabilities
+---------------
+
+Device and driver capabilities are reported in the fixed screen information
+capabilities field::
+
+  struct fb_fix_screeninfo {
+	...
+	__u16 capabilities;		/* see FB_CAP_*			*/
+	...
+  };
+
+Application should use those capabilities to find out what features they can
+expect from the device and driver.
+
+- FB_CAP_FOURCC
+
+The driver supports the four character code (FOURCC) based format setting API.
+When supported, formats are configured using a FOURCC instead of manually
+specifying color components layout.
+
+
+2. Types and visuals
+--------------------
+
+Pixels are stored in memory in hardware-dependent formats. Applications need
+to be aware of the pixel storage format in order to write image data to the
+frame buffer memory in the format expected by the hardware.
+
+Formats are described by frame buffer types and visuals. Some visuals require
+additional information, which are stored in the variable screen information
+bits_per_pixel, grayscale, red, green, blue and transp fields.
+
+Visuals describe how color information is encoded and assembled to create
+macropixels. Types describe how macropixels are stored in memory. The following
+types and visuals are supported.
+
+- FB_TYPE_PACKED_PIXELS
+
+Macropixels are stored contiguously in a single plane. If the number of bits
+per macropixel is not a multiple of 8, whether macropixels are padded to the
+next multiple of 8 bits or packed together into bytes depends on the visual.
+
+Padding at end of lines may be present and is then reported through the fixed
+screen information line_length field.
+
+- FB_TYPE_PLANES
+
+Macropixels are split across multiple planes. The number of planes is equal to
+the number of bits per macropixel, with plane i'th storing i'th bit from all
+macropixels.
+
+Planes are located contiguously in memory.
+
+- FB_TYPE_INTERLEAVED_PLANES
+
+Macropixels are split across multiple planes. The number of planes is equal to
+the number of bits per macropixel, with plane i'th storing i'th bit from all
+macropixels.
+
+Planes are interleaved in memory. The interleave factor, defined as the
+distance in bytes between the beginning of two consecutive interleaved blocks
+belonging to different planes, is stored in the fixed screen information
+type_aux field.
+
+- FB_TYPE_FOURCC
+
+Macropixels are stored in memory as described by the format FOURCC identifier
+stored in the variable screen information grayscale field.
+
+- FB_VISUAL_MONO01
+
+Pixels are black or white and stored on a number of bits (typically one)
+specified by the variable screen information bpp field.
+
+Black pixels are represented by all bits set to 1 and white pixels by all bits
+set to 0. When the number of bits per pixel is smaller than 8, several pixels
+are packed together in a byte.
+
+FB_VISUAL_MONO01 is currently used with FB_TYPE_PACKED_PIXELS only.
+
+- FB_VISUAL_MONO10
+
+Pixels are black or white and stored on a number of bits (typically one)
+specified by the variable screen information bpp field.
+
+Black pixels are represented by all bits set to 0 and white pixels by all bits
+set to 1. When the number of bits per pixel is smaller than 8, several pixels
+are packed together in a byte.
+
+FB_VISUAL_MONO01 is currently used with FB_TYPE_PACKED_PIXELS only.
+
+- FB_VISUAL_TRUECOLOR
+
+Pixels are broken into red, green and blue components, and each component
+indexes a read-only lookup table for the corresponding value. Lookup tables
+are device-dependent, and provide linear or non-linear ramps.
+
+Each component is stored in a macropixel according to the variable screen
+information red, green, blue and transp fields.
+
+- FB_VISUAL_PSEUDOCOLOR and FB_VISUAL_STATIC_PSEUDOCOLOR
+
+Pixel values are encoded as indices into a colormap that stores red, green and
+blue components. The colormap is read-only for FB_VISUAL_STATIC_PSEUDOCOLOR
+and read-write for FB_VISUAL_PSEUDOCOLOR.
+
+Each pixel value is stored in the number of bits reported by the variable
+screen information bits_per_pixel field.
+
+- FB_VISUAL_DIRECTCOLOR
+
+Pixels are broken into red, green and blue components, and each component
+indexes a programmable lookup table for the corresponding value.
+
+Each component is stored in a macropixel according to the variable screen
+information red, green, blue and transp fields.
+
+- FB_VISUAL_FOURCC
+
+Pixels are encoded and  interpreted as described by the format FOURCC
+identifier stored in the variable screen information grayscale field.
+
+
+3. Screen information
+---------------------
+
+Screen information are queried by applications using the FBIOGET_FSCREENINFO
+and FBIOGET_VSCREENINFO ioctls. Those ioctls take a pointer to a
+fb_fix_screeninfo and fb_var_screeninfo structure respectively.
+
+struct fb_fix_screeninfo stores device independent unchangeable information
+about the frame buffer device and the current format. Those information can't
+be directly modified by applications, but can be changed by the driver when an
+application modifies the format::
+
+  struct fb_fix_screeninfo {
+	char id[16];			/* identification string eg "TT Builtin" */
+	unsigned long smem_start;	/* Start of frame buffer mem */
+					/* (physical address) */
+	__u32 smem_len;			/* Length of frame buffer mem */
+	__u32 type;			/* see FB_TYPE_*		*/
+	__u32 type_aux;			/* Interleave for interleaved Planes */
+	__u32 visual;			/* see FB_VISUAL_*		*/
+	__u16 xpanstep;			/* zero if no hardware panning  */
+	__u16 ypanstep;			/* zero if no hardware panning  */
+	__u16 ywrapstep;		/* zero if no hardware ywrap    */
+	__u32 line_length;		/* length of a line in bytes    */
+	unsigned long mmio_start;	/* Start of Memory Mapped I/O   */
+					/* (physical address) */
+	__u32 mmio_len;			/* Length of Memory Mapped I/O  */
+	__u32 accel;			/* Indicate to driver which	*/
+					/*  specific chip/card we have	*/
+	__u16 capabilities;		/* see FB_CAP_*			*/
+	__u16 reserved[2];		/* Reserved for future compatibility */
+  };
+
+struct fb_var_screeninfo stores device independent changeable information
+about a frame buffer device, its current format and video mode, as well as
+other miscellaneous parameters::
+
+  struct fb_var_screeninfo {
+	__u32 xres;			/* visible resolution		*/
+	__u32 yres;
+	__u32 xres_virtual;		/* virtual resolution		*/
+	__u32 yres_virtual;
+	__u32 xoffset;			/* offset from virtual to visible */
+	__u32 yoffset;			/* resolution			*/
+
+	__u32 bits_per_pixel;		/* guess what			*/
+	__u32 grayscale;		/* 0 = color, 1 = grayscale,	*/
+					/* >1 = FOURCC			*/
+	struct fb_bitfield red;		/* bitfield in fb mem if true color, */
+	struct fb_bitfield green;	/* else only length is significant */
+	struct fb_bitfield blue;
+	struct fb_bitfield transp;	/* transparency			*/
+
+	__u32 nonstd;			/* != 0 Non standard pixel format */
+
+	__u32 activate;			/* see FB_ACTIVATE_*		*/
+
+	__u32 height;			/* height of picture in mm    */
+	__u32 width;			/* width of picture in mm     */
+
+	__u32 accel_flags;		/* (OBSOLETE) see fb_info.flags */
+
+	/* Timing: All values in pixclocks, except pixclock (of course) */
+	__u32 pixclock;			/* pixel clock in ps (pico seconds) */
+	__u32 left_margin;		/* time from sync to picture	*/
+	__u32 right_margin;		/* time from picture to sync	*/
+	__u32 upper_margin;		/* time from sync to picture	*/
+	__u32 lower_margin;
+	__u32 hsync_len;		/* length of horizontal sync	*/
+	__u32 vsync_len;		/* length of vertical sync	*/
+	__u32 sync;			/* see FB_SYNC_*		*/
+	__u32 vmode;			/* see FB_VMODE_*		*/
+	__u32 rotate;			/* angle we rotate counter clockwise */
+	__u32 colorspace;		/* colorspace for FOURCC-based modes */
+	__u32 reserved[4];		/* Reserved for future compatibility */
+  };
+
+To modify variable information, applications call the FBIOPUT_VSCREENINFO
+ioctl with a pointer to a fb_var_screeninfo structure. If the call is
+successful, the driver will update the fixed screen information accordingly.
+
+Instead of filling the complete fb_var_screeninfo structure manually,
+applications should call the FBIOGET_VSCREENINFO ioctl and modify only the
+fields they care about.
+
+
+4. Format configuration
+-----------------------
+
+Frame buffer devices offer two ways to configure the frame buffer format: the
+legacy API and the FOURCC-based API.
+
+
+The legacy API has been the only frame buffer format configuration API for a
+long time and is thus widely used by application. It is the recommended API
+for applications when using RGB and grayscale formats, as well as legacy
+non-standard formats.
+
+To select a format, applications set the fb_var_screeninfo bits_per_pixel field
+to the desired frame buffer depth. Values up to 8 will usually map to
+monochrome, grayscale or pseudocolor visuals, although this is not required.
+
+- For grayscale formats, applications set the grayscale field to one. The red,
+  blue, green and transp fields must be set to 0 by applications and ignored by
+  drivers. Drivers must fill the red, blue and green offsets to 0 and lengths
+  to the bits_per_pixel value.
+
+- For pseudocolor formats, applications set the grayscale field to zero. The
+  red, blue, green and transp fields must be set to 0 by applications and
+  ignored by drivers. Drivers must fill the red, blue and green offsets to 0
+  and lengths to the bits_per_pixel value.
+
+- For truecolor and directcolor formats, applications set the grayscale field
+  to zero, and the red, blue, green and transp fields to describe the layout of
+  color components in memory::
+
+    struct fb_bitfield {
+	__u32 offset;			/* beginning of bitfield	*/
+	__u32 length;			/* length of bitfield		*/
+	__u32 msb_right;		/* != 0 : Most significant bit is */
+					/* right */
+    };
+
+  Pixel values are bits_per_pixel wide and are split in non-overlapping red,
+  green, blue and alpha (transparency) components. Location and size of each
+  component in the pixel value are described by the fb_bitfield offset and
+  length fields. Offset are computed from the right.
+
+  Pixels are always stored in an integer number of bytes. If the number of
+  bits per pixel is not a multiple of 8, pixel values are padded to the next
+  multiple of 8 bits.
+
+Upon successful format configuration, drivers update the fb_fix_screeninfo
+type, visual and line_length fields depending on the selected format.
+
+
+The FOURCC-based API replaces format descriptions by four character codes
+(FOURCC). FOURCCs are abstract identifiers that uniquely define a format
+without explicitly describing it. This is the only API that supports YUV
+formats. Drivers are also encouraged to implement the FOURCC-based API for RGB
+and grayscale formats.
+
+Drivers that support the FOURCC-based API report this capability by setting
+the FB_CAP_FOURCC bit in the fb_fix_screeninfo capabilities field.
+
+FOURCC definitions are located in the linux/videodev2.h header. However, and
+despite starting with the V4L2_PIX_FMT_prefix, they are not restricted to V4L2
+and don't require usage of the V4L2 subsystem. FOURCC documentation is
+available in Documentation/media/uapi/v4l/pixfmt.rst.
+
+To select a format, applications set the grayscale field to the desired FOURCC.
+For YUV formats, they should also select the appropriate colorspace by setting
+the colorspace field to one of the colorspaces listed in linux/videodev2.h and
+documented in Documentation/media/uapi/v4l/colorspaces.rst.
+
+The red, green, blue and transp fields are not used with the FOURCC-based API.
+For forward compatibility reasons applications must zero those fields, and
+drivers must ignore them. Values other than 0 may get a meaning in future
+extensions.
+
+Upon successful format configuration, drivers update the fb_fix_screeninfo
+type, visual and line_length fields depending on the selected format. The type
+and visual fields are set to FB_TYPE_FOURCC and FB_VISUAL_FOURCC respectively.
diff --git a/Documentation/fb/api.txt b/Documentation/fb/api.txt
deleted file mode 100644
index d52cf1e3b975..000000000000
--- a/Documentation/fb/api.txt
+++ /dev/null
@@ -1,306 +0,0 @@
-			The Frame Buffer Device API
-			---------------------------
-
-Last revised: June 21, 2011
-
-
-0. Introduction
----------------
-
-This document describes the frame buffer API used by applications to interact
-with frame buffer devices. In-kernel APIs between device drivers and the frame
-buffer core are not described.
-
-Due to a lack of documentation in the original frame buffer API, drivers
-behaviours differ in subtle (and not so subtle) ways. This document describes
-the recommended API implementation, but applications should be prepared to
-deal with different behaviours.
-
-
-1. Capabilities
----------------
-
-Device and driver capabilities are reported in the fixed screen information
-capabilities field.
-
-struct fb_fix_screeninfo {
-	...
-	__u16 capabilities;		/* see FB_CAP_*			*/
-	...
-};
-
-Application should use those capabilities to find out what features they can
-expect from the device and driver.
-
-- FB_CAP_FOURCC
-
-The driver supports the four character code (FOURCC) based format setting API.
-When supported, formats are configured using a FOURCC instead of manually
-specifying color components layout.
-
-
-2. Types and visuals
---------------------
-
-Pixels are stored in memory in hardware-dependent formats. Applications need
-to be aware of the pixel storage format in order to write image data to the
-frame buffer memory in the format expected by the hardware.
-
-Formats are described by frame buffer types and visuals. Some visuals require
-additional information, which are stored in the variable screen information
-bits_per_pixel, grayscale, red, green, blue and transp fields.
-
-Visuals describe how color information is encoded and assembled to create
-macropixels. Types describe how macropixels are stored in memory. The following
-types and visuals are supported.
-
-- FB_TYPE_PACKED_PIXELS
-
-Macropixels are stored contiguously in a single plane. If the number of bits
-per macropixel is not a multiple of 8, whether macropixels are padded to the
-next multiple of 8 bits or packed together into bytes depends on the visual.
-
-Padding at end of lines may be present and is then reported through the fixed
-screen information line_length field.
-
-- FB_TYPE_PLANES
-
-Macropixels are split across multiple planes. The number of planes is equal to
-the number of bits per macropixel, with plane i'th storing i'th bit from all
-macropixels.
-
-Planes are located contiguously in memory.
-
-- FB_TYPE_INTERLEAVED_PLANES
-
-Macropixels are split across multiple planes. The number of planes is equal to
-the number of bits per macropixel, with plane i'th storing i'th bit from all
-macropixels.
-
-Planes are interleaved in memory. The interleave factor, defined as the
-distance in bytes between the beginning of two consecutive interleaved blocks
-belonging to different planes, is stored in the fixed screen information
-type_aux field.
-
-- FB_TYPE_FOURCC
-
-Macropixels are stored in memory as described by the format FOURCC identifier
-stored in the variable screen information grayscale field.
-
-- FB_VISUAL_MONO01
-
-Pixels are black or white and stored on a number of bits (typically one)
-specified by the variable screen information bpp field.
-
-Black pixels are represented by all bits set to 1 and white pixels by all bits
-set to 0. When the number of bits per pixel is smaller than 8, several pixels
-are packed together in a byte.
-
-FB_VISUAL_MONO01 is currently used with FB_TYPE_PACKED_PIXELS only.
-
-- FB_VISUAL_MONO10
-
-Pixels are black or white and stored on a number of bits (typically one)
-specified by the variable screen information bpp field.
-
-Black pixels are represented by all bits set to 0 and white pixels by all bits
-set to 1. When the number of bits per pixel is smaller than 8, several pixels
-are packed together in a byte.
-
-FB_VISUAL_MONO01 is currently used with FB_TYPE_PACKED_PIXELS only.
-
-- FB_VISUAL_TRUECOLOR
-
-Pixels are broken into red, green and blue components, and each component
-indexes a read-only lookup table for the corresponding value. Lookup tables
-are device-dependent, and provide linear or non-linear ramps.
-
-Each component is stored in a macropixel according to the variable screen
-information red, green, blue and transp fields.
-
-- FB_VISUAL_PSEUDOCOLOR and FB_VISUAL_STATIC_PSEUDOCOLOR
-
-Pixel values are encoded as indices into a colormap that stores red, green and
-blue components. The colormap is read-only for FB_VISUAL_STATIC_PSEUDOCOLOR
-and read-write for FB_VISUAL_PSEUDOCOLOR.
-
-Each pixel value is stored in the number of bits reported by the variable
-screen information bits_per_pixel field.
-
-- FB_VISUAL_DIRECTCOLOR
-
-Pixels are broken into red, green and blue components, and each component
-indexes a programmable lookup table for the corresponding value.
-
-Each component is stored in a macropixel according to the variable screen
-information red, green, blue and transp fields.
-
-- FB_VISUAL_FOURCC
-
-Pixels are encoded and  interpreted as described by the format FOURCC
-identifier stored in the variable screen information grayscale field.
-
-
-3. Screen information
----------------------
-
-Screen information are queried by applications using the FBIOGET_FSCREENINFO
-and FBIOGET_VSCREENINFO ioctls. Those ioctls take a pointer to a
-fb_fix_screeninfo and fb_var_screeninfo structure respectively.
-
-struct fb_fix_screeninfo stores device independent unchangeable information
-about the frame buffer device and the current format. Those information can't
-be directly modified by applications, but can be changed by the driver when an
-application modifies the format.
-
-struct fb_fix_screeninfo {
-	char id[16];			/* identification string eg "TT Builtin" */
-	unsigned long smem_start;	/* Start of frame buffer mem */
-					/* (physical address) */
-	__u32 smem_len;			/* Length of frame buffer mem */
-	__u32 type;			/* see FB_TYPE_*		*/
-	__u32 type_aux;			/* Interleave for interleaved Planes */
-	__u32 visual;			/* see FB_VISUAL_*		*/
-	__u16 xpanstep;			/* zero if no hardware panning  */
-	__u16 ypanstep;			/* zero if no hardware panning  */
-	__u16 ywrapstep;		/* zero if no hardware ywrap    */
-	__u32 line_length;		/* length of a line in bytes    */
-	unsigned long mmio_start;	/* Start of Memory Mapped I/O   */
-					/* (physical address) */
-	__u32 mmio_len;			/* Length of Memory Mapped I/O  */
-	__u32 accel;			/* Indicate to driver which	*/
-					/*  specific chip/card we have	*/
-	__u16 capabilities;		/* see FB_CAP_*			*/
-	__u16 reserved[2];		/* Reserved for future compatibility */
-};
-
-struct fb_var_screeninfo stores device independent changeable information
-about a frame buffer device, its current format and video mode, as well as
-other miscellaneous parameters.
-
-struct fb_var_screeninfo {
-	__u32 xres;			/* visible resolution		*/
-	__u32 yres;
-	__u32 xres_virtual;		/* virtual resolution		*/
-	__u32 yres_virtual;
-	__u32 xoffset;			/* offset from virtual to visible */
-	__u32 yoffset;			/* resolution			*/
-
-	__u32 bits_per_pixel;		/* guess what			*/
-	__u32 grayscale;		/* 0 = color, 1 = grayscale,	*/
-					/* >1 = FOURCC			*/
-	struct fb_bitfield red;		/* bitfield in fb mem if true color, */
-	struct fb_bitfield green;	/* else only length is significant */
-	struct fb_bitfield blue;
-	struct fb_bitfield transp;	/* transparency			*/
-
-	__u32 nonstd;			/* != 0 Non standard pixel format */
-
-	__u32 activate;			/* see FB_ACTIVATE_*		*/
-
-	__u32 height;			/* height of picture in mm    */
-	__u32 width;			/* width of picture in mm     */
-
-	__u32 accel_flags;		/* (OBSOLETE) see fb_info.flags */
-
-	/* Timing: All values in pixclocks, except pixclock (of course) */
-	__u32 pixclock;			/* pixel clock in ps (pico seconds) */
-	__u32 left_margin;		/* time from sync to picture	*/
-	__u32 right_margin;		/* time from picture to sync	*/
-	__u32 upper_margin;		/* time from sync to picture	*/
-	__u32 lower_margin;
-	__u32 hsync_len;		/* length of horizontal sync	*/
-	__u32 vsync_len;		/* length of vertical sync	*/
-	__u32 sync;			/* see FB_SYNC_*		*/
-	__u32 vmode;			/* see FB_VMODE_*		*/
-	__u32 rotate;			/* angle we rotate counter clockwise */
-	__u32 colorspace;		/* colorspace for FOURCC-based modes */
-	__u32 reserved[4];		/* Reserved for future compatibility */
-};
-
-To modify variable information, applications call the FBIOPUT_VSCREENINFO
-ioctl with a pointer to a fb_var_screeninfo structure. If the call is
-successful, the driver will update the fixed screen information accordingly.
-
-Instead of filling the complete fb_var_screeninfo structure manually,
-applications should call the FBIOGET_VSCREENINFO ioctl and modify only the
-fields they care about.
-
-
-4. Format configuration
------------------------
-
-Frame buffer devices offer two ways to configure the frame buffer format: the
-legacy API and the FOURCC-based API.
-
-
-The legacy API has been the only frame buffer format configuration API for a
-long time and is thus widely used by application. It is the recommended API
-for applications when using RGB and grayscale formats, as well as legacy
-non-standard formats.
-
-To select a format, applications set the fb_var_screeninfo bits_per_pixel field
-to the desired frame buffer depth. Values up to 8 will usually map to
-monochrome, grayscale or pseudocolor visuals, although this is not required.
-
-- For grayscale formats, applications set the grayscale field to one. The red,
-  blue, green and transp fields must be set to 0 by applications and ignored by
-  drivers. Drivers must fill the red, blue and green offsets to 0 and lengths
-  to the bits_per_pixel value.
-
-- For pseudocolor formats, applications set the grayscale field to zero. The
-  red, blue, green and transp fields must be set to 0 by applications and
-  ignored by drivers. Drivers must fill the red, blue and green offsets to 0
-  and lengths to the bits_per_pixel value.
-
-- For truecolor and directcolor formats, applications set the grayscale field
-  to zero, and the red, blue, green and transp fields to describe the layout of
-  color components in memory.
-
-struct fb_bitfield {
-	__u32 offset;			/* beginning of bitfield	*/
-	__u32 length;			/* length of bitfield		*/
-	__u32 msb_right;		/* != 0 : Most significant bit is */
-					/* right */
-};
-
-  Pixel values are bits_per_pixel wide and are split in non-overlapping red,
-  green, blue and alpha (transparency) components. Location and size of each
-  component in the pixel value are described by the fb_bitfield offset and
-  length fields. Offset are computed from the right.
-
-  Pixels are always stored in an integer number of bytes. If the number of
-  bits per pixel is not a multiple of 8, pixel values are padded to the next
-  multiple of 8 bits.
-
-Upon successful format configuration, drivers update the fb_fix_screeninfo
-type, visual and line_length fields depending on the selected format.
-
-
-The FOURCC-based API replaces format descriptions by four character codes
-(FOURCC). FOURCCs are abstract identifiers that uniquely define a format
-without explicitly describing it. This is the only API that supports YUV
-formats. Drivers are also encouraged to implement the FOURCC-based API for RGB
-and grayscale formats.
-
-Drivers that support the FOURCC-based API report this capability by setting
-the FB_CAP_FOURCC bit in the fb_fix_screeninfo capabilities field.
-
-FOURCC definitions are located in the linux/videodev2.h header. However, and
-despite starting with the V4L2_PIX_FMT_prefix, they are not restricted to V4L2
-and don't require usage of the V4L2 subsystem. FOURCC documentation is
-available in Documentation/media/uapi/v4l/pixfmt.rst.
-
-To select a format, applications set the grayscale field to the desired FOURCC.
-For YUV formats, they should also select the appropriate colorspace by setting
-the colorspace field to one of the colorspaces listed in linux/videodev2.h and
-documented in Documentation/media/uapi/v4l/colorspaces.rst.
-
-The red, green, blue and transp fields are not used with the FOURCC-based API.
-For forward compatibility reasons applications must zero those fields, and
-drivers must ignore them. Values other than 0 may get a meaning in future
-extensions.
-
-Upon successful format configuration, drivers update the fb_fix_screeninfo
-type, visual and line_length fields depending on the selected format. The type
-and visual fields are set to FB_TYPE_FOURCC and FB_VISUAL_FOURCC respectively.
diff --git a/Documentation/fb/arkfb.rst b/Documentation/fb/arkfb.rst
new file mode 100644
index 000000000000..aeca8773dd7e
--- /dev/null
+++ b/Documentation/fb/arkfb.rst
@@ -0,0 +1,68 @@
+========================================
+arkfb - fbdev driver for ARK Logic chips
+========================================
+
+
+Supported Hardware
+==================
+
+	ARK 2000PV chip
+	ICS 5342 ramdac
+
+	- only BIOS initialized VGA devices supported
+	- probably not working on big endian
+
+
+Supported Features
+==================
+
+	*  4 bpp pseudocolor modes (with 18bit palette, two variants)
+	*  8 bpp pseudocolor mode (with 18bit palette)
+	* 16 bpp truecolor modes (RGB 555 and RGB 565)
+	* 24 bpp truecolor mode (RGB 888)
+	* 32 bpp truecolor mode (RGB 888)
+	* text mode (activated by bpp = 0)
+	* doublescan mode variant (not available in text mode)
+	* panning in both directions
+	* suspend/resume support
+
+Text mode is supported even in higher resolutions, but there is limitation to
+lower pixclocks (i got maximum about 70 MHz, it is dependent on specific
+hardware). This limitation is not enforced by driver. Text mode supports 8bit
+wide fonts only (hardware limitation) and 16bit tall fonts (driver
+limitation). Unfortunately character attributes (like color) in text mode are
+broken for unknown reason, so its usefulness is limited.
+
+There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
+packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
+with interleaved planes (1 byte interleave), MSB first. Both modes support
+8bit wide fonts only (driver limitation).
+
+Suspend/resume works on systems that initialize video card during resume and
+if device is active (for example used by fbcon).
+
+
+Missing Features
+================
+(alias TODO list)
+
+	* secondary (not initialized by BIOS) device support
+	* big endian support
+	* DPMS support
+	* MMIO support
+	* interlaced mode variant
+	* support for fontwidths != 8 in 4 bpp modes
+	* support for fontheight != 16 in text mode
+	* hardware cursor
+	* vsync synchronization
+	* feature connector support
+	* acceleration support (8514-like 2D)
+
+
+Known bugs
+==========
+
+	* character attributes (and cursor) in text mode are broken
+
+--
+Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/fb/arkfb.txt b/Documentation/fb/arkfb.txt
deleted file mode 100644
index e8487a9d6a05..000000000000
--- a/Documentation/fb/arkfb.txt
+++ /dev/null
@@ -1,68 +0,0 @@
-
-	arkfb - fbdev driver for ARK Logic chips
-	========================================
-
-
-Supported Hardware
-==================
-
-	ARK 2000PV chip
-	ICS 5342 ramdac
-
-	- only BIOS initialized VGA devices supported
-	- probably not working on big endian
-
-
-Supported Features
-==================
-
-	*  4 bpp pseudocolor modes (with 18bit palette, two variants)
-	*  8 bpp pseudocolor mode (with 18bit palette)
-	* 16 bpp truecolor modes (RGB 555 and RGB 565)
-	* 24 bpp truecolor mode (RGB 888)
-	* 32 bpp truecolor mode (RGB 888)
-	* text mode (activated by bpp = 0)
-	* doublescan mode variant (not available in text mode)
-	* panning in both directions
-	* suspend/resume support
-
-Text mode is supported even in higher resolutions, but there is limitation to
-lower pixclocks (i got maximum about 70 MHz, it is dependent on specific
-hardware). This limitation is not enforced by driver. Text mode supports 8bit
-wide fonts only (hardware limitation) and 16bit tall fonts (driver
-limitation). Unfortunately character attributes (like color) in text mode are
-broken for unknown reason, so its usefulness is limited.
-
-There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
-packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
-with interleaved planes (1 byte interleave), MSB first. Both modes support
-8bit wide fonts only (driver limitation).
-
-Suspend/resume works on systems that initialize video card during resume and
-if device is active (for example used by fbcon).
-
-
-Missing Features
-================
-(alias TODO list)
-
-	* secondary (not initialized by BIOS) device support
-   	* big endian support
-	* DPMS support
-	* MMIO support
-	* interlaced mode variant
-	* support for fontwidths != 8 in 4 bpp modes
-	* support for fontheight != 16 in text mode
-	* hardware cursor
-	* vsync synchronization
-	* feature connector support
-	* acceleration support (8514-like 2D)
-
-
-Known bugs
-==========
-
-	* character attributes (and cursor) in text mode are broken
-
---
-Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/fb/aty128fb.rst b/Documentation/fb/aty128fb.rst
new file mode 100644
index 000000000000..3f107718f933
--- /dev/null
+++ b/Documentation/fb/aty128fb.rst
@@ -0,0 +1,75 @@
+=================
+What is aty128fb?
+=================
+
+.. [This file is cloned from VesaFB/matroxfb]
+
+This is a driver for a graphic framebuffer for ATI Rage128 based devices
+on Intel and PPC boxes.
+
+Advantages:
+
+ * It provides a nice large console (128 cols + 48 lines with 1024x768)
+   without using tiny, unreadable fonts.
+ * You can run XF68_FBDev on top of /dev/fb0
+ * Most important: boot logo :-)
+
+Disadvantages:
+
+ * graphic mode is slower than text mode... but you should not notice
+   if you use same resolution as you used in textmode.
+ * still experimental.
+
+
+How to use it?
+==============
+
+Switching modes is done using the  video=aty128fb:<resolution>... modedb
+boot parameter or using `fbset` program.
+
+See Documentation/fb/modedb.rst for more information on modedb
+resolutions.
+
+You should compile in both vgacon (to boot if you remove your Rage128 from
+box) and aty128fb (for graphics mode). You should not compile-in vesafb
+unless you have primary display on non-Rage128 VBE2.0 device (see
+Documentation/fb/vesafb.rst for details).
+
+
+X11
+===
+
+XF68_FBDev should generally work fine, but it is non-accelerated. As of
+this document, 8 and 32bpp works fine.  There have been palette issues
+when switching from X to console and back to X.  You will have to restart
+X to fix this.
+
+
+Configuration
+=============
+
+You can pass kernel command line options to vesafb with
+`video=aty128fb:option1,option2:value2,option3` (multiple options should
+be separated by comma, values are separated from options by `:`).
+Accepted options:
+
+========= =======================================================
+noaccel   do not use acceleration engine. It is default.
+accel     use acceleration engine. Not finished.
+vmode:x   chooses PowerMacintosh video mode <x>. Deprecated.
+cmode:x   chooses PowerMacintosh colour mode <x>. Deprecated.
+<XxX@X>   selects startup videomode. See modedb.txt for detailed
+	  explanation. Default is 640x480x8bpp.
+========= =======================================================
+
+
+Limitations
+===========
+
+There are known and unknown bugs, features and misfeatures.
+Currently there are following known bugs:
+
+ - This driver is still experimental and is not finished.  Too many
+   bugs/errata to list here.
+
+Brad Douglas <brad@neruo.com>
diff --git a/Documentation/fb/aty128fb.txt b/Documentation/fb/aty128fb.txt
deleted file mode 100644
index b605204fcfe1..000000000000
--- a/Documentation/fb/aty128fb.txt
+++ /dev/null
@@ -1,72 +0,0 @@
-[This file is cloned from VesaFB/matroxfb]
-
-What is aty128fb?
-=================
-
-This is a driver for a graphic framebuffer for ATI Rage128 based devices
-on Intel and PPC boxes.
-
-Advantages:
-
- * It provides a nice large console (128 cols + 48 lines with 1024x768)
-   without using tiny, unreadable fonts.
- * You can run XF68_FBDev on top of /dev/fb0
- * Most important: boot logo :-)
-
-Disadvantages:
-
- * graphic mode is slower than text mode... but you should not notice
-   if you use same resolution as you used in textmode.
- * still experimental.
-
-
-How to use it?
-==============
-
-Switching modes is done using the  video=aty128fb:<resolution>... modedb
-boot parameter or using `fbset' program.
-
-See Documentation/fb/modedb.txt for more information on modedb
-resolutions.
-
-You should compile in both vgacon (to boot if you remove your Rage128 from
-box) and aty128fb (for graphics mode). You should not compile-in vesafb
-unless you have primary display on non-Rage128 VBE2.0 device (see 
-Documentation/fb/vesafb.txt for details).
-
-
-X11
-===
-
-XF68_FBDev should generally work fine, but it is non-accelerated. As of
-this document, 8 and 32bpp works fine.  There have been palette issues
-when switching from X to console and back to X.  You will have to restart
-X to fix this.
-
-
-Configuration
-=============
-
-You can pass kernel command line options to vesafb with
-`video=aty128fb:option1,option2:value2,option3' (multiple options should
-be separated by comma, values are separated from options by `:'). 
-Accepted options:
-
-noaccel  - do not use acceleration engine. It is default.
-accel    - use acceleration engine. Not finished.
-vmode:x  - chooses PowerMacintosh video mode <x>. Deprecated.
-cmode:x  - chooses PowerMacintosh colour mode <x>. Deprecated.
-<XxX@X>  - selects startup videomode. See modedb.txt for detailed
-	   explanation. Default is 640x480x8bpp.
-
-
-Limitations
-===========
-
-There are known and unknown bugs, features and misfeatures.
-Currently there are following known bugs:
- + This driver is still experimental and is not finished.  Too many
-   bugs/errata to list here.
-
---
-Brad Douglas <brad@neruo.com>
diff --git a/Documentation/fb/cirrusfb.rst b/Documentation/fb/cirrusfb.rst
new file mode 100644
index 000000000000..8c3e6c6cb114
--- /dev/null
+++ b/Documentation/fb/cirrusfb.rst
@@ -0,0 +1,94 @@
+============================================
+Framebuffer driver for Cirrus Logic chipsets
+============================================
+
+Copyright 1999 Jeff Garzik <jgarzik@pobox.com>
+
+
+.. just a little something to get people going; contributors welcome!
+
+
+Chip families supported:
+	- SD64
+	- Piccolo
+	- Picasso
+	- Spectrum
+	- Alpine (GD-543x/4x)
+	- Picasso4 (GD-5446)
+	- GD-5480
+	- Laguna (GD-546x)
+
+Bus's supported:
+	- PCI
+	- Zorro
+
+Architectures supported:
+	- i386
+	- Alpha
+	- PPC (Motorola Powerstack)
+	- m68k (Amiga)
+
+
+
+Default video modes
+-------------------
+At the moment, there are two kernel command line arguments supported:
+
+- mode:640x480
+- mode:800x600
+- mode:1024x768
+
+Full support for startup video modes (modedb) will be integrated soon.
+
+Version 1.9.9.1
+---------------
+* Fix memory detection for 512kB case
+* 800x600 mode
+* Fixed timings
+* Hint for AXP: Use -accel false -vyres -1 when changing resolution
+
+
+Version 1.9.4.4
+---------------
+* Preliminary Laguna support
+* Overhaul color register routines.
+* Associated with the above, console colors are now obtained from a LUT
+  called 'palette' instead of from the VGA registers.  This code was
+  modelled after that in atyfb and matroxfb.
+* Code cleanup, add comments.
+* Overhaul SR07 handling.
+* Bug fixes.
+
+
+Version 1.9.4.3
+---------------
+* Correctly set default startup video mode.
+* Do not override ram size setting.  Define
+  CLGEN_USE_HARDCODED_RAM_SETTINGS if you _do_ want to override the RAM
+  setting.
+* Compile fixes related to new 2.3.x IORESOURCE_IO[PORT] symbol changes.
+* Use new 2.3.x resource allocation.
+* Some code cleanup.
+
+
+Version 1.9.4.2
+---------------
+* Casting fixes.
+* Assertions no longer cause an oops on purpose.
+* Bug fixes.
+
+
+Version 1.9.4.1
+---------------
+* Add compatibility support.  Now requires a 2.1.x, 2.2.x or 2.3.x kernel.
+
+
+Version 1.9.4
+-------------
+* Several enhancements, smaller memory footprint, a few bugfixes.
+* Requires kernel 2.3.14-pre1 or later.
+
+
+Version 1.9.3
+-------------
+* Bundled with kernel 2.3.14-pre1 or later.
diff --git a/Documentation/fb/cirrusfb.txt b/Documentation/fb/cirrusfb.txt
deleted file mode 100644
index f75950d330a4..000000000000
--- a/Documentation/fb/cirrusfb.txt
+++ /dev/null
@@ -1,97 +0,0 @@
-
-		Framebuffer driver for Cirrus Logic chipsets
-		Copyright 1999 Jeff Garzik <jgarzik@pobox.com>
-
-
-
-{ just a little something to get people going; contributors welcome! }
-
-
-
-Chip families supported:
-	SD64
-	Piccolo
-	Picasso
-	Spectrum
-	Alpine (GD-543x/4x)
-	Picasso4 (GD-5446)
-	GD-5480
-	Laguna (GD-546x)
-
-Bus's supported:
-	PCI
-	Zorro
-
-Architectures supported:
-	i386
-	Alpha
-	PPC (Motorola Powerstack)
-	m68k (Amiga)
-
-
-
-Default video modes
--------------------
-At the moment, there are two kernel command line arguments supported:
-
-mode:640x480
-mode:800x600
-	or
-mode:1024x768
-
-Full support for startup video modes (modedb) will be integrated soon.
-
-Version 1.9.9.1
----------------
-* Fix memory detection for 512kB case
-* 800x600 mode
-* Fixed timings
-* Hint for AXP: Use -accel false -vyres -1 when changing resolution
-
-
-Version 1.9.4.4
----------------
-* Preliminary Laguna support
-* Overhaul color register routines.
-* Associated with the above, console colors are now obtained from a LUT
-  called 'palette' instead of from the VGA registers.  This code was
-  modelled after that in atyfb and matroxfb.
-* Code cleanup, add comments.
-* Overhaul SR07 handling.
-* Bug fixes.
-
-
-Version 1.9.4.3
----------------
-* Correctly set default startup video mode.
-* Do not override ram size setting.  Define
-  CLGEN_USE_HARDCODED_RAM_SETTINGS if you _do_ want to override the RAM
-  setting.
-* Compile fixes related to new 2.3.x IORESOURCE_IO[PORT] symbol changes.
-* Use new 2.3.x resource allocation.
-* Some code cleanup.
-
-
-Version 1.9.4.2
----------------
-* Casting fixes.
-* Assertions no longer cause an oops on purpose.
-* Bug fixes.
-
-
-Version 1.9.4.1
----------------
-* Add compatibility support.  Now requires a 2.1.x, 2.2.x or 2.3.x kernel.
-
-
-Version 1.9.4
--------------
-* Several enhancements, smaller memory footprint, a few bugfixes.
-* Requires kernel 2.3.14-pre1 or later.
-
-
-Version 1.9.3
--------------
-* Bundled with kernel 2.3.14-pre1 or later.
-
-
diff --git a/Documentation/fb/cmap_xfbdev.rst b/Documentation/fb/cmap_xfbdev.rst
new file mode 100644
index 000000000000..5db5e9787361
--- /dev/null
+++ b/Documentation/fb/cmap_xfbdev.rst
@@ -0,0 +1,56 @@
+==========================
+Understanding fbdev's cmap
+==========================
+
+These notes explain how X's dix layer uses fbdev's cmap structures.
+
+-  example of relevant structures in fbdev as used for a 3-bit grayscale cmap::
+
+    struct fb_var_screeninfo {
+	    .bits_per_pixel = 8,
+	    .grayscale      = 1,
+	    .red =          { 4, 3, 0 },
+	    .green =        { 0, 0, 0 },
+	    .blue =         { 0, 0, 0 },
+    }
+    struct fb_fix_screeninfo {
+	    .visual =       FB_VISUAL_STATIC_PSEUDOCOLOR,
+    }
+    for (i = 0; i < 8; i++)
+	info->cmap.red[i] = (((2*i)+1)*(0xFFFF))/16;
+    memcpy(info->cmap.green, info->cmap.red, sizeof(u16)*8);
+    memcpy(info->cmap.blue, info->cmap.red, sizeof(u16)*8);
+
+-  X11 apps do something like the following when trying to use grayscale::
+
+    for (i=0; i < 8; i++) {
+	char colorspec[64];
+	memset(colorspec,0,64);
+	sprintf(colorspec, "rgb:%x/%x/%x", i*36,i*36,i*36);
+	if (!XParseColor(outputDisplay, testColormap, colorspec, &wantedColor))
+		printf("Can't get color %s\n",colorspec);
+	XAllocColor(outputDisplay, testColormap, &wantedColor);
+	grays[i] = wantedColor;
+    }
+
+There's also named equivalents like gray1..x provided you have an rgb.txt.
+
+Somewhere in X's callchain, this results in a call to X code that handles the
+colormap. For example, Xfbdev hits the following:
+
+xc-011010/programs/Xserver/dix/colormap.c::
+
+  FindBestPixel(pentFirst, size, prgb, channel)
+
+  dr = (long) pent->co.local.red - prgb->red;
+  dg = (long) pent->co.local.green - prgb->green;
+  db = (long) pent->co.local.blue - prgb->blue;
+  sq = dr * dr;
+  UnsignedToBigNum (sq, &sum);
+  BigNumAdd (&sum, &temp, &sum);
+
+co.local.red are entries that were brought in through FBIOGETCMAP which come
+directly from the info->cmap.red that was listed above. The prgb is the rgb
+that the app wants to match to. The above code is doing what looks like a least
+squares matching function. That's why the cmap entries can't be set to the left
+hand side boundaries of a color range.
diff --git a/Documentation/fb/cmap_xfbdev.txt b/Documentation/fb/cmap_xfbdev.txt
deleted file mode 100644
index 55e1f0a3d2b4..000000000000
--- a/Documentation/fb/cmap_xfbdev.txt
+++ /dev/null
@@ -1,53 +0,0 @@
-Understanding fbdev's cmap
---------------------------
-
-These notes explain how X's dix layer uses fbdev's cmap structures.
-
-*. example of relevant structures in fbdev as used for a 3-bit grayscale cmap
-struct fb_var_screeninfo {
-        .bits_per_pixel = 8,
-        .grayscale      = 1,
-        .red =          { 4, 3, 0 },
-        .green =        { 0, 0, 0 },
-        .blue =         { 0, 0, 0 },
-}
-struct fb_fix_screeninfo {
-        .visual =       FB_VISUAL_STATIC_PSEUDOCOLOR,
-}
-for (i = 0; i < 8; i++)
-	info->cmap.red[i] = (((2*i)+1)*(0xFFFF))/16;
-memcpy(info->cmap.green, info->cmap.red, sizeof(u16)*8);
-memcpy(info->cmap.blue, info->cmap.red, sizeof(u16)*8);
-
-*. X11 apps do something like the following when trying to use grayscale.
-for (i=0; i < 8; i++) {
-	char colorspec[64];
-	memset(colorspec,0,64);
-	sprintf(colorspec, "rgb:%x/%x/%x", i*36,i*36,i*36);
-	if (!XParseColor(outputDisplay, testColormap, colorspec, &wantedColor))
-		printf("Can't get color %s\n",colorspec);
-	XAllocColor(outputDisplay, testColormap, &wantedColor);
-	grays[i] = wantedColor;
-}
-There's also named equivalents like gray1..x provided you have an rgb.txt.
-
-Somewhere in X's callchain, this results in a call to X code that handles the
-colormap. For example, Xfbdev hits the following:
-
-xc-011010/programs/Xserver/dix/colormap.c:
-
-FindBestPixel(pentFirst, size, prgb, channel)
-
-dr = (long) pent->co.local.red - prgb->red;
-dg = (long) pent->co.local.green - prgb->green;
-db = (long) pent->co.local.blue - prgb->blue;
-sq = dr * dr;
-UnsignedToBigNum (sq, &sum);
-BigNumAdd (&sum, &temp, &sum);
-
-co.local.red are entries that were brought in through FBIOGETCMAP which come
-directly from the info->cmap.red that was listed above. The prgb is the rgb
-that the app wants to match to. The above code is doing what looks like a least
-squares matching function. That's why the cmap entries can't be set to the left
-hand side boundaries of a color range.
-
diff --git a/Documentation/fb/deferred_io.rst b/Documentation/fb/deferred_io.rst
new file mode 100644
index 000000000000..7300cff255a3
--- /dev/null
+++ b/Documentation/fb/deferred_io.rst
@@ -0,0 +1,79 @@
+===========
+Deferred IO
+===========
+
+Deferred IO is a way to delay and repurpose IO. It uses host memory as a
+buffer and the MMU pagefault as a pretrigger for when to perform the device
+IO. The following example may be a useful explanation of how one such setup
+works:
+
+- userspace app like Xfbdev mmaps framebuffer
+- deferred IO and driver sets up fault and page_mkwrite handlers
+- userspace app tries to write to mmaped vaddress
+- we get pagefault and reach fault handler
+- fault handler finds and returns physical page
+- we get page_mkwrite where we add this page to a list
+- schedule a workqueue task to be run after a delay
+- app continues writing to that page with no additional cost. this is
+  the key benefit.
+- the workqueue task comes in and mkcleans the pages on the list, then
+  completes the work associated with updating the framebuffer. this is
+  the real work talking to the device.
+- app tries to write to the address (that has now been mkcleaned)
+- get pagefault and the above sequence occurs again
+
+As can be seen from above, one benefit is roughly to allow bursty framebuffer
+writes to occur at minimum cost. Then after some time when hopefully things
+have gone quiet, we go and really update the framebuffer which would be
+a relatively more expensive operation.
+
+For some types of nonvolatile high latency displays, the desired image is
+the final image rather than the intermediate stages which is why it's okay
+to not update for each write that is occurring.
+
+It may be the case that this is useful in other scenarios as well. Paul Mundt
+has mentioned a case where it is beneficial to use the page count to decide
+whether to coalesce and issue SG DMA or to do memory bursts.
+
+Another one may be if one has a device framebuffer that is in an usual format,
+say diagonally shifting RGB, this may then be a mechanism for you to allow
+apps to pretend to have a normal framebuffer but reswizzle for the device
+framebuffer at vsync time based on the touched pagelist.
+
+How to use it: (for applications)
+---------------------------------
+No changes needed. mmap the framebuffer like normal and just use it.
+
+How to use it: (for fbdev drivers)
+----------------------------------
+The following example may be helpful.
+
+1. Setup your structure. Eg::
+
+	static struct fb_deferred_io hecubafb_defio = {
+		.delay		= HZ,
+		.deferred_io	= hecubafb_dpy_deferred_io,
+	};
+
+The delay is the minimum delay between when the page_mkwrite trigger occurs
+and when the deferred_io callback is called. The deferred_io callback is
+explained below.
+
+2. Setup your deferred IO callback. Eg::
+
+	static void hecubafb_dpy_deferred_io(struct fb_info *info,
+					     struct list_head *pagelist)
+
+The deferred_io callback is where you would perform all your IO to the display
+device. You receive the pagelist which is the list of pages that were written
+to during the delay. You must not modify this list. This callback is called
+from a workqueue.
+
+3. Call init::
+
+	info->fbdefio = &hecubafb_defio;
+	fb_deferred_io_init(info);
+
+4. Call cleanup::
+
+	fb_deferred_io_cleanup(info);
diff --git a/Documentation/fb/deferred_io.txt b/Documentation/fb/deferred_io.txt
deleted file mode 100644
index 748328370250..000000000000
--- a/Documentation/fb/deferred_io.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-Deferred IO
------------
-
-Deferred IO is a way to delay and repurpose IO. It uses host memory as a
-buffer and the MMU pagefault as a pretrigger for when to perform the device
-IO. The following example may be a useful explanation of how one such setup
-works:
-
-- userspace app like Xfbdev mmaps framebuffer
-- deferred IO and driver sets up fault and page_mkwrite handlers
-- userspace app tries to write to mmaped vaddress
-- we get pagefault and reach fault handler
-- fault handler finds and returns physical page
-- we get page_mkwrite where we add this page to a list
-- schedule a workqueue task to be run after a delay
-- app continues writing to that page with no additional cost. this is
-  the key benefit.
-- the workqueue task comes in and mkcleans the pages on the list, then
- completes the work associated with updating the framebuffer. this is
-  the real work talking to the device.
-- app tries to write to the address (that has now been mkcleaned)
-- get pagefault and the above sequence occurs again
-
-As can be seen from above, one benefit is roughly to allow bursty framebuffer
-writes to occur at minimum cost. Then after some time when hopefully things
-have gone quiet, we go and really update the framebuffer which would be
-a relatively more expensive operation.
-
-For some types of nonvolatile high latency displays, the desired image is
-the final image rather than the intermediate stages which is why it's okay
-to not update for each write that is occurring.
-
-It may be the case that this is useful in other scenarios as well. Paul Mundt
-has mentioned a case where it is beneficial to use the page count to decide
-whether to coalesce and issue SG DMA or to do memory bursts.
-
-Another one may be if one has a device framebuffer that is in an usual format,
-say diagonally shifting RGB, this may then be a mechanism for you to allow
-apps to pretend to have a normal framebuffer but reswizzle for the device
-framebuffer at vsync time based on the touched pagelist.
-
-How to use it: (for applications)
----------------------------------
-No changes needed. mmap the framebuffer like normal and just use it.
-
-How to use it: (for fbdev drivers)
-----------------------------------
-The following example may be helpful.
-
-1. Setup your structure. Eg:
-
-static struct fb_deferred_io hecubafb_defio = {
-	.delay		= HZ,
-	.deferred_io	= hecubafb_dpy_deferred_io,
-};
-
-The delay is the minimum delay between when the page_mkwrite trigger occurs
-and when the deferred_io callback is called. The deferred_io callback is
-explained below.
-
-2. Setup your deferred IO callback. Eg:
-static void hecubafb_dpy_deferred_io(struct fb_info *info,
-				struct list_head *pagelist)
-
-The deferred_io callback is where you would perform all your IO to the display
-device. You receive the pagelist which is the list of pages that were written
-to during the delay. You must not modify this list. This callback is called
-from a workqueue.
-
-3. Call init
-	info->fbdefio = &hecubafb_defio;
-	fb_deferred_io_init(info);
-
-4. Call cleanup
-	fb_deferred_io_cleanup(info);
diff --git a/Documentation/fb/efifb.rst b/Documentation/fb/efifb.rst
new file mode 100644
index 000000000000..04840331a00e
--- /dev/null
+++ b/Documentation/fb/efifb.rst
@@ -0,0 +1,39 @@
+==============
+What is efifb?
+==============
+
+This is a generic EFI platform driver for Intel based Apple computers.
+efifb is only for EFI booted Intel Macs.
+
+Supported Hardware
+==================
+
+- iMac 17"/20"
+- Macbook
+- Macbook Pro 15"/17"
+- MacMini
+
+How to use it?
+==============
+
+efifb does not have any kind of autodetection of your machine.
+You have to add the following kernel parameters in your elilo.conf::
+
+	Macbook :
+		video=efifb:macbook
+	MacMini :
+		video=efifb:mini
+	Macbook Pro 15", iMac 17" :
+		video=efifb:i17
+	Macbook Pro 17", iMac 20" :
+		video=efifb:i20
+
+Accepted options:
+
+======= ===========================================================
+nowc	Don't map the framebuffer write combined. This can be used
+	to workaround side-effects and slowdowns on other CPU cores
+	when large amounts of console data are written.
+======= ===========================================================
+
+Edgar Hucek <gimli@dark-green.com>
diff --git a/Documentation/fb/efifb.txt b/Documentation/fb/efifb.txt
deleted file mode 100644
index 1a85c1bdaf38..000000000000
--- a/Documentation/fb/efifb.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-
-What is efifb?
-===============
-
-This is a generic EFI platform driver for Intel based Apple computers.
-efifb is only for EFI booted Intel Macs.
-
-Supported Hardware
-==================
-
-iMac 17"/20"
-Macbook
-Macbook Pro 15"/17"
-MacMini
-
-How to use it?
-==============
-
-efifb does not have any kind of autodetection of your machine.
-You have to add the following kernel parameters in your elilo.conf:
-	Macbook :
-		video=efifb:macbook
-	MacMini :
-		video=efifb:mini
-	Macbook Pro 15", iMac 17" :
-		video=efifb:i17
-	Macbook Pro 17", iMac 20" :
-		video=efifb:i20
-
-Accepted options:
-
-nowc	Don't map the framebuffer write combined. This can be used
-	to workaround side-effects and slowdowns on other CPU cores
-	when large amounts of console data are written.
-
---
-Edgar Hucek <gimli@dark-green.com>
diff --git a/Documentation/fb/ep93xx-fb.rst b/Documentation/fb/ep93xx-fb.rst
new file mode 100644
index 000000000000..6f7767926d1a
--- /dev/null
+++ b/Documentation/fb/ep93xx-fb.rst
@@ -0,0 +1,140 @@
+================================
+Driver for EP93xx LCD controller
+================================
+
+The EP93xx LCD controller can drive both standard desktop monitors and
+embedded LCD displays. If you have a standard desktop monitor then you
+can use the standard Linux video mode database. In your board file::
+
+	static struct ep93xxfb_mach_info some_board_fb_info = {
+		.num_modes	= EP93XXFB_USE_MODEDB,
+		.bpp		= 16,
+	};
+
+If you have an embedded LCD display then you need to define a video
+mode for it as follows::
+
+	static struct fb_videomode some_board_video_modes[] = {
+		{
+			.name		= "some_lcd_name",
+			/* Pixel clock, porches, etc */
+		},
+	};
+
+Note that the pixel clock value is in pico-seconds. You can use the
+KHZ2PICOS macro to convert the pixel clock value. Most other values
+are in pixel clocks. See Documentation/fb/framebuffer.rst for further
+details.
+
+The ep93xxfb_mach_info structure for your board should look like the
+following::
+
+	static struct ep93xxfb_mach_info some_board_fb_info = {
+		.num_modes	= ARRAY_SIZE(some_board_video_modes),
+		.modes		= some_board_video_modes,
+		.default_mode	= &some_board_video_modes[0],
+		.bpp		= 16,
+	};
+
+The framebuffer device can be registered by adding the following to
+your board initialisation function::
+
+	ep93xx_register_fb(&some_board_fb_info);
+
+=====================
+Video Attribute Flags
+=====================
+
+The ep93xxfb_mach_info structure has a flags field which can be used
+to configure the controller. The video attributes flags are fully
+documented in section 7 of the EP93xx users' guide. The following
+flags are available:
+
+=============================== ==========================================
+EP93XXFB_PCLK_FALLING		Clock data on the falling edge of the
+				pixel clock. The default is to clock
+				data on the rising edge.
+
+EP93XXFB_SYNC_BLANK_HIGH	Blank signal is active high. By
+				default the blank signal is active low.
+
+EP93XXFB_SYNC_HORIZ_HIGH	Horizontal sync is active high. By
+				default the horizontal sync is active low.
+
+EP93XXFB_SYNC_VERT_HIGH		Vertical sync is active high. By
+				default the vertical sync is active high.
+=============================== ==========================================
+
+The physical address of the framebuffer can be controlled using the
+following flags:
+
+=============================== ======================================
+EP93XXFB_USE_SDCSN0		Use SDCSn[0] for the framebuffer. This
+				is the default setting.
+
+EP93XXFB_USE_SDCSN1		Use SDCSn[1] for the framebuffer.
+
+EP93XXFB_USE_SDCSN2		Use SDCSn[2] for the framebuffer.
+
+EP93XXFB_USE_SDCSN3		Use SDCSn[3] for the framebuffer.
+=============================== ======================================
+
+==================
+Platform callbacks
+==================
+
+The EP93xx framebuffer driver supports three optional platform
+callbacks: setup, teardown and blank. The setup and teardown functions
+are called when the framebuffer driver is installed and removed
+respectively. The blank function is called whenever the display is
+blanked or unblanked.
+
+The setup and teardown devices pass the platform_device structure as
+an argument. The fb_info and ep93xxfb_mach_info structures can be
+obtained as follows::
+
+	static int some_board_fb_setup(struct platform_device *pdev)
+	{
+		struct ep93xxfb_mach_info *mach_info = pdev->dev.platform_data;
+		struct fb_info *fb_info = platform_get_drvdata(pdev);
+
+		/* Board specific framebuffer setup */
+	}
+
+======================
+Setting the video mode
+======================
+
+The video mode is set using the following syntax::
+
+	video=XRESxYRES[-BPP][@REFRESH]
+
+If the EP93xx video driver is built-in then the video mode is set on
+the Linux kernel command line, for example::
+
+	video=ep93xx-fb:800x600-16@60
+
+If the EP93xx video driver is built as a module then the video mode is
+set when the module is installed::
+
+	modprobe ep93xx-fb video=320x240
+
+==============
+Screenpage bug
+==============
+
+At least on the EP9315 there is a silicon bug which causes bit 27 of
+the VIDSCRNPAGE (framebuffer physical offset) to be tied low. There is
+an unofficial errata for this bug at::
+
+	http://marc.info/?l=linux-arm-kernel&m=110061245502000&w=2
+
+By default the EP93xx framebuffer driver checks if the allocated physical
+address has bit 27 set. If it does, then the memory is freed and an
+error is returned. The check can be disabled by adding the following
+option when loading the driver::
+
+      ep93xx-fb.check_screenpage_bug=0
+
+In some cases it may be possible to reconfigure your SDRAM layout to
+avoid this bug. See section 13 of the EP93xx users' guide for details.
diff --git a/Documentation/fb/ep93xx-fb.txt b/Documentation/fb/ep93xx-fb.txt
deleted file mode 100644
index 5af1bd9effae..000000000000
--- a/Documentation/fb/ep93xx-fb.txt
+++ /dev/null
@@ -1,135 +0,0 @@
-================================
-Driver for EP93xx LCD controller
-================================
-
-The EP93xx LCD controller can drive both standard desktop monitors and
-embedded LCD displays. If you have a standard desktop monitor then you
-can use the standard Linux video mode database. In your board file:
-
-	static struct ep93xxfb_mach_info some_board_fb_info = {
-		.num_modes	= EP93XXFB_USE_MODEDB,
-		.bpp		= 16,
-	};
-
-If you have an embedded LCD display then you need to define a video
-mode for it as follows:
-
-	static struct fb_videomode some_board_video_modes[] = {
-		{
-			.name		= "some_lcd_name",
-			/* Pixel clock, porches, etc */
-		},
-	};
-
-Note that the pixel clock value is in pico-seconds. You can use the
-KHZ2PICOS macro to convert the pixel clock value. Most other values
-are in pixel clocks. See Documentation/fb/framebuffer.txt for further
-details.
-
-The ep93xxfb_mach_info structure for your board should look like the
-following:
-
-	static struct ep93xxfb_mach_info some_board_fb_info = {
-		.num_modes	= ARRAY_SIZE(some_board_video_modes),
-		.modes		= some_board_video_modes,
-		.default_mode	= &some_board_video_modes[0],
-		.bpp		= 16,
-	};
-
-The framebuffer device can be registered by adding the following to
-your board initialisation function:
-
-	ep93xx_register_fb(&some_board_fb_info);
-
-=====================
-Video Attribute Flags
-=====================
-
-The ep93xxfb_mach_info structure has a flags field which can be used
-to configure the controller. The video attributes flags are fully
-documented in section 7 of the EP93xx users' guide. The following
-flags are available:
-
-EP93XXFB_PCLK_FALLING		Clock data on the falling edge of the
-				pixel clock. The default is to clock
-				data on the rising edge.
-
-EP93XXFB_SYNC_BLANK_HIGH	Blank signal is active high. By
-				default the blank signal is active low.
-
-EP93XXFB_SYNC_HORIZ_HIGH	Horizontal sync is active high. By
-				default the horizontal sync is active low.
-
-EP93XXFB_SYNC_VERT_HIGH		Vertical sync is active high. By
-				default the vertical sync is active high.
-
-The physical address of the framebuffer can be controlled using the
-following flags:
-
-EP93XXFB_USE_SDCSN0		Use SDCSn[0] for the framebuffer. This
-				is the default setting.
-
-EP93XXFB_USE_SDCSN1		Use SDCSn[1] for the framebuffer.
-
-EP93XXFB_USE_SDCSN2		Use SDCSn[2] for the framebuffer.
-
-EP93XXFB_USE_SDCSN3		Use SDCSn[3] for the framebuffer.
-
-==================
-Platform callbacks
-==================
-
-The EP93xx framebuffer driver supports three optional platform
-callbacks: setup, teardown and blank. The setup and teardown functions
-are called when the framebuffer driver is installed and removed
-respectively. The blank function is called whenever the display is
-blanked or unblanked.
-
-The setup and teardown devices pass the platform_device structure as
-an argument. The fb_info and ep93xxfb_mach_info structures can be
-obtained as follows:
-
-	static int some_board_fb_setup(struct platform_device *pdev)
-	{
-		struct ep93xxfb_mach_info *mach_info = pdev->dev.platform_data;
-		struct fb_info *fb_info = platform_get_drvdata(pdev);
-
-		/* Board specific framebuffer setup */
-	}
-
-======================
-Setting the video mode
-======================
-
-The video mode is set using the following syntax:
-
-	video=XRESxYRES[-BPP][@REFRESH]
-
-If the EP93xx video driver is built-in then the video mode is set on
-the Linux kernel command line, for example:
-
-	video=ep93xx-fb:800x600-16@60
-
-If the EP93xx video driver is built as a module then the video mode is
-set when the module is installed:
-
-	modprobe ep93xx-fb video=320x240
-
-==============
-Screenpage bug
-==============
-
-At least on the EP9315 there is a silicon bug which causes bit 27 of
-the VIDSCRNPAGE (framebuffer physical offset) to be tied low. There is
-an unofficial errata for this bug at:
-	http://marc.info/?l=linux-arm-kernel&m=110061245502000&w=2
-
-By default the EP93xx framebuffer driver checks if the allocated physical
-address has bit 27 set. If it does, then the memory is freed and an
-error is returned. The check can be disabled by adding the following
-option when loading the driver:
-
-      ep93xx-fb.check_screenpage_bug=0
-
-In some cases it may be possible to reconfigure your SDRAM layout to
-avoid this bug. See section 13 of the EP93xx users' guide for details.
diff --git a/Documentation/fb/fbcon.rst b/Documentation/fb/fbcon.rst
new file mode 100644
index 000000000000..ebca41785abe
--- /dev/null
+++ b/Documentation/fb/fbcon.rst
@@ -0,0 +1,350 @@
+=======================
+The Framebuffer Console
+=======================
+
+The framebuffer console (fbcon), as its name implies, is a text
+console running on top of the framebuffer device. It has the functionality of
+any standard text console driver, such as the VGA console, with the added
+features that can be attributed to the graphical nature of the framebuffer.
+
+In the x86 architecture, the framebuffer console is optional, and
+some even treat it as a toy. For other architectures, it is the only available
+display device, text or graphical.
+
+What are the features of fbcon?  The framebuffer console supports
+high resolutions, varying font types, display rotation, primitive multihead,
+etc. Theoretically, multi-colored fonts, blending, aliasing, and any feature
+made available by the underlying graphics card are also possible.
+
+A. Configuration
+================
+
+The framebuffer console can be enabled by using your favorite kernel
+configuration tool.  It is under Device Drivers->Graphics Support->Frame
+buffer Devices->Console display driver support->Framebuffer Console Support.
+Select 'y' to compile support statically or 'm' for module support.  The
+module will be fbcon.
+
+In order for fbcon to activate, at least one framebuffer driver is
+required, so choose from any of the numerous drivers available. For x86
+systems, they almost universally have VGA cards, so vga16fb and vesafb will
+always be available. However, using a chipset-specific driver will give you
+more speed and features, such as the ability to change the video mode
+dynamically.
+
+To display the penguin logo, choose any logo available in Graphics
+support->Bootup logo.
+
+Also, you will need to select at least one compiled-in font, but if
+you don't do anything, the kernel configuration tool will select one for you,
+usually an 8x16 font.
+
+GOTCHA: A common bug report is enabling the framebuffer without enabling the
+framebuffer console.  Depending on the driver, you may get a blanked or
+garbled display, but the system still boots to completion.  If you are
+fortunate to have a driver that does not alter the graphics chip, then you
+will still get a VGA console.
+
+B. Loading
+==========
+
+Possible scenarios:
+
+1. Driver and fbcon are compiled statically
+
+	 Usually, fbcon will automatically take over your console. The notable
+	 exception is vesafb.  It needs to be explicitly activated with the
+	 vga= boot option parameter.
+
+2. Driver is compiled statically, fbcon is compiled as a module
+
+	 Depending on the driver, you either get a standard console, or a
+	 garbled display, as mentioned above.  To get a framebuffer console,
+	 do a 'modprobe fbcon'.
+
+3. Driver is compiled as a module, fbcon is compiled statically
+
+	 You get your standard console.  Once the driver is loaded with
+	 'modprobe xxxfb', fbcon automatically takes over the console with
+	 the possible exception of using the fbcon=map:n option. See below.
+
+4. Driver and fbcon are compiled as a module.
+
+	 You can load them in any order. Once both are loaded, fbcon will take
+	 over the console.
+
+C. Boot options
+
+	 The framebuffer console has several, largely unknown, boot options
+	 that can change its behavior.
+
+1. fbcon=font:<name>
+
+	Select the initial font to use. The value 'name' can be any of the
+	compiled-in fonts: 10x18, 6x10, 7x14, Acorn8x8, MINI4x6,
+	PEARL8x8, ProFont6x11, SUN12x22, SUN8x16, TER16x32, VGA8x16, VGA8x8.
+
+	Note, not all drivers can handle font with widths not divisible by 8,
+	such as vga16fb.
+
+2. fbcon=scrollback:<value>[k]
+
+	The scrollback buffer is memory that is used to preserve display
+	contents that has already scrolled past your view.  This is accessed
+	by using the Shift-PageUp key combination.  The value 'value' is any
+	integer. It defaults to 32KB.  The 'k' suffix is optional, and will
+	multiply the 'value' by 1024.
+
+3. fbcon=map:<0123>
+
+	This is an interesting option. It tells which driver gets mapped to
+	which console. The value '0123' is a sequence that gets repeated until
+	the total length is 64 which is the number of consoles available. In
+	the above example, it is expanded to 012301230123... and the mapping
+	will be::
+
+		tty | 1 2 3 4 5 6 7 8 9 ...
+		fb  | 0 1 2 3 0 1 2 3 0 ...
+
+		('cat /proc/fb' should tell you what the fb numbers are)
+
+	One side effect that may be useful is using a map value that exceeds
+	the number of loaded fb drivers. For example, if only one driver is
+	available, fb0, adding fbcon=map:1 tells fbcon not to take over the
+	console.
+
+	Later on, when you want to map the console the to the framebuffer
+	device, you can use the con2fbmap utility.
+
+4. fbcon=vc:<n1>-<n2>
+
+	This option tells fbcon to take over only a range of consoles as
+	specified by the values 'n1' and 'n2'. The rest of the consoles
+	outside the given range will still be controlled by the standard
+	console driver.
+
+	NOTE: For x86 machines, the standard console is the VGA console which
+	is typically located on the same video card.  Thus, the consoles that
+	are controlled by the VGA console will be garbled.
+
+4. fbcon=rotate:<n>
+
+	This option changes the orientation angle of the console display. The
+	value 'n' accepts the following:
+
+	    - 0 - normal orientation (0 degree)
+	    - 1 - clockwise orientation (90 degrees)
+	    - 2 - upside down orientation (180 degrees)
+	    - 3 - counterclockwise orientation (270 degrees)
+
+	The angle can be changed anytime afterwards by 'echoing' the same
+	numbers to any one of the 2 attributes found in
+	/sys/class/graphics/fbcon:
+
+		- rotate     - rotate the display of the active console
+		- rotate_all - rotate the display of all consoles
+
+	Console rotation will only become available if Framebuffer Console
+	Rotation support is compiled in your kernel.
+
+	NOTE: This is purely console rotation.  Any other applications that
+	use the framebuffer will remain at their 'normal' orientation.
+	Actually, the underlying fb driver is totally ignorant of console
+	rotation.
+
+5. fbcon=margin:<color>
+
+	This option specifies the color of the margins. The margins are the
+	leftover area at the right and the bottom of the screen that are not
+	used by text. By default, this area will be black. The 'color' value
+	is an integer number that depends on the framebuffer driver being used.
+
+6. fbcon=nodefer
+
+	If the kernel is compiled with deferred fbcon takeover support, normally
+	the framebuffer contents, left in place by the firmware/bootloader, will
+	be preserved until there actually is some text is output to the console.
+	This option causes fbcon to bind immediately to the fbdev device.
+
+7. fbcon=logo-pos:<location>
+
+	The only possible 'location' is 'center' (without quotes), and when
+	given, the bootup logo is moved from the default top-left corner
+	location to the center of the framebuffer. If more than one logo is
+	displayed due to multiple CPUs, the collected line of logos is moved
+	as a whole.
+
+C. Attaching, Detaching and Unloading
+
+Before going on to how to attach, detach and unload the framebuffer console, an
+illustration of the dependencies may help.
+
+The console layer, as with most subsystems, needs a driver that interfaces with
+the hardware. Thus, in a VGA console::
+
+	console ---> VGA driver ---> hardware.
+
+Assuming the VGA driver can be unloaded, one must first unbind the VGA driver
+from the console layer before unloading the driver.  The VGA driver cannot be
+unloaded if it is still bound to the console layer. (See
+Documentation/driver-api/console.rst for more information).
+
+This is more complicated in the case of the framebuffer console (fbcon),
+because fbcon is an intermediate layer between the console and the drivers::
+
+	console ---> fbcon ---> fbdev drivers ---> hardware
+
+The fbdev drivers cannot be unloaded if bound to fbcon, and fbcon cannot
+be unloaded if it's bound to the console layer.
+
+So to unload the fbdev drivers, one must first unbind fbcon from the console,
+then unbind the fbdev drivers from fbcon.  Fortunately, unbinding fbcon from
+the console layer will automatically unbind framebuffer drivers from
+fbcon. Thus, there is no need to explicitly unbind the fbdev drivers from
+fbcon.
+
+So, how do we unbind fbcon from the console? Part of the answer is in
+Documentation/driver-api/console.rst. To summarize:
+
+Echo a value to the bind file that represents the framebuffer console
+driver. So assuming vtcon1 represents fbcon, then::
+
+  echo 1 > sys/class/vtconsole/vtcon1/bind - attach framebuffer console to
+					     console layer
+  echo 0 > sys/class/vtconsole/vtcon1/bind - detach framebuffer console from
+					     console layer
+
+If fbcon is detached from the console layer, your boot console driver (which is
+usually VGA text mode) will take over.  A few drivers (rivafb and i810fb) will
+restore VGA text mode for you.  With the rest, before detaching fbcon, you
+must take a few additional steps to make sure that your VGA text mode is
+restored properly. The following is one of the several methods that you can do:
+
+1. Download or install vbetool.  This utility is included with most
+   distributions nowadays, and is usually part of the suspend/resume tool.
+
+2. In your kernel configuration, ensure that CONFIG_FRAMEBUFFER_CONSOLE is set
+   to 'y' or 'm'. Enable one or more of your favorite framebuffer drivers.
+
+3. Boot into text mode and as root run::
+
+	vbetool vbestate save > <vga state file>
+
+   The above command saves the register contents of your graphics
+   hardware to <vga state file>.  You need to do this step only once as
+   the state file can be reused.
+
+4. If fbcon is compiled as a module, load fbcon by doing::
+
+       modprobe fbcon
+
+5. Now to detach fbcon::
+
+       vbetool vbestate restore < <vga state file> && \
+       echo 0 > /sys/class/vtconsole/vtcon1/bind
+
+6. That's it, you're back to VGA mode. And if you compiled fbcon as a module,
+   you can unload it by 'rmmod fbcon'.
+
+7. To reattach fbcon::
+
+       echo 1 > /sys/class/vtconsole/vtcon1/bind
+
+8. Once fbcon is unbound, all drivers registered to the system will also
+become unbound.  This means that fbcon and individual framebuffer drivers
+can be unloaded or reloaded at will. Reloading the drivers or fbcon will
+automatically bind the console, fbcon and the drivers together. Unloading
+all the drivers without unloading fbcon will make it impossible for the
+console to bind fbcon.
+
+Notes for vesafb users:
+=======================
+
+Unfortunately, if your bootline includes a vga=xxx parameter that sets the
+hardware in graphics mode, such as when loading vesafb, vgacon will not load.
+Instead, vgacon will replace the default boot console with dummycon, and you
+won't get any display after detaching fbcon. Your machine is still alive, so
+you can reattach vesafb. However, to reattach vesafb, you need to do one of
+the following:
+
+Variation 1:
+
+    a. Before detaching fbcon, do::
+
+	vbetool vbemode save > <vesa state file> # do once for each vesafb mode,
+						 # the file can be reused
+
+    b. Detach fbcon as in step 5.
+
+    c. Attach fbcon::
+
+	vbetool vbestate restore < <vesa state file> && \
+	echo 1 > /sys/class/vtconsole/vtcon1/bind
+
+Variation 2:
+
+    a. Before detaching fbcon, do::
+
+	echo <ID> > /sys/class/tty/console/bind
+
+	vbetool vbemode get
+
+    b. Take note of the mode number
+
+    b. Detach fbcon as in step 5.
+
+    c. Attach fbcon::
+
+	vbetool vbemode set <mode number> && \
+	echo 1 > /sys/class/vtconsole/vtcon1/bind
+
+Samples:
+========
+
+Here are 2 sample bash scripts that you can use to bind or unbind the
+framebuffer console driver if you are on an X86 box::
+
+  #!/bin/bash
+  # Unbind fbcon
+
+  # Change this to where your actual vgastate file is located
+  # Or Use VGASTATE=$1 to indicate the state file at runtime
+  VGASTATE=/tmp/vgastate
+
+  # path to vbetool
+  VBETOOL=/usr/local/bin
+
+
+  for (( i = 0; i < 16; i++))
+  do
+    if test -x /sys/class/vtconsole/vtcon$i; then
+	if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \
+	     = 1 ]; then
+	    if test -x $VBETOOL/vbetool; then
+	       echo Unbinding vtcon$i
+	       $VBETOOL/vbetool vbestate restore < $VGASTATE
+	       echo 0 > /sys/class/vtconsole/vtcon$i/bind
+	    fi
+	fi
+    fi
+  done
+
+---------------------------------------------------------------------------
+
+::
+
+  #!/bin/bash
+  # Bind fbcon
+
+  for (( i = 0; i < 16; i++))
+  do
+    if test -x /sys/class/vtconsole/vtcon$i; then
+	if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \
+	     = 1 ]; then
+	  echo Unbinding vtcon$i
+	  echo 1 > /sys/class/vtconsole/vtcon$i/bind
+	fi
+    fi
+  done
+
+Antonino Daplas <adaplas@pol.net>
diff --git a/Documentation/fb/fbcon.txt b/Documentation/fb/fbcon.txt
deleted file mode 100644
index 60a5ec04e8f0..000000000000
--- a/Documentation/fb/fbcon.txt
+++ /dev/null
@@ -1,347 +0,0 @@
-The Framebuffer Console
-=======================
-
-	The framebuffer console (fbcon), as its name implies, is a text
-console running on top of the framebuffer device. It has the functionality of
-any standard text console driver, such as the VGA console, with the added
-features that can be attributed to the graphical nature of the framebuffer.
-
-	 In the x86 architecture, the framebuffer console is optional, and
-some even treat it as a toy. For other architectures, it is the only available
-display device, text or graphical.
-
-	 What are the features of fbcon?  The framebuffer console supports
-high resolutions, varying font types, display rotation, primitive multihead,
-etc. Theoretically, multi-colored fonts, blending, aliasing, and any feature
-made available by the underlying graphics card are also possible.
-
-A. Configuration
-
-	The framebuffer console can be enabled by using your favorite kernel
-configuration tool.  It is under Device Drivers->Graphics Support->Frame
-buffer Devices->Console display driver support->Framebuffer Console Support.
-Select 'y' to compile support statically or 'm' for module support.  The
-module will be fbcon.
-
-	In order for fbcon to activate, at least one framebuffer driver is
-required, so choose from any of the numerous drivers available. For x86
-systems, they almost universally have VGA cards, so vga16fb and vesafb will
-always be available. However, using a chipset-specific driver will give you
-more speed and features, such as the ability to change the video mode
-dynamically.
-
-	To display the penguin logo, choose any logo available in Graphics
-support->Bootup logo.
-
-	Also, you will need to select at least one compiled-in font, but if
-you don't do anything, the kernel configuration tool will select one for you,
-usually an 8x16 font.
-
-GOTCHA: A common bug report is enabling the framebuffer without enabling the
-framebuffer console.  Depending on the driver, you may get a blanked or
-garbled display, but the system still boots to completion.  If you are
-fortunate to have a driver that does not alter the graphics chip, then you
-will still get a VGA console.
-
-B. Loading
-
-Possible scenarios:
-
-1. Driver and fbcon are compiled statically
-
-	 Usually, fbcon will automatically take over your console. The notable
-	 exception is vesafb.  It needs to be explicitly activated with the
-	 vga= boot option parameter.
-
-2. Driver is compiled statically, fbcon is compiled as a module
-
-	 Depending on the driver, you either get a standard console, or a
-	 garbled display, as mentioned above.  To get a framebuffer console,
-	 do a 'modprobe fbcon'.
-
-3. Driver is compiled as a module, fbcon is compiled statically
-
-	 You get your standard console.  Once the driver is loaded with
-	 'modprobe xxxfb', fbcon automatically takes over the console with
-	 the possible exception of using the fbcon=map:n option. See below.
-
-4. Driver and fbcon are compiled as a module.
-
-	 You can load them in any order. Once both are loaded, fbcon will take
-	 over the console.
-
-C. Boot options
-
-         The framebuffer console has several, largely unknown, boot options
-         that can change its behavior.
-
-1. fbcon=font:<name>
-
-        Select the initial font to use. The value 'name' can be any of the
-        compiled-in fonts: 10x18, 6x10, 7x14, Acorn8x8, MINI4x6,
-        PEARL8x8, ProFont6x11, SUN12x22, SUN8x16, VGA8x16, VGA8x8.
-
-	Note, not all drivers can handle font with widths not divisible by 8,
-        such as vga16fb.
-
-2. fbcon=scrollback:<value>[k]
-
-        The scrollback buffer is memory that is used to preserve display
-        contents that has already scrolled past your view.  This is accessed
-        by using the Shift-PageUp key combination.  The value 'value' is any
-        integer. It defaults to 32KB.  The 'k' suffix is optional, and will
-        multiply the 'value' by 1024.
-
-3. fbcon=map:<0123>
-
-        This is an interesting option. It tells which driver gets mapped to
-        which console. The value '0123' is a sequence that gets repeated until
-        the total length is 64 which is the number of consoles available. In
-        the above example, it is expanded to 012301230123... and the mapping
-        will be:
-
-		tty | 1 2 3 4 5 6 7 8 9 ...
-		fb  | 0 1 2 3 0 1 2 3 0 ...
-
-		('cat /proc/fb' should tell you what the fb numbers are)
-
-	One side effect that may be useful is using a map value that exceeds
-	the number of loaded fb drivers. For example, if only one driver is
-	available, fb0, adding fbcon=map:1 tells fbcon not to take over the
-	console.
-
-	Later on, when you want to map the console the to the framebuffer
-	device, you can use the con2fbmap utility.
-
-4. fbcon=vc:<n1>-<n2>
-
-	This option tells fbcon to take over only a range of consoles as
-	specified by the values 'n1' and 'n2'. The rest of the consoles
-	outside the given range will still be controlled by the standard
-	console driver.
-
-	NOTE: For x86 machines, the standard console is the VGA console which
-	is typically located on the same video card.  Thus, the consoles that
-	are controlled by the VGA console will be garbled.
-
-4. fbcon=rotate:<n>
-
-        This option changes the orientation angle of the console display. The
-        value 'n' accepts the following:
-
-	      0 - normal orientation (0 degree)
-	      1 - clockwise orientation (90 degrees)
-	      2 - upside down orientation (180 degrees)
-	      3 - counterclockwise orientation (270 degrees)
-
-	The angle can be changed anytime afterwards by 'echoing' the same
-	numbers to any one of the 2 attributes found in
-	/sys/class/graphics/fbcon:
-
-		rotate     - rotate the display of the active console
-		rotate_all - rotate the display of all consoles
-
-	Console rotation will only become available if Framebuffer Console
-	Rotation support is compiled in your kernel.
-
-	NOTE: This is purely console rotation.  Any other applications that
-	use the framebuffer will remain at their 'normal' orientation.
-	Actually, the underlying fb driver is totally ignorant of console
-	rotation.
-
-5. fbcon=margin:<color>
-
-	This option specifies the color of the margins. The margins are the
-	leftover area at the right and the bottom of the screen that are not
-	used by text. By default, this area will be black. The 'color' value
-	is an integer number that depends on the framebuffer driver being used.
-
-6. fbcon=nodefer
-
-	If the kernel is compiled with deferred fbcon takeover support, normally
-	the framebuffer contents, left in place by the firmware/bootloader, will
-	be preserved until there actually is some text is output to the console.
-	This option causes fbcon to bind immediately to the fbdev device.
-
-7. fbcon=logo-pos:<location>
-
-	The only possible 'location' is 'center' (without quotes), and when
-	given, the bootup logo is moved from the default top-left corner
-	location to the center of the framebuffer. If more than one logo is
-	displayed due to multiple CPUs, the collected line of logos is moved
-	as a whole.
-
-C. Attaching, Detaching and Unloading
-
-Before going on to how to attach, detach and unload the framebuffer console, an
-illustration of the dependencies may help.
-
-The console layer, as with most subsystems, needs a driver that interfaces with
-the hardware. Thus, in a VGA console:
-
-console ---> VGA driver ---> hardware.
-
-Assuming the VGA driver can be unloaded, one must first unbind the VGA driver
-from the console layer before unloading the driver.  The VGA driver cannot be
-unloaded if it is still bound to the console layer. (See
-Documentation/console/console.txt for more information).
-
-This is more complicated in the case of the framebuffer console (fbcon),
-because fbcon is an intermediate layer between the console and the drivers:
-
-console ---> fbcon ---> fbdev drivers ---> hardware
-
-The fbdev drivers cannot be unloaded if bound to fbcon, and fbcon cannot
-be unloaded if it's bound to the console layer.
-
-So to unload the fbdev drivers, one must first unbind fbcon from the console,
-then unbind the fbdev drivers from fbcon.  Fortunately, unbinding fbcon from
-the console layer will automatically unbind framebuffer drivers from
-fbcon. Thus, there is no need to explicitly unbind the fbdev drivers from
-fbcon.
-
-So, how do we unbind fbcon from the console? Part of the answer is in
-Documentation/console/console.txt. To summarize:
-
-Echo a value to the bind file that represents the framebuffer console
-driver. So assuming vtcon1 represents fbcon, then:
-
-echo 1 > sys/class/vtconsole/vtcon1/bind - attach framebuffer console to
-                                           console layer
-echo 0 > sys/class/vtconsole/vtcon1/bind - detach framebuffer console from
-                                           console layer
-
-If fbcon is detached from the console layer, your boot console driver (which is
-usually VGA text mode) will take over.  A few drivers (rivafb and i810fb) will
-restore VGA text mode for you.  With the rest, before detaching fbcon, you
-must take a few additional steps to make sure that your VGA text mode is
-restored properly. The following is one of the several methods that you can do:
-
-1. Download or install vbetool.  This utility is included with most
-   distributions nowadays, and is usually part of the suspend/resume tool.
-
-2. In your kernel configuration, ensure that CONFIG_FRAMEBUFFER_CONSOLE is set
-   to 'y' or 'm'. Enable one or more of your favorite framebuffer drivers.
-
-3. Boot into text mode and as root run:
-
-	vbetool vbestate save > <vga state file>
-
-	The above command saves the register contents of your graphics
-	hardware to <vga state file>.  You need to do this step only once as
-	the state file can be reused.
-
-4. If fbcon is compiled as a module, load fbcon by doing:
-
-       modprobe fbcon
-
-5. Now to detach fbcon:
-
-       vbetool vbestate restore < <vga state file> && \
-       echo 0 > /sys/class/vtconsole/vtcon1/bind
-
-6. That's it, you're back to VGA mode. And if you compiled fbcon as a module,
-   you can unload it by 'rmmod fbcon'.
-
-7. To reattach fbcon:
-
-       echo 1 > /sys/class/vtconsole/vtcon1/bind
-
-8. Once fbcon is unbound, all drivers registered to the system will also
-become unbound.  This means that fbcon and individual framebuffer drivers
-can be unloaded or reloaded at will. Reloading the drivers or fbcon will
-automatically bind the console, fbcon and the drivers together. Unloading
-all the drivers without unloading fbcon will make it impossible for the
-console to bind fbcon.
-
-Notes for vesafb users:
-=======================
-
-Unfortunately, if your bootline includes a vga=xxx parameter that sets the
-hardware in graphics mode, such as when loading vesafb, vgacon will not load.
-Instead, vgacon will replace the default boot console with dummycon, and you
-won't get any display after detaching fbcon. Your machine is still alive, so
-you can reattach vesafb. However, to reattach vesafb, you need to do one of
-the following:
-
-Variation 1:
-
-    a. Before detaching fbcon, do
-
-       vbetool vbemode save > <vesa state file> # do once for each vesafb mode,
-						# the file can be reused
-
-    b. Detach fbcon as in step 5.
-
-    c. Attach fbcon
-
-        vbetool vbestate restore < <vesa state file> && \
-	echo 1 > /sys/class/vtconsole/vtcon1/bind
-
-Variation 2:
-
-    a. Before detaching fbcon, do:
-	echo <ID> > /sys/class/tty/console/bind
-
-
-       vbetool vbemode get
-
-    b. Take note of the mode number
-
-    b. Detach fbcon as in step 5.
-
-    c. Attach fbcon:
-
-       vbetool vbemode set <mode number> && \
-       echo 1 > /sys/class/vtconsole/vtcon1/bind
-
-Samples:
-========
-
-Here are 2 sample bash scripts that you can use to bind or unbind the
-framebuffer console driver if you are on an X86 box:
-
----------------------------------------------------------------------------
-#!/bin/bash
-# Unbind fbcon
-
-# Change this to where your actual vgastate file is located
-# Or Use VGASTATE=$1 to indicate the state file at runtime
-VGASTATE=/tmp/vgastate
-
-# path to vbetool
-VBETOOL=/usr/local/bin
-
-
-for (( i = 0; i < 16; i++))
-do
-  if test -x /sys/class/vtconsole/vtcon$i; then
-      if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \
-           = 1 ]; then
-	    if test -x $VBETOOL/vbetool; then
-	       echo Unbinding vtcon$i
-	       $VBETOOL/vbetool vbestate restore < $VGASTATE
-	       echo 0 > /sys/class/vtconsole/vtcon$i/bind
-	    fi
-      fi
-  fi
-done
-
----------------------------------------------------------------------------
-#!/bin/bash
-# Bind fbcon
-
-for (( i = 0; i < 16; i++))
-do
-  if test -x /sys/class/vtconsole/vtcon$i; then
-      if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \
-           = 1 ]; then
-	  echo Unbinding vtcon$i
-	  echo 1 > /sys/class/vtconsole/vtcon$i/bind
-      fi
-  fi
-done
----------------------------------------------------------------------------
-
---
-Antonino Daplas <adaplas@pol.net>
diff --git a/Documentation/fb/framebuffer.rst b/Documentation/fb/framebuffer.rst
new file mode 100644
index 000000000000..7fe087310c82
--- /dev/null
+++ b/Documentation/fb/framebuffer.rst
@@ -0,0 +1,353 @@
+=======================
+The Frame Buffer Device
+=======================
+
+Last revised: May 10, 2001
+
+
+0. Introduction
+---------------
+
+The frame buffer device provides an abstraction for the graphics hardware. It
+represents the frame buffer of some video hardware and allows application
+software to access the graphics hardware through a well-defined interface, so
+the software doesn't need to know anything about the low-level (hardware
+register) stuff.
+
+The device is accessed through special device nodes, usually located in the
+/dev directory, i.e. /dev/fb*.
+
+
+1. User's View of /dev/fb*
+--------------------------
+
+From the user's point of view, the frame buffer device looks just like any
+other device in /dev. It's a character device using major 29; the minor
+specifies the frame buffer number.
+
+By convention, the following device nodes are used (numbers indicate the device
+minor numbers)::
+
+      0 = /dev/fb0	First frame buffer
+      1 = /dev/fb1	Second frame buffer
+	  ...
+     31 = /dev/fb31	32nd frame buffer
+
+For backwards compatibility, you may want to create the following symbolic
+links::
+
+    /dev/fb0current -> fb0
+    /dev/fb1current -> fb1
+
+and so on...
+
+The frame buffer devices are also `normal` memory devices, this means, you can
+read and write their contents. You can, for example, make a screen snapshot by::
+
+  cp /dev/fb0 myfile
+
+There also can be more than one frame buffer at a time, e.g. if you have a
+graphics card in addition to the built-in hardware. The corresponding frame
+buffer devices (/dev/fb0 and /dev/fb1 etc.) work independently.
+
+Application software that uses the frame buffer device (e.g. the X server) will
+use /dev/fb0 by default (older software uses /dev/fb0current). You can specify
+an alternative frame buffer device by setting the environment variable
+$FRAMEBUFFER to the path name of a frame buffer device, e.g. (for sh/bash
+users)::
+
+    export FRAMEBUFFER=/dev/fb1
+
+or (for csh users)::
+
+    setenv FRAMEBUFFER /dev/fb1
+
+After this the X server will use the second frame buffer.
+
+
+2. Programmer's View of /dev/fb*
+--------------------------------
+
+As you already know, a frame buffer device is a memory device like /dev/mem and
+it has the same features. You can read it, write it, seek to some location in
+it and mmap() it (the main usage). The difference is just that the memory that
+appears in the special file is not the whole memory, but the frame buffer of
+some video hardware.
+
+/dev/fb* also allows several ioctls on it, by which lots of information about
+the hardware can be queried and set. The color map handling works via ioctls,
+too. Look into <linux/fb.h> for more information on what ioctls exist and on
+which data structures they work. Here's just a brief overview:
+
+  - You can request unchangeable information about the hardware, like name,
+    organization of the screen memory (planes, packed pixels, ...) and address
+    and length of the screen memory.
+
+  - You can request and change variable information about the hardware, like
+    visible and virtual geometry, depth, color map format, timing, and so on.
+    If you try to change that information, the driver maybe will round up some
+    values to meet the hardware's capabilities (or return EINVAL if that isn't
+    possible).
+
+  - You can get and set parts of the color map. Communication is done with 16
+    bits per color part (red, green, blue, transparency) to support all
+    existing hardware. The driver does all the computations needed to apply
+    it to the hardware (round it down to less bits, maybe throw away
+    transparency).
+
+All this hardware abstraction makes the implementation of application programs
+easier and more portable. E.g. the X server works completely on /dev/fb* and
+thus doesn't need to know, for example, how the color registers of the concrete
+hardware are organized. XF68_FBDev is a general X server for bitmapped,
+unaccelerated video hardware. The only thing that has to be built into
+application programs is the screen organization (bitplanes or chunky pixels
+etc.), because it works on the frame buffer image data directly.
+
+For the future it is planned that frame buffer drivers for graphics cards and
+the like can be implemented as kernel modules that are loaded at runtime. Such
+a driver just has to call register_framebuffer() and supply some functions.
+Writing and distributing such drivers independently from the kernel will save
+much trouble...
+
+
+3. Frame Buffer Resolution Maintenance
+--------------------------------------
+
+Frame buffer resolutions are maintained using the utility `fbset`. It can
+change the video mode properties of a frame buffer device. Its main usage is
+to change the current video mode, e.g. during boot up in one of your `/etc/rc.*`
+or `/etc/init.d/*` files.
+
+Fbset uses a video mode database stored in a configuration file, so you can
+easily add your own modes and refer to them with a simple identifier.
+
+
+4. The X Server
+---------------
+
+The X server (XF68_FBDev) is the most notable application program for the frame
+buffer device. Starting with XFree86 release 3.2, the X server is part of
+XFree86 and has 2 modes:
+
+  - If the `Display` subsection for the `fbdev` driver in the /etc/XF86Config
+    file contains a::
+
+	Modes "default"
+
+    line, the X server will use the scheme discussed above, i.e. it will start
+    up in the resolution determined by /dev/fb0 (or $FRAMEBUFFER, if set). You
+    still have to specify the color depth (using the Depth keyword) and virtual
+    resolution (using the Virtual keyword) though. This is the default for the
+    configuration file supplied with XFree86. It's the most simple
+    configuration, but it has some limitations.
+
+  - Therefore it's also possible to specify resolutions in the /etc/XF86Config
+    file. This allows for on-the-fly resolution switching while retaining the
+    same virtual desktop size. The frame buffer device that's used is still
+    /dev/fb0current (or $FRAMEBUFFER), but the available resolutions are
+    defined by /etc/XF86Config now. The disadvantage is that you have to
+    specify the timings in a different format (but `fbset -x` may help).
+
+To tune a video mode, you can use fbset or xvidtune. Note that xvidtune doesn't
+work 100% with XF68_FBDev: the reported clock values are always incorrect.
+
+
+5. Video Mode Timings
+---------------------
+
+A monitor draws an image on the screen by using an electron beam (3 electron
+beams for color models, 1 electron beam for monochrome monitors). The front of
+the screen is covered by a pattern of colored phosphors (pixels). If a phosphor
+is hit by an electron, it emits a photon and thus becomes visible.
+
+The electron beam draws horizontal lines (scanlines) from left to right, and
+from the top to the bottom of the screen. By modifying the intensity of the
+electron beam, pixels with various colors and intensities can be shown.
+
+After each scanline the electron beam has to move back to the left side of the
+screen and to the next line: this is called the horizontal retrace. After the
+whole screen (frame) was painted, the beam moves back to the upper left corner:
+this is called the vertical retrace. During both the horizontal and vertical
+retrace, the electron beam is turned off (blanked).
+
+The speed at which the electron beam paints the pixels is determined by the
+dotclock in the graphics board. For a dotclock of e.g. 28.37516 MHz (millions
+of cycles per second), each pixel is 35242 ps (picoseconds) long::
+
+    1/(28.37516E6 Hz) = 35.242E-9 s
+
+If the screen resolution is 640x480, it will take::
+
+    640*35.242E-9 s = 22.555E-6 s
+
+to paint the 640 (xres) pixels on one scanline. But the horizontal retrace
+also takes time (e.g. 272 `pixels`), so a full scanline takes::
+
+    (640+272)*35.242E-9 s = 32.141E-6 s
+
+We'll say that the horizontal scanrate is about 31 kHz::
+
+    1/(32.141E-6 s) = 31.113E3 Hz
+
+A full screen counts 480 (yres) lines, but we have to consider the vertical
+retrace too (e.g. 49 `lines`). So a full screen will take::
+
+    (480+49)*32.141E-6 s = 17.002E-3 s
+
+The vertical scanrate is about 59 Hz::
+
+    1/(17.002E-3 s) = 58.815 Hz
+
+This means the screen data is refreshed about 59 times per second. To have a
+stable picture without visible flicker, VESA recommends a vertical scanrate of
+at least 72 Hz. But the perceived flicker is very human dependent: some people
+can use 50 Hz without any trouble, while I'll notice if it's less than 80 Hz.
+
+Since the monitor doesn't know when a new scanline starts, the graphics board
+will supply a synchronization pulse (horizontal sync or hsync) for each
+scanline.  Similarly it supplies a synchronization pulse (vertical sync or
+vsync) for each new frame. The position of the image on the screen is
+influenced by the moments at which the synchronization pulses occur.
+
+The following picture summarizes all timings. The horizontal retrace time is
+the sum of the left margin, the right margin and the hsync length, while the
+vertical retrace time is the sum of the upper margin, the lower margin and the
+vsync length::
+
+  +----------+---------------------------------------------+----------+-------+
+  |          |                ↑                            |          |       |
+  |          |                |upper_margin                |          |       |
+  |          |                ↓                            |          |       |
+  +----------###############################################----------+-------+
+  |          #                ↑                            #          |       |
+  |          #                |                            #          |       |
+  |          #                |                            #          |       |
+  |          #                |                            #          |       |
+  |   left   #                |                            #  right   | hsync |
+  |  margin  #                |       xres                 #  margin  |  len  |
+  |<-------->#<---------------+--------------------------->#<-------->|<----->|
+  |          #                |                            #          |       |
+  |          #                |                            #          |       |
+  |          #                |                            #          |       |
+  |          #                |yres                        #          |       |
+  |          #                |                            #          |       |
+  |          #                |                            #          |       |
+  |          #                |                            #          |       |
+  |          #                |                            #          |       |
+  |          #                |                            #          |       |
+  |          #                |                            #          |       |
+  |          #                |                            #          |       |
+  |          #                |                            #          |       |
+  |          #                ↓                            #          |       |
+  +----------###############################################----------+-------+
+  |          |                ↑                            |          |       |
+  |          |                |lower_margin                |          |       |
+  |          |                ↓                            |          |       |
+  +----------+---------------------------------------------+----------+-------+
+  |          |                ↑                            |          |       |
+  |          |                |vsync_len                   |          |       |
+  |          |                ↓                            |          |       |
+  +----------+---------------------------------------------+----------+-------+
+
+The frame buffer device expects all horizontal timings in number of dotclocks
+(in picoseconds, 1E-12 s), and vertical timings in number of scanlines.
+
+
+6. Converting XFree86 timing values info frame buffer device timings
+--------------------------------------------------------------------
+
+An XFree86 mode line consists of the following fields::
+
+ "800x600"     50      800  856  976 1040    600  637  643  666
+ < name >     DCF       HR  SH1  SH2  HFL     VR  SV1  SV2  VFL
+
+The frame buffer device uses the following fields:
+
+  - pixclock: pixel clock in ps (pico seconds)
+  - left_margin: time from sync to picture
+  - right_margin: time from picture to sync
+  - upper_margin: time from sync to picture
+  - lower_margin: time from picture to sync
+  - hsync_len: length of horizontal sync
+  - vsync_len: length of vertical sync
+
+1) Pixelclock:
+
+   xfree: in MHz
+
+   fb: in picoseconds (ps)
+
+   pixclock = 1000000 / DCF
+
+2) horizontal timings:
+
+   left_margin = HFL - SH2
+
+   right_margin = SH1 - HR
+
+   hsync_len = SH2 - SH1
+
+3) vertical timings:
+
+   upper_margin = VFL - SV2
+
+   lower_margin = SV1 - VR
+
+   vsync_len = SV2 - SV1
+
+Good examples for VESA timings can be found in the XFree86 source tree,
+under "xc/programs/Xserver/hw/xfree86/doc/modeDB.txt".
+
+
+7. References
+-------------
+
+For more specific information about the frame buffer device and its
+applications, please refer to the Linux-fbdev website:
+
+    http://linux-fbdev.sourceforge.net/
+
+and to the following documentation:
+
+  - The manual pages for fbset: fbset(8), fb.modes(5)
+  - The manual pages for XFree86: XF68_FBDev(1), XF86Config(4/5)
+  - The mighty kernel sources:
+
+      - linux/drivers/video/
+      - linux/include/linux/fb.h
+      - linux/include/video/
+
+
+
+8. Mailing list
+---------------
+
+There is a frame buffer device related mailing list at kernel.org:
+linux-fbdev@vger.kernel.org.
+
+Point your web browser to http://sourceforge.net/projects/linux-fbdev/ for
+subscription information and archive browsing.
+
+
+9. Downloading
+--------------
+
+All necessary files can be found at
+
+    ftp://ftp.uni-erlangen.de/pub/Linux/LOCAL/680x0/
+
+and on its mirrors.
+
+The latest version of fbset can be found at
+
+    http://www.linux-fbdev.org/
+
+
+10. Credits
+-----------
+
+This readme was written by Geert Uytterhoeven, partly based on the original
+`X-framebuffer.README` by Roman Hodek and Martin Schaller. Section 6 was
+provided by Frank Neumann.
+
+The frame buffer device abstraction was designed by Martin Schaller.
diff --git a/Documentation/fb/framebuffer.txt b/Documentation/fb/framebuffer.txt
deleted file mode 100644
index 58c5ae2e9f59..000000000000
--- a/Documentation/fb/framebuffer.txt
+++ /dev/null
@@ -1,343 +0,0 @@
-			The Frame Buffer Device
-			-----------------------
-
-Maintained by Geert Uytterhoeven <geert@linux-m68k.org>
-Last revised: May 10, 2001
-
-
-0. Introduction
----------------
-
-The frame buffer device provides an abstraction for the graphics hardware. It
-represents the frame buffer of some video hardware and allows application
-software to access the graphics hardware through a well-defined interface, so
-the software doesn't need to know anything about the low-level (hardware
-register) stuff.
-
-The device is accessed through special device nodes, usually located in the
-/dev directory, i.e. /dev/fb*.
-
-
-1. User's View of /dev/fb*
---------------------------
-
-From the user's point of view, the frame buffer device looks just like any
-other device in /dev. It's a character device using major 29; the minor
-specifies the frame buffer number.
-
-By convention, the following device nodes are used (numbers indicate the device
-minor numbers):
-
-      0 = /dev/fb0	First frame buffer
-      1 = /dev/fb1	Second frame buffer
-	  ...
-     31 = /dev/fb31	32nd frame buffer
-
-For backwards compatibility, you may want to create the following symbolic
-links:
-
-    /dev/fb0current -> fb0
-    /dev/fb1current -> fb1
-
-and so on...
-
-The frame buffer devices are also `normal' memory devices, this means, you can
-read and write their contents. You can, for example, make a screen snapshot by
-
-  cp /dev/fb0 myfile
-
-There also can be more than one frame buffer at a time, e.g. if you have a
-graphics card in addition to the built-in hardware. The corresponding frame
-buffer devices (/dev/fb0 and /dev/fb1 etc.) work independently.
-
-Application software that uses the frame buffer device (e.g. the X server) will
-use /dev/fb0 by default (older software uses /dev/fb0current). You can specify
-an alternative frame buffer device by setting the environment variable
-$FRAMEBUFFER to the path name of a frame buffer device, e.g. (for sh/bash
-users):
-
-    export FRAMEBUFFER=/dev/fb1
-
-or (for csh users):
-
-    setenv FRAMEBUFFER /dev/fb1
-
-After this the X server will use the second frame buffer.
-
-
-2. Programmer's View of /dev/fb*
---------------------------------
-
-As you already know, a frame buffer device is a memory device like /dev/mem and
-it has the same features. You can read it, write it, seek to some location in
-it and mmap() it (the main usage). The difference is just that the memory that
-appears in the special file is not the whole memory, but the frame buffer of
-some video hardware.
-
-/dev/fb* also allows several ioctls on it, by which lots of information about
-the hardware can be queried and set. The color map handling works via ioctls,
-too. Look into <linux/fb.h> for more information on what ioctls exist and on
-which data structures they work. Here's just a brief overview:
-
-  - You can request unchangeable information about the hardware, like name,
-    organization of the screen memory (planes, packed pixels, ...) and address
-    and length of the screen memory.
-
-  - You can request and change variable information about the hardware, like
-    visible and virtual geometry, depth, color map format, timing, and so on.
-    If you try to change that information, the driver maybe will round up some
-    values to meet the hardware's capabilities (or return EINVAL if that isn't
-    possible).
-
-  - You can get and set parts of the color map. Communication is done with 16
-    bits per color part (red, green, blue, transparency) to support all 
-    existing hardware. The driver does all the computations needed to apply 
-    it to the hardware (round it down to less bits, maybe throw away 
-    transparency).
-
-All this hardware abstraction makes the implementation of application programs
-easier and more portable. E.g. the X server works completely on /dev/fb* and
-thus doesn't need to know, for example, how the color registers of the concrete
-hardware are organized. XF68_FBDev is a general X server for bitmapped,
-unaccelerated video hardware. The only thing that has to be built into
-application programs is the screen organization (bitplanes or chunky pixels
-etc.), because it works on the frame buffer image data directly.
-
-For the future it is planned that frame buffer drivers for graphics cards and
-the like can be implemented as kernel modules that are loaded at runtime. Such
-a driver just has to call register_framebuffer() and supply some functions.
-Writing and distributing such drivers independently from the kernel will save
-much trouble...
-
-
-3. Frame Buffer Resolution Maintenance
---------------------------------------
-
-Frame buffer resolutions are maintained using the utility `fbset'. It can
-change the video mode properties of a frame buffer device. Its main usage is
-to change the current video mode, e.g. during boot up in one of your /etc/rc.*
-or /etc/init.d/* files.
-
-Fbset uses a video mode database stored in a configuration file, so you can
-easily add your own modes and refer to them with a simple identifier.
-
-
-4. The X Server
----------------
-
-The X server (XF68_FBDev) is the most notable application program for the frame
-buffer device. Starting with XFree86 release 3.2, the X server is part of
-XFree86 and has 2 modes:
-
-  - If the `Display' subsection for the `fbdev' driver in the /etc/XF86Config
-    file contains a
-
-	Modes "default"
-
-    line, the X server will use the scheme discussed above, i.e. it will start
-    up in the resolution determined by /dev/fb0 (or $FRAMEBUFFER, if set). You
-    still have to specify the color depth (using the Depth keyword) and virtual
-    resolution (using the Virtual keyword) though. This is the default for the
-    configuration file supplied with XFree86. It's the most simple
-    configuration, but it has some limitations.
-
-  - Therefore it's also possible to specify resolutions in the /etc/XF86Config
-    file. This allows for on-the-fly resolution switching while retaining the
-    same virtual desktop size. The frame buffer device that's used is still
-    /dev/fb0current (or $FRAMEBUFFER), but the available resolutions are
-    defined by /etc/XF86Config now. The disadvantage is that you have to
-    specify the timings in a different format (but `fbset -x' may help).
-
-To tune a video mode, you can use fbset or xvidtune. Note that xvidtune doesn't
-work 100% with XF68_FBDev: the reported clock values are always incorrect.
-
-
-5. Video Mode Timings
----------------------
-
-A monitor draws an image on the screen by using an electron beam (3 electron
-beams for color models, 1 electron beam for monochrome monitors). The front of
-the screen is covered by a pattern of colored phosphors (pixels). If a phosphor
-is hit by an electron, it emits a photon and thus becomes visible.
-
-The electron beam draws horizontal lines (scanlines) from left to right, and
-from the top to the bottom of the screen. By modifying the intensity of the
-electron beam, pixels with various colors and intensities can be shown.
-
-After each scanline the electron beam has to move back to the left side of the
-screen and to the next line: this is called the horizontal retrace. After the
-whole screen (frame) was painted, the beam moves back to the upper left corner:
-this is called the vertical retrace. During both the horizontal and vertical
-retrace, the electron beam is turned off (blanked).
-
-The speed at which the electron beam paints the pixels is determined by the
-dotclock in the graphics board. For a dotclock of e.g. 28.37516 MHz (millions
-of cycles per second), each pixel is 35242 ps (picoseconds) long:
-
-    1/(28.37516E6 Hz) = 35.242E-9 s
-
-If the screen resolution is 640x480, it will take
-
-    640*35.242E-9 s = 22.555E-6 s
-
-to paint the 640 (xres) pixels on one scanline. But the horizontal retrace
-also takes time (e.g. 272 `pixels'), so a full scanline takes
-
-    (640+272)*35.242E-9 s = 32.141E-6 s
-
-We'll say that the horizontal scanrate is about 31 kHz:
-
-    1/(32.141E-6 s) = 31.113E3 Hz
-
-A full screen counts 480 (yres) lines, but we have to consider the vertical
-retrace too (e.g. 49 `lines'). So a full screen will take
-
-    (480+49)*32.141E-6 s = 17.002E-3 s
-
-The vertical scanrate is about 59 Hz:
-
-    1/(17.002E-3 s) = 58.815 Hz
-
-This means the screen data is refreshed about 59 times per second. To have a
-stable picture without visible flicker, VESA recommends a vertical scanrate of
-at least 72 Hz. But the perceived flicker is very human dependent: some people
-can use 50 Hz without any trouble, while I'll notice if it's less than 80 Hz.
-
-Since the monitor doesn't know when a new scanline starts, the graphics board
-will supply a synchronization pulse (horizontal sync or hsync) for each
-scanline.  Similarly it supplies a synchronization pulse (vertical sync or
-vsync) for each new frame. The position of the image on the screen is
-influenced by the moments at which the synchronization pulses occur.
-
-The following picture summarizes all timings. The horizontal retrace time is
-the sum of the left margin, the right margin and the hsync length, while the
-vertical retrace time is the sum of the upper margin, the lower margin and the
-vsync length.
-
-  +----------+---------------------------------------------+----------+-------+
-  |          |                ↑                            |          |       |
-  |          |                |upper_margin                |          |       |
-  |          |                ↓                            |          |       |
-  +----------###############################################----------+-------+
-  |          #                ↑                            #          |       |
-  |          #                |                            #          |       |
-  |          #                |                            #          |       |
-  |          #                |                            #          |       |
-  |   left   #                |                            #  right   | hsync |
-  |  margin  #                |       xres                 #  margin  |  len  |
-  |<-------->#<---------------+--------------------------->#<-------->|<----->|
-  |          #                |                            #          |       |
-  |          #                |                            #          |       |
-  |          #                |                            #          |       |
-  |          #                |yres                        #          |       |
-  |          #                |                            #          |       |
-  |          #                |                            #          |       |
-  |          #                |                            #          |       |
-  |          #                |                            #          |       |
-  |          #                |                            #          |       |
-  |          #                |                            #          |       |
-  |          #                |                            #          |       |
-  |          #                |                            #          |       |
-  |          #                ↓                            #          |       |
-  +----------###############################################----------+-------+
-  |          |                ↑                            |          |       |
-  |          |                |lower_margin                |          |       |
-  |          |                ↓                            |          |       |
-  +----------+---------------------------------------------+----------+-------+
-  |          |                ↑                            |          |       |
-  |          |                |vsync_len                   |          |       |
-  |          |                ↓                            |          |       |
-  +----------+---------------------------------------------+----------+-------+
-
-The frame buffer device expects all horizontal timings in number of dotclocks
-(in picoseconds, 1E-12 s), and vertical timings in number of scanlines.
-
-
-6. Converting XFree86 timing values info frame buffer device timings
---------------------------------------------------------------------
-
-An XFree86 mode line consists of the following fields:
- "800x600"     50      800  856  976 1040    600  637  643  666
- < name >     DCF       HR  SH1  SH2  HFL     VR  SV1  SV2  VFL
-
-The frame buffer device uses the following fields:
-
-  - pixclock: pixel clock in ps (pico seconds)
-  - left_margin: time from sync to picture
-  - right_margin: time from picture to sync
-  - upper_margin: time from sync to picture
-  - lower_margin: time from picture to sync
-  - hsync_len: length of horizontal sync
-  - vsync_len: length of vertical sync
-
-1) Pixelclock:
-   xfree: in MHz
-   fb: in picoseconds (ps)
-
-   pixclock = 1000000 / DCF
-
-2) horizontal timings:
-   left_margin = HFL - SH2
-   right_margin = SH1 - HR
-   hsync_len = SH2 - SH1
-
-3) vertical timings:
-   upper_margin = VFL - SV2
-   lower_margin = SV1 - VR
-   vsync_len = SV2 - SV1
-
-Good examples for VESA timings can be found in the XFree86 source tree,
-under "xc/programs/Xserver/hw/xfree86/doc/modeDB.txt".
-
-
-7. References
--------------
-
-For more specific information about the frame buffer device and its
-applications, please refer to the Linux-fbdev website:
-
-    http://linux-fbdev.sourceforge.net/
-
-and to the following documentation:
-
-  - The manual pages for fbset: fbset(8), fb.modes(5)
-  - The manual pages for XFree86: XF68_FBDev(1), XF86Config(4/5)
-  - The mighty kernel sources:
-      o linux/drivers/video/
-      o linux/include/linux/fb.h
-      o linux/include/video/
-
-
-
-8. Mailing list
----------------
-
-There is a frame buffer device related mailing list at kernel.org:
-linux-fbdev@vger.kernel.org.
-
-Point your web browser to http://sourceforge.net/projects/linux-fbdev/ for
-subscription information and archive browsing.
-
-
-9. Downloading
---------------
-
-All necessary files can be found at
-
-    ftp://ftp.uni-erlangen.de/pub/Linux/LOCAL/680x0/
-
-and on its mirrors.
-
-The latest version of fbset can be found at
-
-    http://www.linux-fbdev.org/ 
-
-  
-10. Credits                                                       
-----------                                                       
-                
-This readme was written by Geert Uytterhoeven, partly based on the original
-`X-framebuffer.README' by Roman Hodek and Martin Schaller. Section 6 was
-provided by Frank Neumann.
-
-The frame buffer device abstraction was designed by Martin Schaller.
diff --git a/Documentation/fb/gxfb.rst b/Documentation/fb/gxfb.rst
new file mode 100644
index 000000000000..5738709bccbb
--- /dev/null
+++ b/Documentation/fb/gxfb.rst
@@ -0,0 +1,54 @@
+=============
+What is gxfb?
+=============
+
+.. [This file is cloned from VesaFB/aty128fb]
+
+This is a graphics framebuffer driver for AMD Geode GX2 based processors.
+
+Advantages:
+
+ * No need to use AMD's VSA code (or other VESA emulation layer) in the
+   BIOS.
+ * It provides a nice large console (128 cols + 48 lines with 1024x768)
+   without using tiny, unreadable fonts.
+ * You can run XF68_FBDev on top of /dev/fb0
+ * Most important: boot logo :-)
+
+Disadvantages:
+
+ * graphic mode is slower than text mode...
+
+
+How to use it?
+==============
+
+Switching modes is done using  gxfb.mode_option=<resolution>... boot
+parameter or using `fbset` program.
+
+See Documentation/fb/modedb.rst for more information on modedb
+resolutions.
+
+
+X11
+===
+
+XF68_FBDev should generally work fine, but it is non-accelerated.
+
+
+Configuration
+=============
+
+You can pass kernel command line options to gxfb with gxfb.<option>.
+For example, gxfb.mode_option=800x600@75.
+Accepted options:
+
+================ ==================================================
+mode_option	 specify the video mode.  Of the form
+		 <x>x<y>[-<bpp>][@<refresh>]
+vram		 size of video ram (normally auto-detected)
+vt_switch	 enable vt switching during suspend/resume.  The vt
+		 switch is slow, but harmless.
+================ ==================================================
+
+Andres Salomon <dilinger@debian.org>
diff --git a/Documentation/fb/gxfb.txt b/Documentation/fb/gxfb.txt
deleted file mode 100644
index 2f640903bbb2..000000000000
--- a/Documentation/fb/gxfb.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-[This file is cloned from VesaFB/aty128fb]
-
-What is gxfb?
-=================
-
-This is a graphics framebuffer driver for AMD Geode GX2 based processors.
-
-Advantages:
-
- * No need to use AMD's VSA code (or other VESA emulation layer) in the
-   BIOS.
- * It provides a nice large console (128 cols + 48 lines with 1024x768)
-   without using tiny, unreadable fonts.
- * You can run XF68_FBDev on top of /dev/fb0
- * Most important: boot logo :-)
-
-Disadvantages:
-
- * graphic mode is slower than text mode...
-
-
-How to use it?
-==============
-
-Switching modes is done using  gxfb.mode_option=<resolution>... boot
-parameter or using `fbset' program.
-
-See Documentation/fb/modedb.txt for more information on modedb
-resolutions.
-
-
-X11
-===
-
-XF68_FBDev should generally work fine, but it is non-accelerated.
-
-
-Configuration
-=============
-
-You can pass kernel command line options to gxfb with gxfb.<option>.
-For example, gxfb.mode_option=800x600@75.
-Accepted options:
-
-mode_option	- specify the video mode.  Of the form
-		  <x>x<y>[-<bpp>][@<refresh>]
-vram		- size of video ram (normally auto-detected)
-vt_switch	- enable vt switching during suspend/resume.  The vt
-		  switch is slow, but harmless.
-
---
-Andres Salomon <dilinger@debian.org>
diff --git a/Documentation/fb/index.rst b/Documentation/fb/index.rst
new file mode 100644
index 000000000000..baf02393d8ee
--- /dev/null
+++ b/Documentation/fb/index.rst
@@ -0,0 +1,50 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+Frame Buffer
+============
+
+.. toctree::
+    :maxdepth: 1
+
+    api
+    arkfb
+    aty128fb
+    cirrusfb
+    cmap_xfbdev
+    deferred_io
+    efifb
+    ep93xx-fb
+    fbcon
+    framebuffer
+    gxfb
+    intel810
+    intelfb
+    internals
+    lxfb
+    matroxfb
+    metronomefb
+    modedb
+    pvr2fb
+    pxafb
+    s3fb
+    sa1100fb
+    sh7760fb
+    sisfb
+    sm501
+    sm712fb
+    sstfb
+    tgafb
+    tridentfb
+    udlfb
+    uvesafb
+    vesafb
+    viafb
+    vt8623fb
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/fb/intel810.rst b/Documentation/fb/intel810.rst
new file mode 100644
index 000000000000..eb86098db91f
--- /dev/null
+++ b/Documentation/fb/intel810.rst
@@ -0,0 +1,287 @@
+================================
+Intel 810/815 Framebuffer driver
+================================
+
+Tony Daplas <adaplas@pol.net>
+
+http://i810fb.sourceforge.net
+
+March 17, 2002
+
+First Released: July 2001
+Last Update:    September 12, 2005
+
+A. Introduction
+===============
+
+	This is a framebuffer driver for various Intel 810/815 compatible
+	graphics devices.  These include:
+
+	- Intel 810
+	- Intel 810E
+	- Intel 810-DC100
+	- Intel 815 Internal graphics only, 100Mhz FSB
+	- Intel 815 Internal graphics only
+	- Intel 815 Internal graphics and AGP
+
+B.  Features
+============
+
+	- Choice of using Discrete Video Timings, VESA Generalized Timing
+	  Formula, or a framebuffer specific database to set the video mode
+
+	- Supports a variable range of horizontal and vertical resolution and
+	  vertical refresh rates if the VESA Generalized Timing Formula is
+	  enabled.
+
+	- Supports color depths of 8, 16, 24 and 32 bits per pixel
+
+	- Supports pseudocolor, directcolor, or truecolor visuals
+
+	- Full and optimized hardware acceleration at 8, 16 and 24 bpp
+
+	- Robust video state save and restore
+
+	- MTRR support
+
+	- Utilizes user-entered monitor specifications to automatically
+	  calculate required video mode parameters.
+
+	- Can concurrently run with xfree86 running with native i810 drivers
+
+	- Hardware Cursor Support
+
+	- Supports EDID probing either by DDC/I2C or through the BIOS
+
+C.  List of available options
+=============================
+
+   a. "video=i810fb"
+	enables the i810 driver
+
+	Recommendation: required
+
+   b. "xres:<value>"
+	select horizontal resolution in pixels. (This parameter will be
+	ignored if 'mode_option' is specified.  See 'o' below).
+
+	Recommendation: user preference
+	(default = 640)
+
+   c. "yres:<value>"
+	select vertical resolution in scanlines. If Discrete Video Timings
+	is enabled, this will be ignored and computed as 3*xres/4.  (This
+	parameter will be ignored if 'mode_option' is specified.  See 'o'
+	below)
+
+	Recommendation: user preference
+	(default = 480)
+
+   d. "vyres:<value>"
+	select virtual vertical resolution in scanlines. If (0) or none
+	is specified, this will be computed against maximum available memory.
+
+	Recommendation: do not set
+	(default = 480)
+
+   e. "vram:<value>"
+	select amount of system RAM in MB to allocate for the video memory
+
+	Recommendation: 1 - 4 MB.
+	(default = 4)
+
+   f. "bpp:<value>"
+	select desired pixel depth
+
+	Recommendation: 8
+	(default = 8)
+
+   g. "hsync1/hsync2:<value>"
+	select the minimum and maximum Horizontal Sync Frequency of the
+	monitor in kHz.  If using a fixed frequency monitor, hsync1 must
+	be equal to hsync2. If EDID probing is successful, these will be
+	ignored and values will be taken from the EDID block.
+
+	Recommendation: check monitor manual for correct values
+	(default = 29/30)
+
+   h. "vsync1/vsync2:<value>"
+	select the minimum and maximum Vertical Sync Frequency of the monitor
+	in Hz. You can also use this option to lock your monitor's refresh
+	rate. If EDID probing is successful, these will be ignored and values
+	will be taken from the EDID block.
+
+	Recommendation: check monitor manual for correct values
+	(default = 60/60)
+
+	IMPORTANT:  If you need to clamp your timings, try to give some
+	leeway for computational errors (over/underflows).  Example: if
+	using vsync1/vsync2 = 60/60, make sure hsync1/hsync2 has at least
+	a 1 unit difference, and vice versa.
+
+   i. "voffset:<value>"
+	select at what offset in MB of the logical memory to allocate the
+	framebuffer memory.  The intent is to avoid the memory blocks
+	used by standard graphics applications (XFree86).  The default
+	offset (16 MB for a 64 MB aperture, 8 MB for a 32 MB aperture) will
+	avoid XFree86's usage and allows up to 7 MB/15 MB of framebuffer
+	memory.  Depending on your usage, adjust the value up or down
+	(0 for maximum usage, 31/63 MB for the least amount).  Note, an
+	arbitrary setting may conflict with XFree86.
+
+	Recommendation: do not set
+	(default = 8 or 16 MB)
+
+   j. "accel"
+	enable text acceleration.  This can be enabled/reenabled anytime
+	by using 'fbset -accel true/false'.
+
+	Recommendation: enable
+	(default = not set)
+
+   k. "mtrr"
+	enable MTRR.  This allows data transfers to the framebuffer memory
+	to occur in bursts which can significantly increase performance.
+	Not very helpful with the i810/i815 because of 'shared memory'.
+
+	Recommendation: do not set
+	(default = not set)
+
+   l. "extvga"
+	if specified, secondary/external VGA output will always be enabled.
+	Useful if the BIOS turns off the VGA port when no monitor is attached.
+	The external VGA monitor can then be attached without rebooting.
+
+	Recommendation: do not set
+	(default = not set)
+
+   m. "sync"
+	Forces the hardware engine to do a "sync" or wait for the hardware
+	to finish before starting another instruction. This will produce a
+	more stable setup, but will be slower.
+
+	Recommendation: do not set
+	(default = not set)
+
+   n. "dcolor"
+	Use directcolor visual instead of truecolor for pixel depths greater
+	than 8 bpp.  Useful for color tuning, such as gamma control.
+
+	Recommendation: do not set
+	(default = not set)
+
+   o. <xres>x<yres>[-<bpp>][@<refresh>]
+	The driver will now accept specification of boot mode option.  If this
+	is specified, the options 'xres' and 'yres' will be ignored. See
+	Documentation/fb/modedb.rst for usage.
+
+D. Kernel booting
+=================
+
+Separate each option/option-pair by commas (,) and the option from its value
+with a colon (:) as in the following::
+
+	video=i810fb:option1,option2:value2
+
+Sample Usage
+------------
+
+In /etc/lilo.conf, add the line::
+
+  append="video=i810fb:vram:2,xres:1024,yres:768,bpp:8,hsync1:30,hsync2:55, \
+	  vsync1:50,vsync2:85,accel,mtrr"
+
+This will initialize the framebuffer to 1024x768 at 8bpp.  The framebuffer
+will use 2 MB of System RAM. MTRR support will be enabled. The refresh rate
+will be computed based on the hsync1/hsync2 and vsync1/vsync2 values.
+
+IMPORTANT:
+  You must include hsync1, hsync2, vsync1 and vsync2 to enable video modes
+  better than 640x480 at 60Hz. HOWEVER, if your chipset/display combination
+  supports I2C and has an EDID block, you can safely exclude hsync1, hsync2,
+  vsync1 and vsync2 parameters.  These parameters will be taken from the EDID
+  block.
+
+E.  Module options
+==================
+
+The module parameters are essentially similar to the kernel
+parameters. The main difference is that you need to include a Boolean value
+(1 for TRUE, and 0 for FALSE) for those options which don't need a value.
+
+Example, to enable MTRR, include "mtrr=1".
+
+Sample Usage
+------------
+
+Using the same setup as described above, load the module like this::
+
+	modprobe i810fb vram=2 xres=1024 bpp=8 hsync1=30 hsync2=55 vsync1=50 \
+		 vsync2=85 accel=1 mtrr=1
+
+Or just add the following to a configuration file in /etc/modprobe.d/::
+
+	options i810fb vram=2 xres=1024 bpp=16 hsync1=30 hsync2=55 vsync1=50 \
+	vsync2=85 accel=1 mtrr=1
+
+and just do a::
+
+	modprobe i810fb
+
+
+F.  Setup
+=========
+
+	a. Do your usual method of configuring the kernel
+
+	   make menuconfig/xconfig/config
+
+	b. Under "Code maturity level options" enable "Prompt for development
+	   and/or incomplete code/drivers".
+
+	c. Enable agpgart support for the Intel 810/815 on-board graphics.
+	   This is required.  The option is under "Character Devices".
+
+	d. Under "Graphics Support", select "Intel 810/815" either statically
+	   or as a module.  Choose "use VESA Generalized Timing Formula" if
+	   you need to maximize the capability of your display.  To be on the
+	   safe side, you can leave this unselected.
+
+	e. If you want support for DDC/I2C probing (Plug and Play Displays),
+	   set 'Enable DDC Support' to 'y'. To make this option appear, set
+	   'use VESA Generalized Timing Formula' to 'y'.
+
+	f. If you want a framebuffer console, enable it under "Console
+	   Drivers".
+
+	g. Compile your kernel.
+
+	h. Load the driver as described in sections D and E.
+
+	i.  Try the DirectFB (http://www.directfb.org) + the i810 gfxdriver
+	    patch to see the chipset in action (or inaction :-).
+
+G.  Acknowledgment:
+===================
+
+	1.  Geert Uytterhoeven - his excellent howto and the virtual
+	    framebuffer driver code made this possible.
+
+	2.  Jeff Hartmann for his agpgart code.
+
+	3.  The X developers.  Insights were provided just by reading the
+	    XFree86 source code.
+
+	4.  Intel(c).  For this value-oriented chipset driver and for
+	    providing documentation.
+
+	5. Matt Sottek.  His inputs and ideas  helped in making some
+	   optimizations possible.
+
+H.  Home Page:
+==============
+
+	A more complete, and probably updated information is provided at
+	http://i810fb.sourceforge.net.
+
+Tony
diff --git a/Documentation/fb/intel810.txt b/Documentation/fb/intel810.txt
deleted file mode 100644
index a8e9f5bca6f3..000000000000
--- a/Documentation/fb/intel810.txt
+++ /dev/null
@@ -1,278 +0,0 @@
-Intel 810/815 Framebuffer driver
- 	Tony Daplas <adaplas@pol.net>
-	http://i810fb.sourceforge.net
-
-	March 17, 2002
-
-	First Released: July 2001
-	Last Update:    September 12, 2005
-================================================================
-
-A. Introduction
-
-	This is a framebuffer driver for various Intel 810/815 compatible
-	graphics devices.  These include:
-
-	Intel 810
-	Intel 810E
-	Intel 810-DC100
-	Intel 815 Internal graphics only, 100Mhz FSB
-	Intel 815 Internal graphics only
-	Intel 815 Internal graphics and AGP
-
-B.  Features
-
-	- Choice of using Discrete Video Timings, VESA Generalized Timing
-	  Formula, or a framebuffer specific database to set the video mode
-
-	- Supports a variable range of horizontal and vertical resolution and
-	  vertical refresh rates if the VESA Generalized Timing Formula is
-	  enabled.
-
-	- Supports color depths of 8, 16, 24 and 32 bits per pixel
-
-	- Supports pseudocolor, directcolor, or truecolor visuals
-
-	- Full and optimized hardware acceleration at 8, 16 and 24 bpp
-
-	- Robust video state save and restore
-
-	- MTRR support
-
-	- Utilizes user-entered monitor specifications to automatically
-	  calculate required video mode parameters.
-
-	- Can concurrently run with xfree86 running with native i810 drivers
-
-	- Hardware Cursor Support
- 
-	- Supports EDID probing either by DDC/I2C or through the BIOS
-
-C.  List of available options
-
-   a. "video=i810fb"
-	enables the i810 driver
-
-	Recommendation: required
-
-   b. "xres:<value>"
-	select horizontal resolution in pixels. (This parameter will be
-	ignored if 'mode_option' is specified.  See 'o' below).
-
-	Recommendation: user preference
-	(default = 640)
-
-   c. "yres:<value>"
-	select vertical resolution in scanlines. If Discrete Video Timings
-	is enabled, this will be ignored and computed as 3*xres/4.  (This
-	parameter will be ignored if 'mode_option' is specified.  See 'o'
-	below)
-
-	Recommendation: user preference
-	(default = 480)
-
-   d. "vyres:<value>"
-	select virtual vertical resolution in scanlines. If (0) or none
-	is specified, this will be computed against maximum available memory.
-
-	Recommendation: do not set
-	(default = 480)
-
-   e. "vram:<value>"
-	select amount of system RAM in MB to allocate for the video memory
-
-	Recommendation: 1 - 4 MB.
-	(default = 4)
-
-   f. "bpp:<value>"
-	select desired pixel depth
-
-	Recommendation: 8
-	(default = 8)
-
-   g. "hsync1/hsync2:<value>"
-	select the minimum and maximum Horizontal Sync Frequency of the
-	monitor in kHz.  If using a fixed frequency monitor, hsync1 must
-	be equal to hsync2. If EDID probing is successful, these will be
-	ignored and values will be taken from the EDID block.
-
-	Recommendation: check monitor manual for correct values
-	(default = 29/30)
-
-   h. "vsync1/vsync2:<value>"
-	select the minimum and maximum Vertical Sync Frequency of the monitor
-	in Hz. You can also use this option to lock your monitor's refresh
-	rate. If EDID probing is successful, these will be ignored and values
-	will be taken from the EDID block.
-
-	Recommendation: check monitor manual for correct values
-	(default = 60/60)
-
-	IMPORTANT:  If you need to clamp your timings, try to give some
-	leeway for computational errors (over/underflows).  Example: if
-	using vsync1/vsync2 = 60/60, make sure hsync1/hsync2 has at least
-	a 1 unit difference, and vice versa.
-
-   i. "voffset:<value>"
-	select at what offset in MB of the logical memory to allocate the
-	framebuffer memory.  The intent is to avoid the memory blocks
-	used by standard graphics applications (XFree86).  The default
-	offset (16 MB for a 64 MB aperture, 8 MB for a 32 MB aperture) will
-	avoid XFree86's usage and allows up to 7 MB/15 MB of framebuffer
-	memory.  Depending on your usage, adjust the value up or down
-	(0 for maximum usage, 31/63 MB for the least amount).  Note, an
-	arbitrary setting may conflict with XFree86.
-
-	Recommendation: do not set
-	(default = 8 or 16 MB)
-
-   j. "accel"
-	enable text acceleration.  This can be enabled/reenabled anytime
-	by using 'fbset -accel true/false'.
-
-	Recommendation: enable
-	(default = not set)
-
-   k. "mtrr"
-	enable MTRR.  This allows data transfers to the framebuffer memory
-	to occur in bursts which can significantly increase performance.
-	Not very helpful with the i810/i815 because of 'shared memory'.
-
-	Recommendation: do not set
-	(default = not set)
-
-   l. "extvga"
-	if specified, secondary/external VGA output will always be enabled.
-	Useful if the BIOS turns off the VGA port when no monitor is attached.
-	The external VGA monitor can then be attached without rebooting.
-
-	Recommendation: do not set
-	(default = not set)
-
-   m. "sync"
-	Forces the hardware engine to do a "sync" or wait for the hardware
-	to finish before starting another instruction. This will produce a
-	more stable setup, but will be slower.
-
-	Recommendation: do not set
-	(default = not set)
-
-   n. "dcolor"
-        Use directcolor visual instead of truecolor for pixel depths greater
-	than 8 bpp.  Useful for color tuning, such as gamma control.
-
-	Recommendation: do not set
-	(default = not set)
-
-   o. <xres>x<yres>[-<bpp>][@<refresh>]
-	The driver will now accept specification of boot mode option.  If this
-	is specified, the options 'xres' and 'yres' will be ignored. See
-	Documentation/fb/modedb.txt for usage.
-
-D. Kernel booting
-
-Separate each option/option-pair by commas (,) and the option from its value
-with a colon (:) as in the following:
-
-video=i810fb:option1,option2:value2
-
-Sample Usage
-------------
-
-In /etc/lilo.conf, add the line:
-
-append="video=i810fb:vram:2,xres:1024,yres:768,bpp:8,hsync1:30,hsync2:55, \
-        vsync1:50,vsync2:85,accel,mtrr"
-
-This will initialize the framebuffer to 1024x768 at 8bpp.  The framebuffer
-will use 2 MB of System RAM. MTRR support will be enabled. The refresh rate
-will be computed based on the hsync1/hsync2 and vsync1/vsync2 values.
-
-IMPORTANT:
-You must include hsync1, hsync2, vsync1 and vsync2 to enable video modes
-better than 640x480 at 60Hz. HOWEVER, if your chipset/display combination
-supports I2C and has an EDID block, you can safely exclude hsync1, hsync2,
-vsync1 and vsync2 parameters.  These parameters will be taken from the EDID
-block.
-
-E.  Module options
-
-The module parameters are essentially similar to the kernel
-parameters. The main difference is that you need to include a Boolean value
-(1 for TRUE, and 0 for FALSE) for those options which don't need a value.
-
-Example, to enable MTRR, include "mtrr=1".
-
-Sample Usage
-------------
-
-Using the same setup as described above, load the module like this:
-
-	modprobe i810fb vram=2 xres=1024 bpp=8 hsync1=30 hsync2=55 vsync1=50 \
-	         vsync2=85 accel=1 mtrr=1
-
-Or just add the following to a configuration file in /etc/modprobe.d/
-
-	options i810fb vram=2 xres=1024 bpp=16 hsync1=30 hsync2=55 vsync1=50 \
-	vsync2=85 accel=1 mtrr=1
-
-and just do a
-
-	modprobe i810fb
-
-
-F.  Setup
-
-	a. Do your usual method of configuring the kernel.
-
-	make menuconfig/xconfig/config
-
-	b. Under "Code maturity level options" enable "Prompt for development
-	   and/or incomplete code/drivers".
-
- 	c. Enable agpgart support for the Intel 810/815 on-board graphics.
-	   This is required.  The option is under "Character Devices".
-
-	d. Under "Graphics Support", select "Intel 810/815" either statically
-	   or as a module.  Choose "use VESA Generalized Timing Formula" if
-	   you need to maximize the capability of your display.  To be on the
-	   safe side, you can leave this unselected.
-
-	e. If you want support for DDC/I2C probing (Plug and Play Displays),
-	   set 'Enable DDC Support' to 'y'. To make this option appear, set
-	   'use VESA Generalized Timing Formula' to 'y'.
-
-        f. If you want a framebuffer console, enable it under "Console
-	   Drivers".
-
-	g. Compile your kernel.
-
-	h. Load the driver as described in sections D and E.
-
-	i.  Try the DirectFB (http://www.directfb.org) + the i810 gfxdriver
-	    patch to see the chipset in action (or inaction :-).
-
-G.  Acknowledgment:
-
-	1.  Geert Uytterhoeven - his excellent howto and the virtual
-	    framebuffer driver code made this possible.
-
-	2.  Jeff Hartmann for his agpgart code.
-
-	3.  The X developers.  Insights were provided just by reading the
-	    XFree86 source code.
-
-	4.  Intel(c).  For this value-oriented chipset driver and for
-	    providing documentation.
-
-	5. Matt Sottek.  His inputs and ideas  helped in making some
-	   optimizations possible.
-
-H.  Home Page:
-
-	A more complete, and probably updated information is provided at
-	http://i810fb.sourceforge.net.
-
-###########################
-Tony
-
diff --git a/Documentation/fb/intelfb.rst b/Documentation/fb/intelfb.rst
new file mode 100644
index 000000000000..e2d0903f4efb
--- /dev/null
+++ b/Documentation/fb/intelfb.rst
@@ -0,0 +1,155 @@
+=============================================================
+Intel 830M/845G/852GM/855GM/865G/915G/945G Framebuffer driver
+=============================================================
+
+A. Introduction
+===============
+
+This is a framebuffer driver for various Intel 8xx/9xx compatible
+graphics devices.  These would include:
+
+	- Intel 830M
+	- Intel 845G
+	- Intel 852GM
+	- Intel 855GM
+	- Intel 865G
+	- Intel 915G
+	- Intel 915GM
+	- Intel 945G
+	- Intel 945GM
+	- Intel 945GME
+	- Intel 965G
+	- Intel 965GM
+
+B.  List of available options
+=============================
+
+   a. "video=intelfb"
+	enables the intelfb driver
+
+	Recommendation: required
+
+   b. "mode=<xres>x<yres>[-<bpp>][@<refresh>]"
+	select mode
+
+	Recommendation: user preference
+	(default = 1024x768-32@70)
+
+   c. "vram=<value>"
+	select amount of system RAM in MB to allocate for the video memory
+	if not enough RAM was already allocated by the BIOS.
+
+	Recommendation: 1 - 4 MB.
+	(default = 4 MB)
+
+   d. "voffset=<value>"
+	select at what offset in MB of the logical memory to allocate the
+	framebuffer memory.  The intent is to avoid the memory blocks
+	used by standard graphics applications (XFree86). Depending on your
+	usage, adjust the value up or down, (0 for maximum usage, 63/127 MB
+	for the least amount).  Note, an arbitrary setting may conflict
+	with XFree86.
+
+	Recommendation: do not set
+	(default = 48 MB)
+
+   e. "accel"
+	enable text acceleration.  This can be enabled/reenabled anytime
+	by using 'fbset -accel true/false'.
+
+	Recommendation: enable
+	(default = set)
+
+   f. "hwcursor"
+	enable cursor acceleration.
+
+	Recommendation: enable
+	(default = set)
+
+   g. "mtrr"
+	enable MTRR.  This allows data transfers to the framebuffer memory
+	to occur in bursts which can significantly increase performance.
+	Not very helpful with the intel chips because of 'shared memory'.
+
+	Recommendation: set
+	(default = set)
+
+   h. "fixed"
+	disable mode switching.
+
+	Recommendation: do not set
+	(default = not set)
+
+   The binary parameters can be unset with a "no" prefix, example "noaccel".
+   The default parameter (not named) is the mode.
+
+C. Kernel booting
+=================
+
+Separate each option/option-pair by commas (,) and the option from its value
+with an equals sign (=) as in the following::
+
+	video=intelfb:option1,option2=value2
+
+Sample Usage
+------------
+
+In /etc/lilo.conf, add the line::
+
+	append="video=intelfb:mode=800x600-32@75,accel,hwcursor,vram=8"
+
+This will initialize the framebuffer to 800x600 at 32bpp and 75Hz. The
+framebuffer will use 8 MB of System RAM. hw acceleration of text and cursor
+will be enabled.
+
+Remarks
+-------
+
+If setting this parameter doesn't work (you stay in a 80x25 text-mode),
+you might need to set the "vga=<mode>" parameter too - see vesafb.txt
+in this directory.
+
+
+D.  Module options
+==================
+
+The module parameters are essentially similar to the kernel
+parameters. The main difference is that you need to include a Boolean value
+(1 for TRUE, and 0 for FALSE) for those options which don't need a value.
+
+Example, to enable MTRR, include "mtrr=1".
+
+Sample Usage
+------------
+
+Using the same setup as described above, load the module like this::
+
+	modprobe intelfb mode=800x600-32@75 vram=8 accel=1 hwcursor=1
+
+Or just add the following to a configuration file in /etc/modprobe.d/::
+
+	options intelfb mode=800x600-32@75 vram=8 accel=1 hwcursor=1
+
+and just do a::
+
+	modprobe intelfb
+
+
+E.  Acknowledgment:
+===================
+
+	1.  Geert Uytterhoeven - his excellent howto and the virtual
+	    framebuffer driver code made this possible.
+
+	2.  Jeff Hartmann for his agpgart code.
+
+	3.  David Dawes for his original kernel 2.4 code.
+
+	4.  The X developers.  Insights were provided just by reading the
+	    XFree86 source code.
+
+	5.  Antonino A. Daplas for his inspiring i810fb driver.
+
+	6.  Andrew Morton for his kernel patches maintenance.
+
+Sylvain
diff --git a/Documentation/fb/intelfb.txt b/Documentation/fb/intelfb.txt
deleted file mode 100644
index feac4e4d6968..000000000000
--- a/Documentation/fb/intelfb.txt
+++ /dev/null
@@ -1,149 +0,0 @@
-Intel 830M/845G/852GM/855GM/865G/915G/945G Framebuffer driver
-================================================================
-
-A. Introduction
-	This is a framebuffer driver for various Intel 8xx/9xx compatible
-graphics devices.  These would include:
-
-	Intel 830M
-	Intel 845G
-	Intel 852GM
-	Intel 855GM
-	Intel 865G
-	Intel 915G
-	Intel 915GM
-	Intel 945G
-	Intel 945GM
-	Intel 945GME
-	Intel 965G
-	Intel 965GM
-
-B.  List of available options
-
-   a. "video=intelfb"
-	enables the intelfb driver
-
-	Recommendation: required
-
-   b. "mode=<xres>x<yres>[-<bpp>][@<refresh>]"
-	select mode
-
-	Recommendation: user preference
-	(default = 1024x768-32@70)
-
-   c. "vram=<value>"
-	select amount of system RAM in MB to allocate for the video memory
-	if not enough RAM was already allocated by the BIOS.
-
-	Recommendation: 1 - 4 MB.
-	(default = 4 MB)
-
-   d. "voffset=<value>"
-        select at what offset in MB of the logical memory to allocate the
-	framebuffer memory.  The intent is to avoid the memory blocks
-	used by standard graphics applications (XFree86). Depending on your
-        usage, adjust the value up or down, (0 for maximum usage, 63/127 MB
-        for the least amount).  Note, an arbitrary setting may conflict
-        with XFree86.
-
-	Recommendation: do not set
-	(default = 48 MB)
-
-   e. "accel"
-	enable text acceleration.  This can be enabled/reenabled anytime
-	by using 'fbset -accel true/false'.
-
-	Recommendation: enable
-	(default = set)
-
-   f. "hwcursor"
-	enable cursor acceleration.
-
-	Recommendation: enable
-	(default = set)
-
-   g. "mtrr"
-	enable MTRR.  This allows data transfers to the framebuffer memory
-	to occur in bursts which can significantly increase performance.
-	Not very helpful with the intel chips because of 'shared memory'.
-
-	Recommendation: set
-	(default = set)
-
-   h. "fixed"
-	disable mode switching.
-
-	Recommendation: do not set
-	(default = not set)
-
-   The binary parameters can be unset with a "no" prefix, example "noaccel".
-   The default parameter (not named) is the mode.
-
-C. Kernel booting
-
-Separate each option/option-pair by commas (,) and the option from its value
-with an equals sign (=) as in the following:
-
-video=intelfb:option1,option2=value2
-
-Sample Usage
-------------
-
-In /etc/lilo.conf, add the line:
-
-append="video=intelfb:mode=800x600-32@75,accel,hwcursor,vram=8"
-
-This will initialize the framebuffer to 800x600 at 32bpp and 75Hz. The
-framebuffer will use 8 MB of System RAM. hw acceleration of text and cursor
-will be enabled.
-
-Remarks
--------
-
-If setting this parameter doesn't work (you stay in a 80x25 text-mode),
-you might need to set the "vga=<mode>" parameter too - see vesafb.txt
-in this directory.
-
-
-D.  Module options
-
-	The module parameters are essentially similar to the kernel
-parameters. The main difference is that you need to include a Boolean value
-(1 for TRUE, and 0 for FALSE) for those options which don't need a value.
-
-Example, to enable MTRR, include "mtrr=1".
-
-Sample Usage
-------------
-
-Using the same setup as described above, load the module like this:
-
-	modprobe intelfb mode=800x600-32@75 vram=8 accel=1 hwcursor=1
-
-Or just add the following to a configuration file in /etc/modprobe.d/
-
-	options intelfb mode=800x600-32@75 vram=8 accel=1 hwcursor=1
-
-and just do a
-
-	modprobe intelfb
-
-
-E.  Acknowledgment:
-
-	1.  Geert Uytterhoeven - his excellent howto and the virtual
-                                 framebuffer driver code made this possible.
-
-	2.  Jeff Hartmann for his agpgart code.
-
-	3.  David Dawes for his original kernel 2.4 code.
-
-	4.  The X developers.  Insights were provided just by reading the
-	    XFree86 source code.
-
-	5.  Antonino A. Daplas for his inspiring i810fb driver.
-
-	6.  Andrew Morton for his kernel patches maintenance.
-
-###########################
-Sylvain
diff --git a/Documentation/fb/internals.rst b/Documentation/fb/internals.rst
new file mode 100644
index 000000000000..696b50aa7c24
--- /dev/null
+++ b/Documentation/fb/internals.rst
@@ -0,0 +1,86 @@
+=============================
+Frame Buffer device internals
+=============================
+
+This is a first start for some documentation about frame buffer device
+internals.
+
+Authors:
+
+- Geert Uytterhoeven <geert@linux-m68k.org>, 21 July 1998
+- James Simmons <jsimmons@user.sf.net>, Nov 26 2002
+
+--------------------------------------------------------------------------------
+
+Structures used by the frame buffer device API
+==============================================
+
+The following structures play a role in the game of frame buffer devices. They
+are defined in <linux/fb.h>.
+
+1. Outside the kernel (user space)
+
+  - struct fb_fix_screeninfo
+
+    Device independent unchangeable information about a frame buffer device and
+    a specific video mode. This can be obtained using the FBIOGET_FSCREENINFO
+    ioctl.
+
+  - struct fb_var_screeninfo
+
+    Device independent changeable information about a frame buffer device and a
+    specific video mode. This can be obtained using the FBIOGET_VSCREENINFO
+    ioctl, and updated with the FBIOPUT_VSCREENINFO ioctl. If you want to pan
+    the screen only, you can use the FBIOPAN_DISPLAY ioctl.
+
+  - struct fb_cmap
+
+    Device independent colormap information. You can get and set the colormap
+    using the FBIOGETCMAP and FBIOPUTCMAP ioctls.
+
+
+2. Inside the kernel
+
+  - struct fb_info
+
+    Generic information, API and low level information about a specific frame
+    buffer device instance (slot number, board address, ...).
+
+  - struct `par`
+
+    Device dependent information that uniquely defines the video mode for this
+    particular piece of hardware.
+
+
+Visuals used by the frame buffer device API
+===========================================
+
+
+Monochrome (FB_VISUAL_MONO01 and FB_VISUAL_MONO10)
+--------------------------------------------------
+Each pixel is either black or white.
+
+
+Pseudo color (FB_VISUAL_PSEUDOCOLOR and FB_VISUAL_STATIC_PSEUDOCOLOR)
+---------------------------------------------------------------------
+The whole pixel value is fed through a programmable lookup table that has one
+color (including red, green, and blue intensities) for each possible pixel
+value, and that color is displayed.
+
+
+True color (FB_VISUAL_TRUECOLOR)
+--------------------------------
+The pixel value is broken up into red, green, and blue fields.
+
+
+Direct color (FB_VISUAL_DIRECTCOLOR)
+------------------------------------
+The pixel value is broken up into red, green, and blue fields, each of which
+are looked up in separate red, green, and blue lookup tables.
+
+
+Grayscale displays
+------------------
+Grayscale and static grayscale are special variants of pseudo color and static
+pseudo color, where the red, green and blue components are always equal to
+each other.
diff --git a/Documentation/fb/internals.txt b/Documentation/fb/internals.txt
deleted file mode 100644
index 9b2a2b2f3e57..000000000000
--- a/Documentation/fb/internals.txt
+++ /dev/null
@@ -1,82 +0,0 @@
-
-This is a first start for some documentation about frame buffer device
-internals.
-
-Geert Uytterhoeven <geert@linux-m68k.org>, 21 July 1998
-James Simmons <jsimmons@user.sf.net>, Nov 26 2002
-
---------------------------------------------------------------------------------
-
-	    ***  STRUCTURES USED BY THE FRAME BUFFER DEVICE API  ***
-
-The following structures play a role in the game of frame buffer devices. They
-are defined in <linux/fb.h>.
-
-1. Outside the kernel (user space)
-
-  - struct fb_fix_screeninfo
-
-    Device independent unchangeable information about a frame buffer device and
-    a specific video mode. This can be obtained using the FBIOGET_FSCREENINFO
-    ioctl.
-
-  - struct fb_var_screeninfo
-
-    Device independent changeable information about a frame buffer device and a
-    specific video mode. This can be obtained using the FBIOGET_VSCREENINFO
-    ioctl, and updated with the FBIOPUT_VSCREENINFO ioctl. If you want to pan
-    the screen only, you can use the FBIOPAN_DISPLAY ioctl.
-
-  - struct fb_cmap
-
-    Device independent colormap information. You can get and set the colormap
-    using the FBIOGETCMAP and FBIOPUTCMAP ioctls.
-
-
-2. Inside the kernel
-
-  - struct fb_info
-
-    Generic information, API and low level information about a specific frame
-    buffer device instance (slot number, board address, ...).
-
-  - struct `par'
-
-    Device dependent information that uniquely defines the video mode for this
-    particular piece of hardware.
-
-
---------------------------------------------------------------------------------
-
-	    ***  VISUALS USED BY THE FRAME BUFFER DEVICE API  ***
-
-
-Monochrome (FB_VISUAL_MONO01 and FB_VISUAL_MONO10)
--------------------------------------------------
-Each pixel is either black or white.
-
-
-Pseudo color (FB_VISUAL_PSEUDOCOLOR and FB_VISUAL_STATIC_PSEUDOCOLOR)
----------------------------------------------------------------------
-The whole pixel value is fed through a programmable lookup table that has one
-color (including red, green, and blue intensities) for each possible pixel
-value, and that color is displayed.
-
-
-True color (FB_VISUAL_TRUECOLOR)
---------------------------------
-The pixel value is broken up into red, green, and blue fields.
-
-
-Direct color (FB_VISUAL_DIRECTCOLOR)
-------------------------------------
-The pixel value is broken up into red, green, and blue fields, each of which 
-are looked up in separate red, green, and blue lookup tables.
-
-
-Grayscale displays
-------------------
-Grayscale and static grayscale are special variants of pseudo color and static
-pseudo color, where the red, green and blue components are always equal to
-each other.
-
diff --git a/Documentation/fb/lxfb.rst b/Documentation/fb/lxfb.rst
new file mode 100644
index 000000000000..863e6b98fbae
--- /dev/null
+++ b/Documentation/fb/lxfb.rst
@@ -0,0 +1,55 @@
+=============
+What is lxfb?
+=============
+
+.. [This file is cloned from VesaFB/aty128fb]
+
+
+This is a graphics framebuffer driver for AMD Geode LX based processors.
+
+Advantages:
+
+ * No need to use AMD's VSA code (or other VESA emulation layer) in the
+   BIOS.
+ * It provides a nice large console (128 cols + 48 lines with 1024x768)
+   without using tiny, unreadable fonts.
+ * You can run XF68_FBDev on top of /dev/fb0
+ * Most important: boot logo :-)
+
+Disadvantages:
+
+ * graphic mode is slower than text mode...
+
+
+How to use it?
+==============
+
+Switching modes is done using  lxfb.mode_option=<resolution>... boot
+parameter or using `fbset` program.
+
+See Documentation/fb/modedb.rst for more information on modedb
+resolutions.
+
+
+X11
+===
+
+XF68_FBDev should generally work fine, but it is non-accelerated.
+
+
+Configuration
+=============
+
+You can pass kernel command line options to lxfb with lxfb.<option>.
+For example, lxfb.mode_option=800x600@75.
+Accepted options:
+
+================ ==================================================
+mode_option	 specify the video mode.  Of the form
+		 <x>x<y>[-<bpp>][@<refresh>]
+vram		 size of video ram (normally auto-detected)
+vt_switch	 enable vt switching during suspend/resume.  The vt
+		 switch is slow, but harmless.
+================ ==================================================
+
+Andres Salomon <dilinger@debian.org>
diff --git a/Documentation/fb/lxfb.txt b/Documentation/fb/lxfb.txt
deleted file mode 100644
index 38b3ca6f6ca7..000000000000
--- a/Documentation/fb/lxfb.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-[This file is cloned from VesaFB/aty128fb]
-
-What is lxfb?
-=================
-
-This is a graphics framebuffer driver for AMD Geode LX based processors.
-
-Advantages:
-
- * No need to use AMD's VSA code (or other VESA emulation layer) in the
-   BIOS.
- * It provides a nice large console (128 cols + 48 lines with 1024x768)
-   without using tiny, unreadable fonts.
- * You can run XF68_FBDev on top of /dev/fb0
- * Most important: boot logo :-)
-
-Disadvantages:
-
- * graphic mode is slower than text mode...
-
-
-How to use it?
-==============
-
-Switching modes is done using  lxfb.mode_option=<resolution>... boot
-parameter or using `fbset' program.
-
-See Documentation/fb/modedb.txt for more information on modedb
-resolutions.
-
-
-X11
-===
-
-XF68_FBDev should generally work fine, but it is non-accelerated.
-
-
-Configuration
-=============
-
-You can pass kernel command line options to lxfb with lxfb.<option>.
-For example, lxfb.mode_option=800x600@75.
-Accepted options:
-
-mode_option	- specify the video mode.  Of the form
-		  <x>x<y>[-<bpp>][@<refresh>]
-vram		- size of video ram (normally auto-detected)
-vt_switch	- enable vt switching during suspend/resume.  The vt
-		  switch is slow, but harmless.
-
---
-Andres Salomon <dilinger@debian.org>
diff --git a/Documentation/fb/matroxfb.rst b/Documentation/fb/matroxfb.rst
new file mode 100644
index 000000000000..f1859d98606e
--- /dev/null
+++ b/Documentation/fb/matroxfb.rst
@@ -0,0 +1,443 @@
+=================
+What is matroxfb?
+=================
+
+.. [This file is cloned from VesaFB. Thanks go to Gerd Knorr]
+
+
+This is a driver for a graphic framebuffer for Matrox devices on
+Alpha, Intel and PPC boxes.
+
+Advantages:
+
+ * It provides a nice large console (128 cols + 48 lines with 1024x768)
+   without using tiny, unreadable fonts.
+ * You can run XF{68,86}_FBDev or XFree86 fbdev driver on top of /dev/fb0
+ * Most important: boot logo :-)
+
+Disadvantages:
+
+ * graphic mode is slower than text mode... but you should not notice
+   if you use same resolution as you used in textmode.
+
+
+How to use it?
+==============
+
+Switching modes is done using the video=matroxfb:vesa:... boot parameter
+or using `fbset` program.
+
+If you want, for example, enable a resolution of 1280x1024x24bpp you should
+pass to the kernel this command line: "video=matroxfb:vesa:0x1BB".
+
+You should compile in both vgacon (to boot if you remove you Matrox from
+box) and matroxfb (for graphics mode). You should not compile-in vesafb
+unless you have primary display on non-Matrox VBE2.0 device (see
+Documentation/fb/vesafb.rst for details).
+
+Currently supported video modes are (through vesa:... interface, PowerMac
+has [as addon] compatibility code):
+
+
+Graphic modes
+-------------
+
+===  =======  =======  =======  =======  =======
+bpp  640x400  640x480  768x576  800x600  960x720
+===  =======  =======  =======  =======  =======
+  4             0x12             0x102
+  8   0x100    0x101    0x180    0x103    0x188
+ 15            0x110    0x181    0x113    0x189
+ 16            0x111    0x182    0x114    0x18A
+ 24            0x1B2    0x184    0x1B5    0x18C
+ 32            0x112    0x183    0x115    0x18B
+===  =======  =======  =======  =======  =======
+
+
+Graphic modes (continued)
+-------------------------
+
+===  ======== ======== ========= ========= =========
+bpp  1024x768 1152x864 1280x1024 1408x1056 1600x1200
+===  ======== ======== ========= ========= =========
+  4    0x104             0x106
+  8    0x105    0x190    0x107     0x198     0x11C
+ 15    0x116    0x191    0x119     0x199     0x11D
+ 16    0x117    0x192    0x11A     0x19A     0x11E
+ 24    0x1B8    0x194    0x1BB     0x19C     0x1BF
+ 32    0x118    0x193    0x11B     0x19B
+===  ======== ======== ========= ========= =========
+
+
+Text modes
+----------
+
+==== =======  =======  ========  ========  ========
+text 640x400  640x480  1056x344  1056x400  1056x480
+==== =======  =======  ========  ========  ========
+ 8x8   0x1C0    0x108     0x10A     0x10B     0x10C
+8x16 2, 3, 7                        0x109
+==== =======  =======  ========  ========  ========
+
+You can enter these number either hexadecimal (leading `0x`) or decimal
+(0x100 = 256). You can also use value + 512 to achieve compatibility
+with your old number passed to vesafb.
+
+Non-listed number can be achieved by more complicated command-line, for
+example 1600x1200x32bpp can be specified by `video=matroxfb:vesa:0x11C,depth:32`.
+
+
+X11
+===
+
+XF{68,86}_FBDev should work just fine, but it is non-accelerated. On non-intel
+architectures there are some glitches for 24bpp videomodes. 8, 16 and 32bpp
+works fine.
+
+Running another (accelerated) X-Server like XF86_SVGA works too. But (at least)
+XFree servers have big troubles in multihead configurations (even on first
+head, not even talking about second). Running XFree86 4.x accelerated mga
+driver is possible, but you must not enable DRI - if you do, resolution and
+color depth of your X desktop must match resolution and color depths of your
+virtual consoles, otherwise X will corrupt accelerator settings.
+
+
+SVGALib
+=======
+
+Driver contains SVGALib compatibility code. It is turned on by choosing textual
+mode for console. You can do it at boot time by using videomode
+2,3,7,0x108-0x10C or 0x1C0. At runtime, `fbset -depth 0` does this work.
+Unfortunately, after SVGALib application exits, screen contents is corrupted.
+Switching to another console and back fixes it. I hope that it is SVGALib's
+problem and not mine, but I'm not sure.
+
+
+Configuration
+=============
+
+You can pass kernel command line options to matroxfb with
+`video=matroxfb:option1,option2:value2,option3` (multiple options should be
+separated by comma, values are separated from options by `:`).
+Accepted options:
+
+============ ===================================================================
+mem:X        size of memory (X can be in megabytes, kilobytes or bytes)
+	     You can only decrease value determined by driver because of
+	     it always probe for memory. Default is to use whole detected
+	     memory usable for on-screen display (i.e. max. 8 MB).
+disabled     do not load driver; you can use also `off`, but `disabled`
+	     is here too.
+enabled      load driver, if you have `video=matroxfb:disabled` in LILO
+	     configuration, you can override it by this (you cannot override
+	     `off`). It is default.
+noaccel      do not use acceleration engine. It does not work on Alphas.
+accel        use acceleration engine. It is default.
+nopan        create initial consoles with vyres = yres, thus disabling virtual
+	     scrolling.
+pan          create initial consoles as tall as possible (vyres = memory/vxres).
+	     It is default.
+nopciretry   disable PCI retries. It is needed for some broken chipsets,
+	     it is autodetected for intel's 82437. In this case device does
+	     not comply to PCI 2.1 specs (it will not guarantee that every
+	     transaction terminate with success or retry in 32 PCLK).
+pciretry     enable PCI retries. It is default, except for intel's 82437.
+novga        disables VGA I/O ports. It is default if BIOS did not enable
+	     device. You should not use this option, some boards then do not
+	     restart without power off.
+vga          preserve state of VGA I/O ports. It is default. Driver does not
+	     enable VGA I/O if BIOS did not it (it is not safe to enable it in
+	     most cases).
+nobios       disables BIOS ROM. It is default if BIOS did not enable BIOS
+	     itself. You should not use this option, some boards then do not
+	     restart without power off.
+bios         preserve state of BIOS ROM. It is default. Driver does not enable
+	     BIOS if BIOS was not enabled before.
+noinit       tells driver, that devices were already initialized. You should use
+	     it if you have G100 and/or if driver cannot detect memory, you see
+	     strange pattern on screen and so on. Devices not enabled by BIOS
+	     are still initialized. It is default.
+init         driver initializes every device it knows about.
+memtype      specifies memory type, implies 'init'. This is valid only for G200
+	     and G400 and has following meaning:
+
+	       G200:
+		 -  0 -> 2x128Kx32 chips, 2MB onboard, probably sgram
+		 -  1 -> 2x128Kx32 chips, 4MB onboard, probably sgram
+		 -  2 -> 2x256Kx32 chips, 4MB onboard, probably sgram
+		 -  3 -> 2x256Kx32 chips, 8MB onboard, probably sgram
+		 -  4 -> 2x512Kx16 chips, 8/16MB onboard, probably sdram only
+		 -  5 -> same as above
+		 -  6 -> 4x128Kx32 chips, 4MB onboard, probably sgram
+		 -  7 -> 4x128Kx32 chips, 8MB onboard, probably sgram
+	       G400:
+		 -  0 -> 2x512Kx16 SDRAM, 16/32MB
+		 -	 2x512Kx32 SGRAM, 16/32MB
+		 -  1 -> 2x256Kx32 SGRAM, 8/16MB
+		 -  2 -> 4x128Kx32 SGRAM, 8/16MB
+		 -  3 -> 4x512Kx32 SDRAM, 32MB
+		 -  4 -> 4x256Kx32 SGRAM, 16/32MB
+		 -  5 -> 2x1Mx32 SDRAM, 32MB
+		 -  6 -> reserved
+		 -  7 -> reserved
+
+	     You should use sdram or sgram parameter in addition to memtype
+	     parameter.
+nomtrr       disables write combining on frame buffer. This slows down driver
+	     but there is reported minor incompatibility between GUS DMA and
+	     XFree under high loads if write combining is enabled (sound
+	     dropouts).
+mtrr         enables write combining on frame buffer. It speeds up video
+	     accesses much. It is default. You must have MTRR support enabled
+	     in kernel and your CPU must have MTRR (f.e. Pentium II have them).
+sgram        tells to driver that you have Gxx0 with SGRAM memory. It has no
+	     effect without `init`.
+sdram        tells to driver that you have Gxx0 with SDRAM memory.
+	     It is a default.
+inv24        change timings parameters for 24bpp modes on Millennium and
+	     Millennium II. Specify this if you see strange color shadows
+	     around  characters.
+noinv24      use standard timings. It is the default.
+inverse      invert colors on screen (for LCD displays)
+noinverse    show true colors on screen. It is default.
+dev:X        bind driver to device X. Driver numbers device from 0 up to N,
+	     where device 0 is first `known` device found, 1 second and so on.
+	     lspci lists devices in this order.
+	     Default is `every` known device.
+nohwcursor   disables hardware cursor (use software cursor instead).
+hwcursor     enables hardware cursor. It is default. If you are using
+	     non-accelerated mode (`noaccel` or `fbset -accel false`), software
+	     cursor is used (except for text mode).
+noblink      disables cursor blinking. Cursor in text mode always blinks (hw
+	     limitation).
+blink        enables cursor blinking. It is default.
+nofastfont   disables fastfont feature. It is default.
+fastfont:X   enables fastfont feature. X specifies size of memory reserved for
+	     font data, it must be >= (fontwidth*fontheight*chars_in_font)/8.
+	     It is faster on Gx00 series, but slower on older cards.
+grayscale    enable grayscale summing. It works in PSEUDOCOLOR modes (text,
+	     4bpp, 8bpp). In DIRECTCOLOR modes it is limited to characters
+	     displayed through putc/putcs. Direct accesses to framebuffer
+	     can paint colors.
+nograyscale  disable grayscale summing. It is default.
+cross4MB     enables that pixel line can cross 4MB boundary. It is default for
+	     non-Millennium.
+nocross4MB   pixel line must not cross 4MB boundary. It is default for
+	     Millennium I or II, because of these devices have hardware
+	     limitations which do not allow this. But this option is
+	     incompatible with some (if not all yet released) versions of
+	     XF86_FBDev.
+dfp          enables digital flat panel interface. This option is incompatible
+	     with secondary (TV) output - if DFP is active, TV output must be
+	     inactive and vice versa. DFP always uses same timing as primary
+	     (monitor) output.
+dfp:X        use settings X for digital flat panel interface. X is number from
+	     0 to 0xFF, and meaning of each individual bit is described in
+	     G400 manual, in description of DAC register 0x1F. For normal
+	     operation you should set all bits to zero, except lowest bit. This
+	     lowest bit selects who is source of display clocks, whether G400,
+	     or panel. Default value is now read back from hardware - so you
+	     should specify this value only if you are also using `init`
+	     parameter.
+outputs:XYZ  set mapping between CRTC and outputs. Each letter can have value
+	     of 0 (for no CRTC), 1 (CRTC1) or 2 (CRTC2), and first letter
+	     corresponds to primary analog output, second letter to the
+	     secondary analog output and third letter to the DVI output.
+	     Default setting is 100 for cards below G400 or G400 without DFP,
+	     101 for G400 with DFP, and 111 for G450 and G550. You can set
+	     mapping only on first card, use matroxset for setting up other
+	     devices.
+vesa:X       selects startup videomode. X is number from 0 to 0x1FF, see table
+	     above for detailed explanation. Default is 640x480x8bpp if driver
+	     has 8bpp support. Otherwise first available of 640x350x4bpp,
+	     640x480x15bpp, 640x480x24bpp, 640x480x32bpp or 80x25 text
+	     (80x25 text is always available).
+============ ===================================================================
+
+If you are not satisfied with videomode selected by `vesa` option, you
+can modify it with these options:
+
+============ ===================================================================
+xres:X       horizontal resolution, in pixels. Default is derived from `vesa`
+	     option.
+yres:X       vertical resolution, in pixel lines. Default is derived from `vesa`
+	     option.
+upper:X      top boundary: lines between end of VSYNC pulse and start of first
+	     pixel line of picture. Default is derived from `vesa` option.
+lower:X      bottom boundary: lines between end of picture and start of VSYNC
+	     pulse. Default is derived from `vesa` option.
+vslen:X      length of VSYNC pulse, in lines. Default is derived from `vesa`
+	     option.
+left:X       left boundary: pixels between end of HSYNC pulse and first pixel.
+	     Default is derived from `vesa` option.
+right:X      right boundary: pixels between end of picture and start of HSYNC
+	     pulse. Default is derived from `vesa` option.
+hslen:X      length of HSYNC pulse, in pixels. Default is derived from `vesa`
+	     option.
+pixclock:X   dotclocks, in ps (picoseconds). Default is derived from `vesa`
+	     option and from `fh` and `fv` options.
+sync:X       sync. pulse - bit 0 inverts HSYNC polarity, bit 1 VSYNC polarity.
+	     If bit 3 (value 0x08) is set, composite sync instead of HSYNC is
+	     generated. If bit 5 (value 0x20) is set, sync on green is turned
+	     on. Do not forget that if you want sync on green, you also probably
+	     want composite sync.
+	     Default depends on `vesa`.
+depth:X      Bits per pixel: 0=text, 4,8,15,16,24 or 32. Default depends on
+	     `vesa`.
+============ ===================================================================
+
+If you know capabilities of your monitor, you can specify some (or all) of
+`maxclk`, `fh` and `fv`. In this case, `pixclock` is computed so that
+pixclock <= maxclk, real_fh <= fh and real_fv <= fv.
+
+============ ==================================================================
+maxclk:X     maximum dotclock. X can be specified in MHz, kHz or Hz. Default is
+	     `don`t care`.
+fh:X         maximum horizontal synchronization frequency. X can be specified
+	     in kHz or Hz. Default is `don't care`.
+fv:X         maximum vertical frequency. X must be specified in Hz. Default is
+	     70 for modes derived from `vesa` with yres <= 400, 60Hz for
+	     yres > 400.
+============ ==================================================================
+
+
+Limitations
+===========
+
+There are known and unknown bugs, features and misfeatures.
+Currently there are following known bugs:
+
+ - SVGALib does not restore screen on exit
+ - generic fbcon-cfbX procedures do not work on Alphas. Due to this,
+   `noaccel` (and cfb4 accel) driver does not work on Alpha. So everyone
+   with access to `/dev/fb*` on Alpha can hang machine (you should restrict
+   access to `/dev/fb*` - everyone with access to this device can destroy
+   your monitor, believe me...).
+ - 24bpp does not support correctly XF-FBDev on big-endian architectures.
+ - interlaced text mode is not supported; it looks like hardware limitation,
+   but I'm not sure.
+ - Gxx0 SGRAM/SDRAM is not autodetected.
+ - If you are using more than one framebuffer device, you must boot kernel
+   with 'video=scrollback:0'.
+ - maybe more...
+
+And following misfeatures:
+
+ - SVGALib does not restore screen on exit.
+ - pixclock for text modes is limited by hardware to
+
+    - 83 MHz on G200
+    - 66 MHz on Millennium I
+    - 60 MHz on Millennium II
+
+   Because I have no access to other devices, I do not know specific
+   frequencies for them. So driver does not check this and allows you to
+   set frequency higher that this. It causes sparks, black holes and other
+   pretty effects on screen. Device was not destroyed during tests. :-)
+ - my Millennium G200 oscillator has frequency range from 35 MHz to 380 MHz
+   (and it works with 8bpp on about 320 MHz dotclocks (and changed mclk)).
+   But Matrox says on product sheet that VCO limit is 50-250 MHz, so I believe
+   them (maybe that chip overheats, but it has a very big cooler (G100 has
+   none), so it should work).
+ - special mixed video/graphics videomodes of Mystique and Gx00 - 2G8V16 and
+   G16V16 are not supported
+ - color keying is not supported
+ - feature connector of Mystique and Gx00 is set to VGA mode (it is disabled
+   by BIOS)
+ - DDC (monitor detection) is supported through dualhead driver
+ - some check for input values are not so strict how it should be (you can
+   specify vslen=4000 and so on).
+ - maybe more...
+
+And following features:
+
+ - 4bpp is available only on Millennium I and Millennium II. It is hardware
+   limitation.
+ - selection between 1:5:5:5 and 5:6:5 16bpp videomode is done by -rgba
+   option of fbset: "fbset -depth 16 -rgba 5,5,5" selects 1:5:5:5, anything
+   else selects 5:6:5 mode.
+ - text mode uses 6 bit VGA palette instead of 8 bit (one of 262144 colors
+   instead of one of 16M colors). It is due to hardware limitation of
+   Millennium I/II and SVGALib compatibility.
+
+
+Benchmarks
+==========
+It is time to redraw whole screen 1000 times in 1024x768, 60Hz. It is
+time for draw 6144000 characters on screen through /dev/vcsa
+(for 32bpp it is about 3GB of data (exactly 3000 MB); for 8x16 font in
+16 seconds, i.e. 187 MBps).
+Times were obtained from one older version of driver, now they are about 3%
+faster, it is kernel-space only time on P-II/350 MHz, Millennium I in 33 MHz
+PCI slot, G200 in AGP 2x slot. I did not test vgacon::
+
+  NOACCEL
+	8x16                 12x22
+	Millennium I  G200   Millennium I  G200
+  8bpp    16.42         9.54   12.33         9.13
+  16bpp   21.00        15.70   19.11        15.02
+  24bpp   36.66        36.66   35.00        35.00
+  32bpp   35.00        30.00   33.85        28.66
+
+  ACCEL, nofastfont
+	8x16                 12x22                6x11
+	Millennium I  G200   Millennium I  G200   Millennium I  G200
+  8bpp     7.79         7.24   13.55         7.78   30.00        21.01
+  16bpp    9.13         7.78   16.16         7.78   30.00        21.01
+  24bpp   14.17        10.72   18.69        10.24   34.99        21.01
+  32bpp   16.15	     16.16   18.73        13.09   34.99        21.01
+
+  ACCEL, fastfont
+	8x16                 12x22                6x11
+	Millennium I  G200   Millennium I  G200   Millennium I  G200
+  8bpp     8.41         6.01    6.54         4.37   16.00        10.51
+  16bpp    9.54         9.12    8.76         6.17   17.52        14.01
+  24bpp   15.00        12.36   11.67        10.00   22.01        18.32
+  32bpp   16.18        18.29*  12.71        12.74   24.44        21.00
+
+  TEXT
+	8x16
+	Millennium I  G200
+  TEXT     3.29         1.50
+
+  * Yes, it is slower than Millennium I.
+
+
+Dualhead G400
+=============
+Driver supports dualhead G400 with some limitations:
+ + secondary head shares videomemory with primary head. It is not problem
+   if you have 32MB of videoram, but if you have only 16MB, you may have
+   to think twice before choosing videomode (for example twice 1880x1440x32bpp
+   is not possible).
+ + due to hardware limitation, secondary head can use only 16 and 32bpp
+   videomodes.
+ + secondary head is not accelerated. There were bad problems with accelerated
+   XFree when secondary head used to use acceleration.
+ + secondary head always powerups in 640x480@60-32 videomode. You have to use
+   fbset to change this mode.
+ + secondary head always powerups in monitor mode. You have to use fbmatroxset
+   to change it to TV mode. Also, you must select at least 525 lines for
+   NTSC output and 625 lines for PAL output.
+ + kernel is not fully multihead ready. So some things are impossible to do.
+ + if you compiled it as module, you must insert i2c-matroxfb, matroxfb_maven
+   and matroxfb_crtc2 into kernel.
+
+
+Dualhead G450
+=============
+Driver supports dualhead G450 with some limitations:
+ + secondary head shares videomemory with primary head. It is not problem
+   if you have 32MB of videoram, but if you have only 16MB, you may have
+   to think twice before choosing videomode.
+ + due to hardware limitation, secondary head can use only 16 and 32bpp
+   videomodes.
+ + secondary head is not accelerated.
+ + secondary head always powerups in 640x480@60-32 videomode. You have to use
+   fbset to change this mode.
+ + TV output is not supported
+ + kernel is not fully multihead ready, so some things are impossible to do.
+ + if you compiled it as module, you must insert matroxfb_g450 and matroxfb_crtc2
+   into kernel.
+
+Petr Vandrovec <vandrove@vc.cvut.cz>
diff --git a/Documentation/fb/matroxfb.txt b/Documentation/fb/matroxfb.txt
deleted file mode 100644
index b95f5bb522f2..000000000000
--- a/Documentation/fb/matroxfb.txt
+++ /dev/null
@@ -1,413 +0,0 @@
-[This file is cloned from VesaFB. Thanks go to Gerd Knorr]
-
-What is matroxfb?
-=================
-
-This is a driver for a graphic framebuffer for Matrox devices on
-Alpha, Intel and PPC boxes.
-
-Advantages:
-
- * It provides a nice large console (128 cols + 48 lines with 1024x768)
-   without using tiny, unreadable fonts.
- * You can run XF{68,86}_FBDev or XFree86 fbdev driver on top of /dev/fb0
- * Most important: boot logo :-)
-
-Disadvantages:
-
- * graphic mode is slower than text mode... but you should not notice
-   if you use same resolution as you used in textmode.
-
-
-How to use it?
-==============
-
-Switching modes is done using the video=matroxfb:vesa:... boot parameter
-or using `fbset' program.
-
-If you want, for example, enable a resolution of 1280x1024x24bpp you should
-pass to the kernel this command line: "video=matroxfb:vesa:0x1BB".
-
-You should compile in both vgacon (to boot if you remove you Matrox from
-box) and matroxfb (for graphics mode). You should not compile-in vesafb
-unless you have primary display on non-Matrox VBE2.0 device (see 
-Documentation/fb/vesafb.txt for details).
-
-Currently supported video modes are (through vesa:... interface, PowerMac
-has [as addon] compatibility code):
-
-
-[Graphic modes]
-
-bpp | 640x400  640x480  768x576  800x600  960x720
-----+--------------------------------------------
-  4 |            0x12             0x102            
-  8 |  0x100    0x101    0x180    0x103    0x188   
- 15 |           0x110    0x181    0x113    0x189   
- 16 |           0x111    0x182    0x114    0x18A   
- 24 |           0x1B2    0x184    0x1B5    0x18C   
- 32 |           0x112    0x183    0x115    0x18B   
-
-
-[Graphic modes (continued)]
-
-bpp | 1024x768 1152x864 1280x1024 1408x1056 1600x1200
-----+------------------------------------------------
-  4 |   0x104             0x106
-  8 |   0x105    0x190    0x107     0x198     0x11C
- 15 |   0x116    0x191    0x119     0x199     0x11D
- 16 |   0x117    0x192    0x11A     0x19A     0x11E
- 24 |   0x1B8    0x194    0x1BB     0x19C     0x1BF
- 32 |   0x118    0x193    0x11B     0x19B
-
-
-[Text modes]
-
-text | 640x400  640x480  1056x344  1056x400  1056x480
------+------------------------------------------------
- 8x8 |  0x1C0    0x108     0x10A     0x10B     0x10C
-8x16 | 2, 3, 7                       0x109
-
-You can enter these number either hexadecimal (leading `0x') or decimal
-(0x100 = 256). You can also use value + 512 to achieve compatibility
-with your old number passed to vesafb.
-
-Non-listed number can be achieved by more complicated command-line, for
-example 1600x1200x32bpp can be specified by `video=matroxfb:vesa:0x11C,depth:32'.
-
-
-X11
-===
-
-XF{68,86}_FBDev should work just fine, but it is non-accelerated. On non-intel
-architectures there are some glitches for 24bpp videomodes. 8, 16 and 32bpp
-works fine.
-
-Running another (accelerated) X-Server like XF86_SVGA works too. But (at least)
-XFree servers have big troubles in multihead configurations (even on first
-head, not even talking about second). Running XFree86 4.x accelerated mga 
-driver is possible, but you must not enable DRI - if you do, resolution and
-color depth of your X desktop must match resolution and color depths of your
-virtual consoles, otherwise X will corrupt accelerator settings.
-
-
-SVGALib
-=======
-
-Driver contains SVGALib compatibility code. It is turned on by choosing textual
-mode for console. You can do it at boot time by using videomode
-2,3,7,0x108-0x10C or 0x1C0. At runtime, `fbset -depth 0' does this work.
-Unfortunately, after SVGALib application exits, screen contents is corrupted.
-Switching to another console and back fixes it. I hope that it is SVGALib's
-problem and not mine, but I'm not sure.
-
-
-Configuration
-=============
-
-You can pass kernel command line options to matroxfb with
-`video=matroxfb:option1,option2:value2,option3' (multiple options should be 
-separated by comma, values are separated from options by `:'). 
-Accepted options:
-
-mem:X    - size of memory (X can be in megabytes, kilobytes or bytes)
-           You can only decrease value determined by driver because of
-	   it always probe for memory. Default is to use whole detected
-	   memory usable for on-screen display (i.e. max. 8 MB).
-disabled - do not load driver; you can use also `off', but `disabled'
-           is here too.
-enabled  - load driver, if you have `video=matroxfb:disabled' in LILO
-           configuration, you can override it by this (you cannot override
-	   `off'). It is default.
-noaccel  - do not use acceleration engine. It does not work on Alphas.
-accel    - use acceleration engine. It is default.
-nopan    - create initial consoles with vyres = yres, thus disabling virtual
-           scrolling.
-pan      - create initial consoles as tall as possible (vyres = memory/vxres).
-           It is default.
-nopciretry - disable PCI retries. It is needed for some broken chipsets,
-           it is autodetected for intel's 82437. In this case device does
-	   not comply to PCI 2.1 specs (it will not guarantee that every
-	   transaction terminate with success or retry in 32 PCLK).
-pciretry - enable PCI retries. It is default, except for intel's 82437.
-novga    - disables VGA I/O ports. It is default if BIOS did not enable device.
-           You should not use this option, some boards then do not restart
-	   without power off.
-vga      - preserve state of VGA I/O ports. It is default. Driver does not
-           enable VGA I/O if BIOS did not it (it is not safe to enable it in
-	   most cases).
-nobios   - disables BIOS ROM. It is default if BIOS did not enable BIOS itself.
-           You should not use this option, some boards then do not restart
-	   without power off.
-bios     - preserve state of BIOS ROM. It is default. Driver does not enable
-           BIOS if BIOS was not enabled before.
-noinit   - tells driver, that devices were already initialized. You should use
-           it if you have G100 and/or if driver cannot detect memory, you see
-	   strange pattern on screen and so on. Devices not enabled by BIOS
-	   are still initialized. It is default.
-init     - driver initializes every device it knows about.
-memtype  - specifies memory type, implies 'init'. This is valid only for G200 
-           and G400 and has following meaning:
-             G200: 0 -> 2x128Kx32 chips, 2MB onboard, probably sgram
-                   1 -> 2x128Kx32 chips, 4MB onboard, probably sgram
-                   2 -> 2x256Kx32 chips, 4MB onboard, probably sgram
-                   3 -> 2x256Kx32 chips, 8MB onboard, probably sgram
-                   4 -> 2x512Kx16 chips, 8/16MB onboard, probably sdram only
-                   5 -> same as above
-                   6 -> 4x128Kx32 chips, 4MB onboard, probably sgram
-                   7 -> 4x128Kx32 chips, 8MB onboard, probably sgram
-             G400: 0 -> 2x512Kx16 SDRAM, 16/32MB
-                        2x512Kx32 SGRAM, 16/32MB
-                   1 -> 2x256Kx32 SGRAM, 8/16MB
-                   2 -> 4x128Kx32 SGRAM, 8/16MB
-                   3 -> 4x512Kx32 SDRAM, 32MB
-                   4 -> 4x256Kx32 SGRAM, 16/32MB
-                   5 -> 2x1Mx32 SDRAM, 32MB
-                   6 -> reserved
-                   7 -> reserved
-           You should use sdram or sgram parameter in addition to memtype 
-           parameter.
-nomtrr   - disables write combining on frame buffer. This slows down driver but
-           there is reported minor incompatibility between GUS DMA and XFree
-	   under high loads if write combining is enabled (sound dropouts).
-mtrr     - enables write combining on frame buffer. It speeds up video accesses
-           much. It is default. You must have MTRR support enabled in kernel
-	   and your CPU must have MTRR (f.e. Pentium II have them).
-sgram    - tells to driver that you have Gxx0 with SGRAM memory. It has no
-           effect without `init'.
-sdram    - tells to driver that you have Gxx0 with SDRAM memory.
-           It is a default.
-inv24    - change timings parameters for 24bpp modes on Millennium and
-           Millennium II. Specify this if you see strange color shadows around
-	   characters.
-noinv24  - use standard timings. It is the default.
-inverse  - invert colors on screen (for LCD displays)
-noinverse - show true colors on screen. It is default.
-dev:X    - bind driver to device X. Driver numbers device from 0 up to N,
-           where device 0 is first `known' device found, 1 second and so on.
-	   lspci lists devices in this order.
-	   Default is `every' known device.
-nohwcursor - disables hardware cursor (use software cursor instead).
-hwcursor - enables hardware cursor. It is default. If you are using
-           non-accelerated mode (`noaccel' or `fbset -accel false'), software
-	   cursor is used (except for text mode).
-noblink  - disables cursor blinking. Cursor in text mode always blinks (hw
-           limitation).
-blink    - enables cursor blinking. It is default.
-nofastfont - disables fastfont feature. It is default.
-fastfont:X - enables fastfont feature. X specifies size of memory reserved for
-             font data, it must be >= (fontwidth*fontheight*chars_in_font)/8.
-	     It is faster on Gx00 series, but slower on older cards.
-grayscale - enable grayscale summing. It works in PSEUDOCOLOR modes (text,
-            4bpp, 8bpp). In DIRECTCOLOR modes it is limited to characters
-	    displayed through putc/putcs. Direct accesses to framebuffer
-	    can paint colors.
-nograyscale - disable grayscale summing. It is default.
-cross4MB - enables that pixel line can cross 4MB boundary. It is default for
-           non-Millennium.
-nocross4MB - pixel line must not cross 4MB boundary. It is default for
-             Millennium I or II, because of these devices have hardware
-	     limitations which do not allow this. But this option is
-	     incompatible with some (if not all yet released) versions of
-	     XF86_FBDev.
-dfp      - enables digital flat panel interface. This option is incompatible with
-           secondary (TV) output - if DFP is active, TV output must be
-	   inactive and vice versa. DFP always uses same timing as primary
-	   (monitor) output.
-dfp:X    - use settings X for digital flat panel interface. X is number from
-           0 to 0xFF, and meaning of each individual bit is described in
-	   G400 manual, in description of DAC register 0x1F. For normal operation
-	   you should set all bits to zero, except lowest bit. This lowest bit
-	   selects who is source of display clocks, whether G400, or panel.
-	   Default value is now read back from hardware - so you should specify
-	   this value only if you are also using `init' parameter.
-outputs:XYZ - set mapping between CRTC and outputs. Each letter can have value
-           of 0 (for no CRTC), 1 (CRTC1) or 2 (CRTC2), and first letter corresponds
-	   to primary analog output, second letter to the secondary analog output
-	   and third letter to the DVI output. Default setting is 100 for
-	   cards below G400 or G400 without DFP, 101 for G400 with DFP, and
-	   111 for G450 and G550. You can set mapping only on first card,
-	   use matroxset for setting up other devices.
-vesa:X   - selects startup videomode. X is number from 0 to 0x1FF, see table
-           above for detailed explanation. Default is 640x480x8bpp if driver
-	   has 8bpp support. Otherwise first available of 640x350x4bpp,
-	   640x480x15bpp, 640x480x24bpp, 640x480x32bpp or 80x25 text
-	   (80x25 text is always available).
-
-If you are not satisfied with videomode selected by `vesa' option, you
-can modify it with these options:
-
-xres:X   - horizontal resolution, in pixels. Default is derived from `vesa'
-           option.
-yres:X   - vertical resolution, in pixel lines. Default is derived from `vesa'
-           option.
-upper:X  - top boundary: lines between end of VSYNC pulse and start of first
-           pixel line of picture. Default is derived from `vesa' option.
-lower:X  - bottom boundary: lines between end of picture and start of VSYNC
-           pulse. Default is derived from `vesa' option.
-vslen:X  - length of VSYNC pulse, in lines. Default is derived from `vesa'
-           option.
-left:X   - left boundary: pixels between end of HSYNC pulse and first pixel.
-           Default is derived from `vesa' option.
-right:X  - right boundary: pixels between end of picture and start of HSYNC
-           pulse. Default is derived from `vesa' option.
-hslen:X  - length of HSYNC pulse, in pixels. Default is derived from `vesa'
-           option.
-pixclock:X - dotclocks, in ps (picoseconds). Default is derived from `vesa'
-             option and from `fh' and `fv' options.
-sync:X   - sync. pulse - bit 0 inverts HSYNC polarity, bit 1 VSYNC polarity.
-           If bit 3 (value 0x08) is set, composite sync instead of HSYNC is
-	   generated. If bit 5 (value 0x20) is set, sync on green is turned on.
-	   Do not forget that if you want sync on green, you also probably
-	   want composite sync.
-	   Default depends on `vesa'.
-depth:X  - Bits per pixel: 0=text, 4,8,15,16,24 or 32. Default depends on
-           `vesa'.
-
-If you know capabilities of your monitor, you can specify some (or all) of
-`maxclk', `fh' and `fv'. In this case, `pixclock' is computed so that
-pixclock <= maxclk, real_fh <= fh and real_fv <= fv.
-
-maxclk:X - maximum dotclock. X can be specified in MHz, kHz or Hz. Default is
-           `don't care'.
-fh:X     - maximum horizontal synchronization frequency. X can be specified
-           in kHz or Hz. Default is `don't care'.
-fv:X     - maximum vertical frequency. X must be specified in Hz. Default is
-           70 for modes derived from `vesa' with yres <= 400, 60Hz for
-	   yres > 400.
-
-
-Limitations
-===========
-
-There are known and unknown bugs, features and misfeatures.
-Currently there are following known bugs:
- + SVGALib does not restore screen on exit
- + generic fbcon-cfbX procedures do not work on Alphas. Due to this,
-   `noaccel' (and cfb4 accel) driver does not work on Alpha. So everyone
-   with access to /dev/fb* on Alpha can hang machine (you should restrict
-   access to /dev/fb* - everyone with access to this device can destroy
-   your monitor, believe me...).
- + 24bpp does not support correctly XF-FBDev on big-endian architectures.
- + interlaced text mode is not supported; it looks like hardware limitation,
-   but I'm not sure.
- + Gxx0 SGRAM/SDRAM is not autodetected.
- + If you are using more than one framebuffer device, you must boot kernel
-   with 'video=scrollback:0'.
- + maybe more...
-And following misfeatures:
- + SVGALib does not restore screen on exit.
- + pixclock for text modes is limited by hardware to
-    83 MHz on G200
-    66 MHz on Millennium I
-    60 MHz on Millennium II
-   Because I have no access to other devices, I do not know specific
-   frequencies for them. So driver does not check this and allows you to
-   set frequency higher that this. It causes sparks, black holes and other
-   pretty effects on screen. Device was not destroyed during tests. :-)
- + my Millennium G200 oscillator has frequency range from 35 MHz to 380 MHz
-   (and it works with 8bpp on about 320 MHz dotclocks (and changed mclk)).
-   But Matrox says on product sheet that VCO limit is 50-250 MHz, so I believe
-   them (maybe that chip overheats, but it has a very big cooler (G100 has
-   none), so it should work).
- + special mixed video/graphics videomodes of Mystique and Gx00 - 2G8V16 and
-   G16V16 are not supported
- + color keying is not supported
- + feature connector of Mystique and Gx00 is set to VGA mode (it is disabled
-   by BIOS)
- + DDC (monitor detection) is supported through dualhead driver
- + some check for input values are not so strict how it should be (you can
-   specify vslen=4000 and so on).
- + maybe more...
-And following features:
- + 4bpp is available only on Millennium I and Millennium II. It is hardware
-   limitation.
- + selection between 1:5:5:5 and 5:6:5 16bpp videomode is done by -rgba 
-   option of fbset: "fbset -depth 16 -rgba 5,5,5" selects 1:5:5:5, anything
-   else selects 5:6:5 mode.
- + text mode uses 6 bit VGA palette instead of 8 bit (one of 262144 colors
-   instead of one of 16M colors). It is due to hardware limitation of 
-   Millennium I/II and SVGALib compatibility.
-
-
-Benchmarks
-==========
-It is time to redraw whole screen 1000 times in 1024x768, 60Hz. It is
-time for draw 6144000 characters on screen through /dev/vcsa
-(for 32bpp it is about 3GB of data (exactly 3000 MB); for 8x16 font in 
-16 seconds, i.e. 187 MBps).
-Times were obtained from one older version of driver, now they are about 3%
-faster, it is kernel-space only time on P-II/350 MHz, Millennium I in 33 MHz
-PCI slot, G200 in AGP 2x slot. I did not test vgacon.
-
-NOACCEL
-        8x16                 12x22
-        Millennium I  G200   Millennium I  G200
-8bpp    16.42         9.54   12.33         9.13
-16bpp   21.00        15.70   19.11        15.02
-24bpp   36.66        36.66   35.00        35.00
-32bpp   35.00        30.00   33.85        28.66
-
-ACCEL, nofastfont
-        8x16                 12x22                6x11
-	Millennium I  G200   Millennium I  G200   Millennium I  G200
-8bpp     7.79         7.24   13.55         7.78   30.00        21.01
-16bpp    9.13         7.78   16.16         7.78   30.00        21.01
-24bpp   14.17        10.72   18.69        10.24   34.99        21.01
-32bpp   16.15	     16.16   18.73        13.09   34.99        21.01
-
-ACCEL, fastfont
-        8x16                 12x22                6x11
-	Millennium I  G200   Millennium I  G200   Millennium I  G200
-8bpp     8.41         6.01    6.54         4.37   16.00        10.51
-16bpp    9.54         9.12    8.76         6.17   17.52        14.01
-24bpp   15.00        12.36   11.67        10.00   22.01        18.32
-32bpp   16.18        18.29*  12.71        12.74   24.44        21.00
-
-TEXT
-        8x16
-	Millennium I  G200
-TEXT     3.29         1.50
-
-* Yes, it is slower than Millennium I.
-
-
-Dualhead G400
-=============
-Driver supports dualhead G400 with some limitations:
- + secondary head shares videomemory with primary head. It is not problem
-   if you have 32MB of videoram, but if you have only 16MB, you may have
-   to think twice before choosing videomode (for example twice 1880x1440x32bpp
-   is not possible).
- + due to hardware limitation, secondary head can use only 16 and 32bpp
-   videomodes.
- + secondary head is not accelerated. There were bad problems with accelerated
-   XFree when secondary head used to use acceleration.
- + secondary head always powerups in 640x480@60-32 videomode. You have to use
-   fbset to change this mode.
- + secondary head always powerups in monitor mode. You have to use fbmatroxset
-   to change it to TV mode. Also, you must select at least 525 lines for
-   NTSC output and 625 lines for PAL output.
- + kernel is not fully multihead ready. So some things are impossible to do.
- + if you compiled it as module, you must insert i2c-matroxfb, matroxfb_maven
-   and matroxfb_crtc2 into kernel.
-
-
-Dualhead G450
-=============
-Driver supports dualhead G450 with some limitations:
- + secondary head shares videomemory with primary head. It is not problem
-   if you have 32MB of videoram, but if you have only 16MB, you may have
-   to think twice before choosing videomode.
- + due to hardware limitation, secondary head can use only 16 and 32bpp
-   videomodes.
- + secondary head is not accelerated.
- + secondary head always powerups in 640x480@60-32 videomode. You have to use
-   fbset to change this mode.
- + TV output is not supported
- + kernel is not fully multihead ready, so some things are impossible to do.
- + if you compiled it as module, you must insert matroxfb_g450 and matroxfb_crtc2
-   into kernel.
-	
---
-Petr Vandrovec <vandrove@vc.cvut.cz>
diff --git a/Documentation/fb/metronomefb.rst b/Documentation/fb/metronomefb.rst
new file mode 100644
index 000000000000..63e1d31a7e54
--- /dev/null
+++ b/Documentation/fb/metronomefb.rst
@@ -0,0 +1,38 @@
+===========
+Metronomefb
+===========
+
+Maintained by Jaya Kumar <jayakumar.lkml.gmail.com>
+
+Last revised: Mar 10, 2008
+
+Metronomefb is a driver for the Metronome display controller. The controller
+is from E-Ink Corporation. It is intended to be used to drive the E-Ink
+Vizplex display media. E-Ink hosts some details of this controller and the
+display media here http://www.e-ink.com/products/matrix/metronome.html .
+
+Metronome is interfaced to the host CPU through the AMLCD interface. The
+host CPU generates the control information and the image in a framebuffer
+which is then delivered to the AMLCD interface by a host specific method.
+The display and error status are each pulled through individual GPIOs.
+
+Metronomefb is platform independent and depends on a board specific driver
+to do all physical IO work. Currently, an example is implemented for the
+PXA board used in the AM-200 EPD devkit. This example is am200epd.c
+
+Metronomefb requires waveform information which is delivered via the AMLCD
+interface to the metronome controller. The waveform information is expected to
+be delivered from userspace via the firmware class interface. The waveform file
+can be compressed as long as your udev or hotplug script is aware of the need
+to uncompress it before delivering it. metronomefb will ask for metronome.wbf
+which would typically go into /lib/firmware/metronome.wbf depending on your
+udev/hotplug setup. I have only tested with a single waveform file which was
+originally labeled 23P01201_60_WT0107_MTC. I do not know what it stands for.
+Caution should be exercised when manipulating the waveform as there may be
+a possibility that it could have some permanent effects on the display media.
+I neither have access to nor know exactly what the waveform does in terms of
+the physical media.
+
+Metronomefb uses the deferred IO interface so that it can provide a memory
+mappable frame buffer. It has been tested with tinyx (Xfbdev). It is known
+to work at this time with xeyes, xclock, xloadimage, xpdf.
diff --git a/Documentation/fb/metronomefb.txt b/Documentation/fb/metronomefb.txt
deleted file mode 100644
index 237ca412582d..000000000000
--- a/Documentation/fb/metronomefb.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-			Metronomefb
-			-----------
-Maintained by Jaya Kumar <jayakumar.lkml.gmail.com>
-Last revised: Mar 10, 2008
-
-Metronomefb is a driver for the Metronome display controller. The controller
-is from E-Ink Corporation. It is intended to be used to drive the E-Ink
-Vizplex display media. E-Ink hosts some details of this controller and the
-display media here http://www.e-ink.com/products/matrix/metronome.html .
-
-Metronome is interfaced to the host CPU through the AMLCD interface. The
-host CPU generates the control information and the image in a framebuffer
-which is then delivered to the AMLCD interface by a host specific method.
-The display and error status are each pulled through individual GPIOs.
-
-Metronomefb is platform independent and depends on a board specific driver
-to do all physical IO work. Currently, an example is implemented for the
-PXA board used in the AM-200 EPD devkit. This example is am200epd.c
-
-Metronomefb requires waveform information which is delivered via the AMLCD
-interface to the metronome controller. The waveform information is expected to
-be delivered from userspace via the firmware class interface. The waveform file
-can be compressed as long as your udev or hotplug script is aware of the need
-to uncompress it before delivering it. metronomefb will ask for metronome.wbf
-which would typically go into /lib/firmware/metronome.wbf depending on your
-udev/hotplug setup. I have only tested with a single waveform file which was
-originally labeled 23P01201_60_WT0107_MTC. I do not know what it stands for.
-Caution should be exercised when manipulating the waveform as there may be
-a possibility that it could have some permanent effects on the display media.
-I neither have access to nor know exactly what the waveform does in terms of
-the physical media.
-
-Metronomefb uses the deferred IO interface so that it can provide a memory
-mappable frame buffer. It has been tested with tinyx (Xfbdev). It is known
-to work at this time with xeyes, xclock, xloadimage, xpdf.
-
diff --git a/Documentation/fb/modedb.rst b/Documentation/fb/modedb.rst
new file mode 100644
index 000000000000..9c4e3fd39e6d
--- /dev/null
+++ b/Documentation/fb/modedb.rst
@@ -0,0 +1,169 @@
+=================================
+modedb default video mode support
+=================================
+
+
+Currently all frame buffer device drivers have their own video mode databases,
+which is a mess and a waste of resources. The main idea of modedb is to have
+
+  - one routine to probe for video modes, which can be used by all frame buffer
+    devices
+  - one generic video mode database with a fair amount of standard videomodes
+    (taken from XFree86)
+  - the possibility to supply your own mode database for graphics hardware that
+    needs non-standard modes, like amifb and Mac frame buffer drivers (which
+    use macmodes.c)
+
+When a frame buffer device receives a video= option it doesn't know, it should
+consider that to be a video mode option. If no frame buffer device is specified
+in a video= option, fbmem considers that to be a global video mode option.
+
+Valid mode specifiers (mode_option argument)::
+
+    <xres>x<yres>[M][R][-<bpp>][@<refresh>][i][m][eDd]
+    <name>[-<bpp>][@<refresh>]
+
+with <xres>, <yres>, <bpp> and <refresh> decimal numbers and <name> a string.
+Things between square brackets are optional.
+
+If 'M' is specified in the mode_option argument (after <yres> and before
+<bpp> and <refresh>, if specified) the timings will be calculated using
+VESA(TM) Coordinated Video Timings instead of looking up the mode from a table.
+If 'R' is specified, do a 'reduced blanking' calculation for digital displays.
+If 'i' is specified, calculate for an interlaced mode.  And if 'm' is
+specified, add margins to the calculation (1.8% of xres rounded down to 8
+pixels and 1.8% of yres).
+
+       Sample usage: 1024x768M@60m - CVT timing with margins
+
+DRM drivers also add options to enable or disable outputs:
+
+'e' will force the display to be enabled, i.e. it will override the detection
+if a display is connected. 'D' will force the display to be enabled and use
+digital output. This is useful for outputs that have both analog and digital
+signals (e.g. HDMI and DVI-I). For other outputs it behaves like 'e'. If 'd'
+is specified the output is disabled.
+
+You can additionally specify which output the options matches to.
+To force the VGA output to be enabled and drive a specific mode say::
+
+    video=VGA-1:1280x1024@60me
+
+Specifying the option multiple times for different ports is possible, e.g.::
+
+    video=LVDS-1:d video=HDMI-1:D
+
+Options can also be passed after the mode, using commas as separator.
+
+       Sample usage: 720x480,rotate=180 - 720x480 mode, rotated by 180 degrees
+
+Valid options are::
+
+  - margin_top, margin_bottom, margin_left, margin_right (integer):
+    Number of pixels in the margins, typically to deal with overscan on TVs
+  - reflect_x (boolean): Perform an axial symmetry on the X axis
+  - reflect_y (boolean): Perform an axial symmetry on the Y axis
+  - rotate (integer): Rotate the initial framebuffer by x
+    degrees. Valid values are 0, 90, 180 and 270.
+
+
+-----------------------------------------------------------------------------
+
+What is the VESA(TM) Coordinated Video Timings (CVT)?
+=====================================================
+
+From the VESA(TM) Website:
+
+     "The purpose of CVT is to provide a method for generating a consistent
+      and coordinated set of standard formats, display refresh rates, and
+      timing specifications for computer display products, both those
+      employing CRTs, and those using other display technologies. The
+      intention of CVT is to give both source and display manufacturers a
+      common set of tools to enable new timings to be developed in a
+      consistent manner that ensures greater compatibility."
+
+This is the third standard approved by VESA(TM) concerning video timings.  The
+first was the Discrete Video Timings (DVT) which is  a collection of
+pre-defined modes approved by VESA(TM).  The second is the Generalized Timing
+Formula (GTF) which is an algorithm to calculate the timings, given the
+pixelclock, the horizontal sync frequency, or the vertical refresh rate.
+
+The GTF is limited by the fact that it is designed mainly for CRT displays.
+It artificially increases the pixelclock because of its high blanking
+requirement. This is inappropriate for digital display interface with its high
+data rate which requires that it conserves the pixelclock as much as possible.
+Also, GTF does not take into account the aspect ratio of the display.
+
+The CVT addresses these limitations.  If used with CRT's, the formula used
+is a derivation of GTF with a few modifications.  If used with digital
+displays, the "reduced blanking" calculation can be used.
+
+From the framebuffer subsystem perspective, new formats need not be added
+to the global mode database whenever a new mode is released by display
+manufacturers. Specifying for CVT will work for most, if not all, relatively
+new CRT displays and probably with most flatpanels, if 'reduced blanking'
+calculation is specified.  (The CVT compatibility of the display can be
+determined from its EDID. The version 1.3 of the EDID has extra 128-byte
+blocks where additional timing information is placed.  As of this time, there
+is no support yet in the layer to parse this additional blocks.)
+
+CVT also introduced a new naming convention (should be seen from dmesg output)::
+
+    <pix>M<a>[-R]
+
+    where: pix = total amount of pixels in MB (xres x yres)
+	   M   = always present
+	   a   = aspect ratio (3 - 4:3; 4 - 5:4; 9 - 15:9, 16:9; A - 16:10)
+	  -R   = reduced blanking
+
+	  example:  .48M3-R - 800x600 with reduced blanking
+
+Note: VESA(TM) has restrictions on what is a standard CVT timing:
+
+      - aspect ratio can only be one of the above values
+      - acceptable refresh rates are 50, 60, 70 or 85 Hz only
+      - if reduced blanking, the refresh rate must be at 60Hz
+
+If one of the above are not satisfied, the kernel will print a warning but the
+timings will still be calculated.
+
+-----------------------------------------------------------------------------
+
+To find a suitable video mode, you just call::
+
+  int __init fb_find_mode(struct fb_var_screeninfo *var,
+			  struct fb_info *info, const char *mode_option,
+			  const struct fb_videomode *db, unsigned int dbsize,
+			  const struct fb_videomode *default_mode,
+			  unsigned int default_bpp)
+
+with db/dbsize your non-standard video mode database, or NULL to use the
+standard video mode database.
+
+fb_find_mode() first tries the specified video mode (or any mode that matches,
+e.g. there can be multiple 640x480 modes, each of them is tried). If that
+fails, the default mode is tried. If that fails, it walks over all modes.
+
+To specify a video mode at bootup, use the following boot options::
+
+    video=<driver>:<xres>x<yres>[-<bpp>][@refresh]
+
+where <driver> is a name from the table below.  Valid default modes can be
+found in linux/drivers/video/modedb.c.  Check your driver's documentation.
+There may be more modes::
+
+    Drivers that support modedb boot options
+    Boot Name	  Cards Supported
+
+    amifb	- Amiga chipset frame buffer
+    aty128fb	- ATI Rage128 / Pro frame buffer
+    atyfb	- ATI Mach64 frame buffer
+    pm2fb	- Permedia 2/2V frame buffer
+    pm3fb	- Permedia 3 frame buffer
+    sstfb	- Voodoo 1/2 (SST1) chipset frame buffer
+    tdfxfb	- 3D Fx frame buffer
+    tridentfb	- Trident (Cyber)blade chipset frame buffer
+    vt8623fb	- VIA 8623 frame buffer
+
+BTW, only a few fb drivers use this at the moment. Others are to follow
+(feel free to send patches). The DRM drivers also support this.
diff --git a/Documentation/fb/modedb.txt b/Documentation/fb/modedb.txt
deleted file mode 100644
index 16aa08453911..000000000000
--- a/Documentation/fb/modedb.txt
+++ /dev/null
@@ -1,151 +0,0 @@
-
-
-			modedb default video mode support
-
-
-Currently all frame buffer device drivers have their own video mode databases,
-which is a mess and a waste of resources. The main idea of modedb is to have
-
-  - one routine to probe for video modes, which can be used by all frame buffer
-    devices
-  - one generic video mode database with a fair amount of standard videomodes
-    (taken from XFree86)
-  - the possibility to supply your own mode database for graphics hardware that
-    needs non-standard modes, like amifb and Mac frame buffer drivers (which
-    use macmodes.c)
-
-When a frame buffer device receives a video= option it doesn't know, it should
-consider that to be a video mode option. If no frame buffer device is specified
-in a video= option, fbmem considers that to be a global video mode option.
-
-Valid mode specifiers (mode_option argument):
-
-    <xres>x<yres>[M][R][-<bpp>][@<refresh>][i][m][eDd]
-    <name>[-<bpp>][@<refresh>]
-
-with <xres>, <yres>, <bpp> and <refresh> decimal numbers and <name> a string.
-Things between square brackets are optional.
-
-If 'M' is specified in the mode_option argument (after <yres> and before
-<bpp> and <refresh>, if specified) the timings will be calculated using
-VESA(TM) Coordinated Video Timings instead of looking up the mode from a table.
-If 'R' is specified, do a 'reduced blanking' calculation for digital displays.
-If 'i' is specified, calculate for an interlaced mode.  And if 'm' is
-specified, add margins to the calculation (1.8% of xres rounded down to 8
-pixels and 1.8% of yres).
-
-       Sample usage: 1024x768M@60m - CVT timing with margins
-
-DRM drivers also add options to enable or disable outputs:
-
-'e' will force the display to be enabled, i.e. it will override the detection
-if a display is connected. 'D' will force the display to be enabled and use
-digital output. This is useful for outputs that have both analog and digital
-signals (e.g. HDMI and DVI-I). For other outputs it behaves like 'e'. If 'd'
-is specified the output is disabled.
-
-You can additionally specify which output the options matches to.
-To force the VGA output to be enabled and drive a specific mode say:
-    video=VGA-1:1280x1024@60me
-
-Specifying the option multiple times for different ports is possible, e.g.:
-    video=LVDS-1:d video=HDMI-1:D
-
-***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo *****
-
-What is the VESA(TM) Coordinated Video Timings (CVT)?
-
-From the VESA(TM) Website:
-
-     "The purpose of CVT is to provide a method for generating a consistent
-      and coordinated set of standard formats, display refresh rates, and
-      timing specifications for computer display products, both those
-      employing CRTs, and those using other display technologies. The
-      intention of CVT is to give both source and display manufacturers a
-      common set of tools to enable new timings to be developed in a
-      consistent manner that ensures greater compatibility."
-
-This is the third standard approved by VESA(TM) concerning video timings.  The
-first was the Discrete Video Timings (DVT) which is  a collection of
-pre-defined modes approved by VESA(TM).  The second is the Generalized Timing
-Formula (GTF) which is an algorithm to calculate the timings, given the
-pixelclock, the horizontal sync frequency, or the vertical refresh rate.
-
-The GTF is limited by the fact that it is designed mainly for CRT displays.
-It artificially increases the pixelclock because of its high blanking
-requirement. This is inappropriate for digital display interface with its high
-data rate which requires that it conserves the pixelclock as much as possible.
-Also, GTF does not take into account the aspect ratio of the display.
-
-The CVT addresses these limitations.  If used with CRT's, the formula used
-is a derivation of GTF with a few modifications.  If used with digital
-displays, the "reduced blanking" calculation can be used.
-
-From the framebuffer subsystem perspective, new formats need not be added
-to the global mode database whenever a new mode is released by display
-manufacturers. Specifying for CVT will work for most, if not all, relatively
-new CRT displays and probably with most flatpanels, if 'reduced blanking'
-calculation is specified.  (The CVT compatibility of the display can be
-determined from its EDID. The version 1.3 of the EDID has extra 128-byte
-blocks where additional timing information is placed.  As of this time, there
-is no support yet in the layer to parse this additional blocks.)
-
-CVT also introduced a new naming convention (should be seen from dmesg output):
-
-    <pix>M<a>[-R]
-
-    where: pix = total amount of pixels in MB (xres x yres)
-           M   = always present
-           a   = aspect ratio (3 - 4:3; 4 - 5:4; 9 - 15:9, 16:9; A - 16:10)
-          -R   = reduced blanking
-
-	  example:  .48M3-R - 800x600 with reduced blanking
-
-Note: VESA(TM) has restrictions on what is a standard CVT timing:
-
-      - aspect ratio can only be one of the above values
-      - acceptable refresh rates are 50, 60, 70 or 85 Hz only
-      - if reduced blanking, the refresh rate must be at 60Hz
-
-If one of the above are not satisfied, the kernel will print a warning but the
-timings will still be calculated.
-
-***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo *****
-
-To find a suitable video mode, you just call
-
-int __init fb_find_mode(struct fb_var_screeninfo *var,
-                        struct fb_info *info, const char *mode_option,
-                        const struct fb_videomode *db, unsigned int dbsize,
-                        const struct fb_videomode *default_mode,
-                        unsigned int default_bpp)
-
-with db/dbsize your non-standard video mode database, or NULL to use the
-standard video mode database.
-
-fb_find_mode() first tries the specified video mode (or any mode that matches,
-e.g. there can be multiple 640x480 modes, each of them is tried). If that
-fails, the default mode is tried. If that fails, it walks over all modes.
-
-To specify a video mode at bootup, use the following boot options:
-    video=<driver>:<xres>x<yres>[-<bpp>][@refresh]
-
-where <driver> is a name from the table below.  Valid default modes can be
-found in linux/drivers/video/modedb.c.  Check your driver's documentation.
-There may be more modes.
-
-    Drivers that support modedb boot options
-    Boot Name	  Cards Supported
-
-    amifb	- Amiga chipset frame buffer
-    aty128fb	- ATI Rage128 / Pro frame buffer
-    atyfb	- ATI Mach64 frame buffer
-    pm2fb	- Permedia 2/2V frame buffer
-    pm3fb	- Permedia 3 frame buffer
-    sstfb	- Voodoo 1/2 (SST1) chipset frame buffer
-    tdfxfb	- 3D Fx frame buffer
-    tridentfb	- Trident (Cyber)blade chipset frame buffer
-    vt8623fb	- VIA 8623 frame buffer
-
-BTW, only a few fb drivers use this at the moment. Others are to follow
-(feel free to send patches). The DRM drivers also support this.
diff --git a/Documentation/fb/pvr2fb.rst b/Documentation/fb/pvr2fb.rst
new file mode 100644
index 000000000000..fcf2c21c8fcf
--- /dev/null
+++ b/Documentation/fb/pvr2fb.rst
@@ -0,0 +1,66 @@
+===============
+What is pvr2fb?
+===============
+
+This is a driver for PowerVR 2 based graphics frame buffers, such as the
+one found in the Dreamcast.
+
+Advantages:
+
+ * It provides a nice large console (128 cols + 48 lines with 1024x768)
+   without using tiny, unreadable fonts (NOT on the Dreamcast)
+ * You can run XF86_FBDev on top of /dev/fb0
+ * Most important: boot logo :-)
+
+Disadvantages:
+
+ * Driver is largely untested on non-Dreamcast systems.
+
+Configuration
+=============
+
+You can pass kernel command line options to pvr2fb with
+`video=pvr2fb:option1,option2:value2,option3` (multiple options should be
+separated by comma, values are separated from options by `:`).
+
+Accepted options:
+
+==========  ==================================================================
+font:X      default font to use. All fonts are supported, including the
+	    SUN12x22 font which is very nice at high resolutions.
+
+
+mode:X      default video mode with format [xres]x[yres]-<bpp>@<refresh rate>
+	    The following video modes are supported:
+	    640x640-16@60, 640x480-24@60, 640x480-32@60. The Dreamcast
+	    defaults to 640x480-16@60. At the time of writing the
+	    24bpp and 32bpp modes function poorly. Work to fix that is
+	    ongoing
+
+	    Note: the 640x240 mode is currently broken, and should not be
+	    used for any reason. It is only mentioned here as a reference.
+
+inverse     invert colors on screen (for LCD displays)
+
+nomtrr      disables write combining on frame buffer. This slows down driver
+	    but there is reported minor incompatibility between GUS DMA and
+	    XFree under high loads if write combining is enabled (sound
+	    dropouts). MTRR is enabled by default on systems that have it
+	    configured and that support it.
+
+cable:X     cable type. This can be any of the following: vga, rgb, and
+	    composite. If none is specified, we guess.
+
+output:X    output type. This can be any of the following: pal, ntsc, and
+	    vga. If none is specified, we guess.
+==========  ==================================================================
+
+X11
+===
+
+XF86_FBDev has been shown to work on the Dreamcast in the past - though not yet
+on any 2.6 series kernel.
+
+Paul Mundt <lethal@linuxdc.org>
+
+Updated by Adrian McMenamin <adrian@mcmen.demon.co.uk>
diff --git a/Documentation/fb/pvr2fb.txt b/Documentation/fb/pvr2fb.txt
deleted file mode 100644
index 36bdeff585e2..000000000000
--- a/Documentation/fb/pvr2fb.txt
+++ /dev/null
@@ -1,65 +0,0 @@
-$Id: pvr2fb.txt,v 1.1 2001/05/24 05:09:16 mrbrown Exp $
-
-What is pvr2fb?
-===============
-
-This is a driver for PowerVR 2 based graphics frame buffers, such as the
-one found in the Dreamcast.
-
-Advantages:
-
- * It provides a nice large console (128 cols + 48 lines with 1024x768)
-   without using tiny, unreadable fonts (NOT on the Dreamcast)
- * You can run XF86_FBDev on top of /dev/fb0
- * Most important: boot logo :-)
-
-Disadvantages:
-
- * Driver is largely untested on non-Dreamcast systems.
-
-Configuration
-=============
-
-You can pass kernel command line options to pvr2fb with
-`video=pvr2fb:option1,option2:value2,option3' (multiple options should be
-separated by comma, values are separated from options by `:').
-Accepted options:
-
-font:X    - default font to use. All fonts are supported, including the
-            SUN12x22 font which is very nice at high resolutions.
-
-	    
-mode:X    - default video mode with format [xres]x[yres]-<bpp>@<refresh rate>
-            The following video modes are supported:
-            640x640-16@60, 640x480-24@60, 640x480-32@60. The Dreamcast
-            defaults to 640x480-16@60. At the time of writing the
-            24bpp and 32bpp modes function poorly. Work to fix that is
-            ongoing
-
-            Note: the 640x240 mode is currently broken, and should not be
-            used for any reason. It is only mentioned here as a reference.
-
-inverse   - invert colors on screen (for LCD displays)
-
-nomtrr    - disables write combining on frame buffer. This slows down driver
-            but there is reported minor incompatibility between GUS DMA and
-            XFree under high loads if write combining is enabled (sound
-            dropouts). MTRR is enabled by default on systems that have it
-            configured and that support it.
-
-cable:X   - cable type. This can be any of the following: vga, rgb, and
-            composite. If none is specified, we guess.
-
-output:X  - output type. This can be any of the following: pal, ntsc, and
-            vga. If none is specified, we guess.
-
-X11
-===
-
-XF86_FBDev has been shown to work on the Dreamcast in the past - though not yet
-on any 2.6 series kernel.
-
---
-Paul Mundt <lethal@linuxdc.org>
-Updated by Adrian McMenamin <adrian@mcmen.demon.co.uk>
-
diff --git a/Documentation/fb/pxafb.rst b/Documentation/fb/pxafb.rst
new file mode 100644
index 000000000000..90177f5e7e76
--- /dev/null
+++ b/Documentation/fb/pxafb.rst
@@ -0,0 +1,173 @@
+================================
+Driver for PXA25x LCD controller
+================================
+
+The driver supports the following options, either via
+options=<OPTIONS> when modular or video=pxafb:<OPTIONS> when built in.
+
+For example::
+
+	modprobe pxafb options=vmem:2M,mode:640x480-8,passive
+
+or on the kernel command line::
+
+	video=pxafb:vmem:2M,mode:640x480-8,passive
+
+vmem: VIDEO_MEM_SIZE
+
+	Amount of video memory to allocate (can be suffixed with K or M
+	for kilobytes or megabytes)
+
+mode:XRESxYRES[-BPP]
+
+	XRES == LCCR1_PPL + 1
+
+	YRES == LLCR2_LPP + 1
+
+		The resolution of the display in pixels
+
+	BPP == The bit depth. Valid values are 1, 2, 4, 8 and 16.
+
+pixclock:PIXCLOCK
+
+	Pixel clock in picoseconds
+
+left:LEFT == LCCR1_BLW + 1
+
+right:RIGHT == LCCR1_ELW + 1
+
+hsynclen:HSYNC == LCCR1_HSW + 1
+
+upper:UPPER == LCCR2_BFW
+
+lower:LOWER == LCCR2_EFR
+
+vsynclen:VSYNC == LCCR2_VSW + 1
+
+	Display margins and sync times
+
+color | mono => LCCR0_CMS
+
+	umm...
+
+active | passive => LCCR0_PAS
+
+	Active (TFT) or Passive (STN) display
+
+single | dual => LCCR0_SDS
+
+	Single or dual panel passive display
+
+4pix | 8pix => LCCR0_DPD
+
+	4 or 8 pixel monochrome single panel data
+
+hsync:HSYNC, vsync:VSYNC
+
+	Horizontal and vertical sync. 0 => active low, 1 => active
+	high.
+
+dpc:DPC
+
+	Double pixel clock. 1=>true, 0=>false
+
+outputen:POLARITY
+
+	Output Enable Polarity. 0 => active low, 1 => active high
+
+pixclockpol:POLARITY
+
+	pixel clock polarity
+	0 => falling edge, 1 => rising edge
+
+
+Overlay Support for PXA27x and later LCD controllers
+====================================================
+
+  PXA27x and later processors support overlay1 and overlay2 on-top of the
+  base framebuffer (although under-neath the base is also possible). They
+  support palette and no-palette RGB formats, as well as YUV formats (only
+  available on overlay2). These overlays have dedicated DMA channels and
+  behave in a similar way as a framebuffer.
+
+  However, there are some differences between these overlay framebuffers
+  and normal framebuffers, as listed below:
+
+  1. overlay can start at a 32-bit word aligned position within the base
+     framebuffer, which means they have a start (x, y). This information
+     is encoded into var->nonstd (no, var->xoffset and var->yoffset are
+     not for such purpose).
+
+  2. overlay framebuffer is allocated dynamically according to specified
+     'struct fb_var_screeninfo', the amount is decided by::
+
+	var->xres_virtual * var->yres_virtual * bpp
+
+     bpp = 16 -- for RGB565 or RGBT555
+
+     bpp = 24 -- for YUV444 packed
+
+     bpp = 24 -- for YUV444 planar
+
+     bpp = 16 -- for YUV422 planar (1 pixel = 1 Y + 1/2 Cb + 1/2 Cr)
+
+     bpp = 12 -- for YUV420 planar (1 pixel = 1 Y + 1/4 Cb + 1/4 Cr)
+
+     NOTE:
+
+     a. overlay does not support panning in x-direction, thus
+	var->xres_virtual will always be equal to var->xres
+
+     b. line length of overlay(s) must be on a 32-bit word boundary,
+	for YUV planar modes, it is a requirement for the component
+	with minimum bits per pixel,  e.g. for YUV420, Cr component
+	for one pixel is actually 2-bits, it means the line length
+	should be a multiple of 16-pixels
+
+     c. starting horizontal position (XPOS) should start on a 32-bit
+	word boundary, otherwise the fb_check_var() will just fail.
+
+     d. the rectangle of the overlay should be within the base plane,
+	otherwise fail
+
+     Applications should follow the sequence below to operate an overlay
+     framebuffer:
+
+	 a. open("/dev/fb[1-2]", ...)
+	 b. ioctl(fd, FBIOGET_VSCREENINFO, ...)
+	 c. modify 'var' with desired parameters:
+
+	    1) var->xres and var->yres
+	    2) larger var->yres_virtual if more memory is required,
+	       usually for double-buffering
+	    3) var->nonstd for starting (x, y) and color format
+	    4) var->{red, green, blue, transp} if RGB mode is to be used
+
+	 d. ioctl(fd, FBIOPUT_VSCREENINFO, ...)
+	 e. ioctl(fd, FBIOGET_FSCREENINFO, ...)
+	 f. mmap
+	 g. ...
+
+  3. for YUV planar formats, these are actually not supported within the
+     framebuffer framework, application has to take care of the offsets
+     and lengths of each component within the framebuffer.
+
+  4. var->nonstd is used to pass starting (x, y) position and color format,
+     the detailed bit fields are shown below::
+
+      31                23  20         10          0
+       +-----------------+---+----------+----------+
+       |  ... unused ... |FOR|   XPOS   |   YPOS   |
+       +-----------------+---+----------+----------+
+
+     FOR  - color format, as defined by OVERLAY_FORMAT_* in pxafb.h
+
+	  - 0 - RGB
+	  - 1 - YUV444 PACKED
+	  - 2 - YUV444 PLANAR
+	  - 3 - YUV422 PLANAR
+	  - 4 - YUR420 PLANAR
+
+     XPOS - starting horizontal position
+
+     YPOS - starting vertical position
diff --git a/Documentation/fb/pxafb.txt b/Documentation/fb/pxafb.txt
deleted file mode 100644
index d143a0a749f9..000000000000
--- a/Documentation/fb/pxafb.txt
+++ /dev/null
@@ -1,142 +0,0 @@
-Driver for PXA25x LCD controller
-================================
-
-The driver supports the following options, either via
-options=<OPTIONS> when modular or video=pxafb:<OPTIONS> when built in.
-
-For example:
-	modprobe pxafb options=vmem:2M,mode:640x480-8,passive
-or on the kernel command line
-	video=pxafb:vmem:2M,mode:640x480-8,passive
-
-vmem: VIDEO_MEM_SIZE
-	Amount of video memory to allocate (can be suffixed with K or M
-	for kilobytes or megabytes)
-
-mode:XRESxYRES[-BPP]
-	XRES == LCCR1_PPL + 1
-	YRES == LLCR2_LPP + 1
-		The resolution of the display in pixels
-	BPP == The bit depth. Valid values are 1, 2, 4, 8 and 16.
-
-pixclock:PIXCLOCK
-	Pixel clock in picoseconds
-
-left:LEFT == LCCR1_BLW + 1
-right:RIGHT == LCCR1_ELW + 1
-hsynclen:HSYNC == LCCR1_HSW + 1
-upper:UPPER == LCCR2_BFW
-lower:LOWER == LCCR2_EFR
-vsynclen:VSYNC == LCCR2_VSW + 1
-	Display margins and sync times
-
-color | mono => LCCR0_CMS
-	umm...
-
-active | passive => LCCR0_PAS
-	Active (TFT) or Passive (STN) display
-
-single | dual => LCCR0_SDS
-	Single or dual panel passive display
-
-4pix | 8pix => LCCR0_DPD
-	4 or 8 pixel monochrome single panel data
-
-hsync:HSYNC
-vsync:VSYNC
-	Horizontal and vertical sync. 0 => active low, 1 => active
-	high.
-
-dpc:DPC
-	Double pixel clock. 1=>true, 0=>false
-
-outputen:POLARITY
-	Output Enable Polarity. 0 => active low, 1 => active high
-
-pixclockpol:POLARITY
-	pixel clock polarity
-	0 => falling edge, 1 => rising edge
-
-
-Overlay Support for PXA27x and later LCD controllers
-====================================================
-
-  PXA27x and later processors support overlay1 and overlay2 on-top of the
-  base framebuffer (although under-neath the base is also possible). They
-  support palette and no-palette RGB formats, as well as YUV formats (only
-  available on overlay2). These overlays have dedicated DMA channels and
-  behave in a similar way as a framebuffer.
-
-  However, there are some differences between these overlay framebuffers
-  and normal framebuffers, as listed below:
-
-  1. overlay can start at a 32-bit word aligned position within the base
-     framebuffer, which means they have a start (x, y). This information
-     is encoded into var->nonstd (no, var->xoffset and var->yoffset are
-     not for such purpose).
-
-  2. overlay framebuffer is allocated dynamically according to specified
-     'struct fb_var_screeninfo', the amount is decided by:
-
-        var->xres_virtual * var->yres_virtual * bpp
-
-     bpp = 16 -- for RGB565 or RGBT555
-         = 24 -- for YUV444 packed
-         = 24 -- for YUV444 planar
-	 = 16 -- for YUV422 planar (1 pixel = 1 Y + 1/2 Cb + 1/2 Cr)
-	 = 12 -- for YUV420 planar (1 pixel = 1 Y + 1/4 Cb + 1/4 Cr)
-
-     NOTE:
-
-     a. overlay does not support panning in x-direction, thus
-        var->xres_virtual will always be equal to var->xres
-
-     b. line length of overlay(s) must be on a 32-bit word boundary,
-        for YUV planar modes, it is a requirement for the component
-	with minimum bits per pixel,  e.g. for YUV420, Cr component
-	for one pixel is actually 2-bits, it means the line length
-	should be a multiple of 16-pixels
-
-     c. starting horizontal position (XPOS) should start on a 32-bit
-        word boundary, otherwise the fb_check_var() will just fail.
-
-     d. the rectangle of the overlay should be within the base plane,
-        otherwise fail
-
-     Applications should follow the sequence below to operate an overlay
-     framebuffer:
-
-         a. open("/dev/fb[1-2]", ...)
-	 b. ioctl(fd, FBIOGET_VSCREENINFO, ...)
-	 c. modify 'var' with desired parameters:
-	    1) var->xres and var->yres
-	    2) larger var->yres_virtual if more memory is required,
-	       usually for double-buffering
-	    3) var->nonstd for starting (x, y) and color format
-	    4) var->{red, green, blue, transp} if RGB mode is to be used
-	 d. ioctl(fd, FBIOPUT_VSCREENINFO, ...)
-	 e. ioctl(fd, FBIOGET_FSCREENINFO, ...)
-	 f. mmap
-	 g. ...
-
-  3. for YUV planar formats, these are actually not supported within the
-     framebuffer framework, application has to take care of the offsets
-     and lengths of each component within the framebuffer.
-
-  4. var->nonstd is used to pass starting (x, y) position and color format,
-     the detailed bit fields are shown below:
-
-    31                23  20         10          0
-     +-----------------+---+----------+----------+
-     |  ... unused ... |FOR|   XPOS   |   YPOS   |
-     +-----------------+---+----------+----------+
-
-     FOR  - color format, as defined by OVERLAY_FORMAT_* in pxafb.h
-            0 - RGB
-	    1 - YUV444 PACKED
-	    2 - YUV444 PLANAR
-	    3 - YUV422 PLANAR
-	    4 - YUR420 PLANAR
-
-     XPOS - starting horizontal position
-     YPOS - starting vertical position
diff --git a/Documentation/fb/s3fb.rst b/Documentation/fb/s3fb.rst
new file mode 100644
index 000000000000..e809d69c21a7
--- /dev/null
+++ b/Documentation/fb/s3fb.rst
@@ -0,0 +1,82 @@
+===========================================
+s3fb - fbdev driver for S3 Trio/Virge chips
+===========================================
+
+
+Supported Hardware
+==================
+
+	S3 Trio32
+	S3 Trio64 (and variants V+, UV+, V2/DX, V2/GX)
+	S3 Virge  (and variants VX, DX, GX and GX2+)
+	S3 Plato/PX		(completely untested)
+	S3 Aurora64V+		(completely untested)
+
+	- only PCI bus supported
+	- only BIOS initialized VGA devices supported
+	- probably not working on big endian
+
+I tested s3fb on Trio64 (plain, V+ and V2/DX) and Virge (plain, VX, DX),
+all on i386.
+
+
+Supported Features
+==================
+
+	*  4 bpp pseudocolor modes (with 18bit palette, two variants)
+	*  8 bpp pseudocolor mode (with 18bit palette)
+	* 16 bpp truecolor modes (RGB 555 and RGB 565)
+	* 24 bpp truecolor mode (RGB 888) on (only on Virge VX)
+	* 32 bpp truecolor mode (RGB 888) on (not on Virge VX)
+	* text mode (activated by bpp = 0)
+	* interlaced mode variant (not available in text mode)
+	* doublescan mode variant (not available in text mode)
+	* panning in both directions
+	* suspend/resume support
+	* DPMS support
+
+Text mode is supported even in higher resolutions, but there is limitation to
+lower pixclocks (maximum usually between 50-60 MHz, depending on specific
+hardware, i get best results from plain S3 Trio32 card - about 75 MHz). This
+limitation is not enforced by driver. Text mode supports 8bit wide fonts only
+(hardware limitation) and 16bit tall fonts (driver limitation). Text mode
+support is broken on S3 Trio64 V2/DX.
+
+There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
+packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
+with interleaved planes (1 byte interleave), MSB first. Both modes support
+8bit wide fonts only (driver limitation).
+
+Suspend/resume works on systems that initialize video card during resume and
+if device is active (for example used by fbcon).
+
+
+Missing Features
+================
+(alias TODO list)
+
+	* secondary (not initialized by BIOS) device support
+	* big endian support
+	* Zorro bus support
+	* MMIO support
+	* 24 bpp mode support on more cards
+	* support for fontwidths != 8 in 4 bpp modes
+	* support for fontheight != 16 in text mode
+	* composite and external sync (is anyone able to test this?)
+	* hardware cursor
+	* video overlay support
+	* vsync synchronization
+	* feature connector support
+	* acceleration support (8514-like 2D, Virge 3D, busmaster transfers)
+	* better values for some magic registers (performance issues)
+
+
+Known bugs
+==========
+
+	* cursor disable in text mode doesn't work
+	* text mode broken on S3 Trio64 V2/DX
+
+
+--
+Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/fb/s3fb.txt b/Documentation/fb/s3fb.txt
deleted file mode 100644
index 2c97770bdbaa..000000000000
--- a/Documentation/fb/s3fb.txt
+++ /dev/null
@@ -1,82 +0,0 @@
-
-	s3fb - fbdev driver for S3 Trio/Virge chips
-	===========================================
-
-
-Supported Hardware
-==================
-
-	S3 Trio32
-	S3 Trio64 (and variants V+, UV+, V2/DX, V2/GX)
-	S3 Virge  (and variants VX, DX, GX and GX2+)
-	S3 Plato/PX		(completely untested)
-	S3 Aurora64V+		(completely untested)
-
-	- only PCI bus supported
-	- only BIOS initialized VGA devices supported
-	- probably not working on big endian
-
-I tested s3fb on Trio64 (plain, V+ and V2/DX) and Virge (plain, VX, DX),
-all on i386.
-
-
-Supported Features
-==================
-
-	*  4 bpp pseudocolor modes (with 18bit palette, two variants)
-	*  8 bpp pseudocolor mode (with 18bit palette)
-	* 16 bpp truecolor modes (RGB 555 and RGB 565)
-	* 24 bpp truecolor mode (RGB 888) on (only on Virge VX)
-	* 32 bpp truecolor mode (RGB 888) on (not on Virge VX)
-	* text mode (activated by bpp = 0)
-	* interlaced mode variant (not available in text mode)
-	* doublescan mode variant (not available in text mode)
-	* panning in both directions
-	* suspend/resume support
-	* DPMS support
-
-Text mode is supported even in higher resolutions, but there is limitation to
-lower pixclocks (maximum usually between 50-60 MHz, depending on specific
-hardware, i get best results from plain S3 Trio32 card - about 75 MHz). This
-limitation is not enforced by driver. Text mode supports 8bit wide fonts only
-(hardware limitation) and 16bit tall fonts (driver limitation). Text mode
-support is broken on S3 Trio64 V2/DX.
-
-There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
-packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
-with interleaved planes (1 byte interleave), MSB first. Both modes support
-8bit wide fonts only (driver limitation).
-
-Suspend/resume works on systems that initialize video card during resume and
-if device is active (for example used by fbcon).
-
-
-Missing Features
-================
-(alias TODO list)
-
-	* secondary (not initialized by BIOS) device support
-   	* big endian support
-	* Zorro bus support
-	* MMIO support
-	* 24 bpp mode support on more cards
-	* support for fontwidths != 8 in 4 bpp modes
-	* support for fontheight != 16 in text mode
-	* composite and external sync (is anyone able to test this?)
-	* hardware cursor
-	* video overlay support
-	* vsync synchronization
-	* feature connector support
-	* acceleration support (8514-like 2D, Virge 3D, busmaster transfers)
-	* better values for some magic registers (performance issues)
-
-
-Known bugs
-==========
-
-	* cursor disable in text mode doesn't work
-	* text mode broken on S3 Trio64 V2/DX
-
-
---
-Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/fb/sa1100fb.rst b/Documentation/fb/sa1100fb.rst
new file mode 100644
index 000000000000..67e2650e017d
--- /dev/null
+++ b/Documentation/fb/sa1100fb.rst
@@ -0,0 +1,40 @@
+=================
+What is sa1100fb?
+=================
+
+.. [This file is cloned from VesaFB/matroxfb]
+
+
+This is a driver for a graphic framebuffer for the SA-1100 LCD
+controller.
+
+Configuration
+==============
+
+For most common passive displays, giving the option::
+
+  video=sa1100fb:bpp:<value>,lccr0:<value>,lccr1:<value>,lccr2:<value>,lccr3:<value>
+
+on the kernel command line should be enough to configure the
+controller. The bits per pixel (bpp) value should be 4, 8, 12, or
+16. LCCR values are display-specific and should be computed as
+documented in the SA-1100 Developer's Manual, Section 11.7. Dual-panel
+displays are supported as long as the SDS bit is set in LCCR0; GPIO<9:2>
+are used for the lower panel.
+
+For active displays or displays requiring additional configuration
+(controlling backlights, powering on the LCD, etc.), the command line
+options may not be enough to configure the display. Adding sections to
+sa1100fb_init_fbinfo(), sa1100fb_activate_var(),
+sa1100fb_disable_lcd_controller(), and sa1100fb_enable_lcd_controller()
+will probably be necessary.
+
+Accepted options::
+
+	bpp:<value>	Configure for <value> bits per pixel
+	lccr0:<value>	Configure LCD control register 0 (11.7.3)
+	lccr1:<value>	Configure LCD control register 1 (11.7.4)
+	lccr2:<value>	Configure LCD control register 2 (11.7.5)
+	lccr3:<value>	Configure LCD control register 3 (11.7.6)
+
+Mark Huang <mhuang@livetoy.com>
diff --git a/Documentation/fb/sa1100fb.txt b/Documentation/fb/sa1100fb.txt
deleted file mode 100644
index f1b4220464df..000000000000
--- a/Documentation/fb/sa1100fb.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-[This file is cloned from VesaFB/matroxfb]
-
-What is sa1100fb?
-=================
-
-This is a driver for a graphic framebuffer for the SA-1100 LCD
-controller.
-
-Configuration
-==============
-
-For most common passive displays, giving the option
-
-video=sa1100fb:bpp:<value>,lccr0:<value>,lccr1:<value>,lccr2:<value>,lccr3:<value>
-
-on the kernel command line should be enough to configure the
-controller. The bits per pixel (bpp) value should be 4, 8, 12, or
-16. LCCR values are display-specific and should be computed as
-documented in the SA-1100 Developer's Manual, Section 11.7. Dual-panel
-displays are supported as long as the SDS bit is set in LCCR0; GPIO<9:2>
-are used for the lower panel.
-
-For active displays or displays requiring additional configuration
-(controlling backlights, powering on the LCD, etc.), the command line
-options may not be enough to configure the display. Adding sections to
-sa1100fb_init_fbinfo(), sa1100fb_activate_var(),
-sa1100fb_disable_lcd_controller(), and sa1100fb_enable_lcd_controller()
-will probably be necessary.
-
-Accepted options:
-
-bpp:<value>	Configure for <value> bits per pixel
-lccr0:<value>	Configure LCD control register 0 (11.7.3)
-lccr1:<value>	Configure LCD control register 1 (11.7.4)
-lccr2:<value>	Configure LCD control register 2 (11.7.5)
-lccr3:<value>	Configure LCD control register 3 (11.7.6)
-
---
-Mark Huang <mhuang@livetoy.com>
diff --git a/Documentation/fb/sh7760fb.rst b/Documentation/fb/sh7760fb.rst
new file mode 100644
index 000000000000..c3266485f810
--- /dev/null
+++ b/Documentation/fb/sh7760fb.rst
@@ -0,0 +1,130 @@
+================================================
+SH7760/SH7763 integrated LCDC Framebuffer driver
+================================================
+
+0. Overview
+-----------
+The SH7760/SH7763 have an integrated LCD Display controller (LCDC) which
+supports (in theory) resolutions ranging from 1x1 to 1024x1024,
+with color depths ranging from 1 to 16 bits, on STN, DSTN and TFT Panels.
+
+Caveats:
+
+* Framebuffer memory must be a large chunk allocated at the top
+  of Area3 (HW requirement). Because of this requirement you should NOT
+  make the driver a module since at runtime it may become impossible to
+  get a large enough contiguous chunk of memory.
+
+* The driver does not support changing resolution while loaded
+  (displays aren't hotpluggable anyway)
+
+* Heavy flickering may be observed
+  a) if you're using 15/16bit color modes at >= 640x480 px resolutions,
+  b) during PCMCIA (or any other slow bus) activity.
+
+* Rotation works only 90degress clockwise, and only if horizontal
+  resolution is <= 320 pixels.
+
+Files:
+	- drivers/video/sh7760fb.c
+	- include/asm-sh/sh7760fb.h
+	- Documentation/fb/sh7760fb.rst
+
+1. Platform setup
+-----------------
+SH7760:
+ Video data is fetched via the DMABRG DMA engine, so you have to
+ configure the SH DMAC for DMABRG mode (write 0x94808080 to the
+ DMARSRA register somewhere at boot).
+
+ PFC registers PCCR and PCDR must be set to peripheral mode.
+ (write zeros to both).
+
+The driver does NOT do the above for you since board setup is, well, job
+of the board setup code.
+
+2. Panel definitions
+--------------------
+The LCDC must explicitly be told about the type of LCD panel
+attached.  Data must be wrapped in a "struct sh7760fb_platdata" and
+passed to the driver as platform_data.
+
+Suggest you take a closer look at the SH7760 Manual, Section 30.
+(http://documentation.renesas.com/eng/products/mpumcu/e602291_sh7760.pdf)
+
+The following code illustrates what needs to be done to
+get the framebuffer working on a 640x480 TFT::
+
+  #include <linux/fb.h>
+  #include <asm/sh7760fb.h>
+
+  /*
+   * NEC NL6440bc26-01 640x480 TFT
+   * dotclock 25175 kHz
+   * Xres                640     Yres            480
+   * Htotal      800     Vtotal          525
+   * HsynStart   656     VsynStart       490
+   * HsynLenn    30      VsynLenn        2
+   *
+   * The linux framebuffer layer does not use the syncstart/synclen
+   * values but right/left/upper/lower margin values. The comments
+   * for the x_margin explain how to calculate those from given
+   * panel sync timings.
+   */
+  static struct fb_videomode nl6448bc26 = {
+         .name           = "NL6448BC26",
+         .refresh        = 60,
+         .xres           = 640,
+         .yres           = 480,
+         .pixclock       = 39683,        /* in picoseconds! */
+         .hsync_len      = 30,
+         .vsync_len      = 2,
+         .left_margin    = 114,  /* HTOT - (HSYNSLEN + HSYNSTART) */
+         .right_margin   = 16,   /* HSYNSTART - XRES */
+         .upper_margin   = 33,   /* VTOT - (VSYNLEN + VSYNSTART) */
+         .lower_margin   = 10,   /* VSYNSTART - YRES */
+         .sync           = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT,
+         .vmode          = FB_VMODE_NONINTERLACED,
+         .flag           = 0,
+  };
+
+  static struct sh7760fb_platdata sh7760fb_nl6448 = {
+         .def_mode       = &nl6448bc26,
+         .ldmtr          = LDMTR_TFT_COLOR_16,   /* 16bit TFT panel */
+         .lddfr          = LDDFR_8BPP,           /* we want 8bit output */
+         .ldpmmr         = 0x0070,
+         .ldpspr         = 0x0500,
+         .ldaclnr        = 0,
+         .ldickr         = LDICKR_CLKSRC(LCDC_CLKSRC_EXTERNAL) |
+			 LDICKR_CLKDIV(1),
+         .rotate         = 0,
+         .novsync        = 1,
+         .blank          = NULL,
+  };
+
+  /* SH7760:
+   * 0xFE300800: 256 * 4byte xRGB palette ram
+   * 0xFE300C00: 42 bytes ctrl registers
+   */
+  static struct resource sh7760_lcdc_res[] = {
+         [0] = {
+	       .start  = 0xFE300800,
+	       .end    = 0xFE300CFF,
+	       .flags  = IORESOURCE_MEM,
+         },
+         [1] = {
+	       .start  = 65,
+	       .end    = 65,
+	       .flags  = IORESOURCE_IRQ,
+         },
+  };
+
+  static struct platform_device sh7760_lcdc_dev = {
+         .dev    = {
+	       .platform_data = &sh7760fb_nl6448,
+         },
+         .name           = "sh7760-lcdc",
+         .id             = -1,
+         .resource       = sh7760_lcdc_res,
+         .num_resources  = ARRAY_SIZE(sh7760_lcdc_res),
+  };
diff --git a/Documentation/fb/sh7760fb.txt b/Documentation/fb/sh7760fb.txt
deleted file mode 100644
index b994c3b10549..000000000000
--- a/Documentation/fb/sh7760fb.txt
+++ /dev/null
@@ -1,131 +0,0 @@
-SH7760/SH7763 integrated LCDC Framebuffer driver
-================================================
-
-0. Overview
------------
-The SH7760/SH7763 have an integrated LCD Display controller (LCDC) which
-supports (in theory) resolutions ranging from 1x1 to 1024x1024,
-with color depths ranging from 1 to 16 bits, on STN, DSTN and TFT Panels.
-
-Caveats:
-* Framebuffer memory must be a large chunk allocated at the top
-  of Area3 (HW requirement). Because of this requirement you should NOT
-  make the driver a module since at runtime it may become impossible to
-  get a large enough contiguous chunk of memory.
-
-* The driver does not support changing resolution while loaded
-  (displays aren't hotpluggable anyway)
-
-* Heavy flickering may be observed
-  a) if you're using 15/16bit color modes at >= 640x480 px resolutions,
-  b) during PCMCIA (or any other slow bus) activity.
-
-* Rotation works only 90degress clockwise, and only if horizontal
-  resolution is <= 320 pixels.
-
-files:   drivers/video/sh7760fb.c
-        include/asm-sh/sh7760fb.h
-        Documentation/fb/sh7760fb.txt
-
-1. Platform setup
------------------
-SH7760:
- Video data is fetched via the DMABRG DMA engine, so you have to
- configure the SH DMAC for DMABRG mode (write 0x94808080 to the
- DMARSRA register somewhere at boot).
-
- PFC registers PCCR and PCDR must be set to peripheral mode.
- (write zeros to both).
-
-The driver does NOT do the above for you since board setup is, well, job
-of the board setup code.
-
-2. Panel definitions
---------------------
-The LCDC must explicitly be told about the type of LCD panel
-attached.  Data must be wrapped in a "struct sh7760fb_platdata" and
-passed to the driver as platform_data.
-
-Suggest you take a closer look at the SH7760 Manual, Section 30.
-(http://documentation.renesas.com/eng/products/mpumcu/e602291_sh7760.pdf)
-
-The following code illustrates what needs to be done to
-get the framebuffer working on a 640x480 TFT:
-
-====================== cut here ======================================
-
-#include <linux/fb.h>
-#include <asm/sh7760fb.h>
-
-/*
- * NEC NL6440bc26-01 640x480 TFT
- * dotclock 25175 kHz
- * Xres                640     Yres            480
- * Htotal      800     Vtotal          525
- * HsynStart   656     VsynStart       490
- * HsynLenn    30      VsynLenn        2
- *
- * The linux framebuffer layer does not use the syncstart/synclen
- * values but right/left/upper/lower margin values. The comments
- * for the x_margin explain how to calculate those from given
- * panel sync timings.
- */
-static struct fb_videomode nl6448bc26 = {
-       .name           = "NL6448BC26",
-       .refresh        = 60,
-       .xres           = 640,
-       .yres           = 480,
-       .pixclock       = 39683,        /* in picoseconds! */
-       .hsync_len      = 30,
-       .vsync_len      = 2,
-       .left_margin    = 114,  /* HTOT - (HSYNSLEN + HSYNSTART) */
-       .right_margin   = 16,   /* HSYNSTART - XRES */
-       .upper_margin   = 33,   /* VTOT - (VSYNLEN + VSYNSTART) */
-       .lower_margin   = 10,   /* VSYNSTART - YRES */
-       .sync           = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT,
-       .vmode          = FB_VMODE_NONINTERLACED,
-       .flag           = 0,
-};
-
-static struct sh7760fb_platdata sh7760fb_nl6448 = {
-       .def_mode       = &nl6448bc26,
-       .ldmtr          = LDMTR_TFT_COLOR_16,   /* 16bit TFT panel */
-       .lddfr          = LDDFR_8BPP,           /* we want 8bit output */
-       .ldpmmr         = 0x0070,
-       .ldpspr         = 0x0500,
-       .ldaclnr        = 0,
-       .ldickr         = LDICKR_CLKSRC(LCDC_CLKSRC_EXTERNAL) |
-                         LDICKR_CLKDIV(1),
-       .rotate         = 0,
-       .novsync        = 1,
-       .blank          = NULL,
-};
-
-/* SH7760:
- * 0xFE300800: 256 * 4byte xRGB palette ram
- * 0xFE300C00: 42 bytes ctrl registers
- */
-static struct resource sh7760_lcdc_res[] = {
-       [0] = {
-               .start  = 0xFE300800,
-               .end    = 0xFE300CFF,
-               .flags  = IORESOURCE_MEM,
-       },
-       [1] = {
-               .start  = 65,
-               .end    = 65,
-               .flags  = IORESOURCE_IRQ,
-       },
-};
-
-static struct platform_device sh7760_lcdc_dev = {
-       .dev    = {
-               .platform_data = &sh7760fb_nl6448,
-       },
-       .name           = "sh7760-lcdc",
-       .id             = -1,
-       .resource       = sh7760_lcdc_res,
-       .num_resources  = ARRAY_SIZE(sh7760_lcdc_res),
-};
-
-====================== cut here ======================================
diff --git a/Documentation/fb/sisfb.rst b/Documentation/fb/sisfb.rst
new file mode 100644
index 000000000000..8f4e502ea12e
--- /dev/null
+++ b/Documentation/fb/sisfb.rst
@@ -0,0 +1,160 @@
+==============
+What is sisfb?
+==============
+
+sisfb is a framebuffer device driver for SiS (Silicon Integrated Systems)
+graphics chips. Supported are:
+
+- SiS 300 series: SiS 300/305, 540, 630(S), 730(S)
+- SiS 315 series: SiS 315/H/PRO, 55x, (M)65x, 740, (M)661(F/M)X, (M)741(GX)
+- SiS 330 series: SiS 330 ("Xabre"), (M)760
+
+
+Why do I need a framebuffer driver?
+===================================
+
+sisfb is eg. useful if you want a high-resolution text console. Besides that,
+sisfb is required to run DirectFB (which comes with an additional, dedicated
+driver for the 315 series).
+
+On the 300 series, sisfb on kernels older than 2.6.3 furthermore plays an
+important role in connection with DRM/DRI: Sisfb manages the memory heap
+used by DRM/DRI for 3D texture and other data. This memory management is
+required for using DRI/DRM.
+
+Kernels >= around 2.6.3 do not need sisfb any longer for DRI/DRM memory
+management. The SiS DRM driver has been updated and features a memory manager
+of its own (which will be used if sisfb is not compiled). So unless you want
+a graphical console, you don't need sisfb on kernels >=2.6.3.
+
+Sidenote: Since this seems to be a commonly made mistake: sisfb and vesafb
+cannot be active at the same time! Do only select one of them in your kernel
+configuration.
+
+
+How are parameters passed to sisfb?
+===================================
+
+Well, it depends: If compiled statically into the kernel, use lilo's append
+statement to add the parameters to the kernel command line. Please see lilo's
+(or GRUB's) documentation for more information. If sisfb is a kernel module,
+parameters are given with the modprobe (or insmod) command.
+
+Example for sisfb as part of the static kernel: Add the following line to your
+lilo.conf::
+
+     append="video=sisfb:mode:1024x768x16,mem:12288,rate:75"
+
+Example for sisfb as a module: Start sisfb by typing::
+
+     modprobe sisfb mode=1024x768x16 rate=75 mem=12288
+
+A common mistake is that folks use a wrong parameter format when using the
+driver compiled into the kernel. Please note: If compiled into the kernel,
+the parameter format is video=sisfb:mode:none or video=sisfb:mode:1024x768x16
+(or whatever mode you want to use, alternatively using any other format
+described above or the vesa keyword instead of mode). If compiled as a module,
+the parameter format reads mode=none or mode=1024x768x16 (or whatever mode you
+want to use). Using a "=" for a ":" (and vice versa) is a huge difference!
+Additionally: If you give more than one argument to the in-kernel sisfb, the
+arguments are separated with ",". For example::
+
+   video=sisfb:mode:1024x768x16,rate:75,mem:12288
+
+
+How do I use it?
+================
+
+Preface statement: This file only covers very little of the driver's
+capabilities and features. Please refer to the author's and maintainer's
+website at http://www.winischhofer.net/linuxsisvga.shtml for more
+information. Additionally, "modinfo sisfb" gives an overview over all
+supported options including some explanation.
+
+The desired display mode can be specified using the keyword "mode" with
+a parameter in one of the following formats:
+
+  - XxYxDepth or
+  - XxY-Depth or
+  - XxY-Depth@Rate or
+  - XxY
+  - or simply use the VESA mode number in hexadecimal or decimal.
+
+For example: 1024x768x16, 1024x768-16@75, 1280x1024-16. If no depth is
+specified, it defaults to 8. If no rate is given, it defaults to 60Hz. Depth 32
+means 24bit color depth (but 32 bit framebuffer depth, which is not relevant
+to the user).
+
+Additionally, sisfb understands the keyword "vesa" followed by a VESA mode
+number in decimal or hexadecimal. For example: vesa=791 or vesa=0x117. Please
+use either "mode" or "vesa" but not both.
+
+Linux 2.4 only: If no mode is given, sisfb defaults to "no mode" (mode=none) if
+compiled as a module; if sisfb is statically compiled into the kernel, it
+defaults to 800x600x8 unless CRT2 type is LCD, in which case the LCD's native
+resolution is used. If you want to switch to a different mode, use the fbset
+shell command.
+
+Linux 2.6 only: If no mode is given, sisfb defaults to 800x600x8 unless CRT2
+type is LCD, in which case it defaults to the LCD's native resolution. If
+you want to switch to another mode, use the stty shell command.
+
+You should compile in both vgacon (to boot if you remove you SiS card from
+your system) and sisfb (for graphics mode). Under Linux 2.6, also "Framebuffer
+console support" (fbcon) is needed for a graphical console.
+
+You should *not* compile-in vesafb. And please do not use the "vga=" keyword
+in lilo's or grub's configuration file; mode selection is done using the
+"mode" or "vesa" keywords as a parameter. See above and below.
+
+
+X11
+===
+
+If using XFree86 or X.org, it is recommended that you don't use the "fbdev"
+driver but the dedicated "sis" X driver. The "sis" X driver and sisfb are
+developed by the same person (Thomas Winischhofer) and cooperate well with
+each other.
+
+
+SVGALib
+=======
+
+SVGALib, if directly accessing the hardware, never restores the screen
+correctly, especially on laptops or if the output devices are LCD or TV.
+Therefore, use the chipset "FBDEV" in SVGALib configuration. This will make
+SVGALib use the framebuffer device for mode switches and restoration.
+
+
+Configuration
+=============
+
+(Some) accepted options:
+
+=========  ==================================================================
+off        Disable sisfb. This option is only understood if sisfb is
+	   in-kernel, not a module.
+mem:X      size of memory for the console, rest will be used for DRI/DRM. X
+	   is in kilobytes. On 300 series, the default is 4096, 8192 or
+	   16384 (each in kilobyte) depending on how much video ram the card
+	   has. On 315/330 series, the default is the maximum available ram
+	   (since DRI/DRM is not supported for these chipsets).
+noaccel    do not use 2D acceleration engine. (Default: use acceleration)
+noypan     disable y-panning and scroll by redrawing the entire screen.
+	   This is much slower than y-panning. (Default: use y-panning)
+vesa:X     selects startup videomode. X is number from 0 to 0x1FF and
+	   represents the VESA mode number (can be given in decimal or
+	   hexadecimal form, the latter prefixed with "0x").
+mode:X     selects startup videomode. Please see above for the format of
+	   "X".
+=========  ==================================================================
+
+Boolean options such as "noaccel" or "noypan" are to be given without a
+parameter if sisfb is in-kernel (for example "video=sisfb:noypan). If
+sisfb is a module, these are to be set to 1 (for example "modprobe sisfb
+noypan=1").
+
+
+Thomas Winischhofer <thomas@winischhofer.net>
+
+May 27, 2004
diff --git a/Documentation/fb/sisfb.txt b/Documentation/fb/sisfb.txt
deleted file mode 100644
index 2e68e503e72f..000000000000
--- a/Documentation/fb/sisfb.txt
+++ /dev/null
@@ -1,158 +0,0 @@
-
-What is sisfb?
-==============
-
-sisfb is a framebuffer device driver for SiS (Silicon Integrated Systems)
-graphics chips. Supported are:
-
-- SiS 300 series: SiS 300/305, 540, 630(S), 730(S)
-- SiS 315 series: SiS 315/H/PRO, 55x, (M)65x, 740, (M)661(F/M)X, (M)741(GX)
-- SiS 330 series: SiS 330 ("Xabre"), (M)760
-
-
-Why do I need a framebuffer driver?
-===================================
-
-sisfb is eg. useful if you want a high-resolution text console. Besides that,
-sisfb is required to run DirectFB (which comes with an additional, dedicated
-driver for the 315 series).
-
-On the 300 series, sisfb on kernels older than 2.6.3 furthermore plays an
-important role in connection with DRM/DRI: Sisfb manages the memory heap
-used by DRM/DRI for 3D texture and other data. This memory management is
-required for using DRI/DRM.
-
-Kernels >= around 2.6.3 do not need sisfb any longer for DRI/DRM memory
-management. The SiS DRM driver has been updated and features a memory manager
-of its own (which will be used if sisfb is not compiled). So unless you want
-a graphical console, you don't need sisfb on kernels >=2.6.3.
-
-Sidenote: Since this seems to be a commonly made mistake: sisfb and vesafb
-cannot be active at the same time! Do only select one of them in your kernel
-configuration.
-
-
-How are parameters passed to sisfb?
-===================================
-
-Well, it depends: If compiled statically into the kernel, use lilo's append
-statement to add the parameters to the kernel command line. Please see lilo's
-(or GRUB's) documentation for more information. If sisfb is a kernel module,
-parameters are given with the modprobe (or insmod) command.
-
-Example for sisfb as part of the static kernel: Add the following line to your
-lilo.conf:
-
-     append="video=sisfb:mode:1024x768x16,mem:12288,rate:75"
-
-Example for sisfb as a module: Start sisfb by typing
-
-     modprobe sisfb mode=1024x768x16 rate=75 mem=12288
-
-A common mistake is that folks use a wrong parameter format when using the
-driver compiled into the kernel. Please note: If compiled into the kernel,
-the parameter format is video=sisfb:mode:none or video=sisfb:mode:1024x768x16
-(or whatever mode you want to use, alternatively using any other format
-described above or the vesa keyword instead of mode). If compiled as a module,
-the parameter format reads mode=none or mode=1024x768x16 (or whatever mode you
-want to use). Using a "=" for a ":" (and vice versa) is a huge difference!
-Additionally: If you give more than one argument to the in-kernel sisfb, the
-arguments are separated with ",". For example:
-
-   video=sisfb:mode:1024x768x16,rate:75,mem:12288
-
-
-How do I use it?
-================
-
-Preface statement: This file only covers very little of the driver's
-capabilities and features. Please refer to the author's and maintainer's
-website at http://www.winischhofer.net/linuxsisvga.shtml for more
-information. Additionally, "modinfo sisfb" gives an overview over all
-supported options including some explanation.
-
-The desired display mode can be specified using the keyword "mode" with
-a parameter in one of the following formats:
-  - XxYxDepth or
-  - XxY-Depth or
-  - XxY-Depth@Rate or
-  - XxY
-  - or simply use the VESA mode number in hexadecimal or decimal.
-
-For example: 1024x768x16, 1024x768-16@75, 1280x1024-16. If no depth is
-specified, it defaults to 8. If no rate is given, it defaults to 60Hz. Depth 32
-means 24bit color depth (but 32 bit framebuffer depth, which is not relevant
-to the user).
-
-Additionally, sisfb understands the keyword "vesa" followed by a VESA mode
-number in decimal or hexadecimal. For example: vesa=791 or vesa=0x117. Please
-use either "mode" or "vesa" but not both.
-
-Linux 2.4 only: If no mode is given, sisfb defaults to "no mode" (mode=none) if
-compiled as a module; if sisfb is statically compiled into the kernel, it
-defaults to 800x600x8 unless CRT2 type is LCD, in which case the LCD's native
-resolution is used. If you want to switch to a different mode, use the fbset
-shell command.
-
-Linux 2.6 only: If no mode is given, sisfb defaults to 800x600x8 unless CRT2
-type is LCD, in which case it defaults to the LCD's native resolution. If
-you want to switch to another mode, use the stty shell command.
-
-You should compile in both vgacon (to boot if you remove you SiS card from
-your system) and sisfb (for graphics mode). Under Linux 2.6, also "Framebuffer
-console support" (fbcon) is needed for a graphical console.
-
-You should *not* compile-in vesafb. And please do not use the "vga=" keyword
-in lilo's or grub's configuration file; mode selection is done using the
-"mode" or "vesa" keywords as a parameter. See above and below.
-
-
-X11
-===
-
-If using XFree86 or X.org, it is recommended that you don't use the "fbdev"
-driver but the dedicated "sis" X driver. The "sis" X driver and sisfb are
-developed by the same person (Thomas Winischhofer) and cooperate well with
-each other.
-
-
-SVGALib
-=======
-
-SVGALib, if directly accessing the hardware, never restores the screen
-correctly, especially on laptops or if the output devices are LCD or TV.
-Therefore, use the chipset "FBDEV" in SVGALib configuration. This will make
-SVGALib use the framebuffer device for mode switches and restoration.
-
-
-Configuration
-=============
-
-(Some) accepted options:
-
-off      - Disable sisfb. This option is only understood if sisfb is
-           in-kernel, not a module.
-mem:X    - size of memory for the console, rest will be used for DRI/DRM. X
-           is in kilobytes. On 300 series, the default is 4096, 8192 or
-	   16384 (each in kilobyte) depending on how much video ram the card
-           has. On 315/330 series, the default is the maximum available ram
-	   (since DRI/DRM is not supported for these chipsets).
-noaccel  - do not use 2D acceleration engine. (Default: use acceleration)
-noypan   - disable y-panning and scroll by redrawing the entire screen.
-           This is much slower than y-panning. (Default: use y-panning)
-vesa:X   - selects startup videomode. X is number from 0 to 0x1FF and
-           represents the VESA mode number (can be given in decimal or
-	   hexadecimal form, the latter prefixed with "0x").
-mode:X   - selects startup videomode. Please see above for the format of
-           "X".
-
-Boolean options such as "noaccel" or "noypan" are to be given without a
-parameter if sisfb is in-kernel (for example "video=sisfb:noypan). If
-sisfb is a module, these are to be set to 1 (for example "modprobe sisfb
-noypan=1").
-
---
-Thomas Winischhofer <thomas@winischhofer.net>
-May 27, 2004
-
-
diff --git a/Documentation/fb/sm501.rst b/Documentation/fb/sm501.rst
new file mode 100644
index 000000000000..03e02c8042a7
--- /dev/null
+++ b/Documentation/fb/sm501.rst
@@ -0,0 +1,15 @@
+=======
+sm501fb
+=======
+
+Configuration:
+
+You can pass the following kernel command line options to sm501
+videoframebuffer::
+
+	sm501fb.bpp=	SM501 Display driver:
+			Specify bits-per-pixel if not specified by 'mode'
+
+	sm501fb.mode=	SM501 Display driver:
+			Specify resolution as
+			"<xres>x<yres>[-<bpp>][@<refresh>]"
diff --git a/Documentation/fb/sm501.txt b/Documentation/fb/sm501.txt
deleted file mode 100644
index 187f3b3ccb6c..000000000000
--- a/Documentation/fb/sm501.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-Configuration:
-
-You can pass the following kernel command line options to sm501 videoframebuffer:
-
-	sm501fb.bpp=	SM501 Display driver:
-			Specify bits-per-pixel if not specified by 'mode'
-
-	sm501fb.mode=	SM501 Display driver:
-			Specify resolution as
-			"<xres>x<yres>[-<bpp>][@<refresh>]"
diff --git a/Documentation/fb/sm712fb.rst b/Documentation/fb/sm712fb.rst
new file mode 100644
index 000000000000..994dad3b0238
--- /dev/null
+++ b/Documentation/fb/sm712fb.rst
@@ -0,0 +1,35 @@
+================
+What is sm712fb?
+================
+
+This is a graphics framebuffer driver for Silicon Motion SM712 based processors.
+
+How to use it?
+==============
+
+Switching modes is done using the video=sm712fb:... boot parameter.
+
+If you want, for example, enable a resolution of 1280x1024x24bpp you should
+pass to the kernel this command line: "video=sm712fb:0x31B".
+
+You should not compile-in vesafb.
+
+Currently supported video modes are:
+
+Graphic modes
+-------------
+
+===  =======  =======  ========  =========
+bpp  640x480  800x600  1024x768  1280x1024
+===  =======  =======  ========  =========
+  8  0x301    0x303    0x305     0x307
+ 16  0x311    0x314    0x317     0x31A
+ 24  0x312    0x315    0x318     0x31B
+===  =======  =======  ========  =========
+
+Missing Features
+================
+(alias TODO list)
+
+	* 2D acceleratrion
+	* dual-head support
diff --git a/Documentation/fb/sm712fb.txt b/Documentation/fb/sm712fb.txt
deleted file mode 100644
index c388442edf51..000000000000
--- a/Documentation/fb/sm712fb.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-What is sm712fb?
-=================
-
-This is a graphics framebuffer driver for Silicon Motion SM712 based processors.
-
-How to use it?
-==============
-
-Switching modes is done using the video=sm712fb:... boot parameter.
-
-If you want, for example, enable a resolution of 1280x1024x24bpp you should
-pass to the kernel this command line: "video=sm712fb:0x31B".
-
-You should not compile-in vesafb.
-
-Currently supported video modes are:
-
-[Graphic modes]
-
-bpp | 640x480  800x600  1024x768  1280x1024
-----+--------------------------------------------
-  8 | 0x301    0x303    0x305    0x307
- 16 | 0x311    0x314    0x317    0x31A
- 24 | 0x312    0x315    0x318    0x31B
-
-Missing Features
-================
-(alias TODO list)
-
-	* 2D acceleratrion
-	* dual-head support
diff --git a/Documentation/fb/sstfb.rst b/Documentation/fb/sstfb.rst
new file mode 100644
index 000000000000..8e8c1b940359
--- /dev/null
+++ b/Documentation/fb/sstfb.rst
@@ -0,0 +1,207 @@
+=====
+sstfb
+=====
+
+Introduction
+============
+
+This is a frame buffer device driver for 3dfx' Voodoo Graphics
+(aka voodoo 1, aka sst1) and Voodoo² (aka Voodoo 2, aka CVG) based
+video boards. It's highly experimental code, but is guaranteed to work
+on my computer, with my "Maxi Gamer 3D" and "Maxi Gamer 3d²" boards,
+and with me "between chair and keyboard". Some people tested other
+combinations and it seems that it works.
+The main page is located at <http://sstfb.sourceforge.net>, and if
+you want the latest version, check out the CVS, as the driver is a work
+in progress, I feel uncomfortable with releasing tarballs of something
+not completely working...Don't worry, it's still more than usable
+(I eat my own dog food)
+
+Please read the Bug section, and report any success or failure to me
+(Ghozlane Toumi <gtoumi@laposte.net>).
+BTW, If you have only one monitor , and you don't feel like playing
+with the vga passthrou cable, I can only suggest borrowing a screen
+somewhere...
+
+
+Installation
+============
+
+This driver (should) work on ix86, with "late" 2.2.x kernel (tested
+with x = 19) and "recent" 2.4.x kernel, as a module or compiled in.
+It has been included in mainstream kernel since the infamous 2.4.10.
+You can apply the patches found in `sstfb/kernel/*-2.{2|4}.x.patch`,
+and copy sstfb.c to linux/drivers/video/, or apply a single patch,
+`sstfb/patch-2.{2|4}.x-sstfb-yymmdd` to your linux source tree.
+
+Then configure your kernel as usual: choose "m" or "y" to 3Dfx Voodoo
+Graphics in section "console". Compile, install, have fun... and please
+drop me a report :)
+
+
+Module Usage
+============
+
+.. warning::
+
+       #. You should read completely this section before issuing any command.
+
+       #. If you have only one monitor to play with, once you insmod the
+	  module, the 3dfx takes control of the output, so you'll have to
+	  plug the monitor to the "normal" video board in order to issue
+	  the commands, or you can blindly use sst_dbg_vgapass
+	  in the tools directory (See Tools). The latest solution is pass the
+	  parameter vgapass=1 when insmodding the driver. (See Kernel/Modules
+	  Options)
+
+Module insertion
+----------------
+
+       #. insmod sstfb.o
+
+	  you should see some strange output from the board:
+	  a big blue square, a green and a red small squares and a vertical
+	  white rectangle. why? the function's name is self-explanatory:
+	  "sstfb_test()"...
+	  (if you don't have a second monitor, you'll have to plug your monitor
+	  directly to the 2D videocard to see what you're typing)
+
+       #. con2fb /dev/fbx /dev/ttyx
+
+	  bind a tty to the new frame buffer. if you already have a frame
+	  buffer driver, the voodoo fb will likely be /dev/fb1. if not,
+	  the device will be /dev/fb0. You can check this by doing a
+	  cat /proc/fb. You can find a copy of con2fb in tools/ directory.
+	  if you don't have another fb device, this step is superfluous,
+	  as the console subsystem automagicaly binds ttys to the fb.
+       #. switch to the virtual console you just mapped. "tadaaa" ...
+
+Module removal
+--------------
+
+       #. con2fb /dev/fbx /dev/ttyx
+
+	  bind the tty to the old frame buffer so the module can be removed.
+	  (how does it work with vgacon ? short answer : it doesn't work)
+
+       #. rmmod sstfb
+
+
+Kernel/Modules Options
+----------------------
+
+You can pass some options to the sstfb module, and via the kernel
+command line when the driver is compiled in:
+for module : insmod sstfb.o option1=value1 option2=value2 ...
+in kernel :  video=sstfb:option1,option2:value2,option3 ...
+
+sstfb supports the following options:
+
+=============== =============== ===============================================
+Module		Kernel		Description
+=============== =============== ===============================================
+vgapass=0	vganopass	Enable or disable VGA passthrou cable.
+vgapass=1	vgapass		When enabled, the monitor will get the signal
+				from the VGA board and not from the voodoo.
+
+				Default: nopass
+
+mem=x		mem:x		Force frame buffer memory in MiB
+				allowed values: 0, 1, 2, 4.
+
+				Default: 0 (= autodetect)
+
+inverse=1	inverse		Supposed to enable inverse console.
+				doesn't work yet...
+
+clipping=1	clipping	Enable or disable clipping.
+clipping=0	noclipping	With clipping enabled, all offscreen
+				reads and writes are discarded.
+
+				Default: enable clipping.
+
+gfxclk=x	gfxclk:x	Force graphic clock frequency (in MHz).
+				Be careful with this option, it may be
+				DANGEROUS.
+
+				Default: auto
+
+					- 50Mhz for Voodoo 1,
+					- 75MHz for Voodoo 2.
+
+slowpci=1	fastpci		Enable or disable fast PCI read/writes.
+slowpci=1	slowpci		Default : fastpci
+
+dev=x		dev:x		Attach the driver to device number x.
+				0 is the first compatible board (in
+				lspci order)
+=============== =============== ===============================================
+
+Tools
+=====
+
+These tools are mostly for debugging purposes, but you can
+find some of these interesting:
+
+- `con2fb`, maps a tty to a fbramebuffer::
+
+	con2fb /dev/fb1 /dev/tty5
+
+- `sst_dbg_vgapass`, changes vga passthrou. You have to recompile the
+  driver with SST_DEBUG and SST_DEBUG_IOCTL set to 1::
+
+	sst_dbg_vgapass /dev/fb1 1 (enables vga cable)
+	sst_dbg_vgapass /dev/fb1 0 (disables vga cable)
+
+- `glide_reset`, resets the voodoo using glide
+  use this after rmmoding sstfb, if the module refuses to
+  reinsert.
+
+Bugs
+====
+
+- DO NOT use glide while the sstfb module is in, you'll most likely
+  hang your computer.
+- If you see some artefacts (pixels not cleaning and stuff like that),
+  try turning off clipping (clipping=0), and/or using slowpci
+- the driver don't detect the 4Mb frame buffer voodoos, it seems that
+  the 2 last Mbs wrap around. looking into that .
+- The driver is 16 bpp only, 24/32 won't work.
+- The driver is not your_favorite_toy-safe. this includes SMP...
+
+	[Actually from inspection it seems to be safe - Alan]
+
+- When using XFree86 FBdev (X over fbdev) you may see strange color
+  patterns at the border of your windows (the pixels lose the lowest
+  byte -> basically the blue component and some of the green). I'm unable
+  to reproduce this with XFree86-3.3, but one of the testers has this
+  problem with XFree86-4. Apparently recent Xfree86-4.x solve this
+  problem.
+- I didn't really test changing the palette, so you may find some weird
+  things when playing with that.
+- Sometimes the driver will not recognise the DAC, and the
+  initialisation will fail. This is specifically true for
+  voodoo 2 boards, but it should be solved in recent versions. Please
+  contact me.
+- The 24/32 is not likely to work anytime soon, knowing that the
+  hardware does ... unusual things in 24/32 bpp.
+- When used with another video board, current limitations of the linux
+  console subsystem can cause some troubles, specifically, you should
+  disable software scrollback, as it can oops badly ...
+
+Todo
+====
+
+- Get rid of the previous paragraph.
+- Buy more coffee.
+- test/port to other arch.
+- try to add panning using tweeks with front and back buffer .
+- try to implement accel on voodoo2, this board can actually do a
+  lot in 2D even if it was sold as a 3D only board ...
+
+Ghozlane Toumi <gtoumi@laposte.net>
+
+
+Date: 2002/05/09 20:11:45
+
+http://sstfb.sourceforge.net/README
diff --git a/Documentation/fb/sstfb.txt b/Documentation/fb/sstfb.txt
deleted file mode 100644
index 13db1075e4a5..000000000000
--- a/Documentation/fb/sstfb.txt
+++ /dev/null
@@ -1,174 +0,0 @@
-
-Introduction
-
-	  This is a frame buffer device driver for 3dfx' Voodoo Graphics 
-	(aka voodoo 1, aka sst1) and Voodoo² (aka Voodoo 2, aka CVG) based 
-	video boards. It's highly experimental code, but is guaranteed to work
-	on my computer, with my "Maxi Gamer 3D" and "Maxi Gamer 3d²" boards,
-	and with me "between chair and keyboard". Some people tested other
-	combinations and it seems that it works.
-	  The main page is located at <http://sstfb.sourceforge.net>, and if
-	you want the latest version, check out the CVS, as the driver is a work
-	in progress, I feel uncomfortable with releasing tarballs of something
-	not completely working...Don't worry, it's still more than usable
-	(I eat my own dog food)
-
-	  Please read the Bug section, and report any success or failure to me
-	(Ghozlane Toumi <gtoumi@laposte.net>).
-	  BTW, If you have only one monitor , and you don't feel like playing
-	with the vga passthrou cable, I can only suggest borrowing a screen
-	somewhere... 
-
-
-Installation 
-
-	  This driver (should) work on ix86, with "late" 2.2.x kernel (tested
-	with x = 19) and "recent" 2.4.x kernel, as a module or compiled in.
-	  It has been included in mainstream kernel since the infamous 2.4.10.
-	  You can apply the patches found in sstfb/kernel/*-2.{2|4}.x.patch,
-	and copy sstfb.c to linux/drivers/video/, or apply a single patch, 
-	sstfb/patch-2.{2|4}.x-sstfb-yymmdd to your linux source tree.
-
-	  Then configure your kernel as usual: choose "m" or "y" to 3Dfx Voodoo
-	Graphics in section "console". Compile, install, have fun... and please
-	drop me a report :)
-
-
-Module Usage
-	
-	Warnings.
-	# You should read completely this section before issuing any command.
-	# If you have only one monitor to play with, once you insmod the
-	  module, the 3dfx takes control of the output, so you'll have to
-	  plug the monitor to the "normal" video board in order to issue
-	  the commands, or you can blindly use sst_dbg_vgapass
-          in the tools directory (See Tools). The latest solution is pass the
-	  parameter vgapass=1 when insmodding the driver. (See Kernel/Modules
-	  Options)
-
-	Module insertion:
-	# insmod sstfb.o
-	  you should see some strange output from the board: 
-	  a big blue square, a green and a red small squares and a vertical
-	  white rectangle. why? the function's name is self-explanatory:
-	  "sstfb_test()"...
-	  (if you don't have a second monitor, you'll have to plug your monitor
-	  directly to the 2D videocard to see what you're typing)
-	# con2fb /dev/fbx /dev/ttyx
-	  bind a tty to the new frame buffer. if you already have a frame
-	  buffer driver, the voodoo fb will likely be /dev/fb1. if not, 
-	  the device will be /dev/fb0. You can check this by doing a 
-	  cat /proc/fb. You can find a copy of con2fb in tools/ directory.
-	  if you don't have another fb device, this step is superfluous,
-	  as the console subsystem automagicaly binds ttys to the fb.
-	# switch to the virtual console you just mapped. "tadaaa" ...
-
-	Module removal:
-	# con2fb /dev/fbx /dev/ttyx
-	  bind the tty to the old frame buffer so the module can be removed.
-	  (how does it work with vgacon ? short answer : it doesn't work)
-	# rmmod sstfb
-
-
-Kernel/Modules Options
-
-	You can pass some options to the sstfb module, and via the kernel 
-	command line when the driver is compiled in:
-	for module : insmod sstfb.o option1=value1 option2=value2 ...
-	in kernel :  video=sstfb:option1,option2:value2,option3 ...
-	
-	sstfb supports the following options :
-
-Module		Kernel		Description
-
-vgapass=0	vganopass	Enable or disable VGA passthrou cable.
-vgapass=1	vgapass		When enabled, the monitor will get the signal
-				from the VGA board and not from the voodoo.
-				Default: nopass
-
-mem=x		mem:x		Force frame buffer memory in MiB
-				allowed values: 0, 1, 2, 4.
-				Default: 0 (= autodetect)
-
-inverse=1	inverse		Supposed to enable inverse console.
-				doesn't work yet...
-
-clipping=1	clipping	Enable or disable clipping.
-clipping=0	noclipping	With clipping enabled, all offscreen
-				reads and writes are discarded.
-				Default: enable clipping.
-
-gfxclk=x	gfxclk:x	Force graphic clock frequency (in MHz).
-				Be careful with this option, it may be
-				DANGEROUS.
-				Default: auto 
-					50Mhz for Voodoo 1,
-					75MHz for Voodoo 2. 
-
-slowpci=1	fastpci		Enable or disable fast PCI read/writes.
-slowpci=1	slowpci		Default : fastpci
-
-dev=x		dev:x		Attach the driver to device number x.
-				0 is the first compatible board (in 
-				lspci order)
-
-Tools
-
-	These tools are mostly for debugging purposes, but you can 
-	find some of these interesting :
-	 - con2fb , maps a tty to a fbramebuffer .
-		con2fb /dev/fb1 /dev/tty5
-	 - sst_dbg_vgapass , changes vga passthrou. You have to recompile the
-	driver with SST_DEBUG and SST_DEBUG_IOCTL set to 1
-		sst_dbg_vgapass /dev/fb1 1 (enables vga cable)
-		sst_dbg_vgapass /dev/fb1 0 (disables vga cable)
-	 - glide_reset , resets the voodoo using glide
-		use this after rmmoding sstfb, if the module refuses to
-		reinsert .
-
-Bugs
-
-	- DO NOT use glide while the sstfb module is in, you'll most likely
-	hang your computer.
-	- If you see some artefacts (pixels not cleaning and stuff like that), 
-	try turning off clipping (clipping=0), and/or using slowpci
-	- the driver don't detect the 4Mb frame buffer voodoos, it seems that
-	the 2 last Mbs wrap around. looking into that .
-	- The driver is 16 bpp only, 24/32 won't work.
-	- The driver is not your_favorite_toy-safe. this includes SMP...
-          [Actually from inspection it seems to be safe - Alan]
-	- When using XFree86 FBdev (X over fbdev) you may see strange color
-	patterns at the border of your windows (the pixels lose the lowest
-	byte -> basically the blue component and some of the green). I'm unable
-	to reproduce this with XFree86-3.3, but one of the testers has this
-	problem with XFree86-4. Apparently recent Xfree86-4.x solve this
-	problem.
-	- I didn't really test changing the palette, so you may find some weird
-	things when playing with that.
-	- Sometimes the driver will not recognise the DAC, and the
-        initialisation will fail. This is specifically true for
-	voodoo 2 boards, but it should be solved in recent versions. Please
-	contact me.
-	- The 24/32 is not likely to work anytime soon, knowing that the
-	hardware does ... unusual things in 24/32 bpp.
-	- When used with another video board, current limitations of the linux
-	console subsystem can cause some troubles, specifically, you should
-	disable software scrollback, as it can oops badly ...
-
-Todo
-
-	- Get rid of the previous paragraph.
-	- Buy more coffee.
-	- test/port to other arch.
-	- try to add panning using tweeks with front and back buffer .
-	- try to implement accel on voodoo2, this board can actually do a 
-	  lot in 2D even if it was sold as a 3D only board ...
-
-ghoz.
-
--- 
-Ghozlane Toumi <gtoumi@laposte.net>
-
-
-$Date: 2002/05/09 20:11:45 $
-http://sstfb.sourceforge.net/README
diff --git a/Documentation/fb/tgafb.rst b/Documentation/fb/tgafb.rst
new file mode 100644
index 000000000000..0c50d2134aa4
--- /dev/null
+++ b/Documentation/fb/tgafb.rst
@@ -0,0 +1,71 @@
+==============
+What is tgafb?
+==============
+
+This is a driver for DECChip 21030 based graphics framebuffers, a.k.a. TGA
+cards, which are usually found in older Digital Alpha systems. The
+following models are supported:
+
+- ZLxP-E1 (8bpp, 2 MB VRAM)
+- ZLxP-E2 (32bpp, 8 MB VRAM)
+- ZLxP-E3 (32bpp, 16 MB VRAM, Zbuffer)
+
+This version is an almost complete rewrite of the code written by Geert
+Uytterhoeven, which was based on the original TGA console code written by
+Jay Estabrook.
+
+Major new features since Linux 2.0.x:
+
+ * Support for multiple resolutions
+ * Support for fixed-frequency and other oddball monitors
+   (by allowing the video mode to be set at boot time)
+
+User-visible changes since Linux 2.2.x:
+
+ * Sync-on-green is now handled properly
+ * More useful information is printed on bootup
+   (this helps if people run into problems)
+
+This driver does not (yet) support the TGA2 family of framebuffers, so the
+PowerStorm 3D30/4D20 (also known as PBXGB) cards are not supported. These
+can however be used with the standard VGA Text Console driver.
+
+
+Configuration
+=============
+
+You can pass kernel command line options to tgafb with
+`video=tgafb:option1,option2:value2,option3` (multiple options should be
+separated by comma, values are separated from options by `:`).
+
+Accepted options:
+
+==========  ============================================================
+font:X      default font to use. All fonts are supported, including the
+	    SUN12x22 font which is very nice at high resolutions.
+
+mode:X      default video mode. The following video modes are supported:
+	    640x480-60, 800x600-56, 640x480-72, 800x600-60, 800x600-72,
+	    1024x768-60, 1152x864-60, 1024x768-70, 1024x768-76,
+	    1152x864-70, 1280x1024-61, 1024x768-85, 1280x1024-70,
+	    1152x864-84, 1280x1024-76, 1280x1024-85
+==========  ============================================================
+
+
+Known Issues
+============
+
+The XFree86 FBDev server has been reported not to work, since tgafb doesn't do
+mmap(). Running the standard XF86_TGA server from XFree86 3.3.x works fine for
+me, however this server does not do acceleration, which make certain operations
+quite slow. Support for acceleration is being progressively integrated in
+XFree86 4.x.
+
+When running tgafb in resolutions higher than 640x480, on switching VCs from
+tgafb to XF86_TGA 3.3.x, the entire screen is not re-drawn and must be manually
+refreshed. This is an X server problem, not a tgafb problem, and is fixed in
+XFree86 4.0.
+
+Enjoy!
+
+Martin Lucina <mato@kotelna.sk>
diff --git a/Documentation/fb/tgafb.txt b/Documentation/fb/tgafb.txt
deleted file mode 100644
index 250083ada8fb..000000000000
--- a/Documentation/fb/tgafb.txt
+++ /dev/null
@@ -1,69 +0,0 @@
-$Id: tgafb.txt,v 1.1.2.2 2000/04/04 06:50:18 mato Exp $
-
-What is tgafb?
-===============
-
-This is a driver for DECChip 21030 based graphics framebuffers, a.k.a. TGA
-cards, which are usually found in older Digital Alpha systems. The
-following models are supported:
-
-ZLxP-E1 (8bpp, 2 MB VRAM)
-ZLxP-E2 (32bpp, 8 MB VRAM)
-ZLxP-E3 (32bpp, 16 MB VRAM, Zbuffer)
-
-This version is an almost complete rewrite of the code written by Geert
-Uytterhoeven, which was based on the original TGA console code written by
-Jay Estabrook.
-
-Major new features since Linux 2.0.x:
-
- * Support for multiple resolutions
- * Support for fixed-frequency and other oddball monitors 
-   (by allowing the video mode to be set at boot time)
-
-User-visible changes since Linux 2.2.x:
-
- * Sync-on-green is now handled properly
- * More useful information is printed on bootup
-   (this helps if people run into problems)
-
-This driver does not (yet) support the TGA2 family of framebuffers, so the
-PowerStorm 3D30/4D20 (also known as PBXGB) cards are not supported. These
-can however be used with the standard VGA Text Console driver.
-
-
-Configuration
-=============
-
-You can pass kernel command line options to tgafb with
-`video=tgafb:option1,option2:value2,option3' (multiple options should be
-separated by comma, values are separated from options by `:').
-Accepted options:
-
-font:X    - default font to use. All fonts are supported, including the
-            SUN12x22 font which is very nice at high resolutions.
-
-mode:X    - default video mode. The following video modes are supported:
-            640x480-60, 800x600-56, 640x480-72, 800x600-60, 800x600-72, 
-	    1024x768-60, 1152x864-60, 1024x768-70, 1024x768-76,
-	    1152x864-70, 1280x1024-61, 1024x768-85, 1280x1024-70,
-	    1152x864-84, 1280x1024-76, 1280x1024-85
- 
-
-Known Issues
-============
-
-The XFree86 FBDev server has been reported not to work, since tgafb doesn't do
-mmap(). Running the standard XF86_TGA server from XFree86 3.3.x works fine for
-me, however this server does not do acceleration, which make certain operations
-quite slow. Support for acceleration is being progressively integrated in
-XFree86 4.x.
-
-When running tgafb in resolutions higher than 640x480, on switching VCs from
-tgafb to XF86_TGA 3.3.x, the entire screen is not re-drawn and must be manually
-refreshed. This is an X server problem, not a tgafb problem, and is fixed in
-XFree86 4.0.
-
-Enjoy!
-
-Martin Lucina <mato@kotelna.sk>
diff --git a/Documentation/fb/tridentfb.rst b/Documentation/fb/tridentfb.rst
new file mode 100644
index 000000000000..7921c9dee78c
--- /dev/null
+++ b/Documentation/fb/tridentfb.rst
@@ -0,0 +1,78 @@
+=========
+Tridentfb
+=========
+
+Tridentfb is a framebuffer driver for some Trident chip based cards.
+
+The following list of chips is thought to be supported although not all are
+tested:
+
+those from the TGUI series 9440/96XX and with Cyber in their names
+those from the Image series and with Cyber in their names
+those with Blade in their names (Blade3D,CyberBlade...)
+the newer CyberBladeXP family
+
+All families are accelerated. Only PCI/AGP based cards are supported,
+none of the older Tridents.
+The driver supports 8, 16 and 32 bits per pixel depths.
+The TGUI family requires a line length to be power of 2 if acceleration
+is enabled. This means that range of possible resolutions and bpp is
+limited comparing to the range if acceleration is disabled (see list
+of parameters below).
+
+Known bugs:
+
+1. The driver randomly locks up on 3DImage975 chip with acceleration
+   enabled. The same happens in X11 (Xorg).
+2. The ramdac speeds require some more fine tuning. It is possible to
+   switch resolution which the chip does not support at some depths for
+   older chips.
+
+How to use it?
+==============
+
+When booting you can pass the video parameter::
+
+	video=tridentfb
+
+The parameters for tridentfb are concatenated with a ':' as in this example::
+
+	video=tridentfb:800x600-16@75,noaccel
+
+The second level parameters that tridentfb understands are:
+
+========  =====================================================================
+noaccel   turns off acceleration (when it doesn't work for your card)
+
+fp	  use flat panel related stuff
+crt 	  assume monitor is present instead of fp
+
+center 	  for flat panels and resolutions smaller than native size center the
+	  image, otherwise use
+stretch
+
+memsize   integer value in KB, use if your card's memory size is misdetected.
+	  look at the driver output to see what it says when initializing.
+
+memdiff   integer value in KB, should be nonzero if your card reports
+	  more memory than it actually has. For instance mine is 192K less than
+	  detection says in all three BIOS selectable situations 2M, 4M, 8M.
+	  Only use if your video memory is taken from main memory hence of
+	  configurable size. Otherwise use memsize.
+	  If in some modes which barely fit the memory you see garbage
+	  at the bottom this might help by not letting change to that mode
+	  anymore.
+
+nativex   the width in pixels of the flat panel.If you know it (usually 1024
+	  800 or 1280) and it is not what the driver seems to detect use it.
+
+bpp	  bits per pixel (8,16 or 32)
+mode	  a mode name like 800x600-8@75 as described in
+	  Documentation/fb/modedb.rst
+========  =====================================================================
+
+Using insane values for the above parameters will probably result in driver
+misbehaviour so take care(for instance memsize=12345678 or memdiff=23784 or
+nativex=93)
+
+Contact: jani@astechnix.ro
diff --git a/Documentation/fb/tridentfb.txt b/Documentation/fb/tridentfb.txt
deleted file mode 100644
index 45d9de5b13a3..000000000000
--- a/Documentation/fb/tridentfb.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-Tridentfb is a framebuffer driver for some Trident chip based cards.
-
-The following list of chips is thought to be supported although not all are
-tested:
-
-those from the TGUI series 9440/96XX and with Cyber in their names
-those from the Image series and with Cyber in their names
-those with Blade in their names (Blade3D,CyberBlade...)
-the newer CyberBladeXP family
-
-All families are accelerated. Only PCI/AGP based cards are supported,
-none of the older Tridents.
-The driver supports 8, 16 and 32 bits per pixel depths.
-The TGUI family requires a line length to be power of 2 if acceleration
-is enabled. This means that range of possible resolutions and bpp is
-limited comparing to the range if acceleration is disabled (see list
-of parameters below).
-
-Known bugs:
-1. The driver randomly locks up on 3DImage975 chip with acceleration
-   enabled. The same happens in X11 (Xorg).
-2. The ramdac speeds require some more fine tuning. It is possible to
-   switch resolution which the chip does not support at some depths for
-   older chips.
-
-How to use it?
-==============
-
-When booting you can pass the video parameter.
-video=tridentfb
-
-The parameters for tridentfb are concatenated with a ':' as in this example.
-
-video=tridentfb:800x600-16@75,noaccel
-
-The second level parameters that tridentfb understands are:
-
-noaccel - turns off acceleration (when it doesn't work for your card)
-
-fp	- use flat panel related stuff
-crt 	- assume monitor is present instead of fp
-
-center 	- for flat panels and resolutions smaller than native size center the
-	  image, otherwise use
-stretch
-
-memsize - integer value in KB, use if your card's memory size is misdetected.
-	  look at the driver output to see what it says when initializing.
-
-memdiff - integer value in KB, should be nonzero if your card reports
-	  more memory than it actually has. For instance mine is 192K less than
-	  detection says in all three BIOS selectable situations 2M, 4M, 8M.
-	  Only use if your video memory is taken from main memory hence of
-	  configurable size. Otherwise use memsize.
-	  If in some modes which barely fit the memory you see garbage
-	  at the bottom this might help by not letting change to that mode
-	  anymore.
-
-nativex - the width in pixels of the flat panel.If you know it (usually 1024
-	  800 or 1280) and it is not what the driver seems to detect use it.
-
-bpp	- bits per pixel (8,16 or 32)
-mode	- a mode name like 800x600-8@75 as described in
-	  Documentation/fb/modedb.txt
-
-Using insane values for the above parameters will probably result in driver
-misbehaviour so take care(for instance memsize=12345678 or memdiff=23784 or
-nativex=93)
-
-Contact: jani@astechnix.ro
diff --git a/Documentation/fb/udlfb.rst b/Documentation/fb/udlfb.rst
new file mode 100644
index 000000000000..732b37db3504
--- /dev/null
+++ b/Documentation/fb/udlfb.rst
@@ -0,0 +1,162 @@
+==============
+What is udlfb?
+==============
+
+This is a driver for DisplayLink USB 2.0 era graphics chips.
+
+DisplayLink chips provide simple hline/blit operations with some compression,
+pairing that with a hardware framebuffer (16MB) on the other end of the
+USB wire.  That hardware framebuffer is able to drive the VGA, DVI, or HDMI
+monitor with no CPU involvement until a pixel has to change.
+
+The CPU or other local resource does all the rendering; optionally compares the
+result with a local shadow of the remote hardware framebuffer to identify
+the minimal set of pixels that have changed; and compresses and sends those
+pixels line-by-line via USB bulk transfers.
+
+Because of the efficiency of bulk transfers and a protocol on top that
+does not require any acks - the effect is very low latency that
+can support surprisingly high resolutions with good performance for
+non-gaming and non-video applications.
+
+Mode setting, EDID read, etc are other bulk or control transfers. Mode
+setting is very flexible - able to set nearly arbitrary modes from any timing.
+
+Advantages of USB graphics in general:
+
+ * Ability to add a nearly arbitrary number of displays to any USB 2.0
+   capable system. On Linux, number of displays is limited by fbdev interface
+   (FB_MAX is currently 32). Of course, all USB devices on the same
+   host controller share the same 480Mbs USB 2.0 interface.
+
+Advantages of supporting DisplayLink chips with kernel framebuffer interface:
+
+ * The actual hardware functionality of DisplayLink chips matches nearly
+   one-to-one with the fbdev interface, making the driver quite small and
+   tight relative to the functionality it provides.
+ * X servers and other applications can use the standard fbdev interface
+   from user mode to talk to the device, without needing to know anything
+   about USB or DisplayLink's protocol at all. A "displaylink" X driver
+   and a slightly modified "fbdev" X driver are among those that already do.
+
+Disadvantages:
+
+ * Fbdev's mmap interface assumes a real hardware framebuffer is mapped.
+   In the case of USB graphics, it is just an allocated (virtual) buffer.
+   Writes need to be detected and encoded into USB bulk transfers by the CPU.
+   Accurate damage/changed area notifications work around this problem.
+   In the future, hopefully fbdev will be enhanced with an small standard
+   interface to allow mmap clients to report damage, for the benefit
+   of virtual or remote framebuffers.
+ * Fbdev does not arbitrate client ownership of the framebuffer well.
+ * Fbcon assumes the first framebuffer it finds should be consumed for console.
+ * It's not clear what the future of fbdev is, given the rise of KMS/DRM.
+
+How to use it?
+==============
+
+Udlfb, when loaded as a module, will match against all USB 2.0 generation
+DisplayLink chips (Alex and Ollie family). It will then attempt to read the EDID
+of the monitor, and set the best common mode between the DisplayLink device
+and the monitor's capabilities.
+
+If the DisplayLink device is successful, it will paint a "green screen" which
+means that from a hardware and fbdev software perspective, everything is good.
+
+At that point, a /dev/fb? interface will be present for user-mode applications
+to open and begin writing to the framebuffer of the DisplayLink device using
+standard fbdev calls.  Note that if mmap() is used, by default the user mode
+application must send down damage notifications to trigger repaints of the
+changed regions.  Alternatively, udlfb can be recompiled with experimental
+defio support enabled, to support a page-fault based detection mechanism
+that can work without explicit notification.
+
+The most common client of udlfb is xf86-video-displaylink or a modified
+xf86-video-fbdev X server. These servers have no real DisplayLink specific
+code. They write to the standard framebuffer interface and rely on udlfb
+to do its thing.  The one extra feature they have is the ability to report
+rectangles from the X DAMAGE protocol extension down to udlfb via udlfb's
+damage interface (which will hopefully be standardized for all virtual
+framebuffers that need damage info). These damage notifications allow
+udlfb to efficiently process the changed pixels.
+
+Module Options
+==============
+
+Special configuration for udlfb is usually unnecessary. There are a few
+options, however.
+
+From the command line, pass options to modprobe
+modprobe udlfb fb_defio=0 console=1 shadow=1
+
+Or modify options on the fly at /sys/module/udlfb/parameters directory via
+sudo nano fb_defio
+change the parameter in place, and save the file.
+
+Unplug/replug USB device to apply with new settings
+
+Or for permanent option, create file like /etc/modprobe.d/udlfb.conf with text
+options udlfb fb_defio=0 console=1 shadow=1
+
+Accepted boolean options:
+
+=============== ================================================================
+fb_defio	Make use of the fb_defio (CONFIG_FB_DEFERRED_IO) kernel
+		module to track changed areas of the framebuffer by page faults.
+		Standard fbdev applications that use mmap but that do not
+		report damage, should be able to work with this enabled.
+		Disable when running with X server that supports reporting
+		changed regions via ioctl, as this method is simpler,
+		more stable, and higher performance.
+		default: fb_defio=1
+
+console		Allow fbcon to attach to udlfb provided framebuffers.
+		Can be disabled if fbcon and other clients
+		(e.g. X with --shared-vt) are in conflict.
+		default: console=1
+
+shadow		Allocate a 2nd framebuffer to shadow what's currently across
+		the USB bus in device memory. If any pixels are unchanged,
+		do not transmit. Spends host memory to save USB transfers.
+		Enabled by default. Only disable on very low memory systems.
+		default: shadow=1
+=============== ================================================================
+
+Sysfs Attributes
+================
+
+Udlfb creates several files in /sys/class/graphics/fb?
+Where ? is the sequential framebuffer id of the particular DisplayLink device
+
+======================== ========================================================
+edid			 If a valid EDID blob is written to this file (typically
+			 by a udev rule), then udlfb will use this EDID as a
+			 backup in case reading the actual EDID of the monitor
+			 attached to the DisplayLink device fails. This is
+			 especially useful for fixed panels, etc. that cannot
+			 communicate their capabilities via EDID. Reading
+			 this file returns the current EDID of the attached
+			 monitor (or last backup value written). This is
+			 useful to get the EDID of the attached monitor,
+			 which can be passed to utilities like parse-edid.
+
+metrics_bytes_rendered	 32-bit count of pixel bytes rendered
+
+metrics_bytes_identical  32-bit count of how many of those bytes were found to be
+			 unchanged, based on a shadow framebuffer check
+
+metrics_bytes_sent	 32-bit count of how many bytes were transferred over
+			 USB to communicate the resulting changed pixels to the
+			 hardware. Includes compression and protocol overhead
+
+metrics_cpu_kcycles_used 32-bit count of CPU cycles used in processing the
+			 above pixels (in thousands of cycles).
+
+metrics_reset		 Write-only. Any write to this file resets all metrics
+			 above to zero.  Note that the 32-bit counters above
+			 roll over very quickly. To get reliable results, design
+			 performance tests to start and finish in a very short
+			 period of time (one minute or less is safe).
+======================== ========================================================
+
+Bernie Thompson <bernie@plugable.com>
diff --git a/Documentation/fb/udlfb.txt b/Documentation/fb/udlfb.txt
deleted file mode 100644
index c985cb65dd06..000000000000
--- a/Documentation/fb/udlfb.txt
+++ /dev/null
@@ -1,159 +0,0 @@
-
-What is udlfb?
-===============
-
-This is a driver for DisplayLink USB 2.0 era graphics chips.
-
-DisplayLink chips provide simple hline/blit operations with some compression,
-pairing that with a hardware framebuffer (16MB) on the other end of the
-USB wire.  That hardware framebuffer is able to drive the VGA, DVI, or HDMI
-monitor with no CPU involvement until a pixel has to change.
-
-The CPU or other local resource does all the rendering; optionally compares the
-result with a local shadow of the remote hardware framebuffer to identify
-the minimal set of pixels that have changed; and compresses and sends those
-pixels line-by-line via USB bulk transfers.
-
-Because of the efficiency of bulk transfers and a protocol on top that
-does not require any acks - the effect is very low latency that
-can support surprisingly high resolutions with good performance for
-non-gaming and non-video applications.
-
-Mode setting, EDID read, etc are other bulk or control transfers. Mode
-setting is very flexible - able to set nearly arbitrary modes from any timing.
-
-Advantages of USB graphics in general:
-
- * Ability to add a nearly arbitrary number of displays to any USB 2.0
-   capable system. On Linux, number of displays is limited by fbdev interface
-   (FB_MAX is currently 32). Of course, all USB devices on the same
-   host controller share the same 480Mbs USB 2.0 interface.
-
-Advantages of supporting DisplayLink chips with kernel framebuffer interface:
-
- * The actual hardware functionality of DisplayLink chips matches nearly
-   one-to-one with the fbdev interface, making the driver quite small and
-   tight relative to the functionality it provides.
- * X servers and other applications can use the standard fbdev interface
-   from user mode to talk to the device, without needing to know anything
-   about USB or DisplayLink's protocol at all. A "displaylink" X driver
-   and a slightly modified "fbdev" X driver are among those that already do.
-
-Disadvantages:
-
- * Fbdev's mmap interface assumes a real hardware framebuffer is mapped.
-   In the case of USB graphics, it is just an allocated (virtual) buffer.
-   Writes need to be detected and encoded into USB bulk transfers by the CPU.
-   Accurate damage/changed area notifications work around this problem.
-   In the future, hopefully fbdev will be enhanced with an small standard
-   interface to allow mmap clients to report damage, for the benefit
-   of virtual or remote framebuffers.
- * Fbdev does not arbitrate client ownership of the framebuffer well.
- * Fbcon assumes the first framebuffer it finds should be consumed for console.
- * It's not clear what the future of fbdev is, given the rise of KMS/DRM.
-
-How to use it?
-==============
-
-Udlfb, when loaded as a module, will match against all USB 2.0 generation
-DisplayLink chips (Alex and Ollie family). It will then attempt to read the EDID
-of the monitor, and set the best common mode between the DisplayLink device
-and the monitor's capabilities.
-
-If the DisplayLink device is successful, it will paint a "green screen" which
-means that from a hardware and fbdev software perspective, everything is good.
-
-At that point, a /dev/fb? interface will be present for user-mode applications
-to open and begin writing to the framebuffer of the DisplayLink device using
-standard fbdev calls.  Note that if mmap() is used, by default the user mode
-application must send down damage notifications to trigger repaints of the
-changed regions.  Alternatively, udlfb can be recompiled with experimental
-defio support enabled, to support a page-fault based detection mechanism
-that can work without explicit notification.
-
-The most common client of udlfb is xf86-video-displaylink or a modified
-xf86-video-fbdev X server. These servers have no real DisplayLink specific
-code. They write to the standard framebuffer interface and rely on udlfb
-to do its thing.  The one extra feature they have is the ability to report
-rectangles from the X DAMAGE protocol extension down to udlfb via udlfb's
-damage interface (which will hopefully be standardized for all virtual
-framebuffers that need damage info). These damage notifications allow
-udlfb to efficiently process the changed pixels.
-
-Module Options
-==============
-
-Special configuration for udlfb is usually unnecessary. There are a few
-options, however.
-
-From the command line, pass options to modprobe
-modprobe udlfb fb_defio=0 console=1 shadow=1
-
-Or modify options on the fly at /sys/module/udlfb/parameters directory via
-sudo nano fb_defio
-change the parameter in place, and save the file.
-
-Unplug/replug USB device to apply with new settings
-
-Or for permanent option, create file like /etc/modprobe.d/udlfb.conf with text
-options udlfb fb_defio=0 console=1 shadow=1
-
-Accepted boolean options:
-
-fb_defio	Make use of the fb_defio (CONFIG_FB_DEFERRED_IO) kernel
-		module to track changed areas of the framebuffer by page faults.
-		Standard fbdev applications that use mmap but that do not
-		report damage, should be able to work with this enabled.
-		Disable when running with X server that supports reporting
-		changed regions via ioctl, as this method is simpler,
-		more stable, and higher performance.
-		default: fb_defio=1
-
-console	Allow fbcon to attach to udlfb provided framebuffers.
-		Can be disabled if fbcon and other clients
-		(e.g. X with --shared-vt) are in conflict.
-		default: console=1
-
-shadow		Allocate a 2nd framebuffer to shadow what's currently across
-		the USB bus in device memory. If any pixels are unchanged,
-		do not transmit. Spends host memory to save USB transfers.
-		Enabled by default. Only disable on very low memory systems.
-		default: shadow=1
-
-Sysfs Attributes
-================
-
-Udlfb creates several files in /sys/class/graphics/fb?
-Where ? is the sequential framebuffer id of the particular DisplayLink device
-
-edid	       		If a valid EDID blob is written to this file (typically
-			by a udev rule), then udlfb will use this EDID as a
-			backup in case reading the actual EDID of the monitor
-			attached to the DisplayLink device fails. This is
-			especially useful for fixed panels, etc. that cannot
-			communicate their capabilities via EDID. Reading
-			this file returns the current EDID of the attached
-			monitor (or last backup value written). This is
-			useful to get the EDID of the attached monitor,
-			which can be passed to utilities like parse-edid.
-
-metrics_bytes_rendered	32-bit count of pixel bytes rendered
-
-metrics_bytes_identical 32-bit count of how many of those bytes were found to be
-			unchanged, based on a shadow framebuffer check
-
-metrics_bytes_sent	32-bit count of how many bytes were transferred over
-			USB to communicate the resulting changed pixels to the
-			hardware. Includes compression and protocol overhead
-
-metrics_cpu_kcycles_used 32-bit count of CPU cycles used in processing the
-			above pixels (in thousands of cycles).
-
-metrics_reset		Write-only. Any write to this file resets all metrics
-			above to zero.  Note that the 32-bit counters above
-			roll over very quickly. To get reliable results, design
-			performance tests to start and finish in a very short
-			period of time (one minute or less is safe).
-
---
-Bernie Thompson <bernie@plugable.com>
diff --git a/Documentation/fb/uvesafb.rst b/Documentation/fb/uvesafb.rst
new file mode 100644
index 000000000000..d1c2523fbb33
--- /dev/null
+++ b/Documentation/fb/uvesafb.rst
@@ -0,0 +1,188 @@
+==========================================================
+uvesafb - A Generic Driver for VBE2+ compliant video cards
+==========================================================
+
+1. Requirements
+---------------
+
+uvesafb should work with any video card that has a Video BIOS compliant
+with the VBE 2.0 standard.
+
+Unlike other drivers, uvesafb makes use of a userspace helper called
+v86d.  v86d is used to run the x86 Video BIOS code in a simulated and
+controlled environment.  This allows uvesafb to function on arches other
+than x86.  Check the v86d documentation for a list of currently supported
+arches.
+
+v86d source code can be downloaded from the following website:
+
+  https://github.com/mjanusz/v86d
+
+Please refer to the v86d documentation for detailed configuration and
+installation instructions.
+
+Note that the v86d userspace helper has to be available at all times in
+order for uvesafb to work properly.  If you want to use uvesafb during
+early boot, you will have to include v86d into an initramfs image, and
+either compile it into the kernel or use it as an initrd.
+
+2. Caveats and limitations
+--------------------------
+
+uvesafb is a _generic_ driver which supports a wide variety of video
+cards, but which is ultimately limited by the Video BIOS interface.
+The most important limitations are:
+
+- Lack of any type of acceleration.
+- A strict and limited set of supported video modes.  Often the native
+  or most optimal resolution/refresh rate for your setup will not work
+  with uvesafb, simply because the Video BIOS doesn't support the
+  video mode you want to use.  This can be especially painful with
+  widescreen panels, where native video modes don't have the 4:3 aspect
+  ratio, which is what most BIOS-es are limited to.
+- Adjusting the refresh rate is only possible with a VBE 3.0 compliant
+  Video BIOS.  Note that many nVidia Video BIOS-es claim to be VBE 3.0
+  compliant, while they simply ignore any refresh rate settings.
+
+3. Configuration
+----------------
+
+uvesafb can be compiled either as a module, or directly into the kernel.
+In both cases it supports the same set of configuration options, which
+are either given on the kernel command line or as module parameters, e.g.::
+
+ video=uvesafb:1024x768-32,mtrr:3,ywrap (compiled into the kernel)
+
+ # modprobe uvesafb mode_option=1024x768-32 mtrr=3 scroll=ywrap  (module)
+
+Accepted options:
+
+======= =========================================================
+ypan    Enable display panning using the VESA protected mode
+	interface.  The visible screen is just a window of the
+	video memory, console scrolling is done by changing the
+	start of the window.  This option is available on x86
+	only and is the default option on that architecture.
+
+ywrap   Same as ypan, but assumes your gfx board can wrap-around
+	the video memory (i.e. starts reading from top if it
+	reaches the end of video memory).  Faster than ypan.
+	Available on x86 only.
+
+redraw  Scroll by redrawing the affected part of the screen, this
+	is the default on non-x86.
+======= =========================================================
+
+(If you're using uvesafb as a module, the above three options are
+used a parameter of the scroll option, e.g. scroll=ypan.)
+
+=========== ====================================================================
+vgapal      Use the standard VGA registers for palette changes.
+
+pmipal      Use the protected mode interface for palette changes.
+            This is the default if the protected mode interface is
+            available.  Available on x86 only.
+
+mtrr:n      Setup memory type range registers for the framebuffer
+            where n:
+
+                - 0 - disabled (equivalent to nomtrr)
+                - 3 - write-combining (default)
+
+            Values other than 0 and 3 will result in a warning and will be
+            treated just like 3.
+
+nomtrr      Do not use memory type range registers.
+
+vremap:n
+            Remap 'n' MiB of video RAM.  If 0 or not specified, remap memory
+            according to video mode.
+
+vtotal:n    If the video BIOS of your card incorrectly determines the total
+            amount of video RAM, use this option to override the BIOS (in MiB).
+
+<mode>      The mode you want to set, in the standard modedb format.  Refer to
+            modedb.txt for a detailed description.  When uvesafb is compiled as
+            a module, the mode string should be provided as a value of the
+            'mode_option' option.
+
+vbemode:x   Force the use of VBE mode x.  The mode will only be set if it's
+            found in the VBE-provided list of supported modes.
+            NOTE: The mode number 'x' should be specified in VESA mode number
+            notation, not the Linux kernel one (eg. 257 instead of 769).
+            HINT: If you use this option because normal <mode> parameter does
+            not work for you and you use a X server, you'll probably want to
+            set the 'nocrtc' option to ensure that the video mode is properly
+            restored after console <-> X switches.
+
+nocrtc      Do not use CRTC timings while setting the video mode.  This option
+            has any effect only if the Video BIOS is VBE 3.0 compliant.  Use it
+            if you have problems with modes set the standard way.  Note that
+            using this option implies that any refresh rate adjustments will
+            be ignored and the refresh rate will stay at your BIOS default
+            (60 Hz).
+
+noedid      Do not try to fetch and use EDID-provided modes.
+
+noblank     Disable hardware blanking.
+
+v86d:path   Set path to the v86d executable. This option is only available as
+            a module parameter, and not as a part of the video= string.  If you
+            need to use it and have uvesafb built into the kernel, use
+            uvesafb.v86d="path".
+=========== ====================================================================
+
+Additionally, the following parameters may be provided.  They all override the
+EDID-provided values and BIOS defaults.  Refer to your monitor's specs to get
+the correct values for maxhf, maxvf and maxclk for your hardware.
+
+=========== ======================================
+maxhf:n     Maximum horizontal frequency (in kHz).
+maxvf:n     Maximum vertical frequency (in Hz).
+maxclk:n    Maximum pixel clock (in MHz).
+=========== ======================================
+
+4. The sysfs interface
+----------------------
+
+uvesafb provides several sysfs nodes for configurable parameters and
+additional information.
+
+Driver attributes:
+
+/sys/bus/platform/drivers/uvesafb
+  v86d
+    (default: /sbin/v86d)
+
+    Path to the v86d executable. v86d is started by uvesafb
+    if an instance of the daemon isn't already running.
+
+Device attributes:
+
+/sys/bus/platform/drivers/uvesafb/uvesafb.0
+  nocrtc
+    Use the default refresh rate (60 Hz) if set to 1.
+
+  oem_product_name, oem_product_rev, oem_string, oem_vendor
+    Information about the card and its maker.
+
+  vbe_modes
+    A list of video modes supported by the Video BIOS along with their
+    VBE mode numbers in hex.
+
+  vbe_version
+    A BCD value indicating the implemented VBE standard.
+
+5. Miscellaneous
+----------------
+
+Uvesafb will set a video mode with the default refresh rate and timings
+from the Video BIOS if you set pixclock to 0 in fb_var_screeninfo.
+
+
+
+ Michal Januszewski <spock@gentoo.org>
+
+ Last updated: 2017-10-10
+
+ Documentation of the uvesafb options is loosely based on vesafb.txt.
diff --git a/Documentation/fb/uvesafb.txt b/Documentation/fb/uvesafb.txt
deleted file mode 100644
index aa924196c366..000000000000
--- a/Documentation/fb/uvesafb.txt
+++ /dev/null
@@ -1,184 +0,0 @@
-
-uvesafb - A Generic Driver for VBE2+ compliant video cards
-==========================================================
-
-1. Requirements
----------------
-
-uvesafb should work with any video card that has a Video BIOS compliant
-with the VBE 2.0 standard.
-
-Unlike other drivers, uvesafb makes use of a userspace helper called
-v86d.  v86d is used to run the x86 Video BIOS code in a simulated and
-controlled environment.  This allows uvesafb to function on arches other
-than x86.  Check the v86d documentation for a list of currently supported
-arches.
-
-v86d source code can be downloaded from the following website:
-
-  https://github.com/mjanusz/v86d
-
-Please refer to the v86d documentation for detailed configuration and
-installation instructions.
-
-Note that the v86d userspace helper has to be available at all times in
-order for uvesafb to work properly.  If you want to use uvesafb during
-early boot, you will have to include v86d into an initramfs image, and
-either compile it into the kernel or use it as an initrd.
-
-2. Caveats and limitations
---------------------------
-
-uvesafb is a _generic_ driver which supports a wide variety of video
-cards, but which is ultimately limited by the Video BIOS interface.
-The most important limitations are:
-
-- Lack of any type of acceleration.
-- A strict and limited set of supported video modes.  Often the native
-  or most optimal resolution/refresh rate for your setup will not work
-  with uvesafb, simply because the Video BIOS doesn't support the
-  video mode you want to use.  This can be especially painful with
-  widescreen panels, where native video modes don't have the 4:3 aspect
-  ratio, which is what most BIOS-es are limited to.
-- Adjusting the refresh rate is only possible with a VBE 3.0 compliant
-  Video BIOS.  Note that many nVidia Video BIOS-es claim to be VBE 3.0
-  compliant, while they simply ignore any refresh rate settings.
-
-3. Configuration
-----------------
-
-uvesafb can be compiled either as a module, or directly into the kernel.
-In both cases it supports the same set of configuration options, which
-are either given on the kernel command line or as module parameters, e.g.:
-
- video=uvesafb:1024x768-32,mtrr:3,ywrap (compiled into the kernel)
-
- # modprobe uvesafb mode_option=1024x768-32 mtrr=3 scroll=ywrap  (module)
-
-Accepted options:
-
-ypan    Enable display panning using the VESA protected mode
-        interface.  The visible screen is just a window of the
-        video memory, console scrolling is done by changing the
-        start of the window.  This option is available on x86
-        only and is the default option on that architecture.
-
-ywrap   Same as ypan, but assumes your gfx board can wrap-around
-        the video memory (i.e. starts reading from top if it
-        reaches the end of video memory).  Faster than ypan.
-        Available on x86 only.
-
-redraw  Scroll by redrawing the affected part of the screen, this
-        is the default on non-x86.
-
-(If you're using uvesafb as a module, the above three options are
- used a parameter of the scroll option, e.g. scroll=ypan.)
-
-vgapal  Use the standard VGA registers for palette changes.
-
-pmipal  Use the protected mode interface for palette changes.
-        This is the default if the protected mode interface is
-        available.  Available on x86 only.
-
-mtrr:n  Setup memory type range registers for the framebuffer
-        where n:
-              0 - disabled (equivalent to nomtrr)
-              3 - write-combining (default)
-
-	Values other than 0 and 3 will result in a warning and will be
-	treated just like 3.
-
-nomtrr  Do not use memory type range registers.
-
-vremap:n
-        Remap 'n' MiB of video RAM.  If 0 or not specified, remap memory
-        according to video mode.
-
-vtotal:n
-        If the video BIOS of your card incorrectly determines the total
-        amount of video RAM, use this option to override the BIOS (in MiB).
-
-<mode>  The mode you want to set, in the standard modedb format.  Refer to
-        modedb.txt for a detailed description.  When uvesafb is compiled as
-        a module, the mode string should be provided as a value of the
-        'mode_option' option.
-
-vbemode:x
-        Force the use of VBE mode x.  The mode will only be set if it's
-        found in the VBE-provided list of supported modes.
-        NOTE: The mode number 'x' should be specified in VESA mode number
-        notation, not the Linux kernel one (eg. 257 instead of 769).
-        HINT: If you use this option because normal <mode> parameter does
-        not work for you and you use a X server, you'll probably want to
-        set the 'nocrtc' option to ensure that the video mode is properly
-        restored after console <-> X switches.
-
-nocrtc  Do not use CRTC timings while setting the video mode.  This option
-        has any effect only if the Video BIOS is VBE 3.0 compliant.  Use it
-        if you have problems with modes set the standard way.  Note that
-        using this option implies that any refresh rate adjustments will
-        be ignored and the refresh rate will stay at your BIOS default (60 Hz).
-
-noedid  Do not try to fetch and use EDID-provided modes.
-
-noblank Disable hardware blanking.
-
-v86d:path
-        Set path to the v86d executable. This option is only available as
-        a module parameter, and not as a part of the video= string.  If you
-        need to use it and have uvesafb built into the kernel, use
-        uvesafb.v86d="path".
-
-Additionally, the following parameters may be provided.  They all override the
-EDID-provided values and BIOS defaults.  Refer to your monitor's specs to get
-the correct values for maxhf, maxvf and maxclk for your hardware.
-
-maxhf:n     Maximum horizontal frequency (in kHz).
-maxvf:n     Maximum vertical frequency (in Hz).
-maxclk:n    Maximum pixel clock (in MHz).
-
-4. The sysfs interface
-----------------------
-
-uvesafb provides several sysfs nodes for configurable parameters and
-additional information.
-
-Driver attributes:
-
-/sys/bus/platform/drivers/uvesafb
-  - v86d (default: /sbin/v86d)
-    Path to the v86d executable. v86d is started by uvesafb
-    if an instance of the daemon isn't already running.
-
-Device attributes:
-
-/sys/bus/platform/drivers/uvesafb/uvesafb.0
-  - nocrtc
-    Use the default refresh rate (60 Hz) if set to 1.
-
-  - oem_product_name
-  - oem_product_rev
-  - oem_string
-  - oem_vendor
-    Information about the card and its maker.
-
-  - vbe_modes
-    A list of video modes supported by the Video BIOS along with their
-    VBE mode numbers in hex.
-
-  - vbe_version
-    A BCD value indicating the implemented VBE standard.
-
-5. Miscellaneous
-----------------
-
-Uvesafb will set a video mode with the default refresh rate and timings
-from the Video BIOS if you set pixclock to 0 in fb_var_screeninfo.
-
-
---
- Michal Januszewski <spock@gentoo.org>
- Last updated: 2017-10-10
-
- Documentation of the uvesafb options is loosely based on vesafb.txt.
-
diff --git a/Documentation/fb/vesafb.rst b/Documentation/fb/vesafb.rst
new file mode 100644
index 000000000000..6821c87b7893
--- /dev/null
+++ b/Documentation/fb/vesafb.rst
@@ -0,0 +1,192 @@
+===============
+What is vesafb?
+===============
+
+This is a generic driver for a graphic framebuffer on intel boxes.
+
+The idea is simple:  Turn on graphics mode at boot time with the help
+of the BIOS, and use this as framebuffer device /dev/fb0, like the m68k
+(and other) ports do.
+
+This means we decide at boot time whenever we want to run in text or
+graphics mode.  Switching mode later on (in protected mode) is
+impossible; BIOS calls work in real mode only.  VESA BIOS Extensions
+Version 2.0 are required, because we need a linear frame buffer.
+
+Advantages:
+
+ * It provides a nice large console (128 cols + 48 lines with 1024x768)
+   without using tiny, unreadable fonts.
+ * You can run XF68_FBDev on top of /dev/fb0 (=> non-accelerated X11
+   support for every VBE 2.0 compliant graphics board).
+ * Most important: boot logo :-)
+
+Disadvantages:
+
+ * graphic mode is slower than text mode...
+
+
+How to use it?
+==============
+
+Switching modes is done using the vga=... boot parameter.  Read
+Documentation/admin-guide/svga.rst for details.
+
+You should compile in both vgacon (for text mode) and vesafb (for
+graphics mode). Which of them takes over the console depends on
+whenever the specified mode is text or graphics.
+
+The graphic modes are NOT in the list which you get if you boot with
+vga=ask and hit return. The mode you wish to use is derived from the
+VESA mode number. Here are those VESA mode numbers:
+
+====== =======  =======  ======== =========
+colors 640x480  800x600  1024x768 1280x1024
+====== =======  =======  ======== =========
+256    0x101    0x103    0x105    0x107
+32k    0x110    0x113    0x116    0x119
+64k    0x111    0x114    0x117    0x11A
+16M    0x112    0x115    0x118    0x11B
+====== =======  =======  ======== =========
+
+
+The video mode number of the Linux kernel is the VESA mode number plus
+0x200:
+
+ Linux_kernel_mode_number = VESA_mode_number + 0x200
+
+So the table for the Kernel mode numbers are:
+
+====== =======  =======  ======== =========
+colors 640x480  800x600  1024x768 1280x1024
+====== =======  =======  ======== =========
+256    0x301    0x303    0x305    0x307
+32k    0x310    0x313    0x316    0x319
+64k    0x311    0x314    0x317    0x31A
+16M    0x312    0x315    0x318    0x31B
+====== =======  =======  ======== =========
+
+To enable one of those modes you have to specify "vga=ask" in the
+lilo.conf file and rerun LILO. Then you can type in the desired
+mode at the "vga=ask" prompt. For example if you like to use
+1024x768x256 colors you have to say "305" at this prompt.
+
+If this does not work, this might be because your BIOS does not support
+linear framebuffers or because it does not support this mode at all.
+Even if your board does, it might be the BIOS which does not.  VESA BIOS
+Extensions v2.0 are required, 1.2 is NOT sufficient.  You will get a
+"bad mode number" message if something goes wrong.
+
+1. Note: LILO cannot handle hex, for booting directly with
+   "vga=mode-number" you have to transform the numbers to decimal.
+2. Note: Some newer versions of LILO appear to work with those hex values,
+   if you set the 0x in front of the numbers.
+
+X11
+===
+
+XF68_FBDev should work just fine, but it is non-accelerated.  Running
+another (accelerated) X-Server like XF86_SVGA might or might not work.
+It depends on X-Server and graphics board.
+
+The X-Server must restore the video mode correctly, else you end up
+with a broken console (and vesafb cannot do anything about this).
+
+
+Refresh rates
+=============
+
+There is no way to change the vesafb video mode and/or timings after
+booting linux.  If you are not happy with the 60 Hz refresh rate, you
+have these options:
+
+ * configure and load the DOS-Tools for the graphics board (if
+   available) and boot linux with loadlin.
+ * use a native driver (matroxfb/atyfb) instead if vesafb.  If none
+   is available, write a new one!
+ * VBE 3.0 might work too.  I have neither a gfx board with VBE 3.0
+   support nor the specs, so I have not checked this yet.
+
+
+Configuration
+=============
+
+The VESA BIOS provides protected mode interface for changing
+some parameters.  vesafb can use it for palette changes and
+to pan the display.  It is turned off by default because it
+seems not to work with some BIOS versions, but there are options
+to turn it on.
+
+You can pass options to vesafb using "video=vesafb:option" on
+the kernel command line.  Multiple options should be separated
+by comma, like this: "video=vesafb:ypan,inverse"
+
+Accepted options:
+
+inverse	use inverse color map
+
+========= ======================================================================
+ypan	  enable display panning using the VESA protected mode
+          interface.  The visible screen is just a window of the
+          video memory, console scrolling is done by changing the
+          start of the window.
+
+          pro:
+
+                * scrolling (fullscreen) is fast, because there is
+		  no need to copy around data.
+		* You'll get scrollback (the Shift-PgUp thing),
+		  the video memory can be used as scrollback buffer
+
+          kontra:
+
+		* scrolling only parts of the screen causes some
+		  ugly flicker effects (boot logo flickers for
+		  example).
+
+ywrap	  Same as ypan, but assumes your gfx board can wrap-around
+          the video memory (i.e. starts reading from top if it
+          reaches the end of video memory).  Faster than ypan.
+
+redraw	  Scroll by redrawing the affected part of the screen, this
+          is the safe (and slow) default.
+
+
+vgapal	  Use the standard vga registers for palette changes.
+          This is the default.
+pmipal    Use the protected mode interface for palette changes.
+
+mtrr:n	  Setup memory type range registers for the vesafb framebuffer
+          where n:
+
+              - 0 - disabled (equivalent to nomtrr) (default)
+              - 1 - uncachable
+              - 2 - write-back
+              - 3 - write-combining
+              - 4 - write-through
+
+          If you see the following in dmesg, choose the type that matches the
+          old one. In this example, use "mtrr:2".
+...
+mtrr:     type mismatch for e0000000,8000000 old: write-back new:
+	  write-combining
+...
+
+nomtrr    disable mtrr
+
+vremap:n
+          Remap 'n' MiB of video RAM. If 0 or not specified, remap memory
+          according to video mode. (2.5.66 patch/idea by Antonino Daplas
+          reversed to give override possibility (allocate more fb memory
+          than the kernel would) to 2.4 by tmb@iki.fi)
+
+vtotal:n  If the video BIOS of your card incorrectly determines the total
+          amount of video RAM, use this option to override the BIOS (in MiB).
+========= ======================================================================
+
+Have fun!
+
+Gerd Knorr <kraxel@goldbach.in-berlin.de>
+
+Minor (mostly typo) changes
+by Nico Schmoigl <schmoigl@rumms.uni-mannheim.de>
diff --git a/Documentation/fb/vesafb.txt b/Documentation/fb/vesafb.txt
deleted file mode 100644
index 413bb73235be..000000000000
--- a/Documentation/fb/vesafb.txt
+++ /dev/null
@@ -1,181 +0,0 @@
-
-What is vesafb?
-===============
-
-This is a generic driver for a graphic framebuffer on intel boxes.
-
-The idea is simple:  Turn on graphics mode at boot time with the help
-of the BIOS, and use this as framebuffer device /dev/fb0, like the m68k
-(and other) ports do.
-
-This means we decide at boot time whenever we want to run in text or
-graphics mode.  Switching mode later on (in protected mode) is
-impossible; BIOS calls work in real mode only.  VESA BIOS Extensions
-Version 2.0 are required, because we need a linear frame buffer.
-
-Advantages:
-
- * It provides a nice large console (128 cols + 48 lines with 1024x768)
-   without using tiny, unreadable fonts.
- * You can run XF68_FBDev on top of /dev/fb0 (=> non-accelerated X11
-   support for every VBE 2.0 compliant graphics board).
- * Most important: boot logo :-)
-
-Disadvantages:
-
- * graphic mode is slower than text mode...
-
-
-How to use it?
-==============
-
-Switching modes is done using the vga=... boot parameter.  Read
-Documentation/svga.txt for details.
-
-You should compile in both vgacon (for text mode) and vesafb (for
-graphics mode). Which of them takes over the console depends on
-whenever the specified mode is text or graphics.
-
-The graphic modes are NOT in the list which you get if you boot with
-vga=ask and hit return. The mode you wish to use is derived from the
-VESA mode number. Here are those VESA mode numbers:
-
-    | 640x480  800x600  1024x768 1280x1024
-----+-------------------------------------
-256 |  0x101    0x103    0x105    0x107   
-32k |  0x110    0x113    0x116    0x119   
-64k |  0x111    0x114    0x117    0x11A   
-16M |  0x112    0x115    0x118    0x11B   
-
-The video mode number of the Linux kernel is the VESA mode number plus
-0x200.
- 
- Linux_kernel_mode_number = VESA_mode_number + 0x200
-
-So the table for the Kernel mode numbers are:
-
-    | 640x480  800x600  1024x768 1280x1024
-----+-------------------------------------
-256 |  0x301    0x303    0x305    0x307   
-32k |  0x310    0x313    0x316    0x319   
-64k |  0x311    0x314    0x317    0x31A   
-16M |  0x312    0x315    0x318    0x31B   
-
-To enable one of those modes you have to specify "vga=ask" in the
-lilo.conf file and rerun LILO. Then you can type in the desired
-mode at the "vga=ask" prompt. For example if you like to use 
-1024x768x256 colors you have to say "305" at this prompt.
-
-If this does not work, this might be because your BIOS does not support
-linear framebuffers or because it does not support this mode at all.
-Even if your board does, it might be the BIOS which does not.  VESA BIOS
-Extensions v2.0 are required, 1.2 is NOT sufficient.  You will get a
-"bad mode number" message if something goes wrong.
-
-1. Note: LILO cannot handle hex, for booting directly with 
-         "vga=mode-number" you have to transform the numbers to decimal.
-2. Note: Some newer versions of LILO appear to work with those hex values,
-         if you set the 0x in front of the numbers.
-
-X11
-===
-
-XF68_FBDev should work just fine, but it is non-accelerated.  Running
-another (accelerated) X-Server like XF86_SVGA might or might not work.
-It depends on X-Server and graphics board.
-
-The X-Server must restore the video mode correctly, else you end up
-with a broken console (and vesafb cannot do anything about this).
-
-
-Refresh rates
-=============
-
-There is no way to change the vesafb video mode and/or timings after
-booting linux.  If you are not happy with the 60 Hz refresh rate, you
-have these options:
-
- * configure and load the DOS-Tools for the graphics board (if
-   available) and boot linux with loadlin.
- * use a native driver (matroxfb/atyfb) instead if vesafb.  If none
-   is available, write a new one!
- * VBE 3.0 might work too.  I have neither a gfx board with VBE 3.0
-   support nor the specs, so I have not checked this yet.
-
-
-Configuration
-=============
-
-The VESA BIOS provides protected mode interface for changing
-some parameters.  vesafb can use it for palette changes and
-to pan the display.  It is turned off by default because it
-seems not to work with some BIOS versions, but there are options
-to turn it on.
-
-You can pass options to vesafb using "video=vesafb:option" on
-the kernel command line.  Multiple options should be separated
-by comma, like this: "video=vesafb:ypan,inverse"
-
-Accepted options:
-
-inverse	use inverse color map
-
-ypan	enable display panning using the VESA protected mode 
-	interface.  The visible screen is just a window of the
-	video memory, console scrolling is done by changing the
-	start of the window.
-	pro:	* scrolling (fullscreen) is fast, because there is
-		  no need to copy around data.
-		* You'll get scrollback (the Shift-PgUp thing),
-		  the video memory can be used as scrollback buffer
-	kontra: * scrolling only parts of the screen causes some
-		  ugly flicker effects (boot logo flickers for
-		  example).
-
-ywrap	Same as ypan, but assumes your gfx board can wrap-around 
-	the video memory (i.e. starts reading from top if it
-	reaches the end of video memory).  Faster than ypan.
-
-redraw	scroll by redrawing the affected part of the screen, this
-	is the safe (and slow) default.
-
-
-vgapal	Use the standard vga registers for palette changes.
-	This is the default.
-pmipal	Use the protected mode interface for palette changes.
-
-mtrr:n	setup memory type range registers for the vesafb framebuffer
-	where n:
-	      0 - disabled (equivalent to nomtrr) (default)
-	      1 - uncachable
-	      2 - write-back
-	      3 - write-combining
-	      4 - write-through
-
-	If you see the following in dmesg, choose the type that matches the
-	old one. In this example, use "mtrr:2".
-...
-mtrr: type mismatch for e0000000,8000000 old: write-back new: write-combining
-...
-
-nomtrr  disable mtrr
-
-vremap:n
-        remap 'n' MiB of video RAM. If 0 or not specified, remap memory
-	according to video mode. (2.5.66 patch/idea by Antonino Daplas
-	reversed to give override possibility (allocate more fb memory
-	than the kernel would) to 2.4 by tmb@iki.fi)
-
-vtotal:n
-        if the video BIOS of your card incorrectly determines the total
-        amount of video RAM, use this option to override the BIOS (in MiB).
-
-Have fun!
-
-  Gerd
-
---
-Gerd Knorr <kraxel@goldbach.in-berlin.de>
-
-Minor (mostly typo) changes 
-by Nico Schmoigl <schmoigl@rumms.uni-mannheim.de>
diff --git a/Documentation/fb/viafb.rst b/Documentation/fb/viafb.rst
new file mode 100644
index 000000000000..8eb7a3bb068c
--- /dev/null
+++ b/Documentation/fb/viafb.rst
@@ -0,0 +1,297 @@
+=======================================================
+VIA Integration Graphic Chip Console Framebuffer Driver
+=======================================================
+
+Platform
+--------
+    The console framebuffer driver is for graphics chips of
+    VIA UniChrome Family
+    (CLE266, PM800 / CN400 / CN300,
+    P4M800CE / P4M800Pro / CN700 / VN800,
+    CX700 / VX700, K8M890, P4M890,
+    CN896 / P4M900, VX800, VX855)
+
+Driver features
+---------------
+    Device: CRT, LCD, DVI
+
+    Support viafb_mode::
+
+	CRT:
+	    640x480(60, 75, 85, 100, 120 Hz), 720x480(60 Hz),
+	    720x576(60 Hz), 800x600(60, 75, 85, 100, 120 Hz),
+	    848x480(60 Hz), 856x480(60 Hz), 1024x512(60 Hz),
+	    1024x768(60, 75, 85, 100 Hz), 1152x864(75 Hz),
+	    1280x768(60 Hz), 1280x960(60 Hz), 1280x1024(60, 75, 85 Hz),
+	    1440x1050(60 Hz), 1600x1200(60, 75 Hz), 1280x720(60 Hz),
+	    1920x1080(60 Hz), 1400x1050(60 Hz), 800x480(60 Hz)
+
+    color depth: 8 bpp, 16 bpp, 32 bpp supports.
+
+    Support 2D hardware accelerator.
+
+Using the viafb module
+----------------------
+    Start viafb with default settings::
+
+	#modprobe viafb
+
+    Start viafb with user options::
+
+	#modprobe viafb viafb_mode=800x600 viafb_bpp=16 viafb_refresh=60
+		  viafb_active_dev=CRT+DVI viafb_dvi_port=DVP1
+		  viafb_mode1=1024x768 viafb_bpp=16 viafb_refresh1=60
+		  viafb_SAMM_ON=1
+
+    viafb_mode:
+	- 640x480 (default)
+	- 720x480
+	- 800x600
+	- 1024x768
+
+    viafb_bpp:
+	- 8, 16, 32 (default:32)
+
+    viafb_refresh:
+	- 60, 75, 85, 100, 120 (default:60)
+
+    viafb_lcd_dsp_method:
+	- 0 : expansion (default)
+	- 1 : centering
+
+    viafb_lcd_mode:
+	0 : LCD panel with LSB data format input (default)
+	1 : LCD panel with MSB data format input
+
+    viafb_lcd_panel_id:
+	- 0 : Resolution: 640x480, Channel: single, Dithering: Enable
+	- 1 : Resolution: 800x600, Channel: single, Dithering: Enable
+	- 2 : Resolution: 1024x768, Channel: single, Dithering: Enable (default)
+	- 3 : Resolution: 1280x768, Channel: single, Dithering: Enable
+	- 4 : Resolution: 1280x1024, Channel: dual, Dithering: Enable
+	- 5 : Resolution: 1400x1050, Channel: dual, Dithering: Enable
+	- 6 : Resolution: 1600x1200, Channel: dual, Dithering: Enable
+
+	- 8 : Resolution: 800x480, Channel: single, Dithering: Enable
+	- 9 : Resolution: 1024x768, Channel: dual, Dithering: Enable
+	- 10: Resolution: 1024x768, Channel: single, Dithering: Disable
+	- 11: Resolution: 1024x768, Channel: dual, Dithering: Disable
+	- 12: Resolution: 1280x768, Channel: single, Dithering: Disable
+	- 13: Resolution: 1280x1024, Channel: dual, Dithering: Disable
+	- 14: Resolution: 1400x1050, Channel: dual, Dithering: Disable
+	- 15: Resolution: 1600x1200, Channel: dual, Dithering: Disable
+	- 16: Resolution: 1366x768, Channel: single, Dithering: Disable
+	- 17: Resolution: 1024x600, Channel: single, Dithering: Enable
+	- 18: Resolution: 1280x768, Channel: dual, Dithering: Enable
+	- 19: Resolution: 1280x800, Channel: single, Dithering: Enable
+
+    viafb_accel:
+	- 0 : No 2D Hardware Acceleration
+	- 1 : 2D Hardware Acceleration (default)
+
+    viafb_SAMM_ON:
+	- 0 : viafb_SAMM_ON disable (default)
+	- 1 : viafb_SAMM_ON enable
+
+    viafb_mode1: (secondary display device)
+	- 640x480 (default)
+	- 720x480
+	- 800x600
+	- 1024x768
+
+    viafb_bpp1: (secondary display device)
+	- 8, 16, 32 (default:32)
+
+    viafb_refresh1: (secondary display device)
+	- 60, 75, 85, 100, 120 (default:60)
+
+    viafb_active_dev:
+	This option is used to specify active devices.(CRT, DVI, CRT+LCD...)
+	DVI stands for DVI or HDMI, E.g., If you want to enable HDMI,
+	set viafb_active_dev=DVI. In SAMM case, the previous of
+	viafb_active_dev is primary device, and the following is
+	secondary device.
+
+	For example:
+
+	To enable one device, such as DVI only, we can use::
+
+	    modprobe viafb viafb_active_dev=DVI
+
+	To enable two devices, such as CRT+DVI::
+
+	    modprobe viafb viafb_active_dev=CRT+DVI;
+
+	For DuoView case, we can use::
+
+	    modprobe viafb viafb_active_dev=CRT+DVI
+
+	OR::
+
+	    modprobe viafb viafb_active_dev=DVI+CRT...
+
+	For SAMM case:
+
+	If CRT is primary and DVI is secondary, we should use::
+
+	    modprobe viafb viafb_active_dev=CRT+DVI viafb_SAMM_ON=1...
+
+	If DVI is primary and CRT is secondary, we should use::
+
+	    modprobe viafb viafb_active_dev=DVI+CRT viafb_SAMM_ON=1...
+
+    viafb_display_hardware_layout:
+	This option is used to specify display hardware layout for CX700 chip.
+
+	- 1 : LCD only
+	- 2 : DVI only
+	- 3 : LCD+DVI (default)
+	- 4 : LCD1+LCD2 (internal + internal)
+	- 16: LCD1+ExternalLCD2 (internal + external)
+
+    viafb_second_size:
+	This option is used to set second device memory size(MB) in SAMM case.
+	The minimal size is 16.
+
+    viafb_platform_epia_dvi:
+	This option is used to enable DVI on EPIA - M
+
+	- 0 : No DVI on EPIA - M (default)
+	- 1 : DVI on EPIA - M
+
+    viafb_bus_width:
+	When using 24 - Bit Bus Width Digital Interface,
+	this option should be set.
+
+	- 12: 12-Bit LVDS or 12-Bit TMDS (default)
+	- 24: 24-Bit LVDS or 24-Bit TMDS
+
+    viafb_device_lcd_dualedge:
+	When using Dual Edge Panel, this option should be set.
+
+	- 0 : No Dual Edge Panel (default)
+	- 1 : Dual Edge Panel
+
+    viafb_lcd_port:
+	This option is used to specify LCD output port,
+	available values are "DVP0" "DVP1" "DFP_HIGHLOW" "DFP_HIGH" "DFP_LOW".
+
+	for external LCD + external DVI on CX700(External LCD is on DVP0),
+	we should use::
+
+	    modprobe viafb viafb_lcd_port=DVP0...
+
+Notes:
+    1. CRT may not display properly for DuoView CRT & DVI display at
+       the "640x480" PAL mode with DVI overscan enabled.
+    2. SAMM stands for single adapter multi monitors. It is different from
+       multi-head since SAMM support multi monitor at driver layers, thus fbcon
+       layer doesn't even know about it; SAMM's second screen doesn't have a
+       device node file, thus a user mode application can't access it directly.
+       When SAMM is enabled, viafb_mode and viafb_mode1, viafb_bpp and
+       viafb_bpp1, viafb_refresh and viafb_refresh1 can be different.
+    3. When console is depending on viafbinfo1, dynamically change resolution
+       and bpp, need to call VIAFB specified ioctl interface VIAFB_SET_DEVICE
+       instead of calling common ioctl function FBIOPUT_VSCREENINFO since
+       viafb doesn't support multi-head well, or it will cause screen crush.
+
+
+Configure viafb with "fbset" tool
+---------------------------------
+
+    "fbset" is an inbox utility of Linux.
+
+    1. Inquire current viafb information, type::
+
+	   # fbset -i
+
+    2. Set various resolutions and viafb_refresh rates::
+
+	   # fbset <resolution-vertical_sync>
+
+       example::
+
+	   # fbset "1024x768-75"
+
+       or::
+
+	   # fbset -g 1024 768 1024 768 32
+
+       Check the file "/etc/fb.modes" to find display modes available.
+
+    3. Set the color depth::
+
+	   # fbset -depth <value>
+
+       example::
+
+	   # fbset -depth 16
+
+
+Configure viafb via /proc
+-------------------------
+    The following files exist in /proc/viafb
+
+    supported_output_devices
+	This read-only file contains a full ',' separated list containing all
+	output devices that could be available on your platform. It is likely
+	that not all of those have a connector on your hardware but it should
+	provide a good starting point to figure out which of those names match
+	a real connector.
+
+	Example::
+
+		# cat /proc/viafb/supported_output_devices
+
+    iga1/output_devices, iga2/output_devices
+	These two files are readable and writable. iga1 and iga2 are the two
+	independent units that produce the screen image. Those images can be
+	forwarded to one or more output devices. Reading those files is a way
+	to query which output devices are currently used by an iga.
+
+	Example::
+
+		# cat /proc/viafb/iga1/output_devices
+
+	If there are no output devices printed the output of this iga is lost.
+	This can happen for example if only one (the other) iga is used.
+	Writing to these files allows adjusting the output devices during
+	runtime. One can add new devices, remove existing ones or switch
+	between igas. Essentially you can write a ',' separated list of device
+	names (or a single one) in the same format as the output to those
+	files. You can add a '+' or '-' as a prefix allowing simple addition
+	and removal of devices. So a prefix '+' adds the devices from your list
+	to the already existing ones, '-' removes the listed devices from the
+	existing ones and if no prefix is given it replaces all existing ones
+	with the listed ones. If you remove devices they are expected to turn
+	off. If you add devices that are already part of the other iga they are
+	removed there and added to the new one.
+
+	Examples:
+
+	Add CRT as output device to iga1::
+
+		# echo +CRT > /proc/viafb/iga1/output_devices
+
+	Remove (turn off) DVP1 and LVDS1 as output devices of iga2::
+
+		# echo -DVP1,LVDS1 > /proc/viafb/iga2/output_devices
+
+	Replace all iga1 output devices by CRT::
+
+		# echo CRT > /proc/viafb/iga1/output_devices
+
+
+Bootup with viafb
+-----------------
+
+Add the following line to your grub.conf::
+
+    append = "video=viafb:viafb_mode=1024x768,viafb_bpp=32,viafb_refresh=85"
+
+
+VIA Framebuffer modes
+=====================
+
+.. include:: viafb.modes
+   :literal:
diff --git a/Documentation/fb/viafb.txt b/Documentation/fb/viafb.txt
deleted file mode 100644
index 1cb2462a71ce..000000000000
--- a/Documentation/fb/viafb.txt
+++ /dev/null
@@ -1,252 +0,0 @@
-
-        VIA Integration Graphic Chip Console Framebuffer Driver
-
-[Platform]
------------------------
-    The console framebuffer driver is for graphics chips of
-    VIA UniChrome Family(CLE266, PM800 / CN400 / CN300,
-                        P4M800CE / P4M800Pro / CN700 / VN800,
-                        CX700 / VX700, K8M890, P4M890,
-                        CN896 / P4M900, VX800, VX855)
-
-[Driver features]
-------------------------
-    Device: CRT, LCD, DVI
-
-    Support viafb_mode:
-        CRT:
-            640x480(60, 75, 85, 100, 120 Hz), 720x480(60 Hz),
-            720x576(60 Hz), 800x600(60, 75, 85, 100, 120 Hz),
-            848x480(60 Hz), 856x480(60 Hz), 1024x512(60 Hz),
-            1024x768(60, 75, 85, 100 Hz), 1152x864(75 Hz),
-            1280x768(60 Hz), 1280x960(60 Hz), 1280x1024(60, 75, 85 Hz),
-            1440x1050(60 Hz), 1600x1200(60, 75 Hz), 1280x720(60 Hz),
-            1920x1080(60 Hz), 1400x1050(60 Hz), 800x480(60 Hz)
-
-    color depth: 8 bpp, 16 bpp, 32 bpp supports.
-
-    Support 2D hardware accelerator.
-
-[Using the viafb module]
--- -- --------------------
-    Start viafb with default settings:
-        #modprobe viafb
-
-    Start viafb with user options:
-        #modprobe viafb viafb_mode=800x600 viafb_bpp=16 viafb_refresh=60
-                  viafb_active_dev=CRT+DVI viafb_dvi_port=DVP1
-                  viafb_mode1=1024x768 viafb_bpp=16 viafb_refresh1=60
-                  viafb_SAMM_ON=1
-
-    viafb_mode:
-        640x480 (default)
-        720x480
-        800x600
-        1024x768
-        ......
-
-    viafb_bpp:
-        8, 16, 32 (default:32)
-
-    viafb_refresh:
-        60, 75, 85, 100, 120 (default:60)
-
-    viafb_lcd_dsp_method:
-        0 : expansion (default)
-        1 : centering
-
-    viafb_lcd_mode:
-        0 : LCD panel with LSB data format input (default)
-        1 : LCD panel with MSB data format input
-
-    viafb_lcd_panel_id:
-        0 : Resolution: 640x480, Channel: single, Dithering: Enable
-        1 : Resolution: 800x600, Channel: single, Dithering: Enable
-        2 : Resolution: 1024x768, Channel: single, Dithering: Enable (default)
-        3 : Resolution: 1280x768, Channel: single, Dithering: Enable
-        4 : Resolution: 1280x1024, Channel: dual, Dithering: Enable
-        5 : Resolution: 1400x1050, Channel: dual, Dithering: Enable
-        6 : Resolution: 1600x1200, Channel: dual, Dithering: Enable
-
-        8 : Resolution: 800x480, Channel: single, Dithering: Enable
-        9 : Resolution: 1024x768, Channel: dual, Dithering: Enable
-        10: Resolution: 1024x768, Channel: single, Dithering: Disable
-        11: Resolution: 1024x768, Channel: dual, Dithering: Disable
-        12: Resolution: 1280x768, Channel: single, Dithering: Disable
-        13: Resolution: 1280x1024, Channel: dual, Dithering: Disable
-        14: Resolution: 1400x1050, Channel: dual, Dithering: Disable
-        15: Resolution: 1600x1200, Channel: dual, Dithering: Disable
-        16: Resolution: 1366x768, Channel: single, Dithering: Disable
-        17: Resolution: 1024x600, Channel: single, Dithering: Enable
-        18: Resolution: 1280x768, Channel: dual, Dithering: Enable
-        19: Resolution: 1280x800, Channel: single, Dithering: Enable
-
-    viafb_accel:
-        0 : No 2D Hardware Acceleration
-        1 : 2D Hardware Acceleration (default)
-
-    viafb_SAMM_ON:
-        0 : viafb_SAMM_ON disable (default)
-        1 : viafb_SAMM_ON enable
-
-    viafb_mode1: (secondary display device)
-        640x480 (default)
-        720x480
-        800x600
-        1024x768
-        ... ...
-
-    viafb_bpp1: (secondary display device)
-        8, 16, 32 (default:32)
-
-    viafb_refresh1: (secondary display device)
-        60, 75, 85, 100, 120 (default:60)
-
-    viafb_active_dev:
-        This option is used to specify active devices.(CRT, DVI, CRT+LCD...)
-        DVI stands for DVI or HDMI, E.g., If you want to enable HDMI,
-        set viafb_active_dev=DVI. In SAMM case, the previous of
-        viafb_active_dev is primary device, and the following is
-        secondary device.
-
-        For example:
-        To enable one device, such as DVI only, we can use:
-            modprobe viafb viafb_active_dev=DVI
-        To enable two devices, such as CRT+DVI:
-            modprobe viafb viafb_active_dev=CRT+DVI;
-
-        For DuoView case, we can use:
-            modprobe viafb viafb_active_dev=CRT+DVI
-            OR
-            modprobe viafb viafb_active_dev=DVI+CRT...
-
-        For SAMM case:
-        If CRT is primary and DVI is secondary, we should use:
-            modprobe viafb viafb_active_dev=CRT+DVI viafb_SAMM_ON=1...
-        If DVI is primary and CRT is secondary, we should use:
-            modprobe viafb viafb_active_dev=DVI+CRT viafb_SAMM_ON=1...
-
-    viafb_display_hardware_layout:
-        This option is used to specify display hardware layout for CX700 chip.
-        1 : LCD only
-        2 : DVI only
-        3 : LCD+DVI (default)
-        4 : LCD1+LCD2 (internal + internal)
-        16: LCD1+ExternalLCD2 (internal + external)
-
-    viafb_second_size:
-        This option is used to set second device memory size(MB) in SAMM case.
-        The minimal size is 16.
-
-    viafb_platform_epia_dvi:
-        This option is used to enable DVI on EPIA - M
-        0 : No DVI on EPIA - M (default)
-        1 : DVI on EPIA - M
-
-    viafb_bus_width:
-        When using 24 - Bit Bus Width Digital Interface,
-        this option should be set.
-        12: 12-Bit LVDS or 12-Bit TMDS (default)
-        24: 24-Bit LVDS or 24-Bit TMDS
-
-    viafb_device_lcd_dualedge:
-        When using Dual Edge Panel, this option should be set.
-        0 : No Dual Edge Panel (default)
-        1 : Dual Edge Panel
-
-    viafb_lcd_port:
-        This option is used to specify LCD output port,
-        available values are "DVP0" "DVP1" "DFP_HIGHLOW" "DFP_HIGH" "DFP_LOW".
-        for external LCD + external DVI on CX700(External LCD is on DVP0),
-        we should use:
-            modprobe viafb viafb_lcd_port=DVP0...
-
-Notes:
-    1. CRT may not display properly for DuoView CRT & DVI display at
-       the "640x480" PAL mode with DVI overscan enabled.
-    2. SAMM stands for single adapter multi monitors. It is different from
-       multi-head since SAMM support multi monitor at driver layers, thus fbcon
-       layer doesn't even know about it; SAMM's second screen doesn't have a
-       device node file, thus a user mode application can't access it directly.
-       When SAMM is enabled, viafb_mode and viafb_mode1, viafb_bpp and
-       viafb_bpp1, viafb_refresh and viafb_refresh1 can be different.
-    3. When console is depending on viafbinfo1, dynamically change resolution
-       and bpp, need to call VIAFB specified ioctl interface VIAFB_SET_DEVICE
-       instead of calling common ioctl function FBIOPUT_VSCREENINFO since
-       viafb doesn't support multi-head well, or it will cause screen crush.
-
-
-[Configure viafb with "fbset" tool]
------------------------------------
-    "fbset" is an inbox utility of Linux.
-    1. Inquire current viafb information, type,
-           # fbset -i
-
-    2. Set various resolutions and viafb_refresh rates,
-           # fbset <resolution-vertical_sync>
-
-       example,
-           # fbset "1024x768-75"
-       or
-           # fbset -g 1024 768 1024 768 32
-       Check the file "/etc/fb.modes" to find display modes available.
-
-    3. Set the color depth,
-           # fbset -depth <value>
-
-       example,
-           # fbset -depth 16
-
-
-[Configure viafb via /proc]
----------------------------
-    The following files exist in /proc/viafb
-
-    supported_output_devices
-
-        This read-only file contains a full ',' separated list containing all
-        output devices that could be available on your platform. It is likely
-        that not all of those have a connector on your hardware but it should
-        provide a good starting point to figure out which of those names match
-        a real connector.
-        Example:
-        # cat /proc/viafb/supported_output_devices
-
-    iga1/output_devices
-    iga2/output_devices
-
-        These two files are readable and writable. iga1 and iga2 are the two
-        independent units that produce the screen image. Those images can be
-        forwarded to one or more output devices. Reading those files is a way
-        to query which output devices are currently used by an iga.
-        Example:
-        # cat /proc/viafb/iga1/output_devices
-        If there are no output devices printed the output of this iga is lost.
-        This can happen for example if only one (the other) iga is used.
-        Writing to these files allows adjusting the output devices during
-        runtime. One can add new devices, remove existing ones or switch
-        between igas. Essentially you can write a ',' separated list of device
-        names (or a single one) in the same format as the output to those
-        files. You can add a '+' or '-' as a prefix allowing simple addition
-        and removal of devices. So a prefix '+' adds the devices from your list
-        to the already existing ones, '-' removes the listed devices from the
-        existing ones and if no prefix is given it replaces all existing ones
-        with the listed ones. If you remove devices they are expected to turn
-        off. If you add devices that are already part of the other iga they are
-        removed there and added to the new one.
-        Examples:
-        Add CRT as output device to iga1
-        # echo +CRT > /proc/viafb/iga1/output_devices
-
-        Remove (turn off) DVP1 and LVDS1 as output devices of iga2
-        # echo -DVP1,LVDS1 > /proc/viafb/iga2/output_devices
-
-        Replace all iga1 output devices by CRT
-        # echo CRT > /proc/viafb/iga1/output_devices
-
-
-[Bootup with viafb]:
---------------------
-    Add the following line to your grub.conf:
-    append = "video=viafb:viafb_mode=1024x768,viafb_bpp=32,viafb_refresh=85"
-
diff --git a/Documentation/fb/vt8623fb.rst b/Documentation/fb/vt8623fb.rst
new file mode 100644
index 000000000000..ba1730937dd8
--- /dev/null
+++ b/Documentation/fb/vt8623fb.rst
@@ -0,0 +1,64 @@
+===============================================================
+vt8623fb - fbdev driver for graphics core in VIA VT8623 chipset
+===============================================================
+
+
+Supported Hardware
+==================
+
+VIA VT8623 [CLE266] chipset and	its graphics core
+(known as CastleRock or Unichrome)
+
+I tested vt8623fb on VIA EPIA ML-6000
+
+
+Supported Features
+==================
+
+	*  4 bpp pseudocolor modes (with 18bit palette, two variants)
+	*  8 bpp pseudocolor mode (with 18bit palette)
+	* 16 bpp truecolor mode (RGB 565)
+	* 32 bpp truecolor mode (RGB 888)
+	* text mode (activated by bpp = 0)
+	* doublescan mode variant (not available in text mode)
+	* panning in both directions
+	* suspend/resume support
+	* DPMS support
+
+Text mode is supported even in higher resolutions, but there is limitation to
+lower pixclocks (maximum about 100 MHz). This limitation is not enforced by
+driver. Text mode supports 8bit wide fonts only (hardware limitation) and
+16bit tall fonts (driver limitation).
+
+There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
+packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
+with interleaved planes (1 byte interleave), MSB first. Both modes support
+8bit wide fonts only (driver limitation).
+
+Suspend/resume works on systems that initialize video card during resume and
+if device is active (for example used by fbcon).
+
+
+Missing Features
+================
+(alias TODO list)
+
+	* secondary (not initialized by BIOS) device support
+	* MMIO support
+	* interlaced mode variant
+	* support for fontwidths != 8 in 4 bpp modes
+	* support for fontheight != 16 in text mode
+	* hardware cursor
+	* video overlay support
+	* vsync synchronization
+	* acceleration support (8514-like 2D, busmaster transfers)
+
+
+Known bugs
+==========
+
+	* cursor disable in text mode doesn't work
+
+
+--
+Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/fb/vt8623fb.txt b/Documentation/fb/vt8623fb.txt
deleted file mode 100644
index f654576c56b7..000000000000
--- a/Documentation/fb/vt8623fb.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-
-	vt8623fb - fbdev driver for graphics core in VIA VT8623 chipset
-	===============================================================
-
-
-Supported Hardware
-==================
-
-	VIA VT8623 [CLE266] chipset and	its graphics core
-		(known as CastleRock or Unichrome)
-
-I tested vt8623fb on VIA EPIA ML-6000
-
-
-Supported Features
-==================
-
-	*  4 bpp pseudocolor modes (with 18bit palette, two variants)
-	*  8 bpp pseudocolor mode (with 18bit palette)
-	* 16 bpp truecolor mode (RGB 565)
-	* 32 bpp truecolor mode (RGB 888)
-	* text mode (activated by bpp = 0)
-	* doublescan mode variant (not available in text mode)
-	* panning in both directions
-	* suspend/resume support
-	* DPMS support
-
-Text mode is supported even in higher resolutions, but there is limitation to
-lower pixclocks (maximum about 100 MHz). This limitation is not enforced by
-driver. Text mode supports 8bit wide fonts only (hardware limitation) and
-16bit tall fonts (driver limitation).
-
-There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
-packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
-with interleaved planes (1 byte interleave), MSB first. Both modes support
-8bit wide fonts only (driver limitation).
-
-Suspend/resume works on systems that initialize video card during resume and
-if device is active (for example used by fbcon).
-
-
-Missing Features
-================
-(alias TODO list)
-
-	* secondary (not initialized by BIOS) device support
-	* MMIO support
-	* interlaced mode variant
-	* support for fontwidths != 8 in 4 bpp modes
-	* support for fontheight != 16 in text mode
-	* hardware cursor
-	* video overlay support
-	* vsync synchronization
-	* acceleration support (8514-like 2D, busmaster transfers)
-
-
-Known bugs
-==========
-
-	* cursor disable in text mode doesn't work
-
-
---
-Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt
index 9999ea521f3e..32bbdfc64c32 100644
--- a/Documentation/features/debug/stackprotector/arch-support.txt
+++ b/Documentation/features/debug/stackprotector/arch-support.txt
@@ -22,7 +22,7 @@
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
-    |     powerpc: | TODO |
+    |     powerpc: |  ok  |
     |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: |  ok  |
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index dac435575384..204dd3ea36bb 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -361,8 +361,6 @@ so fl_release_private called on a lease should not block.
 
 ----------------------- lock_manager_operations ---------------------------
 prototypes:
-	int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
-	unsigned long (*lm_owner_key)(struct file_lock *);
 	void (*lm_notify)(struct file_lock *);  /* unblock callback */
 	int (*lm_grant)(struct file_lock *, struct file_lock *, int);
 	void (*lm_break)(struct file_lock *); /* break_lease callback */
@@ -371,23 +369,11 @@ prototypes:
 locking rules:
 
 			inode->i_lock	blocked_lock_lock	may block
-lm_compare_owner:	yes[1]		maybe			no
-lm_owner_key		yes[1]		yes			no
 lm_notify:		yes		yes			no
 lm_grant:		no		no			no
 lm_break:		yes		no			no
 lm_change		yes		no			no
 
-[1]:	->lm_compare_owner and ->lm_owner_key are generally called with
-*an* inode->i_lock held. It may not be the i_lock of the inode
-associated with either file_lock argument! This is the case with deadlock
-detection, since the code has to chase down the owners of locks that may
-be entirely unrelated to the one on which the lock is being acquired.
-For deadlock detection however, the blocked_lock_lock is also held. The
-fact that these locks are held ensures that the file_locks do not
-disappear out from under you while doing the comparison or generating an
-owner key.
-
 --------------------------- buffer_head -----------------------------------
 prototypes:
 	void (*b_end_io)(struct buffer_head *bh, int uptodate);
diff --git a/Documentation/filesystems/api-summary.rst b/Documentation/filesystems/api-summary.rst
index aa51ffcfa029..bbb0c1c0e5cf 100644
--- a/Documentation/filesystems/api-summary.rst
+++ b/Documentation/filesystems/api-summary.rst
@@ -89,9 +89,6 @@ Other Functions
 .. kernel-doc:: fs/direct-io.c
    :export:
 
-.. kernel-doc:: fs/file_table.c
-   :export:
-
 .. kernel-doc:: fs/libfs.c
    :export:
 
diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt
index 61311356025d..545262c167c3 100644
--- a/Documentation/filesystems/coda.txt
+++ b/Documentation/filesystems/coda.txt
@@ -481,7 +481,10 @@ kernel support.
 
 
 
-
+  struct coda_timespec {
+          int64_t         tv_sec;         /* seconds */
+          long            tv_nsec;        /* nanoseconds */
+  };
 
   struct coda_vattr {
           enum coda_vtype va_type;        /* vnode type (for create) */
@@ -493,9 +496,9 @@ kernel support.
           long            va_fileid;      /* file id */
           u_quad_t        va_size;        /* file size in bytes */
           long            va_blocksize;   /* blocksize preferred for i/o */
-          struct timespec va_atime;       /* time of last access */
-          struct timespec va_mtime;       /* time of last modification */
-          struct timespec va_ctime;       /* time file changed */
+          struct coda_timespec va_atime;  /* time of last access */
+          struct coda_timespec va_mtime;  /* time of last modification */
+          struct coda_timespec va_ctime;  /* time file changed */
           u_long          va_gen;         /* generation number of file */
           u_long          va_flags;       /* flags defined for file */
           dev_t           va_rdev;        /* device special file represents */
diff --git a/Documentation/filesystems/conf.py b/Documentation/filesystems/conf.py
deleted file mode 100644
index ea44172af5c4..000000000000
--- a/Documentation/filesystems/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = "Linux Filesystems API"
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'filesystems.tex', project,
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 6d2c0d340dea..679729442fd2 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -76,7 +76,7 @@ exposure of uninitialized data through mmap.
 These filesystems may be used for inspiration:
 - ext2: see Documentation/filesystems/ext2.txt
 - ext4: see Documentation/filesystems/ext4/
-- xfs:  see Documentation/filesystems/xfs.txt
+- xfs:  see Documentation/admin-guide/xfs.rst
 
 
 Handling Media Errors
diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index 4a0a9c3f4af6..9e27c843d00e 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -169,7 +169,7 @@ byte offsets over a base for the register block.
 
 If you want to dump an u32 array in debugfs, you can create file with:
 
-    struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
+    void debugfs_create_u32_array(const char *name, umode_t mode,
 			struct dentry *parent,
 			u32 *array, u32 elements);
 
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
index a19973a4dd1e..94c2cf0292f5 100644
--- a/Documentation/filesystems/ext2.txt
+++ b/Documentation/filesystems/ext2.txt
@@ -57,7 +57,13 @@ noacl				Don't support POSIX ACLs.
 
 nobh				Do not attach buffer_heads to file pagecache.
 
-grpquota,noquota,quota,usrquota	Quota options are silently ignored by ext2.
+quota, usrquota			Enable user disk quota support
+				(requires CONFIG_QUOTA).
+
+grpquota			Enable group disk quota support
+				(requires CONFIG_QUOTA).
+
+noquota option ls silently ignored by ext2.
 
 
 Specification
diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst
index 3be3e54d480d..705d813d558f 100644
--- a/Documentation/filesystems/ext4/index.rst
+++ b/Documentation/filesystems/ext4/index.rst
@@ -8,7 +8,7 @@ ext4 Data Structures and Algorithms
    :maxdepth: 6
    :numbered:
 
-   about.rst
-   overview.rst
-   globals.rst
-   dynamic.rst
+   about
+   overview
+   globals
+   dynamic
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index f7b5e4ff0de3..496fa28b2492 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -214,11 +214,22 @@ fsync_mode=%s          Control the policy of fsync. Currently supports "posix",
                        non-atomic files likewise "nobarrier" mount option.
 test_dummy_encryption  Enable dummy encryption, which provides a fake fscrypt
                        context. The fake fscrypt context is used by xfstests.
-checkpoint=%s          Set to "disable" to turn off checkpointing. Set to "enable"
+checkpoint=%s[:%u[%]]     Set to "disable" to turn off checkpointing. Set to "enable"
                        to reenable checkpointing. Is enabled by default. While
                        disabled, any unmounting or unexpected shutdowns will cause
                        the filesystem contents to appear as they did when the
                        filesystem was mounted with that option.
+                       While mounting with checkpoint=disabled, the filesystem must
+                       run garbage collection to ensure that all available space can
+                       be used. If this takes too much time, the mount may return
+                       EAGAIN. You may optionally add a value to indicate how much
+                       of the disk you would be willing to temporarily give up to
+                       avoid additional garbage collection. This can be given as a
+                       number of blocks, or as a percent. For instance, mounting
+                       with checkpoint=disable:100% would always succeed, but it may
+                       hide up to all remaining free space. The actual space that
+                       would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
+                       This space is reclaimed once checkpoint=enable.
 
 ================================================================================
 DEBUGFS ENTRIES
@@ -246,11 +257,14 @@ Files in /sys/fs/f2fs/<devname>
 ..............................................................................
  File                         Content
 
- gc_max_sleep_time            This tuning parameter controls the maximum sleep
+ gc_urgent_sleep_time         This parameter controls sleep time for gc_urgent.
+                              500 ms is set by default. See above gc_urgent.
+
+ gc_min_sleep_time            This tuning parameter controls the minimum sleep
                               time for the garbage collection thread. Time is
                               in milliseconds.
 
- gc_min_sleep_time            This tuning parameter controls the minimum sleep
+ gc_max_sleep_time            This tuning parameter controls the maximum sleep
                               time for the garbage collection thread. Time is
                               in milliseconds.
 
@@ -270,9 +284,6 @@ Files in /sys/fs/f2fs/<devname>
                               to 1, background thread starts to do GC by given
                               gc_urgent_sleep_time interval.
 
- gc_urgent_sleep_time         This parameter controls sleep time for gc_urgent.
-                              500 ms is set by default. See above gc_urgent.
-
  reclaim_segments             This parameter controls the number of prefree
                               segments to be reclaimed. If the number of prefree
 			      segments is larger than the number of segments
@@ -287,7 +298,16 @@ Files in /sys/fs/f2fs/<devname>
 			      checkpoint is triggered, and issued during the
 			      checkpoint. By default, it is disabled with 0.
 
- trim_sections                This parameter controls the number of sections
+ discard_granularity	      This parameter controls the granularity of discard
+			      command size. It will issue discard commands iif
+			      the size is larger than given granularity. Its
+			      unit size is 4KB, and 4 (=16KB) is set by default.
+			      The maximum value is 128 (=512KB).
+
+ reserved_blocks	      This parameter indicates the number of blocks that
+			      f2fs reserves internally for root.
+
+ batched_trim_sections	      This parameter controls the number of sections
                               to be trimmed out in batch mode when FITRIM
                               conducts. 32 sections is set by default.
 
@@ -309,11 +329,35 @@ Files in /sys/fs/f2fs/<devname>
 			      the number is less than this value, it triggers
 			      in-place-updates.
 
+ min_seq_blocks		      This parameter controls the threshold to serialize
+			      write IOs issued by multiple threads in parallel.
+
+ min_hot_blocks		      This parameter controls the threshold to allocate
+			      a hot data log for pending data blocks to write.
+
+ min_ssr_sections	      This parameter adds the threshold when deciding
+			      SSR block allocation. If this is large, SSR mode
+			      will be enabled early.
+
+ ram_thresh                   This parameter controls the memory footprint used
+			      by free nids and cached nat entries. By default,
+			      10 is set, which indicates 10 MB / 1 GB RAM.
+
+ ra_nid_pages		      When building free nids, F2FS reads NAT blocks
+			      ahead for speed up. Default is 0.
+
+ dirty_nats_ratio	      Given dirty ratio of cached nat entries, F2FS
+			      determines flushing them in background.
+
  max_victim_search	      This parameter controls the number of trials to
 			      find a victim segment when conducting SSR and
 			      cleaning operations. The default value is 4096
 			      which covers 8GB block address range.
 
+ migration_granularity	      For large-sized sections, F2FS can stop GC given
+			      this granularity instead of reclaiming entire
+			      section.
+
  dir_level                    This parameter controls the directory level to
 			      support large directory. If a directory has a
 			      number of files, it can reduce the file lookup
@@ -321,9 +365,53 @@ Files in /sys/fs/f2fs/<devname>
 			      Otherwise, it needs to decrease this value to
 			      reduce the space overhead. The default value is 0.
 
- ram_thresh                   This parameter controls the memory footprint used
-			      by free nids and cached nat entries. By default,
-			      10 is set, which indicates 10 MB / 1 GB RAM.
+ cp_interval		      F2FS tries to do checkpoint periodically, 60 secs
+			      by default.
+
+ idle_interval		      F2FS detects system is idle, if there's no F2FS
+			      operations during given interval, 5 secs by
+			      default.
+
+ discard_idle_interval	      F2FS detects the discard thread is idle, given
+			      time interval. Default is 5 secs.
+
+ gc_idle_interval	      F2FS detects the GC thread is idle, given time
+			      interval. Default is 5 secs.
+
+ umount_discard_timeout       When unmounting the disk, F2FS waits for finishing
+			      queued discard commands which can take huge time.
+			      This gives time out for it, 5 secs by default.
+
+ iostat_enable		      This controls to enable/disable iostat in F2FS.
+
+ readdir_ra		      This enables/disabled readahead of inode blocks
+			      in readdir, and default is enabled.
+
+ gc_pin_file_thresh	      This indicates how many GC can be failed for the
+			      pinned file. If it exceeds this, F2FS doesn't
+			      guarantee its pinning state. 2048 trials is set
+			      by default.
+
+ extension_list		      This enables to change extension_list for hot/cold
+			      files in runtime.
+
+ inject_rate		      This controls injection rate of arbitrary faults.
+
+ inject_type		      This controls injection type of arbitrary faults.
+
+ dirty_segments 	      This shows # of dirty segments.
+
+ lifetime_write_kbytes	      This shows # of data written to the disk.
+
+ features		      This shows current features enabled on F2FS.
+
+ current_reserved_blocks      This shows # of blocks currently reserved.
+
+ unusable                     If checkpoint=disable, this shows the number of
+                              blocks that are unusable.
+                              If checkpoint=enable it shows the number of blocks
+                              that would be unusable if checkpoint=disable were
+                              to be set.
 
 ================================================================================
 USAGE
@@ -716,3 +804,28 @@ WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
 WRITE_LIFE_NONE       "                        WRITE_LIFE_NONE
 WRITE_LIFE_MEDIUM     "                        WRITE_LIFE_MEDIUM
 WRITE_LIFE_LONG       "                        WRITE_LIFE_LONG
+
+Fallocate(2) Policy
+-------------------
+
+The default policy follows the below posix rule.
+
+Allocating disk space
+    The default operation (i.e., mode is zero) of fallocate() allocates
+    the disk space within the range specified by offset and len.  The
+    file size (as reported by stat(2)) will be changed if offset+len is
+    greater than the file size.  Any subregion within the range specified
+    by offset and len that did not contain data before the call will be
+    initialized to zero.  This default behavior closely resembles the
+    behavior of the posix_fallocate(3) library function, and is intended
+    as a method of optimally implementing that function.
+
+However, once F2FS receives ioctl(fd, F2FS_IOC_SET_PIN_FILE) in prior to
+fallocate(fd, DEFAULT_MODE), it allocates on-disk blocks addressess having
+zero or random data, which is useful to the below scenario where:
+ 1. create(fd)
+ 2. ioctl(fd, F2FS_IOC_SET_PIN_FILE)
+ 3. fallocate(fd, 0, 0, size)
+ 4. address = fibmap(fd, offset)
+ 5. open(blkdev)
+ 6. write(blkdev, address)
diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index 08c23b60e016..82efa41b0e6c 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -191,7 +191,9 @@ Currently, the following pairs of encryption modes are supported:
 If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
 
 AES-128-CBC was added only for low-powered embedded devices with
-crypto accelerators such as CAAM or CESA that do not support XTS.
+crypto accelerators such as CAAM or CESA that do not support XTS.  To
+use AES-128-CBC, CONFIG_CRYPTO_SHA256 (or another SHA-256
+implementation) must be enabled so that ESSIV can be used.
 
 Adiantum is a (primarily) stream cipher-based mode that is fast even
 on CPUs without dedicated crypto instructions.  It's also a true
@@ -647,3 +649,42 @@ Note that the precise way that filenames are presented to userspace
 without the key is subject to change in the future.  It is only meant
 as a way to temporarily present valid filenames so that commands like
 ``rm -r`` work as expected on encrypted directories.
+
+Tests
+=====
+
+To test fscrypt, use xfstests, which is Linux's de facto standard
+filesystem test suite.  First, run all the tests in the "encrypt"
+group on the relevant filesystem(s).  For example, to test ext4 and
+f2fs encryption using `kvm-xfstests
+<https://github.com/tytso/xfstests-bld/blob/master/Documentation/kvm-quickstart.md>`_::
+
+    kvm-xfstests -c ext4,f2fs -g encrypt
+
+UBIFS encryption can also be tested this way, but it should be done in
+a separate command, and it takes some time for kvm-xfstests to set up
+emulated UBI volumes::
+
+    kvm-xfstests -c ubifs -g encrypt
+
+No tests should fail.  However, tests that use non-default encryption
+modes (e.g. generic/549 and generic/550) will be skipped if the needed
+algorithms were not built into the kernel's crypto API.  Also, tests
+that access the raw block device (e.g. generic/399, generic/548,
+generic/549, generic/550) will be skipped on UBIFS.
+
+Besides running the "encrypt" group tests, for ext4 and f2fs it's also
+possible to run most xfstests with the "test_dummy_encryption" mount
+option.  This option causes all new files to be automatically
+encrypted with a dummy key, without having to make any API calls.
+This tests the encrypted I/O paths more thoroughly.  To do this with
+kvm-xfstests, use the "encrypt" filesystem configuration::
+
+    kvm-xfstests -c ext4/encrypt,f2fs/encrypt -g auto
+
+Because this runs many more tests than "-g encrypt" does, it takes
+much longer to run; so also consider using `gce-xfstests
+<https://github.com/tytso/xfstests-bld/blob/master/Documentation/gce-xfstests.md>`_
+instead of kvm-xfstests::
+
+    gce-xfstests -c ext4/encrypt,f2fs/encrypt -g auto
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 1131c34d77f6..2de2fe2ab078 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -16,7 +16,8 @@ algorithms work.
 .. toctree::
    :maxdepth: 2
 
-   path-lookup.rst
+   vfs
+   path-lookup
    api-summary
    splice
 
@@ -31,13 +32,3 @@ filesystem implementations.
 
    journalling
    fscrypt
-
-Filesystem-specific documentation
-=================================
-
-Documentation for individual filesystem types can be found here.
-
-.. toctree::
-   :maxdepth: 2
-
-   binderfs.rst
diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index d2963123eb1c..ae4332464560 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -239,7 +239,7 @@ rdinit=<executable file>
   A description of the process of mounting the root file system can be
   found in:
 
-    Documentation/early-userspace/README
+    Documentation/driver-api/early-userspace/early_userspace_support.rst
 
 
 
diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
index eef7d9d259e8..1da2f1668f08 100644
--- a/Documentation/filesystems/overlayfs.txt
+++ b/Documentation/filesystems/overlayfs.txt
@@ -336,8 +336,20 @@ the copied layers will fail the verification of the lower root file handle.
 Non-standard behavior
 ---------------------
 
-Overlayfs can now act as a POSIX compliant filesystem with the following
-features turned on:
+Current version of overlayfs can act as a mostly POSIX compliant
+filesystem.
+
+This is the list of cases that overlayfs doesn't currently handle:
+
+a) POSIX mandates updating st_atime for reads.  This is currently not
+done in the case when the file resides on a lower layer.
+
+b) If a file residing on a lower layer is opened for read-only and then
+memory mapped with MAP_SHARED, then subsequent changes to the file are not
+reflected in the memory mapping.
+
+The following options allow overlayfs to act more like a standards
+compliant filesystem:
 
 1) "redirect_dir"
 
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 3bd1148d8bb6..6b7a41cfcaed 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -330,14 +330,14 @@ unreferenced dentries, and is now only called when the dentry refcount goes to
 [mandatory]
 
 	.d_compare() calling convention and locking rules are significantly
-changed. Read updated documentation in Documentation/filesystems/vfs.txt (and
+changed. Read updated documentation in Documentation/filesystems/vfs.rst (and
 look at examples of other filesystems) for guidance.
 
 ---
 [mandatory]
 
 	.d_hash() calling convention and locking rules are significantly
-changed. Read updated documentation in Documentation/filesystems/vfs.txt (and
+changed. Read updated documentation in Documentation/filesystems/vfs.rst (and
 look at examples of other filesystems) for guidance.
 
 ---
@@ -377,12 +377,12 @@ where possible.
 the filesystem provides it), which requires dropping out of rcu-walk mode. This
 may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be
 returned if the filesystem cannot handle rcu-walk. See
-Documentation/filesystems/vfs.txt for more details.
+Documentation/filesystems/vfs.rst for more details.
 
 	permission is an inode permission check that is called on many or all
 directory inodes on the way down a path walk (to check for exec permission). It
 must now be rcu-walk aware (mask & MAY_NOT_BLOCK).  See
-Documentation/filesystems/vfs.txt for more details.
+Documentation/filesystems/vfs.rst for more details.
  
 --
 [mandatory]
@@ -428,8 +428,19 @@ release it yourself.
 --
 [mandatory]
 	d_alloc_root() is gone, along with a lot of bugs caused by code
-misusing it.  Replacement: d_make_root(inode).  The difference is,
-d_make_root() drops the reference to inode if dentry allocation fails.  
+misusing it.  Replacement: d_make_root(inode).  On success d_make_root(inode)
+allocates and returns a new dentry instantiated with the passed in inode.
+On failure NULL is returned and the passed in inode is dropped so the reference
+to inode is consumed in all cases and failure handling need not do any cleanup
+for the inode.  If d_make_root(inode) is passed a NULL inode it returns NULL
+and also requires no further error handling. Typical usage is:
+
+	inode = foofs_new_inode(....);
+	s->s_root = d_make_root(inode);
+	if (!s->s_root)
+		/* Nothing needed for the inode cleanup */
+		return -ENOMEM;
+	...
 
 --
 [mandatory]
@@ -625,7 +636,7 @@ in your dentry operations instead.
 --
 [mandatory]
 	->clone_file_range() and ->dedupe_file_range have been replaced with
-	->remap_file_range().  See Documentation/filesystems/vfs.txt for more
+	->remap_file_range().  See Documentation/filesystems/vfs.rst for more
 	information.
 --
 [recommended]
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 66cad5c86171..99ca040e3f90 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -45,6 +45,7 @@ Table of Contents
   3.9   /proc/<pid>/map_files - Information about memory mapped files
   3.10  /proc/<pid>/timerslack_ns - Task timerslack value
   3.11	/proc/<pid>/patch_state - Livepatch patch operation state
+  3.12	/proc/<pid>/arch_status - Task architecture specific information
 
   4	Configuring procfs
   4.1	Mount options
@@ -153,9 +154,11 @@ Table 1-1: Process specific entries in /proc
 		symbol the task is blocked in - or "0" if not blocked.
  pagemap	Page table
  stack		Report full stack trace, enable via CONFIG_STACKTRACE
- smaps		an extension based on maps, showing the memory consumption of
+ smaps		An extension based on maps, showing the memory consumption of
 		each mapping and flags associated with it
- numa_maps	an extension based on maps, showing the memory locality and
+ smaps_rollup	Accumulated smaps stats for all mappings of the process.  This
+		can be derived from smaps, but is faster and more convenient
+ numa_maps	An extension based on maps, showing the memory locality and
 		binding policy as well as mem usage (in pages) of each mapping.
 ..............................................................................
 
@@ -365,7 +368,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
   exit_code     the thread's exit_code in the form reported by the waitpid system call
 ..............................................................................
 
-The /proc/PID/maps file containing the currently mapped memory regions and
+The /proc/PID/maps file contains the currently mapped memory regions and
 their access permissions.
 
 The format is:
@@ -416,11 +419,14 @@ is not associated with a file:
  or if empty, the mapping is anonymous.
 
 The /proc/PID/smaps is an extension based on maps, showing the memory
-consumption for each of the process's mappings. For each of mappings there
-is a series of lines such as the following:
+consumption for each of the process's mappings. For each mapping (aka Virtual
+Memory Area, or VMA) there is a series of lines such as the following:
 
 08048000-080bc000 r-xp 00000000 03:02 13130      /bin/bash
+
 Size:               1084 kB
+KernelPageSize:        4 kB
+MMUPageSize:           4 kB
 Rss:                 892 kB
 Pss:                 374 kB
 Shared_Clean:        892 kB
@@ -442,11 +448,14 @@ Locked:                0 kB
 THPeligible:           0
 VmFlags: rd ex mr mw me dw
 
-the first of these lines shows the same information as is displayed for the
-mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
-(size), the amount of the mapping that is currently resident in RAM (RSS), the
-process' proportional share of this mapping (PSS), the number of clean and
-dirty private pages in the mapping.
+The first of these lines shows the same information as is displayed for the
+mapping in /proc/PID/maps.  Following lines show the size of the mapping
+(size); the size of each page allocated when backing a VMA (KernelPageSize),
+which is usually the same as the size in the page table entries; the page size
+used by the MMU when backing a VMA (in most cases, the same as KernelPageSize);
+the amount of the mapping that is currently resident in RAM (RSS); the
+process' proportional share of this mapping (PSS); and the number of clean and
+dirty shared and private pages in the mapping.
 
 The "proportional set size" (PSS) of a process is the count of pages it has
 in memory, where each page is divided by the number of processes sharing it.
@@ -477,8 +486,8 @@ replaced by copy-on-write) part of the underlying shmem object out on swap.
 "SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this
 does not take into account swapped out page of underlying shmem objects.
 "Locked" indicates whether the mapping is locked in memory or not.
-"THPeligible" indicates whether the mapping is eligible for THP pages - 1 if
-true, 0 otherwise.
+"THPeligible" indicates whether the mapping is eligible for allocating THP
+pages - 1 if true, 0 otherwise. It just shows the current status.
 
 "VmFlags" field deserves a separate description. This member represents the kernel
 flags associated with the particular virtual memory area in two letter encoded
@@ -531,6 +540,19 @@ guarantees:
 2) If there is something at a given vaddr during the entirety of the
    life of the smaps/maps walk, there will be some output for it.
 
+The /proc/PID/smaps_rollup file includes the same fields as /proc/PID/smaps,
+but their values are the sums of the corresponding values for all mappings of
+the process.  Additionally, it contains these fields:
+
+Pss_Anon
+Pss_File
+Pss_Shmem
+
+They represent the proportional shares of anonymous, file, and shmem pages, as
+described for smaps above.  These fields are omitted in smaps since each
+mapping identifies the type (anon, file, or shmem) of all pages it contains.
+Thus all information in smaps_rollup can be derived from smaps, but at a
+significantly higher cost.
 
 The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
 bits on both physical and virtual pages associated with a process, and the
@@ -1478,7 +1500,7 @@ review the kernel documentation in the directory /usr/src/linux/Documentation.
 This chapter  is  heavily  based  on the documentation included in the pre 2.2
 kernels, and became part of it in version 2.2.1 of the Linux kernel.
 
-Please see: Documentation/sysctl/ directory for descriptions of these
+Please see: Documentation/admin-guide/sysctl/ directory for descriptions of these
 entries.
 
 ------------------------------------------------------------------------------
@@ -1948,6 +1970,45 @@ patched.  If the patch is being enabled, then the task has already been
 patched.  If the patch is being disabled, then the task hasn't been
 unpatched yet.
 
+3.12 /proc/<pid>/arch_status - task architecture specific status
+-------------------------------------------------------------------
+When CONFIG_PROC_PID_ARCH_STATUS is enabled, this file displays the
+architecture specific status of the task.
+
+Example
+-------
+ $ cat /proc/6753/arch_status
+ AVX512_elapsed_ms:      8
+
+Description
+-----------
+
+x86 specific entries:
+---------------------
+ AVX512_elapsed_ms:
+ ------------------
+  If AVX512 is supported on the machine, this entry shows the milliseconds
+  elapsed since the last time AVX512 usage was recorded. The recording
+  happens on a best effort basis when a task is scheduled out. This means
+  that the value depends on two factors:
+
+    1) The time which the task spent on the CPU without being scheduled
+       out. With CPU isolation and a single runnable task this can take
+       several seconds.
+
+    2) The time since the task was scheduled out last. Depending on the
+       reason for being scheduled out (time slice exhausted, syscall ...)
+       this can be arbitrary long time.
+
+  As a consequence the value cannot be considered precise and authoritative
+  information. The application which uses this information has to be aware
+  of the overall scenario on the system in order to determine whether a
+  task is a real AVX512 user or not. Precise information can be obtained
+  with performance counters.
+
+  A special value of '-1' indicates that no AVX512 usage was recorded, thus
+  the task is unlikely an AVX512 user, but depends on the workload and the
+  scheduling scenario, it also could be a false negative mentioned above.
 
 ------------------------------------------------------------------------------
 Configuring procfs
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
index 79637d227e85..97d42ccaa92d 100644
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
@@ -105,7 +105,7 @@ All this differs from the old initrd in several ways:
   - The old initrd file was a gzipped filesystem image (in some file format,
     such as ext2, that needed a driver built into the kernel), while the new
     initramfs archive is a gzipped cpio archive (like tar only simpler,
-    see cpio(1) and Documentation/early-userspace/buffer-format.txt).  The
+    see cpio(1) and Documentation/driver-api/early-userspace/buffer-format.rst).  The
     kernel's cpio extraction code is not only extremely small, it's also
     __init text and data that can be discarded during the boot process.
 
@@ -159,7 +159,7 @@ One advantage of the configuration file is that root access is not required to
 set permissions or create device nodes in the new archive.  (Note that those
 two example "file" entries expect to find files named "init.sh" and "busybox" in
 a directory called "initramfs", under the linux-2.6.* directory.  See
-Documentation/early-userspace/README for more details.)
+Documentation/driver-api/early-userspace/early_userspace_support.rst for more details.)
 
 The kernel does not depend on external cpio tools.  If you specify a
 directory instead of a configuration file, the kernel's build infrastructure
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 5b5311f9358d..ddf15b1b0d5a 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -319,7 +319,7 @@ quick way to lookup the sysfs interface for a device from the result of
 a stat(2) operation.
 
 More information can driver-model specific features can be found in
-Documentation/driver-model/. 
+Documentation/driver-api/driver-model/.
 
 
 TODO: Finish this section.
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index d06e9a59a9f4..5ecbc03e6b2f 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for
 use at file creation time.  When a task allocates a file in the file
 system, the mount option memory policy will be applied with a NodeList,
 if any, modified by the calling task's cpuset constraints
-[See Documentation/cgroup-v1/cpusets.txt] and any optional flags, listed
+[See Documentation/admin-guide/cgroup-v1/cpusets.rst] and any optional flags, listed
 below.  If the resulting NodeLists is the empty set, the effective memory
 policy for the file will revert to "default" policy.
 
diff --git a/Documentation/filesystems/ubifs-authentication.md b/Documentation/filesystems/ubifs-authentication.md
index 028b3e2e25f9..23e698167141 100644
--- a/Documentation/filesystems/ubifs-authentication.md
+++ b/Documentation/filesystems/ubifs-authentication.md
@@ -417,9 +417,9 @@ will then have to be provided beforehand in the normal way.
 
 [DMC-CBC-ATTACK]     http://www.jakoblell.com/blog/2013/12/22/practical-malleability-attack-against-cbc-encrypted-luks-partitions/
 
-[DM-INTEGRITY]       https://www.kernel.org/doc/Documentation/device-mapper/dm-integrity.txt
+[DM-INTEGRITY]       https://www.kernel.org/doc/Documentation/device-mapper/dm-integrity.rst
 
-[DM-VERITY]          https://www.kernel.org/doc/Documentation/device-mapper/verity.txt
+[DM-VERITY]          https://www.kernel.org/doc/Documentation/device-mapper/verity.rst
 
 [FSCRYPT-POLICY2]    https://www.spinics.net/lists/linux-ext4/msg58710.html
 
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
new file mode 100644
index 000000000000..0f85ab21c2ca
--- /dev/null
+++ b/Documentation/filesystems/vfs.rst
@@ -0,0 +1,1428 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+Overview of the Linux Virtual File System
+=========================================
+
+Original author: Richard Gooch <rgooch@atnf.csiro.au>
+
+- Copyright (C) 1999 Richard Gooch
+- Copyright (C) 2005 Pekka Enberg
+
+
+Introduction
+============
+
+The Virtual File System (also known as the Virtual Filesystem Switch) is
+the software layer in the kernel that provides the filesystem interface
+to userspace programs.  It also provides an abstraction within the
+kernel which allows different filesystem implementations to coexist.
+
+VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so on
+are called from a process context.  Filesystem locking is described in
+the document Documentation/filesystems/Locking.
+
+
+Directory Entry Cache (dcache)
+------------------------------
+
+The VFS implements the open(2), stat(2), chmod(2), and similar system
+calls.  The pathname argument that is passed to them is used by the VFS
+to search through the directory entry cache (also known as the dentry
+cache or dcache).  This provides a very fast look-up mechanism to
+translate a pathname (filename) into a specific dentry.  Dentries live
+in RAM and are never saved to disc: they exist only for performance.
+
+The dentry cache is meant to be a view into your entire filespace.  As
+most computers cannot fit all dentries in the RAM at the same time, some
+bits of the cache are missing.  In order to resolve your pathname into a
+dentry, the VFS may have to resort to creating dentries along the way,
+and then loading the inode.  This is done by looking up the inode.
+
+
+The Inode Object
+----------------
+
+An individual dentry usually has a pointer to an inode.  Inodes are
+filesystem objects such as regular files, directories, FIFOs and other
+beasts.  They live either on the disc (for block device filesystems) or
+in the memory (for pseudo filesystems).  Inodes that live on the disc
+are copied into the memory when required and changes to the inode are
+written back to disc.  A single inode can be pointed to by multiple
+dentries (hard links, for example, do this).
+
+To look up an inode requires that the VFS calls the lookup() method of
+the parent directory inode.  This method is installed by the specific
+filesystem implementation that the inode lives in.  Once the VFS has the
+required dentry (and hence the inode), we can do all those boring things
+like open(2) the file, or stat(2) it to peek at the inode data.  The
+stat(2) operation is fairly simple: once the VFS has the dentry, it
+peeks at the inode data and passes some of it back to userspace.
+
+
+The File Object
+---------------
+
+Opening a file requires another operation: allocation of a file
+structure (this is the kernel-side implementation of file descriptors).
+The freshly allocated file structure is initialized with a pointer to
+the dentry and a set of file operation member functions.  These are
+taken from the inode data.  The open() file method is then called so the
+specific filesystem implementation can do its work.  You can see that
+this is another switch performed by the VFS.  The file structure is
+placed into the file descriptor table for the process.
+
+Reading, writing and closing files (and other assorted VFS operations)
+is done by using the userspace file descriptor to grab the appropriate
+file structure, and then calling the required file structure method to
+do whatever is required.  For as long as the file is open, it keeps the
+dentry in use, which in turn means that the VFS inode is still in use.
+
+
+Registering and Mounting a Filesystem
+=====================================
+
+To register and unregister a filesystem, use the following API
+functions:
+
+.. code-block:: c
+
+	#include <linux/fs.h>
+
+	extern int register_filesystem(struct file_system_type *);
+	extern int unregister_filesystem(struct file_system_type *);
+
+The passed struct file_system_type describes your filesystem.  When a
+request is made to mount a filesystem onto a directory in your
+namespace, the VFS will call the appropriate mount() method for the
+specific filesystem.  New vfsmount referring to the tree returned by
+->mount() will be attached to the mountpoint, so that when pathname
+resolution reaches the mountpoint it will jump into the root of that
+vfsmount.
+
+You can see all filesystems that are registered to the kernel in the
+file /proc/filesystems.
+
+
+struct file_system_type
+-----------------------
+
+This describes the filesystem.  As of kernel 2.6.39, the following
+members are defined:
+
+.. code-block:: c
+
+	struct file_system_operations {
+		const char *name;
+		int fs_flags;
+		struct dentry *(*mount) (struct file_system_type *, int,
+					 const char *, void *);
+		void (*kill_sb) (struct super_block *);
+		struct module *owner;
+		struct file_system_type * next;
+		struct list_head fs_supers;
+		struct lock_class_key s_lock_key;
+		struct lock_class_key s_umount_key;
+	};
+
+``name``
+	the name of the filesystem type, such as "ext2", "iso9660",
+	"msdos" and so on
+
+``fs_flags``
+	various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.)
+
+``mount``
+	the method to call when a new instance of this filesystem should
+	be mounted
+
+``kill_sb``
+	the method to call when an instance of this filesystem should be
+	shut down
+
+
+``owner``
+	for internal VFS use: you should initialize this to THIS_MODULE
+	in most cases.
+
+``next``
+	for internal VFS use: you should initialize this to NULL
+
+  s_lock_key, s_umount_key: lockdep-specific
+
+The mount() method has the following arguments:
+
+``struct file_system_type *fs_type``
+	describes the filesystem, partly initialized by the specific
+	filesystem code
+
+``int flags``
+	mount flags
+
+``const char *dev_name``
+	the device name we are mounting.
+
+``void *data``
+	arbitrary mount options, usually comes as an ASCII string (see
+	"Mount Options" section)
+
+The mount() method must return the root dentry of the tree requested by
+caller.  An active reference to its superblock must be grabbed and the
+superblock must be locked.  On failure it should return ERR_PTR(error).
+
+The arguments match those of mount(2) and their interpretation depends
+on filesystem type.  E.g. for block filesystems, dev_name is interpreted
+as block device name, that device is opened and if it contains a
+suitable filesystem image the method creates and initializes struct
+super_block accordingly, returning its root dentry to caller.
+
+->mount() may choose to return a subtree of existing filesystem - it
+doesn't have to create a new one.  The main result from the caller's
+point of view is a reference to dentry at the root of (sub)tree to be
+attached; creation of new superblock is a common side effect.
+
+The most interesting member of the superblock structure that the mount()
+method fills in is the "s_op" field.  This is a pointer to a "struct
+super_operations" which describes the next level of the filesystem
+implementation.
+
+Usually, a filesystem uses one of the generic mount() implementations
+and provides a fill_super() callback instead.  The generic variants are:
+
+``mount_bdev``
+	mount a filesystem residing on a block device
+
+``mount_nodev``
+	mount a filesystem that is not backed by a device
+
+``mount_single``
+	mount a filesystem which shares the instance between all mounts
+
+A fill_super() callback implementation has the following arguments:
+
+``struct super_block *sb``
+	the superblock structure.  The callback must initialize this
+	properly.
+
+``void *data``
+	arbitrary mount options, usually comes as an ASCII string (see
+	"Mount Options" section)
+
+``int silent``
+	whether or not to be silent on error
+
+
+The Superblock Object
+=====================
+
+A superblock object represents a mounted filesystem.
+
+
+struct super_operations
+-----------------------
+
+This describes how the VFS can manipulate the superblock of your
+filesystem.  As of kernel 2.6.22, the following members are defined:
+
+.. code-block:: c
+
+	struct super_operations {
+		struct inode *(*alloc_inode)(struct super_block *sb);
+		void (*destroy_inode)(struct inode *);
+
+		void (*dirty_inode) (struct inode *, int flags);
+		int (*write_inode) (struct inode *, int);
+		void (*drop_inode) (struct inode *);
+		void (*delete_inode) (struct inode *);
+		void (*put_super) (struct super_block *);
+		int (*sync_fs)(struct super_block *sb, int wait);
+		int (*freeze_fs) (struct super_block *);
+		int (*unfreeze_fs) (struct super_block *);
+		int (*statfs) (struct dentry *, struct kstatfs *);
+		int (*remount_fs) (struct super_block *, int *, char *);
+		void (*clear_inode) (struct inode *);
+		void (*umount_begin) (struct super_block *);
+
+		int (*show_options)(struct seq_file *, struct dentry *);
+
+		ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
+		ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
+		int (*nr_cached_objects)(struct super_block *);
+		void (*free_cached_objects)(struct super_block *, int);
+	};
+
+All methods are called without any locks being held, unless otherwise
+noted.  This means that most methods can block safely.  All methods are
+only called from a process context (i.e. not from an interrupt handler
+or bottom half).
+
+``alloc_inode``
+	this method is called by alloc_inode() to allocate memory for
+	struct inode and initialize it.  If this function is not
+	defined, a simple 'struct inode' is allocated.  Normally
+	alloc_inode will be used to allocate a larger structure which
+	contains a 'struct inode' embedded within it.
+
+``destroy_inode``
+	this method is called by destroy_inode() to release resources
+	allocated for struct inode.  It is only required if
+	->alloc_inode was defined and simply undoes anything done by
+	->alloc_inode.
+
+``dirty_inode``
+	this method is called by the VFS to mark an inode dirty.
+
+``write_inode``
+	this method is called when the VFS needs to write an inode to
+	disc.  The second parameter indicates whether the write should
+	be synchronous or not, not all filesystems check this flag.
+
+``drop_inode``
+	called when the last access to the inode is dropped, with the
+	inode->i_lock spinlock held.
+
+	This method should be either NULL (normal UNIX filesystem
+	semantics) or "generic_delete_inode" (for filesystems that do
+	not want to cache inodes - causing "delete_inode" to always be
+	called regardless of the value of i_nlink)
+
+	The "generic_delete_inode()" behavior is equivalent to the old
+	practice of using "force_delete" in the put_inode() case, but
+	does not have the races that the "force_delete()" approach had.
+
+``delete_inode``
+	called when the VFS wants to delete an inode
+
+``put_super``
+	called when the VFS wishes to free the superblock
+	(i.e. unmount).  This is called with the superblock lock held
+
+``sync_fs``
+	called when VFS is writing out all dirty data associated with a
+	superblock.  The second parameter indicates whether the method
+	should wait until the write out has been completed.  Optional.
+
+``freeze_fs``
+	called when VFS is locking a filesystem and forcing it into a
+	consistent state.  This method is currently used by the Logical
+	Volume Manager (LVM).
+
+``unfreeze_fs``
+	called when VFS is unlocking a filesystem and making it writable
+	again.
+
+``statfs``
+	called when the VFS needs to get filesystem statistics.
+
+``remount_fs``
+	called when the filesystem is remounted.  This is called with
+	the kernel lock held
+
+``clear_inode``
+	called then the VFS clears the inode.  Optional
+
+``umount_begin``
+	called when the VFS is unmounting a filesystem.
+
+``show_options``
+	called by the VFS to show mount options for /proc/<pid>/mounts.
+	(see "Mount Options" section)
+
+``quota_read``
+	called by the VFS to read from filesystem quota file.
+
+``quota_write``
+	called by the VFS to write to filesystem quota file.
+
+``nr_cached_objects``
+	called by the sb cache shrinking function for the filesystem to
+	return the number of freeable cached objects it contains.
+	Optional.
+
+``free_cache_objects``
+	called by the sb cache shrinking function for the filesystem to
+	scan the number of objects indicated to try to free them.
+	Optional, but any filesystem implementing this method needs to
+	also implement ->nr_cached_objects for it to be called
+	correctly.
+
+	We can't do anything with any errors that the filesystem might
+	encountered, hence the void return type.  This will never be
+	called if the VM is trying to reclaim under GFP_NOFS conditions,
+	hence this method does not need to handle that situation itself.
+
+	Implementations must include conditional reschedule calls inside
+	any scanning loop that is done.  This allows the VFS to
+	determine appropriate scan batch sizes without having to worry
+	about whether implementations will cause holdoff problems due to
+	large scan batch sizes.
+
+Whoever sets up the inode is responsible for filling in the "i_op"
+field.  This is a pointer to a "struct inode_operations" which describes
+the methods that can be performed on individual inodes.
+
+
+struct xattr_handlers
+---------------------
+
+On filesystems that support extended attributes (xattrs), the s_xattr
+superblock field points to a NULL-terminated array of xattr handlers.
+Extended attributes are name:value pairs.
+
+``name``
+	Indicates that the handler matches attributes with the specified
+	name (such as "system.posix_acl_access"); the prefix field must
+	be NULL.
+
+``prefix``
+	Indicates that the handler matches all attributes with the
+	specified name prefix (such as "user."); the name field must be
+	NULL.
+
+``list``
+	Determine if attributes matching this xattr handler should be
+	listed for a particular dentry.  Used by some listxattr
+	implementations like generic_listxattr.
+
+``get``
+	Called by the VFS to get the value of a particular extended
+	attribute.  This method is called by the getxattr(2) system
+	call.
+
+``set``
+	Called by the VFS to set the value of a particular extended
+	attribute.  When the new value is NULL, called to remove a
+	particular extended attribute.  This method is called by the the
+	setxattr(2) and removexattr(2) system calls.
+
+When none of the xattr handlers of a filesystem match the specified
+attribute name or when a filesystem doesn't support extended attributes,
+the various ``*xattr(2)`` system calls return -EOPNOTSUPP.
+
+
+The Inode Object
+================
+
+An inode object represents an object within the filesystem.
+
+
+struct inode_operations
+-----------------------
+
+This describes how the VFS can manipulate an inode in your filesystem.
+As of kernel 2.6.22, the following members are defined:
+
+.. code-block:: c
+
+	struct inode_operations {
+		int (*create) (struct inode *,struct dentry *, umode_t, bool);
+		struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
+		int (*link) (struct dentry *,struct inode *,struct dentry *);
+		int (*unlink) (struct inode *,struct dentry *);
+		int (*symlink) (struct inode *,struct dentry *,const char *);
+		int (*mkdir) (struct inode *,struct dentry *,umode_t);
+		int (*rmdir) (struct inode *,struct dentry *);
+		int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
+		int (*rename) (struct inode *, struct dentry *,
+			       struct inode *, struct dentry *, unsigned int);
+		int (*readlink) (struct dentry *, char __user *,int);
+		const char *(*get_link) (struct dentry *, struct inode *,
+					 struct delayed_call *);
+		int (*permission) (struct inode *, int);
+		int (*get_acl)(struct inode *, int);
+		int (*setattr) (struct dentry *, struct iattr *);
+		int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
+		ssize_t (*listxattr) (struct dentry *, char *, size_t);
+		void (*update_time)(struct inode *, struct timespec *, int);
+		int (*atomic_open)(struct inode *, struct dentry *, struct file *,
+				   unsigned open_flag, umode_t create_mode);
+		int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+	};
+
+Again, all methods are called without any locks being held, unless
+otherwise noted.
+
+``create``
+	called by the open(2) and creat(2) system calls.  Only required
+	if you want to support regular files.  The dentry you get should
+	not have an inode (i.e. it should be a negative dentry).  Here
+	you will probably call d_instantiate() with the dentry and the
+	newly created inode
+
+``lookup``
+	called when the VFS needs to look up an inode in a parent
+	directory.  The name to look for is found in the dentry.  This
+	method must call d_add() to insert the found inode into the
+	dentry.  The "i_count" field in the inode structure should be
+	incremented.  If the named inode does not exist a NULL inode
+	should be inserted into the dentry (this is called a negative
+	dentry).  Returning an error code from this routine must only be
+	done on a real error, otherwise creating inodes with system
+	calls like create(2), mknod(2), mkdir(2) and so on will fail.
+	If you wish to overload the dentry methods then you should
+	initialise the "d_dop" field in the dentry; this is a pointer to
+	a struct "dentry_operations".  This method is called with the
+	directory inode semaphore held
+
+``link``
+	called by the link(2) system call.  Only required if you want to
+	support hard links.  You will probably need to call
+	d_instantiate() just as you would in the create() method
+
+``unlink``
+	called by the unlink(2) system call.  Only required if you want
+	to support deleting inodes
+
+``symlink``
+	called by the symlink(2) system call.  Only required if you want
+	to support symlinks.  You will probably need to call
+	d_instantiate() just as you would in the create() method
+
+``mkdir``
+	called by the mkdir(2) system call.  Only required if you want
+	to support creating subdirectories.  You will probably need to
+	call d_instantiate() just as you would in the create() method
+
+``rmdir``
+	called by the rmdir(2) system call.  Only required if you want
+	to support deleting subdirectories
+
+``mknod``
+	called by the mknod(2) system call to create a device (char,
+	block) inode or a named pipe (FIFO) or socket.  Only required if
+	you want to support creating these types of inodes.  You will
+	probably need to call d_instantiate() just as you would in the
+	create() method
+
+``rename``
+	called by the rename(2) system call to rename the object to have
+	the parent and name given by the second inode and dentry.
+
+	The filesystem must return -EINVAL for any unsupported or
+	unknown flags.  Currently the following flags are implemented:
+	(1) RENAME_NOREPLACE: this flag indicates that if the target of
+	the rename exists the rename should fail with -EEXIST instead of
+	replacing the target.  The VFS already checks for existence, so
+	for local filesystems the RENAME_NOREPLACE implementation is
+	equivalent to plain rename.
+	(2) RENAME_EXCHANGE: exchange source and target.  Both must
+	exist; this is checked by the VFS.  Unlike plain rename, source
+	and target may be of different type.
+
+``get_link``
+	called by the VFS to follow a symbolic link to the inode it
+	points to.  Only required if you want to support symbolic links.
+	This method returns the symlink body to traverse (and possibly
+	resets the current position with nd_jump_link()).  If the body
+	won't go away until the inode is gone, nothing else is needed;
+	if it needs to be otherwise pinned, arrange for its release by
+	having get_link(..., ..., done) do set_delayed_call(done,
+	destructor, argument).  In that case destructor(argument) will
+	be called once VFS is done with the body you've returned.  May
+	be called in RCU mode; that is indicated by NULL dentry
+	argument.  If request can't be handled without leaving RCU mode,
+	have it return ERR_PTR(-ECHILD).
+
+	If the filesystem stores the symlink target in ->i_link, the
+	VFS may use it directly without calling ->get_link(); however,
+	->get_link() must still be provided.  ->i_link must not be
+	freed until after an RCU grace period.  Writing to ->i_link
+	post-iget() time requires a 'release' memory barrier.
+
+``readlink``
+	this is now just an override for use by readlink(2) for the
+	cases when ->get_link uses nd_jump_link() or object is not in
+	fact a symlink.  Normally filesystems should only implement
+	->get_link for symlinks and readlink(2) will automatically use
+	that.
+
+``permission``
+	called by the VFS to check for access rights on a POSIX-like
+	filesystem.
+
+	May be called in rcu-walk mode (mask & MAY_NOT_BLOCK).  If in
+	rcu-walk mode, the filesystem must check the permission without
+	blocking or storing to the inode.
+
+	If a situation is encountered that rcu-walk cannot handle,
+	return
+	-ECHILD and it will be called again in ref-walk mode.
+
+``setattr``
+	called by the VFS to set attributes for a file.  This method is
+	called by chmod(2) and related system calls.
+
+``getattr``
+	called by the VFS to get attributes of a file.  This method is
+	called by stat(2) and related system calls.
+
+``listxattr``
+	called by the VFS to list all extended attributes for a given
+	file.  This method is called by the listxattr(2) system call.
+
+``update_time``
+	called by the VFS to update a specific time or the i_version of
+	an inode.  If this is not defined the VFS will update the inode
+	itself and call mark_inode_dirty_sync.
+
+``atomic_open``
+	called on the last component of an open.  Using this optional
+	method the filesystem can look up, possibly create and open the
+	file in one atomic operation.  If it wants to leave actual
+	opening to the caller (e.g. if the file turned out to be a
+	symlink, device, or just something filesystem won't do atomic
+	open for), it may signal this by returning finish_no_open(file,
+	dentry).  This method is only called if the last component is
+	negative or needs lookup.  Cached positive dentries are still
+	handled by f_op->open().  If the file was created, FMODE_CREATED
+	flag should be set in file->f_mode.  In case of O_EXCL the
+	method must only succeed if the file didn't exist and hence
+	FMODE_CREATED shall always be set on success.
+
+``tmpfile``
+	called in the end of O_TMPFILE open().  Optional, equivalent to
+	atomically creating, opening and unlinking a file in given
+	directory.
+
+
+The Address Space Object
+========================
+
+The address space object is used to group and manage pages in the page
+cache.  It can be used to keep track of the pages in a file (or anything
+else) and also track the mapping of sections of the file into process
+address spaces.
+
+There are a number of distinct yet related services that an
+address-space can provide.  These include communicating memory pressure,
+page lookup by address, and keeping track of pages tagged as Dirty or
+Writeback.
+
+The first can be used independently to the others.  The VM can try to
+either write dirty pages in order to clean them, or release clean pages
+in order to reuse them.  To do this it can call the ->writepage method
+on dirty pages, and ->releasepage on clean pages with PagePrivate set.
+Clean pages without PagePrivate and with no external references will be
+released without notice being given to the address_space.
+
+To achieve this functionality, pages need to be placed on an LRU with
+lru_cache_add and mark_page_active needs to be called whenever the page
+is used.
+
+Pages are normally kept in a radix tree index by ->index.  This tree
+maintains information about the PG_Dirty and PG_Writeback status of each
+page, so that pages with either of these flags can be found quickly.
+
+The Dirty tag is primarily used by mpage_writepages - the default
+->writepages method.  It uses the tag to find dirty pages to call
+->writepage on.  If mpage_writepages is not used (i.e. the address
+provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is almost
+unused.  write_inode_now and sync_inode do use it (through
+__sync_single_inode) to check if ->writepages has been successful in
+writing out the whole address_space.
+
+The Writeback tag is used by filemap*wait* and sync_page* functions, via
+filemap_fdatawait_range, to wait for all writeback to complete.
+
+An address_space handler may attach extra information to a page,
+typically using the 'private' field in the 'struct page'.  If such
+information is attached, the PG_Private flag should be set.  This will
+cause various VM routines to make extra calls into the address_space
+handler to deal with that data.
+
+An address space acts as an intermediate between storage and
+application.  Data is read into the address space a whole page at a
+time, and provided to the application either by copying of the page, or
+by memory-mapping the page.  Data is written into the address space by
+the application, and then written-back to storage typically in whole
+pages, however the address_space has finer control of write sizes.
+
+The read process essentially only requires 'readpage'.  The write
+process is more complicated and uses write_begin/write_end or
+set_page_dirty to write data into the address_space, and writepage and
+writepages to writeback data to storage.
+
+Adding and removing pages to/from an address_space is protected by the
+inode's i_mutex.
+
+When data is written to a page, the PG_Dirty flag should be set.  It
+typically remains set until writepage asks for it to be written.  This
+should clear PG_Dirty and set PG_Writeback.  It can be actually written
+at any point after PG_Dirty is clear.  Once it is known to be safe,
+PG_Writeback is cleared.
+
+Writeback makes use of a writeback_control structure to direct the
+operations.  This gives the the writepage and writepages operations some
+information about the nature of and reason for the writeback request,
+and the constraints under which it is being done.  It is also used to
+return information back to the caller about the result of a writepage or
+writepages request.
+
+
+Handling errors during writeback
+--------------------------------
+
+Most applications that do buffered I/O will periodically call a file
+synchronization call (fsync, fdatasync, msync or sync_file_range) to
+ensure that data written has made it to the backing store.  When there
+is an error during writeback, they expect that error to be reported when
+a file sync request is made.  After an error has been reported on one
+request, subsequent requests on the same file descriptor should return
+0, unless further writeback errors have occurred since the previous file
+syncronization.
+
+Ideally, the kernel would report errors only on file descriptions on
+which writes were done that subsequently failed to be written back.  The
+generic pagecache infrastructure does not track the file descriptions
+that have dirtied each individual page however, so determining which
+file descriptors should get back an error is not possible.
+
+Instead, the generic writeback error tracking infrastructure in the
+kernel settles for reporting errors to fsync on all file descriptions
+that were open at the time that the error occurred.  In a situation with
+multiple writers, all of them will get back an error on a subsequent
+fsync, even if all of the writes done through that particular file
+descriptor succeeded (or even if there were no writes on that file
+descriptor at all).
+
+Filesystems that wish to use this infrastructure should call
+mapping_set_error to record the error in the address_space when it
+occurs.  Then, after writing back data from the pagecache in their
+file->fsync operation, they should call file_check_and_advance_wb_err to
+ensure that the struct file's error cursor has advanced to the correct
+point in the stream of errors emitted by the backing device(s).
+
+
+struct address_space_operations
+-------------------------------
+
+This describes how the VFS can manipulate mapping of a file to page
+cache in your filesystem.  The following members are defined:
+
+.. code-block:: c
+
+	struct address_space_operations {
+		int (*writepage)(struct page *page, struct writeback_control *wbc);
+		int (*readpage)(struct file *, struct page *);
+		int (*writepages)(struct address_space *, struct writeback_control *);
+		int (*set_page_dirty)(struct page *page);
+		int (*readpages)(struct file *filp, struct address_space *mapping,
+				 struct list_head *pages, unsigned nr_pages);
+		int (*write_begin)(struct file *, struct address_space *mapping,
+				   loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata);
+		int (*write_end)(struct file *, struct address_space *mapping,
+				 loff_t pos, unsigned len, unsigned copied,
+				 struct page *page, void *fsdata);
+		sector_t (*bmap)(struct address_space *, sector_t);
+		void (*invalidatepage) (struct page *, unsigned int, unsigned int);
+		int (*releasepage) (struct page *, int);
+		void (*freepage)(struct page *);
+		ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
+		/* isolate a page for migration */
+		bool (*isolate_page) (struct page *, isolate_mode_t);
+		/* migrate the contents of a page to the specified target */
+		int (*migratepage) (struct page *, struct page *);
+		/* put migration-failed page back to right list */
+		void (*putback_page) (struct page *);
+		int (*launder_page) (struct page *);
+
+		int (*is_partially_uptodate) (struct page *, unsigned long,
+					      unsigned long);
+		void (*is_dirty_writeback) (struct page *, bool *, bool *);
+		int (*error_remove_page) (struct mapping *mapping, struct page *page);
+		int (*swap_activate)(struct file *);
+		int (*swap_deactivate)(struct file *);
+	};
+
+``writepage``
+	called by the VM to write a dirty page to backing store.  This
+	may happen for data integrity reasons (i.e. 'sync'), or to free
+	up memory (flush).  The difference can be seen in
+	wbc->sync_mode.  The PG_Dirty flag has been cleared and
+	PageLocked is true.  writepage should start writeout, should set
+	PG_Writeback, and should make sure the page is unlocked, either
+	synchronously or asynchronously when the write operation
+	completes.
+
+	If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to
+	try too hard if there are problems, and may choose to write out
+	other pages from the mapping if that is easier (e.g. due to
+	internal dependencies).  If it chooses not to start writeout, it
+	should return AOP_WRITEPAGE_ACTIVATE so that the VM will not
+	keep calling ->writepage on that page.
+
+	See the file "Locking" for more details.
+
+``readpage``
+	called by the VM to read a page from backing store.  The page
+	will be Locked when readpage is called, and should be unlocked
+	and marked uptodate once the read completes.  If ->readpage
+	discovers that it needs to unlock the page for some reason, it
+	can do so, and then return AOP_TRUNCATED_PAGE.  In this case,
+	the page will be relocated, relocked and if that all succeeds,
+	->readpage will be called again.
+
+``writepages``
+	called by the VM to write out pages associated with the
+	address_space object.  If wbc->sync_mode is WBC_SYNC_ALL, then
+	the writeback_control will specify a range of pages that must be
+	written out.  If it is WBC_SYNC_NONE, then a nr_to_write is
+	given and that many pages should be written if possible.  If no
+	->writepages is given, then mpage_writepages is used instead.
+	This will choose pages from the address space that are tagged as
+	DIRTY and will pass them to ->writepage.
+
+``set_page_dirty``
+	called by the VM to set a page dirty.  This is particularly
+	needed if an address space attaches private data to a page, and
+	that data needs to be updated when a page is dirtied.  This is
+	called, for example, when a memory mapped page gets modified.
+	If defined, it should set the PageDirty flag, and the
+	PAGECACHE_TAG_DIRTY tag in the radix tree.
+
+``readpages``
+	called by the VM to read pages associated with the address_space
+	object.  This is essentially just a vector version of readpage.
+	Instead of just one page, several pages are requested.
+	readpages is only used for read-ahead, so read errors are
+	ignored.  If anything goes wrong, feel free to give up.
+
+``write_begin``
+	Called by the generic buffered write code to ask the filesystem
+	to prepare to write len bytes at the given offset in the file.
+	The address_space should check that the write will be able to
+	complete, by allocating space if necessary and doing any other
+	internal housekeeping.  If the write will update parts of any
+	basic-blocks on storage, then those blocks should be pre-read
+	(if they haven't been read already) so that the updated blocks
+	can be written out properly.
+
+	The filesystem must return the locked pagecache page for the
+	specified offset, in ``*pagep``, for the caller to write into.
+
+	It must be able to cope with short writes (where the length
+	passed to write_begin is greater than the number of bytes copied
+	into the page).
+
+	flags is a field for AOP_FLAG_xxx flags, described in
+	include/linux/fs.h.
+
+	A void * may be returned in fsdata, which then gets passed into
+	write_end.
+
+	Returns 0 on success; < 0 on failure (which is the error code),
+	in which case write_end is not called.
+
+``write_end``
+	After a successful write_begin, and data copy, write_end must be
+	called.  len is the original len passed to write_begin, and
+	copied is the amount that was able to be copied.
+
+	The filesystem must take care of unlocking the page and
+	releasing it refcount, and updating i_size.
+
+	Returns < 0 on failure, otherwise the number of bytes (<=
+	'copied') that were able to be copied into pagecache.
+
+``bmap``
+	called by the VFS to map a logical block offset within object to
+	physical block number.  This method is used by the FIBMAP ioctl
+	and for working with swap-files.  To be able to swap to a file,
+	the file must have a stable mapping to a block device.  The swap
+	system does not go through the filesystem but instead uses bmap
+	to find out where the blocks in the file are and uses those
+	addresses directly.
+
+``invalidatepage``
+	If a page has PagePrivate set, then invalidatepage will be
+	called when part or all of the page is to be removed from the
+	address space.  This generally corresponds to either a
+	truncation, punch hole or a complete invalidation of the address
+	space (in the latter case 'offset' will always be 0 and 'length'
+	will be PAGE_SIZE).  Any private data associated with the page
+	should be updated to reflect this truncation.  If offset is 0
+	and length is PAGE_SIZE, then the private data should be
+	released, because the page must be able to be completely
+	discarded.  This may be done by calling the ->releasepage
+	function, but in this case the release MUST succeed.
+
+``releasepage``
+	releasepage is called on PagePrivate pages to indicate that the
+	page should be freed if possible.  ->releasepage should remove
+	any private data from the page and clear the PagePrivate flag.
+	If releasepage() fails for some reason, it must indicate failure
+	with a 0 return value.  releasepage() is used in two distinct
+	though related cases.  The first is when the VM finds a clean
+	page with no active users and wants to make it a free page.  If
+	->releasepage succeeds, the page will be removed from the
+	address_space and become free.
+
+	The second case is when a request has been made to invalidate
+	some or all pages in an address_space.  This can happen through
+	the fadvise(POSIX_FADV_DONTNEED) system call or by the
+	filesystem explicitly requesting it as nfs and 9fs do (when they
+	believe the cache may be out of date with storage) by calling
+	invalidate_inode_pages2().  If the filesystem makes such a call,
+	and needs to be certain that all pages are invalidated, then its
+	releasepage will need to ensure this.  Possibly it can clear the
+	PageUptodate bit if it cannot free private data yet.
+
+``freepage``
+	freepage is called once the page is no longer visible in the
+	page cache in order to allow the cleanup of any private data.
+	Since it may be called by the memory reclaimer, it should not
+	assume that the original address_space mapping still exists, and
+	it should not block.
+
+``direct_IO``
+	called by the generic read/write routines to perform direct_IO -
+	that is IO requests which bypass the page cache and transfer
+	data directly between the storage and the application's address
+	space.
+
+``isolate_page``
+	Called by the VM when isolating a movable non-lru page.  If page
+	is successfully isolated, VM marks the page as PG_isolated via
+	__SetPageIsolated.
+
+``migrate_page``
+	This is used to compact the physical memory usage.  If the VM
+	wants to relocate a page (maybe off a memory card that is
+	signalling imminent failure) it will pass a new page and an old
+	page to this function.  migrate_page should transfer any private
+	data across and update any references that it has to the page.
+
+``putback_page``
+	Called by the VM when isolated page's migration fails.
+
+``launder_page``
+	Called before freeing a page - it writes back the dirty page.
+	To prevent redirtying the page, it is kept locked during the
+	whole operation.
+
+``is_partially_uptodate``
+	Called by the VM when reading a file through the pagecache when
+	the underlying blocksize != pagesize.  If the required block is
+	up to date then the read can complete without needing the IO to
+	bring the whole page up to date.
+
+``is_dirty_writeback``
+	Called by the VM when attempting to reclaim a page.  The VM uses
+	dirty and writeback information to determine if it needs to
+	stall to allow flushers a chance to complete some IO.
+	Ordinarily it can use PageDirty and PageWriteback but some
+	filesystems have more complex state (unstable pages in NFS
+	prevent reclaim) or do not set those flags due to locking
+	problems.  This callback allows a filesystem to indicate to the
+	VM if a page should be treated as dirty or writeback for the
+	purposes of stalling.
+
+``error_remove_page``
+	normally set to generic_error_remove_page if truncation is ok
+	for this address space.  Used for memory failure handling.
+	Setting this implies you deal with pages going away under you,
+	unless you have them locked or reference counts increased.
+
+``swap_activate``
+	Called when swapon is used on a file to allocate space if
+	necessary and pin the block lookup information in memory.  A
+	return value of zero indicates success, in which case this file
+	can be used to back swapspace.
+
+``swap_deactivate``
+	Called during swapoff on files where swap_activate was
+	successful.
+
+
+The File Object
+===============
+
+A file object represents a file opened by a process.  This is also known
+as an "open file description" in POSIX parlance.
+
+
+struct file_operations
+----------------------
+
+This describes how the VFS can manipulate an open file.  As of kernel
+4.18, the following members are defined:
+
+.. code-block:: c
+
+	struct file_operations {
+		struct module *owner;
+		loff_t (*llseek) (struct file *, loff_t, int);
+		ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
+		ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
+		ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
+		ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+		int (*iopoll)(struct kiocb *kiocb, bool spin);
+		int (*iterate) (struct file *, struct dir_context *);
+		int (*iterate_shared) (struct file *, struct dir_context *);
+		__poll_t (*poll) (struct file *, struct poll_table_struct *);
+		long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
+		long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
+		int (*mmap) (struct file *, struct vm_area_struct *);
+		int (*open) (struct inode *, struct file *);
+		int (*flush) (struct file *, fl_owner_t id);
+		int (*release) (struct inode *, struct file *);
+		int (*fsync) (struct file *, loff_t, loff_t, int datasync);
+		int (*fasync) (int, struct file *, int);
+		int (*lock) (struct file *, int, struct file_lock *);
+		ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
+		unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+		int (*check_flags)(int);
+		int (*flock) (struct file *, int, struct file_lock *);
+		ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
+		ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
+		int (*setlease)(struct file *, long, struct file_lock **, void **);
+		long (*fallocate)(struct file *file, int mode, loff_t offset,
+				  loff_t len);
+		void (*show_fdinfo)(struct seq_file *m, struct file *f);
+	#ifndef CONFIG_MMU
+		unsigned (*mmap_capabilities)(struct file *);
+	#endif
+		ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
+		loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
+					   struct file *file_out, loff_t pos_out,
+					   loff_t len, unsigned int remap_flags);
+		int (*fadvise)(struct file *, loff_t, loff_t, int);
+	};
+
+Again, all methods are called without any locks being held, unless
+otherwise noted.
+
+``llseek``
+	called when the VFS needs to move the file position index
+
+``read``
+	called by read(2) and related system calls
+
+``read_iter``
+	possibly asynchronous read with iov_iter as destination
+
+``write``
+	called by write(2) and related system calls
+
+``write_iter``
+	possibly asynchronous write with iov_iter as source
+
+``iopoll``
+	called when aio wants to poll for completions on HIPRI iocbs
+
+``iterate``
+	called when the VFS needs to read the directory contents
+
+``iterate_shared``
+	called when the VFS needs to read the directory contents when
+	filesystem supports concurrent dir iterators
+
+``poll``
+	called by the VFS when a process wants to check if there is
+	activity on this file and (optionally) go to sleep until there
+	is activity.  Called by the select(2) and poll(2) system calls
+
+``unlocked_ioctl``
+	called by the ioctl(2) system call.
+
+``compat_ioctl``
+	called by the ioctl(2) system call when 32 bit system calls are
+	 used on 64 bit kernels.
+
+``mmap``
+	called by the mmap(2) system call
+
+``open``
+	called by the VFS when an inode should be opened.  When the VFS
+	opens a file, it creates a new "struct file".  It then calls the
+	open method for the newly allocated file structure.  You might
+	think that the open method really belongs in "struct
+	inode_operations", and you may be right.  I think it's done the
+	way it is because it makes filesystems simpler to implement.
+	The open() method is a good place to initialize the
+	"private_data" member in the file structure if you want to point
+	to a device structure
+
+``flush``
+	called by the close(2) system call to flush a file
+
+``release``
+	called when the last reference to an open file is closed
+
+``fsync``
+	called by the fsync(2) system call.  Also see the section above
+	entitled "Handling errors during writeback".
+
+``fasync``
+	called by the fcntl(2) system call when asynchronous
+	(non-blocking) mode is enabled for a file
+
+``lock``
+	called by the fcntl(2) system call for F_GETLK, F_SETLK, and
+	F_SETLKW commands
+
+``get_unmapped_area``
+	called by the mmap(2) system call
+
+``check_flags``
+	called by the fcntl(2) system call for F_SETFL command
+
+``flock``
+	called by the flock(2) system call
+
+``splice_write``
+	called by the VFS to splice data from a pipe to a file.  This
+	method is used by the splice(2) system call
+
+``splice_read``
+	called by the VFS to splice data from file to a pipe.  This
+	method is used by the splice(2) system call
+
+``setlease``
+	called by the VFS to set or release a file lock lease.  setlease
+	implementations should call generic_setlease to record or remove
+	the lease in the inode after setting it.
+
+``fallocate``
+	called by the VFS to preallocate blocks or punch a hole.
+
+``copy_file_range``
+	called by the copy_file_range(2) system call.
+
+``remap_file_range``
+	called by the ioctl(2) system call for FICLONERANGE and FICLONE
+	and FIDEDUPERANGE commands to remap file ranges.  An
+	implementation should remap len bytes at pos_in of the source
+	file into the dest file at pos_out.  Implementations must handle
+	callers passing in len == 0; this means "remap to the end of the
+	source file".  The return value should the number of bytes
+	remapped, or the usual negative error code if errors occurred
+	before any bytes were remapped.  The remap_flags parameter
+	accepts REMAP_FILE_* flags.  If REMAP_FILE_DEDUP is set then the
+	implementation must only remap if the requested file ranges have
+	identical contents.  If REMAP_CAN_SHORTEN is set, the caller is
+	ok with the implementation shortening the request length to
+	satisfy alignment or EOF requirements (or any other reason).
+
+``fadvise``
+	possibly called by the fadvise64() system call.
+
+Note that the file operations are implemented by the specific
+filesystem in which the inode resides.  When opening a device node
+(character or block special) most filesystems will call special
+support routines in the VFS which will locate the required device
+driver information.  These support routines replace the filesystem file
+operations with those for the device driver, and then proceed to call
+the new open() method for the file.  This is how opening a device file
+in the filesystem eventually ends up calling the device driver open()
+method.
+
+
+Directory Entry Cache (dcache)
+==============================
+
+
+struct dentry_operations
+------------------------
+
+This describes how a filesystem can overload the standard dentry
+operations.  Dentries and the dcache are the domain of the VFS and the
+individual filesystem implementations.  Device drivers have no business
+here.  These methods may be set to NULL, as they are either optional or
+the VFS uses a default.  As of kernel 2.6.22, the following members are
+defined:
+
+.. code-block:: c
+
+	struct dentry_operations {
+		int (*d_revalidate)(struct dentry *, unsigned int);
+		int (*d_weak_revalidate)(struct dentry *, unsigned int);
+		int (*d_hash)(const struct dentry *, struct qstr *);
+		int (*d_compare)(const struct dentry *,
+				 unsigned int, const char *, const struct qstr *);
+		int (*d_delete)(const struct dentry *);
+		int (*d_init)(struct dentry *);
+		void (*d_release)(struct dentry *);
+		void (*d_iput)(struct dentry *, struct inode *);
+		char *(*d_dname)(struct dentry *, char *, int);
+		struct vfsmount *(*d_automount)(struct path *);
+		int (*d_manage)(const struct path *, bool);
+		struct dentry *(*d_real)(struct dentry *, const struct inode *);
+	};
+
+``d_revalidate``
+	called when the VFS needs to revalidate a dentry.  This is
+	called whenever a name look-up finds a dentry in the dcache.
+	Most local filesystems leave this as NULL, because all their
+	dentries in the dcache are valid.  Network filesystems are
+	different since things can change on the server without the
+	client necessarily being aware of it.
+
+	This function should return a positive value if the dentry is
+	still valid, and zero or a negative error code if it isn't.
+
+	d_revalidate may be called in rcu-walk mode (flags &
+	LOOKUP_RCU).  If in rcu-walk mode, the filesystem must
+	revalidate the dentry without blocking or storing to the dentry,
+	d_parent and d_inode should not be used without care (because
+	they can change and, in d_inode case, even become NULL under
+	us).
+
+	If a situation is encountered that rcu-walk cannot handle,
+	return
+	-ECHILD and it will be called again in ref-walk mode.
+
+``_weak_revalidate``
+	called when the VFS needs to revalidate a "jumped" dentry.  This
+	is called when a path-walk ends at dentry that was not acquired
+	by doing a lookup in the parent directory.  This includes "/",
+	"." and "..", as well as procfs-style symlinks and mountpoint
+	traversal.
+
+	In this case, we are less concerned with whether the dentry is
+	still fully correct, but rather that the inode is still valid.
+	As with d_revalidate, most local filesystems will set this to
+	NULL since their dcache entries are always valid.
+
+	This function has the same return code semantics as
+	d_revalidate.
+
+	d_weak_revalidate is only called after leaving rcu-walk mode.
+
+``d_hash``
+	called when the VFS adds a dentry to the hash table.  The first
+	dentry passed to d_hash is the parent directory that the name is
+	to be hashed into.
+
+	Same locking and synchronisation rules as d_compare regarding
+	what is safe to dereference etc.
+
+``d_compare``
+	called to compare a dentry name with a given name.  The first
+	dentry is the parent of the dentry to be compared, the second is
+	the child dentry.  len and name string are properties of the
+	dentry to be compared.  qstr is the name to compare it with.
+
+	Must be constant and idempotent, and should not take locks if
+	possible, and should not or store into the dentry.  Should not
+	dereference pointers outside the dentry without lots of care
+	(eg.  d_parent, d_inode, d_name should not be used).
+
+	However, our vfsmount is pinned, and RCU held, so the dentries
+	and inodes won't disappear, neither will our sb or filesystem
+	module.  ->d_sb may be used.
+
+	It is a tricky calling convention because it needs to be called
+	under "rcu-walk", ie. without any locks or references on things.
+
+``d_delete``
+	called when the last reference to a dentry is dropped and the
+	dcache is deciding whether or not to cache it.  Return 1 to
+	delete immediately, or 0 to cache the dentry.  Default is NULL
+	which means to always cache a reachable dentry.  d_delete must
+	be constant and idempotent.
+
+``d_init``
+	called when a dentry is allocated
+
+``d_release``
+	called when a dentry is really deallocated
+
+``d_iput``
+	called when a dentry loses its inode (just prior to its being
+	deallocated).  The default when this is NULL is that the VFS
+	calls iput().  If you define this method, you must call iput()
+	yourself
+
+``d_dname``
+	called when the pathname of a dentry should be generated.
+	Useful for some pseudo filesystems (sockfs, pipefs, ...) to
+	delay pathname generation.  (Instead of doing it when dentry is
+	created, it's done only when the path is needed.).  Real
+	filesystems probably dont want to use it, because their dentries
+	are present in global dcache hash, so their hash should be an
+	invariant.  As no lock is held, d_dname() should not try to
+	modify the dentry itself, unless appropriate SMP safety is used.
+	CAUTION : d_path() logic is quite tricky.  The correct way to
+	return for example "Hello" is to put it at the end of the
+	buffer, and returns a pointer to the first char.
+	dynamic_dname() helper function is provided to take care of
+	this.
+
+	Example :
+
+.. code-block:: c
+
+	static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen)
+	{
+		return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
+				dentry->d_inode->i_ino);
+	}
+
+``d_automount``
+	called when an automount dentry is to be traversed (optional).
+	This should create a new VFS mount record and return the record
+	to the caller.  The caller is supplied with a path parameter
+	giving the automount directory to describe the automount target
+	and the parent VFS mount record to provide inheritable mount
+	parameters.  NULL should be returned if someone else managed to
+	make the automount first.  If the vfsmount creation failed, then
+	an error code should be returned.  If -EISDIR is returned, then
+	the directory will be treated as an ordinary directory and
+	returned to pathwalk to continue walking.
+
+	If a vfsmount is returned, the caller will attempt to mount it
+	on the mountpoint and will remove the vfsmount from its
+	expiration list in the case of failure.  The vfsmount should be
+	returned with 2 refs on it to prevent automatic expiration - the
+	caller will clean up the additional ref.
+
+	This function is only used if DCACHE_NEED_AUTOMOUNT is set on
+	the dentry.  This is set by __d_instantiate() if S_AUTOMOUNT is
+	set on the inode being added.
+
+``d_manage``
+	called to allow the filesystem to manage the transition from a
+	dentry (optional).  This allows autofs, for example, to hold up
+	clients waiting to explore behind a 'mountpoint' while letting
+	the daemon go past and construct the subtree there.  0 should be
+	returned to let the calling process continue.  -EISDIR can be
+	returned to tell pathwalk to use this directory as an ordinary
+	directory and to ignore anything mounted on it and not to check
+	the automount flag.  Any other error code will abort pathwalk
+	completely.
+
+	If the 'rcu_walk' parameter is true, then the caller is doing a
+	pathwalk in RCU-walk mode.  Sleeping is not permitted in this
+	mode, and the caller can be asked to leave it and call again by
+	returning -ECHILD.  -EISDIR may also be returned to tell
+	pathwalk to ignore d_automount or any mounts.
+
+	This function is only used if DCACHE_MANAGE_TRANSIT is set on
+	the dentry being transited from.
+
+``d_real``
+	overlay/union type filesystems implement this method to return
+	one of the underlying dentries hidden by the overlay.  It is
+	used in two different modes:
+
+	Called from file_dentry() it returns the real dentry matching
+	the inode argument.  The real dentry may be from a lower layer
+	already copied up, but still referenced from the file.  This
+	mode is selected with a non-NULL inode argument.
+
+	With NULL inode the topmost real underlying dentry is returned.
+
+Each dentry has a pointer to its parent dentry, as well as a hash list
+of child dentries.  Child dentries are basically like files in a
+directory.
+
+
+Directory Entry Cache API
+--------------------------
+
+There are a number of functions defined which permit a filesystem to
+manipulate dentries:
+
+``dget``
+	open a new handle for an existing dentry (this just increments
+	the usage count)
+
+``dput``
+	close a handle for a dentry (decrements the usage count).  If
+	the usage count drops to 0, and the dentry is still in its
+	parent's hash, the "d_delete" method is called to check whether
+	it should be cached.  If it should not be cached, or if the
+	dentry is not hashed, it is deleted.  Otherwise cached dentries
+	are put into an LRU list to be reclaimed on memory shortage.
+
+``d_drop``
+	this unhashes a dentry from its parents hash list.  A subsequent
+	call to dput() will deallocate the dentry if its usage count
+	drops to 0
+
+``d_delete``
+	delete a dentry.  If there are no other open references to the
+	dentry then the dentry is turned into a negative dentry (the
+	d_iput() method is called).  If there are other references, then
+	d_drop() is called instead
+
+``d_add``
+	add a dentry to its parents hash list and then calls
+	d_instantiate()
+
+``d_instantiate``
+	add a dentry to the alias hash list for the inode and updates
+	the "d_inode" member.  The "i_count" member in the inode
+	structure should be set/incremented.  If the inode pointer is
+	NULL, the dentry is called a "negative dentry".  This function
+	is commonly called when an inode is created for an existing
+	negative dentry
+
+``d_lookup``
+	look up a dentry given its parent and path name component It
+	looks up the child of that given name from the dcache hash
+	table.  If it is found, the reference count is incremented and
+	the dentry is returned.  The caller must use dput() to free the
+	dentry when it finishes using it.
+
+
+Mount Options
+=============
+
+
+Parsing options
+---------------
+
+On mount and remount the filesystem is passed a string containing a
+comma separated list of mount options.  The options can have either of
+these forms:
+
+  option
+  option=value
+
+The <linux/parser.h> header defines an API that helps parse these
+options.  There are plenty of examples on how to use it in existing
+filesystems.
+
+
+Showing options
+---------------
+
+If a filesystem accepts mount options, it must define show_options() to
+show all the currently active options.  The rules are:
+
+  - options MUST be shown which are not default or their values differ
+    from the default
+
+  - options MAY be shown which are enabled by default or have their
+    default value
+
+Options used only internally between a mount helper and the kernel (such
+as file descriptors), or which only have an effect during the mounting
+(such as ones controlling the creation of a journal) are exempt from the
+above rules.
+
+The underlying reason for the above rules is to make sure, that a mount
+can be accurately replicated (e.g. umounting and mounting again) based
+on the information found in /proc/mounts.
+
+
+Resources
+=========
+
+(Note some of these resources are not up-to-date with the latest kernel
+ version.)
+
+Creating Linux virtual filesystems. 2002
+    <http://lwn.net/Articles/13325/>
+
+The Linux Virtual File-system Layer by Neil Brown. 1999
+    <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html>
+
+A tour of the Linux VFS by Michael K. Johnson. 1996
+    <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html>
+
+A small trail through the Linux kernel by Andries Brouwer. 2001
+    <http://www.win.tue.nl/~aeb/linux/vfs/trail.html>
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
deleted file mode 100644
index 57fc576b1f3e..000000000000
--- a/Documentation/filesystems/vfs.txt
+++ /dev/null
@@ -1,1268 +0,0 @@
-
-	      Overview of the Linux Virtual File System
-
-	Original author: Richard Gooch <rgooch@atnf.csiro.au>
-
-  Copyright (C) 1999 Richard Gooch
-  Copyright (C) 2005 Pekka Enberg
-
-  This file is released under the GPLv2.
-
-
-Introduction
-============
-
-The Virtual File System (also known as the Virtual Filesystem Switch)
-is the software layer in the kernel that provides the filesystem
-interface to userspace programs. It also provides an abstraction
-within the kernel which allows different filesystem implementations to
-coexist.
-
-VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so
-on are called from a process context. Filesystem locking is described
-in the document Documentation/filesystems/Locking.
-
-
-Directory Entry Cache (dcache)
-------------------------------
-
-The VFS implements the open(2), stat(2), chmod(2), and similar system
-calls. The pathname argument that is passed to them is used by the VFS
-to search through the directory entry cache (also known as the dentry
-cache or dcache). This provides a very fast look-up mechanism to
-translate a pathname (filename) into a specific dentry. Dentries live
-in RAM and are never saved to disc: they exist only for performance.
-
-The dentry cache is meant to be a view into your entire filespace. As
-most computers cannot fit all dentries in the RAM at the same time,
-some bits of the cache are missing. In order to resolve your pathname
-into a dentry, the VFS may have to resort to creating dentries along
-the way, and then loading the inode. This is done by looking up the
-inode.
-
-
-The Inode Object
-----------------
-
-An individual dentry usually has a pointer to an inode. Inodes are
-filesystem objects such as regular files, directories, FIFOs and other
-beasts.  They live either on the disc (for block device filesystems)
-or in the memory (for pseudo filesystems). Inodes that live on the
-disc are copied into the memory when required and changes to the inode
-are written back to disc. A single inode can be pointed to by multiple
-dentries (hard links, for example, do this).
-
-To look up an inode requires that the VFS calls the lookup() method of
-the parent directory inode. This method is installed by the specific
-filesystem implementation that the inode lives in. Once the VFS has
-the required dentry (and hence the inode), we can do all those boring
-things like open(2) the file, or stat(2) it to peek at the inode
-data. The stat(2) operation is fairly simple: once the VFS has the
-dentry, it peeks at the inode data and passes some of it back to
-userspace.
-
-
-The File Object
----------------
-
-Opening a file requires another operation: allocation of a file
-structure (this is the kernel-side implementation of file
-descriptors). The freshly allocated file structure is initialized with
-a pointer to the dentry and a set of file operation member functions.
-These are taken from the inode data. The open() file method is then
-called so the specific filesystem implementation can do its work. You
-can see that this is another switch performed by the VFS. The file
-structure is placed into the file descriptor table for the process.
-
-Reading, writing and closing files (and other assorted VFS operations)
-is done by using the userspace file descriptor to grab the appropriate
-file structure, and then calling the required file structure method to
-do whatever is required. For as long as the file is open, it keeps the
-dentry in use, which in turn means that the VFS inode is still in use.
-
-
-Registering and Mounting a Filesystem
-=====================================
-
-To register and unregister a filesystem, use the following API
-functions:
-
-   #include <linux/fs.h>
-
-   extern int register_filesystem(struct file_system_type *);
-   extern int unregister_filesystem(struct file_system_type *);
-
-The passed struct file_system_type describes your filesystem. When a
-request is made to mount a filesystem onto a directory in your namespace,
-the VFS will call the appropriate mount() method for the specific
-filesystem.  New vfsmount referring to the tree returned by ->mount()
-will be attached to the mountpoint, so that when pathname resolution
-reaches the mountpoint it will jump into the root of that vfsmount.
-
-You can see all filesystems that are registered to the kernel in the
-file /proc/filesystems.
-
-
-struct file_system_type
------------------------
-
-This describes the filesystem. As of kernel 2.6.39, the following
-members are defined:
-
-struct file_system_type {
-	const char *name;
-	int fs_flags;
-        struct dentry *(*mount) (struct file_system_type *, int,
-                       const char *, void *);
-        void (*kill_sb) (struct super_block *);
-        struct module *owner;
-        struct file_system_type * next;
-        struct list_head fs_supers;
-	struct lock_class_key s_lock_key;
-	struct lock_class_key s_umount_key;
-};
-
-  name: the name of the filesystem type, such as "ext2", "iso9660",
-	"msdos" and so on
-
-  fs_flags: various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.)
-
-  mount: the method to call when a new instance of this
-	filesystem should be mounted
-
-  kill_sb: the method to call when an instance of this filesystem
-	should be shut down
-
-  owner: for internal VFS use: you should initialize this to THIS_MODULE in
-  	most cases.
-
-  next: for internal VFS use: you should initialize this to NULL
-
-  s_lock_key, s_umount_key: lockdep-specific
-
-The mount() method has the following arguments:
-
-  struct file_system_type *fs_type: describes the filesystem, partly initialized
-  	by the specific filesystem code
-
-  int flags: mount flags
-
-  const char *dev_name: the device name we are mounting.
-
-  void *data: arbitrary mount options, usually comes as an ASCII
-	string (see "Mount Options" section)
-
-The mount() method must return the root dentry of the tree requested by
-caller.  An active reference to its superblock must be grabbed and the
-superblock must be locked.  On failure it should return ERR_PTR(error).
-
-The arguments match those of mount(2) and their interpretation
-depends on filesystem type.  E.g. for block filesystems, dev_name is
-interpreted as block device name, that device is opened and if it
-contains a suitable filesystem image the method creates and initializes
-struct super_block accordingly, returning its root dentry to caller.
-
-->mount() may choose to return a subtree of existing filesystem - it
-doesn't have to create a new one.  The main result from the caller's
-point of view is a reference to dentry at the root of (sub)tree to
-be attached; creation of new superblock is a common side effect.
-
-The most interesting member of the superblock structure that the
-mount() method fills in is the "s_op" field. This is a pointer to
-a "struct super_operations" which describes the next level of the
-filesystem implementation.
-
-Usually, a filesystem uses one of the generic mount() implementations
-and provides a fill_super() callback instead. The generic variants are:
-
-  mount_bdev: mount a filesystem residing on a block device
-
-  mount_nodev: mount a filesystem that is not backed by a device
-
-  mount_single: mount a filesystem which shares the instance between
-  	all mounts
-
-A fill_super() callback implementation has the following arguments:
-
-  struct super_block *sb: the superblock structure. The callback
-  	must initialize this properly.
-
-  void *data: arbitrary mount options, usually comes as an ASCII
-	string (see "Mount Options" section)
-
-  int silent: whether or not to be silent on error
-
-
-The Superblock Object
-=====================
-
-A superblock object represents a mounted filesystem.
-
-
-struct super_operations
------------------------
-
-This describes how the VFS can manipulate the superblock of your
-filesystem. As of kernel 2.6.22, the following members are defined:
-
-struct super_operations {
-        struct inode *(*alloc_inode)(struct super_block *sb);
-        void (*destroy_inode)(struct inode *);
-
-        void (*dirty_inode) (struct inode *, int flags);
-        int (*write_inode) (struct inode *, int);
-        void (*drop_inode) (struct inode *);
-        void (*delete_inode) (struct inode *);
-        void (*put_super) (struct super_block *);
-        int (*sync_fs)(struct super_block *sb, int wait);
-        int (*freeze_fs) (struct super_block *);
-        int (*unfreeze_fs) (struct super_block *);
-        int (*statfs) (struct dentry *, struct kstatfs *);
-        int (*remount_fs) (struct super_block *, int *, char *);
-        void (*clear_inode) (struct inode *);
-        void (*umount_begin) (struct super_block *);
-
-        int (*show_options)(struct seq_file *, struct dentry *);
-
-        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
-        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
-	int (*nr_cached_objects)(struct super_block *);
-	void (*free_cached_objects)(struct super_block *, int);
-};
-
-All methods are called without any locks being held, unless otherwise
-noted. This means that most methods can block safely. All methods are
-only called from a process context (i.e. not from an interrupt handler
-or bottom half).
-
-  alloc_inode: this method is called by alloc_inode() to allocate memory
- 	for struct inode and initialize it.  If this function is not
- 	defined, a simple 'struct inode' is allocated.  Normally
- 	alloc_inode will be used to allocate a larger structure which
- 	contains a 'struct inode' embedded within it.
-
-  destroy_inode: this method is called by destroy_inode() to release
-  	resources allocated for struct inode.  It is only required if
-  	->alloc_inode was defined and simply undoes anything done by
-	->alloc_inode.
-
-  dirty_inode: this method is called by the VFS to mark an inode dirty.
-
-  write_inode: this method is called when the VFS needs to write an
-	inode to disc.  The second parameter indicates whether the write
-	should be synchronous or not, not all filesystems check this flag.
-
-  drop_inode: called when the last access to the inode is dropped,
-	with the inode->i_lock spinlock held.
-
-	This method should be either NULL (normal UNIX filesystem
-	semantics) or "generic_delete_inode" (for filesystems that do not
-	want to cache inodes - causing "delete_inode" to always be
-	called regardless of the value of i_nlink)
-
-	The "generic_delete_inode()" behavior is equivalent to the
-	old practice of using "force_delete" in the put_inode() case,
-	but does not have the races that the "force_delete()" approach
-	had. 
-
-  delete_inode: called when the VFS wants to delete an inode
-
-  put_super: called when the VFS wishes to free the superblock
-	(i.e. unmount). This is called with the superblock lock held
-
-  sync_fs: called when VFS is writing out all dirty data associated with
-  	a superblock. The second parameter indicates whether the method
-	should wait until the write out has been completed. Optional.
-
-  freeze_fs: called when VFS is locking a filesystem and
-  	forcing it into a consistent state.  This method is currently
-  	used by the Logical Volume Manager (LVM).
-
-  unfreeze_fs: called when VFS is unlocking a filesystem and making it writable
-  	again.
-
-  statfs: called when the VFS needs to get filesystem statistics.
-
-  remount_fs: called when the filesystem is remounted. This is called
-	with the kernel lock held
-
-  clear_inode: called then the VFS clears the inode. Optional
-
-  umount_begin: called when the VFS is unmounting a filesystem.
-
-  show_options: called by the VFS to show mount options for
-	/proc/<pid>/mounts.  (see "Mount Options" section)
-
-  quota_read: called by the VFS to read from filesystem quota file.
-
-  quota_write: called by the VFS to write to filesystem quota file.
-
-  nr_cached_objects: called by the sb cache shrinking function for the
-	filesystem to return the number of freeable cached objects it contains.
-	Optional.
-
-  free_cache_objects: called by the sb cache shrinking function for the
-	filesystem to scan the number of objects indicated to try to free them.
-	Optional, but any filesystem implementing this method needs to also
-	implement ->nr_cached_objects for it to be called correctly.
-
-	We can't do anything with any errors that the filesystem might
-	encountered, hence the void return type. This will never be called if
-	the VM is trying to reclaim under GFP_NOFS conditions, hence this
-	method does not need to handle that situation itself.
-
-	Implementations must include conditional reschedule calls inside any
-	scanning loop that is done. This allows the VFS to determine
-	appropriate scan batch sizes without having to worry about whether
-	implementations will cause holdoff problems due to large scan batch
-	sizes.
-
-Whoever sets up the inode is responsible for filling in the "i_op" field. This
-is a pointer to a "struct inode_operations" which describes the methods that
-can be performed on individual inodes.
-
-struct xattr_handlers
----------------------
-
-On filesystems that support extended attributes (xattrs), the s_xattr
-superblock field points to a NULL-terminated array of xattr handlers.  Extended
-attributes are name:value pairs.
-
-  name: Indicates that the handler matches attributes with the specified name
-	(such as "system.posix_acl_access"); the prefix field must be NULL.
-
-  prefix: Indicates that the handler matches all attributes with the specified
-	name prefix (such as "user."); the name field must be NULL.
-
-  list: Determine if attributes matching this xattr handler should be listed
-	for a particular dentry.  Used by some listxattr implementations like
-	generic_listxattr.
-
-  get: Called by the VFS to get the value of a particular extended attribute.
-	This method is called by the getxattr(2) system call.
-
-  set: Called by the VFS to set the value of a particular extended attribute.
-	When the new value is NULL, called to remove a particular extended
-	attribute.  This method is called by the the setxattr(2) and
-	removexattr(2) system calls.
-
-When none of the xattr handlers of a filesystem match the specified attribute
-name or when a filesystem doesn't support extended attributes, the various
-*xattr(2) system calls return -EOPNOTSUPP.
-
-
-The Inode Object
-================
-
-An inode object represents an object within the filesystem.
-
-
-struct inode_operations
------------------------
-
-This describes how the VFS can manipulate an inode in your
-filesystem. As of kernel 2.6.22, the following members are defined:
-
-struct inode_operations {
-	int (*create) (struct inode *,struct dentry *, umode_t, bool);
-	struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
-	int (*link) (struct dentry *,struct inode *,struct dentry *);
-	int (*unlink) (struct inode *,struct dentry *);
-	int (*symlink) (struct inode *,struct dentry *,const char *);
-	int (*mkdir) (struct inode *,struct dentry *,umode_t);
-	int (*rmdir) (struct inode *,struct dentry *);
-	int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
-	int (*rename) (struct inode *, struct dentry *,
-			struct inode *, struct dentry *, unsigned int);
-	int (*readlink) (struct dentry *, char __user *,int);
-	const char *(*get_link) (struct dentry *, struct inode *,
-				 struct delayed_call *);
-	int (*permission) (struct inode *, int);
-	int (*get_acl)(struct inode *, int);
-	int (*setattr) (struct dentry *, struct iattr *);
-	int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
-	ssize_t (*listxattr) (struct dentry *, char *, size_t);
-	void (*update_time)(struct inode *, struct timespec *, int);
-	int (*atomic_open)(struct inode *, struct dentry *, struct file *,
-			unsigned open_flag, umode_t create_mode);
-	int (*tmpfile) (struct inode *, struct dentry *, umode_t);
-};
-
-Again, all methods are called without any locks being held, unless
-otherwise noted.
-
-  create: called by the open(2) and creat(2) system calls. Only
-	required if you want to support regular files. The dentry you
-	get should not have an inode (i.e. it should be a negative
-	dentry). Here you will probably call d_instantiate() with the
-	dentry and the newly created inode
-
-  lookup: called when the VFS needs to look up an inode in a parent
-	directory. The name to look for is found in the dentry. This
-	method must call d_add() to insert the found inode into the
-	dentry. The "i_count" field in the inode structure should be
-	incremented. If the named inode does not exist a NULL inode
-	should be inserted into the dentry (this is called a negative
-	dentry). Returning an error code from this routine must only
-	be done on a real error, otherwise creating inodes with system
-	calls like create(2), mknod(2), mkdir(2) and so on will fail.
-	If you wish to overload the dentry methods then you should
-	initialise the "d_dop" field in the dentry; this is a pointer
-	to a struct "dentry_operations".
-	This method is called with the directory inode semaphore held
-
-  link: called by the link(2) system call. Only required if you want
-	to support hard links. You will probably need to call
-	d_instantiate() just as you would in the create() method
-
-  unlink: called by the unlink(2) system call. Only required if you
-	want to support deleting inodes
-
-  symlink: called by the symlink(2) system call. Only required if you
-	want to support symlinks. You will probably need to call
-	d_instantiate() just as you would in the create() method
-
-  mkdir: called by the mkdir(2) system call. Only required if you want
-	to support creating subdirectories. You will probably need to
-	call d_instantiate() just as you would in the create() method
-
-  rmdir: called by the rmdir(2) system call. Only required if you want
-	to support deleting subdirectories
-
-  mknod: called by the mknod(2) system call to create a device (char,
-	block) inode or a named pipe (FIFO) or socket. Only required
-	if you want to support creating these types of inodes. You
-	will probably need to call d_instantiate() just as you would
-	in the create() method
-
-  rename: called by the rename(2) system call to rename the object to
-	have the parent and name given by the second inode and dentry.
-
-	The filesystem must return -EINVAL for any unsupported or
-	unknown	flags.  Currently the following flags are implemented:
-	(1) RENAME_NOREPLACE: this flag indicates that if the target
-	of the rename exists the rename should fail with -EEXIST
-	instead of replacing the target.  The VFS already checks for
-	existence, so for local filesystems the RENAME_NOREPLACE
-	implementation is equivalent to plain rename.
-	(2) RENAME_EXCHANGE: exchange source and target.  Both must
-	exist; this is checked by the VFS.  Unlike plain rename,
-	source and target may be of different type.
-
-  get_link: called by the VFS to follow a symbolic link to the
-	inode it points to.  Only required if you want to support
-	symbolic links.  This method returns the symlink body
-	to traverse (and possibly resets the current position with
-	nd_jump_link()).  If the body won't go away until the inode
-	is gone, nothing else is needed; if it needs to be otherwise
-	pinned, arrange for its release by having get_link(..., ..., done)
-	do set_delayed_call(done, destructor, argument).
-	In that case destructor(argument) will be called once VFS is
-	done with the body you've returned.
-	May be called in RCU mode; that is indicated by NULL dentry
-	argument.  If request can't be handled without leaving RCU mode,
-	have it return ERR_PTR(-ECHILD).
-
-	If the filesystem stores the symlink target in ->i_link, the
-	VFS may use it directly without calling ->get_link(); however,
-	->get_link() must still be provided.  ->i_link must not be
-	freed until after an RCU grace period.  Writing to ->i_link
-	post-iget() time requires a 'release' memory barrier.
-
-  readlink: this is now just an override for use by readlink(2) for the
-	cases when ->get_link uses nd_jump_link() or object is not in
-	fact a symlink.  Normally filesystems should only implement
-	->get_link for symlinks and readlink(2) will automatically use
-	that.
-
-  permission: called by the VFS to check for access rights on a POSIX-like
-  	filesystem.
-
-	May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk
-        mode, the filesystem must check the permission without blocking or
-	storing to the inode.
-
-	If a situation is encountered that rcu-walk cannot handle, return
-	-ECHILD and it will be called again in ref-walk mode.
-
-  setattr: called by the VFS to set attributes for a file. This method
-  	is called by chmod(2) and related system calls.
-
-  getattr: called by the VFS to get attributes of a file. This method
-  	is called by stat(2) and related system calls.
-
-  listxattr: called by the VFS to list all extended attributes for a
-	given file. This method is called by the listxattr(2) system call.
-
-  update_time: called by the VFS to update a specific time or the i_version of
-  	an inode.  If this is not defined the VFS will update the inode itself
-  	and call mark_inode_dirty_sync.
-
-  atomic_open: called on the last component of an open.  Using this optional
-  	method the filesystem can look up, possibly create and open the file in
-	one atomic operation.  If it wants to leave actual opening to the
-	caller (e.g. if the file turned out to be a symlink, device, or just
-	something filesystem won't do atomic open for), it may signal this by
-	returning finish_no_open(file, dentry).  This method is only called if
-	the last component is negative or needs lookup.  Cached positive dentries
-	are still handled by f_op->open().  If the file was created,
-	FMODE_CREATED flag should be set in file->f_mode.  In case of O_EXCL
-	the method must only succeed if the file didn't exist and hence FMODE_CREATED
-	shall always be set on success.
-
-  tmpfile: called in the end of O_TMPFILE open().  Optional, equivalent to
-	atomically creating, opening and unlinking a file in given directory.
-
-The Address Space Object
-========================
-
-The address space object is used to group and manage pages in the page
-cache.  It can be used to keep track of the pages in a file (or
-anything else) and also track the mapping of sections of the file into
-process address spaces.
-
-There are a number of distinct yet related services that an
-address-space can provide.  These include communicating memory
-pressure, page lookup by address, and keeping track of pages tagged as
-Dirty or Writeback.
-
-The first can be used independently to the others.  The VM can try to
-either write dirty pages in order to clean them, or release clean
-pages in order to reuse them.  To do this it can call the ->writepage
-method on dirty pages, and ->releasepage on clean pages with
-PagePrivate set. Clean pages without PagePrivate and with no external
-references will be released without notice being given to the
-address_space.
-
-To achieve this functionality, pages need to be placed on an LRU with
-lru_cache_add and mark_page_active needs to be called whenever the
-page is used.
-
-Pages are normally kept in a radix tree index by ->index. This tree
-maintains information about the PG_Dirty and PG_Writeback status of
-each page, so that pages with either of these flags can be found
-quickly.
-
-The Dirty tag is primarily used by mpage_writepages - the default
-->writepages method.  It uses the tag to find dirty pages to call
-->writepage on.  If mpage_writepages is not used (i.e. the address
-provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is
-almost unused.  write_inode_now and sync_inode do use it (through
-__sync_single_inode) to check if ->writepages has been successful in
-writing out the whole address_space.
-
-The Writeback tag is used by filemap*wait* and sync_page* functions,
-via filemap_fdatawait_range, to wait for all writeback to complete.
-
-An address_space handler may attach extra information to a page,
-typically using the 'private' field in the 'struct page'.  If such
-information is attached, the PG_Private flag should be set.  This will
-cause various VM routines to make extra calls into the address_space
-handler to deal with that data.
-
-An address space acts as an intermediate between storage and
-application.  Data is read into the address space a whole page at a
-time, and provided to the application either by copying of the page,
-or by memory-mapping the page.
-Data is written into the address space by the application, and then
-written-back to storage typically in whole pages, however the
-address_space has finer control of write sizes.
-
-The read process essentially only requires 'readpage'.  The write
-process is more complicated and uses write_begin/write_end or
-set_page_dirty to write data into the address_space, and writepage
-and writepages to writeback data to storage.
-
-Adding and removing pages to/from an address_space is protected by the
-inode's i_mutex.
-
-When data is written to a page, the PG_Dirty flag should be set.  It
-typically remains set until writepage asks for it to be written.  This
-should clear PG_Dirty and set PG_Writeback.  It can be actually
-written at any point after PG_Dirty is clear.  Once it is known to be
-safe, PG_Writeback is cleared.
-
-Writeback makes use of a writeback_control structure to direct the
-operations.  This gives the the writepage and writepages operations some
-information about the nature of and reason for the writeback request,
-and the constraints under which it is being done.  It is also used to
-return information back to the caller about the result of a writepage or
-writepages request.
-
-Handling errors during writeback
---------------------------------
-Most applications that do buffered I/O will periodically call a file
-synchronization call (fsync, fdatasync, msync or sync_file_range) to
-ensure that data written has made it to the backing store.  When there
-is an error during writeback, they expect that error to be reported when
-a file sync request is made.  After an error has been reported on one
-request, subsequent requests on the same file descriptor should return
-0, unless further writeback errors have occurred since the previous file
-syncronization.
-
-Ideally, the kernel would report errors only on file descriptions on
-which writes were done that subsequently failed to be written back.  The
-generic pagecache infrastructure does not track the file descriptions
-that have dirtied each individual page however, so determining which
-file descriptors should get back an error is not possible.
-
-Instead, the generic writeback error tracking infrastructure in the
-kernel settles for reporting errors to fsync on all file descriptions
-that were open at the time that the error occurred.  In a situation with
-multiple writers, all of them will get back an error on a subsequent fsync,
-even if all of the writes done through that particular file descriptor
-succeeded (or even if there were no writes on that file descriptor at all).
-
-Filesystems that wish to use this infrastructure should call
-mapping_set_error to record the error in the address_space when it
-occurs.  Then, after writing back data from the pagecache in their
-file->fsync operation, they should call file_check_and_advance_wb_err to
-ensure that the struct file's error cursor has advanced to the correct
-point in the stream of errors emitted by the backing device(s).
-
-struct address_space_operations
--------------------------------
-
-This describes how the VFS can manipulate mapping of a file to page cache in
-your filesystem. The following members are defined:
-
-struct address_space_operations {
-	int (*writepage)(struct page *page, struct writeback_control *wbc);
-	int (*readpage)(struct file *, struct page *);
-	int (*writepages)(struct address_space *, struct writeback_control *);
-	int (*set_page_dirty)(struct page *page);
-	int (*readpages)(struct file *filp, struct address_space *mapping,
-			struct list_head *pages, unsigned nr_pages);
-	int (*write_begin)(struct file *, struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned flags,
-				struct page **pagep, void **fsdata);
-	int (*write_end)(struct file *, struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata);
-	sector_t (*bmap)(struct address_space *, sector_t);
-	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
-	int (*releasepage) (struct page *, int);
-	void (*freepage)(struct page *);
-	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
-	/* isolate a page for migration */
-	bool (*isolate_page) (struct page *, isolate_mode_t);
-	/* migrate the contents of a page to the specified target */
-	int (*migratepage) (struct page *, struct page *);
-	/* put migration-failed page back to right list */
-	void (*putback_page) (struct page *);
-	int (*launder_page) (struct page *);
-
-	int (*is_partially_uptodate) (struct page *, unsigned long,
-					unsigned long);
-	void (*is_dirty_writeback) (struct page *, bool *, bool *);
-	int (*error_remove_page) (struct mapping *mapping, struct page *page);
-	int (*swap_activate)(struct file *);
-	int (*swap_deactivate)(struct file *);
-};
-
-  writepage: called by the VM to write a dirty page to backing store.
-      This may happen for data integrity reasons (i.e. 'sync'), or
-      to free up memory (flush).  The difference can be seen in
-      wbc->sync_mode.
-      The PG_Dirty flag has been cleared and PageLocked is true.
-      writepage should start writeout, should set PG_Writeback,
-      and should make sure the page is unlocked, either synchronously
-      or asynchronously when the write operation completes.
-
-      If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to
-      try too hard if there are problems, and may choose to write out
-      other pages from the mapping if that is easier (e.g. due to
-      internal dependencies).  If it chooses not to start writeout, it
-      should return AOP_WRITEPAGE_ACTIVATE so that the VM will not keep
-      calling ->writepage on that page.
-
-      See the file "Locking" for more details.
-
-  readpage: called by the VM to read a page from backing store.
-       The page will be Locked when readpage is called, and should be
-       unlocked and marked uptodate once the read completes.
-       If ->readpage discovers that it needs to unlock the page for
-       some reason, it can do so, and then return AOP_TRUNCATED_PAGE.
-       In this case, the page will be relocated, relocked and if
-       that all succeeds, ->readpage will be called again.
-
-  writepages: called by the VM to write out pages associated with the
-  	address_space object.  If wbc->sync_mode is WBC_SYNC_ALL, then
-  	the writeback_control will specify a range of pages that must be
-  	written out.  If it is WBC_SYNC_NONE, then a nr_to_write is given
-	and that many pages should be written if possible.
-	If no ->writepages is given, then mpage_writepages is used
-  	instead.  This will choose pages from the address space that are
-  	tagged as DIRTY and will pass them to ->writepage.
-
-  set_page_dirty: called by the VM to set a page dirty.
-        This is particularly needed if an address space attaches
-        private data to a page, and that data needs to be updated when
-        a page is dirtied.  This is called, for example, when a memory
-	mapped page gets modified.
-	If defined, it should set the PageDirty flag, and the
-        PAGECACHE_TAG_DIRTY tag in the radix tree.
-
-  readpages: called by the VM to read pages associated with the address_space
-  	object. This is essentially just a vector version of
-  	readpage.  Instead of just one page, several pages are
-  	requested.
-	readpages is only used for read-ahead, so read errors are
-  	ignored.  If anything goes wrong, feel free to give up.
-
-  write_begin:
-	Called by the generic buffered write code to ask the filesystem to
-	prepare to write len bytes at the given offset in the file. The
-	address_space should check that the write will be able to complete,
-	by allocating space if necessary and doing any other internal
-	housekeeping.  If the write will update parts of any basic-blocks on
-	storage, then those blocks should be pre-read (if they haven't been
-	read already) so that the updated blocks can be written out properly.
-
-        The filesystem must return the locked pagecache page for the specified
-	offset, in *pagep, for the caller to write into.
-
-	It must be able to cope with short writes (where the length passed to
-	write_begin is greater than the number of bytes copied into the page).
-
-	flags is a field for AOP_FLAG_xxx flags, described in
-	include/linux/fs.h.
-
-        A void * may be returned in fsdata, which then gets passed into
-        write_end.
-
-        Returns 0 on success; < 0 on failure (which is the error code), in
-	which case write_end is not called.
-
-  write_end: After a successful write_begin, and data copy, write_end must
-        be called. len is the original len passed to write_begin, and copied
-        is the amount that was able to be copied.
-
-        The filesystem must take care of unlocking the page and releasing it
-        refcount, and updating i_size.
-
-        Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
-        that were able to be copied into pagecache.
-
-  bmap: called by the VFS to map a logical block offset within object to
-  	physical block number. This method is used by the FIBMAP
-  	ioctl and for working with swap-files.  To be able to swap to
-  	a file, the file must have a stable mapping to a block
-  	device.  The swap system does not go through the filesystem
-  	but instead uses bmap to find out where the blocks in the file
-  	are and uses those addresses directly.
-
-  invalidatepage: If a page has PagePrivate set, then invalidatepage
-        will be called when part or all of the page is to be removed
-	from the address space.  This generally corresponds to either a
-	truncation, punch hole  or a complete invalidation of the address
-	space (in the latter case 'offset' will always be 0 and 'length'
-	will be PAGE_SIZE). Any private data associated with the page
-	should be updated to reflect this truncation.  If offset is 0 and
-	length is PAGE_SIZE, then the private data should be released,
-	because the page must be able to be completely discarded.  This may
-	be done by calling the ->releasepage function, but in this case the
-	release MUST succeed.
-
-  releasepage: releasepage is called on PagePrivate pages to indicate
-        that the page should be freed if possible.  ->releasepage
-        should remove any private data from the page and clear the
-        PagePrivate flag. If releasepage() fails for some reason, it must
-	indicate failure with a 0 return value.
-	releasepage() is used in two distinct though related cases.  The
-	first is when the VM finds a clean page with no active users and
-        wants to make it a free page.  If ->releasepage succeeds, the
-        page will be removed from the address_space and become free.
-
-	The second case is when a request has been made to invalidate
-        some or all pages in an address_space.  This can happen
-        through the fadvise(POSIX_FADV_DONTNEED) system call or by the
-        filesystem explicitly requesting it as nfs and 9fs do (when
-        they believe the cache may be out of date with storage) by
-        calling invalidate_inode_pages2().
-	If the filesystem makes such a call, and needs to be certain
-        that all pages are invalidated, then its releasepage will
-        need to ensure this.  Possibly it can clear the PageUptodate
-        bit if it cannot free private data yet.
-
-  freepage: freepage is called once the page is no longer visible in
-        the page cache in order to allow the cleanup of any private
-	data. Since it may be called by the memory reclaimer, it
-	should not assume that the original address_space mapping still
-	exists, and it should not block.
-
-  direct_IO: called by the generic read/write routines to perform
-        direct_IO - that is IO requests which bypass the page cache
-        and transfer data directly between the storage and the
-        application's address space.
-
-  isolate_page: Called by the VM when isolating a movable non-lru page.
-	If page is successfully isolated, VM marks the page as PG_isolated
-	via __SetPageIsolated.
-
-  migrate_page:  This is used to compact the physical memory usage.
-        If the VM wants to relocate a page (maybe off a memory card
-        that is signalling imminent failure) it will pass a new page
-	and an old page to this function.  migrate_page should
-	transfer any private data across and update any references
-        that it has to the page.
-
-  putback_page: Called by the VM when isolated page's migration fails.
-
-  launder_page: Called before freeing a page - it writes back the dirty page. To
-  	prevent redirtying the page, it is kept locked during the whole
-	operation.
-
-  is_partially_uptodate: Called by the VM when reading a file through the
-	pagecache when the underlying blocksize != pagesize. If the required
-	block is up to date then the read can complete without needing the IO
-	to bring the whole page up to date.
-
-  is_dirty_writeback: Called by the VM when attempting to reclaim a page.
-	The VM uses dirty and writeback information to determine if it needs
-	to stall to allow flushers a chance to complete some IO. Ordinarily
-	it can use PageDirty and PageWriteback but some filesystems have
-	more complex state (unstable pages in NFS prevent reclaim) or
-	do not set those flags due to locking problems. This callback
-	allows a filesystem to indicate to the VM if a page should be
-	treated as dirty or writeback for the purposes of stalling.
-
-  error_remove_page: normally set to generic_error_remove_page if truncation
-	is ok for this address space. Used for memory failure handling.
-	Setting this implies you deal with pages going away under you,
-	unless you have them locked or reference counts increased.
-
-  swap_activate: Called when swapon is used on a file to allocate
-	space if necessary and pin the block lookup information in
-	memory. A return value of zero indicates success,
-	in which case this file can be used to back swapspace.
-
-  swap_deactivate: Called during swapoff on files where swap_activate
-	was successful.
-
-
-The File Object
-===============
-
-A file object represents a file opened by a process. This is also known
-as an "open file description" in POSIX parlance.
-
-
-struct file_operations
-----------------------
-
-This describes how the VFS can manipulate an open file. As of kernel
-4.18, the following members are defined:
-
-struct file_operations {
-	struct module *owner;
-	loff_t (*llseek) (struct file *, loff_t, int);
-	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
-	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
-	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
-	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
-	int (*iopoll)(struct kiocb *kiocb, bool spin);
-	int (*iterate) (struct file *, struct dir_context *);
-	int (*iterate_shared) (struct file *, struct dir_context *);
-	__poll_t (*poll) (struct file *, struct poll_table_struct *);
-	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
-	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
-	int (*mmap) (struct file *, struct vm_area_struct *);
-	int (*open) (struct inode *, struct file *);
-	int (*flush) (struct file *, fl_owner_t id);
-	int (*release) (struct inode *, struct file *);
-	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
-	int (*fasync) (int, struct file *, int);
-	int (*lock) (struct file *, int, struct file_lock *);
-	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
-	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
-	int (*check_flags)(int);
-	int (*flock) (struct file *, int, struct file_lock *);
-	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
-	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
-	int (*setlease)(struct file *, long, struct file_lock **, void **);
-	long (*fallocate)(struct file *file, int mode, loff_t offset,
-			  loff_t len);
-	void (*show_fdinfo)(struct seq_file *m, struct file *f);
-#ifndef CONFIG_MMU
-	unsigned (*mmap_capabilities)(struct file *);
-#endif
-	ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
-	loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
-				   struct file *file_out, loff_t pos_out,
-				   loff_t len, unsigned int remap_flags);
-	int (*fadvise)(struct file *, loff_t, loff_t, int);
-};
-
-Again, all methods are called without any locks being held, unless
-otherwise noted.
-
-  llseek: called when the VFS needs to move the file position index
-
-  read: called by read(2) and related system calls
-
-  read_iter: possibly asynchronous read with iov_iter as destination
-
-  write: called by write(2) and related system calls
-
-  write_iter: possibly asynchronous write with iov_iter as source
-
-  iopoll: called when aio wants to poll for completions on HIPRI iocbs
-
-  iterate: called when the VFS needs to read the directory contents
-
-  iterate_shared: called when the VFS needs to read the directory contents
-	when filesystem supports concurrent dir iterators
-
-  poll: called by the VFS when a process wants to check if there is
-	activity on this file and (optionally) go to sleep until there
-	is activity. Called by the select(2) and poll(2) system calls
-
-  unlocked_ioctl: called by the ioctl(2) system call.
-
-  compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
- 	 are used on 64 bit kernels.
-
-  mmap: called by the mmap(2) system call
-
-  open: called by the VFS when an inode should be opened. When the VFS
-	opens a file, it creates a new "struct file". It then calls the
-	open method for the newly allocated file structure. You might
-	think that the open method really belongs in
-	"struct inode_operations", and you may be right. I think it's
-	done the way it is because it makes filesystems simpler to
-	implement. The open() method is a good place to initialize the
-	"private_data" member in the file structure if you want to point
-	to a device structure
-
-  flush: called by the close(2) system call to flush a file
-
-  release: called when the last reference to an open file is closed
-
-  fsync: called by the fsync(2) system call. Also see the section above
-	 entitled "Handling errors during writeback".
-
-  fasync: called by the fcntl(2) system call when asynchronous
-	(non-blocking) mode is enabled for a file
-
-  lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW
-  	commands
-
-  get_unmapped_area: called by the mmap(2) system call
-
-  check_flags: called by the fcntl(2) system call for F_SETFL command
-
-  flock: called by the flock(2) system call
-
-  splice_write: called by the VFS to splice data from a pipe to a file. This
-		method is used by the splice(2) system call
-
-  splice_read: called by the VFS to splice data from file to a pipe. This
-	       method is used by the splice(2) system call
-
-  setlease: called by the VFS to set or release a file lock lease. setlease
-	    implementations should call generic_setlease to record or remove
-	    the lease in the inode after setting it.
-
-  fallocate: called by the VFS to preallocate blocks or punch a hole.
-
-  copy_file_range: called by the copy_file_range(2) system call.
-
-  remap_file_range: called by the ioctl(2) system call for FICLONERANGE and
-	FICLONE and FIDEDUPERANGE commands to remap file ranges.  An
-	implementation should remap len bytes at pos_in of the source file into
-	the dest file at pos_out.  Implementations must handle callers passing
-	in len == 0; this means "remap to the end of the source file".  The
-	return value should the number of bytes remapped, or the usual
-	negative error code if errors occurred before any bytes were remapped.
-	The remap_flags parameter accepts REMAP_FILE_* flags.  If
-	REMAP_FILE_DEDUP is set then the implementation must only remap if the
-	requested file ranges have identical contents.  If REMAP_CAN_SHORTEN is
-	set, the caller is ok with the implementation shortening the request
-	length to satisfy alignment or EOF requirements (or any other reason).
-
-  fadvise: possibly called by the fadvise64() system call.
-
-Note that the file operations are implemented by the specific
-filesystem in which the inode resides. When opening a device node
-(character or block special) most filesystems will call special
-support routines in the VFS which will locate the required device
-driver information. These support routines replace the filesystem file
-operations with those for the device driver, and then proceed to call
-the new open() method for the file. This is how opening a device file
-in the filesystem eventually ends up calling the device driver open()
-method.
-
-
-Directory Entry Cache (dcache)
-==============================
-
-
-struct dentry_operations
-------------------------
-
-This describes how a filesystem can overload the standard dentry
-operations. Dentries and the dcache are the domain of the VFS and the
-individual filesystem implementations. Device drivers have no business
-here. These methods may be set to NULL, as they are either optional or
-the VFS uses a default. As of kernel 2.6.22, the following members are
-defined:
-
-struct dentry_operations {
-	int (*d_revalidate)(struct dentry *, unsigned int);
-	int (*d_weak_revalidate)(struct dentry *, unsigned int);
-	int (*d_hash)(const struct dentry *, struct qstr *);
-	int (*d_compare)(const struct dentry *,
-			unsigned int, const char *, const struct qstr *);
-	int (*d_delete)(const struct dentry *);
-	int (*d_init)(struct dentry *);
-	void (*d_release)(struct dentry *);
-	void (*d_iput)(struct dentry *, struct inode *);
-	char *(*d_dname)(struct dentry *, char *, int);
-	struct vfsmount *(*d_automount)(struct path *);
-	int (*d_manage)(const struct path *, bool);
-	struct dentry *(*d_real)(struct dentry *, const struct inode *);
-};
-
-  d_revalidate: called when the VFS needs to revalidate a dentry. This
-	is called whenever a name look-up finds a dentry in the
-	dcache. Most local filesystems leave this as NULL, because all their
-	dentries in the dcache are valid. Network filesystems are different
-	since things can change on the server without the client necessarily
-	being aware of it.
-
-	This function should return a positive value if the dentry is still
-	valid, and zero or a negative error code if it isn't.
-
-	d_revalidate may be called in rcu-walk mode (flags & LOOKUP_RCU).
-	If in rcu-walk mode, the filesystem must revalidate the dentry without
-	blocking or storing to the dentry, d_parent and d_inode should not be
-	used without care (because they can change and, in d_inode case, even
-	become NULL under us).
-
-	If a situation is encountered that rcu-walk cannot handle, return
-	-ECHILD and it will be called again in ref-walk mode.
-
- d_weak_revalidate: called when the VFS needs to revalidate a "jumped" dentry.
-	This is called when a path-walk ends at dentry that was not acquired by
-	doing a lookup in the parent directory. This includes "/", "." and "..",
-	as well as procfs-style symlinks and mountpoint traversal.
-
-	In this case, we are less concerned with whether the dentry is still
-	fully correct, but rather that the inode is still valid. As with
-	d_revalidate, most local filesystems will set this to NULL since their
-	dcache entries are always valid.
-
-	This function has the same return code semantics as d_revalidate.
-
-	d_weak_revalidate is only called after leaving rcu-walk mode.
-
-  d_hash: called when the VFS adds a dentry to the hash table. The first
-	dentry passed to d_hash is the parent directory that the name is
-	to be hashed into.
-
-	Same locking and synchronisation rules as d_compare regarding
-	what is safe to dereference etc.
-
-  d_compare: called to compare a dentry name with a given name. The first
-	dentry is the parent of the dentry to be compared, the second is
-	the child dentry. len and name string are properties of the dentry
-	to be compared. qstr is the name to compare it with.
-
-	Must be constant and idempotent, and should not take locks if
-	possible, and should not or store into the dentry.
-	Should not dereference pointers outside the dentry without
-	lots of care (eg.  d_parent, d_inode, d_name should not be used).
-
-	However, our vfsmount is pinned, and RCU held, so the dentries and
-	inodes won't disappear, neither will our sb or filesystem module.
-	->d_sb may be used.
-
-	It is a tricky calling convention because it needs to be called under
-	"rcu-walk", ie. without any locks or references on things.
-
-  d_delete: called when the last reference to a dentry is dropped and the
-	dcache is deciding whether or not to cache it. Return 1 to delete
-	immediately, or 0 to cache the dentry. Default is NULL which means to
-	always cache a reachable dentry. d_delete must be constant and
-	idempotent.
-
-  d_init: called when a dentry is allocated
-
-  d_release: called when a dentry is really deallocated
-
-  d_iput: called when a dentry loses its inode (just prior to its
-	being deallocated). The default when this is NULL is that the
-	VFS calls iput(). If you define this method, you must call
-	iput() yourself
-
-  d_dname: called when the pathname of a dentry should be generated.
-	Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay
-	pathname generation. (Instead of doing it when dentry is created,
-	it's done only when the path is needed.). Real filesystems probably
-	dont want to use it, because their dentries are present in global
-	dcache hash, so their hash should be an invariant. As no lock is
-	held, d_dname() should not try to modify the dentry itself, unless
-	appropriate SMP safety is used. CAUTION : d_path() logic is quite
-	tricky. The correct way to return for example "Hello" is to put it
-	at the end of the buffer, and returns a pointer to the first char.
-	dynamic_dname() helper function is provided to take care of this.
-
-	Example :
-
-	static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen)
-	{
-		return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
-				dentry->d_inode->i_ino);
-	}
-
-  d_automount: called when an automount dentry is to be traversed (optional).
-	This should create a new VFS mount record and return the record to the
-	caller.  The caller is supplied with a path parameter giving the
-	automount directory to describe the automount target and the parent
-	VFS mount record to provide inheritable mount parameters.  NULL should
-	be returned if someone else managed to make the automount first.  If
-	the vfsmount creation failed, then an error code should be returned.
-	If -EISDIR is returned, then the directory will be treated as an
-	ordinary directory and returned to pathwalk to continue walking.
-
-	If a vfsmount is returned, the caller will attempt to mount it on the
-	mountpoint and will remove the vfsmount from its expiration list in
-	the case of failure.  The vfsmount should be returned with 2 refs on
-	it to prevent automatic expiration - the caller will clean up the
-	additional ref.
-
-	This function is only used if DCACHE_NEED_AUTOMOUNT is set on the
-	dentry.  This is set by __d_instantiate() if S_AUTOMOUNT is set on the
-	inode being added.
-
-  d_manage: called to allow the filesystem to manage the transition from a
-	dentry (optional).  This allows autofs, for example, to hold up clients
-	waiting to explore behind a 'mountpoint' while letting the daemon go
-	past and construct the subtree there.  0 should be returned to let the
-	calling process continue.  -EISDIR can be returned to tell pathwalk to
-	use this directory as an ordinary directory and to ignore anything
-	mounted on it and not to check the automount flag.  Any other error
-	code will abort pathwalk completely.
-
-	If the 'rcu_walk' parameter is true, then the caller is doing a
-	pathwalk in RCU-walk mode.  Sleeping is not permitted in this mode,
-	and the caller can be asked to leave it and call again by returning
-	-ECHILD.  -EISDIR may also be returned to tell pathwalk to
-	ignore d_automount or any mounts.
-
-	This function is only used if DCACHE_MANAGE_TRANSIT is set on the
-	dentry being transited from.
-
-  d_real: overlay/union type filesystems implement this method to return one of
-	the underlying dentries hidden by the overlay.  It is used in two
-	different modes:
-
-	Called from file_dentry() it returns the real dentry matching the inode
-	argument.  The real dentry may be from a lower layer already copied up,
-	but still referenced from the file.  This mode is selected with a
-	non-NULL inode argument.
-
-	With NULL inode the topmost real underlying dentry is returned.
-
-Each dentry has a pointer to its parent dentry, as well as a hash list
-of child dentries. Child dentries are basically like files in a
-directory.
-
-
-Directory Entry Cache API
---------------------------
-
-There are a number of functions defined which permit a filesystem to
-manipulate dentries:
-
-  dget: open a new handle for an existing dentry (this just increments
-	the usage count)
-
-  dput: close a handle for a dentry (decrements the usage count). If
-	the usage count drops to 0, and the dentry is still in its
-	parent's hash, the "d_delete" method is called to check whether
-	it should be cached. If it should not be cached, or if the dentry
-	is not hashed, it is deleted. Otherwise cached dentries are put
-	into an LRU list to be reclaimed on memory shortage.
-
-  d_drop: this unhashes a dentry from its parents hash list. A
-	subsequent call to dput() will deallocate the dentry if its
-	usage count drops to 0
-
-  d_delete: delete a dentry. If there are no other open references to
-	the dentry then the dentry is turned into a negative dentry
-	(the d_iput() method is called). If there are other
-	references, then d_drop() is called instead
-
-  d_add: add a dentry to its parents hash list and then calls
-	d_instantiate()
-
-  d_instantiate: add a dentry to the alias hash list for the inode and
-	updates the "d_inode" member. The "i_count" member in the
-	inode structure should be set/incremented. If the inode
-	pointer is NULL, the dentry is called a "negative
-	dentry". This function is commonly called when an inode is
-	created for an existing negative dentry
-
-  d_lookup: look up a dentry given its parent and path name component
-	It looks up the child of that given name from the dcache
-	hash table. If it is found, the reference count is incremented
-	and the dentry is returned. The caller must use dput()
-	to free the dentry when it finishes using it.
-
-Mount Options
-=============
-
-Parsing options
----------------
-
-On mount and remount the filesystem is passed a string containing a
-comma separated list of mount options.  The options can have either of
-these forms:
-
-  option
-  option=value
-
-The <linux/parser.h> header defines an API that helps parse these
-options.  There are plenty of examples on how to use it in existing
-filesystems.
-
-Showing options
----------------
-
-If a filesystem accepts mount options, it must define show_options()
-to show all the currently active options.  The rules are:
-
-  - options MUST be shown which are not default or their values differ
-    from the default
-
-  - options MAY be shown which are enabled by default or have their
-    default value
-
-Options used only internally between a mount helper and the kernel
-(such as file descriptors), or which only have an effect during the
-mounting (such as ones controlling the creation of a journal) are exempt
-from the above rules.
-
-The underlying reason for the above rules is to make sure, that a
-mount can be accurately replicated (e.g. umounting and mounting again)
-based on the information found in /proc/mounts.
-
-Resources
-=========
-
-(Note some of these resources are not up-to-date with the latest kernel
- version.)
-
-Creating Linux virtual filesystems. 2002
-    <http://lwn.net/Articles/13325/>
-
-The Linux Virtual File-system Layer by Neil Brown. 1999
-    <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html>
-
-A tour of the Linux VFS by Michael K. Johnson. 1996
-    <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html>
-
-A small trail through the Linux kernel by Andries Brouwer. 2001
-    <http://www.win.tue.nl/~aeb/linux/vfs/trail.html>
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt
index 2ce36439c09f..9a6dd289b17b 100644
--- a/Documentation/filesystems/xfs-delayed-logging-design.txt
+++ b/Documentation/filesystems/xfs-delayed-logging-design.txt
@@ -34,7 +34,7 @@ transaction:
 	   D			A+B+C+D		X+n+m+o
 	    <object written to disk>
 	   E			   E		   Y (> X+n+m+o)
-	   F			  E+F		  Yٍ+p
+	   F			  E+F		  Y+p
 
 In other words, each time an object is relogged, the new transaction contains
 the aggregation of all the previous changes currently held only in the log.
diff --git a/Documentation/filesystems/xfs-self-describing-metadata.txt b/Documentation/filesystems/xfs-self-describing-metadata.txt
index 68604e67a495..8db0121d0980 100644
--- a/Documentation/filesystems/xfs-self-describing-metadata.txt
+++ b/Documentation/filesystems/xfs-self-describing-metadata.txt
@@ -222,7 +222,7 @@ static void
 xfs_foo_read_verify(
 	struct xfs_buf	*bp)
 {
-       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_mount *mp = bp->b_mount;
 
         if ((xfs_sb_version_hascrc(&mp->m_sb) &&
              !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
@@ -245,7 +245,7 @@ static bool
 xfs_foo_verify(
 	struct xfs_buf		*bp)
 {
-        struct xfs_mount	*mp = bp->b_target->bt_mount;
+        struct xfs_mount	*mp = bp->b_mount;
         struct xfs_ondisk_hdr	*hdr = bp->b_addr;
 
         if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
@@ -272,7 +272,7 @@ static bool
 xfs_foo_verify(
 	struct xfs_buf		*bp)
 {
-        struct xfs_mount	*mp = bp->b_target->bt_mount;
+        struct xfs_mount	*mp = bp->b_mount;
         struct xfs_ondisk_hdr	*hdr = bp->b_addr;
 
         if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) {
@@ -297,7 +297,7 @@ static void
 xfs_foo_write_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_fspriv;
 
 	if (!xfs_foo_verify(bp)) {
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
deleted file mode 100644
index a5cbb5e0e3db..000000000000
--- a/Documentation/filesystems/xfs.txt
+++ /dev/null
@@ -1,470 +0,0 @@
-
-The SGI XFS Filesystem
-======================
-
-XFS is a high performance journaling filesystem which originated
-on the SGI IRIX platform.  It is completely multi-threaded, can
-support large files and large filesystems, extended attributes,
-variable block sizes, is extent based, and makes extensive use of
-Btrees (directories, extents, free space) to aid both performance
-and scalability.
-
-Refer to the documentation at https://xfs.wiki.kernel.org/
-for further details.  This implementation is on-disk compatible
-with the IRIX version of XFS.
-
-
-Mount Options
-=============
-
-When mounting an XFS filesystem, the following options are accepted.
-For boolean mount options, the names with the (*) suffix is the
-default behaviour.
-
-  allocsize=size
-	Sets the buffered I/O end-of-file preallocation size when
-	doing delayed allocation writeout (default size is 64KiB).
-	Valid values for this option are page size (typically 4KiB)
-	through to 1GiB, inclusive, in power-of-2 increments.
-
-	The default behaviour is for dynamic end-of-file
-	preallocation size, which uses a set of heuristics to
-	optimise the preallocation size based on the current
-	allocation patterns within the file and the access patterns
-	to the file. Specifying a fixed allocsize value turns off
-	the dynamic behaviour.
-
-  attr2
-  noattr2
-	The options enable/disable an "opportunistic" improvement to
-	be made in the way inline extended attributes are stored
-	on-disk.  When the new form is used for the first time when
-	attr2 is selected (either when setting or removing extended
-	attributes) the on-disk superblock feature bit field will be
-	updated to reflect this format being in use.
-
-	The default behaviour is determined by the on-disk feature
-	bit indicating that attr2 behaviour is active. If either
-	mount option it set, then that becomes the new default used
-	by the filesystem.
-
-	CRC enabled filesystems always use the attr2 format, and so
-	will reject the noattr2 mount option if it is set.
-
-  discard
-  nodiscard (*)
-	Enable/disable the issuing of commands to let the block
-	device reclaim space freed by the filesystem.  This is
-	useful for SSD devices, thinly provisioned LUNs and virtual
-	machine images, but may have a performance impact.
-
-	Note: It is currently recommended that you use the fstrim
-	application to discard unused blocks rather than the discard
-	mount option because the performance impact of this option
-	is quite severe.
-
-  grpid/bsdgroups
-  nogrpid/sysvgroups (*)
-	These options define what group ID a newly created file
-	gets.  When grpid is set, it takes the group ID of the
-	directory in which it is created; otherwise it takes the
-	fsgid of the current process, unless the directory has the
-	setgid bit set, in which case it takes the gid from the
-	parent directory, and also gets the setgid bit set if it is
-	a directory itself.
-
-  filestreams
-	Make the data allocator use the filestreams allocation mode
-	across the entire filesystem rather than just on directories
-	configured to use it.
-
-  ikeep
-  noikeep (*)
-	When ikeep is specified, XFS does not delete empty inode
-	clusters and keeps them around on disk.  When noikeep is
-	specified, empty inode clusters are returned to the free
-	space pool.
-
-  inode32
-  inode64 (*)
-	When inode32 is specified, it indicates that XFS limits
-	inode creation to locations which will not result in inode
-	numbers with more than 32 bits of significance.
-
-	When inode64 is specified, it indicates that XFS is allowed
-	to create inodes at any location in the filesystem,
-	including those which will result in inode numbers occupying
-	more than 32 bits of significance. 
-
-	inode32 is provided for backwards compatibility with older
-	systems and applications, since 64 bits inode numbers might
-	cause problems for some applications that cannot handle
-	large inode numbers.  If applications are in use which do
-	not handle inode numbers bigger than 32 bits, the inode32
-	option should be specified.
-
-
-  largeio
-  nolargeio (*)
-	If "nolargeio" is specified, the optimal I/O reported in
-	st_blksize by stat(2) will be as small as possible to allow
-	user applications to avoid inefficient read/modify/write
-	I/O.  This is typically the page size of the machine, as
-	this is the granularity of the page cache.
-
-	If "largeio" specified, a filesystem that was created with a
-	"swidth" specified will return the "swidth" value (in bytes)
-	in st_blksize. If the filesystem does not have a "swidth"
-	specified but does specify an "allocsize" then "allocsize"
-	(in bytes) will be returned instead. Otherwise the behaviour
-	is the same as if "nolargeio" was specified.
-
-  logbufs=value
-	Set the number of in-memory log buffers.  Valid numbers
-	range from 2-8 inclusive.
-
-	The default value is 8 buffers.
-
-	If the memory cost of 8 log buffers is too high on small
-	systems, then it may be reduced at some cost to performance
-	on metadata intensive workloads. The logbsize option below
-	controls the size of each buffer and so is also relevant to
-	this case.
-
-  logbsize=value
-	Set the size of each in-memory log buffer.  The size may be
-	specified in bytes, or in kilobytes with a "k" suffix.
-	Valid sizes for version 1 and version 2 logs are 16384 (16k)
-	and 32768 (32k).  Valid sizes for version 2 logs also
-	include 65536 (64k), 131072 (128k) and 262144 (256k). The
-	logbsize must be an integer multiple of the log
-	stripe unit configured at mkfs time.
-
-	The default value for for version 1 logs is 32768, while the
-	default value for version 2 logs is MAX(32768, log_sunit).
-
-  logdev=device and rtdev=device
-	Use an external log (metadata journal) and/or real-time device.
-	An XFS filesystem has up to three parts: a data section, a log
-	section, and a real-time section.  The real-time section is
-	optional, and the log section can be separate from the data
-	section or contained within it.
-
-  noalign
-	Data allocations will not be aligned at stripe unit
-	boundaries. This is only relevant to filesystems created
-	with non-zero data alignment parameters (sunit, swidth) by
-	mkfs.
-
-  norecovery
-	The filesystem will be mounted without running log recovery.
-	If the filesystem was not cleanly unmounted, it is likely to
-	be inconsistent when mounted in "norecovery" mode.
-	Some files or directories may not be accessible because of this.
-	Filesystems mounted "norecovery" must be mounted read-only or
-	the mount will fail.
-
-  nouuid
-	Don't check for double mounted file systems using the file
-	system uuid.  This is useful to mount LVM snapshot volumes,
-	and often used in combination with "norecovery" for mounting
-	read-only snapshots.
-
-  noquota
-	Forcibly turns off all quota accounting and enforcement
-	within the filesystem.
-
-  uquota/usrquota/uqnoenforce/quota
-	User disk quota accounting enabled, and limits (optionally)
-	enforced.  Refer to xfs_quota(8) for further details.
-
-  gquota/grpquota/gqnoenforce
-	Group disk quota accounting enabled and limits (optionally)
-	enforced.  Refer to xfs_quota(8) for further details.
-
-  pquota/prjquota/pqnoenforce
-	Project disk quota accounting enabled and limits (optionally)
-	enforced.  Refer to xfs_quota(8) for further details.
-
-  sunit=value and swidth=value
-	Used to specify the stripe unit and width for a RAID device
-	or a stripe volume.  "value" must be specified in 512-byte
-	block units. These options are only relevant to filesystems
-	that were created with non-zero data alignment parameters.
-
-	The sunit and swidth parameters specified must be compatible
-	with the existing filesystem alignment characteristics.  In
-	general, that means the only valid changes to sunit are
-	increasing it by a power-of-2 multiple. Valid swidth values
-	are any integer multiple of a valid sunit value.
-
-	Typically the only time these mount options are necessary if
-	after an underlying RAID device has had it's geometry
-	modified, such as adding a new disk to a RAID5 lun and
-	reshaping it.
-
-  swalloc
-	Data allocations will be rounded up to stripe width boundaries
-	when the current end of file is being extended and the file
-	size is larger than the stripe width size.
-
-  wsync
-	When specified, all filesystem namespace operations are
-	executed synchronously. This ensures that when the namespace
-	operation (create, unlink, etc) completes, the change to the
-	namespace is on stable storage. This is useful in HA setups
-	where failover must not result in clients seeing
-	inconsistent namespace presentation during or after a
-	failover event.
-
-
-Deprecated Mount Options
-========================
-
-  Name				Removal Schedule
-  ----				----------------
-
-
-Removed Mount Options
-=====================
-
-  Name				Removed
-  ----				-------
-  delaylog/nodelaylog		v4.0
-  ihashsize			v4.0
-  irixsgid			v4.0
-  osyncisdsync/osyncisosync	v4.0
-  barrier			v4.19
-  nobarrier			v4.19
-
-
-sysctls
-=======
-
-The following sysctls are available for the XFS filesystem:
-
-  fs.xfs.stats_clear		(Min: 0  Default: 0  Max: 1)
-	Setting this to "1" clears accumulated XFS statistics
-	in /proc/fs/xfs/stat.  It then immediately resets to "0".
-
-  fs.xfs.xfssyncd_centisecs	(Min: 100  Default: 3000  Max: 720000)
-	The interval at which the filesystem flushes metadata
-	out to disk and runs internal cache cleanup routines.
-
-  fs.xfs.filestream_centisecs	(Min: 1  Default: 3000  Max: 360000)
-	The interval at which the filesystem ages filestreams cache
-	references and returns timed-out AGs back to the free stream
-	pool.
-
-  fs.xfs.speculative_prealloc_lifetime
-		(Units: seconds   Min: 1  Default: 300  Max: 86400)
-	The interval at which the background scanning for inodes
-	with unused speculative preallocation runs. The scan
-	removes unused preallocation from clean inodes and releases
-	the unused space back to the free pool.
-
-  fs.xfs.error_level		(Min: 0  Default: 3  Max: 11)
-	A volume knob for error reporting when internal errors occur.
-	This will generate detailed messages & backtraces for filesystem
-	shutdowns, for example.  Current threshold values are:
-
-		XFS_ERRLEVEL_OFF:       0
-		XFS_ERRLEVEL_LOW:       1
-		XFS_ERRLEVEL_HIGH:      5
-
-  fs.xfs.panic_mask		(Min: 0  Default: 0  Max: 256)
-	Causes certain error conditions to call BUG(). Value is a bitmask;
-	OR together the tags which represent errors which should cause panics:
-
-		XFS_NO_PTAG                     0
-		XFS_PTAG_IFLUSH                 0x00000001
-		XFS_PTAG_LOGRES                 0x00000002
-		XFS_PTAG_AILDELETE              0x00000004
-		XFS_PTAG_ERROR_REPORT           0x00000008
-		XFS_PTAG_SHUTDOWN_CORRUPT       0x00000010
-		XFS_PTAG_SHUTDOWN_IOERROR       0x00000020
-		XFS_PTAG_SHUTDOWN_LOGERROR      0x00000040
-		XFS_PTAG_FSBLOCK_ZERO           0x00000080
-		XFS_PTAG_VERIFIER_ERROR         0x00000100
-
-	This option is intended for debugging only.
-
-  fs.xfs.irix_symlink_mode	(Min: 0  Default: 0  Max: 1)
-	Controls whether symlinks are created with mode 0777 (default)
-	or whether their mode is affected by the umask (irix mode).
-
-  fs.xfs.irix_sgid_inherit	(Min: 0  Default: 0  Max: 1)
-	Controls files created in SGID directories.
-	If the group ID of the new file does not match the effective group
-	ID or one of the supplementary group IDs of the parent dir, the
-	ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl
-	is set.
-
-  fs.xfs.inherit_sync		(Min: 0  Default: 1  Max: 1)
-	Setting this to "1" will cause the "sync" flag set
-	by the xfs_io(8) chattr command on a directory to be
-	inherited by files in that directory.
-
-  fs.xfs.inherit_nodump		(Min: 0  Default: 1  Max: 1)
-	Setting this to "1" will cause the "nodump" flag set
-	by the xfs_io(8) chattr command on a directory to be
-	inherited by files in that directory.
-
-  fs.xfs.inherit_noatime	(Min: 0  Default: 1  Max: 1)
-	Setting this to "1" will cause the "noatime" flag set
-	by the xfs_io(8) chattr command on a directory to be
-	inherited by files in that directory.
-
-  fs.xfs.inherit_nosymlinks	(Min: 0  Default: 1  Max: 1)
-	Setting this to "1" will cause the "nosymlinks" flag set
-	by the xfs_io(8) chattr command on a directory to be
-	inherited by files in that directory.
-
-  fs.xfs.inherit_nodefrag	(Min: 0  Default: 1  Max: 1)
-	Setting this to "1" will cause the "nodefrag" flag set
-	by the xfs_io(8) chattr command on a directory to be
-	inherited by files in that directory.
-
-  fs.xfs.rotorstep		(Min: 1  Default: 1  Max: 256)
-	In "inode32" allocation mode, this option determines how many
-	files the allocator attempts to allocate in the same allocation
-	group before moving to the next allocation group.  The intent
-	is to control the rate at which the allocator moves between
-	allocation groups when allocating extents for new files.
-
-Deprecated Sysctls
-==================
-
-None at present.
-
-
-Removed Sysctls
-===============
-
-  Name				Removed
-  ----				-------
-  fs.xfs.xfsbufd_centisec	v4.0
-  fs.xfs.age_buffer_centisecs	v4.0
-
-
-Error handling
-==============
-
-XFS can act differently according to the type of error found during its
-operation. The implementation introduces the following concepts to the error
-handler:
-
- -failure speed:
-	Defines how fast XFS should propagate an error upwards when a specific
-	error is found during the filesystem operation. It can propagate
-	immediately, after a defined number of retries, after a set time period,
-	or simply retry forever.
-
- -error classes:
-	Specifies the subsystem the error configuration will apply to, such as
-	metadata IO or memory allocation. Different subsystems will have
-	different error handlers for which behaviour can be configured.
-
- -error handlers:
-	Defines the behavior for a specific error.
-
-The filesystem behavior during an error can be set via sysfs files. Each
-error handler works independently - the first condition met by an error handler
-for a specific class will cause the error to be propagated rather than reset and
-retried.
-
-The action taken by the filesystem when the error is propagated is context
-dependent - it may cause a shut down in the case of an unrecoverable error,
-it may be reported back to userspace, or it may even be ignored because
-there's nothing useful we can with the error or anyone we can report it to (e.g.
-during unmount).
-
-The configuration files are organized into the following hierarchy for each
-mounted filesystem:
-
-  /sys/fs/xfs/<dev>/error/<class>/<error>/
-
-Where:
-  <dev>
-	The short device name of the mounted filesystem. This is the same device
-	name that shows up in XFS kernel error messages as "XFS(<dev>): ..."
-
-  <class>
-	The subsystem the error configuration belongs to. As of 4.9, the defined
-	classes are:
-
-		- "metadata": applies metadata buffer write IO
-
-  <error>
-	The individual error handler configurations.
-
-
-Each filesystem has "global" error configuration options defined in their top
-level directory:
-
-  /sys/fs/xfs/<dev>/error/
-
-  fail_at_unmount		(Min:  0  Default:  1  Max: 1)
-	Defines the filesystem error behavior at unmount time.
-
-	If set to a value of 1, XFS will override all other error configurations
-	during unmount and replace them with "immediate fail" characteristics.
-	i.e. no retries, no retry timeout. This will always allow unmount to
-	succeed when there are persistent errors present.
-
-	If set to 0, the configured retry behaviour will continue until all
-	retries and/or timeouts have been exhausted. This will delay unmount
-	completion when there are persistent errors, and it may prevent the
-	filesystem from ever unmounting fully in the case of "retry forever"
-	handler configurations.
-
-	Note: there is no guarantee that fail_at_unmount can be set while an
-	unmount is in progress. It is possible that the sysfs entries are
-	removed by the unmounting filesystem before a "retry forever" error
-	handler configuration causes unmount to hang, and hence the filesystem
-	must be configured appropriately before unmount begins to prevent
-	unmount hangs.
-
-Each filesystem has specific error class handlers that define the error
-propagation behaviour for specific errors. There is also a "default" error
-handler defined, which defines the behaviour for all errors that don't have
-specific handlers defined. Where multiple retry constraints are configuredi for
-a single error, the first retry configuration that expires will cause the error
-to be propagated. The handler configurations are found in the directory:
-
-  /sys/fs/xfs/<dev>/error/<class>/<error>/
-
-  max_retries			(Min: -1  Default: Varies  Max: INTMAX)
-	Defines the allowed number of retries of a specific error before
-	the filesystem will propagate the error. The retry count for a given
-	error context (e.g. a specific metadata buffer) is reset every time
-	there is a successful completion of the operation.
-
-	Setting the value to "-1" will cause XFS to retry forever for this
-	specific error.
-
-	Setting the value to "0" will cause XFS to fail immediately when the
-	specific error is reported.
-
-	Setting the value to "N" (where 0 < N < Max) will make XFS retry the
-	operation "N" times before propagating the error.
-
-  retry_timeout_seconds		(Min:  -1  Default:  Varies  Max: 1 day)
-	Define the amount of time (in seconds) that the filesystem is
-	allowed to retry its operations when the specific error is
-	found.
-
-	Setting the value to "-1" will allow XFS to retry forever for this
-	specific error.
-
-	Setting the value to "0" will cause XFS to fail immediately when the
-	specific error is reported.
-
-	Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the
-	operation for up to "N" seconds before propagating the error.
-
-Note: The default behaviour for a specific error handler is dependent on both
-the class and error context. For example, the default values for
-"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults
-to "fail immediately" behaviour. This is done because ENODEV is a fatal,
-unrecoverable error no matter how many times the metadata IO is retried.
diff --git a/Documentation/firmware-guide/acpi/enumeration.rst b/Documentation/firmware-guide/acpi/enumeration.rst
index 6b32b7be8c85..0a72b6321f5f 100644
--- a/Documentation/firmware-guide/acpi/enumeration.rst
+++ b/Documentation/firmware-guide/acpi/enumeration.rst
@@ -316,7 +316,7 @@ specifies the path to the controller. In order to use these GPIOs in Linux
 we need to translate them to the corresponding Linux GPIO descriptors.
 
 There is a standard GPIO API for that and is documented in
-Documentation/gpio/.
+Documentation/admin-guide/gpio/.
 
 In the above example we can get the corresponding two GPIO descriptors with
 a code like this::
@@ -339,7 +339,7 @@ a code like this::
 There are also devm_* versions of these functions which release the
 descriptors once the device is released.
 
-See Documentation/acpi/gpio-properties.txt for more information about the
+See Documentation/firmware-guide/acpi/gpio-properties.rst for more information about the
 _DSD binding related to GPIOs.
 
 MFD devices
@@ -423,7 +423,7 @@ will be enumerated to depends on the device ID returned by _HID.
 
 For example, the following ACPI sample might be used to enumerate an lm75-type
 I2C temperature sensor and match it to the driver using the Device Tree
-namespace link:
+namespace link::
 
 	Device (TMP0)
 	{
diff --git a/Documentation/firmware-guide/acpi/extcon-intel-int3496.rst b/Documentation/firmware-guide/acpi/extcon-intel-int3496.rst
new file mode 100644
index 000000000000..5137ca834b54
--- /dev/null
+++ b/Documentation/firmware-guide/acpi/extcon-intel-int3496.rst
@@ -0,0 +1,33 @@
+=====================================================
+Intel INT3496 ACPI device extcon driver documentation
+=====================================================
+
+The Intel INT3496 ACPI device extcon driver is a driver for ACPI
+devices with an acpi-id of INT3496, such as found for example on
+Intel Baytrail and Cherrytrail tablets.
+
+This ACPI device describes how the OS can read the id-pin of the devices'
+USB-otg port, as well as how it optionally can enable Vbus output on the
+otg port and how it can optionally control the muxing of the data pins
+between an USB host and an USB peripheral controller.
+
+The ACPI devices exposes this functionality by returning an array with up
+to 3 gpio descriptors from its ACPI _CRS (Current Resource Settings) call:
+
+=======  =====================================================================
+Index 0  The input gpio for the id-pin, this is always present and valid
+Index 1  The output gpio for enabling Vbus output from the device to the otg
+         port, write 1 to enable the Vbus output (this gpio descriptor may
+         be absent or invalid)
+Index 2  The output gpio for muxing of the data pins between the USB host and
+         the USB peripheral controller, write 1 to mux to the peripheral
+         controller
+=======  =====================================================================
+
+There is a mapping between indices and GPIO connection IDs as follows
+
+	======= =======
+	id	index 0
+	vbus	index 1
+	mux	index 2
+	======= =======
diff --git a/Documentation/firmware-guide/acpi/index.rst b/Documentation/firmware-guide/acpi/index.rst
index ae609eec4679..90c90d42d9ad 100644
--- a/Documentation/firmware-guide/acpi/index.rst
+++ b/Documentation/firmware-guide/acpi/index.rst
@@ -24,3 +24,4 @@ ACPI Support
    acpi-lid
    lpit
    video_extension
+   extcon-intel-int3496
diff --git a/Documentation/firmware-guide/acpi/method-tracing.rst b/Documentation/firmware-guide/acpi/method-tracing.rst
index d0b077b73f5f..0aa7e2c5d32a 100644
--- a/Documentation/firmware-guide/acpi/method-tracing.rst
+++ b/Documentation/firmware-guide/acpi/method-tracing.rst
@@ -68,7 +68,7 @@ c. Filter out the debug layer/level matched logs when the specified
 
 Where:
    0xXXXXXXXX/0xYYYYYYYY
-     Refer to Documentation/acpi/debug.txt for possible debug layer/level
+     Refer to Documentation/firmware-guide/acpi/debug.rst for possible debug layer/level
      masking values.
    \PPPP.AAAA.TTTT.HHHH
      Full path of a control method that can be found in the ACPI namespace.
diff --git a/Documentation/fmc/API.txt b/Documentation/fmc/API.txt
deleted file mode 100644
index 06b06b92c794..000000000000
--- a/Documentation/fmc/API.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-Functions Exported by fmc.ko
-****************************
-
-The FMC core exports the usual 4 functions that are needed for a bus to
-work, and a few more:
-
-        int fmc_driver_register(struct fmc_driver *drv);
-        void fmc_driver_unregister(struct fmc_driver *drv);
-        int fmc_device_register(struct fmc_device *fmc);
-        void fmc_device_unregister(struct fmc_device *fmc);
-
-        int fmc_device_register_n(struct fmc_device **fmc, int n);
-        void fmc_device_unregister_n(struct fmc_device **fmc, int n);
-
-        uint32_t fmc_readl(struct fmc_device *fmc, int offset);
-        void fmc_writel(struct fmc_device *fmc, uint32_t val, int off);
-        void *fmc_get_drvdata(struct fmc_device *fmc);
-        void fmc_set_drvdata(struct fmc_device *fmc, void *data);
-
-        int fmc_reprogram(struct fmc_device *f, struct fmc_driver *d, char *gw,
-                          int sdb_entry);
-
-The data structure that describe a device is detailed in *note FMC
-Device::, the one that describes a driver is detailed in *note FMC
-Driver::.  Please note that structures of type fmc_device must be
-allocated by the caller, but must not be released after unregistering.
-The fmc-bus itself takes care of releasing the structure when their use
-count reaches zero - actually, the device model does that in lieu of us.
-
-The functions to register and unregister n devices are meant to be used
-by carriers that host more than one mezzanine. The devices must all be
-registered at the same time because if the FPGA is reprogrammed, all
-devices in the array are affected. Usually, the driver matching the
-first device will reprogram the FPGA, so other devices must know they
-are already driven by a reprogrammed FPGA.
-
-If a carrier hosts slots that are driven by different FPGA devices, it
-should register as a group only mezzanines that are driven by the same
-FPGA, for the reason outlined above.
-
-Finally, the fmc_reprogram function calls the reprogram method (see
-*note The API Offered by Carriers:: and also scans the memory area for
-an SDB tree. You can pass -1 as sdb_entry to disable such scan.
-Otherwise, the function fails if no tree is found at the specified
-entry point.  The function is meant to factorize common code, and by
-the time you read this it is already used by the spec-sw and fine-delay
-modules.
diff --git a/Documentation/fmc/FMC-and-SDB.txt b/Documentation/fmc/FMC-and-SDB.txt
deleted file mode 100644
index fa14e0b24521..000000000000
--- a/Documentation/fmc/FMC-and-SDB.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-
-FMC (FPGA Mezzanine Card) is the standard we use for our I/O devices,
-in the context of White Rabbit and related hardware.
-
-In our I/O environments we need to write drivers for each mezzanine
-card, and such drivers must work regardless of the carrier being used.
-To achieve this, we abstract the FMC interface.
-
-We have a carrier for PCI-E called SPEC and one for VME called SVEC,
-but more are planned.  Also, we support stand-alone devices (usually
-plugged on a SPEC card), controlled through Etherbone, developed by GSI.
-
-Code and documentation for the FMC bus was born as part of the spec-sw
-project, but now it lives in its own project. Other projects, i.e.
-software support for the various carriers, should include this as a
-submodule.
-
-The most up to date version of code and documentation is always
-available from the repository you can clone from:
-
-        git://ohwr.org/fmc-projects/fmc-bus.git (read-only)
-        git@ohwr.org:fmc-projects/fmc-bus.git (read-write for developers)
-
-Selected versions of the documentation, as well as complete tar
-archives for selected revisions are placed to the Files section of the
-project: `http://www.ohwr.org/projects/fmc-bus/files'
-
-
-What is FMC
-***********
-
-FMC, as said, stands for "FPGA Mezzanine Card". It is a standard
-developed by the VME consortium called VITA (VMEbus International Trade
-Association and ratified by ANSI, the American National Standard
-Institute.  The official documentation is called "ANSI-VITA 57.1".
-
-The FMC card is an almost square PCB, around 70x75 millimeters, that is
-called mezzanine in this document.  It usually lives plugged into
-another PCB for power supply and control; such bigger circuit board is
-called carrier from now on, and a single carrier may host more than one
-mezzanine.
-
-In the typical application the mezzanine is mostly analog while the
-carrier is mostly digital, and hosts an FPGA that must be configured to
-match the specific mezzanine and the desired application. Thus, you may
-need to load different FPGA images to drive different instances of the
-same mezzanine.
-
-FMC, as such, is not a bus in the usual meaning of the term, because
-most carriers have only one connector, and carriers with several
-connectors have completely separate electrical connections to them.
-This package, however, implements a bus as a software abstraction.
-
-
-What is SDB
-***********
-
-SDB (Self Describing Bus) is a set of data structures that we use for
-enumerating the internal structure of an FPGA image. We also use it as
-a filesystem inside the FMC EEPROM.
-
-SDB is not mandatory for use of this FMC kernel bus, but if you have SDB
-this package can make good use of it.  SDB itself is developed in the
-fpga-config-space OHWR project. The link to the repository is
-`git://ohwr.org/hdl-core-lib/fpga-config-space.git' and what is used in
-this project lives in the sdbfs subdirectory in there.
-
-SDB support for FMC is described in *note FMC Identification:: and
-*note SDB Support::
-
-
-SDB Support
-***********
-
-The fmc.ko bus driver exports a few functions to help drivers taking
-advantage of the SDB information that may be present in your own FPGA
-memory image.
-
-The module exports the following functions, in the special header
-<linux/fmc-sdb.h>. The linux/ prefix in the name is there because we
-plan to submit it upstream in the future, and don't want to force
-changes on our drivers if that happens.
-
-         int fmc_scan_sdb_tree(struct fmc_device *fmc, unsigned long address);
-         void fmc_show_sdb_tree(struct fmc_device *fmc);
-         signed long fmc_find_sdb_device(struct sdb_array *tree, uint64_t vendor,
-                                         uint32_t device, unsigned long *sz);
-         int fmc_free_sdb_tree(struct fmc_device *fmc);
diff --git a/Documentation/fmc/carrier.txt b/Documentation/fmc/carrier.txt
deleted file mode 100644
index 5e4f1dd3e98b..000000000000
--- a/Documentation/fmc/carrier.txt
+++ /dev/null
@@ -1,311 +0,0 @@
-FMC Device
-**********
-
-Within the Linux bus framework, the FMC device is created and
-registered by the carrier driver. For example, the PCI driver for the
-SPEC card fills a data structure for each SPEC that it drives, and
-registers an associated FMC device for each card.  The SVEC driver can
-do exactly the same for the VME carrier (actually, it should do it
-twice, because the SVEC carries two FMC mezzanines).  Similarly, an
-Etherbone driver will be able to register its own FMC devices, offering
-communication primitives through frame exchange.
-
-The contents of the EEPROM within the FMC are used for identification
-purposes, i.e. for matching the device with its own driver. For this
-reason the device structure includes a complete copy of the EEPROM
-(actually, the carrier driver may choose whether or not to return it -
-for example we most likely won't have the whole EEPROM available for
-Etherbone devices.
-
-The following listing shows the current structure defining a device.
-Please note that all the machinery is in place but some details may
-still change in the future.  For this reason, there is a version field
-at the beginning of the structure.  As usual, the minor number will
-change for compatible changes (like a new flag) and the major number
-will increase when an incompatible change happens (for example, a
-change in layout of some fmc data structures).  Device writers should
-just set it to the value FMC_VERSION, and be ready to get back -EINVAL
-at registration time.
-
-     struct fmc_device {
-             unsigned long version;
-             unsigned long flags;
-             struct module *owner;           /* char device must pin it */
-             struct fmc_fru_id id;           /* for EEPROM-based match */
-             struct fmc_operations *op;      /* carrier-provided */
-             int irq;                        /* according to host bus. 0 == none */
-             int eeprom_len;                 /* Usually 8kB, may be less */
-             int eeprom_addr;                /* 0x50, 0x52 etc */
-             uint8_t *eeprom;                /* Full contents or leading part */
-             char *carrier_name;             /* "SPEC" or similar, for special use */
-             void *carrier_data;             /* "struct spec *" or equivalent */
-             __iomem void *fpga_base;        /* May be NULL (Etherbone) */
-             __iomem void *slot_base;        /* Set by the driver */
-             struct fmc_device **devarray;   /* Allocated by the bus */
-             int slot_id;                    /* Index in the slot array */
-             int nr_slots;                   /* Number of slots in this carrier */
-             unsigned long memlen;           /* Used for the char device */
-             struct device dev;              /* For Linux use */
-             struct device *hwdev;           /* The underlying hardware device */
-             unsigned long sdbfs_entry;
-             struct sdb_array *sdb;
-             uint32_t device_id;             /* Filled by the device */
-             char *mezzanine_name;           /* Defaults to ``fmc'' */
-             void *mezzanine_data;
-     };
-
-The meaning of most fields is summarized in the code comment above.
-
-The following fields must be filled by the carrier driver before
-registration:
-
-   * version: must be set to FMC_VERSION.
-
-   * owner: set to MODULE_OWNER.
-
-   * op: the operations to act on the device.
-
-   * irq: number for the mezzanine; may be zero.
-
-   * eeprom_len: length of the following array.
-
-   * eeprom_addr: 0x50 for first mezzanine and so on.
-
-   * eeprom: the full content of the I2C EEPROM.
-
-   * carrier_name.
-
-   * carrier_data: a unique pointer for the carrier.
-
-   * fpga_base: the I/O memory address (may be NULL).
-
-   * slot_id: the index of this slot (starting from zero).
-
-   * memlen: if fpga_base is valid, the length of I/O memory.
-
-   * hwdev: to be used in some dev_err() calls.
-
-   * device_id: a slot-specific unique integer number.
-
-
-Please note that the carrier should read its own EEPROM memory before
-registering the device, as well as fill all other fields listed above.
-
-The following fields should not be assigned, because they are filled
-later by either the bus or the device driver:
-
-   * flags.
-
-   * fru_id: filled by the bus, parsing the eeprom.
-
-   * slot_base: filled and used by the driver, if useful to it.
-
-   * devarray: an array og all mezzanines driven by a singe FPGA.
-
-   * nr_slots: set by the core at registration time.
-
-   * dev: used by Linux.
-
-   * sdb: FPGA contents, scanned according to driver's directions.
-
-   * sdbfs_entry: SDB entry point in EEPROM: autodetected.
-
-   * mezzanine_data: available for the driver.
-
-   * mezzanine_name: filled by fmc-bus during identification.
-
-
-Note: mezzanine_data may be redundant, because Linux offers the drvdata
-approach, so the field may be removed in later versions of this bus
-implementation.
-
-As I write this, she SPEC carrier is already completely functional in
-the fmc-bus environment, and is a good reference to look at.
-
-
-The API Offered by Carriers
-===========================
-
-The carrier provides a number of methods by means of the
-`fmc_operations' structure, which currently is defined like this
-(again, it is a moving target, please refer to the header rather than
-this document):
-
-     struct fmc_operations {
-             uint32_t (*readl)(struct fmc_device *fmc, int offset);
-             void (*writel)(struct fmc_device *fmc, uint32_t value, int offset);
-             int (*reprogram)(struct fmc_device *f, struct fmc_driver *d, char *gw);
-             int (*validate)(struct fmc_device *fmc, struct fmc_driver *drv);
-             int (*irq_request)(struct fmc_device *fmc, irq_handler_t h,
-                                char *name, int flags);
-             void (*irq_ack)(struct fmc_device *fmc);
-             int (*irq_free)(struct fmc_device *fmc);
-             int (*gpio_config)(struct fmc_device *fmc, struct fmc_gpio *gpio,
-                                int ngpio);
-             int (*read_ee)(struct fmc_device *fmc, int pos, void *d, int l);
-             int (*write_ee)(struct fmc_device *fmc, int pos, const void *d, int l);
-     };
-
-The individual methods perform the following tasks:
-
-`readl'
-`writel'
-     These functions access FPGA registers by whatever means the
-     carrier offers. They are not expected to fail, and most of the time
-     they will just make a memory access to the host bus. If the
-     carrier provides a fpga_base pointer, the driver may use direct
-     access through that pointer. For this reason the header offers the
-     inline functions fmc_readl and fmc_writel that access fpga_base if
-     the respective method is NULL. A driver that wants to be portable
-     and efficient should use fmc_readl and fmc_writel.  For Etherbone,
-     or other non-local carriers, error-management is still to be
-     defined.
-
-`validate'
-     Module parameters are used to manage different applications for
-     two or more boards of the same kind. Validation is based on the
-     busid module parameter, if provided, and returns the matching
-     index in the associated array. See *note Module Parameters:: in in
-     doubt. If no match is found, `-ENOENT' is returned; if the user
-     didn't pass `busid=', all devices will pass validation.  The value
-     returned by the validate method can be used as index into other
-     parameters (for example, some drivers use the `lm32=' parameter in
-     this way). Such "generic parameters" are documented in *note
-     Module Parameters::, below. The validate method is used by
-     `fmc-trivial.ko', described in *note fmc-trivial::.
-
-`reprogram'
-     The carrier enumerates FMC devices by loading a standard (or
-     golden) FPGA binary that allows EEPROM access. Each driver, then,
-     will need to reprogram the FPGA by calling this function.  If the
-     name argument is NULL, the carrier should reprogram the golden
-     binary. If the gateware name has been overridden through module
-     parameters (in a carrier-specific way) the file loaded will match
-     the parameters. Per-device gateware names can be specified using
-     the `gateware=' parameter, see *note Module Parameters::.  Note:
-     Clients should call rhe new helper, fmc_reprogram, which both
-     calls this method and parse the SDB tree of the FPGA.
-
-`irq_request'
-`irq_ack'
-`irq_free'
-     Interrupt management is carrier-specific, so it is abstracted as
-     operations. The interrupt number is listed in the device
-     structure, and for the mezzanine driver the number is only
-     informative.  The handler will receive the fmc pointer as dev_id;
-     the flags argument is passed to the Linux request_irq function,
-     but fmc-specific flags may be added in the future. You'll most
-     likely want to pass the `IRQF_SHARED' flag.
-
-`gpio_config'
-     The method allows to configure a GPIO pin in the carrier, and read
-     its current value if it is configured as input. See *note The GPIO
-     Abstraction:: for details.
-
-`read_ee'
-`write_ee'
-     Read or write the EEPROM. The functions are expected to be only
-     called before reprogramming and the carrier should refuse them
-     with `ENODEV' after reprogramming.  The offset is expected to be
-     within 8kB (the current size), but addresses up to 1MB are
-     reserved to fit bigger I2C devices in the future. Carriers may
-     offer access to other internal flash memories using these same
-     methods: for example the SPEC driver may define that its carrier
-     I2C memory is seen at offset 1M and the internal SPI flash is seen
-     at offset 16M.  This multiplexing of several flash memories in the
-     same address space is carrier-specific and should only be used
-     by a driver that has verified the `carrier_name' field.
-
-
-
-The GPIO Abstraction
-====================
-
-Support for GPIO pins in the fmc-bus environment is not very
-straightforward and deserves special discussion.
-
-While the general idea of a carrier-independent driver seems to fly,
-configuration of specific signals within the carrier needs at least
-some knowledge of the carrier itself.  For this reason, the specific
-driver can request to configure carrier-specific GPIO pins, numbered
-from 0 to at most 4095.  Configuration is performed by passing a
-pointer to an array of struct fmc_gpio items, as well as the length of
-the array. This is the data structure:
-
-        struct fmc_gpio {
-                char *carrier_name;
-                int gpio;
-                int _gpio;      /* internal use by the carrier */
-                int mode;       /* GPIOF_DIR_OUT etc, from <linux/gpio.h> */
-                int irqmode;    /* IRQF_TRIGGER_LOW and so on */
-        };
-
-By specifying a carrier_name for each pin, the driver may access
-different pins in different carriers.  The gpio_config method is
-expected to return the number of pins successfully configured, ignoring
-requests for other carriers. However, if no pin is configured (because
-no structure at all refers to the current carrier_name), the operation
-returns an error so the caller will know that it is running under a
-yet-unsupported carrier.
-
-So, for example, a driver that has been developed and tested on both
-the SPEC and the SVEC may request configuration of two different GPIO
-pins, and expect one such configuration to succeed - if none succeeds
-it most likely means that the current carrier is a still-unknown one.
-
-If, however, your GPIO pin has a specific known role, you can pass a
-special number in the gpio field, using one of the following macros:
-
-        #define FMC_GPIO_RAW(x)         (x)             /* 4096 of them */
-        #define FMC_GPIO_IRQ(x)         ((x) + 0x1000)  /*  256 of them */
-        #define FMC_GPIO_LED(x)         ((x) + 0x1100)  /*  256 of them */
-        #define FMC_GPIO_KEY(x)         ((x) + 0x1200)  /*  256 of them */
-        #define FMC_GPIO_TP(x)          ((x) + 0x1300)  /*  256 of them */
-        #define FMC_GPIO_USER(x)        ((x) + 0x1400)  /*  256 of them */
-
-Use of virtual GPIO numbers (anything but FMC_GPIO_RAW) is allowed
-provided the carrier_name field in the data structure is left
-unspecified (NULL). Each carrier is responsible for providing a mapping
-between virtual and physical GPIO numbers. The carrier may then use the
-_gpio field to cache the result of this mapping.
-
-All carriers must map their I/O lines to the sets above starting from
-zero.  The SPEC, for example, maps interrupt pins 0 and 1, and test
-points 0 through 3 (even if the test points on the PCB are called
-5,6,7,8).
-
-If, for example, a driver requires a free LED and a test point (for a
-scope probe to be plugged at some point during development) it may ask
-for FMC_GPIO_LED(0) and FMC_GPIO_TP(0). Each carrier will provide
-suitable GPIO pins.  Clearly, the person running the drivers will know
-the order used by the specific carrier driver in assigning leds and
-testpoints, so to make a carrier-dependent use of the diagnostic tools.
-
-In theory, some form of autodetection should be possible: a driver like
-the wr-nic (which uses IRQ(1) on the SPEC card) should configure
-IRQ(0), make a test with software-generated interrupts and configure
-IRQ(1) if the test fails. This probing step should be used because even
-if the wr-nic gateware is known to use IRQ1 on the SPEC, the driver
-should be carrier-independent and thus use IRQ(0) as a first bet -
-actually, the knowledge that IRQ0 may fail is carrier-dependent
-information, but using it doesn't make the driver unsuitable for other
-carriers.
-
-The return value of gpio_config is defined as follows:
-
-   * If no pin in the array can be used by the carrier, `-ENODEV'.
-
-   * If at least one virtual GPIO number cannot be mapped, `-ENOENT'.
-
-   * On success, 0 or positive. The value returned is the number of
-     high input bits (if no input is configured, the value for success
-     is 0).
-
-While I admit the procedure is not completely straightforward, it
-allows configuration, input and output with a single carrier operation.
-Given the typical use case of FMC devices, GPIO operations are not
-expected to ever by in hot paths, and GPIO access so fare has only been
-used to configure the interrupt pin, mode and polarity. Especially
-reading inputs is not expected to be common. If your device has GPIO
-capabilities in the hot path, you should consider using the kernel's
-GPIO mechanisms.
diff --git a/Documentation/fmc/fmc-chardev.txt b/Documentation/fmc/fmc-chardev.txt
deleted file mode 100644
index d9ccb278e597..000000000000
--- a/Documentation/fmc/fmc-chardev.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-fmc-chardev
-===========
-
-This is a simple generic driver, that allows user access by means of a
-character device (actually, one for each mezzanine it takes hold of).
-
-The char device is created as a misc device. Its name in /dev (as
-created by udev) is the same name as the underlying FMC device. Thus,
-the name can be a silly fmc-0000 look-alike if the device has no
-identifiers nor bus_id, a more specific fmc-0400 if the device has a
-bus-specific address but no associated name, or something like
-fdelay-0400 if the FMC core can rely on both a mezzanine name and a bus
-address.
-
-Currently the driver only supports read and write: you can lseek to the
-desired address and read or write a register.
-
-The driver assumes all registers are 32-bit in size, and only accepts a
-single read or write per system call. However, as a result of Unix read
-and write semantics, users can simply fread or fwrite bigger areas in
-order to dump or store bigger memory areas.
-
-There is currently no support for mmap, user-space interrupt management
-and DMA buffers. They may be added in later versions, if the need
-arises.
-
-The example below shows raw access to a SPEC card programmed with its
-golden FPGA file, that features an SDB structure at offset 256 - i.e.
-64 words.  The mezzanine's EEPROM in this case is not programmed, so the
-default name is fmc-<bus><devfn>, and there are two cards in the system:
-
-  spusa.root# insmod fmc-chardev.ko
-  [ 1073.339332] spec 0000:02:00.0: Driver has no ID: matches all
-  [ 1073.345051] spec 0000:02:00.0: Created misc device "fmc-0200"
-  [ 1073.350821] spec 0000:04:00.0: Driver has no ID: matches all
-  [ 1073.356525] spec 0000:04:00.0: Created misc device "fmc-0400"
-  spusa.root# ls -l /dev/fmc*
-  crw------- 1 root root 10, 58 Nov 20 19:23 /dev/fmc-0200
-  crw------- 1 root root 10, 57 Nov 20 19:23 /dev/fmc-0400
-  spusa.root# dd bs=4 skip=64 count=1 if=/dev/fmc-0200 2> /dev/null | od -t x1z
-  0000000 2d 42 44 53                                      >-BDS<
-  0000004
-
-The simple program tools/fmc-mem in this package can access an FMC char
-device and read or write a word or a whole area.  Actually, the program
-is not specific to FMC at all, it just uses lseek, read and write.
-
-Its first argument is the device name, the second the offset, the third
-(if any) the value to write and the optional last argument that must
-begin with "+" is the number of bytes to read or write.  In case of
-repeated reading data is written to stdout; repeated writes read from
-stdin and the value argument is ignored.
-
-The following examples show reading the SDB magic number and the first
-SDB record from a SPEC device programmed with its golden image:
-
-     spusa.root# ./fmc-mem /dev/fmc-0200 100
-     5344422d
-     spusa.root# ./fmc-mem /dev/fmc-0200 100 +40 | od -Ax -t x1z
-     000000 2d 42 44 53 00 01 02 00 00 00 00 00 00 00 00 00  >-BDS............<
-     000010 00 00 00 00 ff 01 00 00 00 00 00 00 51 06 00 00  >............Q...<
-     000020 c9 42 a5 e6 02 00 00 00 11 05 12 20 2d 34 42 57  >.B......... -4BW<
-     000030 73 6f 72 43 72 61 62 73 49 53 47 2d 00 20 20 20  >sorCrabsISG-.   <
-     000040
diff --git a/Documentation/fmc/fmc-fakedev.txt b/Documentation/fmc/fmc-fakedev.txt
deleted file mode 100644
index e85b74a4ae30..000000000000
--- a/Documentation/fmc/fmc-fakedev.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-fmc-fakedev
-===========
-
-This package includes a software-only device, called fmc-fakedev, which
-is able to register up to 4 mezzanines (by default it registers one).
-Unlike the SPEC driver, which creates an FMC device for each PCI cards
-it manages, this module creates a single instance of its set of
-mezzanines.
-
-It is meant as the simplest possible example of how a driver should be
-written, and it includes a fake EEPROM image (built using the tools
-described in *note FMC Identification::),, which by default is
-replicated for each fake mezzanine.
-
-You can also use this device to verify the match algorithms, by asking
-it to test your own EEPROM image. You can provide the image by means of
-the eeprom= module parameter: the new EEPROM image is loaded, as usual,
-by means of the firmware loader.  This example shows the defaults and a
-custom EEPROM image:
-
-     spusa.root# insmod fmc-fakedev.ko
-     [   99.971247]  fake-fmc-carrier: mezzanine 0
-     [   99.975393]       Manufacturer: fake-vendor
-     [   99.979624]       Product name: fake-design-for-testing
-     spusa.root# rmmod fmc-fakedev
-     spusa.root# insmod fmc-fakedev.ko eeprom=fdelay-eeprom.bin
-     [  121.447464]  fake-fmc-carrier: Mezzanine 0: eeprom "fdelay-eeprom.bin"
-     [  121.462725]  fake-fmc-carrier: mezzanine 0
-     [  121.466858]       Manufacturer: CERN
-     [  121.470477]       Product name: FmcDelay1ns4cha
-     spusa.root# rmmod fmc-fakedev
-
-After loading the device, you can use the write_ee method do modify its
-own internal fake EEPROM: whenever the image is overwritten starting at
-offset 0, the module will unregister and register again the FMC device.
-This is shown in fmc-write-eeprom.txt
diff --git a/Documentation/fmc/fmc-trivial.txt b/Documentation/fmc/fmc-trivial.txt
deleted file mode 100644
index d1910bc67159..000000000000
--- a/Documentation/fmc/fmc-trivial.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-fmc-trivial
-===========
-
-The simple module fmc-trivial is just a simple client that registers an
-interrupt handler. I used it to verify the basic mechanism of the FMC
-bus and how interrupts worked.
-
-The module implements the generic FMC parameters, so it can program a
-different gateware file in each card. The whole list of parameters it
-accepts are:
-
-`busid='
-`gateware='
-     Generic parameters. See mezzanine.txt
-
-
-This driver is worth reading, in my opinion.
diff --git a/Documentation/fmc/fmc-write-eeprom.txt b/Documentation/fmc/fmc-write-eeprom.txt
deleted file mode 100644
index e0a9712156aa..000000000000
--- a/Documentation/fmc/fmc-write-eeprom.txt
+++ /dev/null
@@ -1,98 +0,0 @@
-fmc-write-eeprom
-================
-
-This module is designed to load a binary file from /lib/firmware and to
-write it to the internal EEPROM of the mezzanine card. This driver uses
-the `busid' generic parameter.
-
-Overwriting the EEPROM is not something you should do daily, and it is
-expected to only happen during manufacturing. For this reason, the
-module makes it unlikely for the random user to change a working EEPROM.
-
-However, since the EEPROM may include application-specific information
-other than the identification, later versions of this packages added
-write-support through sysfs. See *note Accessing the EEPROM::.
-
-To avoid damaging the EEPROM content, the module takes the following
-measures:
-
-   * It accepts a `file=' argument (within /lib/firmware) and if no
-     such argument is received, it doesn't write anything to EEPROM
-     (i.e. there is no default file name).
-
-   * If the file name ends with `.bin' it is written verbatim starting
-     at offset 0.
-
-   * If the file name ends with `.tlv' it is interpreted as
-     type-length-value (i.e., it allows writev(2)-like operation).
-
-   * If the file name doesn't match any of the patterns above, it is
-     ignored and no write is performed.
-
-   * Only cards listed with `busid=' are written to. If no busid is
-     specified, no programming is done (and the probe function of the
-     driver will fail).
-
-
-Each TLV tuple is formatted in this way: the header is 5 bytes,
-followed by data. The first byte is `w' for write, the next two bytes
-represent the address, in little-endian byte order, and the next two
-represent the data length, in little-endian order. The length does not
-include the header (it is the actual number of bytes to be written).
-
-This is a real example: that writes 5 bytes at position 0x110:
-
-        spusa.root# od -t x1 -Ax /lib/firmware/try.tlv
-        000000 77 10 01 05 00 30 31 32 33 34
-        00000a
-        spusa.root# insmod /tmp/fmc-write-eeprom.ko busid=0x0200 file=try.tlv
-        [19983.391498] spec 0000:03:00.0: write 5 bytes at 0x0110
-        [19983.414615] spec 0000:03:00.0: write_eeprom: success
-
-Please note that you'll most likely want to use SDBFS to build your
-EEPROM image, at least if your mezzanines are being used in the White
-Rabbit environment. For this reason the TLV format is not expected to
-be used much and is not expected to be developed further.
-
-If you want to try reflashing fake EEPROM devices, you can use the
-fmc-fakedev.ko module (see *note fmc-fakedev::).  Whenever you change
-the image starting at offset 0, it will deregister and register again
-after two seconds.  Please note, however, that if fmc-write-eeprom is
-still loaded, the system will associate it to the new device, which
-will be reprogrammed and thus will be unloaded after two seconds.  The
-following example removes the module after it reflashed fakedev the
-first time.
-
-     spusa.root# insmod fmc-fakedev.ko
-        [   72.984733]  fake-fmc: Manufacturer: fake-vendor
-        [   72.989434]  fake-fmc: Product name: fake-design-for-testing
-        spusa.root# insmod fmc-write-eeprom.ko busid=0 file=fdelay-eeprom.bin; \
-            rmmod fmc-write-eeprom
-        [  130.874098]  fake-fmc: Matching a generic driver (no ID)
-        [  130.887845]  fake-fmc: programming 6155 bytes
-        [  130.894567]  fake-fmc: write_eeprom: success
-        [  132.895794]  fake-fmc: Manufacturer: CERN
-        [  132.899872]  fake-fmc: Product name: FmcDelay1ns4cha
-
-
-Accessing the EEPROM
-=====================
-
-The bus creates a sysfs binary file called eeprom for each mezzanine it
-knows about:
-
-        spusa.root# cd /sys/bus/fmc/devices; ls -l */eeprom
-        -r--r--r-- 1 root root 8192 Feb 21 12:30 FmcAdc100m14b4cha-0800/eeprom
-        -r--r--r-- 1 root root 8192 Feb 21 12:30 FmcDelay1ns4cha-0200/eeprom
-        -r--r--r-- 1 root root 8192 Feb 21 12:30 FmcDio5cha-0400/eeprom
-
-Everybody can read the files and the superuser can also modify it, but
-the operation may on the carrier driver, if the carrier is unable to
-access the I2C bus.  For example, the spec driver can access the bus
-only with its golden gateware: after a mezzanine driver reprogrammed
-the FPGA with a custom circuit, the carrier is unable to access the
-EEPROM and returns ENOTSUPP.
-
-An alternative way to write the EEPROM is the mezzanine driver
-fmc-write-eeprom (See *note fmc-write-eeprom::), but the procedure is
-more complex.
diff --git a/Documentation/fmc/identifiers.txt b/Documentation/fmc/identifiers.txt
deleted file mode 100644
index 3bb577ff0d52..000000000000
--- a/Documentation/fmc/identifiers.txt
+++ /dev/null
@@ -1,168 +0,0 @@
-FMC Identification
-******************
-
-The FMC standard requires every compliant mezzanine to carry
-identification information in an I2C EEPROM.  The information must be
-laid out according to the "IPMI Platform Management FRU Information",
-where IPMI is a lie I'd better not expand, and FRU means "Field
-Replaceable Unit".
-
-The FRU information is an intricate unreadable binary blob that must
-live at offset 0 of the EEPROM, and typically extends for a few hundred
-bytes. The standard allows the application to use all the remaining
-storage area of the EEPROM as it wants.
-
-This chapter explains how to create your own EEPROM image and how to
-write it in your mezzanine, as well as how devices and drivers are
-paired at run time.  EEPROM programming uses tools that are part of this
-package and SDB (part of the fpga-config-space package).
-
-The first sections are only interesting for manufacturers who need to
-write the EEPROM. If you are just a software developer writing an FMC
-device or driver, you may jump straight to *note SDB Support::.
-
-
-Building the FRU Structure
-==========================
-
-If you want to know the internals of the FRU structure and despair, you
-can retrieve the document from
-`http://download.intel.com/design/servers/ipmi/FRU1011.pdf' .  The
-standard is awful and difficult without reason, so we only support the
-minimum mandatory subset - we create a simple structure and parse it
-back at run time, but we are not able to either generate or parse more
-arcane features like non-english languages and 6-bit text.  If you need
-more items of the FRU standard for your boards, please submit patches.
-
-This package includes the Python script that Matthieu Cattin wrote to
-generate the FRU binary blob, based on an helper libipmi by Manohar
-Vanga and Matthieu himself.  I changed the test script to receive
-parameters from the command line or from the environment (the command
-line takes precedence)
-
-To make a long story short, in order to build a standard-compliant
-binary file to be burned in your EEPROM, you need the following items:
-
-        Environment    Opt     Official Name          Default
----------------------------------------------------------------------
-        FRU_VENDOR     -v      "Board Manufacturer"   fmc-example
-        FRU_NAME       -n      "Board Product Name"   mezzanine
-        FRU_SERIAL     -s      `Board Serial Number"  0001
-        FRU_PART       -p      "Board Part Number"    sample-part
-        FRU_OUTPUT     -o      not applicable         /dev/stdout
-
-The "Official Name" above is what you find in the FRU official
-documentation, chapter 11, page 7 ("Board Info Area Format").  The
-output option is used to save the generated binary to a specific file
-name instead of stdout.
-
-You can pass the items to the FRU generator either in the environment
-or on the command line.  This package has currently no support for
-specifying power consumption or such stuff, but I plan to add it as
-soon as I find some time for that.
-
-FIXME: consumption etc for FRU are here or in PTS?
-
-The following example creates a binary image for a specific board:
-
-        ./tools/fru-generator -v CERN -n FmcAdc100m14b4cha \
-               -s HCCFFIA___-CR000003 -p EDA-02063-V5-0 > eeprom.bin
-
-The following example shows a script that builds several binary EEPROM
-images for a series of boards, changing the serial number for each of
-them. The script uses a mix of environment variables and command line
-options, and uses the same string patterns shown above.
-
-        #!/bin/sh
-
-        export FRU_VENDOR="CERN"
-        export FRU_NAME="FmcAdc100m14b4cha"
-        export FRU_PART="EDA-02063-V5-0"
-
-        serial="HCCFFIA___-CR"
-
-        for number in $(seq 1 50); do
-           # build number-string "ns"
-           ns="$(printf %06d $number)"
-           ./fru-generator -s "${serial}${ns}" > eeprom-${ns}.bin
-        done
-
-
-Using SDB-FS in the EEPROM
-==========================
-
-If you want to use SDB as a filesystem in the EEPROM device within the
-mezzanine, you should create one such filesystem using gensdbfs, from
-the fpga-config-space package on OHWR.
-
-By using an SBD filesystem you can cluster several files in a single
-EEPROM, so both the host system and a soft-core running in the FPGA (if
-any) can access extra production-time information.
-
-We chose to use SDB as a storage filesystem because the format is very
-simple, and both the host system and the soft-core will likely already
-include support code for such format. The SDB library offered by the
-fpga-config-space is less than 1kB under LM32, so it proves quite up to
-the task.
-
-The SDB entry point (which acts as a directory listing) cannot live at
-offset zero in the flash device, because the FRU information must live
-there.  To avoid wasting precious storage space while still allowing
-for more-than-minimal FRU structures, the fmc.ko will look for the SDB
-record at address 256, 512 and 1024.
-
-In order to generate the complete EEPROM image you'll need a
-configuration file for gensdbfs: you tell the program where to place
-the sdb entry point, and you must force the FRU data file to be placed
-at the beginning of the storage device. If needed, you can also place
-other files at a special offset (we sometimes do it for backward
-compatibility with drivers we wrote before implementing SDB for flash
-memory).
-
-The directory tools/sdbfs of this package includes a well-commented
-example that you may want to use as a starting point (the comments are
-in the file called -SDB-CONFIG-).  Reading documentation for gensdbfs
-is a suggested first step anyways.
-
-This package (generic FMC bus support) only accesses two files in the
-EEPROM: the FRU information, at offset zero, with a suggested filename
-of IPMI-FRU and the short name for the mezzanine, in a file called
-name. The IPMI-FRU name is not mandatory, but a strongly suggested
-choice; the name filename is mandatory, because this is the preferred
-short name used by the FMC core.  For example, a name of "fdelay" may
-supplement a Product Name like "FmcDelay1ns4cha" - exactly as
-demonstrated in `tools/sdbfs'.
-
-Note: SDB access to flash memory is not yet supported, so the short
-name currently in use is just the "Product Name" FRU string.
-
-The example in tools/sdbfs includes an extra file, that is needed by
-the fine-delay driver, and must live at a known address of 0x1800.  By
-running gensdbfs on that directory you can output your binary EEPROM
-image (here below spusa$ is the shell prompt):
-
-        spusa$ ../fru-generator -v CERN -n FmcDelay1ns4cha -s proto-0 \
-                      -p EDA-02267-V3 > IPMI-FRU
-        spusa$ ls -l
-        total 16
-        -rw-rw-r-- 1 rubini staff 975 Nov 19 18:08 --SDB-CONFIG--
-        -rw-rw-r-- 1 rubini staff 216 Nov 19 18:13 IPMI-FRU
-        -rw-rw-r-- 1 rubini staff  11 Nov 19 18:04 fd-calib
-        -rw-rw-r-- 1 rubini staff   7 Nov 19 18:04 name
-        spusa$ sudo gensdbfs . /lib/firmware/fdelay-eeprom.bin
-        spusa$ sdb-read -l -e 0x100 /lib/firmware/fdelay-eeprom.bin
-        /home/rubini/wip/sdbfs/userspace/sdb-read: listing format is to be defined
-        46696c6544617461:2e202020  00000100-000018ff .
-        46696c6544617461:6e616d65  00000200-00000206 name
-        46696c6544617461:66642d63  00001800-000018ff fd-calib
-        46696c6544617461:49504d49  00000000-000000d7 IPMI-FRU
-        spusa$ ../fru-dump /lib/firmware/fdelay-eeprom.bin
-        /lib/firmware/fdelay-eeprom.bin: manufacturer: CERN
-        /lib/firmware/fdelay-eeprom.bin: product-name: FmcDelay1ns4cha
-        /lib/firmware/fdelay-eeprom.bin: serial-number: proto-0
-        /lib/firmware/fdelay-eeprom.bin: part-number: EDA-02267-V3
-
-As expected, the output file is both a proper sdbfs object and an IPMI
-FRU information blob. The fd-calib file lives at offset 0x1800 and is
-over-allocated to 256 bytes, according to the configuration file for
-gensdbfs.
diff --git a/Documentation/fmc/mezzanine.txt b/Documentation/fmc/mezzanine.txt
deleted file mode 100644
index 87910dbfc91e..000000000000
--- a/Documentation/fmc/mezzanine.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-FMC Driver
-**********
-
-An FMC driver is concerned with the specific mezzanine and associated
-gateware. As such, it is expected to be independent of the carrier
-being used: it will perform I/O accesses only by means of
-carrier-provided functions.
-
-The matching between device and driver is based on the content of the
-EEPROM (as mandated by the FMC standard) or by the actual cores
-configured in the FPGA; the latter technique is used when the FPGA is
-already programmed when the device is registered to the bus core.
-
-In some special cases it is possible for a driver to directly access
-FPGA registers, by means of the `fpga_base' field of the device
-structure. This may be needed for high-bandwidth peripherals like fast
-ADC cards. If the device module registered a remote device (for example
-by means of Etherbone), the `fpga_base' pointer will be NULL.
-Therefore, drivers must be ready to deal with NULL base pointers, and
-fail gracefully.  Most driver, however, are not expected to access the
-pointer directly but run fmc_readl and fmc_writel instead, which will
-work in any case.
-
-In even more special cases, the driver may access carrier-specific
-functionality: the `carrier_name' string allows the driver to check
-which is the current carrier and make use of the `carrier_data'
-pointer.  We chose to use carrier names rather than numeric identifiers
-for greater flexibility, but also to avoid a central registry within
-the `fmc.h' file - we hope other users will exploit our framework with
-their own carriers.  An example use of carrier names is in GPIO setup
-(see *note The GPIO Abstraction::), although the name match is not
-expected to be performed by the driver.  If you depend on specific
-carriers, please check the carrier name and fail gracefully if your
-driver finds it is running in a yet-unknown-to-it environment.
-
-
-ID Table
-========
-
-Like most other Linux drivers, and FMC driver must list all the devices
-which it is able to drive.  This is usually done by means of a device
-table, but in FMC we can match hardware based either on the contents of
-their EEPROM or on the actual FPGA cores that can be enumerated.
-Therefore, we have two tables of identifiers.
-
-Matching of FRU information depends on two names, the manufacturer (or
-vendor) and the device (see *note FMC Identification::); for
-flexibility during production (i.e. before writing to the EEPROM) the
-bus supports a catch-all driver that specifies NULL strings. For this
-reason, the table is specified as pointer-and-length, not a a
-null-terminated array - the entry with NULL names can be a valid entry.
-
-Matching on FPGA cores depends on two numeric fields: the 64-bit vendor
-number and the 32-bit device number. Support for matching based on
-class is not yet implemented.  Each device is expected to be uniquely
-identified by an array of cores (it matches if all of the cores are
-instantiated), and for consistency the list is passed as
-pointer-and-length.  Several similar devices can be driven by the same
-driver, and thus the driver specifies and array of such arrays.
-
-The complete set of involved data structures is thus the following:
-
-        struct fmc_fru_id { char *manufacturer; char *product_name; };
-        struct fmc_sdb_one_id { uint64_t vendor; uint32_t device; };
-        struct fmc_sdb_id { struct fmc_sdb_one_id *cores; int cores_nr; };
-
-        struct fmc_device_id {
-                struct fmc_fru_id *fru_id; int fru_id_nr;
-                struct fmc_sdb_id *sdb_id; int sdb_id_nr;
-        };
-
-A better reference, with full explanation, is the <linux/fmc.h> header.
-
-
-Module Parameters
-=================
-
-Most of the FMC drivers need the same set of kernel parameters. This
-package includes support to implement common parameters by means of
-fields in the `fmc_driver' structure and simple macro definitions.
-
-The parameters are carrier-specific, in that they rely on the busid
-concept, that varies among carriers. For the SPEC, the identifier is a
-PCI bus and devfn number, 16 bits wide in total; drivers for other
-carriers will most likely offer something similar but not identical,
-and some code duplication is unavoidable.
-
-This is the list of parameters that are common to several modules to
-see how they are actually used, please look at spec-trivial.c.
-
-`busid='
-     This is an array of integers, listing carrier-specific
-     identification numbers. For PIC, for example, `0x0400' represents
-     bus 4, slot 0.  If any such ID is specified, the driver will only
-     accept to drive cards that appear in the list (even if the FMC ID
-     matches). This is accomplished by the validate carrier method.
-
-`gateware='
-     The argument is an array of strings. If no busid= is specified,
-     the first string of gateware= is used for all cards; otherwise the
-     identifiers and gateware names are paired one by one, in the order
-     specified.
-
-`show_sdb='
-     For modules supporting it, this parameter asks to show the SDB
-     internal structure by means of kernel messages. It is disabled by
-     default because those lines tend to hide more important messages,
-     if you look at the system console while loading the drivers.
-     Note: the parameter is being obsoleted, because fmc.ko itself now
-     supports dump_sdb= that applies to every client driver.
-
-
-For example, if you are using the trivial driver to load two different
-gateware files to two different cards, you can use the following
-parameters to load different binaries to the cards, after looking up
-the PCI identifiers. This has been tested with a SPEC carrier.
-
-        insmod fmc-trivial.ko \
-                              busid=0x0200,0x0400 \
-                              gateware=fmc/fine-delay.bin,fmc/simple-dio.bin
-
-Please note that not all sub-modules support all of those parameters.
-You can use modinfo to check what is supported by each module.
diff --git a/Documentation/fmc/parameters.txt b/Documentation/fmc/parameters.txt
deleted file mode 100644
index 59edf088e3a4..000000000000
--- a/Documentation/fmc/parameters.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-Module Parameters in fmc.ko
-***************************
-
-The core driver receives two module parameters, meant to help debugging
-client modules. Both parameters can be modified by writing to
-/sys/module/fmc/parameters/, because they are used when client drivers
-are devices are registered, not when fmc.ko is loaded.
-
-`dump_eeprom='
-     If not zero, the parameter asks the bus controller to dump the
-     EEPROM of any device that is registered, using printk.
-
-`dump_sdb='
-     If not zero, the parameter prints the SDB tree of every FPGA it is
-     loaded by fmc_reprogram(). If greater than one, it asks to dump
-     the binary content of SDB records.  This currently only dumps the
-     top-level SDB array, though.
-
-
-EEPROM dumping avoids repeating lines, since most of the contents is
-usually empty and all bits are one or zero. This is an example of the
-output:
-
-        [ 6625.850480] spec 0000:02:00.0: FPGA programming successful
-        [ 6626.139949] spec 0000:02:00.0: Manufacturer: CERN
-        [ 6626.144666] spec 0000:02:00.0: Product name: FmcDelay1ns4cha
-        [ 6626.150370] FMC: mezzanine 0: 0000:02:00.0 on SPEC
-        [ 6626.155179] FMC: dumping eeprom 0x2000 (8192) bytes
-        [ 6626.160087] 0000: 01 00 00 01  00 0b 00 f3  01 0a 00 a5  85 87 c4 43
-        [ 6626.167069] 0010: 45 52 4e cf  46 6d 63 44  65 6c 61 79  31 6e 73 34
-        [ 6626.174019] 0020: 63 68 61 c7  70 72 6f 74  6f 2d 30 cc  45 44 41 2d
-        [ 6626.180975] 0030: 30 32 32 36  37 2d 56 33  da 32 30 31  32 2d 31 31
-        [...]
-        [ 6626.371366] 0200: 66 64 65 6c  61 79 0a 00  00 00 00 00  00 00 00 00
-        [ 6626.378359] 0210: 00 00 00 00  00 00 00 00  00 00 00 00  00 00 00 00
-        [ 6626.385361] [...]
-        [ 6626.387308] 1800: 70 6c 61 63  65 68 6f 6c  64 65 72 ff  ff ff ff ff
-        [ 6626.394259] 1810: ff ff ff ff  ff ff ff ff  ff ff ff ff  ff ff ff ff
-        [ 6626.401250] [...]
-
-The dump of SDB looks like the following; the example shows the simple
-golden gateware for the SPEC card, removing the leading timestamps to
-fit the page:
-
-        spec 0000:02:00.0: SDB: 00000651:e6a542c9 WB4-Crossbar-GSI
-        spec 0000:02:00.0: SDB: 0000ce42:ff07fc47 WR-Periph-Syscon (00000000-000000ff)
-        FMC: mezzanine 0: 0000:02:00.0 on SPEC
-        FMC: poor dump of sdb first level:
-        0000: 53 44 42 2d  00 02 01 00  00 00 00 00  00 00 00 00
-        0010: 00 00 00 00  00 00 01 ff  00 00 00 00  00 00 06 51
-        0020: e6 a5 42 c9  00 00 00 02  20 12 05 11  57 42 34 2d
-        0030: 43 72 6f 73  73 62 61 72  2d 47 53 49  20 20 20 00
-        0040: 00 00 01 01  00 00 00 07  00 00 00 00  00 00 00 00
-        0050: 00 00 00 00  00 00 00 ff  00 00 00 00  00 00 ce 42
-        0060: ff 07 fc 47  00 00 00 01  20 12 03 05  57 52 2d 50
-        0070: 65 72 69 70  68 2d 53 79  73 63 6f 6e  20 20 20 01
diff --git a/Documentation/fpga/dfl.rst b/Documentation/fpga/dfl.rst
new file mode 100644
index 000000000000..2f125abd777f
--- /dev/null
+++ b/Documentation/fpga/dfl.rst
@@ -0,0 +1,291 @@
+=================================================
+FPGA Device Feature List (DFL) Framework Overview
+=================================================
+
+Authors:
+
+- Enno Luebbers <enno.luebbers@intel.com>
+- Xiao Guangrong <guangrong.xiao@linux.intel.com>
+- Wu Hao <hao.wu@intel.com>
+
+The Device Feature List (DFL) FPGA framework (and drivers according to this
+this framework) hides the very details of low layer hardwares and provides
+unified interfaces to userspace. Applications could use these interfaces to
+configure, enumerate, open and access FPGA accelerators on platforms which
+implement the DFL in the device memory. Besides this, the DFL framework
+enables system level management functions such as FPGA reconfiguration.
+
+
+Device Feature List (DFL) Overview
+==================================
+Device Feature List (DFL) defines a linked list of feature headers within the
+device MMIO space to provide an extensible way of adding features. Software can
+walk through these predefined data structures to enumerate FPGA features:
+FPGA Interface Unit (FIU), Accelerated Function Unit (AFU) and Private Features,
+as illustrated below::
+
+    Header            Header            Header            Header
+ +----------+  +-->+----------+  +-->+----------+  +-->+----------+
+ |   Type   |  |   |  Type    |  |   |  Type    |  |   |  Type    |
+ |   FIU    |  |   | Private  |  |   | Private  |  |   | Private  |
+ +----------+  |   | Feature  |  |   | Feature  |  |   | Feature  |
+ | Next_DFH |--+   +----------+  |   +----------+  |   +----------+
+ +----------+      | Next_DFH |--+   | Next_DFH |--+   | Next_DFH |--> NULL
+ |    ID    |      +----------+      +----------+      +----------+
+ +----------+      |    ID    |      |    ID    |      |    ID    |
+ | Next_AFU |--+   +----------+      +----------+      +----------+
+ +----------+  |   | Feature  |      | Feature  |      | Feature  |
+ |  Header  |  |   | Register |      | Register |      | Register |
+ | Register |  |   |   Set    |      |   Set    |      |   Set    |
+ |   Set    |  |   +----------+      +----------+      +----------+
+ +----------+  |      Header
+               +-->+----------+
+                   |   Type   |
+                   |   AFU    |
+                   +----------+
+                   | Next_DFH |--> NULL
+                   +----------+
+                   |   GUID   |
+                   +----------+
+                   |  Header  |
+                   | Register |
+                   |   Set    |
+                   +----------+
+
+FPGA Interface Unit (FIU) represents a standalone functional unit for the
+interface to FPGA, e.g. the FPGA Management Engine (FME) and Port (more
+descriptions on FME and Port in later sections).
+
+Accelerated Function Unit (AFU) represents a FPGA programmable region and
+always connects to a FIU (e.g. a Port) as its child as illustrated above.
+
+Private Features represent sub features of the FIU and AFU. They could be
+various function blocks with different IDs, but all private features which
+belong to the same FIU or AFU, must be linked to one list via the Next Device
+Feature Header (Next_DFH) pointer.
+
+Each FIU, AFU and Private Feature could implement its own functional registers.
+The functional register set for FIU and AFU, is named as Header Register Set,
+e.g. FME Header Register Set, and the one for Private Feature, is named as
+Feature Register Set, e.g. FME Partial Reconfiguration Feature Register Set.
+
+This Device Feature List provides a way of linking features together, it's
+convenient for software to locate each feature by walking through this list,
+and can be implemented in register regions of any FPGA device.
+
+
+FIU - FME (FPGA Management Engine)
+==================================
+The FPGA Management Engine performs reconfiguration and other infrastructure
+functions. Each FPGA device only has one FME.
+
+User-space applications can acquire exclusive access to the FME using open(),
+and release it using close().
+
+The following functions are exposed through ioctls:
+
+- Get driver API version (DFL_FPGA_GET_API_VERSION)
+- Check for extensions (DFL_FPGA_CHECK_EXTENSION)
+- Program bitstream (DFL_FPGA_FME_PORT_PR)
+
+More functions are exposed through sysfs
+(/sys/class/fpga_region/regionX/dfl-fme.n/):
+
+ Read bitstream ID (bitstream_id)
+     bitstream_id indicates version of the static FPGA region.
+
+ Read bitstream metadata (bitstream_metadata)
+     bitstream_metadata includes detailed information of static FPGA region,
+     e.g. synthesis date and seed.
+
+ Read number of ports (ports_num)
+     one FPGA device may have more than one port, this sysfs interface indicates
+     how many ports the FPGA device has.
+
+
+FIU - PORT
+==========
+A port represents the interface between the static FPGA fabric and a partially
+reconfigurable region containing an AFU. It controls the communication from SW
+to the accelerator and exposes features such as reset and debug. Each FPGA
+device may have more than one port, but always one AFU per port.
+
+
+AFU
+===
+An AFU is attached to a port FIU and exposes a fixed length MMIO region to be
+used for accelerator-specific control registers.
+
+User-space applications can acquire exclusive access to an AFU attached to a
+port by using open() on the port device node and release it using close().
+
+The following functions are exposed through ioctls:
+
+- Get driver API version (DFL_FPGA_GET_API_VERSION)
+- Check for extensions (DFL_FPGA_CHECK_EXTENSION)
+- Get port info (DFL_FPGA_PORT_GET_INFO)
+- Get MMIO region info (DFL_FPGA_PORT_GET_REGION_INFO)
+- Map DMA buffer (DFL_FPGA_PORT_DMA_MAP)
+- Unmap DMA buffer (DFL_FPGA_PORT_DMA_UNMAP)
+- Reset AFU (DFL_FPGA_PORT_RESET)
+
+DFL_FPGA_PORT_RESET:
+  reset the FPGA Port and its AFU. Userspace can do Port
+  reset at any time, e.g. during DMA or Partial Reconfiguration. But it should
+  never cause any system level issue, only functional failure (e.g. DMA or PR
+  operation failure) and be recoverable from the failure.
+
+User-space applications can also mmap() accelerator MMIO regions.
+
+More functions are exposed through sysfs:
+(/sys/class/fpga_region/<regionX>/<dfl-port.m>/):
+
+ Read Accelerator GUID (afu_id)
+     afu_id indicates which PR bitstream is programmed to this AFU.
+
+
+DFL Framework Overview
+======================
+
+::
+
+         +----------+    +--------+ +--------+ +--------+
+         |   FME    |    |  AFU   | |  AFU   | |  AFU   |
+         |  Module  |    | Module | | Module | | Module |
+         +----------+    +--------+ +--------+ +--------+
+                 +-----------------------+
+                 | FPGA Container Device |    Device Feature List
+                 |  (FPGA Base Region)   |         Framework
+                 +-----------------------+
+  ------------------------------------------------------------------
+               +----------------------------+
+               |   FPGA DFL Device Module   |
+               | (e.g. PCIE/Platform Device)|
+               +----------------------------+
+                 +------------------------+
+                 |  FPGA Hardware Device  |
+                 +------------------------+
+
+DFL framework in kernel provides common interfaces to create container device
+(FPGA base region), discover feature devices and their private features from the
+given Device Feature Lists and create platform devices for feature devices
+(e.g. FME, Port and AFU) with related resources under the container device. It
+also abstracts operations for the private features and exposes common ops to
+feature device drivers.
+
+The FPGA DFL Device could be different hardwares, e.g. PCIe device, platform
+device and etc. Its driver module is always loaded first once the device is
+created by the system. This driver plays an infrastructural role in the
+driver architecture. It locates the DFLs in the device memory, handles them
+and related resources to common interfaces from DFL framework for enumeration.
+(Please refer to drivers/fpga/dfl.c for detailed enumeration APIs).
+
+The FPGA Management Engine (FME) driver is a platform driver which is loaded
+automatically after FME platform device creation from the DFL device module. It
+provides the key features for FPGA management, including:
+
+	a) Expose static FPGA region information, e.g. version and metadata.
+	   Users can read related information via sysfs interfaces exposed
+	   by FME driver.
+
+	b) Partial Reconfiguration. The FME driver creates FPGA manager, FPGA
+	   bridges and FPGA regions during PR sub feature initialization. Once
+	   it receives a DFL_FPGA_FME_PORT_PR ioctl from user, it invokes the
+	   common interface function from FPGA Region to complete the partial
+	   reconfiguration of the PR bitstream to the given port.
+
+Similar to the FME driver, the FPGA Accelerated Function Unit (AFU) driver is
+probed once the AFU platform device is created. The main function of this module
+is to provide an interface for userspace applications to access the individual
+accelerators, including basic reset control on port, AFU MMIO region export, dma
+buffer mapping service functions.
+
+After feature platform devices creation, matched platform drivers will be loaded
+automatically to handle different functionalities. Please refer to next sections
+for detailed information on functional units which have been already implemented
+under this DFL framework.
+
+
+Partial Reconfiguration
+=======================
+As mentioned above, accelerators can be reconfigured through partial
+reconfiguration of a PR bitstream file. The PR bitstream file must have been
+generated for the exact static FPGA region and targeted reconfigurable region
+(port) of the FPGA, otherwise, the reconfiguration operation will fail and
+possibly cause system instability. This compatibility can be checked by
+comparing the compatibility ID noted in the header of PR bitstream file against
+the compat_id exposed by the target FPGA region. This check is usually done by
+userspace before calling the reconfiguration IOCTL.
+
+
+Device enumeration
+==================
+This section introduces how applications enumerate the fpga device from
+the sysfs hierarchy under /sys/class/fpga_region.
+
+In the example below, two DFL based FPGA devices are installed in the host. Each
+fpga device has one FME and two ports (AFUs).
+
+FPGA regions are created under /sys/class/fpga_region/::
+
+	/sys/class/fpga_region/region0
+	/sys/class/fpga_region/region1
+	/sys/class/fpga_region/region2
+	...
+
+Application needs to search each regionX folder, if feature device is found,
+(e.g. "dfl-port.n" or "dfl-fme.m" is found), then it's the base
+fpga region which represents the FPGA device.
+
+Each base region has one FME and two ports (AFUs) as child devices::
+
+	/sys/class/fpga_region/region0/dfl-fme.0
+	/sys/class/fpga_region/region0/dfl-port.0
+	/sys/class/fpga_region/region0/dfl-port.1
+	...
+
+	/sys/class/fpga_region/region3/dfl-fme.1
+	/sys/class/fpga_region/region3/dfl-port.2
+	/sys/class/fpga_region/region3/dfl-port.3
+	...
+
+In general, the FME/AFU sysfs interfaces are named as follows::
+
+	/sys/class/fpga_region/<regionX>/<dfl-fme.n>/
+	/sys/class/fpga_region/<regionX>/<dfl-port.m>/
+
+with 'n' consecutively numbering all FMEs and 'm' consecutively numbering all
+ports.
+
+The device nodes used for ioctl() or mmap() can be referenced through::
+
+	/sys/class/fpga_region/<regionX>/<dfl-fme.n>/dev
+	/sys/class/fpga_region/<regionX>/<dfl-port.n>/dev
+
+
+Add new FIUs support
+====================
+It's possible that developers made some new function blocks (FIUs) under this
+DFL framework, then new platform device driver needs to be developed for the
+new feature dev (FIU) following the same way as existing feature dev drivers
+(e.g. FME and Port/AFU platform device driver). Besides that, it requires
+modification on DFL framework enumeration code too, for new FIU type detection
+and related platform devices creation.
+
+
+Add new private features support
+================================
+In some cases, we may need to add some new private features to existing FIUs
+(e.g. FME or Port). Developers don't need to touch enumeration code in DFL
+framework, as each private feature will be parsed automatically and related
+mmio resources can be found under FIU platform device created by DFL framework.
+Developer only needs to provide a sub feature driver with matched feature id.
+FME Partial Reconfiguration Sub Feature driver (see drivers/fpga/dfl-fme-pr.c)
+could be a reference.
+
+
+Open discussion
+===============
+FME driver exports one ioctl (DFL_FPGA_FME_PORT_PR) for partial reconfiguration
+to user now. In the future, if unified user interfaces for reconfiguration are
+added, FME driver should switch to them from ioctl interface.
diff --git a/Documentation/fpga/dfl.txt b/Documentation/fpga/dfl.txt
deleted file mode 100644
index 6df4621c3f2a..000000000000
--- a/Documentation/fpga/dfl.txt
+++ /dev/null
@@ -1,285 +0,0 @@
-===============================================================================
-              FPGA Device Feature List (DFL) Framework Overview
--------------------------------------------------------------------------------
-                Enno Luebbers <enno.luebbers@intel.com>
-                Xiao Guangrong <guangrong.xiao@linux.intel.com>
-                Wu Hao <hao.wu@intel.com>
-
-The Device Feature List (DFL) FPGA framework (and drivers according to this
-this framework) hides the very details of low layer hardwares and provides
-unified interfaces to userspace. Applications could use these interfaces to
-configure, enumerate, open and access FPGA accelerators on platforms which
-implement the DFL in the device memory. Besides this, the DFL framework
-enables system level management functions such as FPGA reconfiguration.
-
-
-Device Feature List (DFL) Overview
-==================================
-Device Feature List (DFL) defines a linked list of feature headers within the
-device MMIO space to provide an extensible way of adding features. Software can
-walk through these predefined data structures to enumerate FPGA features:
-FPGA Interface Unit (FIU), Accelerated Function Unit (AFU) and Private Features,
-as illustrated below:
-
-    Header            Header            Header            Header
- +----------+  +-->+----------+  +-->+----------+  +-->+----------+
- |   Type   |  |   |  Type    |  |   |  Type    |  |   |  Type    |
- |   FIU    |  |   | Private  |  |   | Private  |  |   | Private  |
- +----------+  |   | Feature  |  |   | Feature  |  |   | Feature  |
- | Next_DFH |--+   +----------+  |   +----------+  |   +----------+
- +----------+      | Next_DFH |--+   | Next_DFH |--+   | Next_DFH |--> NULL
- |    ID    |      +----------+      +----------+      +----------+
- +----------+      |    ID    |      |    ID    |      |    ID    |
- | Next_AFU |--+   +----------+      +----------+      +----------+
- +----------+  |   | Feature  |      | Feature  |      | Feature  |
- |  Header  |  |   | Register |      | Register |      | Register |
- | Register |  |   |   Set    |      |   Set    |      |   Set    |
- |   Set    |  |   +----------+      +----------+      +----------+
- +----------+  |      Header
-               +-->+----------+
-                   |   Type   |
-                   |   AFU    |
-                   +----------+
-                   | Next_DFH |--> NULL
-                   +----------+
-                   |   GUID   |
-                   +----------+
-                   |  Header  |
-                   | Register |
-                   |   Set    |
-                   +----------+
-
-FPGA Interface Unit (FIU) represents a standalone functional unit for the
-interface to FPGA, e.g. the FPGA Management Engine (FME) and Port (more
-descriptions on FME and Port in later sections).
-
-Accelerated Function Unit (AFU) represents a FPGA programmable region and
-always connects to a FIU (e.g. a Port) as its child as illustrated above.
-
-Private Features represent sub features of the FIU and AFU. They could be
-various function blocks with different IDs, but all private features which
-belong to the same FIU or AFU, must be linked to one list via the Next Device
-Feature Header (Next_DFH) pointer.
-
-Each FIU, AFU and Private Feature could implement its own functional registers.
-The functional register set for FIU and AFU, is named as Header Register Set,
-e.g. FME Header Register Set, and the one for Private Feature, is named as
-Feature Register Set, e.g. FME Partial Reconfiguration Feature Register Set.
-
-This Device Feature List provides a way of linking features together, it's
-convenient for software to locate each feature by walking through this list,
-and can be implemented in register regions of any FPGA device.
-
-
-FIU - FME (FPGA Management Engine)
-==================================
-The FPGA Management Engine performs reconfiguration and other infrastructure
-functions. Each FPGA device only has one FME.
-
-User-space applications can acquire exclusive access to the FME using open(),
-and release it using close().
-
-The following functions are exposed through ioctls:
-
- Get driver API version (DFL_FPGA_GET_API_VERSION)
- Check for extensions (DFL_FPGA_CHECK_EXTENSION)
- Program bitstream (DFL_FPGA_FME_PORT_PR)
-
-More functions are exposed through sysfs
-(/sys/class/fpga_region/regionX/dfl-fme.n/):
-
- Read bitstream ID (bitstream_id)
-     bitstream_id indicates version of the static FPGA region.
-
- Read bitstream metadata (bitstream_metadata)
-     bitstream_metadata includes detailed information of static FPGA region,
-     e.g. synthesis date and seed.
-
- Read number of ports (ports_num)
-     one FPGA device may have more than one port, this sysfs interface indicates
-     how many ports the FPGA device has.
-
-
-FIU - PORT
-==========
-A port represents the interface between the static FPGA fabric and a partially
-reconfigurable region containing an AFU. It controls the communication from SW
-to the accelerator and exposes features such as reset and debug. Each FPGA
-device may have more than one port, but always one AFU per port.
-
-
-AFU
-===
-An AFU is attached to a port FIU and exposes a fixed length MMIO region to be
-used for accelerator-specific control registers.
-
-User-space applications can acquire exclusive access to an AFU attached to a
-port by using open() on the port device node and release it using close().
-
-The following functions are exposed through ioctls:
-
- Get driver API version (DFL_FPGA_GET_API_VERSION)
- Check for extensions (DFL_FPGA_CHECK_EXTENSION)
- Get port info (DFL_FPGA_PORT_GET_INFO)
- Get MMIO region info (DFL_FPGA_PORT_GET_REGION_INFO)
- Map DMA buffer (DFL_FPGA_PORT_DMA_MAP)
- Unmap DMA buffer (DFL_FPGA_PORT_DMA_UNMAP)
- Reset AFU (*DFL_FPGA_PORT_RESET)
-
-*DFL_FPGA_PORT_RESET: reset the FPGA Port and its AFU. Userspace can do Port
-reset at any time, e.g. during DMA or Partial Reconfiguration. But it should
-never cause any system level issue, only functional failure (e.g. DMA or PR
-operation failure) and be recoverable from the failure.
-
-User-space applications can also mmap() accelerator MMIO regions.
-
-More functions are exposed through sysfs:
-(/sys/class/fpga_region/<regionX>/<dfl-port.m>/):
-
- Read Accelerator GUID (afu_id)
-     afu_id indicates which PR bitstream is programmed to this AFU.
-
-
-DFL Framework Overview
-======================
-
-         +----------+    +--------+ +--------+ +--------+
-         |   FME    |    |  AFU   | |  AFU   | |  AFU   |
-         |  Module  |    | Module | | Module | | Module |
-         +----------+    +--------+ +--------+ +--------+
-                 +-----------------------+
-                 | FPGA Container Device |    Device Feature List
-                 |  (FPGA Base Region)   |         Framework
-                 +-----------------------+
---------------------------------------------------------------------
-               +----------------------------+
-               |   FPGA DFL Device Module   |
-               | (e.g. PCIE/Platform Device)|
-               +----------------------------+
-                 +------------------------+
-                 |  FPGA Hardware Device  |
-                 +------------------------+
-
-DFL framework in kernel provides common interfaces to create container device
-(FPGA base region), discover feature devices and their private features from the
-given Device Feature Lists and create platform devices for feature devices
-(e.g. FME, Port and AFU) with related resources under the container device. It
-also abstracts operations for the private features and exposes common ops to
-feature device drivers.
-
-The FPGA DFL Device could be different hardwares, e.g. PCIe device, platform
-device and etc. Its driver module is always loaded first once the device is
-created by the system. This driver plays an infrastructural role in the
-driver architecture. It locates the DFLs in the device memory, handles them
-and related resources to common interfaces from DFL framework for enumeration.
-(Please refer to drivers/fpga/dfl.c for detailed enumeration APIs).
-
-The FPGA Management Engine (FME) driver is a platform driver which is loaded
-automatically after FME platform device creation from the DFL device module. It
-provides the key features for FPGA management, including:
-
-	a) Expose static FPGA region information, e.g. version and metadata.
-	   Users can read related information via sysfs interfaces exposed
-	   by FME driver.
-
-	b) Partial Reconfiguration. The FME driver creates FPGA manager, FPGA
-	   bridges and FPGA regions during PR sub feature initialization. Once
-	   it receives a DFL_FPGA_FME_PORT_PR ioctl from user, it invokes the
-	   common interface function from FPGA Region to complete the partial
-	   reconfiguration of the PR bitstream to the given port.
-
-Similar to the FME driver, the FPGA Accelerated Function Unit (AFU) driver is
-probed once the AFU platform device is created. The main function of this module
-is to provide an interface for userspace applications to access the individual
-accelerators, including basic reset control on port, AFU MMIO region export, dma
-buffer mapping service functions.
-
-After feature platform devices creation, matched platform drivers will be loaded
-automatically to handle different functionalities. Please refer to next sections
-for detailed information on functional units which have been already implemented
-under this DFL framework.
-
-
-Partial Reconfiguration
-=======================
-As mentioned above, accelerators can be reconfigured through partial
-reconfiguration of a PR bitstream file. The PR bitstream file must have been
-generated for the exact static FPGA region and targeted reconfigurable region
-(port) of the FPGA, otherwise, the reconfiguration operation will fail and
-possibly cause system instability. This compatibility can be checked by
-comparing the compatibility ID noted in the header of PR bitstream file against
-the compat_id exposed by the target FPGA region. This check is usually done by
-userspace before calling the reconfiguration IOCTL.
-
-
-Device enumeration
-==================
-This section introduces how applications enumerate the fpga device from
-the sysfs hierarchy under /sys/class/fpga_region.
-
-In the example below, two DFL based FPGA devices are installed in the host. Each
-fpga device has one FME and two ports (AFUs).
-
-FPGA regions are created under /sys/class/fpga_region/
-
-	/sys/class/fpga_region/region0
-	/sys/class/fpga_region/region1
-	/sys/class/fpga_region/region2
-	...
-
-Application needs to search each regionX folder, if feature device is found,
-(e.g. "dfl-port.n" or "dfl-fme.m" is found), then it's the base
-fpga region which represents the FPGA device.
-
-Each base region has one FME and two ports (AFUs) as child devices:
-
-	/sys/class/fpga_region/region0/dfl-fme.0
-	/sys/class/fpga_region/region0/dfl-port.0
-	/sys/class/fpga_region/region0/dfl-port.1
-	...
-
-	/sys/class/fpga_region/region3/dfl-fme.1
-	/sys/class/fpga_region/region3/dfl-port.2
-	/sys/class/fpga_region/region3/dfl-port.3
-	...
-
-In general, the FME/AFU sysfs interfaces are named as follows:
-
-	/sys/class/fpga_region/<regionX>/<dfl-fme.n>/
-	/sys/class/fpga_region/<regionX>/<dfl-port.m>/
-
-with 'n' consecutively numbering all FMEs and 'm' consecutively numbering all
-ports.
-
-The device nodes used for ioctl() or mmap() can be referenced through:
-
-	/sys/class/fpga_region/<regionX>/<dfl-fme.n>/dev
-	/sys/class/fpga_region/<regionX>/<dfl-port.n>/dev
-
-
-Add new FIUs support
-====================
-It's possible that developers made some new function blocks (FIUs) under this
-DFL framework, then new platform device driver needs to be developed for the
-new feature dev (FIU) following the same way as existing feature dev drivers
-(e.g. FME and Port/AFU platform device driver). Besides that, it requires
-modification on DFL framework enumeration code too, for new FIU type detection
-and related platform devices creation.
-
-
-Add new private features support
-================================
-In some cases, we may need to add some new private features to existing FIUs
-(e.g. FME or Port). Developers don't need to touch enumeration code in DFL
-framework, as each private feature will be parsed automatically and related
-mmio resources can be found under FIU platform device created by DFL framework.
-Developer only needs to provide a sub feature driver with matched feature id.
-FME Partial Reconfiguration Sub Feature driver (see drivers/fpga/dfl-fme-pr.c)
-could be a reference.
-
-
-Open discussion
-===============
-FME driver exports one ioctl (DFL_FPGA_FME_PORT_PR) for partial reconfiguration
-to user now. In the future, if unified user interfaces for reconfiguration are
-added, FME driver should switch to them from ioctl interface.
diff --git a/Documentation/fpga/index.rst b/Documentation/fpga/index.rst
new file mode 100644
index 000000000000..f80f95667ca2
--- /dev/null
+++ b/Documentation/fpga/index.rst
@@ -0,0 +1,17 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+fpga
+====
+
+.. toctree::
+    :maxdepth: 1
+
+    dfl
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/gpio/index.rst b/Documentation/gpio/index.rst
deleted file mode 100644
index 09a4a553f434..000000000000
--- a/Documentation/gpio/index.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-:orphan:
-
-====
-gpio
-====
-
-.. toctree::
-    :maxdepth: 1
-
-    sysfs
-
-.. only::  subproject and html
-
-   Indices
-   =======
-
-   * :ref:`genindex`
diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst
index a740e491dfcc..5acdd1842ea2 100644
--- a/Documentation/gpu/amdgpu.rst
+++ b/Documentation/gpu/amdgpu.rst
@@ -37,10 +37,10 @@ Buffer Objects
 PRIME Buffer Sharing
 --------------------
 
-.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
    :doc: PRIME Buffer Sharing
 
-.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
    :internal:
 
 MMU Notifier
@@ -70,6 +70,26 @@ Interrupt Handling
 .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
    :internal:
 
+AMDGPU XGMI Support
+===================
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+   :doc: AMDGPU XGMI Support
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+   :internal:
+
+AMDGPU RAS debugfs control interface
+====================================
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+   :doc: AMDGPU RAS debugfs control interface
+
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+   :internal:
+
+
 GPU Power/Thermal Controls and Monitoring
 =========================================
 
diff --git a/Documentation/gpu/conf.py b/Documentation/gpu/conf.py
deleted file mode 100644
index 1757b040fb32..000000000000
--- a/Documentation/gpu/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = "Linux GPU Driver Developer's Guide"
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'gpu.tex', project,
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/gpu/drivers.rst b/Documentation/gpu/drivers.rst
index 044a7025477c..4bfb7068e9f7 100644
--- a/Documentation/gpu/drivers.rst
+++ b/Documentation/gpu/drivers.rst
@@ -7,6 +7,7 @@ GPU Driver Documentation
    amdgpu
    amdgpu-dc
    i915
+   mcde
    meson
    pl111
    tegra
diff --git a/Documentation/gpu/drm-client.rst b/Documentation/gpu/drm-client.rst
index 7e672063e7eb..58b5a1d1219d 100644
--- a/Documentation/gpu/drm-client.rst
+++ b/Documentation/gpu/drm-client.rst
@@ -10,3 +10,6 @@ Kernel clients
 
 .. kernel-doc:: drivers/gpu/drm/drm_client.c
    :export:
+
+.. kernel-doc:: drivers/gpu/drm/drm_client_modeset.c
+   :export:
diff --git a/Documentation/gpu/drm-kms-helpers.rst b/Documentation/gpu/drm-kms-helpers.rst
index 14102ae035dc..b327bbc11182 100644
--- a/Documentation/gpu/drm-kms-helpers.rst
+++ b/Documentation/gpu/drm-kms-helpers.rst
@@ -181,6 +181,21 @@ Panel Helper Reference
 .. kernel-doc:: drivers/gpu/drm/drm_panel_orientation_quirks.c
    :export:
 
+Panel Self Refresh Helper Reference
+===================================
+
+.. kernel-doc:: drivers/gpu/drm/drm_self_refresh_helper.c
+   :doc: overview
+
+.. kernel-doc:: drivers/gpu/drm/drm_self_refresh_helper.c
+   :export:
+
+HDCP Helper Functions Reference
+===============================
+
+.. kernel-doc:: drivers/gpu/drm/drm_hdcp.c
+   :export:
+
 Display Port Helper Functions Reference
 =======================================
 
diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
index 54a696d961a7..c8ebd4f66a6a 100644
--- a/Documentation/gpu/drm-mm.rst
+++ b/Documentation/gpu/drm-mm.rst
@@ -79,7 +79,6 @@ count for the TTM, which will call your initialization function.
 
 See the radeon_ttm.c file for an example of usage.
 
-
 The Graphics Execution Manager (GEM)
 ====================================
 
@@ -380,6 +379,39 @@ GEM CMA Helper Functions Reference
 .. kernel-doc:: drivers/gpu/drm/drm_gem_cma_helper.c
    :export:
 
+VRAM Helper Function Reference
+==============================
+
+.. kernel-doc:: drivers/gpu/drm/drm_vram_helper_common.c
+   :doc: overview
+
+.. kernel-doc:: include/drm/drm_gem_vram_helper.h
+   :internal:
+
+GEM VRAM Helper Functions Reference
+-----------------------------------
+
+.. kernel-doc:: drivers/gpu/drm/drm_gem_vram_helper.c
+   :doc: overview
+
+.. kernel-doc:: include/drm/drm_gem_vram_helper.h
+   :internal:
+
+.. kernel-doc:: drivers/gpu/drm/drm_gem_vram_helper.c
+   :export:
+
+VRAM MM Helper Functions Reference
+----------------------------------
+
+.. kernel-doc:: drivers/gpu/drm/drm_vram_mm_helper.c
+   :doc: overview
+
+.. kernel-doc:: include/drm/drm_vram_mm_helper.h
+   :internal:
+
+.. kernel-doc:: drivers/gpu/drm/drm_vram_mm_helper.c
+   :export:
+
 VMA Offset Manager
 ==================
 
diff --git a/Documentation/gpu/drm-uapi.rst b/Documentation/gpu/drm-uapi.rst
index c9fd23efd957..94f90521f58c 100644
--- a/Documentation/gpu/drm-uapi.rst
+++ b/Documentation/gpu/drm-uapi.rst
@@ -85,16 +85,18 @@ leads to a few additional requirements:
 - The userspace side must be fully reviewed and tested to the standards of that
   userspace project. For e.g. mesa this means piglit testcases and review on the
   mailing list. This is again to ensure that the new interface actually gets the
-  job done.
+  job done.  The userspace-side reviewer should also provide an Acked-by on the
+  kernel uAPI patch indicating that they believe the proposed uAPI is sound and
+  sufficiently documented and validated for userspace's consumption.
 
 - The userspace patches must be against the canonical upstream, not some vendor
   fork. This is to make sure that no one cheats on the review and testing
   requirements by doing a quick fork.
 
 - The kernel patch can only be merged after all the above requirements are met,
-  but it **must** be merged **before** the userspace patches land. uAPI always flows
-  from the kernel, doing things the other way round risks divergence of the uAPI
-  definitions and header files.
+  but it **must** be merged to either drm-next or drm-misc-next **before** the
+  userspace patches land. uAPI always flows from the kernel, doing things the
+  other way round risks divergence of the uAPI definitions and header files.
 
 These are fairly steep requirements, but have grown out from years of shared
 pain and experience with uAPI added hastily, and almost always regretted about
@@ -327,3 +329,12 @@ DRM_IOCTL_MODESET_CTL
     mode setting, since on many devices the vertical blank counter is
     reset to 0 at some point during modeset. Modern drivers should not
     call this any more since with kernel mode setting it is a no-op.
+
+Userspace API Structures
+========================
+
+.. kernel-doc:: include/uapi/drm/drm_mode.h
+   :doc: overview
+
+.. kernel-doc:: include/uapi/drm/drm_mode.h
+   :internal:
diff --git a/Documentation/gpu/i915.rst b/Documentation/gpu/i915.rst
index 055df45596c1..c38ef0dda605 100644
--- a/Documentation/gpu/i915.rst
+++ b/Documentation/gpu/i915.rst
@@ -61,7 +61,7 @@ Intel GVT-g Host Support(vGPU device model)
 Workarounds
 -----------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_workarounds.c
+.. kernel-doc:: drivers/gpu/drm/i915/gt/intel_workarounds.c
    :doc: Hardware workarounds
 
 Display Hardware Handling
@@ -82,13 +82,13 @@ change.
 Frontbuffer Tracking
 --------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_frontbuffer.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_frontbuffer.c
    :doc: frontbuffer tracking
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_frontbuffer.h
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_frontbuffer.h
    :internal:
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_frontbuffer.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_frontbuffer.c
    :internal:
 
 .. kernel-doc:: drivers/gpu/drm/i915/i915_gem.c
@@ -97,10 +97,10 @@ Frontbuffer Tracking
 Display FIFO Underrun Reporting
 -------------------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_fifo_underrun.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_fifo_underrun.c
    :doc: fifo underrun handling
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_fifo_underrun.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_fifo_underrun.c
    :internal:
 
 Plane Configuration
@@ -115,10 +115,10 @@ panel self refresh.
 Atomic Plane Helpers
 --------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_atomic_plane.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_atomic_plane.c
    :doc: atomic plane helpers
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_atomic_plane.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_atomic_plane.c
    :internal:
 
 Output Probing
@@ -132,19 +132,19 @@ probing, so those sections fully apply.
 Hotplug
 -------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_hotplug.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_hotplug.c
    :doc: Hotplug
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_hotplug.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_hotplug.c
    :internal:
 
 High Definition Audio
 ---------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_audio.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_audio.c
    :doc: High Definition Audio over HDMI and Display Port
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_audio.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_audio.c
    :internal:
 
 .. kernel-doc:: include/drm/i915_component.h
@@ -153,58 +153,58 @@ High Definition Audio
 Intel HDMI LPE Audio Support
 ----------------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_lpe_audio.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_lpe_audio.c
    :doc: LPE Audio integration for HDMI or DP playback
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_lpe_audio.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_lpe_audio.c
    :internal:
 
 Panel Self Refresh PSR (PSR/SRD)
 --------------------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_psr.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_psr.c
    :doc: Panel Self Refresh (PSR/SRD)
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_psr.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_psr.c
    :internal:
 
 Frame Buffer Compression (FBC)
 ------------------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_fbc.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_fbc.c
    :doc: Frame Buffer Compression (FBC)
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_fbc.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_fbc.c
    :internal:
 
 Display Refresh Rate Switching (DRRS)
 -------------------------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
    :doc: Display Refresh Rate Switching (DRRS)
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
    :functions: intel_dp_set_drrs_state
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
    :functions: intel_edp_drrs_enable
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
    :functions: intel_edp_drrs_disable
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
    :functions: intel_edp_drrs_invalidate
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
    :functions: intel_edp_drrs_flush
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
    :functions: intel_dp_drrs_init
 
 DPIO
 ----
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dpio_phy.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dpio_phy.c
    :doc: DPIO
 
 CSR firmware support for DMC
@@ -219,34 +219,34 @@ CSR firmware support for DMC
 Video BIOS Table (VBT)
 ----------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_bios.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_bios.c
    :doc: Video BIOS Table (VBT)
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_bios.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_bios.c
    :internal:
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_vbt_defs.h
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_vbt_defs.h
    :internal:
 
 Display clocks
 --------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_cdclk.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_cdclk.c
    :doc: CDCLK / RAWCLK
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_cdclk.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_cdclk.c
    :internal:
 
 Display PLLs
 ------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dpll_mgr.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dpll_mgr.c
    :doc: Display PLLs
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dpll_mgr.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dpll_mgr.c
    :internal:
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dpll_mgr.h
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dpll_mgr.h
    :internal:
 
 Memory Management and Command Submission
@@ -349,7 +349,7 @@ of buffer object caches. Shrinking is used to make main memory
 available. Note that this is mostly orthogonal to evicting buffer
 objects, which has the goal to make space in gpu virtual address spaces.
 
-.. kernel-doc:: drivers/gpu/drm/i915/i915_gem_shrinker.c
+.. kernel-doc:: drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
    :internal:
 
 Batchbuffer Parsing
@@ -373,18 +373,15 @@ Batchbuffer Pools
 User Batchbuffer Execution
 --------------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/i915_gem_execbuffer.c
+.. kernel-doc:: drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
    :doc: User command execution
 
 Logical Rings, Logical Ring Contexts and Execlists
 --------------------------------------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_lrc.c
+.. kernel-doc:: drivers/gpu/drm/i915/gt/intel_lrc.c
    :doc: Logical Rings, Logical Ring Contexts and Execlists
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_lrc.c
-   :internal:
-
 Global GTT views
 ----------------
 
@@ -415,10 +412,10 @@ Hardware Tiling and Swizzling Details
 Object Tiling IOCTLs
 --------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/i915_gem_tiling.c
+.. kernel-doc:: drivers/gpu/drm/i915/gem/i915_gem_tiling.c
    :internal:
 
-.. kernel-doc:: drivers/gpu/drm/i915/i915_gem_tiling.c
+.. kernel-doc:: drivers/gpu/drm/i915/gem/i915_gem_tiling.c
    :doc: buffer object tiling
 
 WOPCM
@@ -478,12 +475,6 @@ i915_context_create and i915_context_free
 .. kernel-doc:: drivers/gpu/drm/i915/i915_trace.h
    :doc: i915_context_create and i915_context_free tracepoints
 
-switch_mm
----------
-
-.. kernel-doc:: drivers/gpu/drm/i915/i915_trace.h
-   :doc: switch_mm tracepoint
-
 Perf
 ====
 
diff --git a/Documentation/gpu/mcde.rst b/Documentation/gpu/mcde.rst
new file mode 100644
index 000000000000..c69e977defda
--- /dev/null
+++ b/Documentation/gpu/mcde.rst
@@ -0,0 +1,8 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================================================
+ drm/mcde ST-Ericsson MCDE Multi-channel display engine
+=======================================================
+
+.. kernel-doc:: drivers/gpu/drm/mcde/mcde_drv.c
+   :doc: ST-Ericsson MCDE DRM Driver
diff --git a/Documentation/gpu/msm-crash-dump.rst b/Documentation/gpu/msm-crash-dump.rst
index 757cd257e0d8..240ef200f76c 100644
--- a/Documentation/gpu/msm-crash-dump.rst
+++ b/Documentation/gpu/msm-crash-dump.rst
@@ -1,3 +1,5 @@
+:orphan:
+
 =====================
 MSM Crash Dump Format
 =====================
diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst
index 1528ad2d598b..0a49c5a1d9ce 100644
--- a/Documentation/gpu/todo.rst
+++ b/Documentation/gpu/todo.rst
@@ -10,25 +10,6 @@ graphics subsystem useful as newbie projects. Or for slow rainy days.
 Subsystem-wide refactorings
 ===========================
 
-De-midlayer drivers
--------------------
-
-With the recent ``drm_bus`` cleanup patches for 3.17 it is no longer required
-to have a ``drm_bus`` structure set up. Drivers can directly set up the
-``drm_device`` structure instead of relying on bus methods in ``drm_usb.c``
-and ``drm_pci.c``. The goal is to get rid of the driver's ``->load`` /
-``->unload`` callbacks and open-code the load/unload sequence properly, using
-the new two-stage ``drm_device`` setup/teardown.
-
-Once all existing drivers are converted we can also remove those bus support
-files for USB and platform devices.
-
-All you need is a GPU for a non-converted driver (currently almost all of
-them, but also all the virtual ones used by KVM, so everyone qualifies).
-
-Contact: Daniel Vetter, Thierry Reding, respective driver maintainers
-
-
 Remove custom dumb_map_offset implementations
 ---------------------------------------------
 
@@ -247,6 +228,12 @@ struct drm_gem_object_funcs
 GEM objects can now have a function table instead of having the callbacks on the
 DRM driver struct. This is now the preferred way and drivers can be moved over.
 
+DRM_GEM_CMA_VMAP_DRIVER_OPS, DRM_GEM_SHMEM_DRIVER_OPS already support this, but
+DRM_GEM_VRAM_DRIVER_PRIME does not yet and needs to be aligned with the previous
+two. We also need a 2nd version of the CMA define that doesn't require the
+vmapping to be present (different hook for prime importing). Plus this needs to
+be rolled out to all drivers using their own implementations, too.
+
 Use DRM_MODESET_LOCK_ALL_* helpers instead of boilerplate
 ---------------------------------------------------------
 
@@ -300,6 +287,21 @@ it to use drm_mode_hsync() instead.
 
 Contact: Sean Paul
 
+drm_fb_helper tasks
+-------------------
+
+- drm_fb_helper_restore_fbdev_mode_unlocked() should call restore_fbdev_mode()
+  not the _force variant so it can bail out if there is a master. But first
+  these igt tests need to be fixed: kms_fbcon_fbt@psr and
+  kms_fbcon_fbt@psr-suspend.
+
+- The max connector argument for drm_fb_helper_init() and
+  drm_fb_helper_fbdev_setup() isn't used anymore and can be removed.
+
+- The helper doesn't keep an array of connectors anymore so these can be
+  removed: drm_fb_helper_single_add_all_connectors(),
+  drm_fb_helper_add_one_connector() and drm_fb_helper_remove_one_connector().
+
 Core refactorings
 =================
 
@@ -488,5 +490,20 @@ i915
   device_link_add to model the dependency between i915 and snd_had. See
   https://dri.freedesktop.org/docs/drm/driver-api/device_link.html
 
+Bootsplash
+==========
+
+There is support in place now for writing internal DRM clients making it
+possible to pick up the bootsplash work that was rejected because it was written
+for fbdev.
+
+- [v6,8/8] drm/client: Hack: Add bootsplash example
+  https://patchwork.freedesktop.org/patch/306579/
+
+- [RFC PATCH v2 00/13] Kernel based bootsplash
+  https://lkml.org/lkml/2017/12/13/764
+
+Contact: Sam Ravnborg
+
 Outside DRM
 ===========
diff --git a/Documentation/hid/hid-alps.rst b/Documentation/hid/hid-alps.rst
new file mode 100644
index 000000000000..e2f4c4c11e3f
--- /dev/null
+++ b/Documentation/hid/hid-alps.rst
@@ -0,0 +1,180 @@
+==========================
+ALPS HID Touchpad Protocol
+==========================
+
+Introduction
+------------
+Currently ALPS HID driver supports U1 Touchpad device.
+
+U1 device basic information.
+
+==========	======
+Vender ID	0x044E
+Product ID	0x120B
+Version ID	0x0121
+==========	======
+
+
+HID Descriptor
+--------------
+
+=======	====================	=====	=======================================
+Byte	Field			Value	Notes
+=======	====================	=====	=======================================
+0	wHIDDescLength		001E	Length of HID Descriptor : 30 bytes
+2	bcdVersion		0100	Compliant with Version 1.00
+4	wReportDescLength	00B2	Report Descriptor is 178 Bytes (0x00B2)
+6	wReportDescRegister	0002	Identifier to read Report Descriptor
+8	wInputRegister		0003	Identifier to read Input Report
+10	wMaxInputLength		0053	Input Report is 80 Bytes + 2
+12	wOutputRegister		0000	Identifier to read Output Report
+14	wMaxOutputLength	0000	No Output Reports
+16	wCommandRegister	0005	Identifier for Command Register
+18	wDataRegister		0006	Identifier for Data Register
+20	wVendorID		044E	Vendor ID 0x044E
+22	wProductID		120B	Product ID 0x120B
+24	wVersionID		0121	Version 01.21
+26	RESERVED		0000	RESERVED
+=======	====================	=====	=======================================
+
+
+Report ID
+---------
+
+==========	=================  =========================================
+ReportID-1	(Input Reports)	   (HIDUsage-Mouse) for TP&SP
+ReportID-2	(Input Reports)	   (HIDUsage-keyboard) for TP
+ReportID-3	(Input Reports)	   (Vendor Usage: Max 10 finger data) for TP
+ReportID-4	(Input Reports)	   (Vendor Usage: ON bit data) for GP
+ReportID-5	(Feature Reports)  Feature Reports
+ReportID-6	(Input Reports)	   (Vendor Usage: StickPointer data) for SP
+ReportID-7	(Feature Reports)  Flash update (Bootloader)
+==========	=================  =========================================
+
+
+Data pattern
+------------
+
+=====	==========	=====	=================
+Case1	ReportID_1	TP/SP	Relative/Relative
+Case2	ReportID_3	TP	Absolute
+	ReportID_6	SP	Absolute
+=====	==========	=====	=================
+
+
+Command Read/Write
+------------------
+To read/write to RAM, need to send a commands to the device.
+
+The command format is as below.
+
+DataByte(SET_REPORT)
+
+=====	======================
+Byte1	Command Byte
+Byte2	Address - Byte 0 (LSB)
+Byte3	Address - Byte 1
+Byte4	Address - Byte 2
+Byte5	Address - Byte 3 (MSB)
+Byte6	Value Byte
+Byte7	Checksum
+=====	======================
+
+Command Byte is read=0xD1/write=0xD2 .
+
+Address is read/write RAM address.
+
+Value Byte is writing data when you send the write commands.
+
+When you read RAM, there is no meaning.
+
+DataByte(GET_REPORT)
+
+=====	======================
+Byte1	Response Byte
+Byte2	Address - Byte 0 (LSB)
+Byte3	Address - Byte 1
+Byte4	Address - Byte 2
+Byte5	Address - Byte 3 (MSB)
+Byte6	Value Byte
+Byte7	Checksum
+=====	======================
+
+Read value is stored in Value Byte.
+
+
+Packet Format
+Touchpad data byte
+------------------
+
+
+======= ======= ======= ======= ======= ======= ======= ======= =====
+-	b7	b6	b5	b4	b3	b2	b1	b0
+======= ======= ======= ======= ======= ======= ======= ======= =====
+1	0	0	SW6	SW5	SW4	SW3	SW2	SW1
+2	0	0	0	Fcv	Fn3	Fn2	Fn1	Fn0
+3	Xa0_7	Xa0_6	Xa0_5	Xa0_4	Xa0_3	Xa0_2	Xa0_1	Xa0_0
+4	Xa0_15	Xa0_14	Xa0_13	Xa0_12	Xa0_11	Xa0_10	Xa0_9	Xa0_8
+5	Ya0_7	Ya0_6	Ya0_5	Ya0_4	Ya0_3	Ya0_2	Ya0_1	Ya0_0
+6	Ya0_15	Ya0_14	Ya0_13	Ya0_12	Ya0_11	Ya0_10	Ya0_9	Ya0_8
+7	LFB0	Zs0_6	Zs0_5	Zs0_4	Zs0_3	Zs0_2	Zs0_1	Zs0_0
+
+8	Xa1_7	Xa1_6	Xa1_5	Xa1_4	Xa1_3	Xa1_2	Xa1_1	Xa1_0
+9	Xa1_15	Xa1_14	Xa1_13	Xa1_12	Xa1_11	Xa1_10	Xa1_9	Xa1_8
+10	Ya1_7	Ya1_6	Ya1_5	Ya1_4	Ya1_3	Ya1_2	Ya1_1	Ya1_0
+11	Ya1_15	Ya1_14	Ya1_13	Ya1_12	Ya1_11	Ya1_10	Ya1_9	Ya1_8
+12	LFB1	Zs1_6	Zs1_5	Zs1_4	Zs1_3	Zs1_2	Zs1_1	Zs1_0
+
+13	Xa2_7	Xa2_6	Xa2_5	Xa2_4	Xa2_3	Xa2_2	Xa2_1	Xa2_0
+14	Xa2_15	Xa2_14	Xa2_13	Xa2_12	Xa2_11	Xa2_10	Xa2_9	Xa2_8
+15	Ya2_7	Ya2_6	Ya2_5	Ya2_4	Ya2_3	Ya2_2	Ya2_1	Ya2_0
+16	Ya2_15	Ya2_14	Ya2_13	Ya2_12	Ya2_11	Ya2_10	Ya2_9	Ya2_8
+17	LFB2	Zs2_6	Zs2_5	Zs2_4	Zs2_3	Zs2_2	Zs2_1	Zs2_0
+
+18	Xa3_7	Xa3_6	Xa3_5	Xa3_4	Xa3_3	Xa3_2	Xa3_1	Xa3_0
+19	Xa3_15	Xa3_14	Xa3_13	Xa3_12	Xa3_11	Xa3_10	Xa3_9	Xa3_8
+20	Ya3_7	Ya3_6	Ya3_5	Ya3_4	Ya3_3	Ya3_2	Ya3_1	Ya3_0
+21	Ya3_15	Ya3_14	Ya3_13	Ya3_12	Ya3_11	Ya3_10	Ya3_9	Ya3_8
+22	LFB3	Zs3_6	Zs3_5	Zs3_4	Zs3_3	Zs3_2	Zs3_1	Zs3_0
+
+23	Xa4_7	Xa4_6	Xa4_5	Xa4_4	Xa4_3	Xa4_2	Xa4_1	Xa4_0
+24	Xa4_15	Xa4_14	Xa4_13	Xa4_12	Xa4_11	Xa4_10	Xa4_9	Xa4_8
+25	Ya4_7	Ya4_6	Ya4_5	Ya4_4	Ya4_3	Ya4_2	Ya4_1	Ya4_0
+26	Ya4_15	Ya4_14	Ya4_13	Ya4_12	Ya4_11	Ya4_10	Ya4_9	Ya4_8
+27	LFB4	Zs4_6	Zs4_5	Zs4_4	Zs4_3	Zs4_2	Zs4_1	Zs4_0
+======= ======= ======= ======= ======= ======= ======= ======= =====
+
+
+SW1-SW6:
+	SW ON/OFF status
+Xan_15-0(16bit):
+	X Absolute data of the "n"th finger
+Yan_15-0(16bit):
+	Y Absolute data of the "n"th finger
+Zsn_6-0(7bit):
+	Operation area of the "n"th finger
+
+
+StickPointer data byte
+----------------------
+
+======= ======= ======= ======= ======= ======= ======= ======= =====
+-	b7	b6	b5	b4	b3	b2	b1	b0
+======= ======= ======= ======= ======= ======= ======= ======= =====
+Byte1	1	1	1	0	1	SW3	SW2	SW1
+Byte2	X7	X6	X5	X4	X3	X2	X1	X0
+Byte3	X15	X14	X13	X12	X11	X10	X9	X8
+Byte4	Y7	Y6	Y5	Y4	Y3	Y2	Y1	Y0
+Byte5	Y15	Y14	Y13	Y12	Y11	Y10	Y9	Y8
+Byte6	Z7	Z6	Z5	Z4	Z3	Z2	Z1	Z0
+Byte7	T&P	Z14	Z13	Z12	Z11	Z10	Z9	Z8
+======= ======= ======= ======= ======= ======= ======= ======= =====
+
+SW1-SW3:
+	SW ON/OFF status
+Xn_15-0(16bit):
+	X Absolute data
+Yn_15-0(16bit):
+	Y Absolute data
+Zn_14-0(15bit):
+	Z
diff --git a/Documentation/hid/hid-alps.txt b/Documentation/hid/hid-alps.txt
deleted file mode 100644
index 6b02a2447c77..000000000000
--- a/Documentation/hid/hid-alps.txt
+++ /dev/null
@@ -1,139 +0,0 @@
-ALPS HID Touchpad Protocol
-----------------------
-
-Introduction
-------------
-Currently ALPS HID driver supports U1 Touchpad device.
-
-U1 devuce basic information.
-Vender ID	0x044E
-Product ID	0x120B
-Version ID	0x0121
-
-
-HID Descriptor
-------------
-Byte	Field			Value	Notes
-0	wHIDDescLength		001E	Length of HID Descriptor : 30 bytes
-2	bcdVersion		0100	Compliant with Version 1.00
-4	wReportDescLength	00B2	Report Descriptor is 178 Bytes (0x00B2)
-6	wReportDescRegister	0002	Identifier to read Report Descriptor
-8	wInputRegister		0003	Identifier to read Input Report
-10	wMaxInputLength		0053	Input Report is 80 Bytes + 2
-12	wOutputRegister		0000	Identifier to read Output Report
-14	wMaxOutputLength	0000	No Output Reports
-16	wCommandRegister	0005	Identifier for Command Register
-18	wDataRegister		0006	Identifier for Data Register
-20	wVendorID		044E	Vendor ID 0x044E
-22	wProductID		120B	Product ID 0x120B
-24	wVersionID		0121	Version 01.21
-26	RESERVED		0000	RESERVED
-
-
-Report ID
-------------
-ReportID-1	(Input Reports)	(HIDUsage-Mouse) for TP&SP
-ReportID-2	(Input Reports)	(HIDUsage-keyboard) for TP
-ReportID-3	(Input Reports)	(Vendor Usage: Max 10 finger data) for TP
-ReportID-4	(Input Reports)	(Vendor Usage: ON bit data) for GP
-ReportID-5	(Feature Reports)	Feature Reports
-ReportID-6	(Input Reports)	(Vendor Usage: StickPointer data) for SP
-ReportID-7	(Feature Reports)	Flash update (Bootloader)
-
-
-Data pattern
-------------
-Case1	ReportID_1	TP/SP	Relative/Relative
-Case2	ReportID_3	TP	Absolute
-	ReportID_6	SP	Absolute
-
-
-Command Read/Write
-------------------
-To read/write to RAM, need to send a commands to the device.
-The command format is as below.
-
-DataByte(SET_REPORT)
-Byte1	Command Byte
-Byte2	Address - Byte 0 (LSB)
-Byte3	Address - Byte 1
-Byte4	Address - Byte 2
-Byte5	Address - Byte 3 (MSB)
-Byte6	Value Byte
-Byte7	Checksum
-
-Command Byte is read=0xD1/write=0xD2 .
-Address is read/write RAM address.
-Value Byte is writing data when you send the write commands.
-When you read RAM, there is no meaning.
-
-DataByte(GET_REPORT)
-Byte1	Response Byte
-Byte2	Address - Byte 0 (LSB)
-Byte3	Address - Byte 1
-Byte4	Address - Byte 2
-Byte5	Address - Byte 3 (MSB)
-Byte6	Value Byte
-Byte7	Checksum
-
-Read value is stored in Value Byte.
-
-
-Packet Format
-Touchpad data byte
-------------------
-	b7	b6	b5	b4	b3	b2	b1	b0
-1	0	0	SW6	SW5	SW4	SW3	SW2	SW1
-2	0	0	0	Fcv	Fn3	Fn2	Fn1	Fn0
-3	Xa0_7	Xa0_6	Xa0_5	Xa0_4	Xa0_3	Xa0_2	Xa0_1	Xa0_0
-4	Xa0_15	Xa0_14	Xa0_13	Xa0_12	Xa0_11	Xa0_10	Xa0_9	Xa0_8
-5	Ya0_7	Ya0_6	Ya0_5	Ya0_4	Ya0_3	Ya0_2	Ya0_1	Ya0_0
-6	Ya0_15	Ya0_14	Ya0_13	Ya0_12	Ya0_11	Ya0_10	Ya0_9	Ya0_8
-7	LFB0	Zs0_6	Zs0_5	Zs0_4	Zs0_3	Zs0_2	Zs0_1	Zs0_0
-
-8	Xa1_7	Xa1_6	Xa1_5	Xa1_4	Xa1_3	Xa1_2	Xa1_1	Xa1_0
-9	Xa1_15	Xa1_14	Xa1_13	Xa1_12	Xa1_11	Xa1_10	Xa1_9	Xa1_8
-10	Ya1_7	Ya1_6	Ya1_5	Ya1_4	Ya1_3	Ya1_2	Ya1_1	Ya1_0
-11	Ya1_15	Ya1_14	Ya1_13	Ya1_12	Ya1_11	Ya1_10	Ya1_9	Ya1_8
-12	LFB1	Zs1_6	Zs1_5	Zs1_4	Zs1_3	Zs1_2	Zs1_1	Zs1_0
-
-13	Xa2_7	Xa2_6	Xa2_5	Xa2_4	Xa2_3	Xa2_2	Xa2_1	Xa2_0
-14	Xa2_15	Xa2_14	Xa2_13	Xa2_12	Xa2_11	Xa2_10	Xa2_9	Xa2_8
-15	Ya2_7	Ya2_6	Ya2_5	Ya2_4	Ya2_3	Ya2_2	Ya2_1	Ya2_0
-16	Ya2_15	Ya2_14	Ya2_13	Ya2_12	Ya2_11	Ya2_10	Ya2_9	Ya2_8
-17	LFB2	Zs2_6	Zs2_5	Zs2_4	Zs2_3	Zs2_2	Zs2_1	Zs2_0
-
-18	Xa3_7	Xa3_6	Xa3_5	Xa3_4	Xa3_3	Xa3_2	Xa3_1	Xa3_0
-19	Xa3_15	Xa3_14	Xa3_13	Xa3_12	Xa3_11	Xa3_10	Xa3_9	Xa3_8
-20	Ya3_7	Ya3_6	Ya3_5	Ya3_4	Ya3_3	Ya3_2	Ya3_1	Ya3_0
-21	Ya3_15	Ya3_14	Ya3_13	Ya3_12	Ya3_11	Ya3_10	Ya3_9	Ya3_8
-22	LFB3	Zs3_6	Zs3_5	Zs3_4	Zs3_3	Zs3_2	Zs3_1	Zs3_0
-
-23	Xa4_7	Xa4_6	Xa4_5	Xa4_4	Xa4_3	Xa4_2	Xa4_1	Xa4_0
-24	Xa4_15	Xa4_14	Xa4_13	Xa4_12	Xa4_11	Xa4_10	Xa4_9	Xa4_8
-25	Ya4_7	Ya4_6	Ya4_5	Ya4_4	Ya4_3	Ya4_2	Ya4_1	Ya4_0
-26	Ya4_15	Ya4_14	Ya4_13	Ya4_12	Ya4_11	Ya4_10	Ya4_9	Ya4_8
-27	LFB4	Zs4_6	Zs4_5	Zs4_4	Zs4_3	Zs4_2	Zs4_1	Zs4_0
-
-
-SW1-SW6:	SW ON/OFF status
-Xan_15-0(16bit):X Absolute data of the "n"th finger
-Yan_15-0(16bit):Y Absolute data of the "n"th finger
-Zsn_6-0(7bit):	Operation area of the "n"th finger
-
-
-StickPointer data byte
-------------------
-	b7	b6	b5	b4	b3	b2	b1	b0
-Byte1	1	1	1	0	1	SW3	SW2	SW1
-Byte2	X7	X6	X5	X4	X3	X2	X1	X0
-Byte3	X15	X14	X13	X12	X11	X10	X9	X8
-Byte4	Y7	Y6	Y5	Y4	Y3	Y2	Y1	Y0
-Byte5	Y15	Y14	Y13	Y12	Y11	Y10	Y9	Y8
-Byte6	Z7	Z6	Z5	Z4	Z3	Z2	Z1	Z0
-Byte7	T&P	Z14	Z13	Z12	Z11	Z10	Z9	Z8
-
-SW1-SW3:	SW ON/OFF status
-Xn_15-0(16bit):X Absolute data
-Yn_15-0(16bit):Y Absolute data
-Zn_14-0(15bit):Z
diff --git a/Documentation/hid/hid-sensor.rst b/Documentation/hid/hid-sensor.rst
new file mode 100644
index 000000000000..758972e34971
--- /dev/null
+++ b/Documentation/hid/hid-sensor.rst
@@ -0,0 +1,242 @@
+=====================
+HID Sensors Framework
+=====================
+HID sensor framework provides necessary interfaces to implement sensor drivers,
+which are connected to a sensor hub. The sensor hub is a HID device and it provides
+a report descriptor conforming to HID 1.12 sensor usage tables.
+
+Description from the HID 1.12 "HID Sensor Usages" specification:
+"Standardization of HID usages for sensors would allow (but not require) sensor
+hardware vendors to provide a consistent Plug And Play interface at the USB boundary,
+thereby enabling some operating systems to incorporate common device drivers that
+could be reused between vendors, alleviating any need for the vendors to provide
+the drivers themselves."
+
+This specification describes many usage IDs, which describe the type of sensor
+and also the individual data fields. Each sensor can have variable number of
+data fields. The length and order is specified in the report descriptor. For
+example a part of report descriptor can look like::
+
+     INPUT(1)[INPUT]
+   ..
+      Field(2)
+        Physical(0020.0073)
+        Usage(1)
+          0020.045f
+        Logical Minimum(-32767)
+        Logical Maximum(32767)
+        Report Size(8)
+        Report Count(1)
+        Report Offset(16)
+        Flags(Variable Absolute)
+  ..
+  ..
+
+The report is indicating "sensor page (0x20)" contains an accelerometer-3D (0x73).
+This accelerometer-3D has some fields. Here for example field 2 is motion intensity
+(0x045f) with a logical minimum value of -32767 and logical maximum of 32767. The
+order of fields and length of each field is important as the input event raw
+data will use this format.
+
+
+Implementation
+==============
+
+This specification defines many different types of sensors with different sets of
+data fields. It is difficult to have a common input event to user space applications,
+for different sensors. For example an accelerometer can send X,Y and Z data, whereas
+an ambient light sensor can send illumination data.
+So the implementation has two parts:
+
+- Core hid driver
+- Individual sensor processing part (sensor drivers)
+
+Core driver
+-----------
+The core driver registers (hid-sensor-hub) registers as a HID driver. It parses
+report descriptors and identifies all the sensors present. It adds an MFD device
+with name HID-SENSOR-xxxx (where xxxx is usage id from the specification).
+
+For example:
+
+HID-SENSOR-200073 is registered for an Accelerometer 3D driver.
+
+So if any driver with this name is inserted, then the probe routine for that
+function will be called. So an accelerometer processing driver can register
+with this name and will be probed if there is an accelerometer-3D detected.
+
+The core driver provides a set of APIs which can be used by the processing
+drivers to register and get events for that usage id. Also it provides parsing
+functions, which get and set each input/feature/output report.
+
+Individual sensor processing part (sensor drivers)
+--------------------------------------------------
+
+The processing driver will use an interface provided by the core driver to parse
+the report and get the indexes of the fields and also can get events. This driver
+can use IIO interface to use the standard ABI defined for a type of sensor.
+
+
+Core driver Interface
+=====================
+
+Callback structure::
+
+  Each processing driver can use this structure to set some callbacks.
+	int (*suspend)(..): Callback when HID suspend is received
+	int (*resume)(..): Callback when HID resume is received
+	int (*capture_sample)(..): Capture a sample for one of its data fields
+	int (*send_event)(..): One complete event is received which can have
+                               multiple data fields.
+
+Registration functions::
+
+  int sensor_hub_register_callback(struct hid_sensor_hub_device *hsdev,
+			u32 usage_id,
+			struct hid_sensor_hub_callbacks *usage_callback):
+
+Registers callbacks for an usage id. The callback functions are not allowed
+to sleep::
+
+
+  int sensor_hub_remove_callback(struct hid_sensor_hub_device *hsdev,
+			u32 usage_id):
+
+Removes callbacks for an usage id.
+
+
+Parsing function::
+
+  int sensor_hub_input_get_attribute_info(struct hid_sensor_hub_device *hsdev,
+			u8 type,
+			u32 usage_id, u32 attr_usage_id,
+			struct hid_sensor_hub_attribute_info *info);
+
+A processing driver can look for some field of interest and check if it exists
+in a report descriptor. If it exists it will store necessary information
+so that fields can be set or get individually.
+These indexes avoid searching every time and getting field index to get or set.
+
+
+Set Feature report::
+
+  int sensor_hub_set_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
+			u32 field_index, s32 value);
+
+This interface is used to set a value for a field in feature report. For example
+if there is a field report_interval, which is parsed by a call to
+sensor_hub_input_get_attribute_info before, then it can directly set that
+individual field::
+
+
+  int sensor_hub_get_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
+			u32 field_index, s32 *value);
+
+This interface is used to get a value for a field in input report. For example
+if there is a field report_interval, which is parsed by a call to
+sensor_hub_input_get_attribute_info before, then it can directly get that
+individual field value::
+
+
+  int sensor_hub_input_attr_get_raw_value(struct hid_sensor_hub_device *hsdev,
+			u32 usage_id,
+			u32 attr_usage_id, u32 report_id);
+
+This is used to get a particular field value through input reports. For example
+accelerometer wants to poll X axis value, then it can call this function with
+the usage id of X axis. HID sensors can provide events, so this is not necessary
+to poll for any field. If there is some new sample, the core driver will call
+registered callback function to process the sample.
+
+
+----------
+
+HID Custom and generic Sensors
+------------------------------
+
+
+HID Sensor specification defines two special sensor usage types. Since they
+don't represent a standard sensor, it is not possible to define using Linux IIO
+type interfaces.
+The purpose of these sensors is to extend the functionality or provide a
+way to obfuscate the data being communicated by a sensor. Without knowing the
+mapping between the data and its encapsulated form, it is difficult for
+an application/driver to determine what data is being communicated by the sensor.
+This allows some differentiating use cases, where vendor can provide applications.
+Some common use cases are debug other sensors or to provide some events like
+keyboard attached/detached or lid open/close.
+
+To allow application to utilize these sensors, here they are exported uses sysfs
+attribute groups, attributes and misc device interface.
+
+An example of this representation on sysfs::
+
+  /sys/devices/pci0000:00/INT33C2:00/i2c-0/i2c-INT33D1:00/0018:8086:09FA.0001/HID-SENSOR-2000e1.6.auto$ tree -R
+  .
+  │   ├──  enable_sensor
+  │   │   ├── feature-0-200316
+  │   │   │   ├── feature-0-200316-maximum
+  │   │   │   ├── feature-0-200316-minimum
+  │   │   │   ├── feature-0-200316-name
+  │   │   │   ├── feature-0-200316-size
+  │   │   │   ├── feature-0-200316-unit-expo
+  │   │   │   ├── feature-0-200316-units
+  │   │   │   ├── feature-0-200316-value
+  │   │   ├── feature-1-200201
+  │   │   │   ├── feature-1-200201-maximum
+  │   │   │   ├── feature-1-200201-minimum
+  │   │   │   ├── feature-1-200201-name
+  │   │   │   ├── feature-1-200201-size
+  │   │   │   ├── feature-1-200201-unit-expo
+  │   │   │   ├── feature-1-200201-units
+  │   │   │   ├── feature-1-200201-value
+  │   │   ├── input-0-200201
+  │   │   │   ├── input-0-200201-maximum
+  │   │   │   ├── input-0-200201-minimum
+  │   │   │   ├── input-0-200201-name
+  │   │   │   ├── input-0-200201-size
+  │   │   │   ├── input-0-200201-unit-expo
+  │   │   │   ├── input-0-200201-units
+  │   │   │   ├── input-0-200201-value
+  │   │   ├── input-1-200202
+  │   │   │   ├── input-1-200202-maximum
+  │   │   │   ├── input-1-200202-minimum
+  │   │   │   ├── input-1-200202-name
+  │   │   │   ├── input-1-200202-size
+  │   │   │   ├── input-1-200202-unit-expo
+  │   │   │   ├── input-1-200202-units
+  │   │   │   ├── input-1-200202-value
+
+Here there is a custom sensors with four fields, two feature and two inputs.
+Each field is represented by a set of attributes. All fields except the "value"
+are read only. The value field is a RW field.
+
+Example::
+
+  /sys/bus/platform/devices/HID-SENSOR-2000e1.6.auto/feature-0-200316$ grep -r . *
+  feature-0-200316-maximum:6
+  feature-0-200316-minimum:0
+  feature-0-200316-name:property-reporting-state
+  feature-0-200316-size:1
+  feature-0-200316-unit-expo:0
+  feature-0-200316-units:25
+  feature-0-200316-value:1
+
+How to enable such sensor?
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By default sensor can be power gated. To enable sysfs attribute "enable" can be
+used::
+
+	$ echo 1 > enable_sensor
+
+Once enabled and powered on, sensor can report value using HID reports.
+These reports are pushed using misc device interface in a FIFO order::
+
+	/dev$ tree | grep HID-SENSOR-2000e1.6.auto
+	│   │   │   ├── 10:53 -> ../HID-SENSOR-2000e1.6.auto
+	│   ├──  HID-SENSOR-2000e1.6.auto
+
+Each reports can be of variable length preceded by a header. This header
+consist of a 32 bit usage id, 64 bit time stamp and 32 bit length field of raw
+data.
diff --git a/Documentation/hid/hid-sensor.txt b/Documentation/hid/hid-sensor.txt
deleted file mode 100644
index b287752a31cd..000000000000
--- a/Documentation/hid/hid-sensor.txt
+++ /dev/null
@@ -1,224 +0,0 @@
-
-HID Sensors Framework
-======================
-HID sensor framework provides necessary interfaces to implement sensor drivers,
-which are connected to a sensor hub. The sensor hub is a HID device and it provides
-a report descriptor conforming to HID 1.12 sensor usage tables.
-
-Description from the HID 1.12 "HID Sensor Usages" specification:
-"Standardization of HID usages for sensors would allow (but not require) sensor
-hardware vendors to provide a consistent Plug And Play interface at the USB boundary,
-thereby enabling some operating systems to incorporate common device drivers that
-could be reused between vendors, alleviating any need for the vendors to provide
-the drivers themselves."
-
-This specification describes many usage IDs, which describe the type of sensor
-and also the individual data fields. Each sensor can have variable number of
-data fields. The length and order is specified in the report descriptor. For
-example a part of report descriptor can look like:
-
-   INPUT(1)[INPUT]
- ..
-    Field(2)
-      Physical(0020.0073)
-      Usage(1)
-        0020.045f
-      Logical Minimum(-32767)
-      Logical Maximum(32767)
-      Report Size(8)
-      Report Count(1)
-      Report Offset(16)
-      Flags(Variable Absolute)
-..
-..
-
-The report is indicating "sensor page (0x20)" contains an accelerometer-3D (0x73).
-This accelerometer-3D has some fields. Here for example field 2 is motion intensity
-(0x045f) with a logical minimum value of -32767 and logical maximum of 32767. The
-order of fields and length of each field is important as the input event raw
-data will use this format.
-
-
-Implementation
-=================
-
-This specification defines many different types of sensors with different sets of
-data fields. It is difficult to have a common input event to user space applications,
-for different sensors. For example an accelerometer can send X,Y and Z data, whereas
-an ambient light sensor can send illumination data.
-So the implementation has two parts:
-- Core hid driver
-- Individual sensor processing part (sensor drivers)
-
-Core driver
------------
-The core driver registers (hid-sensor-hub) registers as a HID driver. It parses
-report descriptors and identifies all the sensors present. It adds an MFD device
-with name HID-SENSOR-xxxx (where xxxx is usage id from the specification).
-For example
-HID-SENSOR-200073 is registered for an Accelerometer 3D driver.
-So if any driver with this name is inserted, then the probe routine for that
-function will be called. So an accelerometer processing driver can register
-with this name and will be probed if there is an accelerometer-3D detected.
-
-The core driver provides a set of APIs which can be used by the processing
-drivers to register and get events for that usage id. Also it provides parsing
-functions, which get and set each input/feature/output report.
-
-Individual sensor processing part (sensor drivers)
------------
-The processing driver will use an interface provided by the core driver to parse
-the report and get the indexes of the fields and also can get events. This driver
-can use IIO interface to use the standard ABI defined for a type of sensor.
-
-
-Core driver Interface
-=====================
-
-Callback structure:
-Each processing driver can use this structure to set some callbacks.
-	int (*suspend)(..): Callback when HID suspend is received
-	int (*resume)(..): Callback when HID resume is received
-	int (*capture_sample)(..): Capture a sample for one of its data fields
-	int (*send_event)(..): One complete event is received which can have
-                               multiple data fields.
-
-Registration functions:
-int sensor_hub_register_callback(struct hid_sensor_hub_device *hsdev,
-			u32 usage_id,
-			struct hid_sensor_hub_callbacks *usage_callback):
-
-Registers callbacks for an usage id. The callback functions are not allowed
-to sleep.
-
-
-int sensor_hub_remove_callback(struct hid_sensor_hub_device *hsdev,
-			u32 usage_id):
-
-Removes callbacks for an usage id.
-
-
-Parsing function:
-int sensor_hub_input_get_attribute_info(struct hid_sensor_hub_device *hsdev,
-			u8 type,
-			u32 usage_id, u32 attr_usage_id,
-			struct hid_sensor_hub_attribute_info *info);
-
-A processing driver can look for some field of interest and check if it exists
-in a report descriptor. If it exists it will store necessary information
-so that fields can be set or get individually.
-These indexes avoid searching every time and getting field index to get or set.
-
-
-Set Feature report
-int sensor_hub_set_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
-			u32 field_index, s32 value);
-
-This interface is used to set a value for a field in feature report. For example
-if there is a field report_interval, which is parsed by a call to
-sensor_hub_input_get_attribute_info before, then it can directly set that individual
-field.
-
-
-int sensor_hub_get_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
-			u32 field_index, s32 *value);
-
-This interface is used to get a value for a field in input report. For example
-if there is a field report_interval, which is parsed by a call to
-sensor_hub_input_get_attribute_info before, then it can directly get that individual
-field value.
-
-
-int sensor_hub_input_attr_get_raw_value(struct hid_sensor_hub_device *hsdev,
-			u32 usage_id,
-			u32 attr_usage_id, u32 report_id);
-
-This is used to get a particular field value through input reports. For example
-accelerometer wants to poll X axis value, then it can call this function with
-the usage id of X axis. HID sensors can provide events, so this is not necessary
-to poll for any field. If there is some new sample, the core driver will call
-registered callback function to process the sample.
-
-
-----------
-
-HID Custom and generic Sensors
-
-HID Sensor specification defines two special sensor usage types. Since they
-don't represent a standard sensor, it is not possible to define using Linux IIO
-type interfaces.
-The purpose of these sensors is to extend the functionality or provide a
-way to obfuscate the data being communicated by a sensor. Without knowing the
-mapping between the data and its encapsulated form, it is difficult for
-an application/driver to determine what data is being communicated by the sensor.
-This allows some differentiating use cases, where vendor can provide applications.
-Some common use cases are debug other sensors or to provide some events like
-keyboard attached/detached or lid open/close.
-
-To allow application to utilize these sensors, here they are exported uses sysfs
-attribute groups, attributes and misc device interface.
-
-An example of this representation on sysfs:
-/sys/devices/pci0000:00/INT33C2:00/i2c-0/i2c-INT33D1:00/0018:8086:09FA.0001/HID-SENSOR-2000e1.6.auto$ tree -R
-.
-????????? enable_sensor
-????????? feature-0-200316
-??????? ????????? feature-0-200316-maximum
-??????? ????????? feature-0-200316-minimum
-??????? ????????? feature-0-200316-name
-??????? ????????? feature-0-200316-size
-??????? ????????? feature-0-200316-unit-expo
-??????? ????????? feature-0-200316-units
-??????? ????????? feature-0-200316-value
-????????? feature-1-200201
-??????? ????????? feature-1-200201-maximum
-??????? ????????? feature-1-200201-minimum
-??????? ????????? feature-1-200201-name
-??????? ????????? feature-1-200201-size
-??????? ????????? feature-1-200201-unit-expo
-??????? ????????? feature-1-200201-units
-??????? ????????? feature-1-200201-value
-????????? input-0-200201
-??????? ????????? input-0-200201-maximum
-??????? ????????? input-0-200201-minimum
-??????? ????????? input-0-200201-name
-??????? ????????? input-0-200201-size
-??????? ????????? input-0-200201-unit-expo
-??????? ????????? input-0-200201-units
-??????? ????????? input-0-200201-value
-????????? input-1-200202
-??????? ????????? input-1-200202-maximum
-??????? ????????? input-1-200202-minimum
-??????? ????????? input-1-200202-name
-??????? ????????? input-1-200202-size
-??????? ????????? input-1-200202-unit-expo
-??????? ????????? input-1-200202-units
-??????? ????????? input-1-200202-value
-
-Here there is a custom sensors with four fields, two feature and two inputs.
-Each field is represented by a set of attributes. All fields except the "value"
-are read only. The value field is a RW field.
-Example
-/sys/bus/platform/devices/HID-SENSOR-2000e1.6.auto/feature-0-200316$ grep -r . *
-feature-0-200316-maximum:6
-feature-0-200316-minimum:0
-feature-0-200316-name:property-reporting-state
-feature-0-200316-size:1
-feature-0-200316-unit-expo:0
-feature-0-200316-units:25
-feature-0-200316-value:1
-
-How to enable such sensor?
-By default sensor can be power gated. To enable sysfs attribute "enable" can be
-used.
-$ echo 1 > enable_sensor
-
-Once enabled and powered on, sensor can report value using HID reports.
-These reports are pushed using misc device interface in a FIFO order.
-/dev$ tree | grep HID-SENSOR-2000e1.6.auto
-??????? ????????? 10:53 -> ../HID-SENSOR-2000e1.6.auto
-????????? HID-SENSOR-2000e1.6.auto
-
-Each reports can be of variable length preceded by a header. This header
-consist of a 32 bit usage id, 64 bit time stamp and 32 bit length field of raw
-data.
diff --git a/Documentation/hid/hid-transport.rst b/Documentation/hid/hid-transport.rst
new file mode 100644
index 000000000000..0fe526f36db6
--- /dev/null
+++ b/Documentation/hid/hid-transport.rst
@@ -0,0 +1,359 @@
+=========================
+HID I/O Transport Drivers
+=========================
+
+The HID subsystem is independent of the underlying transport driver. Initially,
+only USB was supported, but other specifications adopted the HID design and
+provided new transport drivers. The kernel includes at least support for USB,
+Bluetooth, I2C and user-space I/O drivers.
+
+1) HID Bus
+==========
+
+The HID subsystem is designed as a bus. Any I/O subsystem may provide HID
+devices and register them with the HID bus. HID core then loads generic device
+drivers on top of it. The transport drivers are responsible of raw data
+transport and device setup/management. HID core is responsible of
+report-parsing, report interpretation and the user-space API. Device specifics
+and quirks are handled by all layers depending on the quirk.
+
+::
+
+ +-----------+  +-----------+            +-----------+  +-----------+
+ | Device #1 |  | Device #i |            | Device #j |  | Device #k |
+ +-----------+  +-----------+            +-----------+  +-----------+
+          \\      //                              \\      //
+        +------------+                          +------------+
+        | I/O Driver |                          | I/O Driver |
+        +------------+                          +------------+
+              ||                                      ||
+     +------------------+                    +------------------+
+     | Transport Driver |                    | Transport Driver |
+     +------------------+                    +------------------+
+                       \___                ___/
+                           \              /
+                          +----------------+
+                          |    HID Core    |
+                          +----------------+
+                           /  |        |  \
+                          /   |        |   \
+             ____________/    |        |    \_________________
+            /                 |        |                      \
+           /                  |        |                       \
+ +----------------+  +-----------+  +------------------+  +------------------+
+ | Generic Driver |  | MT Driver |  | Custom Driver #1 |  | Custom Driver #2 |
+ +----------------+  +-----------+  +------------------+  +------------------+
+
+Example Drivers:
+
+  - I/O: USB, I2C, Bluetooth-l2cap
+  - Transport: USB-HID, I2C-HID, BT-HIDP
+
+Everything below "HID Core" is simplified in this graph as it is only of
+interest to HID device drivers. Transport drivers do not need to know the
+specifics.
+
+1.1) Device Setup
+-----------------
+
+I/O drivers normally provide hotplug detection or device enumeration APIs to the
+transport drivers. Transport drivers use this to find any suitable HID device.
+They allocate HID device objects and register them with HID core. Transport
+drivers are not required to register themselves with HID core. HID core is never
+aware of which transport drivers are available and is not interested in it. It
+is only interested in devices.
+
+Transport drivers attach a constant "struct hid_ll_driver" object with each
+device. Once a device is registered with HID core, the callbacks provided via
+this struct are used by HID core to communicate with the device.
+
+Transport drivers are responsible of detecting device failures and unplugging.
+HID core will operate a device as long as it is registered regardless of any
+device failures. Once transport drivers detect unplug or failure events, they
+must unregister the device from HID core and HID core will stop using the
+provided callbacks.
+
+1.2) Transport Driver Requirements
+----------------------------------
+
+The terms "asynchronous" and "synchronous" in this document describe the
+transmission behavior regarding acknowledgements. An asynchronous channel must
+not perform any synchronous operations like waiting for acknowledgements or
+verifications. Generally, HID calls operating on asynchronous channels must be
+running in atomic-context just fine.
+On the other hand, synchronous channels can be implemented by the transport
+driver in whatever way they like. They might just be the same as asynchronous
+channels, but they can also provide acknowledgement reports, automatic
+retransmission on failure, etc. in a blocking manner. If such functionality is
+required on asynchronous channels, a transport-driver must implement that via
+its own worker threads.
+
+HID core requires transport drivers to follow a given design. A Transport
+driver must provide two bi-directional I/O channels to each HID device. These
+channels must not necessarily be bi-directional in the hardware itself. A
+transport driver might just provide 4 uni-directional channels. Or it might
+multiplex all four on a single physical channel. However, in this document we
+will describe them as two bi-directional channels as they have several
+properties in common.
+
+ - Interrupt Channel (intr): The intr channel is used for asynchronous data
+   reports. No management commands or data acknowledgements are sent on this
+   channel. Any unrequested incoming or outgoing data report must be sent on
+   this channel and is never acknowledged by the remote side. Devices usually
+   send their input events on this channel. Outgoing events are normally
+   not send via intr, except if high throughput is required.
+ - Control Channel (ctrl): The ctrl channel is used for synchronous requests and
+   device management. Unrequested data input events must not be sent on this
+   channel and are normally ignored. Instead, devices only send management
+   events or answers to host requests on this channel.
+   The control-channel is used for direct blocking queries to the device
+   independent of any events on the intr-channel.
+   Outgoing reports are usually sent on the ctrl channel via synchronous
+   SET_REPORT requests.
+
+Communication between devices and HID core is mostly done via HID reports. A
+report can be of one of three types:
+
+ - INPUT Report: Input reports provide data from device to host. This
+   data may include button events, axis events, battery status or more. This
+   data is generated by the device and sent to the host with or without
+   requiring explicit requests. Devices can choose to send data continuously or
+   only on change.
+ - OUTPUT Report: Output reports change device states. They are sent from host
+   to device and may include LED requests, rumble requests or more. Output
+   reports are never sent from device to host, but a host can retrieve their
+   current state.
+   Hosts may choose to send output reports either continuously or only on
+   change.
+ - FEATURE Report: Feature reports are used for specific static device features
+   and never reported spontaneously. A host can read and/or write them to access
+   data like battery-state or device-settings.
+   Feature reports are never sent without requests. A host must explicitly set
+   or retrieve a feature report. This also means, feature reports are never sent
+   on the intr channel as this channel is asynchronous.
+
+INPUT and OUTPUT reports can be sent as pure data reports on the intr channel.
+For INPUT reports this is the usual operational mode. But for OUTPUT reports,
+this is rarely done as OUTPUT reports are normally quite scarce. But devices are
+free to make excessive use of asynchronous OUTPUT reports (for instance, custom
+HID audio speakers make great use of it).
+
+Plain reports must not be sent on the ctrl channel, though. Instead, the ctrl
+channel provides synchronous GET/SET_REPORT requests. Plain reports are only
+allowed on the intr channel and are the only means of data there.
+
+ - GET_REPORT: A GET_REPORT request has a report ID as payload and is sent
+   from host to device. The device must answer with a data report for the
+   requested report ID on the ctrl channel as a synchronous acknowledgement.
+   Only one GET_REPORT request can be pending for each device. This restriction
+   is enforced by HID core as several transport drivers don't allow multiple
+   simultaneous GET_REPORT requests.
+   Note that data reports which are sent as answer to a GET_REPORT request are
+   not handled as generic device events. That is, if a device does not operate
+   in continuous data reporting mode, an answer to GET_REPORT does not replace
+   the raw data report on the intr channel on state change.
+   GET_REPORT is only used by custom HID device drivers to query device state.
+   Normally, HID core caches any device state so this request is not necessary
+   on devices that follow the HID specs except during device initialization to
+   retrieve the current state.
+   GET_REPORT requests can be sent for any of the 3 report types and shall
+   return the current report state of the device. However, OUTPUT reports as
+   payload may be blocked by the underlying transport driver if the
+   specification does not allow them.
+ - SET_REPORT: A SET_REPORT request has a report ID plus data as payload. It is
+   sent from host to device and a device must update it's current report state
+   according to the given data. Any of the 3 report types can be used. However,
+   INPUT reports as payload might be blocked by the underlying transport driver
+   if the specification does not allow them.
+   A device must answer with a synchronous acknowledgement. However, HID core
+   does not require transport drivers to forward this acknowledgement to HID
+   core.
+   Same as for GET_REPORT, only one SET_REPORT can be pending at a time. This
+   restriction is enforced by HID core as some transport drivers do not support
+   multiple synchronous SET_REPORT requests.
+
+Other ctrl-channel requests are supported by USB-HID but are not available
+(or deprecated) in most other transport level specifications:
+
+ - GET/SET_IDLE: Only used by USB-HID and I2C-HID.
+ - GET/SET_PROTOCOL: Not used by HID core.
+ - RESET: Used by I2C-HID, not hooked up in HID core.
+ - SET_POWER: Used by I2C-HID, not hooked up in HID core.
+
+2) HID API
+==========
+
+2.1) Initialization
+-------------------
+
+Transport drivers normally use the following procedure to register a new device
+with HID core::
+
+	struct hid_device *hid;
+	int ret;
+
+	hid = hid_allocate_device();
+	if (IS_ERR(hid)) {
+		ret = PTR_ERR(hid);
+		goto err_<...>;
+	}
+
+	strscpy(hid->name, <device-name-src>, sizeof(hid->name));
+	strscpy(hid->phys, <device-phys-src>, sizeof(hid->phys));
+	strscpy(hid->uniq, <device-uniq-src>, sizeof(hid->uniq));
+
+	hid->ll_driver = &custom_ll_driver;
+	hid->bus = <device-bus>;
+	hid->vendor = <device-vendor>;
+	hid->product = <device-product>;
+	hid->version = <device-version>;
+	hid->country = <device-country>;
+	hid->dev.parent = <pointer-to-parent-device>;
+	hid->driver_data = <transport-driver-data-field>;
+
+	ret = hid_add_device(hid);
+	if (ret)
+		goto err_<...>;
+
+Once hid_add_device() is entered, HID core might use the callbacks provided in
+"custom_ll_driver". Note that fields like "country" can be ignored by underlying
+transport-drivers if not supported.
+
+To unregister a device, use::
+
+	hid_destroy_device(hid);
+
+Once hid_destroy_device() returns, HID core will no longer make use of any
+driver callbacks.
+
+2.2) hid_ll_driver operations
+-----------------------------
+
+The available HID callbacks are:
+
+   ::
+
+      int (*start) (struct hid_device *hdev)
+
+   Called from HID device drivers once they want to use the device. Transport
+   drivers can choose to setup their device in this callback. However, normally
+   devices are already set up before transport drivers register them to HID core
+   so this is mostly only used by USB-HID.
+
+   ::
+
+      void (*stop) (struct hid_device *hdev)
+
+   Called from HID device drivers once they are done with a device. Transport
+   drivers can free any buffers and deinitialize the device. But note that
+   ->start() might be called again if another HID device driver is loaded on the
+   device.
+
+   Transport drivers are free to ignore it and deinitialize devices after they
+   destroyed them via hid_destroy_device().
+
+   ::
+
+      int (*open) (struct hid_device *hdev)
+
+   Called from HID device drivers once they are interested in data reports.
+   Usually, while user-space didn't open any input API/etc., device drivers are
+   not interested in device data and transport drivers can put devices asleep.
+   However, once ->open() is called, transport drivers must be ready for I/O.
+   ->open() calls are nested for each client that opens the HID device.
+
+   ::
+
+      void (*close) (struct hid_device *hdev)
+
+   Called from HID device drivers after ->open() was called but they are no
+   longer interested in device reports. (Usually if user-space closed any input
+   devices of the driver).
+
+   Transport drivers can put devices asleep and terminate any I/O of all
+   ->open() calls have been followed by a ->close() call. However, ->start() may
+   be called again if the device driver is interested in input reports again.
+
+   ::
+
+      int (*parse) (struct hid_device *hdev)
+
+   Called once during device setup after ->start() has been called. Transport
+   drivers must read the HID report-descriptor from the device and tell HID core
+   about it via hid_parse_report().
+
+   ::
+
+      int (*power) (struct hid_device *hdev, int level)
+
+   Called by HID core to give PM hints to transport drivers. Usually this is
+   analogical to the ->open() and ->close() hints and redundant.
+
+   ::
+
+      void (*request) (struct hid_device *hdev, struct hid_report *report,
+		       int reqtype)
+
+   Send an HID request on the ctrl channel. "report" contains the report that
+   should be sent and "reqtype" the request type. Request-type can be
+   HID_REQ_SET_REPORT or HID_REQ_GET_REPORT.
+
+   This callback is optional. If not provided, HID core will assemble a raw
+   report following the HID specs and send it via the ->raw_request() callback.
+   The transport driver is free to implement this asynchronously.
+
+   ::
+
+      int (*wait) (struct hid_device *hdev)
+
+   Used by HID core before calling ->request() again. A transport driver can use
+   it to wait for any pending requests to complete if only one request is
+   allowed at a time.
+
+   ::
+
+      int (*raw_request) (struct hid_device *hdev, unsigned char reportnum,
+                          __u8 *buf, size_t count, unsigned char rtype,
+                          int reqtype)
+
+   Same as ->request() but provides the report as raw buffer. This request shall
+   be synchronous. A transport driver must not use ->wait() to complete such
+   requests. This request is mandatory and hid core will reject the device if
+   it is missing.
+
+   ::
+
+      int (*output_report) (struct hid_device *hdev, __u8 *buf, size_t len)
+
+   Send raw output report via intr channel. Used by some HID device drivers
+   which require high throughput for outgoing requests on the intr channel. This
+   must not cause SET_REPORT calls! This must be implemented as asynchronous
+   output report on the intr channel!
+
+   ::
+
+      int (*idle) (struct hid_device *hdev, int report, int idle, int reqtype)
+
+   Perform SET/GET_IDLE request. Only used by USB-HID, do not implement!
+
+2.3) Data Path
+--------------
+
+Transport drivers are responsible of reading data from I/O devices. They must
+handle any I/O-related state-tracking themselves. HID core does not implement
+protocol handshakes or other management commands which can be required by the
+given HID transport specification.
+
+Every raw data packet read from a device must be fed into HID core via
+hid_input_report(). You must specify the channel-type (intr or ctrl) and report
+type (input/output/feature). Under normal conditions, only input reports are
+provided via this API.
+
+Responses to GET_REPORT requests via ->request() must also be provided via this
+API. Responses to ->raw_request() are synchronous and must be intercepted by the
+transport driver and not passed to hid_input_report().
+Acknowledgements to SET_REPORT requests are not of interest to HID core.
+
+----------------------------------------------------
+
+Written 2013, David Herrmann <dh.herrmann@gmail.com>
diff --git a/Documentation/hid/hid-transport.txt b/Documentation/hid/hid-transport.txt
deleted file mode 100644
index 3dcba9fd4a3a..000000000000
--- a/Documentation/hid/hid-transport.txt
+++ /dev/null
@@ -1,317 +0,0 @@
-                          HID I/O Transport Drivers
-                         ===========================
-
-The HID subsystem is independent of the underlying transport driver. Initially,
-only USB was supported, but other specifications adopted the HID design and
-provided new transport drivers. The kernel includes at least support for USB,
-Bluetooth, I2C and user-space I/O drivers.
-
-1) HID Bus
-==========
-
-The HID subsystem is designed as a bus. Any I/O subsystem may provide HID
-devices and register them with the HID bus. HID core then loads generic device
-drivers on top of it. The transport drivers are responsible of raw data
-transport and device setup/management. HID core is responsible of
-report-parsing, report interpretation and the user-space API. Device specifics
-and quirks are handled by all layers depending on the quirk.
-
- +-----------+  +-----------+            +-----------+  +-----------+
- | Device #1 |  | Device #i |            | Device #j |  | Device #k |
- +-----------+  +-----------+            +-----------+  +-----------+
-          \\      //                              \\      //
-        +------------+                          +------------+
-        | I/O Driver |                          | I/O Driver |
-        +------------+                          +------------+
-              ||                                      ||
-     +------------------+                    +------------------+
-     | Transport Driver |                    | Transport Driver |
-     +------------------+                    +------------------+
-                       \___                ___/
-                           \              /
-                          +----------------+
-                          |    HID Core    |
-                          +----------------+
-                           /  |        |  \
-                          /   |        |   \
-             ____________/    |        |    \_________________
-            /                 |        |                      \
-           /                  |        |                       \
- +----------------+  +-----------+  +------------------+  +------------------+
- | Generic Driver |  | MT Driver |  | Custom Driver #1 |  | Custom Driver #2 |
- +----------------+  +-----------+  +------------------+  +------------------+
-
-Example Drivers:
-  I/O: USB, I2C, Bluetooth-l2cap
-  Transport: USB-HID, I2C-HID, BT-HIDP
-
-Everything below "HID Core" is simplified in this graph as it is only of
-interest to HID device drivers. Transport drivers do not need to know the
-specifics.
-
-1.1) Device Setup
------------------
-
-I/O drivers normally provide hotplug detection or device enumeration APIs to the
-transport drivers. Transport drivers use this to find any suitable HID device.
-They allocate HID device objects and register them with HID core. Transport
-drivers are not required to register themselves with HID core. HID core is never
-aware of which transport drivers are available and is not interested in it. It
-is only interested in devices.
-
-Transport drivers attach a constant "struct hid_ll_driver" object with each
-device. Once a device is registered with HID core, the callbacks provided via
-this struct are used by HID core to communicate with the device.
-
-Transport drivers are responsible of detecting device failures and unplugging.
-HID core will operate a device as long as it is registered regardless of any
-device failures. Once transport drivers detect unplug or failure events, they
-must unregister the device from HID core and HID core will stop using the
-provided callbacks.
-
-1.2) Transport Driver Requirements
-----------------------------------
-
-The terms "asynchronous" and "synchronous" in this document describe the
-transmission behavior regarding acknowledgements. An asynchronous channel must
-not perform any synchronous operations like waiting for acknowledgements or
-verifications. Generally, HID calls operating on asynchronous channels must be
-running in atomic-context just fine.
-On the other hand, synchronous channels can be implemented by the transport
-driver in whatever way they like. They might just be the same as asynchronous
-channels, but they can also provide acknowledgement reports, automatic
-retransmission on failure, etc. in a blocking manner. If such functionality is
-required on asynchronous channels, a transport-driver must implement that via
-its own worker threads.
-
-HID core requires transport drivers to follow a given design. A Transport
-driver must provide two bi-directional I/O channels to each HID device. These
-channels must not necessarily be bi-directional in the hardware itself. A
-transport driver might just provide 4 uni-directional channels. Or it might
-multiplex all four on a single physical channel. However, in this document we
-will describe them as two bi-directional channels as they have several
-properties in common.
-
- - Interrupt Channel (intr): The intr channel is used for asynchronous data
-   reports. No management commands or data acknowledgements are sent on this
-   channel. Any unrequested incoming or outgoing data report must be sent on
-   this channel and is never acknowledged by the remote side. Devices usually
-   send their input events on this channel. Outgoing events are normally
-   not send via intr, except if high throughput is required.
- - Control Channel (ctrl): The ctrl channel is used for synchronous requests and
-   device management. Unrequested data input events must not be sent on this
-   channel and are normally ignored. Instead, devices only send management
-   events or answers to host requests on this channel.
-   The control-channel is used for direct blocking queries to the device
-   independent of any events on the intr-channel.
-   Outgoing reports are usually sent on the ctrl channel via synchronous
-   SET_REPORT requests.
-
-Communication between devices and HID core is mostly done via HID reports. A
-report can be of one of three types:
-
- - INPUT Report: Input reports provide data from device to host. This
-   data may include button events, axis events, battery status or more. This
-   data is generated by the device and sent to the host with or without
-   requiring explicit requests. Devices can choose to send data continuously or
-   only on change.
- - OUTPUT Report: Output reports change device states. They are sent from host
-   to device and may include LED requests, rumble requests or more. Output
-   reports are never sent from device to host, but a host can retrieve their
-   current state.
-   Hosts may choose to send output reports either continuously or only on
-   change.
- - FEATURE Report: Feature reports are used for specific static device features
-   and never reported spontaneously. A host can read and/or write them to access
-   data like battery-state or device-settings.
-   Feature reports are never sent without requests. A host must explicitly set
-   or retrieve a feature report. This also means, feature reports are never sent
-   on the intr channel as this channel is asynchronous.
-
-INPUT and OUTPUT reports can be sent as pure data reports on the intr channel.
-For INPUT reports this is the usual operational mode. But for OUTPUT reports,
-this is rarely done as OUTPUT reports are normally quite scarce. But devices are
-free to make excessive use of asynchronous OUTPUT reports (for instance, custom
-HID audio speakers make great use of it).
-
-Plain reports must not be sent on the ctrl channel, though. Instead, the ctrl
-channel provides synchronous GET/SET_REPORT requests. Plain reports are only
-allowed on the intr channel and are the only means of data there.
-
- - GET_REPORT: A GET_REPORT request has a report ID as payload and is sent
-   from host to device. The device must answer with a data report for the
-   requested report ID on the ctrl channel as a synchronous acknowledgement.
-   Only one GET_REPORT request can be pending for each device. This restriction
-   is enforced by HID core as several transport drivers don't allow multiple
-   simultaneous GET_REPORT requests.
-   Note that data reports which are sent as answer to a GET_REPORT request are
-   not handled as generic device events. That is, if a device does not operate
-   in continuous data reporting mode, an answer to GET_REPORT does not replace
-   the raw data report on the intr channel on state change.
-   GET_REPORT is only used by custom HID device drivers to query device state.
-   Normally, HID core caches any device state so this request is not necessary
-   on devices that follow the HID specs except during device initialization to
-   retrieve the current state.
-   GET_REPORT requests can be sent for any of the 3 report types and shall
-   return the current report state of the device. However, OUTPUT reports as
-   payload may be blocked by the underlying transport driver if the
-   specification does not allow them.
- - SET_REPORT: A SET_REPORT request has a report ID plus data as payload. It is
-   sent from host to device and a device must update it's current report state
-   according to the given data. Any of the 3 report types can be used. However,
-   INPUT reports as payload might be blocked by the underlying transport driver
-   if the specification does not allow them.
-   A device must answer with a synchronous acknowledgement. However, HID core
-   does not require transport drivers to forward this acknowledgement to HID
-   core.
-   Same as for GET_REPORT, only one SET_REPORT can be pending at a time. This
-   restriction is enforced by HID core as some transport drivers do not support
-   multiple synchronous SET_REPORT requests.
-
-Other ctrl-channel requests are supported by USB-HID but are not available
-(or deprecated) in most other transport level specifications:
-
- - GET/SET_IDLE: Only used by USB-HID and I2C-HID.
- - GET/SET_PROTOCOL: Not used by HID core.
- - RESET: Used by I2C-HID, not hooked up in HID core.
- - SET_POWER: Used by I2C-HID, not hooked up in HID core.
-
-2) HID API
-==========
-
-2.1) Initialization
--------------------
-
-Transport drivers normally use the following procedure to register a new device
-with HID core:
-
-	struct hid_device *hid;
-	int ret;
-
-	hid = hid_allocate_device();
-	if (IS_ERR(hid)) {
-		ret = PTR_ERR(hid);
-		goto err_<...>;
-	}
-
-	strlcpy(hid->name, <device-name-src>, 127);
-	strlcpy(hid->phys, <device-phys-src>, 63);
-	strlcpy(hid->uniq, <device-uniq-src>, 63);
-
-	hid->ll_driver = &custom_ll_driver;
-	hid->bus = <device-bus>;
-	hid->vendor = <device-vendor>;
-	hid->product = <device-product>;
-	hid->version = <device-version>;
-	hid->country = <device-country>;
-	hid->dev.parent = <pointer-to-parent-device>;
-	hid->driver_data = <transport-driver-data-field>;
-
-	ret = hid_add_device(hid);
-	if (ret)
-		goto err_<...>;
-
-Once hid_add_device() is entered, HID core might use the callbacks provided in
-"custom_ll_driver". Note that fields like "country" can be ignored by underlying
-transport-drivers if not supported.
-
-To unregister a device, use:
-
-	hid_destroy_device(hid);
-
-Once hid_destroy_device() returns, HID core will no longer make use of any
-driver callbacks.
-
-2.2) hid_ll_driver operations
------------------------------
-
-The available HID callbacks are:
- - int (*start) (struct hid_device *hdev)
-   Called from HID device drivers once they want to use the device. Transport
-   drivers can choose to setup their device in this callback. However, normally
-   devices are already set up before transport drivers register them to HID core
-   so this is mostly only used by USB-HID.
-
- - void (*stop) (struct hid_device *hdev)
-   Called from HID device drivers once they are done with a device. Transport
-   drivers can free any buffers and deinitialize the device. But note that
-   ->start() might be called again if another HID device driver is loaded on the
-   device.
-   Transport drivers are free to ignore it and deinitialize devices after they
-   destroyed them via hid_destroy_device().
-
- - int (*open) (struct hid_device *hdev)
-   Called from HID device drivers once they are interested in data reports.
-   Usually, while user-space didn't open any input API/etc., device drivers are
-   not interested in device data and transport drivers can put devices asleep.
-   However, once ->open() is called, transport drivers must be ready for I/O.
-   ->open() calls are nested for each client that opens the HID device.
-
- - void (*close) (struct hid_device *hdev)
-   Called from HID device drivers after ->open() was called but they are no
-   longer interested in device reports. (Usually if user-space closed any input
-   devices of the driver).
-   Transport drivers can put devices asleep and terminate any I/O of all
-   ->open() calls have been followed by a ->close() call. However, ->start() may
-   be called again if the device driver is interested in input reports again.
-
- - int (*parse) (struct hid_device *hdev)
-   Called once during device setup after ->start() has been called. Transport
-   drivers must read the HID report-descriptor from the device and tell HID core
-   about it via hid_parse_report().
-
- - int (*power) (struct hid_device *hdev, int level)
-   Called by HID core to give PM hints to transport drivers. Usually this is
-   analogical to the ->open() and ->close() hints and redundant.
-
- - void (*request) (struct hid_device *hdev, struct hid_report *report,
-                    int reqtype)
-   Send an HID request on the ctrl channel. "report" contains the report that
-   should be sent and "reqtype" the request type. Request-type can be
-   HID_REQ_SET_REPORT or HID_REQ_GET_REPORT.
-   This callback is optional. If not provided, HID core will assemble a raw
-   report following the HID specs and send it via the ->raw_request() callback.
-   The transport driver is free to implement this asynchronously.
-
- - int (*wait) (struct hid_device *hdev)
-   Used by HID core before calling ->request() again. A transport driver can use
-   it to wait for any pending requests to complete if only one request is
-   allowed at a time.
-
- - int (*raw_request) (struct hid_device *hdev, unsigned char reportnum,
-                       __u8 *buf, size_t count, unsigned char rtype,
-                       int reqtype)
-   Same as ->request() but provides the report as raw buffer. This request shall
-   be synchronous. A transport driver must not use ->wait() to complete such
-   requests. This request is mandatory and hid core will reject the device if
-   it is missing.
-
- - int (*output_report) (struct hid_device *hdev, __u8 *buf, size_t len)
-   Send raw output report via intr channel. Used by some HID device drivers
-   which require high throughput for outgoing requests on the intr channel. This
-   must not cause SET_REPORT calls! This must be implemented as asynchronous
-   output report on the intr channel!
-
- - int (*idle) (struct hid_device *hdev, int report, int idle, int reqtype)
-   Perform SET/GET_IDLE request. Only used by USB-HID, do not implement!
-
-2.3) Data Path
---------------
-
-Transport drivers are responsible of reading data from I/O devices. They must
-handle any I/O-related state-tracking themselves. HID core does not implement
-protocol handshakes or other management commands which can be required by the
-given HID transport specification.
-
-Every raw data packet read from a device must be fed into HID core via
-hid_input_report(). You must specify the channel-type (intr or ctrl) and report
-type (input/output/feature). Under normal conditions, only input reports are
-provided via this API.
-
-Responses to GET_REPORT requests via ->request() must also be provided via this
-API. Responses to ->raw_request() are synchronous and must be intercepted by the
-transport driver and not passed to hid_input_report().
-Acknowledgements to SET_REPORT requests are not of interest to HID core.
-
-----------------------------------------------------
-Written 2013, David Herrmann <dh.herrmann@gmail.com>
diff --git a/Documentation/hid/hiddev.rst b/Documentation/hid/hiddev.rst
new file mode 100644
index 000000000000..209e6ba4e019
--- /dev/null
+++ b/Documentation/hid/hiddev.rst
@@ -0,0 +1,251 @@
+================================================
+Care and feeding of your Human Interface Devices
+================================================
+
+Introduction
+============
+
+In addition to the normal input type HID devices, USB also uses the
+human interface device protocols for things that are not really human
+interfaces, but have similar sorts of communication needs. The two big
+examples for this are power devices (especially uninterruptable power
+supplies) and monitor control on higher end monitors.
+
+To support these disparate requirements, the Linux USB system provides
+HID events to two separate interfaces:
+* the input subsystem, which converts HID events into normal input
+device interfaces (such as keyboard, mouse and joystick) and a
+normalised event interface - see Documentation/input/input.rst
+* the hiddev interface, which provides fairly raw HID events
+
+The data flow for a HID event produced by a device is something like
+the following::
+
+ usb.c ---> hid-core.c  ----> hid-input.c ----> [keyboard/mouse/joystick/event]
+                         |
+                         |
+                          --> hiddev.c ----> POWER / MONITOR CONTROL
+
+In addition, other subsystems (apart from USB) can potentially feed
+events into the input subsystem, but these have no effect on the hid
+device interface.
+
+Using the HID Device Interface
+==============================
+
+The hiddev interface is a char interface using the normal USB major,
+with the minor numbers starting at 96 and finishing at 111. Therefore,
+you need the following commands::
+
+	mknod /dev/usb/hiddev0 c 180 96
+	mknod /dev/usb/hiddev1 c 180 97
+	mknod /dev/usb/hiddev2 c 180 98
+	mknod /dev/usb/hiddev3 c 180 99
+	mknod /dev/usb/hiddev4 c 180 100
+	mknod /dev/usb/hiddev5 c 180 101
+	mknod /dev/usb/hiddev6 c 180 102
+	mknod /dev/usb/hiddev7 c 180 103
+	mknod /dev/usb/hiddev8 c 180 104
+	mknod /dev/usb/hiddev9 c 180 105
+	mknod /dev/usb/hiddev10 c 180 106
+	mknod /dev/usb/hiddev11 c 180 107
+	mknod /dev/usb/hiddev12 c 180 108
+	mknod /dev/usb/hiddev13 c 180 109
+	mknod /dev/usb/hiddev14 c 180 110
+	mknod /dev/usb/hiddev15 c 180 111
+
+So you point your hiddev compliant user-space program at the correct
+interface for your device, and it all just works.
+
+Assuming that you have a hiddev compliant user-space program, of
+course. If you need to write one, read on.
+
+
+The HIDDEV API
+==============
+
+This description should be read in conjunction with the HID
+specification, freely available from http://www.usb.org, and
+conveniently linked of http://www.linux-usb.org.
+
+The hiddev API uses a read() interface, and a set of ioctl() calls.
+
+HID devices exchange data with the host computer using data
+bundles called "reports".  Each report is divided into "fields",
+each of which can have one or more "usages".  In the hid-core,
+each one of these usages has a single signed 32 bit value.
+
+read():
+-------
+
+This is the event interface.  When the HID device's state changes,
+it performs an interrupt transfer containing a report which contains
+the changed value.  The hid-core.c module parses the report, and
+returns to hiddev.c the individual usages that have changed within
+the report.  In its basic mode, the hiddev will make these individual
+usage changes available to the reader using a struct hiddev_event::
+
+       struct hiddev_event {
+           unsigned hid;
+           signed int value;
+       };
+
+containing the HID usage identifier for the status that changed, and
+the value that it was changed to. Note that the structure is defined
+within <linux/hiddev.h>, along with some other useful #defines and
+structures.  The HID usage identifier is a composite of the HID usage
+page shifted to the 16 high order bits ORed with the usage code.  The
+behavior of the read() function can be modified using the HIDIOCSFLAG
+ioctl() described below.
+
+
+ioctl():
+--------
+
+This is the control interface. There are a number of controls:
+
+HIDIOCGVERSION
+  - int (read)
+
+ Gets the version code out of the hiddev driver.
+
+HIDIOCAPPLICATION
+  - (none)
+
+This ioctl call returns the HID application usage associated with the
+hid device. The third argument to ioctl() specifies which application
+index to get. This is useful when the device has more than one
+application collection. If the index is invalid (greater or equal to
+the number of application collections this device has) the ioctl
+returns -1. You can find out beforehand how many application
+collections the device has from the num_applications field from the
+hiddev_devinfo structure.
+
+HIDIOCGCOLLECTIONINFO
+  - struct hiddev_collection_info (read/write)
+
+This returns a superset of the information above, providing not only
+application collections, but all the collections the device has.  It
+also returns the level the collection lives in the hierarchy.
+The user passes in a hiddev_collection_info struct with the index
+field set to the index that should be returned.  The ioctl fills in
+the other fields.  If the index is larger than the last collection
+index, the ioctl returns -1 and sets errno to -EINVAL.
+
+HIDIOCGDEVINFO
+  - struct hiddev_devinfo (read)
+
+Gets a hiddev_devinfo structure which describes the device.
+
+HIDIOCGSTRING
+  - struct hiddev_string_descriptor (read/write)
+
+Gets a string descriptor from the device. The caller must fill in the
+"index" field to indicate which descriptor should be returned.
+
+HIDIOCINITREPORT
+  - (none)
+
+Instructs the kernel to retrieve all input and feature report values
+from the device. At this point, all the usage structures will contain
+current values for the device, and will maintain it as the device
+changes.  Note that the use of this ioctl is unnecessary in general,
+since later kernels automatically initialize the reports from the
+device at attach time.
+
+HIDIOCGNAME
+  - string (variable length)
+
+Gets the device name
+
+HIDIOCGREPORT
+  - struct hiddev_report_info (write)
+
+Instructs the kernel to get a feature or input report from the device,
+in order to selectively update the usage structures (in contrast to
+INITREPORT).
+
+HIDIOCSREPORT
+  - struct hiddev_report_info (write)
+
+Instructs the kernel to send a report to the device. This report can
+be filled in by the user through HIDIOCSUSAGE calls (below) to fill in
+individual usage values in the report before sending the report in full
+to the device.
+
+HIDIOCGREPORTINFO
+  - struct hiddev_report_info (read/write)
+
+Fills in a hiddev_report_info structure for the user. The report is
+looked up by type (input, output or feature) and id, so these fields
+must be filled in by the user. The ID can be absolute -- the actual
+report id as reported by the device -- or relative --
+HID_REPORT_ID_FIRST for the first report, and (HID_REPORT_ID_NEXT |
+report_id) for the next report after report_id. Without a-priori
+information about report ids, the right way to use this ioctl is to
+use the relative IDs above to enumerate the valid IDs. The ioctl
+returns non-zero when there is no more next ID. The real report ID is
+filled into the returned hiddev_report_info structure.
+
+HIDIOCGFIELDINFO
+  - struct hiddev_field_info (read/write)
+
+Returns the field information associated with a report in a
+hiddev_field_info structure. The user must fill in report_id and
+report_type in this structure, as above. The field_index should also
+be filled in, which should be a number from 0 and maxfield-1, as
+returned from a previous HIDIOCGREPORTINFO call.
+
+HIDIOCGUCODE
+  - struct hiddev_usage_ref (read/write)
+
+Returns the usage_code in a hiddev_usage_ref structure, given that
+given its report type, report id, field index, and index within the
+field have already been filled into the structure.
+
+HIDIOCGUSAGE
+  - struct hiddev_usage_ref (read/write)
+
+Returns the value of a usage in a hiddev_usage_ref structure. The
+usage to be retrieved can be specified as above, or the user can
+choose to fill in the report_type field and specify the report_id as
+HID_REPORT_ID_UNKNOWN. In this case, the hiddev_usage_ref will be
+filled in with the report and field information associated with this
+usage if it is found.
+
+HIDIOCSUSAGE
+  - struct hiddev_usage_ref (write)
+
+Sets the value of a usage in an output report.  The user fills in
+the hiddev_usage_ref structure as above, but additionally fills in
+the value field.
+
+HIDIOGCOLLECTIONINDEX
+  - struct hiddev_usage_ref (write)
+
+Returns the collection index associated with this usage.  This
+indicates where in the collection hierarchy this usage sits.
+
+HIDIOCGFLAG
+  - int (read)
+HIDIOCSFLAG
+  - int (write)
+
+These operations respectively inspect and replace the mode flags
+that influence the read() call above.  The flags are as follows:
+
+    HIDDEV_FLAG_UREF
+      - read() calls will now return
+        struct hiddev_usage_ref instead of struct hiddev_event.
+        This is a larger structure, but in situations where the
+        device has more than one usage in its reports with the
+        same usage code, this mode serves to resolve such
+        ambiguity.
+
+    HIDDEV_FLAG_REPORT
+      - This flag can only be used in conjunction
+        with HIDDEV_FLAG_UREF.  With this flag set, when the device
+        sends a report, a struct hiddev_usage_ref will be returned
+        to read() filled in with the report_type and report_id, but
+        with field_index set to FIELD_INDEX_NONE.  This serves as
+        additional notification when the device has sent a report.
diff --git a/Documentation/hid/hiddev.txt b/Documentation/hid/hiddev.txt
deleted file mode 100644
index 638448707aa2..000000000000
--- a/Documentation/hid/hiddev.txt
+++ /dev/null
@@ -1,205 +0,0 @@
-Care and feeding of your Human Interface Devices
-
-INTRODUCTION
-
-In addition to the normal input type HID devices, USB also uses the
-human interface device protocols for things that are not really human
-interfaces, but have similar sorts of communication needs. The two big
-examples for this are power devices (especially uninterruptable power
-supplies) and monitor control on higher end monitors.
-
-To support these disparate requirements, the Linux USB system provides
-HID events to two separate interfaces:
-* the input subsystem, which converts HID events into normal input
-device interfaces (such as keyboard, mouse and joystick) and a
-normalised event interface - see Documentation/input/input.rst
-* the hiddev interface, which provides fairly raw HID events
-
-The data flow for a HID event produced by a device is something like
-the following :
-
- usb.c ---> hid-core.c  ----> hid-input.c ----> [keyboard/mouse/joystick/event]
-                         |
-                         |
-                          --> hiddev.c ----> POWER / MONITOR CONTROL 
-
-In addition, other subsystems (apart from USB) can potentially feed
-events into the input subsystem, but these have no effect on the hid
-device interface.
-
-USING THE HID DEVICE INTERFACE
-
-The hiddev interface is a char interface using the normal USB major,
-with the minor numbers starting at 96 and finishing at 111. Therefore,
-you need the following commands:
-mknod /dev/usb/hiddev0 c 180 96
-mknod /dev/usb/hiddev1 c 180 97
-mknod /dev/usb/hiddev2 c 180 98
-mknod /dev/usb/hiddev3 c 180 99
-mknod /dev/usb/hiddev4 c 180 100
-mknod /dev/usb/hiddev5 c 180 101
-mknod /dev/usb/hiddev6 c 180 102
-mknod /dev/usb/hiddev7 c 180 103
-mknod /dev/usb/hiddev8 c 180 104
-mknod /dev/usb/hiddev9 c 180 105
-mknod /dev/usb/hiddev10 c 180 106
-mknod /dev/usb/hiddev11 c 180 107
-mknod /dev/usb/hiddev12 c 180 108
-mknod /dev/usb/hiddev13 c 180 109
-mknod /dev/usb/hiddev14 c 180 110
-mknod /dev/usb/hiddev15 c 180 111
-
-So you point your hiddev compliant user-space program at the correct
-interface for your device, and it all just works.
-
-Assuming that you have a hiddev compliant user-space program, of
-course. If you need to write one, read on.
-
-
-THE HIDDEV API
-This description should be read in conjunction with the HID
-specification, freely available from http://www.usb.org, and
-conveniently linked of http://www.linux-usb.org.
-
-The hiddev API uses a read() interface, and a set of ioctl() calls.
-
-HID devices exchange data with the host computer using data
-bundles called "reports".  Each report is divided into "fields",
-each of which can have one or more "usages".  In the hid-core,
-each one of these usages has a single signed 32 bit value.
-
-read():
-This is the event interface.  When the HID device's state changes,
-it performs an interrupt transfer containing a report which contains
-the changed value.  The hid-core.c module parses the report, and
-returns to hiddev.c the individual usages that have changed within
-the report.  In its basic mode, the hiddev will make these individual
-usage changes available to the reader using a struct hiddev_event:
-
-       struct hiddev_event {
-           unsigned hid;
-           signed int value;
-       };
-
-containing the HID usage identifier for the status that changed, and
-the value that it was changed to. Note that the structure is defined
-within <linux/hiddev.h>, along with some other useful #defines and
-structures.  The HID usage identifier is a composite of the HID usage
-page shifted to the 16 high order bits ORed with the usage code.  The
-behavior of the read() function can be modified using the HIDIOCSFLAG
-ioctl() described below.
-
-
-ioctl(): 
-This is the control interface. There are a number of controls: 
-
-HIDIOCGVERSION - int (read)
-Gets the version code out of the hiddev driver.
-
-HIDIOCAPPLICATION - (none)
-This ioctl call returns the HID application usage associated with the
-hid device. The third argument to ioctl() specifies which application
-index to get. This is useful when the device has more than one
-application collection. If the index is invalid (greater or equal to
-the number of application collections this device has) the ioctl
-returns -1. You can find out beforehand how many application
-collections the device has from the num_applications field from the
-hiddev_devinfo structure. 
-
-HIDIOCGCOLLECTIONINFO - struct hiddev_collection_info (read/write)
-This returns a superset of the information above, providing not only
-application collections, but all the collections the device has.  It
-also returns the level the collection lives in the hierarchy.
-The user passes in a hiddev_collection_info struct with the index 
-field set to the index that should be returned.  The ioctl fills in 
-the other fields.  If the index is larger than the last collection 
-index, the ioctl returns -1 and sets errno to -EINVAL.
-
-HIDIOCGDEVINFO - struct hiddev_devinfo (read)
-Gets a hiddev_devinfo structure which describes the device.
-
-HIDIOCGSTRING - struct hiddev_string_descriptor (read/write)
-Gets a string descriptor from the device. The caller must fill in the
-"index" field to indicate which descriptor should be returned.
-
-HIDIOCINITREPORT - (none)
-Instructs the kernel to retrieve all input and feature report values
-from the device. At this point, all the usage structures will contain
-current values for the device, and will maintain it as the device
-changes.  Note that the use of this ioctl is unnecessary in general,
-since later kernels automatically initialize the reports from the
-device at attach time.
-
-HIDIOCGNAME - string (variable length)
-Gets the device name
-
-HIDIOCGREPORT - struct hiddev_report_info (write)
-Instructs the kernel to get a feature or input report from the device,
-in order to selectively update the usage structures (in contrast to
-INITREPORT).
-
-HIDIOCSREPORT - struct hiddev_report_info (write)
-Instructs the kernel to send a report to the device. This report can
-be filled in by the user through HIDIOCSUSAGE calls (below) to fill in
-individual usage values in the report before sending the report in full
-to the device. 
-
-HIDIOCGREPORTINFO - struct hiddev_report_info (read/write)
-Fills in a hiddev_report_info structure for the user. The report is
-looked up by type (input, output or feature) and id, so these fields
-must be filled in by the user. The ID can be absolute -- the actual
-report id as reported by the device -- or relative --
-HID_REPORT_ID_FIRST for the first report, and (HID_REPORT_ID_NEXT |
-report_id) for the next report after report_id. Without a-priori
-information about report ids, the right way to use this ioctl is to
-use the relative IDs above to enumerate the valid IDs. The ioctl
-returns non-zero when there is no more next ID. The real report ID is
-filled into the returned hiddev_report_info structure. 
-
-HIDIOCGFIELDINFO - struct hiddev_field_info (read/write)
-Returns the field information associated with a report in a
-hiddev_field_info structure. The user must fill in report_id and
-report_type in this structure, as above. The field_index should also
-be filled in, which should be a number from 0 and maxfield-1, as
-returned from a previous HIDIOCGREPORTINFO call. 
-
-HIDIOCGUCODE - struct hiddev_usage_ref (read/write)
-Returns the usage_code in a hiddev_usage_ref structure, given that
-given its report type, report id, field index, and index within the
-field have already been filled into the structure.
-
-HIDIOCGUSAGE - struct hiddev_usage_ref (read/write)
-Returns the value of a usage in a hiddev_usage_ref structure. The
-usage to be retrieved can be specified as above, or the user can
-choose to fill in the report_type field and specify the report_id as
-HID_REPORT_ID_UNKNOWN. In this case, the hiddev_usage_ref will be
-filled in with the report and field information associated with this
-usage if it is found. 
-
-HIDIOCSUSAGE - struct hiddev_usage_ref (write)
-Sets the value of a usage in an output report.  The user fills in
-the hiddev_usage_ref structure as above, but additionally fills in
-the value field.
-
-HIDIOGCOLLECTIONINDEX - struct hiddev_usage_ref (write)
-Returns the collection index associated with this usage.  This
-indicates where in the collection hierarchy this usage sits.
-
-HIDIOCGFLAG - int (read)
-HIDIOCSFLAG - int (write)
-These operations respectively inspect and replace the mode flags
-that influence the read() call above.  The flags are as follows:
-
-    HIDDEV_FLAG_UREF - read() calls will now return 
-        struct hiddev_usage_ref instead of struct hiddev_event.
-        This is a larger structure, but in situations where the
-        device has more than one usage in its reports with the
-        same usage code, this mode serves to resolve such
-        ambiguity.
-
-    HIDDEV_FLAG_REPORT - This flag can only be used in conjunction
-        with HIDDEV_FLAG_UREF.  With this flag set, when the device
-        sends a report, a struct hiddev_usage_ref will be returned
-        to read() filled in with the report_type and report_id, but 
-        with field_index set to FIELD_INDEX_NONE.  This serves as
-        additional notification when the device has sent a report.
diff --git a/Documentation/hid/hidraw.rst b/Documentation/hid/hidraw.rst
new file mode 100644
index 000000000000..4a4a0ba1f362
--- /dev/null
+++ b/Documentation/hid/hidraw.rst
@@ -0,0 +1,138 @@
+================================================================
+HIDRAW - Raw Access to USB and Bluetooth Human Interface Devices
+================================================================
+
+The hidraw driver provides a raw interface to USB and Bluetooth Human
+Interface Devices (HIDs).  It differs from hiddev in that reports sent and
+received are not parsed by the HID parser, but are sent to and received from
+the device unmodified.
+
+Hidraw should be used if the userspace application knows exactly how to
+communicate with the hardware device, and is able to construct the HID
+reports manually.  This is often the case when making userspace drivers for
+custom HID devices.
+
+Hidraw is also useful for communicating with non-conformant HID devices
+which send and receive data in a way that is inconsistent with their report
+descriptors.  Because hiddev parses reports which are sent and received
+through it, checking them against the device's report descriptor, such
+communication with these non-conformant devices is impossible using hiddev.
+Hidraw is the only alternative, short of writing a custom kernel driver, for
+these non-conformant devices.
+
+A benefit of hidraw is that its use by userspace applications is independent
+of the underlying hardware type.  Currently, Hidraw is implemented for USB
+and Bluetooth.  In the future, as new hardware bus types are developed which
+use the HID specification, hidraw will be expanded to add support for these
+new bus types.
+
+Hidraw uses a dynamic major number, meaning that udev should be relied on to
+create hidraw device nodes.  Udev will typically create the device nodes
+directly under /dev (eg: /dev/hidraw0).  As this location is distribution-
+and udev rule-dependent, applications should use libudev to locate hidraw
+devices attached to the system.  There is a tutorial on libudev with a
+working example at:
+
+	http://www.signal11.us/oss/udev/
+
+The HIDRAW API
+---------------
+
+read()
+-------
+read() will read a queued report received from the HID device. On USB
+devices, the reports read using read() are the reports sent from the device
+on the INTERRUPT IN endpoint.  By default, read() will block until there is
+a report available to be read.  read() can be made non-blocking, by passing
+the O_NONBLOCK flag to open(), or by setting the O_NONBLOCK flag using
+fcntl().
+
+On a device which uses numbered reports, the first byte of the returned data
+will be the report number; the report data follows, beginning in the second
+byte.  For devices which do not use numbered reports, the report data
+will begin at the first byte.
+
+write()
+-------
+The write() function will write a report to the device. For USB devices, if
+the device has an INTERRUPT OUT endpoint, the report will be sent on that
+endpoint. If it does not, the report will be sent over the control endpoint,
+using a SET_REPORT transfer.
+
+The first byte of the buffer passed to write() should be set to the report
+number.  If the device does not use numbered reports, the first byte should
+be set to 0. The report data itself should begin at the second byte.
+
+ioctl()
+-------
+Hidraw supports the following ioctls:
+
+HIDIOCGRDESCSIZE:
+	Get Report Descriptor Size
+
+This ioctl will get the size of the device's report descriptor.
+
+HIDIOCGRDESC:
+	Get Report Descriptor
+
+This ioctl returns the device's report descriptor using a
+hidraw_report_descriptor struct.  Make sure to set the size field of the
+hidraw_report_descriptor struct to the size returned from HIDIOCGRDESCSIZE.
+
+HIDIOCGRAWINFO:
+	Get Raw Info
+
+This ioctl will return a hidraw_devinfo struct containing the bus type, the
+vendor ID (VID), and product ID (PID) of the device. The bus type can be one
+of::
+
+	- BUS_USB
+	- BUS_HIL
+	- BUS_BLUETOOTH
+	- BUS_VIRTUAL
+
+which are defined in uapi/linux/input.h.
+
+HIDIOCGRAWNAME(len):
+	Get Raw Name
+
+This ioctl returns a string containing the vendor and product strings of
+the device.  The returned string is Unicode, UTF-8 encoded.
+
+HIDIOCGRAWPHYS(len):
+	Get Physical Address
+
+This ioctl returns a string representing the physical address of the device.
+For USB devices, the string contains the physical path to the device (the
+USB controller, hubs, ports, etc).  For Bluetooth devices, the string
+contains the hardware (MAC) address of the device.
+
+HIDIOCSFEATURE(len):
+	Send a Feature Report
+
+This ioctl will send a feature report to the device.  Per the HID
+specification, feature reports are always sent using the control endpoint.
+Set the first byte of the supplied buffer to the report number.  For devices
+which do not use numbered reports, set the first byte to 0. The report data
+begins in the second byte. Make sure to set len accordingly, to one more
+than the length of the report (to account for the report number).
+
+HIDIOCGFEATURE(len):
+	Get a Feature Report
+
+This ioctl will request a feature report from the device using the control
+endpoint.  The first byte of the supplied buffer should be set to the report
+number of the requested report.  For devices which do not use numbered
+reports, set the first byte to 0.  The report will be returned starting at
+the first byte of the buffer (ie: the report number is not returned).
+
+Example
+-------
+In samples/, find hid-example.c, which shows examples of read(), write(),
+and all the ioctls for hidraw.  The code may be used by anyone for any
+purpose, and can serve as a starting point for developing applications using
+hidraw.
+
+Document by:
+
+	Alan Ott <alan@signal11.us>, Signal 11 Software
diff --git a/Documentation/hid/hidraw.txt b/Documentation/hid/hidraw.txt
deleted file mode 100644
index c8436e354f44..000000000000
--- a/Documentation/hid/hidraw.txt
+++ /dev/null
@@ -1,119 +0,0 @@
-      HIDRAW - Raw Access to USB and Bluetooth Human Interface Devices
-     ==================================================================
-
-The hidraw driver provides a raw interface to USB and Bluetooth Human
-Interface Devices (HIDs).  It differs from hiddev in that reports sent and
-received are not parsed by the HID parser, but are sent to and received from
-the device unmodified.
-
-Hidraw should be used if the userspace application knows exactly how to
-communicate with the hardware device, and is able to construct the HID
-reports manually.  This is often the case when making userspace drivers for
-custom HID devices.
-
-Hidraw is also useful for communicating with non-conformant HID devices
-which send and receive data in a way that is inconsistent with their report
-descriptors.  Because hiddev parses reports which are sent and received
-through it, checking them against the device's report descriptor, such
-communication with these non-conformant devices is impossible using hiddev.
-Hidraw is the only alternative, short of writing a custom kernel driver, for
-these non-conformant devices.
-
-A benefit of hidraw is that its use by userspace applications is independent
-of the underlying hardware type.  Currently, Hidraw is implemented for USB
-and Bluetooth.  In the future, as new hardware bus types are developed which
-use the HID specification, hidraw will be expanded to add support for these
-new bus types.
-
-Hidraw uses a dynamic major number, meaning that udev should be relied on to
-create hidraw device nodes.  Udev will typically create the device nodes
-directly under /dev (eg: /dev/hidraw0).  As this location is distribution-
-and udev rule-dependent, applications should use libudev to locate hidraw
-devices attached to the system.  There is a tutorial on libudev with a
-working example at:
-	http://www.signal11.us/oss/udev/
-
-The HIDRAW API
----------------
-
-read()
--------
-read() will read a queued report received from the HID device. On USB
-devices, the reports read using read() are the reports sent from the device
-on the INTERRUPT IN endpoint.  By default, read() will block until there is
-a report available to be read.  read() can be made non-blocking, by passing
-the O_NONBLOCK flag to open(), or by setting the O_NONBLOCK flag using
-fcntl().
-
-On a device which uses numbered reports, the first byte of the returned data
-will be the report number; the report data follows, beginning in the second
-byte.  For devices which do not use numbered reports, the report data
-will begin at the first byte.
-
-write()
---------
-The write() function will write a report to the device. For USB devices, if
-the device has an INTERRUPT OUT endpoint, the report will be sent on that
-endpoint. If it does not, the report will be sent over the control endpoint,
-using a SET_REPORT transfer.
-
-The first byte of the buffer passed to write() should be set to the report
-number.  If the device does not use numbered reports, the first byte should
-be set to 0. The report data itself should begin at the second byte.
-
-ioctl()
---------
-Hidraw supports the following ioctls:
-
-HIDIOCGRDESCSIZE: Get Report Descriptor Size
-This ioctl will get the size of the device's report descriptor.
-
-HIDIOCGRDESC: Get Report Descriptor
-This ioctl returns the device's report descriptor using a
-hidraw_report_descriptor struct.  Make sure to set the size field of the
-hidraw_report_descriptor struct to the size returned from HIDIOCGRDESCSIZE.
-
-HIDIOCGRAWINFO: Get Raw Info
-This ioctl will return a hidraw_devinfo struct containing the bus type, the
-vendor ID (VID), and product ID (PID) of the device. The bus type can be one
-of:
-	BUS_USB
-	BUS_HIL
-	BUS_BLUETOOTH
-	BUS_VIRTUAL
-which are defined in uapi/linux/input.h.
-
-HIDIOCGRAWNAME(len): Get Raw Name
-This ioctl returns a string containing the vendor and product strings of
-the device.  The returned string is Unicode, UTF-8 encoded.
-
-HIDIOCGRAWPHYS(len): Get Physical Address
-This ioctl returns a string representing the physical address of the device.
-For USB devices, the string contains the physical path to the device (the
-USB controller, hubs, ports, etc).  For Bluetooth devices, the string
-contains the hardware (MAC) address of the device.
-
-HIDIOCSFEATURE(len): Send a Feature Report
-This ioctl will send a feature report to the device.  Per the HID
-specification, feature reports are always sent using the control endpoint.
-Set the first byte of the supplied buffer to the report number.  For devices
-which do not use numbered reports, set the first byte to 0. The report data
-begins in the second byte. Make sure to set len accordingly, to one more
-than the length of the report (to account for the report number).
-
-HIDIOCGFEATURE(len): Get a Feature Report
-This ioctl will request a feature report from the device using the control
-endpoint.  The first byte of the supplied buffer should be set to the report
-number of the requested report.  For devices which do not use numbered
-reports, set the first byte to 0.  The report will be returned starting at
-the first byte of the buffer (ie: the report number is not returned).
-
-Example
----------
-In samples/, find hid-example.c, which shows examples of read(), write(),
-and all the ioctls for hidraw.  The code may be used by anyone for any
-purpose, and can serve as a starting point for developing applications using
-hidraw.
-
-Document by:
-	Alan Ott <alan@signal11.us>, Signal 11 Software
diff --git a/Documentation/hid/index.rst b/Documentation/hid/index.rst
new file mode 100644
index 000000000000..737d66dc16a1
--- /dev/null
+++ b/Documentation/hid/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============================
+Human Interface Devices (HID)
+=============================
+
+.. toctree::
+   :maxdepth: 1
+
+   hiddev
+   hidraw
+   hid-sensor
+   hid-transport
+
+   uhid
+
+   hid-alps
+   intel-ish-hid
diff --git a/Documentation/hid/intel-ish-hid.rst b/Documentation/hid/intel-ish-hid.rst
new file mode 100644
index 000000000000..cccbf4be17d7
--- /dev/null
+++ b/Documentation/hid/intel-ish-hid.rst
@@ -0,0 +1,485 @@
+=================================
+Intel Integrated Sensor Hub (ISH)
+=================================
+
+A sensor hub enables the ability to offload sensor polling and algorithm
+processing to a dedicated low power co-processor. This allows the core
+processor to go into low power modes more often, resulting in the increased
+battery life.
+
+There are many vendors providing external sensor hubs confirming to HID
+Sensor usage tables, and used in several tablets, 2 in 1 convertible laptops
+and embedded products. Linux had this support since Linux 3.9.
+
+Intel® introduced integrated sensor hubs as a part of the SoC starting from
+Cherry Trail and now supported on multiple generations of CPU packages. There
+are many commercial devices already shipped with Integrated Sensor Hubs (ISH).
+These ISH also comply to HID sensor specification, but the  difference is the
+transport protocol used for communication. The current external sensor hubs
+mainly use HID over i2C or USB. But ISH doesn't use either i2c or USB.
+
+1. Overview
+===========
+
+Using a analogy with a usbhid implementation, the ISH follows a similar model
+for a very high speed communication::
+
+	-----------------		----------------------
+	|    USB HID	|	-->	|    ISH HID	     |
+	-----------------		----------------------
+	-----------------		----------------------
+	|  USB protocol	|	-->	|    ISH Transport   |
+	-----------------		----------------------
+	-----------------		----------------------
+	|  EHCI/XHCI	|	-->	|    ISH IPC	     |
+	-----------------		----------------------
+	      PCI				 PCI
+	-----------------		----------------------
+        |Host controller|	-->	|    ISH processor   |
+	-----------------		----------------------
+	     USB Link
+	-----------------		----------------------
+	| USB End points|	-->	|    ISH Clients     |
+	-----------------		----------------------
+
+Like USB protocol provides a method for device enumeration, link management
+and user data encapsulation, the ISH also provides similar services. But it is
+very light weight tailored to manage and communicate with ISH client
+applications implemented in the firmware.
+
+The ISH allows multiple sensor management applications executing in the
+firmware. Like USB endpoints the messaging can be to/from a client. As part of
+enumeration process, these clients are identified. These clients can be simple
+HID sensor applications, sensor calibration application or senor firmware
+update application.
+
+The implementation model is similar, like USB bus, ISH transport is also
+implemented as a bus. Each client application executing in the ISH processor
+is registered as a device on this bus. The driver, which binds each device
+(ISH HID driver) identifies the device type and registers with the hid core.
+
+2. ISH Implementation: Block Diagram
+====================================
+
+::
+
+	 ---------------------------
+	|  User Space Applications  |
+	 ---------------------------
+
+  ----------------IIO ABI----------------
+	 --------------------------
+	|  IIO Sensor Drivers	  |
+	 --------------------------
+	 --------------------------
+	|	 IIO core	  |
+	 --------------------------
+	 --------------------------
+	|   HID Sensor Hub MFD	  |
+	 --------------------------
+	 --------------------------
+	|       HID Core	  |
+	 --------------------------
+	 --------------------------
+	|   HID over ISH Client   |
+	 --------------------------
+	 --------------------------
+	|   ISH Transport (ISHTP) |
+	 --------------------------
+	 --------------------------
+	|      IPC Drivers	  |
+	 --------------------------
+  OS
+  ---------------- PCI -----------------
+  Hardware + Firmware
+	 ----------------------------
+	| ISH Hardware/Firmware(FW) |
+	 ----------------------------
+
+3. High level processing in above blocks
+========================================
+
+3.1 Hardware Interface
+----------------------
+
+The ISH is exposed as "Non-VGA unclassified PCI device" to the host. The PCI
+product and vendor IDs are changed from different generations of processors. So
+the source code which enumerate drivers needs to update from generation to
+generation.
+
+3.2 Inter Processor Communication (IPC) driver
+----------------------------------------------
+
+Location: drivers/hid/intel-ish-hid/ipc
+
+The IPC message used memory mapped I/O. The registers are defined in
+hw-ish-regs.h.
+
+3.2.1 IPC/FW message types
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are two types of messages, one for management of link and other messages
+are to and from transport layers.
+
+TX and RX of Transport messages
+...............................
+
+A set of memory mapped register offers support of multi byte messages TX and
+RX (E.g.IPC_REG_ISH2HOST_MSG, IPC_REG_HOST2ISH_MSG). The IPC layer maintains
+internal queues to sequence messages and send them in order to the FW.
+Optionally the caller can register handler to get notification of completion.
+A door bell mechanism is used in messaging to trigger processing in host and
+client firmware side. When ISH interrupt handler is called, the ISH2HOST
+doorbell register is used by host drivers to determine that the interrupt
+is for ISH.
+
+Each side has 32 32-bit message registers and a 32-bit doorbell. Doorbell
+register has the following format:
+Bits 0..6: fragment length (7 bits are used)
+Bits 10..13: encapsulated protocol
+Bits 16..19: management command (for IPC management protocol)
+Bit 31: doorbell trigger (signal H/W interrupt to the other side)
+Other bits are reserved, should be 0.
+
+3.2.2 Transport layer interface
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To abstract HW level IPC communication, a set of callbacks are registered.
+The transport layer uses them to send and receive messages.
+Refer to  struct ishtp_hw_ops for callbacks.
+
+3.3 ISH Transport layer
+-----------------------
+
+Location: drivers/hid/intel-ish-hid/ishtp/
+
+3.3.1 A Generic Transport Layer
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The transport layer is a bi-directional protocol, which defines:
+- Set of commands to start, stop, connect, disconnect and flow control
+(ishtp/hbm.h) for details
+- A flow control mechanism to avoid buffer overflows
+
+This protocol resembles bus messages described in the following document:
+http://www.intel.com/content/dam/www/public/us/en/documents/technical-\
+specifications/dcmi-hi-1-0-spec.pdf "Chapter 7: Bus Message Layer"
+
+3.3.2 Connection and Flow Control Mechanism
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Each FW client and a protocol is identified by an UUID. In order to communicate
+to a FW client, a connection must be established using connect request and
+response bus messages. If successful, a pair (host_client_id and fw_client_id)
+will identify the connection.
+
+Once connection is established, peers send each other flow control bus messages
+independently. Every peer may send a message only if it has received a
+flow-control credit before. Once it sent a message, it may not send another one
+before receiving the next flow control credit.
+Either side can send disconnect request bus message to end communication. Also
+the link will be dropped if major FW reset occurs.
+
+3.3.3 Peer to Peer data transfer
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Peer to Peer data transfer can happen with or without using DMA. Depending on
+the sensor bandwidth requirement DMA can be enabled by using module parameter
+ishtp_use_dma under intel_ishtp.
+
+Each side (host and FW) manages its DMA transfer memory independently. When an
+ISHTP client from either host or FW side wants to send something, it decides
+whether to send over IPC or over DMA; for each transfer the decision is
+independent. The sending side sends DMA_XFER message when the message is in
+the respective host buffer (TX when host client sends, RX when FW client
+sends). The recipient of DMA message responds with DMA_XFER_ACK, indicating
+the sender that the memory region for that message may be reused.
+
+DMA initialization is started with host sending DMA_ALLOC_NOTIFY bus message
+(that includes RX buffer) and FW responds with DMA_ALLOC_NOTIFY_ACK.
+Additionally to DMA address communication, this sequence checks capabilities:
+if thw host doesn't support DMA, then it won't send DMA allocation, so FW can't
+send DMA; if FW doesn't support DMA then it won't respond with
+DMA_ALLOC_NOTIFY_ACK, in which case host will not use DMA transfers.
+Here ISH acts as busmaster DMA controller. Hence when host sends DMA_XFER,
+it's request to do host->ISH DMA transfer; when FW sends DMA_XFER, it means
+that it already did DMA and the message resides at host. Thus, DMA_XFER
+and DMA_XFER_ACK act as ownership indicators.
+
+At initial state all outgoing memory belongs to the sender (TX to host, RX to
+FW), DMA_XFER transfers ownership on the region that contains ISHTP message to
+the receiving side, DMA_XFER_ACK returns ownership to the sender. A sender
+needs not wait for previous DMA_XFER to be ack'ed, and may send another message
+as long as remaining continuous memory in its ownership is enough.
+In principle, multiple DMA_XFER and DMA_XFER_ACK messages may be sent at once
+(up to IPC MTU), thus allowing for interrupt throttling.
+Currently, ISH FW decides to send over DMA if ISHTP message is more than 3 IPC
+fragments and via IPC otherwise.
+
+3.3.4 Ring Buffers
+^^^^^^^^^^^^^^^^^^
+
+When a client initiate a connection, a ring or RX and TX buffers are allocated.
+The size of ring can be specified by the client. HID client set 16 and 32 for
+TX and RX buffers respectively. On send request from client, the data to be
+sent is copied to one of the send ring buffer and scheduled to be sent using
+bus message protocol. These buffers are required because the FW may have not
+have processed the last message and may not have enough flow control credits
+to send. Same thing holds true on receive side and flow control is required.
+
+3.3.5 Host Enumeration
+^^^^^^^^^^^^^^^^^^^^^^
+
+The host enumeration bus command allow discovery of clients present in the FW.
+There can be multiple sensor clients and clients for calibration function.
+
+To ease in implantation and allow independent driver handle each client
+this transport layer takes advantage of Linux Bus driver model. Each
+client is registered as device on the the transport bus (ishtp bus).
+
+Enumeration sequence of messages:
+
+- Host sends HOST_START_REQ_CMD, indicating that host ISHTP layer is up.
+- FW responds with HOST_START_RES_CMD
+- Host sends HOST_ENUM_REQ_CMD (enumerate FW clients)
+- FW responds with HOST_ENUM_RES_CMD that includes bitmap of available FW
+  client IDs
+- For each FW ID found in that bitmap host sends
+  HOST_CLIENT_PROPERTIES_REQ_CMD
+- FW responds with HOST_CLIENT_PROPERTIES_RES_CMD. Properties include UUID,
+  max ISHTP message size, etc.
+- Once host received properties for that last discovered client, it considers
+  ISHTP device fully functional (and allocates DMA buffers)
+
+3.4 HID over ISH Client
+-----------------------
+
+Location: drivers/hid/intel-ish-hid
+
+The ISHTP client driver is responsible for:
+
+- enumerate HID devices under FW ISH client
+- Get Report descriptor
+- Register with HID core as a LL driver
+- Process Get/Set feature request
+- Get input reports
+
+3.5 HID Sensor Hub MFD and IIO sensor drivers
+---------------------------------------------
+
+The functionality in these drivers is the same as an external sensor hub.
+Refer to
+Documentation/hid/hid-sensor.rst for HID sensor
+Documentation/ABI/testing/sysfs-bus-iio for IIO ABIs to user space
+
+3.6 End to End HID transport Sequence Diagram
+---------------------------------------------
+
+::
+
+  HID-ISH-CLN                    ISHTP                    IPC                             HW
+          |                        |                       |                               |
+          |                        |                       |-----WAKE UP------------------>|
+          |                        |                       |                               |
+          |                        |                       |-----HOST READY--------------->|
+          |                        |                       |                               |
+          |                        |                       |<----MNG_RESET_NOTIFY_ACK----- |
+          |                        |                       |                               |
+          |                        |<----ISHTP_START------ |                               |
+          |                        |                       |                               |
+          |                        |<-----------------HOST_START_RES_CMD-------------------|
+          |                        |                       |                               |
+          |                        |------------------QUERY_SUBSCRIBER-------------------->|
+          |                        |                       |                               |
+          |                        |------------------HOST_ENUM_REQ_CMD------------------->|
+          |                        |                       |                               |
+          |                        |<-----------------HOST_ENUM_RES_CMD--------------------|
+          |                        |                       |                               |
+          |                        |------------------HOST_CLIENT_PROPERTIES_REQ_CMD------>|
+          |                        |                       |                               |
+          |                        |<-----------------HOST_CLIENT_PROPERTIES_RES_CMD-------|
+          |       Create new device on in ishtp bus        |                               |
+          |                        |                       |                               |
+          |                        |------------------HOST_CLIENT_PROPERTIES_REQ_CMD------>|
+          |                        |                       |                               |
+          |                        |<-----------------HOST_CLIENT_PROPERTIES_RES_CMD-------|
+          |       Create new device on in ishtp bus        |                               |
+          |                        |                       |                               |
+          |                        |--Repeat HOST_CLIENT_PROPERTIES_REQ_CMD-till last one--|
+          |                        |                       |                               |
+       probed()
+          |----ishtp_cl_connect--->|----------------- CLIENT_CONNECT_REQ_CMD-------------->|
+          |                        |                       |                               |
+          |                        |<----------------CLIENT_CONNECT_RES_CMD----------------|
+          |                        |                       |                               |
+          |register event callback |                       |                               |
+          |                        |                       |                               |
+          |ishtp_cl_send(
+          HOSTIF_DM_ENUM_DEVICES)  |----------fill ishtp_msg_hdr struct write to HW-----  >|
+          |                        |                       |                               |
+          |                        |                       |<-----IRQ(IPC_PROTOCOL_ISHTP---|
+          |                        |                       |                               |
+          |<--ENUM_DEVICE RSP------|                       |                               |
+          |                        |                       |                               |
+  for each enumerated device
+          |ishtp_cl_send(
+          HOSTIF_GET_HID_DESCRIPTOR|----------fill ishtp_msg_hdr struct write to HW-----  >|
+          |                        |                       |                               |
+          ...Response
+          |                        |                       |                               |
+  for each enumerated device
+          |ishtp_cl_send(
+       HOSTIF_GET_REPORT_DESCRIPTOR|--------------fill ishtp_msg_hdr struct write to HW-- >|
+          |                        |                       |                               |
+          |                        |                       |                               |
+   hid_allocate_device
+          |                        |                       |                               |
+   hid_add_device                  |                       |                               |
+          |                        |                       |                               |
+
+
+3.7 ISH Debugging
+-----------------
+
+To debug ISH, event tracing mechanism is used. To enable debug logs
+echo 1 > /sys/kernel/debug/tracing/events/intel_ish/enable
+cat sys/kernel/debug/tracing/trace
+
+3.8 ISH IIO sysfs Example on Lenovo thinkpad Yoga 260
+-----------------------------------------------------
+
+::
+
+  root@otcpl-ThinkPad-Yoga-260:~# tree -l /sys/bus/iio/devices/
+  /sys/bus/iio/devices/
+  ├── iio:device0 -> ../../../devices/0044:8086:22D8.0001/HID-SENSOR-200073.9.auto/iio:device0
+  │   ├── buffer
+  │   │   ├── enable
+  │   │   ├── length
+  │   │   └── watermark
+  ...
+  │   ├── in_accel_hysteresis
+  │   ├── in_accel_offset
+  │   ├── in_accel_sampling_frequency
+  │   ├── in_accel_scale
+  │   ├── in_accel_x_raw
+  │   ├── in_accel_y_raw
+  │   ├── in_accel_z_raw
+  │   ├── name
+  │   ├── scan_elements
+  │   │   ├── in_accel_x_en
+  │   │   ├── in_accel_x_index
+  │   │   ├── in_accel_x_type
+  │   │   ├── in_accel_y_en
+  │   │   ├── in_accel_y_index
+  │   │   ├── in_accel_y_type
+  │   │   ├── in_accel_z_en
+  │   │   ├── in_accel_z_index
+  │   │   └── in_accel_z_type
+  ...
+  │   │   ├── devices
+  │   │   │   │   ├── buffer
+  │   │   │   │   │   ├── enable
+  │   │   │   │   │   ├── length
+  │   │   │   │   │   └── watermark
+  │   │   │   │   ├── dev
+  │   │   │   │   ├── in_intensity_both_raw
+  │   │   │   │   ├── in_intensity_hysteresis
+  │   │   │   │   ├── in_intensity_offset
+  │   │   │   │   ├── in_intensity_sampling_frequency
+  │   │   │   │   ├── in_intensity_scale
+  │   │   │   │   ├── name
+  │   │   │   │   ├── scan_elements
+  │   │   │   │   │   ├── in_intensity_both_en
+  │   │   │   │   │   ├── in_intensity_both_index
+  │   │   │   │   │   └── in_intensity_both_type
+  │   │   │   │   ├── trigger
+  │   │   │   │   │   └── current_trigger
+  ...
+  │   │   │   │   ├── buffer
+  │   │   │   │   │   ├── enable
+  │   │   │   │   │   ├── length
+  │   │   │   │   │   └── watermark
+  │   │   │   │   ├── dev
+  │   │   │   │   ├── in_magn_hysteresis
+  │   │   │   │   ├── in_magn_offset
+  │   │   │   │   ├── in_magn_sampling_frequency
+  │   │   │   │   ├── in_magn_scale
+  │   │   │   │   ├── in_magn_x_raw
+  │   │   │   │   ├── in_magn_y_raw
+  │   │   │   │   ├── in_magn_z_raw
+  │   │   │   │   ├── in_rot_from_north_magnetic_tilt_comp_raw
+  │   │   │   │   ├── in_rot_hysteresis
+  │   │   │   │   ├── in_rot_offset
+  │   │   │   │   ├── in_rot_sampling_frequency
+  │   │   │   │   ├── in_rot_scale
+  │   │   │   │   ├── name
+  ...
+  │   │   │   │   ├── scan_elements
+  │   │   │   │   │   ├── in_magn_x_en
+  │   │   │   │   │   ├── in_magn_x_index
+  │   │   │   │   │   ├── in_magn_x_type
+  │   │   │   │   │   ├── in_magn_y_en
+  │   │   │   │   │   ├── in_magn_y_index
+  │   │   │   │   │   ├── in_magn_y_type
+  │   │   │   │   │   ├── in_magn_z_en
+  │   │   │   │   │   ├── in_magn_z_index
+  │   │   │   │   │   ├── in_magn_z_type
+  │   │   │   │   │   ├── in_rot_from_north_magnetic_tilt_comp_en
+  │   │   │   │   │   ├── in_rot_from_north_magnetic_tilt_comp_index
+  │   │   │   │   │   └── in_rot_from_north_magnetic_tilt_comp_type
+  │   │   │   │   ├── trigger
+  │   │   │   │   │   └── current_trigger
+  ...
+  │   │   │   │   ├── buffer
+  │   │   │   │   │   ├── enable
+  │   │   │   │   │   ├── length
+  │   │   │   │   │   └── watermark
+  │   │   │   │   ├── dev
+  │   │   │   │   ├── in_anglvel_hysteresis
+  │   │   │   │   ├── in_anglvel_offset
+  │   │   │   │   ├── in_anglvel_sampling_frequency
+  │   │   │   │   ├── in_anglvel_scale
+  │   │   │   │   ├── in_anglvel_x_raw
+  │   │   │   │   ├── in_anglvel_y_raw
+  │   │   │   │   ├── in_anglvel_z_raw
+  │   │   │   │   ├── name
+  │   │   │   │   ├── scan_elements
+  │   │   │   │   │   ├── in_anglvel_x_en
+  │   │   │   │   │   ├── in_anglvel_x_index
+  │   │   │   │   │   ├── in_anglvel_x_type
+  │   │   │   │   │   ├── in_anglvel_y_en
+  │   │   │   │   │   ├── in_anglvel_y_index
+  │   │   │   │   │   ├── in_anglvel_y_type
+  │   │   │   │   │   ├── in_anglvel_z_en
+  │   │   │   │   │   ├── in_anglvel_z_index
+  │   │   │   │   │   └── in_anglvel_z_type
+  │   │   │   │   ├── trigger
+  │   │   │   │   │   └── current_trigger
+  ...
+  │   │   │   │   ├── buffer
+  │   │   │   │   │   ├── enable
+  │   │   │   │   │   ├── length
+  │   │   │   │   │   └── watermark
+  │   │   │   │   ├── dev
+  │   │   │   │   ├── in_anglvel_hysteresis
+  │   │   │   │   ├── in_anglvel_offset
+  │   │   │   │   ├── in_anglvel_sampling_frequency
+  │   │   │   │   ├── in_anglvel_scale
+  │   │   │   │   ├── in_anglvel_x_raw
+  │   │   │   │   ├── in_anglvel_y_raw
+  │   │   │   │   ├── in_anglvel_z_raw
+  │   │   │   │   ├── name
+  │   │   │   │   ├── scan_elements
+  │   │   │   │   │   ├── in_anglvel_x_en
+  │   │   │   │   │   ├── in_anglvel_x_index
+  │   │   │   │   │   ├── in_anglvel_x_type
+  │   │   │   │   │   ├── in_anglvel_y_en
+  │   │   │   │   │   ├── in_anglvel_y_index
+  │   │   │   │   │   ├── in_anglvel_y_type
+  │   │   │   │   │   ├── in_anglvel_z_en
+  │   │   │   │   │   ├── in_anglvel_z_index
+  │   │   │   │   │   └── in_anglvel_z_type
+  │   │   │   │   ├── trigger
+  │   │   │   │   │   └── current_trigger
+  ...
diff --git a/Documentation/hid/intel-ish-hid.txt b/Documentation/hid/intel-ish-hid.txt
deleted file mode 100644
index d48b21c71ddd..000000000000
--- a/Documentation/hid/intel-ish-hid.txt
+++ /dev/null
@@ -1,454 +0,0 @@
-Intel Integrated Sensor Hub (ISH)
-===============================
-
-A sensor hub enables the ability to offload sensor polling and algorithm
-processing to a dedicated low power co-processor. This allows the core
-processor to go into low power modes more often, resulting in the increased
-battery life.
-
-There are many vendors providing external sensor hubs confirming to HID
-Sensor usage tables, and used in several tablets, 2 in 1 convertible laptops
-and embedded products. Linux had this support since Linux 3.9.
-
-Intel® introduced integrated sensor hubs as a part of the SoC starting from
-Cherry Trail and now supported on multiple generations of CPU packages. There
-are many commercial devices already shipped with Integrated Sensor Hubs (ISH).
-These ISH also comply to HID sensor specification, but the  difference is the
-transport protocol used for communication. The current external sensor hubs
-mainly use HID over i2C or USB. But ISH doesn't use either i2c or USB.
-
-1. Overview
-
-Using a analogy with a usbhid implementation, the ISH follows a similar model
-for a very high speed communication:
-
-	-----------------		----------------------
-	|    USB HID	|	-->	|    ISH HID	     |
-	-----------------		----------------------
-	-----------------		----------------------
-	|  USB protocol	|	-->	|    ISH Transport   |
-	-----------------		----------------------
-	-----------------		----------------------
-	|  EHCI/XHCI	|	-->	|    ISH IPC	     |
-	-----------------		----------------------
-	      PCI				 PCI
-	-----------------		----------------------
-        |Host controller|	-->	|    ISH processor   |
-	-----------------		----------------------
-	     USB Link
-	-----------------		----------------------
-	| USB End points|	-->	|    ISH Clients     |
-	-----------------		----------------------
-
-Like USB protocol provides a method for device enumeration, link management
-and user data encapsulation, the ISH also provides similar services. But it is
-very light weight tailored to manage and communicate with ISH client
-applications implemented in the firmware.
-
-The ISH allows multiple sensor management applications executing in the
-firmware. Like USB endpoints the messaging can be to/from a client. As part of
-enumeration process, these clients are identified. These clients can be simple
-HID sensor applications, sensor calibration application or senor firmware
-update application.
-
-The implementation model is similar, like USB bus, ISH transport is also
-implemented as a bus. Each client application executing in the ISH processor
-is registered as a device on this bus. The driver, which binds each device
-(ISH HID driver) identifies the device type and registers with the hid core.
-
-2. ISH Implementation: Block Diagram
-
-	 ---------------------------
-	|  User Space Applications  |
-	 ---------------------------
-
-----------------IIO ABI----------------
-	 --------------------------
-	|  IIO Sensor Drivers	  |
-	 --------------------------
-	 --------------------------
-	|	 IIO core	  |
-	 --------------------------
-	 --------------------------
-	|   HID Sensor Hub MFD	  |
-	 --------------------------
-	 --------------------------
-	|       HID Core	  |
-	 --------------------------
-	 --------------------------
-	|   HID over ISH Client   |
-	 --------------------------
-	 --------------------------
-	|   ISH Transport (ISHTP) |
-	 --------------------------
-	 --------------------------
-	|      IPC Drivers	  |
-	 --------------------------
-OS
-----------------   PCI -----------------
-Hardware + Firmware
-	 ----------------------------
-	| ISH Hardware/Firmware(FW) |
-	 ----------------------------
-
-3. High level processing in above blocks
-
-3.1 Hardware Interface
-
-The ISH is exposed as "Non-VGA unclassified PCI device" to the host. The PCI
-product and vendor IDs are changed from different generations of processors. So
-the source code which enumerate drivers needs to update from generation to
-generation.
-
-3.2 Inter Processor Communication (IPC) driver
-Location: drivers/hid/intel-ish-hid/ipc
-
-The IPC message used memory mapped I/O. The registers are defined in
-hw-ish-regs.h.
-
-3.2.1 IPC/FW message types
-
-There are two types of messages, one for management of link and other messages
-are to and from transport layers.
-
-TX and RX of Transport messages
-
-A set of memory mapped register offers support of multi byte messages TX and
-RX (E.g.IPC_REG_ISH2HOST_MSG, IPC_REG_HOST2ISH_MSG). The IPC layer maintains
-internal queues to sequence messages and send them in order to the FW.
-Optionally the caller can register handler to get notification of completion.
-A door bell mechanism is used in messaging to trigger processing in host and
-client firmware side. When ISH interrupt handler is called, the ISH2HOST
-doorbell register is used by host drivers to determine that the interrupt
-is for ISH.
-
-Each side has 32 32-bit message registers and a 32-bit doorbell. Doorbell
-register has the following format:
-Bits 0..6: fragment length (7 bits are used)
-Bits 10..13: encapsulated protocol
-Bits 16..19: management command (for IPC management protocol)
-Bit 31: doorbell trigger (signal H/W interrupt to the other side)
-Other bits are reserved, should be 0.
-
-3.2.2 Transport layer interface
-
-To abstract HW level IPC communication, a set of callbacks are registered.
-The transport layer uses them to send and receive messages.
-Refer to  struct ishtp_hw_ops for callbacks.
-
-3.3 ISH Transport layer
-Location: drivers/hid/intel-ish-hid/ishtp/
-
-3.3.1 A Generic Transport Layer
-
-The transport layer is a bi-directional protocol, which defines:
-- Set of commands to start, stop, connect, disconnect and flow control
-(ishtp/hbm.h) for details
-- A flow control mechanism to avoid buffer overflows
-
-This protocol resembles bus messages described in the following document:
-http://www.intel.com/content/dam/www/public/us/en/documents/technical-\
-specifications/dcmi-hi-1-0-spec.pdf "Chapter 7: Bus Message Layer"
-
-3.3.2 Connection and Flow Control Mechanism
-
-Each FW client and a protocol is identified by an UUID. In order to communicate
-to a FW client, a connection must be established using connect request and
-response bus messages. If successful, a pair (host_client_id and fw_client_id)
-will identify the connection.
-
-Once connection is established, peers send each other flow control bus messages
-independently. Every peer may send a message only if it has received a
-flow-control credit before. Once it sent a message, it may not send another one
-before receiving the next flow control credit.
-Either side can send disconnect request bus message to end communication. Also
-the link will be dropped if major FW reset occurs.
-
-3.3.3 Peer to Peer data transfer
-
-Peer to Peer data transfer can happen with or without using DMA. Depending on
-the sensor bandwidth requirement DMA can be enabled by using module parameter
-ishtp_use_dma under intel_ishtp.
-
-Each side (host and FW) manages its DMA transfer memory independently. When an
-ISHTP client from either host or FW side wants to send something, it decides
-whether to send over IPC or over DMA; for each transfer the decision is
-independent. The sending side sends DMA_XFER message when the message is in
-the respective host buffer (TX when host client sends, RX when FW client
-sends). The recipient of DMA message responds with DMA_XFER_ACK, indicating
-the sender that the memory region for that message may be reused.
-
-DMA initialization is started with host sending DMA_ALLOC_NOTIFY bus message
-(that includes RX buffer) and FW responds with DMA_ALLOC_NOTIFY_ACK.
-Additionally to DMA address communication, this sequence checks capabilities:
-if thw host doesn't support DMA, then it won't send DMA allocation, so FW can't
-send DMA; if FW doesn't support DMA then it won't respond with
-DMA_ALLOC_NOTIFY_ACK, in which case host will not use DMA transfers.
-Here ISH acts as busmaster DMA controller. Hence when host sends DMA_XFER,
-it's request to do host->ISH DMA transfer; when FW sends DMA_XFER, it means
-that it already did DMA and the message resides at host. Thus, DMA_XFER
-and DMA_XFER_ACK act as ownership indicators.
-
-At initial state all outgoing memory belongs to the sender (TX to host, RX to
-FW), DMA_XFER transfers ownership on the region that contains ISHTP message to
-the receiving side, DMA_XFER_ACK returns ownership to the sender. A sender
-needs not wait for previous DMA_XFER to be ack'ed, and may send another message
-as long as remaining continuous memory in its ownership is enough.
-In principle, multiple DMA_XFER and DMA_XFER_ACK messages may be sent at once
-(up to IPC MTU), thus allowing for interrupt throttling.
-Currently, ISH FW decides to send over DMA if ISHTP message is more than 3 IPC
-fragments and via IPC otherwise.
-
-3.3.4 Ring Buffers
-
-When a client initiate a connection, a ring or RX and TX buffers are allocated.
-The size of ring can be specified by the client. HID client set 16 and 32 for
-TX and RX buffers respectively. On send request from client, the data to be
-sent is copied to one of the send ring buffer and scheduled to be sent using
-bus message protocol. These buffers are required because the FW may have not
-have processed the last message and may not have enough flow control credits
-to send. Same thing holds true on receive side and flow control is required.
-
-3.3.5 Host Enumeration
-
-The host enumeration bus command allow discovery of clients present in the FW.
-There can be multiple sensor clients and clients for calibration function.
-
-To ease in implantation and allow independent driver handle each client
-this transport layer takes advantage of Linux Bus driver model. Each
-client is registered as device on the the transport bus (ishtp bus).
-
-Enumeration sequence of messages:
-- Host sends HOST_START_REQ_CMD, indicating that host ISHTP layer is up.
-- FW responds with HOST_START_RES_CMD
-- Host sends HOST_ENUM_REQ_CMD (enumerate FW clients)
-- FW responds with HOST_ENUM_RES_CMD that includes bitmap of available FW
-client IDs
-- For each FW ID found in that bitmap host sends
-HOST_CLIENT_PROPERTIES_REQ_CMD
-- FW responds with HOST_CLIENT_PROPERTIES_RES_CMD. Properties include UUID,
-max ISHTP message size, etc.
-- Once host received properties for that last discovered client, it considers
-ISHTP device fully functional (and allocates DMA buffers)
-
-3.4 HID over ISH Client
-Location: drivers/hid/intel-ish-hid
-
-The ISHTP client driver is responsible for:
-- enumerate HID devices under FW ISH client
-- Get Report descriptor
-- Register with HID core as a LL driver
-- Process Get/Set feature request
-- Get input reports
-
-3.5 HID Sensor Hub MFD and IIO sensor drivers
-
-The functionality in these drivers is the same as an external sensor hub.
-Refer to
-Documentation/hid/hid-sensor.txt for HID sensor
-Documentation/ABI/testing/sysfs-bus-iio for IIO ABIs to user space
-
-3.6 End to End HID transport Sequence Diagram
-
-HID-ISH-CLN			ISHTP			IPC				HW
-	|			|			|				|
-	|			|			|-----WAKE UP------------------>|
-	|			|			|				|
-	|			|			|-----HOST READY--------------->|
-	|			|			|				|
-	|			|			|<----MNG_RESET_NOTIFY_ACK----- |
-	|			|			|				|
-	|			|<----ISHTP_START------ |				|
-	|			|			|				|
-	|			|<-----------------HOST_START_RES_CMD-------------------|
-	|			|			|				|
-	|			|------------------QUERY_SUBSCRIBER-------------------->|
-	|			|			|				|
-	|			|------------------HOST_ENUM_REQ_CMD------------------->|
-	|			|			|				|
-	|			|<-----------------HOST_ENUM_RES_CMD--------------------|
-	|			|			|				|
-	|			|------------------HOST_CLIENT_PROPERTIES_REQ_CMD------>|
-	|			|			|				|
-	|			|<-----------------HOST_CLIENT_PROPERTIES_RES_CMD-------|
-	|	Create new device on in ishtp bus	|				|
-	|			|			|				|
-	|			|------------------HOST_CLIENT_PROPERTIES_REQ_CMD------>|
-	|			|			|				|
-	|			|<-----------------HOST_CLIENT_PROPERTIES_RES_CMD-------|
-	|	Create new device on in ishtp bus	|				|
-	|			|			|				|
-	|			|--Repeat HOST_CLIENT_PROPERTIES_REQ_CMD-till last one--|
-	|			|			|				|
-     probed()
-	|----ishtp_cl_connect-->|----------------- CLIENT_CONNECT_REQ_CMD-------------->|
-	|			|			|				|
-	|			|<----------------CLIENT_CONNECT_RES_CMD----------------|
-	|			|			|				|
-	|register event callback|			|				|
-	|			|			|				|
-	|ishtp_cl_send(
-	HOSTIF_DM_ENUM_DEVICES) |----------fill ishtp_msg_hdr struct write to HW-----  >|
-	|			|			|				|
-	|			|			|<-----IRQ(IPC_PROTOCOL_ISHTP---|
-	|			|			|				|
-	|<--ENUM_DEVICE RSP-----|			|				|
-	|			|			|				|
-for each enumerated device
-	|ishtp_cl_send(
-	HOSTIF_GET_HID_DESCRIPTOR |----------fill ishtp_msg_hdr struct write to HW---  >|
-	|			|			|				|
-	...Response
-	|			|			|				|
-for each enumerated device
-	|ishtp_cl_send(
-	HOSTIF_GET_REPORT_DESCRIPTOR |----------fill ishtp_msg_hdr struct write to HW- >|
-	|			|			|				|
-	|			|			|				|
- hid_allocate_device
-	|			|			|				|
- hid_add_device			|			|				|
-	|			|			|				|
-
-
-3.7 ISH Debugging
-
-To debug ISH, event tracing mechanism is used. To enable debug logs
-echo 1 > /sys/kernel/debug/tracing/events/intel_ish/enable
-cat sys/kernel/debug/tracing/trace
-
-3.8 ISH IIO sysfs Example on Lenovo thinkpad Yoga 260
-
-root@otcpl-ThinkPad-Yoga-260:~# tree -l /sys/bus/iio/devices/
-/sys/bus/iio/devices/
-├── iio:device0 -> ../../../devices/0044:8086:22D8.0001/HID-SENSOR-200073.9.auto/iio:device0
-│   ├── buffer
-│   │   ├── enable
-│   │   ├── length
-│   │   └── watermark
-...
-│   ├── in_accel_hysteresis
-│   ├── in_accel_offset
-│   ├── in_accel_sampling_frequency
-│   ├── in_accel_scale
-│   ├── in_accel_x_raw
-│   ├── in_accel_y_raw
-│   ├── in_accel_z_raw
-│   ├── name
-│   ├── scan_elements
-│   │   ├── in_accel_x_en
-│   │   ├── in_accel_x_index
-│   │   ├── in_accel_x_type
-│   │   ├── in_accel_y_en
-│   │   ├── in_accel_y_index
-│   │   ├── in_accel_y_type
-│   │   ├── in_accel_z_en
-│   │   ├── in_accel_z_index
-│   │   └── in_accel_z_type
-...
-│   │   ├── devices
-│   │   │   │   ├── buffer
-│   │   │   │   │   ├── enable
-│   │   │   │   │   ├── length
-│   │   │   │   │   └── watermark
-│   │   │   │   ├── dev
-│   │   │   │   ├── in_intensity_both_raw
-│   │   │   │   ├── in_intensity_hysteresis
-│   │   │   │   ├── in_intensity_offset
-│   │   │   │   ├── in_intensity_sampling_frequency
-│   │   │   │   ├── in_intensity_scale
-│   │   │   │   ├── name
-│   │   │   │   ├── scan_elements
-│   │   │   │   │   ├── in_intensity_both_en
-│   │   │   │   │   ├── in_intensity_both_index
-│   │   │   │   │   └── in_intensity_both_type
-│   │   │   │   ├── trigger
-│   │   │   │   │   └── current_trigger
-...
-│   │   │   │   ├── buffer
-│   │   │   │   │   ├── enable
-│   │   │   │   │   ├── length
-│   │   │   │   │   └── watermark
-│   │   │   │   ├── dev
-│   │   │   │   ├── in_magn_hysteresis
-│   │   │   │   ├── in_magn_offset
-│   │   │   │   ├── in_magn_sampling_frequency
-│   │   │   │   ├── in_magn_scale
-│   │   │   │   ├── in_magn_x_raw
-│   │   │   │   ├── in_magn_y_raw
-│   │   │   │   ├── in_magn_z_raw
-│   │   │   │   ├── in_rot_from_north_magnetic_tilt_comp_raw
-│   │   │   │   ├── in_rot_hysteresis
-│   │   │   │   ├── in_rot_offset
-│   │   │   │   ├── in_rot_sampling_frequency
-│   │   │   │   ├── in_rot_scale
-│   │   │   │   ├── name
-...
-│   │   │   │   ├── scan_elements
-│   │   │   │   │   ├── in_magn_x_en
-│   │   │   │   │   ├── in_magn_x_index
-│   │   │   │   │   ├── in_magn_x_type
-│   │   │   │   │   ├── in_magn_y_en
-│   │   │   │   │   ├── in_magn_y_index
-│   │   │   │   │   ├── in_magn_y_type
-│   │   │   │   │   ├── in_magn_z_en
-│   │   │   │   │   ├── in_magn_z_index
-│   │   │   │   │   ├── in_magn_z_type
-│   │   │   │   │   ├── in_rot_from_north_magnetic_tilt_comp_en
-│   │   │   │   │   ├── in_rot_from_north_magnetic_tilt_comp_index
-│   │   │   │   │   └── in_rot_from_north_magnetic_tilt_comp_type
-│   │   │   │   ├── trigger
-│   │   │   │   │   └── current_trigger
-...
-│   │   │   │   ├── buffer
-│   │   │   │   │   ├── enable
-│   │   │   │   │   ├── length
-│   │   │   │   │   └── watermark
-│   │   │   │   ├── dev
-│   │   │   │   ├── in_anglvel_hysteresis
-│   │   │   │   ├── in_anglvel_offset
-│   │   │   │   ├── in_anglvel_sampling_frequency
-│   │   │   │   ├── in_anglvel_scale
-│   │   │   │   ├── in_anglvel_x_raw
-│   │   │   │   ├── in_anglvel_y_raw
-│   │   │   │   ├── in_anglvel_z_raw
-│   │   │   │   ├── name
-│   │   │   │   ├── scan_elements
-│   │   │   │   │   ├── in_anglvel_x_en
-│   │   │   │   │   ├── in_anglvel_x_index
-│   │   │   │   │   ├── in_anglvel_x_type
-│   │   │   │   │   ├── in_anglvel_y_en
-│   │   │   │   │   ├── in_anglvel_y_index
-│   │   │   │   │   ├── in_anglvel_y_type
-│   │   │   │   │   ├── in_anglvel_z_en
-│   │   │   │   │   ├── in_anglvel_z_index
-│   │   │   │   │   └── in_anglvel_z_type
-│   │   │   │   ├── trigger
-│   │   │   │   │   └── current_trigger
-...
-│   │   │   │   ├── buffer
-│   │   │   │   │   ├── enable
-│   │   │   │   │   ├── length
-│   │   │   │   │   └── watermark
-│   │   │   │   ├── dev
-│   │   │   │   ├── in_anglvel_hysteresis
-│   │   │   │   ├── in_anglvel_offset
-│   │   │   │   ├── in_anglvel_sampling_frequency
-│   │   │   │   ├── in_anglvel_scale
-│   │   │   │   ├── in_anglvel_x_raw
-│   │   │   │   ├── in_anglvel_y_raw
-│   │   │   │   ├── in_anglvel_z_raw
-│   │   │   │   ├── name
-│   │   │   │   ├── scan_elements
-│   │   │   │   │   ├── in_anglvel_x_en
-│   │   │   │   │   ├── in_anglvel_x_index
-│   │   │   │   │   ├── in_anglvel_x_type
-│   │   │   │   │   ├── in_anglvel_y_en
-│   │   │   │   │   ├── in_anglvel_y_index
-│   │   │   │   │   ├── in_anglvel_y_type
-│   │   │   │   │   ├── in_anglvel_z_en
-│   │   │   │   │   ├── in_anglvel_z_index
-│   │   │   │   │   └── in_anglvel_z_type
-│   │   │   │   ├── trigger
-│   │   │   │   │   └── current_trigger
-...
diff --git a/Documentation/hid/uhid.rst b/Documentation/hid/uhid.rst
new file mode 100644
index 000000000000..b18cb96c885f
--- /dev/null
+++ b/Documentation/hid/uhid.rst
@@ -0,0 +1,193 @@
+======================================================
+UHID - User-space I/O driver support for HID subsystem
+======================================================
+
+UHID allows user-space to implement HID transport drivers. Please see
+hid-transport.txt for an introduction into HID transport drivers. This document
+relies heavily on the definitions declared there.
+
+With UHID, a user-space transport driver can create kernel hid-devices for each
+device connected to the user-space controlled bus. The UHID API defines the I/O
+events provided from the kernel to user-space and vice versa.
+
+There is an example user-space application in ./samples/uhid/uhid-example.c
+
+The UHID API
+------------
+
+UHID is accessed through a character misc-device. The minor-number is allocated
+dynamically so you need to rely on udev (or similar) to create the device node.
+This is /dev/uhid by default.
+
+If a new device is detected by your HID I/O Driver and you want to register this
+device with the HID subsystem, then you need to open /dev/uhid once for each
+device you want to register. All further communication is done by read()'ing or
+write()'ing "struct uhid_event" objects. Non-blocking operations are supported
+by setting O_NONBLOCK::
+
+  struct uhid_event {
+        __u32 type;
+        union {
+                struct uhid_create2_req create2;
+                struct uhid_output_req output;
+                struct uhid_input2_req input2;
+                ...
+        } u;
+  };
+
+The "type" field contains the ID of the event. Depending on the ID different
+payloads are sent. You must not split a single event across multiple read()'s or
+multiple write()'s. A single event must always be sent as a whole. Furthermore,
+only a single event can be sent per read() or write(). Pending data is ignored.
+If you want to handle multiple events in a single syscall, then use vectored
+I/O with readv()/writev().
+The "type" field defines the payload. For each type, there is a
+payload-structure available in the union "u" (except for empty payloads). This
+payload contains management and/or device data.
+
+The first thing you should do is sending an UHID_CREATE2 event. This will
+register the device. UHID will respond with an UHID_START event. You can now
+start sending data to and reading data from UHID. However, unless UHID sends the
+UHID_OPEN event, the internally attached HID Device Driver has no user attached.
+That is, you might put your device asleep unless you receive the UHID_OPEN
+event. If you receive the UHID_OPEN event, you should start I/O. If the last
+user closes the HID device, you will receive an UHID_CLOSE event. This may be
+followed by an UHID_OPEN event again and so on. There is no need to perform
+reference-counting in user-space. That is, you will never receive multiple
+UHID_OPEN events without an UHID_CLOSE event. The HID subsystem performs
+ref-counting for you.
+You may decide to ignore UHID_OPEN/UHID_CLOSE, though. I/O is allowed even
+though the device may have no users.
+
+If you want to send data on the interrupt channel to the HID subsystem, you send
+an HID_INPUT2 event with your raw data payload. If the kernel wants to send data
+on the interrupt channel to the device, you will read an UHID_OUTPUT event.
+Data requests on the control channel are currently limited to GET_REPORT and
+SET_REPORT (no other data reports on the control channel are defined so far).
+Those requests are always synchronous. That means, the kernel sends
+UHID_GET_REPORT and UHID_SET_REPORT events and requires you to forward them to
+the device on the control channel. Once the device responds, you must forward
+the response via UHID_GET_REPORT_REPLY and UHID_SET_REPORT_REPLY to the kernel.
+The kernel blocks internal driver-execution during such round-trips (times out
+after a hard-coded period).
+
+If your device disconnects, you should send an UHID_DESTROY event. This will
+unregister the device. You can now send UHID_CREATE2 again to register a new
+device.
+If you close() the fd, the device is automatically unregistered and destroyed
+internally.
+
+write()
+-------
+write() allows you to modify the state of the device and feed input data into
+the kernel. The kernel will parse the event immediately and if the event ID is
+not supported, it will return -EOPNOTSUPP. If the payload is invalid, then
+-EINVAL is returned, otherwise, the amount of data that was read is returned and
+the request was handled successfully. O_NONBLOCK does not affect write() as
+writes are always handled immediately in a non-blocking fashion. Future requests
+might make use of O_NONBLOCK, though.
+
+UHID_CREATE2:
+  This creates the internal HID device. No I/O is possible until you send this
+  event to the kernel. The payload is of type struct uhid_create2_req and
+  contains information about your device. You can start I/O now.
+
+UHID_DESTROY:
+  This destroys the internal HID device. No further I/O will be accepted. There
+  may still be pending messages that you can receive with read() but no further
+  UHID_INPUT events can be sent to the kernel.
+  You can create a new device by sending UHID_CREATE2 again. There is no need to
+  reopen the character device.
+
+UHID_INPUT2:
+  You must send UHID_CREATE2 before sending input to the kernel! This event
+  contains a data-payload. This is the raw data that you read from your device
+  on the interrupt channel. The kernel will parse the HID reports.
+
+UHID_GET_REPORT_REPLY:
+  If you receive a UHID_GET_REPORT request you must answer with this request.
+  You  must copy the "id" field from the request into the answer. Set the "err"
+  field to 0 if no error occurred or to EIO if an I/O error occurred.
+  If "err" is 0 then you should fill the buffer of the answer with the results
+  of the GET_REPORT request and set "size" correspondingly.
+
+UHID_SET_REPORT_REPLY:
+  This is the SET_REPORT equivalent of UHID_GET_REPORT_REPLY. Unlike GET_REPORT,
+  SET_REPORT never returns a data buffer, therefore, it's sufficient to set the
+  "id" and "err" fields correctly.
+
+read()
+------
+read() will return a queued output report. No reaction is required to any of
+them but you should handle them according to your needs.
+
+UHID_START:
+  This is sent when the HID device is started. Consider this as an answer to
+  UHID_CREATE2. This is always the first event that is sent. Note that this
+  event might not be available immediately after write(UHID_CREATE2) returns.
+  Device drivers might required delayed setups.
+  This event contains a payload of type uhid_start_req. The "dev_flags" field
+  describes special behaviors of a device. The following flags are defined:
+
+      - UHID_DEV_NUMBERED_FEATURE_REPORTS
+      - UHID_DEV_NUMBERED_OUTPUT_REPORTS
+      - UHID_DEV_NUMBERED_INPUT_REPORTS
+
+          Each of these flags defines whether a given report-type uses numbered
+          reports. If numbered reports are used for a type, all messages from
+          the kernel already have the report-number as prefix. Otherwise, no
+          prefix is added by the kernel.
+          For messages sent by user-space to the kernel, you must adjust the
+          prefixes according to these flags.
+
+UHID_STOP:
+  This is sent when the HID device is stopped. Consider this as an answer to
+  UHID_DESTROY.
+
+  If you didn't destroy your device via UHID_DESTROY, but the kernel sends an
+  UHID_STOP event, this should usually be ignored. It means that the kernel
+  reloaded/changed the device driver loaded on your HID device (or some other
+  maintenance actions happened).
+
+  You can usually ignored any UHID_STOP events safely.
+
+UHID_OPEN:
+  This is sent when the HID device is opened. That is, the data that the HID
+  device provides is read by some other process. You may ignore this event but
+  it is useful for power-management. As long as you haven't received this event
+  there is actually no other process that reads your data so there is no need to
+  send UHID_INPUT2 events to the kernel.
+
+UHID_CLOSE:
+  This is sent when there are no more processes which read the HID data. It is
+  the counterpart of UHID_OPEN and you may as well ignore this event.
+
+UHID_OUTPUT:
+  This is sent if the HID device driver wants to send raw data to the I/O
+  device on the interrupt channel. You should read the payload and forward it to
+  the device. The payload is of type "struct uhid_output_req".
+  This may be received even though you haven't received UHID_OPEN, yet.
+
+UHID_GET_REPORT:
+  This event is sent if the kernel driver wants to perform a GET_REPORT request
+  on the control channeld as described in the HID specs. The report-type and
+  report-number are available in the payload.
+  The kernel serializes GET_REPORT requests so there will never be two in
+  parallel. However, if you fail to respond with a UHID_GET_REPORT_REPLY, the
+  request might silently time out.
+  Once you read a GET_REPORT request, you shall forward it to the hid device and
+  remember the "id" field in the payload. Once your hid device responds to the
+  GET_REPORT (or if it fails), you must send a UHID_GET_REPORT_REPLY to the
+  kernel with the exact same "id" as in the request. If the request already
+  timed out, the kernel will ignore the response silently. The "id" field is
+  never re-used, so conflicts cannot happen.
+
+UHID_SET_REPORT:
+  This is the SET_REPORT equivalent of UHID_GET_REPORT. On receipt, you shall
+  send a SET_REPORT request to your hid device. Once it replies, you must tell
+  the kernel about it via UHID_SET_REPORT_REPLY.
+  The same restrictions as for UHID_GET_REPORT apply.
+
+----------------------------------------------------
+
+Written 2012, David Herrmann <dh.herrmann@gmail.com>
diff --git a/Documentation/hid/uhid.txt b/Documentation/hid/uhid.txt
deleted file mode 100644
index 958fff945304..000000000000
--- a/Documentation/hid/uhid.txt
+++ /dev/null
@@ -1,187 +0,0 @@
-      UHID - User-space I/O driver support for HID subsystem
-     ========================================================
-
-UHID allows user-space to implement HID transport drivers. Please see
-hid-transport.txt for an introduction into HID transport drivers. This document
-relies heavily on the definitions declared there.
-
-With UHID, a user-space transport driver can create kernel hid-devices for each
-device connected to the user-space controlled bus. The UHID API defines the I/O
-events provided from the kernel to user-space and vice versa.
-
-There is an example user-space application in ./samples/uhid/uhid-example.c
-
-The UHID API
-------------
-
-UHID is accessed through a character misc-device. The minor-number is allocated
-dynamically so you need to rely on udev (or similar) to create the device node.
-This is /dev/uhid by default.
-
-If a new device is detected by your HID I/O Driver and you want to register this
-device with the HID subsystem, then you need to open /dev/uhid once for each
-device you want to register. All further communication is done by read()'ing or
-write()'ing "struct uhid_event" objects. Non-blocking operations are supported
-by setting O_NONBLOCK.
-
-struct uhid_event {
-        __u32 type;
-        union {
-                struct uhid_create2_req create2;
-                struct uhid_output_req output;
-                struct uhid_input2_req input2;
-                ...
-        } u;
-};
-
-The "type" field contains the ID of the event. Depending on the ID different
-payloads are sent. You must not split a single event across multiple read()'s or
-multiple write()'s. A single event must always be sent as a whole. Furthermore,
-only a single event can be sent per read() or write(). Pending data is ignored.
-If you want to handle multiple events in a single syscall, then use vectored
-I/O with readv()/writev().
-The "type" field defines the payload. For each type, there is a
-payload-structure available in the union "u" (except for empty payloads). This
-payload contains management and/or device data.
-
-The first thing you should do is sending an UHID_CREATE2 event. This will
-register the device. UHID will respond with an UHID_START event. You can now
-start sending data to and reading data from UHID. However, unless UHID sends the
-UHID_OPEN event, the internally attached HID Device Driver has no user attached.
-That is, you might put your device asleep unless you receive the UHID_OPEN
-event. If you receive the UHID_OPEN event, you should start I/O. If the last
-user closes the HID device, you will receive an UHID_CLOSE event. This may be
-followed by an UHID_OPEN event again and so on. There is no need to perform
-reference-counting in user-space. That is, you will never receive multiple
-UHID_OPEN events without an UHID_CLOSE event. The HID subsystem performs
-ref-counting for you.
-You may decide to ignore UHID_OPEN/UHID_CLOSE, though. I/O is allowed even
-though the device may have no users.
-
-If you want to send data on the interrupt channel to the HID subsystem, you send
-an HID_INPUT2 event with your raw data payload. If the kernel wants to send data
-on the interrupt channel to the device, you will read an UHID_OUTPUT event.
-Data requests on the control channel are currently limited to GET_REPORT and
-SET_REPORT (no other data reports on the control channel are defined so far).
-Those requests are always synchronous. That means, the kernel sends
-UHID_GET_REPORT and UHID_SET_REPORT events and requires you to forward them to
-the device on the control channel. Once the device responds, you must forward
-the response via UHID_GET_REPORT_REPLY and UHID_SET_REPORT_REPLY to the kernel.
-The kernel blocks internal driver-execution during such round-trips (times out
-after a hard-coded period).
-
-If your device disconnects, you should send an UHID_DESTROY event. This will
-unregister the device. You can now send UHID_CREATE2 again to register a new
-device.
-If you close() the fd, the device is automatically unregistered and destroyed
-internally.
-
-write()
--------
-write() allows you to modify the state of the device and feed input data into
-the kernel. The kernel will parse the event immediately and if the event ID is
-not supported, it will return -EOPNOTSUPP. If the payload is invalid, then
--EINVAL is returned, otherwise, the amount of data that was read is returned and
-the request was handled successfully. O_NONBLOCK does not affect write() as
-writes are always handled immediately in a non-blocking fashion. Future requests
-might make use of O_NONBLOCK, though.
-
-  UHID_CREATE2:
-  This creates the internal HID device. No I/O is possible until you send this
-  event to the kernel. The payload is of type struct uhid_create2_req and
-  contains information about your device. You can start I/O now.
-
-  UHID_DESTROY:
-  This destroys the internal HID device. No further I/O will be accepted. There
-  may still be pending messages that you can receive with read() but no further
-  UHID_INPUT events can be sent to the kernel.
-  You can create a new device by sending UHID_CREATE2 again. There is no need to
-  reopen the character device.
-
-  UHID_INPUT2:
-  You must send UHID_CREATE2 before sending input to the kernel! This event
-  contains a data-payload. This is the raw data that you read from your device
-  on the interrupt channel. The kernel will parse the HID reports.
-
-  UHID_GET_REPORT_REPLY:
-  If you receive a UHID_GET_REPORT request you must answer with this request.
-  You  must copy the "id" field from the request into the answer. Set the "err"
-  field to 0 if no error occurred or to EIO if an I/O error occurred.
-  If "err" is 0 then you should fill the buffer of the answer with the results
-  of the GET_REPORT request and set "size" correspondingly.
-
-  UHID_SET_REPORT_REPLY:
-  This is the SET_REPORT equivalent of UHID_GET_REPORT_REPLY. Unlike GET_REPORT,
-  SET_REPORT never returns a data buffer, therefore, it's sufficient to set the
-  "id" and "err" fields correctly.
-
-read()
-------
-read() will return a queued output report. No reaction is required to any of
-them but you should handle them according to your needs.
-
-  UHID_START:
-  This is sent when the HID device is started. Consider this as an answer to
-  UHID_CREATE2. This is always the first event that is sent. Note that this
-  event might not be available immediately after write(UHID_CREATE2) returns.
-  Device drivers might required delayed setups.
-  This event contains a payload of type uhid_start_req. The "dev_flags" field
-  describes special behaviors of a device. The following flags are defined:
-      UHID_DEV_NUMBERED_FEATURE_REPORTS:
-      UHID_DEV_NUMBERED_OUTPUT_REPORTS:
-      UHID_DEV_NUMBERED_INPUT_REPORTS:
-          Each of these flags defines whether a given report-type uses numbered
-          reports. If numbered reports are used for a type, all messages from
-          the kernel already have the report-number as prefix. Otherwise, no
-          prefix is added by the kernel.
-          For messages sent by user-space to the kernel, you must adjust the
-          prefixes according to these flags.
-
-  UHID_STOP:
-  This is sent when the HID device is stopped. Consider this as an answer to
-  UHID_DESTROY.
-  If you didn't destroy your device via UHID_DESTROY, but the kernel sends an
-  UHID_STOP event, this should usually be ignored. It means that the kernel
-  reloaded/changed the device driver loaded on your HID device (or some other
-  maintenance actions happened).
-  You can usually ignored any UHID_STOP events safely.
-
-  UHID_OPEN:
-  This is sent when the HID device is opened. That is, the data that the HID
-  device provides is read by some other process. You may ignore this event but
-  it is useful for power-management. As long as you haven't received this event
-  there is actually no other process that reads your data so there is no need to
-  send UHID_INPUT2 events to the kernel.
-
-  UHID_CLOSE:
-  This is sent when there are no more processes which read the HID data. It is
-  the counterpart of UHID_OPEN and you may as well ignore this event.
-
-  UHID_OUTPUT:
-  This is sent if the HID device driver wants to send raw data to the I/O
-  device on the interrupt channel. You should read the payload and forward it to
-  the device. The payload is of type "struct uhid_output_req".
-  This may be received even though you haven't received UHID_OPEN, yet.
-
-  UHID_GET_REPORT:
-  This event is sent if the kernel driver wants to perform a GET_REPORT request
-  on the control channeld as described in the HID specs. The report-type and
-  report-number are available in the payload.
-  The kernel serializes GET_REPORT requests so there will never be two in
-  parallel. However, if you fail to respond with a UHID_GET_REPORT_REPLY, the
-  request might silently time out.
-  Once you read a GET_REPORT request, you shall forward it to the hid device and
-  remember the "id" field in the payload. Once your hid device responds to the
-  GET_REPORT (or if it fails), you must send a UHID_GET_REPORT_REPLY to the
-  kernel with the exact same "id" as in the request. If the request already
-  timed out, the kernel will ignore the response silently. The "id" field is
-  never re-used, so conflicts cannot happen.
-
-  UHID_SET_REPORT:
-  This is the SET_REPORT equivalent of UHID_GET_REPORT. On receipt, you shall
-  send a SET_REPORT request to your hid device. Once it replies, you must tell
-  the kernel about it via UHID_SET_REPORT_REPLY.
-  The same restrictions as for UHID_GET_REPORT apply.
-
-----------------------------------------------------
-Written 2012, David Herrmann <dh.herrmann@gmail.com>
diff --git a/Documentation/hwmon/k8temp.rst b/Documentation/hwmon/k8temp.rst
index 72da12aa17e5..fe9109521056 100644
--- a/Documentation/hwmon/k8temp.rst
+++ b/Documentation/hwmon/k8temp.rst
@@ -9,7 +9,7 @@ Supported chips:
 
     Addresses scanned: PCI space
 
-    Datasheet: http://support.amd.com/us/Processor_TechDocs/32559.pdf
+    Datasheet: http://www.amd.com/system/files/TechDocs/32559.pdf
 
 Author: Rudolf Marek
 
diff --git a/Documentation/hwmon/pxe1610 b/Documentation/hwmon/pxe1610
new file mode 100644
index 000000000000..211cedeefb44
--- /dev/null
+++ b/Documentation/hwmon/pxe1610
@@ -0,0 +1,90 @@
+Kernel driver pxe1610
+=====================
+
+Supported chips:
+  * Infineon PXE1610
+    Prefix: 'pxe1610'
+    Addresses scanned: -
+    Datasheet: Datasheet is not publicly available.
+
+  * Infineon PXE1110
+    Prefix: 'pxe1110'
+    Addresses scanned: -
+    Datasheet: Datasheet is not publicly available.
+
+  * Infineon PXM1310
+    Prefix: 'pxm1310'
+    Addresses scanned: -
+    Datasheet: Datasheet is not publicly available.
+
+Author: Vijay Khemka <vijaykhemka@fb.com>
+
+
+Description
+-----------
+
+PXE1610/PXE1110 are Multi-rail/Multiphase Digital Controllers
+and compliant to
+	-- Intel VR13 DC-DC converter specifications.
+	-- Intel SVID protocol.
+Used for Vcore power regulation for Intel VR13 based microprocessors
+	-- Servers, Workstations, and High-end desktops
+
+PXM1310 is a Multi-rail Controller and it is compliant to
+	-- Intel VR13 DC-DC converter specifications.
+	-- Intel SVID protocol.
+Used for DDR3/DDR4 Memory power regulation for Intel VR13 and
+IMVP8 based systems
+
+
+Usage Notes
+-----------
+
+This driver does not probe for PMBus devices. You will have
+to instantiate devices explicitly.
+
+Example: the following commands will load the driver for an PXE1610
+at address 0x70 on I2C bus #4:
+
+# modprobe pxe1610
+# echo pxe1610 0x70 > /sys/bus/i2c/devices/i2c-4/new_device
+
+It can also be instantiated by declaring in device tree
+
+
+Sysfs attributes
+----------------
+
+curr1_label		"iin"
+curr1_input		Measured input current
+curr1_alarm		Current high alarm
+
+curr[2-4]_label		"iout[1-3]"
+curr[2-4]_input		Measured output current
+curr[2-4]_crit		Critical maximum current
+curr[2-4]_crit_alarm	Current critical high alarm
+
+in1_label		"vin"
+in1_input		Measured input voltage
+in1_crit		Critical maximum input voltage
+in1_crit_alarm		Input voltage critical high alarm
+
+in[2-4]_label		"vout[1-3]"
+in[2-4]_input		Measured output voltage
+in[2-4]_lcrit		Critical minimum output voltage
+in[2-4]_lcrit_alarm	Output voltage critical low alarm
+in[2-4]_crit		Critical maximum output voltage
+in[2-4]_crit_alarm	Output voltage critical high alarm
+
+power1_label		"pin"
+power1_input		Measured input power
+power1_alarm		Input power high alarm
+
+power[2-4]_label	"pout[1-3]"
+power[2-4]_input	Measured output power
+
+temp[1-3]_input		Measured temperature
+temp[1-3]_crit		Critical high temperature
+temp[1-3]_crit_alarm	Chip temperature critical high alarm
+temp[1-3]_max		Maximum temperature
+temp[1-3]_max_alarm	Chip temperature high alarm
diff --git a/Documentation/hwmon/submitting-patches.rst b/Documentation/hwmon/submitting-patches.rst
index f9796b9d9db6..452fc28d8e0b 100644
--- a/Documentation/hwmon/submitting-patches.rst
+++ b/Documentation/hwmon/submitting-patches.rst
@@ -89,7 +89,7 @@ increase the chances of your change being accepted.
   console. Excessive logging can seriously affect system performance.
 
 * Use devres functions whenever possible to allocate resources. For rationale
-  and supported functions, please see Documentation/driver-model/devres.txt.
+  and supported functions, please see Documentation/driver-api/driver-model/devres.rst.
   If a function is not supported by devres, consider using devm_add_action().
 
 * If the driver has a detect function, make sure it is silent. Debug messages
diff --git a/Documentation/hwspinlock.txt b/Documentation/hwspinlock.txt
index ed640a278185..6f03713b7003 100644
--- a/Documentation/hwspinlock.txt
+++ b/Documentation/hwspinlock.txt
@@ -136,6 +136,39 @@ The function will never sleep.
 
 ::
 
+  int hwspin_lock_timeout_raw(struct hwspinlock *hwlock, unsigned int timeout);
+
+Lock a previously-assigned hwspinlock with a timeout limit (specified in
+msecs). If the hwspinlock is already taken, the function will busy loop
+waiting for it to be released, but give up when the timeout elapses.
+
+Caution: User must protect the routine of getting hardware lock with mutex
+or spinlock to avoid dead-lock, that will let user can do some time-consuming
+or sleepable operations under the hardware lock.
+
+Returns 0 when successful and an appropriate error code otherwise (most
+notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
+
+The function will never sleep.
+
+::
+
+  int hwspin_lock_timeout_in_atomic(struct hwspinlock *hwlock, unsigned int to);
+
+Lock a previously-assigned hwspinlock with a timeout limit (specified in
+msecs). If the hwspinlock is already taken, the function will busy loop
+waiting for it to be released, but give up when the timeout elapses.
+
+This function shall be called only from an atomic context and the timeout
+value shall not exceed a few msecs.
+
+Returns 0 when successful and an appropriate error code otherwise (most
+notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
+
+The function will never sleep.
+
+::
+
   int hwspin_trylock(struct hwspinlock *hwlock);
 
 
@@ -186,6 +219,34 @@ The function will never sleep.
 
 ::
 
+  int hwspin_trylock_raw(struct hwspinlock *hwlock);
+
+Attempt to lock a previously-assigned hwspinlock, but immediately fail if
+it is already taken.
+
+Caution: User must protect the routine of getting hardware lock with mutex
+or spinlock to avoid dead-lock, that will let user can do some time-consuming
+or sleepable operations under the hardware lock.
+
+Returns 0 on success and an appropriate error code otherwise (most
+notably -EBUSY if the hwspinlock was already taken).
+The function will never sleep.
+
+::
+
+  int hwspin_trylock_in_atomic(struct hwspinlock *hwlock);
+
+Attempt to lock a previously-assigned hwspinlock, but immediately fail if
+it is already taken.
+
+This function shall be called only from an atomic context.
+
+Returns 0 on success and an appropriate error code otherwise (most
+notably -EBUSY if the hwspinlock was already taken).
+The function will never sleep.
+
+::
+
   void hwspin_unlock(struct hwspinlock *hwlock);
 
 Unlock a previously-locked hwspinlock. Always succeed, and can be called
@@ -222,6 +283,26 @@ the given flags. This function will never sleep.
 
 ::
 
+  void hwspin_unlock_raw(struct hwspinlock *hwlock);
+
+Unlock a previously-locked hwspinlock.
+
+The caller should **never** unlock an hwspinlock which is already unlocked.
+Doing so is considered a bug (there is no protection against this).
+This function will never sleep.
+
+::
+
+  void hwspin_unlock_in_atomic(struct hwspinlock *hwlock);
+
+Unlock a previously-locked hwspinlock.
+
+The caller should **never** unlock an hwspinlock which is already unlocked.
+Doing so is considered a bug (there is no protection against this).
+This function will never sleep.
+
+::
+
   int hwspin_lock_get_id(struct hwspinlock *hwlock);
 
 Retrieve id number of a given hwspinlock. This is needed when an
diff --git a/Documentation/i2c/busses/i2c-i801 b/Documentation/i2c/busses/i2c-i801
index ee9984f35868..f426c13c63a9 100644
--- a/Documentation/i2c/busses/i2c-i801
+++ b/Documentation/i2c/busses/i2c-i801
@@ -37,6 +37,8 @@ Supported adapters:
   * Intel Cedar Fork (PCH)
   * Intel Ice Lake (PCH)
   * Intel Comet Lake (PCH)
+  * Intel Elkhart Lake (PCH)
+  * Intel Tiger Lake (PCH)
    Datasheets: Publicly available at the Intel website
 
 On Intel Patsburg and later chipsets, both the normal host SMBus controller
@@ -58,6 +60,7 @@ question doesn't work as intended for whatever reason. Bit values:
  0x02  disable the block buffer
  0x08  disable the I2C block read functionality
  0x10  don't use interrupts
+ 0x20  disable SMBus Host Notify
 
 
 Description
@@ -88,7 +91,7 @@ SMBus controller.
 Process Call Support
 --------------------
 
-Not supported.
+Block process call is supported on the 82801EB (ICH5) and later chips.
 
 
 I2C Block Read Support
@@ -118,16 +121,15 @@ BIOS to enable it, it means it has been hidden by the BIOS code. Asus is
 well known for first doing this on their P4B motherboard, and many other
 boards after that. Some vendor machines are affected as well.
 
-The first thing to try is the "i2c_ec" ACPI driver. It could be that the
+The first thing to try is the "i2c-scmi" ACPI driver. It could be that the
 SMBus was hidden on purpose because it'll be driven by ACPI. If the
-i2c_ec driver works for you, just forget about the i2c-i801 driver and
-don't try to unhide the ICH SMBus. Even if i2c_ec doesn't work, you
+i2c-scmi driver works for you, just forget about the i2c-i801 driver and
+don't try to unhide the ICH SMBus. Even if i2c-scmi doesn't work, you
 better make sure that the SMBus isn't used by the ACPI code. Try loading
-the "fan" and "thermal" drivers, and check in /proc/acpi/fan and
-/proc/acpi/thermal_zone. If you find anything there, it's likely that
-the ACPI is accessing the SMBus and it's safer not to unhide it. Only
-once you are certain that ACPI isn't using the SMBus, you can attempt
-to unhide it.
+the "fan" and "thermal" drivers, and check in /sys/class/thermal. If you
+find a thermal zone with type "acpitz", it's likely that the ACPI is
+accessing the SMBus and it's safer not to unhide it. Only once you are
+certain that ACPI isn't using the SMBus, you can attempt to unhide it.
 
 In order to unhide the SMBus, we need to change the value of a PCI
 register before the kernel enumerates the PCI devices. This is done in
diff --git a/Documentation/i2c/instantiating-devices b/Documentation/i2c/instantiating-devices
index 0d85ac1935b7..345e9ea8281a 100644
--- a/Documentation/i2c/instantiating-devices
+++ b/Documentation/i2c/instantiating-devices
@@ -85,7 +85,7 @@ Method 1c: Declare the I2C devices via ACPI
 -------------------------------------------
 
 ACPI can also describe I2C devices. There is special documentation for this
-which is currently located at Documentation/acpi/enumeration.txt.
+which is currently located at Documentation/firmware-guide/acpi/enumeration.rst.
 
 
 Method 2: Instantiate the devices explicitly
@@ -137,7 +137,7 @@ static int usb_hcd_nxp_probe(struct platform_device *pdev)
 	(...)
 	i2c_adap = i2c_get_adapter(2);
 	memset(&i2c_info, 0, sizeof(struct i2c_board_info));
-	strlcpy(i2c_info.type, "isp1301_nxp", I2C_NAME_SIZE);
+	strscpy(i2c_info.type, "isp1301_nxp", sizeof(i2c_info.type));
 	isp1301_i2c_client = i2c_new_probed_device(i2c_adap, &i2c_info,
 						   normal_i2c, NULL);
 	i2c_put_adapter(i2c_adap);
diff --git a/Documentation/i2c/upgrading-clients b/Documentation/i2c/upgrading-clients
index ccba3ffd6e80..96392cc5b5c7 100644
--- a/Documentation/i2c/upgrading-clients
+++ b/Documentation/i2c/upgrading-clients
@@ -43,7 +43,7 @@ static int example_attach(struct i2c_adapter *adap, int addr, int kind)
 	example->client.adapter = adap;
 
 	i2c_set_clientdata(&state->i2c_client, state);
-	strlcpy(client->i2c_client.name, "example", I2C_NAME_SIZE);
+	strscpy(client->i2c_client.name, "example", sizeof(client->i2c_client.name));
 
 	ret = i2c_attach_client(&state->i2c_client);
 	if (ret < 0) {
@@ -138,7 +138,7 @@ can be removed:
 -	example->client.flags   = 0;
 -	example->client.adapter = adap;
 -
--	strlcpy(client->i2c_client.name, "example", I2C_NAME_SIZE);
+-	strscpy(client->i2c_client.name, "example", sizeof(client->i2c_client.name));
 
 The i2c_set_clientdata is now:
 
diff --git a/Documentation/ia64/IRQ-redir.txt b/Documentation/ia64/IRQ-redir.txt
deleted file mode 100644
index f7bd72261283..000000000000
--- a/Documentation/ia64/IRQ-redir.txt
+++ /dev/null
@@ -1,69 +0,0 @@
-IRQ affinity on IA64 platforms
-------------------------------
-                           07.01.2002, Erich Focht <efocht@ess.nec.de>
-
-
-By writing to /proc/irq/IRQ#/smp_affinity the interrupt routing can be
-controlled. The behavior on IA64 platforms is slightly different from
-that described in Documentation/IRQ-affinity.txt for i386 systems.
-
-Because of the usage of SAPIC mode and physical destination mode the
-IRQ target is one particular CPU and cannot be a mask of several
-CPUs. Only the first non-zero bit is taken into account.
-
-
-Usage examples:
-
-The target CPU has to be specified as a hexadecimal CPU mask. The
-first non-zero bit is the selected CPU. This format has been kept for
-compatibility reasons with i386.
-
-Set the delivery mode of interrupt 41 to fixed and route the
-interrupts to CPU #3 (logical CPU number) (2^3=0x08):
-     echo "8" >/proc/irq/41/smp_affinity
-
-Set the default route for IRQ number 41 to CPU 6 in lowest priority
-delivery mode (redirectable):
-     echo "r 40" >/proc/irq/41/smp_affinity
-
-The output of the command
-     cat /proc/irq/IRQ#/smp_affinity
-gives the target CPU mask for the specified interrupt vector. If the CPU
-mask is preceded by the character "r", the interrupt is redirectable
-(i.e. lowest priority mode routing is used), otherwise its route is
-fixed.
-
-
-
-Initialization and default behavior:
-
-If the platform features IRQ redirection (info provided by SAL) all
-IO-SAPIC interrupts are initialized with CPU#0 as their default target
-and the routing is the so called "lowest priority mode" (actually
-fixed SAPIC mode with hint). The XTP chipset registers are used as hints
-for the IRQ routing. Currently in Linux XTP registers can have three
-values:
-	- minimal for an idle task,
-	- normal if any other task runs,
-	- maximal if the CPU is going to be switched off.
-The IRQ is routed to the CPU with lowest XTP register value, the
-search begins at the default CPU. Therefore most of the interrupts
-will be handled by CPU #0.
-
-If the platform doesn't feature interrupt redirection IOSAPIC fixed
-routing is used. The target CPUs are distributed in a round robin
-manner. IRQs will be routed only to the selected target CPUs. Check
-with
-        cat /proc/interrupts
-
-
-
-Comments:
-
-On large (multi-node) systems it is recommended to route the IRQs to
-the node to which the corresponding device is connected.
-For systems like the NEC AzusA we get IRQ node-affinity for free. This
-is because usually the chipsets on each node redirect the interrupts
-only to their own CPUs (as they cannot see the XTP registers on the
-other nodes).
-
diff --git a/Documentation/ia64/README b/Documentation/ia64/README
deleted file mode 100644
index aa17f2154cba..000000000000
--- a/Documentation/ia64/README
+++ /dev/null
@@ -1,43 +0,0 @@
-        Linux kernel release 2.4.xx for the IA-64 Platform
-
-   These are the release notes for Linux version 2.4 for IA-64
-   platform.  This document provides information specific to IA-64
-   ONLY, to get additional information about the Linux kernel also
-   read the original Linux README provided with the kernel.
-
-INSTALLING the kernel:
-
- - IA-64 kernel installation is the same as the other platforms, see
-   original README for details.
-
-
-SOFTWARE REQUIREMENTS
-
-   Compiling and running this kernel requires an IA-64 compliant GCC
-   compiler.  And various software packages also compiled with an
-   IA-64 compliant GCC compiler.
-
-
-CONFIGURING the kernel:
-
-   Configuration is the same, see original README for details.
-
-
-COMPILING the kernel:
-
- - Compiling this kernel doesn't differ from other platform so read
-   the original README for details BUT make sure you have an IA-64
-   compliant GCC compiler.
-
-IA-64 SPECIFICS
-
- - General issues:
-
-    o Hardly any performance tuning has been done. Obvious targets
-      include the library routines (IP checksum, etc.). Less
-      obvious targets include making sure we don't flush the TLB
-      needlessly, etc.
-
-    o SMP locks cleanup/optimization
-
-    o IA32 support.  Currently experimental.  It mostly works.
diff --git a/Documentation/ia64/aliasing.rst b/Documentation/ia64/aliasing.rst
new file mode 100644
index 000000000000..a08b36aba015
--- /dev/null
+++ b/Documentation/ia64/aliasing.rst
@@ -0,0 +1,246 @@
+==================================
+Memory Attribute Aliasing on IA-64
+==================================
+
+Bjorn Helgaas <bjorn.helgaas@hp.com>
+
+May 4, 2006
+
+
+Memory Attributes
+=================
+
+    Itanium supports several attributes for virtual memory references.
+    The attribute is part of the virtual translation, i.e., it is
+    contained in the TLB entry.  The ones of most interest to the Linux
+    kernel are:
+
+	==		======================
+        WB		Write-back (cacheable)
+	UC		Uncacheable
+	WC		Write-coalescing
+	==		======================
+
+    System memory typically uses the WB attribute.  The UC attribute is
+    used for memory-mapped I/O devices.  The WC attribute is uncacheable
+    like UC is, but writes may be delayed and combined to increase
+    performance for things like frame buffers.
+
+    The Itanium architecture requires that we avoid accessing the same
+    page with both a cacheable mapping and an uncacheable mapping[1].
+
+    The design of the chipset determines which attributes are supported
+    on which regions of the address space.  For example, some chipsets
+    support either WB or UC access to main memory, while others support
+    only WB access.
+
+Memory Map
+==========
+
+    Platform firmware describes the physical memory map and the
+    supported attributes for each region.  At boot-time, the kernel uses
+    the EFI GetMemoryMap() interface.  ACPI can also describe memory
+    devices and the attributes they support, but Linux/ia64 currently
+    doesn't use this information.
+
+    The kernel uses the efi_memmap table returned from GetMemoryMap() to
+    learn the attributes supported by each region of physical address
+    space.  Unfortunately, this table does not completely describe the
+    address space because some machines omit some or all of the MMIO
+    regions from the map.
+
+    The kernel maintains another table, kern_memmap, which describes the
+    memory Linux is actually using and the attribute for each region.
+    This contains only system memory; it does not contain MMIO space.
+
+    The kern_memmap table typically contains only a subset of the system
+    memory described by the efi_memmap.  Linux/ia64 can't use all memory
+    in the system because of constraints imposed by the identity mapping
+    scheme.
+
+    The efi_memmap table is preserved unmodified because the original
+    boot-time information is required for kexec.
+
+Kernel Identify Mappings
+========================
+
+    Linux/ia64 identity mappings are done with large pages, currently
+    either 16MB or 64MB, referred to as "granules."  Cacheable mappings
+    are speculative[2], so the processor can read any location in the
+    page at any time, independent of the programmer's intentions.  This
+    means that to avoid attribute aliasing, Linux can create a cacheable
+    identity mapping only when the entire granule supports cacheable
+    access.
+
+    Therefore, kern_memmap contains only full granule-sized regions that
+    can referenced safely by an identity mapping.
+
+    Uncacheable mappings are not speculative, so the processor will
+    generate UC accesses only to locations explicitly referenced by
+    software.  This allows UC identity mappings to cover granules that
+    are only partially populated, or populated with a combination of UC
+    and WB regions.
+
+User Mappings
+=============
+
+    User mappings are typically done with 16K or 64K pages.  The smaller
+    page size allows more flexibility because only 16K or 64K has to be
+    homogeneous with respect to memory attributes.
+
+Potential Attribute Aliasing Cases
+==================================
+
+    There are several ways the kernel creates new mappings:
+
+mmap of /dev/mem
+----------------
+
+	This uses remap_pfn_range(), which creates user mappings.  These
+	mappings may be either WB or UC.  If the region being mapped
+	happens to be in kern_memmap, meaning that it may also be mapped
+	by a kernel identity mapping, the user mapping must use the same
+	attribute as the kernel mapping.
+
+	If the region is not in kern_memmap, the user mapping should use
+	an attribute reported as being supported in the EFI memory map.
+
+	Since the EFI memory map does not describe MMIO on some
+	machines, this should use an uncacheable mapping as a fallback.
+
+mmap of /sys/class/pci_bus/.../legacy_mem
+-----------------------------------------
+
+	This is very similar to mmap of /dev/mem, except that legacy_mem
+	only allows mmap of the one megabyte "legacy MMIO" area for a
+	specific PCI bus.  Typically this is the first megabyte of
+	physical address space, but it may be different on machines with
+	several VGA devices.
+
+	"X" uses this to access VGA frame buffers.  Using legacy_mem
+	rather than /dev/mem allows multiple instances of X to talk to
+	different VGA cards.
+
+	The /dev/mem mmap constraints apply.
+
+mmap of /proc/bus/pci/.../??.?
+------------------------------
+
+	This is an MMIO mmap of PCI functions, which additionally may or
+	may not be requested as using the WC attribute.
+
+	If WC is requested, and the region in kern_memmap is either WC
+	or UC, and the EFI memory map designates the region as WC, then
+	the WC mapping is allowed.
+
+	Otherwise, the user mapping must use the same attribute as the
+	kernel mapping.
+
+read/write of /dev/mem
+----------------------
+
+	This uses copy_from_user(), which implicitly uses a kernel
+	identity mapping.  This is obviously safe for things in
+	kern_memmap.
+
+	There may be corner cases of things that are not in kern_memmap,
+	but could be accessed this way.  For example, registers in MMIO
+	space are not in kern_memmap, but could be accessed with a UC
+	mapping.  This would not cause attribute aliasing.  But
+	registers typically can be accessed only with four-byte or
+	eight-byte accesses, and the copy_from_user() path doesn't allow
+	any control over the access size, so this would be dangerous.
+
+ioremap()
+---------
+
+	This returns a mapping for use inside the kernel.
+
+	If the region is in kern_memmap, we should use the attribute
+	specified there.
+
+	If the EFI memory map reports that the entire granule supports
+	WB, we should use that (granules that are partially reserved
+	or occupied by firmware do not appear in kern_memmap).
+
+	If the granule contains non-WB memory, but we can cover the
+	region safely with kernel page table mappings, we can use
+	ioremap_page_range() as most other architectures do.
+
+	Failing all of the above, we have to fall back to a UC mapping.
+
+Past Problem Cases
+==================
+
+mmap of various MMIO regions from /dev/mem by "X" on Intel platforms
+--------------------------------------------------------------------
+
+      The EFI memory map may not report these MMIO regions.
+
+      These must be allowed so that X will work.  This means that
+      when the EFI memory map is incomplete, every /dev/mem mmap must
+      succeed.  It may create either WB or UC user mappings, depending
+      on whether the region is in kern_memmap or the EFI memory map.
+
+mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled
+----------------------------------------------------------------------
+
+      The EFI memory map reports the following attributes:
+
+        =============== ======= ==================
+        0x00000-0x9FFFF WB only
+        0xA0000-0xBFFFF UC only (VGA frame buffer)
+        0xC0000-0xFFFFF WB only
+        =============== ======= ==================
+
+      This mmap is done with user pages, not kernel identity mappings,
+      so it is safe to use WB mappings.
+
+      The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000,
+      which uses a granule-sized UC mapping.  This granule will cover some
+      WB-only memory, but since UC is non-speculative, the processor will
+      never generate an uncacheable reference to the WB-only areas unless
+      the driver explicitly touches them.
+
+mmap of 0x0-0xFFFFF legacy_mem by "X"
+-------------------------------------
+
+      If the EFI memory map reports that the entire range supports the
+      same attributes, we can allow the mmap (and we will prefer WB if
+      supported, as is the case with HP sx[12]000 machines with VGA
+      disabled).
+
+      If EFI reports the range as partly WB and partly UC (as on sx[12]000
+      machines with VGA enabled), we must fail the mmap because there's no
+      safe attribute to use.
+
+      If EFI reports some of the range but not all (as on Intel firmware
+      that doesn't report the VGA frame buffer at all), we should fail the
+      mmap and force the user to map just the specific region of interest.
+
+mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled
+------------------------------------------------------------------------
+
+      The EFI memory map reports the following attributes::
+
+        0x00000-0xFFFFF WB only (no VGA MMIO hole)
+
+      This is a special case of the previous case, and the mmap should
+      fail for the same reason as above.
+
+read of /sys/devices/.../rom
+----------------------------
+
+      For VGA devices, this may cause an ioremap() of 0xC0000.  This
+      used to be done with a UC mapping, because the VGA frame buffer
+      at 0xA0000 prevents use of a WB granule.  The UC mapping causes
+      an MCA on HP sx[12]000 chipsets.
+
+      We should use WB page table mappings to avoid covering the VGA
+      frame buffer.
+
+Notes
+=====
+
+    [1] SDM rev 2.2, vol 2, sec 4.4.1.
+    [2] SDM rev 2.2, vol 2, sec 4.4.6.
diff --git a/Documentation/ia64/aliasing.txt b/Documentation/ia64/aliasing.txt
deleted file mode 100644
index 5a4dea6abebd..000000000000
--- a/Documentation/ia64/aliasing.txt
+++ /dev/null
@@ -1,221 +0,0 @@
-	         MEMORY ATTRIBUTE ALIASING ON IA-64
-
-			   Bjorn Helgaas
-		       <bjorn.helgaas@hp.com>
-			    May 4, 2006
-
-
-MEMORY ATTRIBUTES
-
-    Itanium supports several attributes for virtual memory references.
-    The attribute is part of the virtual translation, i.e., it is
-    contained in the TLB entry.  The ones of most interest to the Linux
-    kernel are:
-
-	WB		Write-back (cacheable)
-	UC		Uncacheable
-	WC		Write-coalescing
-
-    System memory typically uses the WB attribute.  The UC attribute is
-    used for memory-mapped I/O devices.  The WC attribute is uncacheable
-    like UC is, but writes may be delayed and combined to increase
-    performance for things like frame buffers.
-
-    The Itanium architecture requires that we avoid accessing the same
-    page with both a cacheable mapping and an uncacheable mapping[1].
-
-    The design of the chipset determines which attributes are supported
-    on which regions of the address space.  For example, some chipsets
-    support either WB or UC access to main memory, while others support
-    only WB access.
-
-MEMORY MAP
-
-    Platform firmware describes the physical memory map and the
-    supported attributes for each region.  At boot-time, the kernel uses
-    the EFI GetMemoryMap() interface.  ACPI can also describe memory
-    devices and the attributes they support, but Linux/ia64 currently
-    doesn't use this information.
-
-    The kernel uses the efi_memmap table returned from GetMemoryMap() to
-    learn the attributes supported by each region of physical address
-    space.  Unfortunately, this table does not completely describe the
-    address space because some machines omit some or all of the MMIO
-    regions from the map.
-
-    The kernel maintains another table, kern_memmap, which describes the
-    memory Linux is actually using and the attribute for each region.
-    This contains only system memory; it does not contain MMIO space.
-
-    The kern_memmap table typically contains only a subset of the system
-    memory described by the efi_memmap.  Linux/ia64 can't use all memory
-    in the system because of constraints imposed by the identity mapping
-    scheme.
-
-    The efi_memmap table is preserved unmodified because the original
-    boot-time information is required for kexec.
-
-KERNEL IDENTITY MAPPINGS
-
-    Linux/ia64 identity mappings are done with large pages, currently
-    either 16MB or 64MB, referred to as "granules."  Cacheable mappings
-    are speculative[2], so the processor can read any location in the
-    page at any time, independent of the programmer's intentions.  This
-    means that to avoid attribute aliasing, Linux can create a cacheable
-    identity mapping only when the entire granule supports cacheable
-    access.
-
-    Therefore, kern_memmap contains only full granule-sized regions that
-    can referenced safely by an identity mapping.
-
-    Uncacheable mappings are not speculative, so the processor will
-    generate UC accesses only to locations explicitly referenced by
-    software.  This allows UC identity mappings to cover granules that
-    are only partially populated, or populated with a combination of UC
-    and WB regions.
-
-USER MAPPINGS
-
-    User mappings are typically done with 16K or 64K pages.  The smaller
-    page size allows more flexibility because only 16K or 64K has to be
-    homogeneous with respect to memory attributes.
-
-POTENTIAL ATTRIBUTE ALIASING CASES
-
-    There are several ways the kernel creates new mappings:
-
-    mmap of /dev/mem
-
-	This uses remap_pfn_range(), which creates user mappings.  These
-	mappings may be either WB or UC.  If the region being mapped
-	happens to be in kern_memmap, meaning that it may also be mapped
-	by a kernel identity mapping, the user mapping must use the same
-	attribute as the kernel mapping.
-
-	If the region is not in kern_memmap, the user mapping should use
-	an attribute reported as being supported in the EFI memory map.
-
-	Since the EFI memory map does not describe MMIO on some
-	machines, this should use an uncacheable mapping as a fallback.
-
-    mmap of /sys/class/pci_bus/.../legacy_mem
-
-	This is very similar to mmap of /dev/mem, except that legacy_mem
-	only allows mmap of the one megabyte "legacy MMIO" area for a
-	specific PCI bus.  Typically this is the first megabyte of
-	physical address space, but it may be different on machines with
-	several VGA devices.
-
-	"X" uses this to access VGA frame buffers.  Using legacy_mem
-	rather than /dev/mem allows multiple instances of X to talk to
-	different VGA cards.
-
-	The /dev/mem mmap constraints apply.
-
-    mmap of /proc/bus/pci/.../??.?
-
-    	This is an MMIO mmap of PCI functions, which additionally may or
-	may not be requested as using the WC attribute.
-
-	If WC is requested, and the region in kern_memmap is either WC
-	or UC, and the EFI memory map designates the region as WC, then
-	the WC mapping is allowed.
-
-	Otherwise, the user mapping must use the same attribute as the
-	kernel mapping.
-
-    read/write of /dev/mem
-
-	This uses copy_from_user(), which implicitly uses a kernel
-	identity mapping.  This is obviously safe for things in
-	kern_memmap.
-
-	There may be corner cases of things that are not in kern_memmap,
-	but could be accessed this way.  For example, registers in MMIO
-	space are not in kern_memmap, but could be accessed with a UC
-	mapping.  This would not cause attribute aliasing.  But
-	registers typically can be accessed only with four-byte or
-	eight-byte accesses, and the copy_from_user() path doesn't allow
-	any control over the access size, so this would be dangerous.
-
-    ioremap()
-
-	This returns a mapping for use inside the kernel.
-
-	If the region is in kern_memmap, we should use the attribute
-	specified there.
-
-	If the EFI memory map reports that the entire granule supports
-	WB, we should use that (granules that are partially reserved
-	or occupied by firmware do not appear in kern_memmap).
-
-	If the granule contains non-WB memory, but we can cover the
-	region safely with kernel page table mappings, we can use
-	ioremap_page_range() as most other architectures do.
-
-	Failing all of the above, we have to fall back to a UC mapping.
-
-PAST PROBLEM CASES
-
-    mmap of various MMIO regions from /dev/mem by "X" on Intel platforms
-
-      The EFI memory map may not report these MMIO regions.
-
-      These must be allowed so that X will work.  This means that
-      when the EFI memory map is incomplete, every /dev/mem mmap must
-      succeed.  It may create either WB or UC user mappings, depending
-      on whether the region is in kern_memmap or the EFI memory map.
-
-    mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled
-
-      The EFI memory map reports the following attributes:
-        0x00000-0x9FFFF WB only
-        0xA0000-0xBFFFF UC only (VGA frame buffer)
-        0xC0000-0xFFFFF WB only
-
-      This mmap is done with user pages, not kernel identity mappings,
-      so it is safe to use WB mappings.
-
-      The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000,
-      which uses a granule-sized UC mapping.  This granule will cover some
-      WB-only memory, but since UC is non-speculative, the processor will
-      never generate an uncacheable reference to the WB-only areas unless
-      the driver explicitly touches them.
-
-    mmap of 0x0-0xFFFFF legacy_mem by "X"
-
-      If the EFI memory map reports that the entire range supports the
-      same attributes, we can allow the mmap (and we will prefer WB if
-      supported, as is the case with HP sx[12]000 machines with VGA
-      disabled).
-
-      If EFI reports the range as partly WB and partly UC (as on sx[12]000
-      machines with VGA enabled), we must fail the mmap because there's no
-      safe attribute to use.
-
-      If EFI reports some of the range but not all (as on Intel firmware
-      that doesn't report the VGA frame buffer at all), we should fail the
-      mmap and force the user to map just the specific region of interest.
-
-    mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled
-
-      The EFI memory map reports the following attributes:
-        0x00000-0xFFFFF WB only (no VGA MMIO hole)
-
-      This is a special case of the previous case, and the mmap should
-      fail for the same reason as above.
-
-    read of /sys/devices/.../rom
-
-      For VGA devices, this may cause an ioremap() of 0xC0000.  This
-      used to be done with a UC mapping, because the VGA frame buffer
-      at 0xA0000 prevents use of a WB granule.  The UC mapping causes
-      an MCA on HP sx[12]000 chipsets.
-
-      We should use WB page table mappings to avoid covering the VGA
-      frame buffer.
-
-NOTES
-
-    [1] SDM rev 2.2, vol 2, sec 4.4.1.
-    [2] SDM rev 2.2, vol 2, sec 4.4.6.
diff --git a/Documentation/ia64/efirtc.rst b/Documentation/ia64/efirtc.rst
new file mode 100644
index 000000000000..2f7ff5026308
--- /dev/null
+++ b/Documentation/ia64/efirtc.rst
@@ -0,0 +1,144 @@
+==========================
+EFI Real Time Clock driver
+==========================
+
+S. Eranian <eranian@hpl.hp.com>
+
+March 2000
+
+1. Introduction
+===============
+
+This document describes the efirtc.c driver has provided for
+the IA-64 platform.
+
+The purpose of this driver is to supply an API for kernel and user applications
+to get access to the Time Service offered by EFI version 0.92.
+
+EFI provides 4 calls one can make once the OS is booted: GetTime(),
+SetTime(), GetWakeupTime(), SetWakeupTime() which are all supported by this
+driver. We describe those calls as well the design of the driver in the
+following sections.
+
+2. Design Decisions
+===================
+
+The original ideas was to provide a very simple driver to get access to,
+at first, the time of day service. This is required in order to access, in a
+portable way, the CMOS clock. A program like /sbin/hwclock uses such a clock
+to initialize the system view of the time during boot.
+
+Because we wanted to minimize the impact on existing user-level apps using
+the CMOS clock, we decided to expose an API that was very similar to the one
+used today with the legacy RTC driver (driver/char/rtc.c). However, because
+EFI provides a simpler services, not all ioctl() are available. Also
+new ioctl()s have been introduced for things that EFI provides but not the
+legacy.
+
+EFI uses a slightly different way of representing the time, noticeably
+the reference date is different. Year is the using the full 4-digit format.
+The Epoch is January 1st 1998. For backward compatibility reasons we don't
+expose this new way of representing time. Instead we use something very
+similar to the struct tm, i.e. struct rtc_time, as used by hwclock.
+One of the reasons for doing it this way is to allow for EFI to still evolve
+without necessarily impacting any of the user applications. The decoupling
+enables flexibility and permits writing wrapper code is ncase things change.
+
+The driver exposes two interfaces, one via the device file and a set of
+ioctl()s. The other is read-only via the /proc filesystem.
+
+As of today we don't offer a /proc/sys interface.
+
+To allow for a uniform interface between the legacy RTC and EFI time service,
+we have created the include/linux/rtc.h header file to contain only the
+"public" API of the two drivers.  The specifics of the legacy RTC are still
+in include/linux/mc146818rtc.h.
+
+
+3. Time of day service
+======================
+
+The part of the driver gives access to the time of day service of EFI.
+Two ioctl()s, compatible with the legacy RTC calls:
+
+	Read the CMOS clock::
+
+		ioctl(d, RTC_RD_TIME, &rtc);
+
+	Write the CMOS clock::
+
+		ioctl(d, RTC_SET_TIME, &rtc);
+
+The rtc is a pointer to a data structure defined in rtc.h which is close
+to a struct tm::
+
+  struct rtc_time {
+          int tm_sec;
+          int tm_min;
+          int tm_hour;
+          int tm_mday;
+          int tm_mon;
+          int tm_year;
+          int tm_wday;
+          int tm_yday;
+          int tm_isdst;
+  };
+
+The driver takes care of converting back an forth between the EFI time and
+this format.
+
+Those two ioctl()s can be exercised with the hwclock command:
+
+For reading::
+
+	# /sbin/hwclock --show
+	Mon Mar  6 15:32:32 2000  -0.910248 seconds
+
+For setting::
+
+	# /sbin/hwclock --systohc
+
+Root privileges are required to be able to set the time of day.
+
+4. Wakeup Alarm service
+=======================
+
+EFI provides an API by which one can program when a machine should wakeup,
+i.e. reboot. This is very different from the alarm provided by the legacy
+RTC which is some kind of interval timer alarm. For this reason we don't use
+the same ioctl()s to get access to the service. Instead we have
+introduced 2 news ioctl()s to the interface of an RTC.
+
+We have added 2 new ioctl()s that are specific to the EFI driver:
+
+	Read the current state of the alarm::
+
+		ioctl(d, RTC_WKLAM_RD, &wkt)
+
+	Set the alarm or change its status::
+
+		ioctl(d, RTC_WKALM_SET, &wkt)
+
+The wkt structure encapsulates a struct rtc_time + 2 extra fields to get
+status information::
+
+  struct rtc_wkalrm {
+
+          unsigned char enabled; /* =1 if alarm is enabled */
+          unsigned char pending; /* =1 if alarm is pending  */
+
+          struct rtc_time time;
+  }
+
+As of today, none of the existing user-level apps supports this feature.
+However writing such a program should be hard by simply using those two
+ioctl().
+
+Root privileges are required to be able to set the alarm.
+
+5. References
+=============
+
+Checkout the following Web site for more information on EFI:
+
+http://developer.intel.com/technology/efi/
diff --git a/Documentation/ia64/efirtc.txt b/Documentation/ia64/efirtc.txt
deleted file mode 100644
index 057e6bebda8f..000000000000
--- a/Documentation/ia64/efirtc.txt
+++ /dev/null
@@ -1,128 +0,0 @@
-EFI Real Time Clock driver
--------------------------------
-S. Eranian <eranian@hpl.hp.com>
-March 2000
-
-I/ Introduction
-
-This document describes the efirtc.c driver has provided for
-the IA-64 platform. 
-
-The purpose of this driver is to supply an API for kernel and user applications
-to get access to the Time Service offered by EFI version 0.92.
-
-EFI provides 4 calls one can make once the OS is booted: GetTime(),
-SetTime(), GetWakeupTime(), SetWakeupTime() which are all supported by this
-driver. We describe those calls as well the design of the driver in the
-following sections.
-
-II/ Design Decisions
-
-The original ideas was to provide a very simple driver to get access to, 
-at first, the time of day service. This is required in order to access, in a 
-portable way, the CMOS clock. A program like /sbin/hwclock uses such a clock 
-to initialize the system view of the time during boot.
-
-Because we wanted to minimize the impact on existing user-level apps using
-the CMOS clock, we decided to expose an API that was very similar to the one
-used today with the legacy RTC driver (driver/char/rtc.c). However, because 
-EFI provides a simpler services, not all ioctl() are available. Also
-new ioctl()s have been introduced for things that EFI provides but not the 
-legacy.
-
-EFI uses a slightly different way of representing the time, noticeably
-the reference date is different. Year is the using the full 4-digit format.
-The Epoch is January 1st 1998. For backward compatibility reasons we don't
-expose this new way of representing time. Instead we use something very 
-similar to the struct tm, i.e. struct rtc_time, as used by hwclock.
-One of the reasons for doing it this way is to allow for EFI to still evolve
-without necessarily impacting any of the user applications. The decoupling
-enables flexibility and permits writing wrapper code is ncase things change.
-
-The driver exposes two interfaces, one via the device file and a set of
-ioctl()s. The other is read-only via the /proc filesystem. 
-
-As of today we don't offer a /proc/sys interface.
-
-To allow for a uniform interface between the legacy RTC and EFI time service,
-we have created the include/linux/rtc.h header file to contain only the 
-"public" API of the two drivers.  The specifics of the legacy RTC are still 
-in include/linux/mc146818rtc.h.
-
- 
-III/ Time of day service
-
-The part of the driver gives access to the time of day service of EFI.
-Two ioctl()s, compatible with the legacy RTC calls:
-
-	Read the CMOS clock: ioctl(d, RTC_RD_TIME, &rtc);
-
-	Write the CMOS clock: ioctl(d, RTC_SET_TIME, &rtc);
-
-The rtc is a pointer to a data structure defined in rtc.h which is close
-to a struct tm:
-
-struct rtc_time {
-        int tm_sec;
-        int tm_min;
-        int tm_hour;
-        int tm_mday;
-        int tm_mon;
-        int tm_year;
-        int tm_wday;
-        int tm_yday;
-        int tm_isdst;
-};
-
-The driver takes care of converting back an forth between the EFI time and
-this format.
-
-Those two ioctl()s can be exercised with the hwclock command:
-
-For reading:
-# /sbin/hwclock --show
-Mon Mar  6 15:32:32 2000  -0.910248 seconds
-
-For setting:
-# /sbin/hwclock --systohc
-
-Root privileges are required to be able to set the time of day.
-
-IV/ Wakeup Alarm service
-
-EFI provides an API by which one can program when a machine should wakeup,
-i.e. reboot. This is very different from the alarm provided by the legacy
-RTC which is some kind of interval timer alarm. For this reason we don't use
-the same ioctl()s to get access to the service. Instead we have
-introduced 2 news ioctl()s to the interface of an RTC. 
-
-We have added 2 new ioctl()s that are specific to the EFI driver:
-
-	Read the current state of the alarm
-	ioctl(d, RTC_WKLAM_RD, &wkt)
-
-	Set the alarm or change its status
-	ioctl(d, RTC_WKALM_SET, &wkt)
-
-The wkt structure encapsulates a struct rtc_time + 2 extra fields to get 
-status information:
-	
-struct rtc_wkalrm {
-
-        unsigned char enabled; /* =1 if alarm is enabled */
-        unsigned char pending; /* =1 if alarm is pending  */
-
-        struct rtc_time time;
-} 
-
-As of today, none of the existing user-level apps supports this feature.
-However writing such a program should be hard by simply using those two 
-ioctl(). 
-
-Root privileges are required to be able to set the alarm.
-
-V/ References.
-
-Checkout the following Web site for more information on EFI:
-
-http://developer.intel.com/technology/efi/
diff --git a/Documentation/ia64/err_inject.rst b/Documentation/ia64/err_inject.rst
new file mode 100644
index 000000000000..900f71e93a29
--- /dev/null
+++ b/Documentation/ia64/err_inject.rst
@@ -0,0 +1,1067 @@
+========================================
+IPF Machine Check (MC) error inject tool
+========================================
+
+IPF Machine Check (MC) error inject tool is used to inject MC
+errors from Linux. The tool is a test bed for IPF MC work flow including
+hardware correctable error handling, OS recoverable error handling, MC
+event logging, etc.
+
+The tool includes two parts: a kernel driver and a user application
+sample. The driver provides interface to PAL to inject error
+and query error injection capabilities. The driver code is in
+arch/ia64/kernel/err_inject.c. The application sample (shown below)
+provides a combination of various errors and calls the driver's interface
+(sysfs interface) to inject errors or query error injection capabilities.
+
+The tool can be used to test Intel IPF machine MC handling capabilities.
+It's especially useful for people who can not access hardware MC injection
+tool to inject error. It's also very useful to integrate with other
+software test suits to do stressful testing on IPF.
+
+Below is a sample application as part of the whole tool. The sample
+can be used as a working test tool. Or it can be expanded to include
+more features. It also can be a integrated into a library or other user
+application to have more thorough test.
+
+The sample application takes err.conf as error configuration input. GCC
+compiles the code. After you install err_inject driver, you can run
+this sample application to inject errors.
+
+Errata: Itanium 2 Processors Specification Update lists some errata against
+the pal_mc_error_inject PAL procedure. The following err.conf has been tested
+on latest Montecito PAL.
+
+err.conf::
+
+  #This is configuration file for err_inject_tool.
+  #The format of the each line is:
+  #cpu, loop, interval, err_type_info, err_struct_info, err_data_buffer
+  #where
+  #	cpu: logical cpu number the error will be inject in.
+  #	loop: times the error will be injected.
+  #	interval: In second. every so often one error is injected.
+  #	err_type_info, err_struct_info: PAL parameters.
+  #
+  #Note: All values are hex w/o or w/ 0x prefix.
+
+
+  #On cpu2, inject only total 0x10 errors, interval 5 seconds
+  #corrected, data cache, hier-2, physical addr(assigned by tool code).
+  #working on Montecito latest PAL.
+  2, 10, 5, 4101, 95
+
+  #On cpu4, inject and consume total 0x10 errors, interval 5 seconds
+  #corrected, data cache, hier-2, physical addr(assigned by tool code).
+  #working on Montecito latest PAL.
+  4, 10, 5, 4109, 95
+
+  #On cpu15, inject and consume total 0x10 errors, interval 5 seconds
+  #recoverable, DTR0, hier-2.
+  #working on Montecito latest PAL.
+  0xf, 0x10, 5, 4249, 15
+
+The sample application source code:
+
+err_injection_tool.c::
+
+  /*
+   * This program is free software; you can redistribute it and/or modify
+   * it under the terms of the GNU General Public License as published by
+   * the Free Software Foundation; either version 2 of the License, or
+   * (at your option) any later version.
+   *
+   * This program is distributed in the hope that it will be useful, but
+   * WITHOUT ANY WARRANTY; without even the implied warranty of
+   * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+   * NON INFRINGEMENT.  See the GNU General Public License for more
+   * details.
+   *
+   * You should have received a copy of the GNU General Public License
+   * along with this program; if not, write to the Free Software
+   * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+   *
+   * Copyright (C) 2006 Intel Co
+   *	Fenghua Yu <fenghua.yu@intel.com>
+   *
+   */
+  #include <sys/types.h>
+  #include <sys/stat.h>
+  #include <fcntl.h>
+  #include <stdio.h>
+  #include <sched.h>
+  #include <unistd.h>
+  #include <stdlib.h>
+  #include <stdarg.h>
+  #include <string.h>
+  #include <errno.h>
+  #include <time.h>
+  #include <sys/ipc.h>
+  #include <sys/sem.h>
+  #include <sys/wait.h>
+  #include <sys/mman.h>
+  #include <sys/shm.h>
+
+  #define MAX_FN_SIZE 		256
+  #define MAX_BUF_SIZE 		256
+  #define DATA_BUF_SIZE 		256
+  #define NR_CPUS 		512
+  #define MAX_TASK_NUM		2048
+  #define MIN_INTERVAL		5	// seconds
+  #define	ERR_DATA_BUFFER_SIZE 	3	// Three 8-byte.
+  #define PARA_FIELD_NUM		5
+  #define MASK_SIZE		(NR_CPUS/64)
+  #define PATH_FORMAT "/sys/devices/system/cpu/cpu%d/err_inject/"
+
+  int sched_setaffinity(pid_t pid, unsigned int len, unsigned long *mask);
+
+  int verbose;
+  #define vbprintf if (verbose) printf
+
+  int log_info(int cpu, const char *fmt, ...)
+  {
+	FILE *log;
+	char fn[MAX_FN_SIZE];
+	char buf[MAX_BUF_SIZE];
+	va_list args;
+
+	sprintf(fn, "%d.log", cpu);
+	log=fopen(fn, "a+");
+	if (log==NULL) {
+		perror("Error open:");
+		return -1;
+	}
+
+	va_start(args, fmt);
+	vprintf(fmt, args);
+	memset(buf, 0, MAX_BUF_SIZE);
+	vsprintf(buf, fmt, args);
+	va_end(args);
+
+	fwrite(buf, sizeof(buf), 1, log);
+	fclose(log);
+
+	return 0;
+  }
+
+  typedef unsigned long u64;
+  typedef unsigned int  u32;
+
+  typedef union err_type_info_u {
+	struct {
+		u64	mode		: 3,	/* 0-2 */
+			err_inj		: 3,	/* 3-5 */
+			err_sev		: 2,	/* 6-7 */
+			err_struct	: 5,	/* 8-12 */
+			struct_hier	: 3,	/* 13-15 */
+			reserved	: 48;	/* 16-63 */
+	} err_type_info_u;
+	u64	err_type_info;
+  } err_type_info_t;
+
+  typedef union err_struct_info_u {
+	struct {
+		u64	siv		: 1,	/* 0	 */
+			c_t		: 2,	/* 1-2	 */
+			cl_p		: 3,	/* 3-5	 */
+			cl_id		: 3,	/* 6-8	 */
+			cl_dp		: 1,	/* 9	 */
+			reserved1	: 22,	/* 10-31 */
+			tiv		: 1,	/* 32	 */
+			trigger		: 4,	/* 33-36 */
+			trigger_pl 	: 3,	/* 37-39 */
+			reserved2 	: 24;	/* 40-63 */
+	} err_struct_info_cache;
+	struct {
+		u64	siv		: 1,	/* 0	 */
+			tt		: 2,	/* 1-2	 */
+			tc_tr		: 2,	/* 3-4	 */
+			tr_slot		: 8,	/* 5-12	 */
+			reserved1	: 19,	/* 13-31 */
+			tiv		: 1,	/* 32	 */
+			trigger		: 4,	/* 33-36 */
+			trigger_pl 	: 3,	/* 37-39 */
+			reserved2 	: 24;	/* 40-63 */
+	} err_struct_info_tlb;
+	struct {
+		u64	siv		: 1,	/* 0	 */
+			regfile_id	: 4,	/* 1-4	 */
+			reg_num		: 7,	/* 5-11	 */
+			reserved1	: 20,	/* 12-31 */
+			tiv		: 1,	/* 32	 */
+			trigger		: 4,	/* 33-36 */
+			trigger_pl 	: 3,	/* 37-39 */
+			reserved2 	: 24;	/* 40-63 */
+	} err_struct_info_register;
+	struct {
+		u64	reserved;
+	} err_struct_info_bus_processor_interconnect;
+	u64	err_struct_info;
+  } err_struct_info_t;
+
+  typedef union err_data_buffer_u {
+	struct {
+		u64	trigger_addr;		/* 0-63		*/
+		u64	inj_addr;		/* 64-127 	*/
+		u64	way		: 5,	/* 128-132	*/
+			index		: 20,	/* 133-152	*/
+					: 39;	/* 153-191	*/
+	} err_data_buffer_cache;
+	struct {
+		u64	trigger_addr;		/* 0-63		*/
+		u64	inj_addr;		/* 64-127 	*/
+		u64	way		: 5,	/* 128-132	*/
+			index		: 20,	/* 133-152	*/
+			reserved	: 39;	/* 153-191	*/
+	} err_data_buffer_tlb;
+	struct {
+		u64	trigger_addr;		/* 0-63		*/
+	} err_data_buffer_register;
+	struct {
+		u64	reserved;		/* 0-63		*/
+	} err_data_buffer_bus_processor_interconnect;
+	u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
+  } err_data_buffer_t;
+
+  typedef union capabilities_u {
+	struct {
+		u64	i		: 1,
+			d		: 1,
+			rv		: 1,
+			tag		: 1,
+			data		: 1,
+			mesi		: 1,
+			dp		: 1,
+			reserved1	: 3,
+			pa		: 1,
+			va		: 1,
+			wi		: 1,
+			reserved2	: 20,
+			trigger		: 1,
+			trigger_pl	: 1,
+			reserved3	: 30;
+	} capabilities_cache;
+	struct {
+		u64	d		: 1,
+			i		: 1,
+			rv		: 1,
+			tc		: 1,
+			tr		: 1,
+			reserved1	: 27,
+			trigger		: 1,
+			trigger_pl	: 1,
+			reserved2	: 30;
+	} capabilities_tlb;
+	struct {
+		u64	gr_b0		: 1,
+			gr_b1		: 1,
+			fr		: 1,
+			br		: 1,
+			pr		: 1,
+			ar		: 1,
+			cr		: 1,
+			rr		: 1,
+			pkr		: 1,
+			dbr		: 1,
+			ibr		: 1,
+			pmc		: 1,
+			pmd		: 1,
+			reserved1	: 3,
+			regnum		: 1,
+			reserved2	: 15,
+			trigger		: 1,
+			trigger_pl	: 1,
+			reserved3	: 30;
+	} capabilities_register;
+	struct {
+		u64	reserved;
+	} capabilities_bus_processor_interconnect;
+  } capabilities_t;
+
+  typedef struct resources_s {
+	u64	ibr0		: 1,
+		ibr2		: 1,
+		ibr4		: 1,
+		ibr6		: 1,
+		dbr0		: 1,
+		dbr2		: 1,
+		dbr4		: 1,
+		dbr6		: 1,
+		reserved	: 48;
+  } resources_t;
+
+
+  long get_page_size(void)
+  {
+	long page_size=sysconf(_SC_PAGESIZE);
+	return page_size;
+  }
+
+  #define PAGE_SIZE (get_page_size()==-1?0x4000:get_page_size())
+  #define SHM_SIZE (2*PAGE_SIZE*NR_CPUS)
+  #define SHM_VA 0x2000000100000000
+
+  int shmid;
+  void *shmaddr;
+
+  int create_shm(void)
+  {
+	key_t key;
+	char fn[MAX_FN_SIZE];
+
+	/* cpu0 is always existing */
+	sprintf(fn, PATH_FORMAT, 0);
+	if ((key = ftok(fn, 's')) == -1) {
+		perror("ftok");
+		return -1;
+	}
+
+	shmid = shmget(key, SHM_SIZE, 0644 | IPC_CREAT);
+	if (shmid == -1) {
+		if (errno==EEXIST) {
+			shmid = shmget(key, SHM_SIZE, 0);
+			if (shmid == -1) {
+				perror("shmget");
+				return -1;
+			}
+		}
+		else {
+			perror("shmget");
+			return -1;
+		}
+	}
+	vbprintf("shmid=%d", shmid);
+
+	/* connect to the segment: */
+	shmaddr = shmat(shmid, (void *)SHM_VA, 0);
+	if (shmaddr == (void*)-1) {
+		perror("shmat");
+		return -1;
+	}
+
+	memset(shmaddr, 0, SHM_SIZE);
+	mlock(shmaddr, SHM_SIZE);
+
+	return 0;
+  }
+
+  int free_shm()
+  {
+	munlock(shmaddr, SHM_SIZE);
+          shmdt(shmaddr);
+	semctl(shmid, 0, IPC_RMID);
+
+	return 0;
+  }
+
+  #ifdef _SEM_SEMUN_UNDEFINED
+  union semun
+  {
+	int val;
+	struct semid_ds *buf;
+	unsigned short int *array;
+	struct seminfo *__buf;
+  };
+  #endif
+
+  u32 mode=1; /* 1: physical mode; 2: virtual mode. */
+  int one_lock=1;
+  key_t key[NR_CPUS];
+  int semid[NR_CPUS];
+
+  int create_sem(int cpu)
+  {
+	union semun arg;
+	char fn[MAX_FN_SIZE];
+	int sid;
+
+	sprintf(fn, PATH_FORMAT, cpu);
+	sprintf(fn, "%s/%s", fn, "err_type_info");
+	if ((key[cpu] = ftok(fn, 'e')) == -1) {
+		perror("ftok");
+		return -1;
+	}
+
+	if (semid[cpu]!=0)
+		return 0;
+
+	/* clear old semaphore */
+	if ((sid = semget(key[cpu], 1, 0)) != -1)
+		semctl(sid, 0, IPC_RMID);
+
+	/* get one semaphore */
+	if ((semid[cpu] = semget(key[cpu], 1, IPC_CREAT | IPC_EXCL)) == -1) {
+		perror("semget");
+		printf("Please remove semaphore with key=0x%lx, then run the tool.\n",
+			(u64)key[cpu]);
+		return -1;
+	}
+
+	vbprintf("semid[%d]=0x%lx, key[%d]=%lx\n",cpu,(u64)semid[cpu],cpu,
+		(u64)key[cpu]);
+	/* initialize the semaphore to 1: */
+	arg.val = 1;
+	if (semctl(semid[cpu], 0, SETVAL, arg) == -1) {
+		perror("semctl");
+		return -1;
+	}
+
+	return 0;
+  }
+
+  static int lock(int cpu)
+  {
+	struct sembuf lock;
+
+	lock.sem_num = cpu;
+	lock.sem_op = 1;
+	semop(semid[cpu], &lock, 1);
+
+          return 0;
+  }
+
+  static int unlock(int cpu)
+  {
+	struct sembuf unlock;
+
+	unlock.sem_num = cpu;
+	unlock.sem_op = -1;
+	semop(semid[cpu], &unlock, 1);
+
+          return 0;
+  }
+
+  void free_sem(int cpu)
+  {
+	semctl(semid[cpu], 0, IPC_RMID);
+  }
+
+  int wr_multi(char *fn, unsigned long *data, int size)
+  {
+	int fd;
+	char buf[MAX_BUF_SIZE];
+	int ret;
+
+	if (size==1)
+		sprintf(buf, "%lx", *data);
+	else if (size==3)
+		sprintf(buf, "%lx,%lx,%lx", data[0], data[1], data[2]);
+	else {
+		fprintf(stderr,"write to file with wrong size!\n");
+		return -1;
+	}
+
+	fd=open(fn, O_RDWR);
+	if (!fd) {
+		perror("Error:");
+		return -1;
+	}
+	ret=write(fd, buf, sizeof(buf));
+	close(fd);
+	return ret;
+  }
+
+  int wr(char *fn, unsigned long data)
+  {
+	return wr_multi(fn, &data, 1);
+  }
+
+  int rd(char *fn, unsigned long *data)
+  {
+	int fd;
+	char buf[MAX_BUF_SIZE];
+
+	fd=open(fn, O_RDONLY);
+	if (fd<0) {
+		perror("Error:");
+		return -1;
+	}
+	read(fd, buf, MAX_BUF_SIZE);
+	*data=strtoul(buf, NULL, 16);
+	close(fd);
+	return 0;
+  }
+
+  int rd_status(char *path, int *status)
+  {
+	char fn[MAX_FN_SIZE];
+	sprintf(fn, "%s/status", path);
+	if (rd(fn, (u64*)status)<0) {
+		perror("status reading error.\n");
+		return -1;
+	}
+
+	return 0;
+  }
+
+  int rd_capabilities(char *path, u64 *capabilities)
+  {
+	char fn[MAX_FN_SIZE];
+	sprintf(fn, "%s/capabilities", path);
+	if (rd(fn, capabilities)<0) {
+		perror("capabilities reading error.\n");
+		return -1;
+	}
+
+	return 0;
+  }
+
+  int rd_all(char *path)
+  {
+	unsigned long err_type_info, err_struct_info, err_data_buffer;
+	int status;
+	unsigned long capabilities, resources;
+	char fn[MAX_FN_SIZE];
+
+	sprintf(fn, "%s/err_type_info", path);
+	if (rd(fn, &err_type_info)<0) {
+		perror("err_type_info reading error.\n");
+		return -1;
+	}
+	printf("err_type_info=%lx\n", err_type_info);
+
+	sprintf(fn, "%s/err_struct_info", path);
+	if (rd(fn, &err_struct_info)<0) {
+		perror("err_struct_info reading error.\n");
+		return -1;
+	}
+	printf("err_struct_info=%lx\n", err_struct_info);
+
+	sprintf(fn, "%s/err_data_buffer", path);
+	if (rd(fn, &err_data_buffer)<0) {
+		perror("err_data_buffer reading error.\n");
+		return -1;
+	}
+	printf("err_data_buffer=%lx\n", err_data_buffer);
+
+	sprintf(fn, "%s/status", path);
+	if (rd("status", (u64*)&status)<0) {
+		perror("status reading error.\n");
+		return -1;
+	}
+	printf("status=%d\n", status);
+
+	sprintf(fn, "%s/capabilities", path);
+	if (rd(fn,&capabilities)<0) {
+		perror("capabilities reading error.\n");
+		return -1;
+	}
+	printf("capabilities=%lx\n", capabilities);
+
+	sprintf(fn, "%s/resources", path);
+	if (rd(fn, &resources)<0) {
+		perror("resources reading error.\n");
+		return -1;
+	}
+	printf("resources=%lx\n", resources);
+
+	return 0;
+  }
+
+  int query_capabilities(char *path, err_type_info_t err_type_info,
+			u64 *capabilities)
+  {
+	char fn[MAX_FN_SIZE];
+	err_struct_info_t err_struct_info;
+	err_data_buffer_t err_data_buffer;
+
+	err_struct_info.err_struct_info=0;
+	memset(err_data_buffer.err_data_buffer, -1, ERR_DATA_BUFFER_SIZE*8);
+
+	sprintf(fn, "%s/err_type_info", path);
+	wr(fn, err_type_info.err_type_info);
+	sprintf(fn, "%s/err_struct_info", path);
+	wr(fn, 0x0);
+	sprintf(fn, "%s/err_data_buffer", path);
+	wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
+
+	// Fire pal_mc_error_inject procedure.
+	sprintf(fn, "%s/call_start", path);
+	wr(fn, mode);
+
+	if (rd_capabilities(path, capabilities)<0)
+		return -1;
+
+	return 0;
+  }
+
+  int query_all_capabilities()
+  {
+	int status;
+	err_type_info_t err_type_info;
+	int err_sev, err_struct, struct_hier;
+	int cap=0;
+	u64 capabilities;
+	char path[MAX_FN_SIZE];
+
+	err_type_info.err_type_info=0;			// Initial
+	err_type_info.err_type_info_u.mode=0;		// Query mode;
+	err_type_info.err_type_info_u.err_inj=0;
+
+	printf("All capabilities implemented in pal_mc_error_inject:\n");
+	sprintf(path, PATH_FORMAT ,0);
+	for (err_sev=0;err_sev<3;err_sev++)
+		for (err_struct=0;err_struct<5;err_struct++)
+			for (struct_hier=0;struct_hier<5;struct_hier++)
+	{
+		status=-1;
+		capabilities=0;
+		err_type_info.err_type_info_u.err_sev=err_sev;
+		err_type_info.err_type_info_u.err_struct=err_struct;
+		err_type_info.err_type_info_u.struct_hier=struct_hier;
+
+		if (query_capabilities(path, err_type_info, &capabilities)<0)
+			continue;
+
+		if (rd_status(path, &status)<0)
+			continue;
+
+		if (status==0) {
+			cap=1;
+			printf("For err_sev=%d, err_struct=%d, struct_hier=%d: ",
+				err_sev, err_struct, struct_hier);
+			printf("capabilities 0x%lx\n", capabilities);
+		}
+	}
+	if (!cap) {
+		printf("No capabilities supported.\n");
+		return 0;
+	}
+
+	return 0;
+  }
+
+  int err_inject(int cpu, char *path, err_type_info_t err_type_info,
+		err_struct_info_t err_struct_info,
+		err_data_buffer_t err_data_buffer)
+  {
+	int status;
+	char fn[MAX_FN_SIZE];
+
+	log_info(cpu, "err_type_info=%lx, err_struct_info=%lx, ",
+		err_type_info.err_type_info,
+		err_struct_info.err_struct_info);
+	log_info(cpu,"err_data_buffer=[%lx,%lx,%lx]\n",
+		err_data_buffer.err_data_buffer[0],
+		err_data_buffer.err_data_buffer[1],
+		err_data_buffer.err_data_buffer[2]);
+	sprintf(fn, "%s/err_type_info", path);
+	wr(fn, err_type_info.err_type_info);
+	sprintf(fn, "%s/err_struct_info", path);
+	wr(fn, err_struct_info.err_struct_info);
+	sprintf(fn, "%s/err_data_buffer", path);
+	wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
+
+	// Fire pal_mc_error_inject procedure.
+	sprintf(fn, "%s/call_start", path);
+	wr(fn,mode);
+
+	if (rd_status(path, &status)<0) {
+		vbprintf("fail: read status\n");
+		return -100;
+	}
+
+	if (status!=0) {
+		log_info(cpu, "fail: status=%d\n", status);
+		return status;
+	}
+
+	return status;
+  }
+
+  static int construct_data_buf(char *path, err_type_info_t err_type_info,
+		err_struct_info_t err_struct_info,
+		err_data_buffer_t *err_data_buffer,
+		void *va1)
+  {
+	char fn[MAX_FN_SIZE];
+	u64 virt_addr=0, phys_addr=0;
+
+	vbprintf("va1=%lx\n", (u64)va1);
+	memset(&err_data_buffer->err_data_buffer_cache, 0, ERR_DATA_BUFFER_SIZE*8);
+
+	switch (err_type_info.err_type_info_u.err_struct) {
+		case 1: // Cache
+			switch (err_struct_info.err_struct_info_cache.cl_id) {
+				case 1: //Virtual addr
+					err_data_buffer->err_data_buffer_cache.inj_addr=(u64)va1;
+					break;
+				case 2: //Phys addr
+					sprintf(fn, "%s/virtual_to_phys", path);
+					virt_addr=(u64)va1;
+					if (wr(fn,virt_addr)<0)
+						return -1;
+					rd(fn, &phys_addr);
+					err_data_buffer->err_data_buffer_cache.inj_addr=phys_addr;
+					break;
+				default:
+					printf("Not supported cl_id\n");
+					break;
+			}
+			break;
+		case 2: //  TLB
+			break;
+		case 3: //  Register file
+			break;
+		case 4: //  Bus/system interconnect
+		default:
+			printf("Not supported err_struct\n");
+			break;
+	}
+
+	return 0;
+  }
+
+  typedef struct {
+	u64 cpu;
+	u64 loop;
+	u64 interval;
+	u64 err_type_info;
+	u64 err_struct_info;
+	u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
+  } parameters_t;
+
+  parameters_t line_para;
+  int para;
+
+  static int empty_data_buffer(u64 *err_data_buffer)
+  {
+	int empty=1;
+	int i;
+
+	for (i=0;i<ERR_DATA_BUFFER_SIZE; i++)
+	   if (err_data_buffer[i]!=-1)
+		empty=0;
+
+	return empty;
+  }
+
+  int err_inj()
+  {
+	err_type_info_t err_type_info;
+	err_struct_info_t err_struct_info;
+	err_data_buffer_t err_data_buffer;
+	int count;
+	FILE *fp;
+	unsigned long cpu, loop, interval, err_type_info_conf, err_struct_info_conf;
+	u64 err_data_buffer_conf[ERR_DATA_BUFFER_SIZE];
+	int num;
+	int i;
+	char path[MAX_FN_SIZE];
+	parameters_t parameters[MAX_TASK_NUM]={};
+	pid_t child_pid[MAX_TASK_NUM];
+	time_t current_time;
+	int status;
+
+	if (!para) {
+	    fp=fopen("err.conf", "r");
+	    if (fp==NULL) {
+		perror("Error open err.conf");
+		return -1;
+	    }
+
+	    num=0;
+	    while (!feof(fp)) {
+		char buf[256];
+		memset(buf,0,256);
+		fgets(buf, 256, fp);
+		count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
+				&cpu, &loop, &interval,&err_type_info_conf,
+				&err_struct_info_conf,
+				&err_data_buffer_conf[0],
+				&err_data_buffer_conf[1],
+				&err_data_buffer_conf[2]);
+		if (count!=PARA_FIELD_NUM+3) {
+			err_data_buffer_conf[0]=-1;
+			err_data_buffer_conf[1]=-1;
+			err_data_buffer_conf[2]=-1;
+			count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx\n",
+				&cpu, &loop, &interval,&err_type_info_conf,
+				&err_struct_info_conf);
+			if (count!=PARA_FIELD_NUM)
+				continue;
+		}
+
+		parameters[num].cpu=cpu;
+		parameters[num].loop=loop;
+		parameters[num].interval= interval>MIN_INTERVAL
+					  ?interval:MIN_INTERVAL;
+		parameters[num].err_type_info=err_type_info_conf;
+		parameters[num].err_struct_info=err_struct_info_conf;
+		memcpy(parameters[num++].err_data_buffer,
+			err_data_buffer_conf,ERR_DATA_BUFFER_SIZE*8) ;
+
+		if (num>=MAX_TASK_NUM)
+			break;
+	    }
+	}
+	else {
+		parameters[0].cpu=line_para.cpu;
+		parameters[0].loop=line_para.loop;
+		parameters[0].interval= line_para.interval>MIN_INTERVAL
+					  ?line_para.interval:MIN_INTERVAL;
+		parameters[0].err_type_info=line_para.err_type_info;
+		parameters[0].err_struct_info=line_para.err_struct_info;
+		memcpy(parameters[0].err_data_buffer,
+			line_para.err_data_buffer,ERR_DATA_BUFFER_SIZE*8) ;
+
+		num=1;
+	}
+
+	/* Create semaphore: If one_lock, one semaphore for all processors.
+	   Otherwise, one semaphore for each processor. */
+	if (one_lock) {
+		if (create_sem(0)) {
+			printf("Can not create semaphore...exit\n");
+			free_sem(0);
+			return -1;
+		}
+	}
+	else {
+		for (i=0;i<num;i++) {
+		   if (create_sem(parameters[i].cpu)) {
+			printf("Can not create semaphore for cpu%d...exit\n",i);
+			free_sem(parameters[num].cpu);
+			return -1;
+		   }
+		}
+	}
+
+	/* Create a shm segment which will be used to inject/consume errors on.*/
+	if (create_shm()==-1) {
+		printf("Error to create shm...exit\n");
+		return -1;
+	}
+
+	for (i=0;i<num;i++) {
+		pid_t pid;
+
+		current_time=time(NULL);
+		log_info(parameters[i].cpu, "\nBegine at %s", ctime(&current_time));
+		log_info(parameters[i].cpu, "Configurations:\n");
+		log_info(parameters[i].cpu,"On cpu%ld: loop=%lx, interval=%lx(s)",
+			parameters[i].cpu,
+			parameters[i].loop,
+			parameters[i].interval);
+		log_info(parameters[i].cpu," err_type_info=%lx,err_struct_info=%lx\n",
+			parameters[i].err_type_info,
+			parameters[i].err_struct_info);
+
+		sprintf(path, PATH_FORMAT, (int)parameters[i].cpu);
+		err_type_info.err_type_info=parameters[i].err_type_info;
+		err_struct_info.err_struct_info=parameters[i].err_struct_info;
+		memcpy(err_data_buffer.err_data_buffer,
+			parameters[i].err_data_buffer,
+			ERR_DATA_BUFFER_SIZE*8);
+
+		pid=fork();
+		if (pid==0) {
+			unsigned long mask[MASK_SIZE];
+			int j, k;
+
+			void *va1, *va2;
+
+			/* Allocate two memory areas va1 and va2 in shm */
+			va1=shmaddr+parameters[i].cpu*PAGE_SIZE;
+			va2=shmaddr+parameters[i].cpu*PAGE_SIZE+PAGE_SIZE;
+
+			vbprintf("va1=%lx, va2=%lx\n", (u64)va1, (u64)va2);
+			memset(va1, 0x1, PAGE_SIZE);
+			memset(va2, 0x2, PAGE_SIZE);
+
+			if (empty_data_buffer(err_data_buffer.err_data_buffer))
+				/* If not specified yet, construct data buffer
+				 * with va1
+				 */
+				construct_data_buf(path, err_type_info,
+					err_struct_info, &err_data_buffer,va1);
+
+			for (j=0;j<MASK_SIZE;j++)
+				mask[j]=0;
+
+			cpu=parameters[i].cpu;
+			k = cpu%64;
+			j = cpu/64;
+			mask[j] = 1UL << k;
+
+			if (sched_setaffinity(0, MASK_SIZE*8, mask)==-1) {
+				perror("Error sched_setaffinity:");
+				return -1;
+			}
+
+			for (j=0; j<parameters[i].loop; j++) {
+				log_info(parameters[i].cpu,"Injection ");
+				log_info(parameters[i].cpu,"on cpu%ld: #%d/%ld ",
+
+					parameters[i].cpu,j+1, parameters[i].loop);
+
+				/* Hold the lock */
+				if (one_lock)
+					lock(0);
+				else
+				/* Hold lock on this cpu */
+					lock(parameters[i].cpu);
+
+				if ((status=err_inject(parameters[i].cpu,
+					   path, err_type_info,
+					   err_struct_info, err_data_buffer))
+					   ==0) {
+					/* consume the error for "inject only"*/
+					memcpy(va2, va1, PAGE_SIZE);
+					memcpy(va1, va2, PAGE_SIZE);
+					log_info(parameters[i].cpu,
+						"successful\n");
+				}
+				else {
+					log_info(parameters[i].cpu,"fail:");
+					log_info(parameters[i].cpu,
+						"status=%d\n", status);
+					unlock(parameters[i].cpu);
+					break;
+				}
+				if (one_lock)
+				/* Release the lock */
+					unlock(0);
+				/* Release lock on this cpu */
+				else
+					unlock(parameters[i].cpu);
+
+				if (j < parameters[i].loop-1)
+					sleep(parameters[i].interval);
+			}
+			current_time=time(NULL);
+			log_info(parameters[i].cpu, "Done at %s", ctime(&current_time));
+			return 0;
+		}
+		else if (pid<0) {
+			perror("Error fork:");
+			continue;
+		}
+		child_pid[i]=pid;
+	}
+	for (i=0;i<num;i++)
+		waitpid(child_pid[i], NULL, 0);
+
+	if (one_lock)
+		free_sem(0);
+	else
+		for (i=0;i<num;i++)
+			free_sem(parameters[i].cpu);
+
+	printf("All done.\n");
+
+	return 0;
+  }
+
+  void help()
+  {
+	printf("err_inject_tool:\n");
+	printf("\t-q: query all capabilities. default: off\n");
+	printf("\t-m: procedure mode. 1: physical 2: virtual. default: 1\n");
+	printf("\t-i: inject errors. default: off\n");
+	printf("\t-l: one lock per cpu. default: one lock for all\n");
+	printf("\t-e: error parameters:\n");
+	printf("\t\tcpu,loop,interval,err_type_info,err_struct_info[,err_data_buffer[0],err_data_buffer[1],err_data_buffer[2]]\n");
+	printf("\t\t   cpu: logical cpu number the error will be inject in.\n");
+	printf("\t\t   loop: times the error will be injected.\n");
+	printf("\t\t   interval: In second. every so often one error is injected.\n");
+	printf("\t\t   err_type_info, err_struct_info: PAL parameters.\n");
+	printf("\t\t   err_data_buffer: PAL parameter. Optional. If not present,\n");
+	printf("\t\t                    it's constructed by tool automatically. Be\n");
+	printf("\t\t                    careful to provide err_data_buffer and make\n");
+	printf("\t\t                    sure it's working with the environment.\n");
+	printf("\t    Note:no space between error parameters.\n");
+	printf("\t    default: Take error parameters from err.conf instead of command line.\n");
+	printf("\t-v: verbose. default: off\n");
+	printf("\t-h: help\n\n");
+	printf("The tool will take err.conf file as ");
+	printf("input to inject single or multiple errors ");
+	printf("on one or multiple cpus in parallel.\n");
+  }
+
+  int main(int argc, char **argv)
+  {
+	char c;
+	int do_err_inj=0;
+	int do_query_all=0;
+	int count;
+	u32 m;
+
+	/* Default one lock for all cpu's */
+	one_lock=1;
+	while ((c = getopt(argc, argv, "m:iqvhle:")) != EOF)
+		switch (c) {
+			case 'm':	/* Procedure mode. 1: phys 2: virt */
+				count=sscanf(optarg, "%x", &m);
+				if (count!=1 || (m!=1 && m!=2)) {
+					printf("Wrong mode number.\n");
+					help();
+					return -1;
+				}
+				mode=m;
+				break;
+			case 'i':	/* Inject errors */
+				do_err_inj=1;
+				break;
+			case 'q':	/* Query */
+				do_query_all=1;
+				break;
+			case 'v':	/* Verbose */
+				verbose=1;
+				break;
+			case 'l':	/* One lock per cpu */
+				one_lock=0;
+				break;
+			case 'e':	/* error arguments */
+				/* Take parameters:
+				 * #cpu, loop, interval, err_type_info, err_struct_info[, err_data_buffer]
+				 * err_data_buffer is optional. Recommend not to specify
+				 * err_data_buffer. Better to use tool to generate it.
+				 */
+				count=sscanf(optarg,
+					"%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
+					&line_para.cpu,
+					&line_para.loop,
+					&line_para.interval,
+					&line_para.err_type_info,
+					&line_para.err_struct_info,
+					&line_para.err_data_buffer[0],
+					&line_para.err_data_buffer[1],
+					&line_para.err_data_buffer[2]);
+				if (count!=PARA_FIELD_NUM+3) {
+				    line_para.err_data_buffer[0]=-1,
+				    line_para.err_data_buffer[1]=-1,
+				    line_para.err_data_buffer[2]=-1;
+				    count=sscanf(optarg, "%lx, %lx, %lx, %lx, %lx\n",
+						&line_para.cpu,
+						&line_para.loop,
+						&line_para.interval,
+						&line_para.err_type_info,
+						&line_para.err_struct_info);
+				    if (count!=PARA_FIELD_NUM) {
+					printf("Wrong error arguments.\n");
+					help();
+					return -1;
+				    }
+				}
+				para=1;
+				break;
+			continue;
+				break;
+			case 'h':
+				help();
+				return 0;
+			default:
+				break;
+		}
+
+	if (do_query_all)
+		query_all_capabilities();
+	if (do_err_inj)
+		err_inj();
+
+	if (!do_query_all &&  !do_err_inj)
+		help();
+
+	return 0;
+  }
diff --git a/Documentation/ia64/err_inject.txt b/Documentation/ia64/err_inject.txt
deleted file mode 100644
index 9f651c181429..000000000000
--- a/Documentation/ia64/err_inject.txt
+++ /dev/null
@@ -1,1068 +0,0 @@
-
-IPF Machine Check (MC) error inject tool
-========================================
-
-IPF Machine Check (MC) error inject tool is used to inject MC
-errors from Linux. The tool is a test bed for IPF MC work flow including
-hardware correctable error handling, OS recoverable error handling, MC
-event logging, etc.
-
-The tool includes two parts: a kernel driver and a user application
-sample. The driver provides interface to PAL to inject error
-and query error injection capabilities. The driver code is in
-arch/ia64/kernel/err_inject.c. The application sample (shown below)
-provides a combination of various errors and calls the driver's interface
-(sysfs interface) to inject errors or query error injection capabilities.
-
-The tool can be used to test Intel IPF machine MC handling capabilities.
-It's especially useful for people who can not access hardware MC injection
-tool to inject error. It's also very useful to integrate with other
-software test suits to do stressful testing on IPF.
-
-Below is a sample application as part of the whole tool. The sample
-can be used as a working test tool. Or it can be expanded to include
-more features. It also can be a integrated into a library or other user
-application to have more thorough test.
-
-The sample application takes err.conf as error configuration input. GCC
-compiles the code. After you install err_inject driver, you can run
-this sample application to inject errors.
-
-Errata: Itanium 2 Processors Specification Update lists some errata against
-the pal_mc_error_inject PAL procedure. The following err.conf has been tested
-on latest Montecito PAL.
-
-err.conf:
-
-#This is configuration file for err_inject_tool.
-#The format of the each line is:
-#cpu, loop, interval, err_type_info, err_struct_info, err_data_buffer
-#where
-#	cpu: logical cpu number the error will be inject in.
-#	loop: times the error will be injected.
-#	interval: In second. every so often one error is injected.
-#	err_type_info, err_struct_info: PAL parameters.
-#
-#Note: All values are hex w/o or w/ 0x prefix.
-
-
-#On cpu2, inject only total 0x10 errors, interval 5 seconds
-#corrected, data cache, hier-2, physical addr(assigned by tool code).
-#working on Montecito latest PAL.
-2, 10, 5, 4101, 95
-
-#On cpu4, inject and consume total 0x10 errors, interval 5 seconds
-#corrected, data cache, hier-2, physical addr(assigned by tool code).
-#working on Montecito latest PAL.
-4, 10, 5, 4109, 95
-
-#On cpu15, inject and consume total 0x10 errors, interval 5 seconds
-#recoverable, DTR0, hier-2.
-#working on Montecito latest PAL.
-0xf, 0x10, 5, 4249, 15
-
-The sample application source code:
-
-err_injection_tool.c:
-
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Copyright (C) 2006 Intel Co
- *	Fenghua Yu <fenghua.yu@intel.com>
- *
- */
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <sched.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <string.h>
-#include <errno.h>
-#include <time.h>
-#include <sys/ipc.h>
-#include <sys/sem.h>
-#include <sys/wait.h>
-#include <sys/mman.h>
-#include <sys/shm.h>
-
-#define MAX_FN_SIZE 		256
-#define MAX_BUF_SIZE 		256
-#define DATA_BUF_SIZE 		256
-#define NR_CPUS 		512
-#define MAX_TASK_NUM		2048
-#define MIN_INTERVAL		5	// seconds
-#define	ERR_DATA_BUFFER_SIZE 	3	// Three 8-byte.
-#define PARA_FIELD_NUM		5
-#define MASK_SIZE		(NR_CPUS/64)
-#define PATH_FORMAT "/sys/devices/system/cpu/cpu%d/err_inject/"
-
-int sched_setaffinity(pid_t pid, unsigned int len, unsigned long *mask);
-
-int verbose;
-#define vbprintf if (verbose) printf
-
-int log_info(int cpu, const char *fmt, ...)
-{
-	FILE *log;
-	char fn[MAX_FN_SIZE];
-	char buf[MAX_BUF_SIZE];
-	va_list args;
-
-	sprintf(fn, "%d.log", cpu);
-	log=fopen(fn, "a+");
-	if (log==NULL) {
-		perror("Error open:");
-		return -1;
-	}
-
-	va_start(args, fmt);
-	vprintf(fmt, args);
-	memset(buf, 0, MAX_BUF_SIZE);
-	vsprintf(buf, fmt, args);
-	va_end(args);
-
-	fwrite(buf, sizeof(buf), 1, log);
-	fclose(log);
-
-	return 0;
-}
-
-typedef unsigned long u64;
-typedef unsigned int  u32;
-
-typedef union err_type_info_u {
-	struct {
-		u64	mode		: 3,	/* 0-2 */
-			err_inj		: 3,	/* 3-5 */
-			err_sev		: 2,	/* 6-7 */
-			err_struct	: 5,	/* 8-12 */
-			struct_hier	: 3,	/* 13-15 */
-			reserved	: 48;	/* 16-63 */
-	} err_type_info_u;
-	u64	err_type_info;
-} err_type_info_t;
-
-typedef union err_struct_info_u {
-	struct {
-		u64	siv		: 1,	/* 0	 */
-			c_t		: 2,	/* 1-2	 */
-			cl_p		: 3,	/* 3-5	 */
-			cl_id		: 3,	/* 6-8	 */
-			cl_dp		: 1,	/* 9	 */
-			reserved1	: 22,	/* 10-31 */
-			tiv		: 1,	/* 32	 */
-			trigger		: 4,	/* 33-36 */
-			trigger_pl 	: 3,	/* 37-39 */
-			reserved2 	: 24;	/* 40-63 */
-	} err_struct_info_cache;
-	struct {
-		u64	siv		: 1,	/* 0	 */
-			tt		: 2,	/* 1-2	 */
-			tc_tr		: 2,	/* 3-4	 */
-			tr_slot		: 8,	/* 5-12	 */
-			reserved1	: 19,	/* 13-31 */
-			tiv		: 1,	/* 32	 */
-			trigger		: 4,	/* 33-36 */
-			trigger_pl 	: 3,	/* 37-39 */
-			reserved2 	: 24;	/* 40-63 */
-	} err_struct_info_tlb;
-	struct {
-		u64	siv		: 1,	/* 0	 */
-			regfile_id	: 4,	/* 1-4	 */
-			reg_num		: 7,	/* 5-11	 */
-			reserved1	: 20,	/* 12-31 */
-			tiv		: 1,	/* 32	 */
-			trigger		: 4,	/* 33-36 */
-			trigger_pl 	: 3,	/* 37-39 */
-			reserved2 	: 24;	/* 40-63 */
-	} err_struct_info_register;
-	struct {
-		u64	reserved;
-	} err_struct_info_bus_processor_interconnect;
-	u64	err_struct_info;
-} err_struct_info_t;
-
-typedef union err_data_buffer_u {
-	struct {
-		u64	trigger_addr;		/* 0-63		*/
-		u64	inj_addr;		/* 64-127 	*/
-		u64	way		: 5,	/* 128-132	*/
-			index		: 20,	/* 133-152	*/
-					: 39;	/* 153-191	*/
-	} err_data_buffer_cache;
-	struct {
-		u64	trigger_addr;		/* 0-63		*/
-		u64	inj_addr;		/* 64-127 	*/
-		u64	way		: 5,	/* 128-132	*/
-			index		: 20,	/* 133-152	*/
-			reserved	: 39;	/* 153-191	*/
-	} err_data_buffer_tlb;
-	struct {
-		u64	trigger_addr;		/* 0-63		*/
-	} err_data_buffer_register;
-	struct {
-		u64	reserved;		/* 0-63		*/
-	} err_data_buffer_bus_processor_interconnect;
-	u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
-} err_data_buffer_t;
-
-typedef union capabilities_u {
-	struct {
-		u64	i		: 1,
-			d		: 1,
-			rv		: 1,
-			tag		: 1,
-			data		: 1,
-			mesi		: 1,
-			dp		: 1,
-			reserved1	: 3,
-			pa		: 1,
-			va		: 1,
-			wi		: 1,
-			reserved2	: 20,
-			trigger		: 1,
-			trigger_pl	: 1,
-			reserved3	: 30;
-	} capabilities_cache;
-	struct {
-		u64	d		: 1,
-			i		: 1,
-			rv		: 1,
-			tc		: 1,
-			tr		: 1,
-			reserved1	: 27,
-			trigger		: 1,
-			trigger_pl	: 1,
-			reserved2	: 30;
-	} capabilities_tlb;
-	struct {
-		u64	gr_b0		: 1,
-			gr_b1		: 1,
-			fr		: 1,
-			br		: 1,
-			pr		: 1,
-			ar		: 1,
-			cr		: 1,
-			rr		: 1,
-			pkr		: 1,
-			dbr		: 1,
-			ibr		: 1,
-			pmc		: 1,
-			pmd		: 1,
-			reserved1	: 3,
-			regnum		: 1,
-			reserved2	: 15,
-			trigger		: 1,
-			trigger_pl	: 1,
-			reserved3	: 30;
-	} capabilities_register;
-	struct {
-		u64	reserved;
-	} capabilities_bus_processor_interconnect;
-} capabilities_t;
-
-typedef struct resources_s {
-	u64	ibr0		: 1,
-		ibr2		: 1,
-		ibr4		: 1,
-		ibr6		: 1,
-		dbr0		: 1,
-		dbr2		: 1,
-		dbr4		: 1,
-		dbr6		: 1,
-		reserved	: 48;
-} resources_t;
-
-
-long get_page_size(void)
-{
-	long page_size=sysconf(_SC_PAGESIZE);
-	return page_size;
-}
-
-#define PAGE_SIZE (get_page_size()==-1?0x4000:get_page_size())
-#define SHM_SIZE (2*PAGE_SIZE*NR_CPUS)
-#define SHM_VA 0x2000000100000000
-
-int shmid;
-void *shmaddr;
-
-int create_shm(void)
-{
-	key_t key;
-	char fn[MAX_FN_SIZE];
-
-	/* cpu0 is always existing */
-	sprintf(fn, PATH_FORMAT, 0);
-	if ((key = ftok(fn, 's')) == -1) {
-		perror("ftok");
-		return -1;
-	}
-
-	shmid = shmget(key, SHM_SIZE, 0644 | IPC_CREAT);
-	if (shmid == -1) {
-		if (errno==EEXIST) {
-			shmid = shmget(key, SHM_SIZE, 0);
-			if (shmid == -1) {
-				perror("shmget");
-				return -1;
-			}
-		}
-		else {
-			perror("shmget");
-			return -1;
-		}
-	}
-	vbprintf("shmid=%d", shmid);
-
-	/* connect to the segment: */
-	shmaddr = shmat(shmid, (void *)SHM_VA, 0);
-	if (shmaddr == (void*)-1) {
-		perror("shmat");
-		return -1;
-	}
-
-	memset(shmaddr, 0, SHM_SIZE);
-	mlock(shmaddr, SHM_SIZE);
-
-	return 0;
-}
-
-int free_shm()
-{
-	munlock(shmaddr, SHM_SIZE);
-        shmdt(shmaddr);
-	semctl(shmid, 0, IPC_RMID);
-
-	return 0;
-}
-
-#ifdef _SEM_SEMUN_UNDEFINED
-union semun
-{
-	int val;
-	struct semid_ds *buf;
-	unsigned short int *array;
-	struct seminfo *__buf;
-};
-#endif
-
-u32 mode=1; /* 1: physical mode; 2: virtual mode. */
-int one_lock=1;
-key_t key[NR_CPUS];
-int semid[NR_CPUS];
-
-int create_sem(int cpu)
-{
-	union semun arg;
-	char fn[MAX_FN_SIZE];
-	int sid;
-
-	sprintf(fn, PATH_FORMAT, cpu);
-	sprintf(fn, "%s/%s", fn, "err_type_info");
-	if ((key[cpu] = ftok(fn, 'e')) == -1) {
-		perror("ftok");
-		return -1;
-	}
-
-	if (semid[cpu]!=0)
-		return 0;
-
-	/* clear old semaphore */
-	if ((sid = semget(key[cpu], 1, 0)) != -1)
-		semctl(sid, 0, IPC_RMID);
-
-	/* get one semaphore */
-	if ((semid[cpu] = semget(key[cpu], 1, IPC_CREAT | IPC_EXCL)) == -1) {
-		perror("semget");
-		printf("Please remove semaphore with key=0x%lx, then run the tool.\n",
-			(u64)key[cpu]);
-		return -1;
-	}
-
-	vbprintf("semid[%d]=0x%lx, key[%d]=%lx\n",cpu,(u64)semid[cpu],cpu,
-		(u64)key[cpu]);
-	/* initialize the semaphore to 1: */
-	arg.val = 1;
-	if (semctl(semid[cpu], 0, SETVAL, arg) == -1) {
-		perror("semctl");
-		return -1;
-	}
-
-	return 0;
-}
-
-static int lock(int cpu)
-{
-	struct sembuf lock;
-
-	lock.sem_num = cpu;
-	lock.sem_op = 1;
-	semop(semid[cpu], &lock, 1);
-
-        return 0;
-}
-
-static int unlock(int cpu)
-{
-	struct sembuf unlock;
-
-	unlock.sem_num = cpu;
-	unlock.sem_op = -1;
-	semop(semid[cpu], &unlock, 1);
-
-        return 0;
-}
-
-void free_sem(int cpu)
-{
-	semctl(semid[cpu], 0, IPC_RMID);
-}
-
-int wr_multi(char *fn, unsigned long *data, int size)
-{
-	int fd;
-	char buf[MAX_BUF_SIZE];
-	int ret;
-
-	if (size==1)
-		sprintf(buf, "%lx", *data);
-	else if (size==3)
-		sprintf(buf, "%lx,%lx,%lx", data[0], data[1], data[2]);
-	else {
-		fprintf(stderr,"write to file with wrong size!\n");
-		return -1;
-	}
-
-	fd=open(fn, O_RDWR);
-	if (!fd) {
-		perror("Error:");
-		return -1;
-	}
-	ret=write(fd, buf, sizeof(buf));
-	close(fd);
-	return ret;
-}
-
-int wr(char *fn, unsigned long data)
-{
-	return wr_multi(fn, &data, 1);
-}
-
-int rd(char *fn, unsigned long *data)
-{
-	int fd;
-	char buf[MAX_BUF_SIZE];
-
-	fd=open(fn, O_RDONLY);
-	if (fd<0) {
-		perror("Error:");
-		return -1;
-	}
-	read(fd, buf, MAX_BUF_SIZE);
-	*data=strtoul(buf, NULL, 16);
-	close(fd);
-	return 0;
-}
-
-int rd_status(char *path, int *status)
-{
-	char fn[MAX_FN_SIZE];
-	sprintf(fn, "%s/status", path);
-	if (rd(fn, (u64*)status)<0) {
-		perror("status reading error.\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-int rd_capabilities(char *path, u64 *capabilities)
-{
-	char fn[MAX_FN_SIZE];
-	sprintf(fn, "%s/capabilities", path);
-	if (rd(fn, capabilities)<0) {
-		perror("capabilities reading error.\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-int rd_all(char *path)
-{
-	unsigned long err_type_info, err_struct_info, err_data_buffer;
-	int status;
-	unsigned long capabilities, resources;
-	char fn[MAX_FN_SIZE];
-
-	sprintf(fn, "%s/err_type_info", path);
-	if (rd(fn, &err_type_info)<0) {
-		perror("err_type_info reading error.\n");
-		return -1;
-	}
-	printf("err_type_info=%lx\n", err_type_info);
-
-	sprintf(fn, "%s/err_struct_info", path);
-	if (rd(fn, &err_struct_info)<0) {
-		perror("err_struct_info reading error.\n");
-		return -1;
-	}
-	printf("err_struct_info=%lx\n", err_struct_info);
-
-	sprintf(fn, "%s/err_data_buffer", path);
-	if (rd(fn, &err_data_buffer)<0) {
-		perror("err_data_buffer reading error.\n");
-		return -1;
-	}
-	printf("err_data_buffer=%lx\n", err_data_buffer);
-
-	sprintf(fn, "%s/status", path);
-	if (rd("status", (u64*)&status)<0) {
-		perror("status reading error.\n");
-		return -1;
-	}
-	printf("status=%d\n", status);
-
-	sprintf(fn, "%s/capabilities", path);
-	if (rd(fn,&capabilities)<0) {
-		perror("capabilities reading error.\n");
-		return -1;
-	}
-	printf("capabilities=%lx\n", capabilities);
-
-	sprintf(fn, "%s/resources", path);
-	if (rd(fn, &resources)<0) {
-		perror("resources reading error.\n");
-		return -1;
-	}
-	printf("resources=%lx\n", resources);
-
-	return 0;
-}
-
-int query_capabilities(char *path, err_type_info_t err_type_info,
-			u64 *capabilities)
-{
-	char fn[MAX_FN_SIZE];
-	err_struct_info_t err_struct_info;
-	err_data_buffer_t err_data_buffer;
-
-	err_struct_info.err_struct_info=0;
-	memset(err_data_buffer.err_data_buffer, -1, ERR_DATA_BUFFER_SIZE*8);
-
-	sprintf(fn, "%s/err_type_info", path);
-	wr(fn, err_type_info.err_type_info);
-	sprintf(fn, "%s/err_struct_info", path);
-	wr(fn, 0x0);
-	sprintf(fn, "%s/err_data_buffer", path);
-	wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
-
-	// Fire pal_mc_error_inject procedure.
-	sprintf(fn, "%s/call_start", path);
-	wr(fn, mode);
-
-	if (rd_capabilities(path, capabilities)<0)
-		return -1;
-
-	return 0;
-}
-
-int query_all_capabilities()
-{
-	int status;
-	err_type_info_t err_type_info;
-	int err_sev, err_struct, struct_hier;
-	int cap=0;
-	u64 capabilities;
-	char path[MAX_FN_SIZE];
-
-	err_type_info.err_type_info=0;			// Initial
-	err_type_info.err_type_info_u.mode=0;		// Query mode;
-	err_type_info.err_type_info_u.err_inj=0;
-
-	printf("All capabilities implemented in pal_mc_error_inject:\n");
-	sprintf(path, PATH_FORMAT ,0);
-	for (err_sev=0;err_sev<3;err_sev++)
-		for (err_struct=0;err_struct<5;err_struct++)
-			for (struct_hier=0;struct_hier<5;struct_hier++)
-	{
-		status=-1;
-		capabilities=0;
-		err_type_info.err_type_info_u.err_sev=err_sev;
-		err_type_info.err_type_info_u.err_struct=err_struct;
-		err_type_info.err_type_info_u.struct_hier=struct_hier;
-
-		if (query_capabilities(path, err_type_info, &capabilities)<0)
-			continue;
-
-		if (rd_status(path, &status)<0)
-			continue;
-
-		if (status==0) {
-			cap=1;
-			printf("For err_sev=%d, err_struct=%d, struct_hier=%d: ",
-				err_sev, err_struct, struct_hier);
-			printf("capabilities 0x%lx\n", capabilities);
-		}
-	}
-	if (!cap) {
-		printf("No capabilities supported.\n");
-		return 0;
-	}
-
-	return 0;
-}
-
-int err_inject(int cpu, char *path, err_type_info_t err_type_info,
-		err_struct_info_t err_struct_info,
-		err_data_buffer_t err_data_buffer)
-{
-	int status;
-	char fn[MAX_FN_SIZE];
-
-	log_info(cpu, "err_type_info=%lx, err_struct_info=%lx, ",
-		err_type_info.err_type_info,
-		err_struct_info.err_struct_info);
-	log_info(cpu,"err_data_buffer=[%lx,%lx,%lx]\n",
-		err_data_buffer.err_data_buffer[0],
-		err_data_buffer.err_data_buffer[1],
-		err_data_buffer.err_data_buffer[2]);
-	sprintf(fn, "%s/err_type_info", path);
-	wr(fn, err_type_info.err_type_info);
-	sprintf(fn, "%s/err_struct_info", path);
-	wr(fn, err_struct_info.err_struct_info);
-	sprintf(fn, "%s/err_data_buffer", path);
-	wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
-
-	// Fire pal_mc_error_inject procedure.
-	sprintf(fn, "%s/call_start", path);
-	wr(fn,mode);
-
-	if (rd_status(path, &status)<0) {
-		vbprintf("fail: read status\n");
-		return -100;
-	}
-
-	if (status!=0) {
-		log_info(cpu, "fail: status=%d\n", status);
-		return status;
-	}
-
-	return status;
-}
-
-static int construct_data_buf(char *path, err_type_info_t err_type_info,
-		err_struct_info_t err_struct_info,
-		err_data_buffer_t *err_data_buffer,
-		void *va1)
-{
-	char fn[MAX_FN_SIZE];
-	u64 virt_addr=0, phys_addr=0;
-
-	vbprintf("va1=%lx\n", (u64)va1);
-	memset(&err_data_buffer->err_data_buffer_cache, 0, ERR_DATA_BUFFER_SIZE*8);
-
-	switch (err_type_info.err_type_info_u.err_struct) {
-		case 1: // Cache
-			switch (err_struct_info.err_struct_info_cache.cl_id) {
-				case 1: //Virtual addr
-					err_data_buffer->err_data_buffer_cache.inj_addr=(u64)va1;
-					break;
-				case 2: //Phys addr
-					sprintf(fn, "%s/virtual_to_phys", path);
-					virt_addr=(u64)va1;
-					if (wr(fn,virt_addr)<0)
-						return -1;
-					rd(fn, &phys_addr);
-					err_data_buffer->err_data_buffer_cache.inj_addr=phys_addr;
-					break;
-				default:
-					printf("Not supported cl_id\n");
-					break;
-			}
-			break;
-		case 2: //  TLB
-			break;
-		case 3: //  Register file
-			break;
-		case 4: //  Bus/system interconnect
-		default:
-			printf("Not supported err_struct\n");
-			break;
-	}
-
-	return 0;
-}
-
-typedef struct {
-	u64 cpu;
-	u64 loop;
-	u64 interval;
-	u64 err_type_info;
-	u64 err_struct_info;
-	u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
-} parameters_t;
-
-parameters_t line_para;
-int para;
-
-static int empty_data_buffer(u64 *err_data_buffer)
-{
-	int empty=1;
-	int i;
-
-	for (i=0;i<ERR_DATA_BUFFER_SIZE; i++)
-	   if (err_data_buffer[i]!=-1)
-		empty=0;
-
-	return empty;
-}
-
-int err_inj()
-{
-	err_type_info_t err_type_info;
-	err_struct_info_t err_struct_info;
-	err_data_buffer_t err_data_buffer;
-	int count;
-	FILE *fp;
-	unsigned long cpu, loop, interval, err_type_info_conf, err_struct_info_conf;
-	u64 err_data_buffer_conf[ERR_DATA_BUFFER_SIZE];
-	int num;
-	int i;
-	char path[MAX_FN_SIZE];
-	parameters_t parameters[MAX_TASK_NUM]={};
-	pid_t child_pid[MAX_TASK_NUM];
-	time_t current_time;
-	int status;
-
-	if (!para) {
-	    fp=fopen("err.conf", "r");
-	    if (fp==NULL) {
-		perror("Error open err.conf");
-		return -1;
-	    }
-
-	    num=0;
-	    while (!feof(fp)) {
-		char buf[256];
-		memset(buf,0,256);
-		fgets(buf, 256, fp);
-		count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
-				&cpu, &loop, &interval,&err_type_info_conf,
-				&err_struct_info_conf,
-				&err_data_buffer_conf[0],
-				&err_data_buffer_conf[1],
-				&err_data_buffer_conf[2]);
-		if (count!=PARA_FIELD_NUM+3) {
-			err_data_buffer_conf[0]=-1;
-			err_data_buffer_conf[1]=-1;
-			err_data_buffer_conf[2]=-1;
-			count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx\n",
-				&cpu, &loop, &interval,&err_type_info_conf,
-				&err_struct_info_conf);
-			if (count!=PARA_FIELD_NUM)
-				continue;
-		}
-
-		parameters[num].cpu=cpu;
-		parameters[num].loop=loop;
-		parameters[num].interval= interval>MIN_INTERVAL
-					  ?interval:MIN_INTERVAL;
-		parameters[num].err_type_info=err_type_info_conf;
-		parameters[num].err_struct_info=err_struct_info_conf;
-		memcpy(parameters[num++].err_data_buffer,
-			err_data_buffer_conf,ERR_DATA_BUFFER_SIZE*8) ;
-
-		if (num>=MAX_TASK_NUM)
-			break;
-	    }
-	}
-	else {
-		parameters[0].cpu=line_para.cpu;
-		parameters[0].loop=line_para.loop;
-		parameters[0].interval= line_para.interval>MIN_INTERVAL
-					  ?line_para.interval:MIN_INTERVAL;
-		parameters[0].err_type_info=line_para.err_type_info;
-		parameters[0].err_struct_info=line_para.err_struct_info;
-		memcpy(parameters[0].err_data_buffer,
-			line_para.err_data_buffer,ERR_DATA_BUFFER_SIZE*8) ;
-
-		num=1;
-	}
-
-	/* Create semaphore: If one_lock, one semaphore for all processors.
-	   Otherwise, one semaphore for each processor. */
-	if (one_lock) {
-		if (create_sem(0)) {
-			printf("Can not create semaphore...exit\n");
-			free_sem(0);
-			return -1;
-		}
-	}
-	else {
-		for (i=0;i<num;i++) {
-		   if (create_sem(parameters[i].cpu)) {
-			printf("Can not create semaphore for cpu%d...exit\n",i);
-			free_sem(parameters[num].cpu);
-			return -1;
-		   }
-		}
-	}
-
-	/* Create a shm segment which will be used to inject/consume errors on.*/
-	if (create_shm()==-1) {
-		printf("Error to create shm...exit\n");
-		return -1;
-	}
-
-	for (i=0;i<num;i++) {
-		pid_t pid;
-
-		current_time=time(NULL);
-		log_info(parameters[i].cpu, "\nBegine at %s", ctime(&current_time));
-		log_info(parameters[i].cpu, "Configurations:\n");
-		log_info(parameters[i].cpu,"On cpu%ld: loop=%lx, interval=%lx(s)",
-			parameters[i].cpu,
-			parameters[i].loop,
-			parameters[i].interval);
-		log_info(parameters[i].cpu," err_type_info=%lx,err_struct_info=%lx\n",
-			parameters[i].err_type_info,
-			parameters[i].err_struct_info);
-
-		sprintf(path, PATH_FORMAT, (int)parameters[i].cpu);
-		err_type_info.err_type_info=parameters[i].err_type_info;
-		err_struct_info.err_struct_info=parameters[i].err_struct_info;
-		memcpy(err_data_buffer.err_data_buffer,
-			parameters[i].err_data_buffer,
-			ERR_DATA_BUFFER_SIZE*8);
-
-		pid=fork();
-		if (pid==0) {
-			unsigned long mask[MASK_SIZE];
-			int j, k;
-
-			void *va1, *va2;
-
-			/* Allocate two memory areas va1 and va2 in shm */
-			va1=shmaddr+parameters[i].cpu*PAGE_SIZE;
-			va2=shmaddr+parameters[i].cpu*PAGE_SIZE+PAGE_SIZE;
-
-			vbprintf("va1=%lx, va2=%lx\n", (u64)va1, (u64)va2);
-			memset(va1, 0x1, PAGE_SIZE);
-			memset(va2, 0x2, PAGE_SIZE);
-
-			if (empty_data_buffer(err_data_buffer.err_data_buffer))
-				/* If not specified yet, construct data buffer
-				 * with va1
-				 */
-				construct_data_buf(path, err_type_info,
-					err_struct_info, &err_data_buffer,va1);
-
-			for (j=0;j<MASK_SIZE;j++)
-				mask[j]=0;
-
-			cpu=parameters[i].cpu;
-			k = cpu%64;
-			j = cpu/64;
-			mask[j] = 1UL << k;
-
-			if (sched_setaffinity(0, MASK_SIZE*8, mask)==-1) {
-				perror("Error sched_setaffinity:");
-				return -1;
-			}
-
-			for (j=0; j<parameters[i].loop; j++) {
-				log_info(parameters[i].cpu,"Injection ");
-				log_info(parameters[i].cpu,"on cpu%ld: #%d/%ld ",
-
-					parameters[i].cpu,j+1, parameters[i].loop);
-
-				/* Hold the lock */
-				if (one_lock)
-					lock(0);
-				else
-				/* Hold lock on this cpu */
-					lock(parameters[i].cpu);
-
-				if ((status=err_inject(parameters[i].cpu,
-					   path, err_type_info,
-					   err_struct_info, err_data_buffer))
-					   ==0) {
-					/* consume the error for "inject only"*/
-					memcpy(va2, va1, PAGE_SIZE);
-					memcpy(va1, va2, PAGE_SIZE);
-					log_info(parameters[i].cpu,
-						"successful\n");
-				}
-				else {
-					log_info(parameters[i].cpu,"fail:");
-					log_info(parameters[i].cpu,
-						"status=%d\n", status);
-					unlock(parameters[i].cpu);
-					break;
-				}
-				if (one_lock)
-				/* Release the lock */
-					unlock(0);
-				/* Release lock on this cpu */
-				else
-					unlock(parameters[i].cpu);
-
-				if (j < parameters[i].loop-1)
-					sleep(parameters[i].interval);
-			}
-			current_time=time(NULL);
-			log_info(parameters[i].cpu, "Done at %s", ctime(&current_time));
-			return 0;
-		}
-		else if (pid<0) {
-			perror("Error fork:");
-			continue;
-		}
-		child_pid[i]=pid;
-	}
-	for (i=0;i<num;i++)
-		waitpid(child_pid[i], NULL, 0);
-
-	if (one_lock)
-		free_sem(0);
-	else
-		for (i=0;i<num;i++)
-			free_sem(parameters[i].cpu);
-
-	printf("All done.\n");
-
-	return 0;
-}
-
-void help()
-{
-	printf("err_inject_tool:\n");
-	printf("\t-q: query all capabilities. default: off\n");
-	printf("\t-m: procedure mode. 1: physical 2: virtual. default: 1\n");
-	printf("\t-i: inject errors. default: off\n");
-	printf("\t-l: one lock per cpu. default: one lock for all\n");
-	printf("\t-e: error parameters:\n");
-	printf("\t\tcpu,loop,interval,err_type_info,err_struct_info[,err_data_buffer[0],err_data_buffer[1],err_data_buffer[2]]\n");
-	printf("\t\t   cpu: logical cpu number the error will be inject in.\n");
-	printf("\t\t   loop: times the error will be injected.\n");
-	printf("\t\t   interval: In second. every so often one error is injected.\n");
-	printf("\t\t   err_type_info, err_struct_info: PAL parameters.\n");
-	printf("\t\t   err_data_buffer: PAL parameter. Optional. If not present,\n");
-	printf("\t\t                    it's constructed by tool automatically. Be\n");
-	printf("\t\t                    careful to provide err_data_buffer and make\n");
-	printf("\t\t                    sure it's working with the environment.\n");
-	printf("\t    Note:no space between error parameters.\n");
-	printf("\t    default: Take error parameters from err.conf instead of command line.\n");
-	printf("\t-v: verbose. default: off\n");
-	printf("\t-h: help\n\n");
-	printf("The tool will take err.conf file as ");
-	printf("input to inject single or multiple errors ");
-	printf("on one or multiple cpus in parallel.\n");
-}
-
-int main(int argc, char **argv)
-{
-	char c;
-	int do_err_inj=0;
-	int do_query_all=0;
-	int count;
-	u32 m;
-
-	/* Default one lock for all cpu's */
-	one_lock=1;
-	while ((c = getopt(argc, argv, "m:iqvhle:")) != EOF)
-		switch (c) {
-			case 'm':	/* Procedure mode. 1: phys 2: virt */
-				count=sscanf(optarg, "%x", &m);
-				if (count!=1 || (m!=1 && m!=2)) {
-					printf("Wrong mode number.\n");
-					help();
-					return -1;
-				}
-				mode=m;
-				break;
-			case 'i':	/* Inject errors */
-				do_err_inj=1;
-				break;
-			case 'q':	/* Query */
-				do_query_all=1;
-				break;
-			case 'v':	/* Verbose */
-				verbose=1;
-				break;
-			case 'l':	/* One lock per cpu */
-				one_lock=0;
-				break;
-			case 'e':	/* error arguments */
-				/* Take parameters:
-				 * #cpu, loop, interval, err_type_info, err_struct_info[, err_data_buffer]
-				 * err_data_buffer is optional. Recommend not to specify
-				 * err_data_buffer. Better to use tool to generate it.
-				 */
-				count=sscanf(optarg,
-					"%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
-					&line_para.cpu,
-					&line_para.loop,
-					&line_para.interval,
-					&line_para.err_type_info,
-					&line_para.err_struct_info,
-					&line_para.err_data_buffer[0],
-					&line_para.err_data_buffer[1],
-					&line_para.err_data_buffer[2]);
-				if (count!=PARA_FIELD_NUM+3) {
-				    line_para.err_data_buffer[0]=-1,
-				    line_para.err_data_buffer[1]=-1,
-			 	    line_para.err_data_buffer[2]=-1;
-				    count=sscanf(optarg, "%lx, %lx, %lx, %lx, %lx\n",
-						&line_para.cpu,
-						&line_para.loop,
-						&line_para.interval,
-						&line_para.err_type_info,
-						&line_para.err_struct_info);
-				    if (count!=PARA_FIELD_NUM) {
-					printf("Wrong error arguments.\n");
-					help();
-					return -1;
-				    }
-				}
-				para=1;
-				break;
-			continue;
-				break;
-			case 'h':
-				help();
-				return 0;
-			default:
-				break;
-		}
-
-	if (do_query_all)
-		query_all_capabilities();
-	if (do_err_inj)
-		err_inj();
-
-	if (!do_query_all &&  !do_err_inj)
-		help();
-
-	return 0;
-}
-
diff --git a/Documentation/ia64/fsys.rst b/Documentation/ia64/fsys.rst
new file mode 100644
index 000000000000..a702d2cc94b6
--- /dev/null
+++ b/Documentation/ia64/fsys.rst
@@ -0,0 +1,303 @@
+===================================
+Light-weight System Calls for IA-64
+===================================
+
+		        Started: 13-Jan-2003
+
+		    Last update: 27-Sep-2003
+
+	              David Mosberger-Tang
+		      <davidm@hpl.hp.com>
+
+Using the "epc" instruction effectively introduces a new mode of
+execution to the ia64 linux kernel.  We call this mode the
+"fsys-mode".  To recap, the normal states of execution are:
+
+  - kernel mode:
+	Both the register stack and the memory stack have been
+	switched over to kernel memory.  The user-level state is saved
+	in a pt-regs structure at the top of the kernel memory stack.
+
+  - user mode:
+	Both the register stack and the kernel stack are in
+	user memory.  The user-level state is contained in the
+	CPU registers.
+
+  - bank 0 interruption-handling mode:
+	This is the non-interruptible state which all
+	interruption-handlers start execution in.  The user-level
+	state remains in the CPU registers and some kernel state may
+	be stored in bank 0 of registers r16-r31.
+
+In contrast, fsys-mode has the following special properties:
+
+  - execution is at privilege level 0 (most-privileged)
+
+  - CPU registers may contain a mixture of user-level and kernel-level
+    state (it is the responsibility of the kernel to ensure that no
+    security-sensitive kernel-level state is leaked back to
+    user-level)
+
+  - execution is interruptible and preemptible (an fsys-mode handler
+    can disable interrupts and avoid all other interruption-sources
+    to avoid preemption)
+
+  - neither the memory-stack nor the register-stack can be trusted while
+    in fsys-mode (they point to the user-level stacks, which may
+    be invalid, or completely bogus addresses)
+
+In summary, fsys-mode is much more similar to running in user-mode
+than it is to running in kernel-mode.  Of course, given that the
+privilege level is at level 0, this means that fsys-mode requires some
+care (see below).
+
+
+How to tell fsys-mode
+=====================
+
+Linux operates in fsys-mode when (a) the privilege level is 0 (most
+privileged) and (b) the stacks have NOT been switched to kernel memory
+yet.  For convenience, the header file <asm-ia64/ptrace.h> provides
+three macros::
+
+	user_mode(regs)
+	user_stack(task,regs)
+	fsys_mode(task,regs)
+
+The "regs" argument is a pointer to a pt_regs structure.  The "task"
+argument is a pointer to the task structure to which the "regs"
+pointer belongs to.  user_mode() returns TRUE if the CPU state pointed
+to by "regs" was executing in user mode (privilege level 3).
+user_stack() returns TRUE if the state pointed to by "regs" was
+executing on the user-level stack(s).  Finally, fsys_mode() returns
+TRUE if the CPU state pointed to by "regs" was executing in fsys-mode.
+The fsys_mode() macro is equivalent to the expression::
+
+	!user_mode(regs) && user_stack(task,regs)
+
+How to write an fsyscall handler
+================================
+
+The file arch/ia64/kernel/fsys.S contains a table of fsyscall-handlers
+(fsyscall_table).  This table contains one entry for each system call.
+By default, a system call is handled by fsys_fallback_syscall().  This
+routine takes care of entering (full) kernel mode and calling the
+normal Linux system call handler.  For performance-critical system
+calls, it is possible to write a hand-tuned fsyscall_handler.  For
+example, fsys.S contains fsys_getpid(), which is a hand-tuned version
+of the getpid() system call.
+
+The entry and exit-state of an fsyscall handler is as follows:
+
+Machine state on entry to fsyscall handler
+------------------------------------------
+
+  ========= ===============================================================
+  r10	    0
+  r11	    saved ar.pfs (a user-level value)
+  r15	    system call number
+  r16	    "current" task pointer (in normal kernel-mode, this is in r13)
+  r32-r39   system call arguments
+  b6	    return address (a user-level value)
+  ar.pfs    previous frame-state (a user-level value)
+  PSR.be    cleared to zero (i.e., little-endian byte order is in effect)
+  -         all other registers may contain values passed in from user-mode
+  ========= ===============================================================
+
+Required machine state on exit to fsyscall handler
+--------------------------------------------------
+
+  ========= ===========================================================
+  r11	    saved ar.pfs (as passed into the fsyscall handler)
+  r15	    system call number (as passed into the fsyscall handler)
+  r32-r39   system call arguments (as passed into the fsyscall handler)
+  b6	    return address (as passed into the fsyscall handler)
+  ar.pfs    previous frame-state (as passed into the fsyscall handler)
+  ========= ===========================================================
+
+Fsyscall handlers can execute with very little overhead, but with that
+speed comes a set of restrictions:
+
+ * Fsyscall-handlers MUST check for any pending work in the flags
+   member of the thread-info structure and if any of the
+   TIF_ALLWORK_MASK flags are set, the handler needs to fall back on
+   doing a full system call (by calling fsys_fallback_syscall).
+
+ * Fsyscall-handlers MUST preserve incoming arguments (r32-r39, r11,
+   r15, b6, and ar.pfs) because they will be needed in case of a
+   system call restart.  Of course, all "preserved" registers also
+   must be preserved, in accordance to the normal calling conventions.
+
+ * Fsyscall-handlers MUST check argument registers for containing a
+   NaT value before using them in any way that could trigger a
+   NaT-consumption fault.  If a system call argument is found to
+   contain a NaT value, an fsyscall-handler may return immediately
+   with r8=EINVAL, r10=-1.
+
+ * Fsyscall-handlers MUST NOT use the "alloc" instruction or perform
+   any other operation that would trigger mandatory RSE
+   (register-stack engine) traffic.
+
+ * Fsyscall-handlers MUST NOT write to any stacked registers because
+   it is not safe to assume that user-level called a handler with the
+   proper number of arguments.
+
+ * Fsyscall-handlers need to be careful when accessing per-CPU variables:
+   unless proper safe-guards are taken (e.g., interruptions are avoided),
+   execution may be pre-empted and resumed on another CPU at any given
+   time.
+
+ * Fsyscall-handlers must be careful not to leak sensitive kernel'
+   information back to user-level.  In particular, before returning to
+   user-level, care needs to be taken to clear any scratch registers
+   that could contain sensitive information (note that the current
+   task pointer is not considered sensitive: it's already exposed
+   through ar.k6).
+
+ * Fsyscall-handlers MUST NOT access user-memory without first
+   validating access-permission (this can be done typically via
+   probe.r.fault and/or probe.w.fault) and without guarding against
+   memory access exceptions (this can be done with the EX() macros
+   defined by asmmacro.h).
+
+The above restrictions may seem draconian, but remember that it's
+possible to trade off some of the restrictions by paying a slightly
+higher overhead.  For example, if an fsyscall-handler could benefit
+from the shadow register bank, it could temporarily disable PSR.i and
+PSR.ic, switch to bank 0 (bsw.0) and then use the shadow registers as
+needed.  In other words, following the above rules yields extremely
+fast system call execution (while fully preserving system call
+semantics), but there is also a lot of flexibility in handling more
+complicated cases.
+
+Signal handling
+===============
+
+The delivery of (asynchronous) signals must be delayed until fsys-mode
+is exited.  This is accomplished with the help of the lower-privilege
+transfer trap: arch/ia64/kernel/process.c:do_notify_resume_user()
+checks whether the interrupted task was in fsys-mode and, if so, sets
+PSR.lp and returns immediately.  When fsys-mode is exited via the
+"br.ret" instruction that lowers the privilege level, a trap will
+occur.  The trap handler clears PSR.lp again and returns immediately.
+The kernel exit path then checks for and delivers any pending signals.
+
+PSR Handling
+============
+
+The "epc" instruction doesn't change the contents of PSR at all.  This
+is in contrast to a regular interruption, which clears almost all
+bits.  Because of that, some care needs to be taken to ensure things
+work as expected.  The following discussion describes how each PSR bit
+is handled.
+
+======= =======================================================================
+PSR.be	Cleared when entering fsys-mode.  A srlz.d instruction is used
+	to ensure the CPU is in little-endian mode before the first
+	load/store instruction is executed.  PSR.be is normally NOT
+	restored upon return from an fsys-mode handler.  In other
+	words, user-level code must not rely on PSR.be being preserved
+	across a system call.
+PSR.up	Unchanged.
+PSR.ac	Unchanged.
+PSR.mfl Unchanged.  Note: fsys-mode handlers must not write-registers!
+PSR.mfh	Unchanged.  Note: fsys-mode handlers must not write-registers!
+PSR.ic	Unchanged.  Note: fsys-mode handlers can clear the bit, if needed.
+PSR.i	Unchanged.  Note: fsys-mode handlers can clear the bit, if needed.
+PSR.pk	Unchanged.
+PSR.dt	Unchanged.
+PSR.dfl	Unchanged.  Note: fsys-mode handlers must not write-registers!
+PSR.dfh	Unchanged.  Note: fsys-mode handlers must not write-registers!
+PSR.sp	Unchanged.
+PSR.pp	Unchanged.
+PSR.di	Unchanged.
+PSR.si	Unchanged.
+PSR.db	Unchanged.  The kernel prevents user-level from setting a hardware
+	breakpoint that triggers at any privilege level other than
+	3 (user-mode).
+PSR.lp	Unchanged.
+PSR.tb	Lazy redirect.  If a taken-branch trap occurs while in
+	fsys-mode, the trap-handler modifies the saved machine state
+	such that execution resumes in the gate page at
+	syscall_via_break(), with privilege level 3.  Note: the
+	taken branch would occur on the branch invoking the
+	fsyscall-handler, at which point, by definition, a syscall
+	restart is still safe.  If the system call number is invalid,
+	the fsys-mode handler will return directly to user-level.  This
+	return will trigger a taken-branch trap, but since the trap is
+	taken _after_ restoring the privilege level, the CPU has already
+	left fsys-mode, so no special treatment is needed.
+PSR.rt	Unchanged.
+PSR.cpl	Cleared to 0.
+PSR.is	Unchanged (guaranteed to be 0 on entry to the gate page).
+PSR.mc	Unchanged.
+PSR.it	Unchanged (guaranteed to be 1).
+PSR.id	Unchanged.  Note: the ia64 linux kernel never sets this bit.
+PSR.da	Unchanged.  Note: the ia64 linux kernel never sets this bit.
+PSR.dd	Unchanged.  Note: the ia64 linux kernel never sets this bit.
+PSR.ss	Lazy redirect.  If set, "epc" will cause a Single Step Trap to
+	be taken.  The trap handler then modifies the saved machine
+	state such that execution resumes in the gate page at
+	syscall_via_break(), with privilege level 3.
+PSR.ri	Unchanged.
+PSR.ed	Unchanged.  Note: This bit could only have an effect if an fsys-mode
+	handler performed a speculative load that gets NaTted.  If so, this
+	would be the normal & expected behavior, so no special treatment is
+	needed.
+PSR.bn	Unchanged.  Note: fsys-mode handlers may clear the bit, if needed.
+	Doing so requires clearing PSR.i and PSR.ic as well.
+PSR.ia	Unchanged.  Note: the ia64 linux kernel never sets this bit.
+======= =======================================================================
+
+Using fast system calls
+=======================
+
+To use fast system calls, userspace applications need simply call
+__kernel_syscall_via_epc().  For example
+
+-- example fgettimeofday() call --
+
+-- fgettimeofday.S --
+
+::
+
+  #include <asm/asmmacro.h>
+
+  GLOBAL_ENTRY(fgettimeofday)
+  .prologue
+  .save ar.pfs, r11
+  mov r11 = ar.pfs
+  .body
+
+  mov r2 = 0xa000000000020660;;  // gate address
+			       // found by inspection of System.map for the
+			       // __kernel_syscall_via_epc() function.  See
+			       // below for how to do this for real.
+
+  mov b7 = r2
+  mov r15 = 1087		       // gettimeofday syscall
+  ;;
+  br.call.sptk.many b6 = b7
+  ;;
+
+  .restore sp
+
+  mov ar.pfs = r11
+  br.ret.sptk.many rp;;	      // return to caller
+  END(fgettimeofday)
+
+-- end fgettimeofday.S --
+
+In reality, getting the gate address is accomplished by two extra
+values passed via the ELF auxiliary vector (include/asm-ia64/elf.h)
+
+ * AT_SYSINFO : is the address of __kernel_syscall_via_epc()
+ * AT_SYSINFO_EHDR : is the address of the kernel gate ELF DSO
+
+The ELF DSO is a pre-linked library that is mapped in by the kernel at
+the gate page.  It is a proper ELF shared object so, with a dynamic
+loader that recognises the library, you should be able to make calls to
+the exported functions within it as with any other shared library.
+AT_SYSINFO points into the kernel DSO at the
+__kernel_syscall_via_epc() function for historical reasons (it was
+used before the kernel DSO) and as a convenience.
diff --git a/Documentation/ia64/fsys.txt b/Documentation/ia64/fsys.txt
deleted file mode 100644
index 59dd689d9b86..000000000000
--- a/Documentation/ia64/fsys.txt
+++ /dev/null
@@ -1,286 +0,0 @@
--*-Mode: outline-*-
-
-		Light-weight System Calls for IA-64
-		-----------------------------------
-
-		        Started: 13-Jan-2003
-		    Last update: 27-Sep-2003
-
-	              David Mosberger-Tang
-		      <davidm@hpl.hp.com>
-
-Using the "epc" instruction effectively introduces a new mode of
-execution to the ia64 linux kernel.  We call this mode the
-"fsys-mode".  To recap, the normal states of execution are:
-
-  - kernel mode:
-	Both the register stack and the memory stack have been
-	switched over to kernel memory.  The user-level state is saved
-	in a pt-regs structure at the top of the kernel memory stack.
-
-  - user mode:
-	Both the register stack and the kernel stack are in
-	user memory.  The user-level state is contained in the
-	CPU registers.
-
-  - bank 0 interruption-handling mode:
-	This is the non-interruptible state which all
-	interruption-handlers start execution in.  The user-level
-	state remains in the CPU registers and some kernel state may
-	be stored in bank 0 of registers r16-r31.
-
-In contrast, fsys-mode has the following special properties:
-
-  - execution is at privilege level 0 (most-privileged)
-
-  - CPU registers may contain a mixture of user-level and kernel-level
-    state (it is the responsibility of the kernel to ensure that no
-    security-sensitive kernel-level state is leaked back to
-    user-level)
-
-  - execution is interruptible and preemptible (an fsys-mode handler
-    can disable interrupts and avoid all other interruption-sources
-    to avoid preemption)
-
-  - neither the memory-stack nor the register-stack can be trusted while
-    in fsys-mode (they point to the user-level stacks, which may
-    be invalid, or completely bogus addresses)
-
-In summary, fsys-mode is much more similar to running in user-mode
-than it is to running in kernel-mode.  Of course, given that the
-privilege level is at level 0, this means that fsys-mode requires some
-care (see below).
-
-
-* How to tell fsys-mode
-
-Linux operates in fsys-mode when (a) the privilege level is 0 (most
-privileged) and (b) the stacks have NOT been switched to kernel memory
-yet.  For convenience, the header file <asm-ia64/ptrace.h> provides
-three macros:
-
-	user_mode(regs)
-	user_stack(task,regs)
-	fsys_mode(task,regs)
-
-The "regs" argument is a pointer to a pt_regs structure.  The "task"
-argument is a pointer to the task structure to which the "regs"
-pointer belongs to.  user_mode() returns TRUE if the CPU state pointed
-to by "regs" was executing in user mode (privilege level 3).
-user_stack() returns TRUE if the state pointed to by "regs" was
-executing on the user-level stack(s).  Finally, fsys_mode() returns
-TRUE if the CPU state pointed to by "regs" was executing in fsys-mode.
-The fsys_mode() macro is equivalent to the expression:
-
-	!user_mode(regs) && user_stack(task,regs)
-
-* How to write an fsyscall handler
-
-The file arch/ia64/kernel/fsys.S contains a table of fsyscall-handlers
-(fsyscall_table).  This table contains one entry for each system call.
-By default, a system call is handled by fsys_fallback_syscall().  This
-routine takes care of entering (full) kernel mode and calling the
-normal Linux system call handler.  For performance-critical system
-calls, it is possible to write a hand-tuned fsyscall_handler.  For
-example, fsys.S contains fsys_getpid(), which is a hand-tuned version
-of the getpid() system call.
-
-The entry and exit-state of an fsyscall handler is as follows:
-
-** Machine state on entry to fsyscall handler:
-
- - r10	  = 0
- - r11	  = saved ar.pfs (a user-level value)
- - r15	  = system call number
- - r16	  = "current" task pointer (in normal kernel-mode, this is in r13)
- - r32-r39 = system call arguments
- - b6	  = return address (a user-level value)
- - ar.pfs = previous frame-state (a user-level value)
- - PSR.be = cleared to zero (i.e., little-endian byte order is in effect)
- - all other registers may contain values passed in from user-mode
-
-** Required machine state on exit to fsyscall handler:
-
- - r11	  = saved ar.pfs (as passed into the fsyscall handler)
- - r15	  = system call number (as passed into the fsyscall handler)
- - r32-r39 = system call arguments (as passed into the fsyscall handler)
- - b6	  = return address (as passed into the fsyscall handler)
- - ar.pfs = previous frame-state (as passed into the fsyscall handler)
-
-Fsyscall handlers can execute with very little overhead, but with that
-speed comes a set of restrictions:
-
- o Fsyscall-handlers MUST check for any pending work in the flags
-   member of the thread-info structure and if any of the
-   TIF_ALLWORK_MASK flags are set, the handler needs to fall back on
-   doing a full system call (by calling fsys_fallback_syscall).
-
- o Fsyscall-handlers MUST preserve incoming arguments (r32-r39, r11,
-   r15, b6, and ar.pfs) because they will be needed in case of a
-   system call restart.  Of course, all "preserved" registers also
-   must be preserved, in accordance to the normal calling conventions.
-
- o Fsyscall-handlers MUST check argument registers for containing a
-   NaT value before using them in any way that could trigger a
-   NaT-consumption fault.  If a system call argument is found to
-   contain a NaT value, an fsyscall-handler may return immediately
-   with r8=EINVAL, r10=-1.
-
- o Fsyscall-handlers MUST NOT use the "alloc" instruction or perform
-   any other operation that would trigger mandatory RSE
-   (register-stack engine) traffic.
-
- o Fsyscall-handlers MUST NOT write to any stacked registers because
-   it is not safe to assume that user-level called a handler with the
-   proper number of arguments.
-
- o Fsyscall-handlers need to be careful when accessing per-CPU variables:
-   unless proper safe-guards are taken (e.g., interruptions are avoided),
-   execution may be pre-empted and resumed on another CPU at any given
-   time.
-
- o Fsyscall-handlers must be careful not to leak sensitive kernel'
-   information back to user-level.  In particular, before returning to
-   user-level, care needs to be taken to clear any scratch registers
-   that could contain sensitive information (note that the current
-   task pointer is not considered sensitive: it's already exposed
-   through ar.k6).
-
- o Fsyscall-handlers MUST NOT access user-memory without first
-   validating access-permission (this can be done typically via
-   probe.r.fault and/or probe.w.fault) and without guarding against
-   memory access exceptions (this can be done with the EX() macros
-   defined by asmmacro.h).
-
-The above restrictions may seem draconian, but remember that it's
-possible to trade off some of the restrictions by paying a slightly
-higher overhead.  For example, if an fsyscall-handler could benefit
-from the shadow register bank, it could temporarily disable PSR.i and
-PSR.ic, switch to bank 0 (bsw.0) and then use the shadow registers as
-needed.  In other words, following the above rules yields extremely
-fast system call execution (while fully preserving system call
-semantics), but there is also a lot of flexibility in handling more
-complicated cases.
-
-* Signal handling
-
-The delivery of (asynchronous) signals must be delayed until fsys-mode
-is exited.  This is accomplished with the help of the lower-privilege
-transfer trap: arch/ia64/kernel/process.c:do_notify_resume_user()
-checks whether the interrupted task was in fsys-mode and, if so, sets
-PSR.lp and returns immediately.  When fsys-mode is exited via the
-"br.ret" instruction that lowers the privilege level, a trap will
-occur.  The trap handler clears PSR.lp again and returns immediately.
-The kernel exit path then checks for and delivers any pending signals.
-
-* PSR Handling
-
-The "epc" instruction doesn't change the contents of PSR at all.  This
-is in contrast to a regular interruption, which clears almost all
-bits.  Because of that, some care needs to be taken to ensure things
-work as expected.  The following discussion describes how each PSR bit
-is handled.
-
-PSR.be	Cleared when entering fsys-mode.  A srlz.d instruction is used
-	to ensure the CPU is in little-endian mode before the first
-	load/store instruction is executed.  PSR.be is normally NOT
-	restored upon return from an fsys-mode handler.  In other
-	words, user-level code must not rely on PSR.be being preserved
-	across a system call.
-PSR.up	Unchanged.
-PSR.ac	Unchanged.
-PSR.mfl Unchanged.  Note: fsys-mode handlers must not write-registers!
-PSR.mfh	Unchanged.  Note: fsys-mode handlers must not write-registers!
-PSR.ic	Unchanged.  Note: fsys-mode handlers can clear the bit, if needed.
-PSR.i	Unchanged.  Note: fsys-mode handlers can clear the bit, if needed.
-PSR.pk	Unchanged.
-PSR.dt	Unchanged.
-PSR.dfl	Unchanged.  Note: fsys-mode handlers must not write-registers!
-PSR.dfh	Unchanged.  Note: fsys-mode handlers must not write-registers!
-PSR.sp	Unchanged.
-PSR.pp	Unchanged.
-PSR.di	Unchanged.
-PSR.si	Unchanged.
-PSR.db	Unchanged.  The kernel prevents user-level from setting a hardware
-	breakpoint that triggers at any privilege level other than 3 (user-mode).
-PSR.lp	Unchanged.
-PSR.tb	Lazy redirect.  If a taken-branch trap occurs while in
-	fsys-mode, the trap-handler modifies the saved machine state
-	such that execution resumes in the gate page at
-	syscall_via_break(), with privilege level 3.  Note: the
-	taken branch would occur on the branch invoking the
-	fsyscall-handler, at which point, by definition, a syscall
-	restart is still safe.  If the system call number is invalid,
-	the fsys-mode handler will return directly to user-level.  This
-	return will trigger a taken-branch trap, but since the trap is
-	taken _after_ restoring the privilege level, the CPU has already
-	left fsys-mode, so no special treatment is needed.
-PSR.rt	Unchanged.
-PSR.cpl	Cleared to 0.
-PSR.is	Unchanged (guaranteed to be 0 on entry to the gate page).
-PSR.mc	Unchanged.
-PSR.it	Unchanged (guaranteed to be 1).
-PSR.id	Unchanged.  Note: the ia64 linux kernel never sets this bit.
-PSR.da	Unchanged.  Note: the ia64 linux kernel never sets this bit.
-PSR.dd	Unchanged.  Note: the ia64 linux kernel never sets this bit.
-PSR.ss	Lazy redirect.  If set, "epc" will cause a Single Step Trap to
-	be taken.  The trap handler then modifies the saved machine
-	state such that execution resumes in the gate page at
-	syscall_via_break(), with privilege level 3.
-PSR.ri	Unchanged.
-PSR.ed	Unchanged.  Note: This bit could only have an effect if an fsys-mode
-	handler performed a speculative load that gets NaTted.  If so, this
-	would be the normal & expected behavior, so no special treatment is
-	needed.
-PSR.bn	Unchanged.  Note: fsys-mode handlers may clear the bit, if needed.
-	Doing so requires clearing PSR.i and PSR.ic as well.
-PSR.ia	Unchanged.  Note: the ia64 linux kernel never sets this bit.
-
-* Using fast system calls
-
-To use fast system calls, userspace applications need simply call
-__kernel_syscall_via_epc().  For example
-
--- example fgettimeofday() call --
--- fgettimeofday.S --
-
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(fgettimeofday)
-.prologue
-.save ar.pfs, r11
-mov r11 = ar.pfs
-.body 
-
-mov r2 = 0xa000000000020660;;  // gate address 
-			       // found by inspection of System.map for the 
-			       // __kernel_syscall_via_epc() function.  See
-			       // below for how to do this for real.
-
-mov b7 = r2
-mov r15 = 1087		       // gettimeofday syscall
-;;
-br.call.sptk.many b6 = b7
-;;
-
-.restore sp
-
-mov ar.pfs = r11
-br.ret.sptk.many rp;;	      // return to caller
-END(fgettimeofday)
-
--- end fgettimeofday.S --
-
-In reality, getting the gate address is accomplished by two extra
-values passed via the ELF auxiliary vector (include/asm-ia64/elf.h)
-
- o AT_SYSINFO : is the address of __kernel_syscall_via_epc()
- o AT_SYSINFO_EHDR : is the address of the kernel gate ELF DSO
-
-The ELF DSO is a pre-linked library that is mapped in by the kernel at
-the gate page.  It is a proper ELF shared object so, with a dynamic
-loader that recognises the library, you should be able to make calls to
-the exported functions within it as with any other shared library.
-AT_SYSINFO points into the kernel DSO at the
-__kernel_syscall_via_epc() function for historical reasons (it was
-used before the kernel DSO) and as a convenience.
diff --git a/Documentation/ia64/ia64.rst b/Documentation/ia64/ia64.rst
new file mode 100644
index 000000000000..b725019a9492
--- /dev/null
+++ b/Documentation/ia64/ia64.rst
@@ -0,0 +1,49 @@
+===========================================
+Linux kernel release for the IA-64 Platform
+===========================================
+
+   These are the release notes for Linux since version 2.4 for IA-64
+   platform.  This document provides information specific to IA-64
+   ONLY, to get additional information about the Linux kernel also
+   read the original Linux README provided with the kernel.
+
+Installing the Kernel
+=====================
+
+ - IA-64 kernel installation is the same as the other platforms, see
+   original README for details.
+
+
+Software Requirements
+=====================
+
+   Compiling and running this kernel requires an IA-64 compliant GCC
+   compiler.  And various software packages also compiled with an
+   IA-64 compliant GCC compiler.
+
+
+Configuring the Kernel
+======================
+
+   Configuration is the same, see original README for details.
+
+
+Compiling the Kernel:
+
+ - Compiling this kernel doesn't differ from other platform so read
+   the original README for details BUT make sure you have an IA-64
+   compliant GCC compiler.
+
+IA-64 Specifics
+===============
+
+ - General issues:
+
+    * Hardly any performance tuning has been done. Obvious targets
+      include the library routines (IP checksum, etc.). Less
+      obvious targets include making sure we don't flush the TLB
+      needlessly, etc.
+
+    * SMP locks cleanup/optimization
+
+    * IA32 support.  Currently experimental.  It mostly works.
diff --git a/Documentation/ia64/index.rst b/Documentation/ia64/index.rst
new file mode 100644
index 000000000000..0436e1034115
--- /dev/null
+++ b/Documentation/ia64/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==================
+IA-64 Architecture
+==================
+
+.. toctree::
+   :maxdepth: 1
+
+   ia64
+   aliasing
+   efirtc
+   err_inject
+   fsys
+   irq-redir
+   mca
+   serial
+   xen
diff --git a/Documentation/ia64/irq-redir.rst b/Documentation/ia64/irq-redir.rst
new file mode 100644
index 000000000000..39bf94484a15
--- /dev/null
+++ b/Documentation/ia64/irq-redir.rst
@@ -0,0 +1,80 @@
+==============================
+IRQ affinity on IA64 platforms
+==============================
+
+07.01.2002, Erich Focht <efocht@ess.nec.de>
+
+
+By writing to /proc/irq/IRQ#/smp_affinity the interrupt routing can be
+controlled. The behavior on IA64 platforms is slightly different from
+that described in Documentation/IRQ-affinity.txt for i386 systems.
+
+Because of the usage of SAPIC mode and physical destination mode the
+IRQ target is one particular CPU and cannot be a mask of several
+CPUs. Only the first non-zero bit is taken into account.
+
+
+Usage examples
+==============
+
+The target CPU has to be specified as a hexadecimal CPU mask. The
+first non-zero bit is the selected CPU. This format has been kept for
+compatibility reasons with i386.
+
+Set the delivery mode of interrupt 41 to fixed and route the
+interrupts to CPU #3 (logical CPU number) (2^3=0x08)::
+
+     echo "8" >/proc/irq/41/smp_affinity
+
+Set the default route for IRQ number 41 to CPU 6 in lowest priority
+delivery mode (redirectable)::
+
+     echo "r 40" >/proc/irq/41/smp_affinity
+
+The output of the command::
+
+     cat /proc/irq/IRQ#/smp_affinity
+
+gives the target CPU mask for the specified interrupt vector. If the CPU
+mask is preceded by the character "r", the interrupt is redirectable
+(i.e. lowest priority mode routing is used), otherwise its route is
+fixed.
+
+
+
+Initialization and default behavior
+===================================
+
+If the platform features IRQ redirection (info provided by SAL) all
+IO-SAPIC interrupts are initialized with CPU#0 as their default target
+and the routing is the so called "lowest priority mode" (actually
+fixed SAPIC mode with hint). The XTP chipset registers are used as hints
+for the IRQ routing. Currently in Linux XTP registers can have three
+values:
+
+	- minimal for an idle task,
+	- normal if any other task runs,
+	- maximal if the CPU is going to be switched off.
+
+The IRQ is routed to the CPU with lowest XTP register value, the
+search begins at the default CPU. Therefore most of the interrupts
+will be handled by CPU #0.
+
+If the platform doesn't feature interrupt redirection IOSAPIC fixed
+routing is used. The target CPUs are distributed in a round robin
+manner. IRQs will be routed only to the selected target CPUs. Check
+with::
+
+        cat /proc/interrupts
+
+
+
+Comments
+========
+
+On large (multi-node) systems it is recommended to route the IRQs to
+the node to which the corresponding device is connected.
+For systems like the NEC AzusA we get IRQ node-affinity for free. This
+is because usually the chipsets on each node redirect the interrupts
+only to their own CPUs (as they cannot see the XTP registers on the
+other nodes).
diff --git a/Documentation/ia64/mca.rst b/Documentation/ia64/mca.rst
new file mode 100644
index 000000000000..08270bba44a4
--- /dev/null
+++ b/Documentation/ia64/mca.rst
@@ -0,0 +1,198 @@
+=============================================================
+An ad-hoc collection of notes on IA64 MCA and INIT processing
+=============================================================
+
+Feel free to update it with notes about any area that is not clear.
+
+---
+
+MCA/INIT are completely asynchronous.  They can occur at any time, when
+the OS is in any state.  Including when one of the cpus is already
+holding a spinlock.  Trying to get any lock from MCA/INIT state is
+asking for deadlock.  Also the state of structures that are protected
+by locks is indeterminate, including linked lists.
+
+---
+
+The complicated ia64 MCA process.  All of this is mandated by Intel's
+specification for ia64 SAL, error recovery and unwind, it is not as
+if we have a choice here.
+
+* MCA occurs on one cpu, usually due to a double bit memory error.
+  This is the monarch cpu.
+
+* SAL sends an MCA rendezvous interrupt (which is a normal interrupt)
+  to all the other cpus, the slaves.
+
+* Slave cpus that receive the MCA interrupt call down into SAL, they
+  end up spinning disabled while the MCA is being serviced.
+
+* If any slave cpu was already spinning disabled when the MCA occurred
+  then it cannot service the MCA interrupt.  SAL waits ~20 seconds then
+  sends an unmaskable INIT event to the slave cpus that have not
+  already rendezvoused.
+
+* Because MCA/INIT can be delivered at any time, including when the cpu
+  is down in PAL in physical mode, the registers at the time of the
+  event are _completely_ undefined.  In particular the MCA/INIT
+  handlers cannot rely on the thread pointer, PAL physical mode can
+  (and does) modify TP.  It is allowed to do that as long as it resets
+  TP on return.  However MCA/INIT events expose us to these PAL
+  internal TP changes.  Hence curr_task().
+
+* If an MCA/INIT event occurs while the kernel was running (not user
+  space) and the kernel has called PAL then the MCA/INIT handler cannot
+  assume that the kernel stack is in a fit state to be used.  Mainly
+  because PAL may or may not maintain the stack pointer internally.
+  Because the MCA/INIT handlers cannot trust the kernel stack, they
+  have to use their own, per-cpu stacks.  The MCA/INIT stacks are
+  preformatted with just enough task state to let the relevant handlers
+  do their job.
+
+* Unlike most other architectures, the ia64 struct task is embedded in
+  the kernel stack[1].  So switching to a new kernel stack means that
+  we switch to a new task as well.  Because various bits of the kernel
+  assume that current points into the struct task, switching to a new
+  stack also means a new value for current.
+
+* Once all slaves have rendezvoused and are spinning disabled, the
+  monarch is entered.  The monarch now tries to diagnose the problem
+  and decide if it can recover or not.
+
+* Part of the monarch's job is to look at the state of all the other
+  tasks.  The only way to do that on ia64 is to call the unwinder,
+  as mandated by Intel.
+
+* The starting point for the unwind depends on whether a task is
+  running or not.  That is, whether it is on a cpu or is blocked.  The
+  monarch has to determine whether or not a task is on a cpu before it
+  knows how to start unwinding it.  The tasks that received an MCA or
+  INIT event are no longer running, they have been converted to blocked
+  tasks.  But (and its a big but), the cpus that received the MCA
+  rendezvous interrupt are still running on their normal kernel stacks!
+
+* To distinguish between these two cases, the monarch must know which
+  tasks are on a cpu and which are not.  Hence each slave cpu that
+  switches to an MCA/INIT stack, registers its new stack using
+  set_curr_task(), so the monarch can tell that the _original_ task is
+  no longer running on that cpu.  That gives us a decent chance of
+  getting a valid backtrace of the _original_ task.
+
+* MCA/INIT can be nested, to a depth of 2 on any cpu.  In the case of a
+  nested error, we want diagnostics on the MCA/INIT handler that
+  failed, not on the task that was originally running.  Again this
+  requires set_curr_task() so the MCA/INIT handlers can register their
+  own stack as running on that cpu.  Then a recursive error gets a
+  trace of the failing handler's "task".
+
+[1]
+    My (Keith Owens) original design called for ia64 to separate its
+    struct task and the kernel stacks.  Then the MCA/INIT data would be
+    chained stacks like i386 interrupt stacks.  But that required
+    radical surgery on the rest of ia64, plus extra hard wired TLB
+    entries with its associated performance degradation.  David
+    Mosberger vetoed that approach.  Which meant that separate kernel
+    stacks meant separate "tasks" for the MCA/INIT handlers.
+
+---
+
+INIT is less complicated than MCA.  Pressing the nmi button or using
+the equivalent command on the management console sends INIT to all
+cpus.  SAL picks one of the cpus as the monarch and the rest are
+slaves.  All the OS INIT handlers are entered at approximately the same
+time.  The OS monarch prints the state of all tasks and returns, after
+which the slaves return and the system resumes.
+
+At least that is what is supposed to happen.  Alas there are broken
+versions of SAL out there.  Some drive all the cpus as monarchs.  Some
+drive them all as slaves.  Some drive one cpu as monarch, wait for that
+cpu to return from the OS then drive the rest as slaves.  Some versions
+of SAL cannot even cope with returning from the OS, they spin inside
+SAL on resume.  The OS INIT code has workarounds for some of these
+broken SAL symptoms, but some simply cannot be fixed from the OS side.
+
+---
+
+The scheduler hooks used by ia64 (curr_task, set_curr_task) are layer
+violations.  Unfortunately MCA/INIT start off as massive layer
+violations (can occur at _any_ time) and they build from there.
+
+At least ia64 makes an attempt at recovering from hardware errors, but
+it is a difficult problem because of the asynchronous nature of these
+errors.  When processing an unmaskable interrupt we sometimes need
+special code to cope with our inability to take any locks.
+
+---
+
+How is ia64 MCA/INIT different from x86 NMI?
+
+* x86 NMI typically gets delivered to one cpu.  MCA/INIT gets sent to
+  all cpus.
+
+* x86 NMI cannot be nested.  MCA/INIT can be nested, to a depth of 2
+  per cpu.
+
+* x86 has a separate struct task which points to one of multiple kernel
+  stacks.  ia64 has the struct task embedded in the single kernel
+  stack, so switching stack means switching task.
+
+* x86 does not call the BIOS so the NMI handler does not have to worry
+  about any registers having changed.  MCA/INIT can occur while the cpu
+  is in PAL in physical mode, with undefined registers and an undefined
+  kernel stack.
+
+* i386 backtrace is not very sensitive to whether a process is running
+  or not.  ia64 unwind is very, very sensitive to whether a process is
+  running or not.
+
+---
+
+What happens when MCA/INIT is delivered what a cpu is running user
+space code?
+
+The user mode registers are stored in the RSE area of the MCA/INIT on
+entry to the OS and are restored from there on return to SAL, so user
+mode registers are preserved across a recoverable MCA/INIT.  Since the
+OS has no idea what unwind data is available for the user space stack,
+MCA/INIT never tries to backtrace user space.  Which means that the OS
+does not bother making the user space process look like a blocked task,
+i.e. the OS does not copy pt_regs and switch_stack to the user space
+stack.  Also the OS has no idea how big the user space RSE and memory
+stacks are, which makes it too risky to copy the saved state to a user
+mode stack.
+
+---
+
+How do we get a backtrace on the tasks that were running when MCA/INIT
+was delivered?
+
+mca.c:::ia64_mca_modify_original_stack().  That identifies and
+verifies the original kernel stack, copies the dirty registers from
+the MCA/INIT stack's RSE to the original stack's RSE, copies the
+skeleton struct pt_regs and switch_stack to the original stack, fills
+in the skeleton structures from the PAL minstate area and updates the
+original stack's thread.ksp.  That makes the original stack look
+exactly like any other blocked task, i.e. it now appears to be
+sleeping.  To get a backtrace, just start with thread.ksp for the
+original task and unwind like any other sleeping task.
+
+---
+
+How do we identify the tasks that were running when MCA/INIT was
+delivered?
+
+If the previous task has been verified and converted to a blocked
+state, then sos->prev_task on the MCA/INIT stack is updated to point to
+the previous task.  You can look at that field in dumps or debuggers.
+To help distinguish between the handler and the original tasks,
+handlers have _TIF_MCA_INIT set in thread_info.flags.
+
+The sos data is always in the MCA/INIT handler stack, at offset
+MCA_SOS_OFFSET.  You can get that value from mca_asm.h or calculate it
+as KERNEL_STACK_SIZE - sizeof(struct pt_regs) - sizeof(struct
+ia64_sal_os_state), with 16 byte alignment for all structures.
+
+Also the comm field of the MCA/INIT task is modified to include the pid
+of the original task, for humans to use.  For example, a comm field of
+'MCA 12159' means that pid 12159 was running when the MCA was
+delivered.
diff --git a/Documentation/ia64/mca.txt b/Documentation/ia64/mca.txt
deleted file mode 100644
index f097c60cba1b..000000000000
--- a/Documentation/ia64/mca.txt
+++ /dev/null
@@ -1,194 +0,0 @@
-An ad-hoc collection of notes on IA64 MCA and INIT processing.  Feel
-free to update it with notes about any area that is not clear.
-
----
-
-MCA/INIT are completely asynchronous.  They can occur at any time, when
-the OS is in any state.  Including when one of the cpus is already
-holding a spinlock.  Trying to get any lock from MCA/INIT state is
-asking for deadlock.  Also the state of structures that are protected
-by locks is indeterminate, including linked lists.
-
----
-
-The complicated ia64 MCA process.  All of this is mandated by Intel's
-specification for ia64 SAL, error recovery and unwind, it is not as
-if we have a choice here.
-
-* MCA occurs on one cpu, usually due to a double bit memory error.
-  This is the monarch cpu.
-
-* SAL sends an MCA rendezvous interrupt (which is a normal interrupt)
-  to all the other cpus, the slaves.
-
-* Slave cpus that receive the MCA interrupt call down into SAL, they
-  end up spinning disabled while the MCA is being serviced.
-
-* If any slave cpu was already spinning disabled when the MCA occurred
-  then it cannot service the MCA interrupt.  SAL waits ~20 seconds then
-  sends an unmaskable INIT event to the slave cpus that have not
-  already rendezvoused.
-
-* Because MCA/INIT can be delivered at any time, including when the cpu
-  is down in PAL in physical mode, the registers at the time of the
-  event are _completely_ undefined.  In particular the MCA/INIT
-  handlers cannot rely on the thread pointer, PAL physical mode can
-  (and does) modify TP.  It is allowed to do that as long as it resets
-  TP on return.  However MCA/INIT events expose us to these PAL
-  internal TP changes.  Hence curr_task().
-
-* If an MCA/INIT event occurs while the kernel was running (not user
-  space) and the kernel has called PAL then the MCA/INIT handler cannot
-  assume that the kernel stack is in a fit state to be used.  Mainly
-  because PAL may or may not maintain the stack pointer internally.
-  Because the MCA/INIT handlers cannot trust the kernel stack, they
-  have to use their own, per-cpu stacks.  The MCA/INIT stacks are
-  preformatted with just enough task state to let the relevant handlers
-  do their job.
-
-* Unlike most other architectures, the ia64 struct task is embedded in
-  the kernel stack[1].  So switching to a new kernel stack means that
-  we switch to a new task as well.  Because various bits of the kernel
-  assume that current points into the struct task, switching to a new
-  stack also means a new value for current.
-
-* Once all slaves have rendezvoused and are spinning disabled, the
-  monarch is entered.  The monarch now tries to diagnose the problem
-  and decide if it can recover or not.
-
-* Part of the monarch's job is to look at the state of all the other
-  tasks.  The only way to do that on ia64 is to call the unwinder,
-  as mandated by Intel.
-
-* The starting point for the unwind depends on whether a task is
-  running or not.  That is, whether it is on a cpu or is blocked.  The
-  monarch has to determine whether or not a task is on a cpu before it
-  knows how to start unwinding it.  The tasks that received an MCA or
-  INIT event are no longer running, they have been converted to blocked
-  tasks.  But (and its a big but), the cpus that received the MCA
-  rendezvous interrupt are still running on their normal kernel stacks!
-
-* To distinguish between these two cases, the monarch must know which
-  tasks are on a cpu and which are not.  Hence each slave cpu that
-  switches to an MCA/INIT stack, registers its new stack using
-  set_curr_task(), so the monarch can tell that the _original_ task is
-  no longer running on that cpu.  That gives us a decent chance of
-  getting a valid backtrace of the _original_ task.
-
-* MCA/INIT can be nested, to a depth of 2 on any cpu.  In the case of a
-  nested error, we want diagnostics on the MCA/INIT handler that
-  failed, not on the task that was originally running.  Again this
-  requires set_curr_task() so the MCA/INIT handlers can register their
-  own stack as running on that cpu.  Then a recursive error gets a
-  trace of the failing handler's "task".
-
-[1] My (Keith Owens) original design called for ia64 to separate its
-    struct task and the kernel stacks.  Then the MCA/INIT data would be
-    chained stacks like i386 interrupt stacks.  But that required
-    radical surgery on the rest of ia64, plus extra hard wired TLB
-    entries with its associated performance degradation.  David
-    Mosberger vetoed that approach.  Which meant that separate kernel
-    stacks meant separate "tasks" for the MCA/INIT handlers.
-
----
-
-INIT is less complicated than MCA.  Pressing the nmi button or using
-the equivalent command on the management console sends INIT to all
-cpus.  SAL picks one of the cpus as the monarch and the rest are
-slaves.  All the OS INIT handlers are entered at approximately the same
-time.  The OS monarch prints the state of all tasks and returns, after
-which the slaves return and the system resumes.
-
-At least that is what is supposed to happen.  Alas there are broken
-versions of SAL out there.  Some drive all the cpus as monarchs.  Some
-drive them all as slaves.  Some drive one cpu as monarch, wait for that
-cpu to return from the OS then drive the rest as slaves.  Some versions
-of SAL cannot even cope with returning from the OS, they spin inside
-SAL on resume.  The OS INIT code has workarounds for some of these
-broken SAL symptoms, but some simply cannot be fixed from the OS side.
-
----
-
-The scheduler hooks used by ia64 (curr_task, set_curr_task) are layer
-violations.  Unfortunately MCA/INIT start off as massive layer
-violations (can occur at _any_ time) and they build from there.
-
-At least ia64 makes an attempt at recovering from hardware errors, but
-it is a difficult problem because of the asynchronous nature of these
-errors.  When processing an unmaskable interrupt we sometimes need
-special code to cope with our inability to take any locks.
-
----
-
-How is ia64 MCA/INIT different from x86 NMI?
-
-* x86 NMI typically gets delivered to one cpu.  MCA/INIT gets sent to
-  all cpus.
-
-* x86 NMI cannot be nested.  MCA/INIT can be nested, to a depth of 2
-  per cpu.
-
-* x86 has a separate struct task which points to one of multiple kernel
-  stacks.  ia64 has the struct task embedded in the single kernel
-  stack, so switching stack means switching task.
-
-* x86 does not call the BIOS so the NMI handler does not have to worry
-  about any registers having changed.  MCA/INIT can occur while the cpu
-  is in PAL in physical mode, with undefined registers and an undefined
-  kernel stack.
-
-* i386 backtrace is not very sensitive to whether a process is running
-  or not.  ia64 unwind is very, very sensitive to whether a process is
-  running or not.
-
----
-
-What happens when MCA/INIT is delivered what a cpu is running user
-space code?
-
-The user mode registers are stored in the RSE area of the MCA/INIT on
-entry to the OS and are restored from there on return to SAL, so user
-mode registers are preserved across a recoverable MCA/INIT.  Since the
-OS has no idea what unwind data is available for the user space stack,
-MCA/INIT never tries to backtrace user space.  Which means that the OS
-does not bother making the user space process look like a blocked task,
-i.e. the OS does not copy pt_regs and switch_stack to the user space
-stack.  Also the OS has no idea how big the user space RSE and memory
-stacks are, which makes it too risky to copy the saved state to a user
-mode stack.
-
----
-
-How do we get a backtrace on the tasks that were running when MCA/INIT
-was delivered?
-
-mca.c:::ia64_mca_modify_original_stack().  That identifies and
-verifies the original kernel stack, copies the dirty registers from
-the MCA/INIT stack's RSE to the original stack's RSE, copies the
-skeleton struct pt_regs and switch_stack to the original stack, fills
-in the skeleton structures from the PAL minstate area and updates the
-original stack's thread.ksp.  That makes the original stack look
-exactly like any other blocked task, i.e. it now appears to be
-sleeping.  To get a backtrace, just start with thread.ksp for the
-original task and unwind like any other sleeping task.
-
----
-
-How do we identify the tasks that were running when MCA/INIT was
-delivered?
-
-If the previous task has been verified and converted to a blocked
-state, then sos->prev_task on the MCA/INIT stack is updated to point to
-the previous task.  You can look at that field in dumps or debuggers.
-To help distinguish between the handler and the original tasks,
-handlers have _TIF_MCA_INIT set in thread_info.flags.
-
-The sos data is always in the MCA/INIT handler stack, at offset
-MCA_SOS_OFFSET.  You can get that value from mca_asm.h or calculate it
-as KERNEL_STACK_SIZE - sizeof(struct pt_regs) - sizeof(struct
-ia64_sal_os_state), with 16 byte alignment for all structures.
-
-Also the comm field of the MCA/INIT task is modified to include the pid
-of the original task, for humans to use.  For example, a comm field of
-'MCA 12159' means that pid 12159 was running when the MCA was
-delivered.
diff --git a/Documentation/ia64/serial.rst b/Documentation/ia64/serial.rst
new file mode 100644
index 000000000000..1de70c305a79
--- /dev/null
+++ b/Documentation/ia64/serial.rst
@@ -0,0 +1,165 @@
+==============
+Serial Devices
+==============
+
+Serial Device Naming
+====================
+
+    As of 2.6.10, serial devices on ia64 are named based on the
+    order of ACPI and PCI enumeration.  The first device in the
+    ACPI namespace (if any) becomes /dev/ttyS0, the second becomes
+    /dev/ttyS1, etc., and PCI devices are named sequentially
+    starting after the ACPI devices.
+
+    Prior to 2.6.10, there were confusing exceptions to this:
+
+	- Firmware on some machines (mostly from HP) provides an HCDP
+	  table[1] that tells the kernel about devices that can be used
+	  as a serial console.  If the user specified "console=ttyS0"
+	  or the EFI ConOut path contained only UART devices, the
+	  kernel registered the device described by the HCDP as
+	  /dev/ttyS0.
+
+	- If there was no HCDP, we assumed there were UARTs at the
+	  legacy COM port addresses (I/O ports 0x3f8 and 0x2f8), so
+	  the kernel registered those as /dev/ttyS0 and /dev/ttyS1.
+
+    Any additional ACPI or PCI devices were registered sequentially
+    after /dev/ttyS0 as they were discovered.
+
+    With an HCDP, device names changed depending on EFI configuration
+    and "console=" arguments.  Without an HCDP, device names didn't
+    change, but we registered devices that might not really exist.
+
+    For example, an HP rx1600 with a single built-in serial port
+    (described in the ACPI namespace) plus an MP[2] (a PCI device) has
+    these ports:
+
+      ==========  ==========     ============    ============   =======
+      Type        MMIO           pre-2.6.10      pre-2.6.10     2.6.10+
+		  address
+				 (EFI console    (EFI console
+                                 on builtin)     on MP port)
+      ==========  ==========     ============    ============   =======
+      builtin     0xff5e0000        ttyS0           ttyS1         ttyS0
+      MP UPS      0xf8031000        ttyS1           ttyS2         ttyS1
+      MP Console  0xf8030000        ttyS2           ttyS0         ttyS2
+      MP 2        0xf8030010        ttyS3           ttyS3         ttyS3
+      MP 3        0xf8030038        ttyS4           ttyS4         ttyS4
+      ==========  ==========     ============    ============   =======
+
+Console Selection
+=================
+
+    EFI knows what your console devices are, but it doesn't tell the
+    kernel quite enough to actually locate them.  The DIG64 HCDP
+    table[1] does tell the kernel where potential serial console
+    devices are, but not all firmware supplies it.  Also, EFI supports
+    multiple simultaneous consoles and doesn't tell the kernel which
+    should be the "primary" one.
+
+    So how do you tell Linux which console device to use?
+
+	- If your firmware supplies the HCDP, it is simplest to
+	  configure EFI with a single device (either a UART or a VGA
+	  card) as the console.  Then you don't need to tell Linux
+	  anything; the kernel will automatically use the EFI console.
+
+	  (This works only in 2.6.6 or later; prior to that you had
+	  to specify "console=ttyS0" to get a serial console.)
+
+	- Without an HCDP, Linux defaults to a VGA console unless you
+	  specify a "console=" argument.
+
+    NOTE: Don't assume that a serial console device will be /dev/ttyS0.
+    It might be ttyS1, ttyS2, etc.  Make sure you have the appropriate
+    entries in /etc/inittab (for getty) and /etc/securetty (to allow
+    root login).
+
+Early Serial Console
+====================
+
+    The kernel can't start using a serial console until it knows where
+    the device lives.  Normally this happens when the driver enumerates
+    all the serial devices, which can happen a minute or more after the
+    kernel starts booting.
+
+    2.6.10 and later kernels have an "early uart" driver that works
+    very early in the boot process.  The kernel will automatically use
+    this if the user supplies an argument like "console=uart,io,0x3f8",
+    or if the EFI console path contains only a UART device and the
+    firmware supplies an HCDP.
+
+Troubleshooting Serial Console Problems
+=======================================
+
+    No kernel output after elilo prints "Uncompressing Linux... done":
+
+	- You specified "console=ttyS0" but Linux changed the device
+	  to which ttyS0 refers.  Configure exactly one EFI console
+	  device[3] and remove the "console=" option.
+
+	- The EFI console path contains both a VGA device and a UART.
+	  EFI and elilo use both, but Linux defaults to VGA.  Remove
+	  the VGA device from the EFI console path[3].
+
+	- Multiple UARTs selected as EFI console devices.  EFI and
+	  elilo use all selected devices, but Linux uses only one.
+	  Make sure only one UART is selected in the EFI console
+	  path[3].
+
+	- You're connected to an HP MP port[2] but have a non-MP UART
+	  selected as EFI console device.  EFI uses the MP as a
+	  console device even when it isn't explicitly selected.
+	  Either move the console cable to the non-MP UART, or change
+	  the EFI console path[3] to the MP UART.
+
+    Long pause (60+ seconds) between "Uncompressing Linux... done" and
+    start of kernel output:
+
+	- No early console because you used "console=ttyS<n>".  Remove
+	  the "console=" option if your firmware supplies an HCDP.
+
+	- If you don't have an HCDP, the kernel doesn't know where
+	  your console lives until the driver discovers serial
+	  devices.  Use "console=uart,io,0x3f8" (or appropriate
+	  address for your machine).
+
+    Kernel and init script output works fine, but no "login:" prompt:
+
+	- Add getty entry to /etc/inittab for console tty.  Look for
+	  the "Adding console on ttyS<n>" message that tells you which
+	  device is the console.
+
+    "login:" prompt, but can't login as root:
+
+	- Add entry to /etc/securetty for console tty.
+
+    No ACPI serial devices found in 2.6.17 or later:
+
+	- Turn on CONFIG_PNP and CONFIG_PNPACPI.  Prior to 2.6.17, ACPI
+	  serial devices were discovered by 8250_acpi.  In 2.6.17,
+	  8250_acpi was replaced by the combination of 8250_pnp and
+	  CONFIG_PNPACPI.
+
+
+
+[1]
+    http://www.dig64.org/specifications/agreement
+    The table was originally defined as the "HCDP" for "Headless
+    Console/Debug Port."  The current version is the "PCDP" for
+    "Primary Console and Debug Port Devices."
+
+[2]
+    The HP MP (management processor) is a PCI device that provides
+    several UARTs.  One of the UARTs is often used as a console; the
+    EFI Boot Manager identifies it as "Acpi(HWP0002,700)/Pci(...)/Uart".
+    The external connection is usually a 25-pin connector, and a
+    special dongle converts that to three 9-pin connectors, one of
+    which is labelled "Console."
+
+[3]
+    EFI console devices are configured using the EFI Boot Manager
+    "Boot option maintenance" menu.  You may have to interrupt the
+    boot sequence to use this menu, and you will have to reset the
+    box after changing console configuration.
diff --git a/Documentation/ia64/serial.txt b/Documentation/ia64/serial.txt
deleted file mode 100644
index a63d2c54329b..000000000000
--- a/Documentation/ia64/serial.txt
+++ /dev/null
@@ -1,151 +0,0 @@
-SERIAL DEVICE NAMING
-
-    As of 2.6.10, serial devices on ia64 are named based on the
-    order of ACPI and PCI enumeration.  The first device in the
-    ACPI namespace (if any) becomes /dev/ttyS0, the second becomes
-    /dev/ttyS1, etc., and PCI devices are named sequentially
-    starting after the ACPI devices.
-
-    Prior to 2.6.10, there were confusing exceptions to this:
-
-	- Firmware on some machines (mostly from HP) provides an HCDP
-	  table[1] that tells the kernel about devices that can be used
-	  as a serial console.  If the user specified "console=ttyS0"
-	  or the EFI ConOut path contained only UART devices, the
-	  kernel registered the device described by the HCDP as
-	  /dev/ttyS0.
-
-	- If there was no HCDP, we assumed there were UARTs at the
-	  legacy COM port addresses (I/O ports 0x3f8 and 0x2f8), so
-	  the kernel registered those as /dev/ttyS0 and /dev/ttyS1.
-
-    Any additional ACPI or PCI devices were registered sequentially
-    after /dev/ttyS0 as they were discovered.
-
-    With an HCDP, device names changed depending on EFI configuration
-    and "console=" arguments.  Without an HCDP, device names didn't
-    change, but we registered devices that might not really exist.
-
-    For example, an HP rx1600 with a single built-in serial port
-    (described in the ACPI namespace) plus an MP[2] (a PCI device) has
-    these ports:
-
-                                  pre-2.6.10      pre-2.6.10
-                    MMIO         (EFI console    (EFI console
-                   address        on builtin)     on MP port)    2.6.10
-                  ==========      ==========      ==========     ======
-      builtin     0xff5e0000        ttyS0           ttyS1         ttyS0
-      MP UPS      0xf8031000        ttyS1           ttyS2         ttyS1
-      MP Console  0xf8030000        ttyS2           ttyS0         ttyS2
-      MP 2        0xf8030010        ttyS3           ttyS3         ttyS3
-      MP 3        0xf8030038        ttyS4           ttyS4         ttyS4
-
-CONSOLE SELECTION
-
-    EFI knows what your console devices are, but it doesn't tell the
-    kernel quite enough to actually locate them.  The DIG64 HCDP
-    table[1] does tell the kernel where potential serial console
-    devices are, but not all firmware supplies it.  Also, EFI supports
-    multiple simultaneous consoles and doesn't tell the kernel which
-    should be the "primary" one.
-
-    So how do you tell Linux which console device to use?
-
-	- If your firmware supplies the HCDP, it is simplest to
-	  configure EFI with a single device (either a UART or a VGA
-	  card) as the console.  Then you don't need to tell Linux
-	  anything; the kernel will automatically use the EFI console.
-
-	  (This works only in 2.6.6 or later; prior to that you had
-	  to specify "console=ttyS0" to get a serial console.)
-
-	- Without an HCDP, Linux defaults to a VGA console unless you
-	  specify a "console=" argument.
-
-    NOTE: Don't assume that a serial console device will be /dev/ttyS0.
-    It might be ttyS1, ttyS2, etc.  Make sure you have the appropriate
-    entries in /etc/inittab (for getty) and /etc/securetty (to allow
-    root login).
-
-EARLY SERIAL CONSOLE
-
-    The kernel can't start using a serial console until it knows where
-    the device lives.  Normally this happens when the driver enumerates
-    all the serial devices, which can happen a minute or more after the
-    kernel starts booting.
-
-    2.6.10 and later kernels have an "early uart" driver that works
-    very early in the boot process.  The kernel will automatically use
-    this if the user supplies an argument like "console=uart,io,0x3f8",
-    or if the EFI console path contains only a UART device and the
-    firmware supplies an HCDP.
-
-TROUBLESHOOTING SERIAL CONSOLE PROBLEMS
-
-    No kernel output after elilo prints "Uncompressing Linux... done":
-
-	- You specified "console=ttyS0" but Linux changed the device
-	  to which ttyS0 refers.  Configure exactly one EFI console
-	  device[3] and remove the "console=" option.
-
-	- The EFI console path contains both a VGA device and a UART.
-	  EFI and elilo use both, but Linux defaults to VGA.  Remove
-	  the VGA device from the EFI console path[3].
-
-	- Multiple UARTs selected as EFI console devices.  EFI and
-	  elilo use all selected devices, but Linux uses only one.
-	  Make sure only one UART is selected in the EFI console
-	  path[3].
-
-	- You're connected to an HP MP port[2] but have a non-MP UART
-	  selected as EFI console device.  EFI uses the MP as a
-	  console device even when it isn't explicitly selected.
-	  Either move the console cable to the non-MP UART, or change
-	  the EFI console path[3] to the MP UART.
-
-    Long pause (60+ seconds) between "Uncompressing Linux... done" and
-    start of kernel output:
-
-	- No early console because you used "console=ttyS<n>".  Remove
-	  the "console=" option if your firmware supplies an HCDP.
-
-	- If you don't have an HCDP, the kernel doesn't know where
-	  your console lives until the driver discovers serial
-	  devices.  Use "console=uart,io,0x3f8" (or appropriate
-	  address for your machine).
-
-    Kernel and init script output works fine, but no "login:" prompt:
-
-	- Add getty entry to /etc/inittab for console tty.  Look for
-	  the "Adding console on ttyS<n>" message that tells you which
-	  device is the console.
-
-    "login:" prompt, but can't login as root:
-
-	- Add entry to /etc/securetty for console tty.
-
-    No ACPI serial devices found in 2.6.17 or later:
-
-	- Turn on CONFIG_PNP and CONFIG_PNPACPI.  Prior to 2.6.17, ACPI
-	  serial devices were discovered by 8250_acpi.  In 2.6.17,
-	  8250_acpi was replaced by the combination of 8250_pnp and
-	  CONFIG_PNPACPI.
-
-
-
-[1] http://www.dig64.org/specifications/agreement 
-    The table was originally defined as the "HCDP" for "Headless
-    Console/Debug Port."  The current version is the "PCDP" for
-    "Primary Console and Debug Port Devices."
-
-[2] The HP MP (management processor) is a PCI device that provides
-    several UARTs.  One of the UARTs is often used as a console; the
-    EFI Boot Manager identifies it as "Acpi(HWP0002,700)/Pci(...)/Uart".
-    The external connection is usually a 25-pin connector, and a
-    special dongle converts that to three 9-pin connectors, one of
-    which is labelled "Console."
-
-[3] EFI console devices are configured using the EFI Boot Manager
-    "Boot option maintenance" menu.  You may have to interrupt the
-    boot sequence to use this menu, and you will have to reset the
-    box after changing console configuration.
diff --git a/Documentation/ia64/xen.rst b/Documentation/ia64/xen.rst
new file mode 100644
index 000000000000..831339c74441
--- /dev/null
+++ b/Documentation/ia64/xen.rst
@@ -0,0 +1,206 @@
+********************************************************
+Recipe for getting/building/running Xen/ia64 with pv_ops
+********************************************************
+This recipe describes how to get xen-ia64 source and build it,
+and run domU with pv_ops.
+
+Requirements
+============
+
+  - python
+  - mercurial
+    it (aka "hg") is an open-source source code
+    management software. See the below.
+    http://www.selenic.com/mercurial/wiki/
+  - git
+  - bridge-utils
+
+Getting and Building Xen and Dom0
+=================================
+
+  My environment is:
+
+    - Machine  : Tiger4
+    - Domain0 OS  : RHEL5
+    - DomainU OS  : RHEL5
+
+ 1. Download source::
+
+	# hg clone http://xenbits.xensource.com/ext/ia64/xen-unstable.hg
+	# cd xen-unstable.hg
+	# hg clone http://xenbits.xensource.com/ext/ia64/linux-2.6.18-xen.hg
+
+ 2. # make world
+
+ 3. # make install-tools
+
+ 4. copy kernels and xen::
+
+	# cp xen/xen.gz /boot/efi/efi/redhat/
+	# cp build-linux-2.6.18-xen_ia64/vmlinux.gz \
+	/boot/efi/efi/redhat/vmlinuz-2.6.18.8-xen
+
+ 5. make initrd for Dom0/DomU::
+
+	# make -C linux-2.6.18-xen.hg ARCH=ia64 modules_install \
+          O=$(pwd)/build-linux-2.6.18-xen_ia64
+	# mkinitrd -f /boot/efi/efi/redhat/initrd-2.6.18.8-xen.img \
+	  2.6.18.8-xen --builtin mptspi --builtin mptbase \
+	  --builtin mptscsih --builtin uhci-hcd --builtin ohci-hcd \
+	  --builtin ehci-hcd
+
+Making a disk image for guest OS
+================================
+
+ 1. make file::
+
+      # dd if=/dev/zero of=/root/rhel5.img bs=1M seek=4096 count=0
+      # mke2fs -F -j /root/rhel5.img
+      # mount -o loop /root/rhel5.img /mnt
+      # cp -ax /{dev,var,etc,usr,bin,sbin,lib} /mnt
+      # mkdir /mnt/{root,proc,sys,home,tmp}
+
+      Note: You may miss some device files. If so, please create them
+      with mknod. Or you can use tar instead of cp.
+
+ 2. modify DomU's fstab::
+
+      # vi /mnt/etc/fstab
+         /dev/xvda1  /            ext3    defaults        1 1
+         none        /dev/pts     devpts  gid=5,mode=620  0 0
+         none        /dev/shm     tmpfs   defaults        0 0
+         none        /proc        proc    defaults        0 0
+         none        /sys         sysfs   defaults        0 0
+
+ 3. modify inittab
+
+    set runlevel to 3 to avoid X trying to start::
+
+      # vi /mnt/etc/inittab
+         id:3:initdefault:
+
+    Start a getty on the hvc0 console::
+
+       X0:2345:respawn:/sbin/mingetty hvc0
+
+    tty1-6 mingetty can be commented out
+
+ 4. add hvc0 into /etc/securetty::
+
+      # vi /mnt/etc/securetty (add hvc0)
+
+ 5. umount::
+
+      # umount /mnt
+
+FYI, virt-manager can also make a disk image for guest OS.
+It's GUI tools and easy to make it.
+
+Boot Xen & Domain0
+==================
+
+ 1. replace elilo
+    elilo of RHEL5 can boot Xen and Dom0.
+    If you use old elilo (e.g RHEL4), please download from the below
+    http://elilo.sourceforge.net/cgi-bin/blosxom
+    and copy into /boot/efi/efi/redhat/::
+
+      # cp elilo-3.6-ia64.efi /boot/efi/efi/redhat/elilo.efi
+
+ 2. modify elilo.conf (like the below)::
+
+      # vi /boot/efi/efi/redhat/elilo.conf
+      prompt
+      timeout=20
+      default=xen
+      relocatable
+
+      image=vmlinuz-2.6.18.8-xen
+             label=xen
+             vmm=xen.gz
+             initrd=initrd-2.6.18.8-xen.img
+             read-only
+             append=" -- rhgb root=/dev/sda2"
+
+The append options before "--" are for xen hypervisor,
+the options after "--" are for dom0.
+
+FYI, your machine may need console options like
+"com1=19200,8n1 console=vga,com1". For example,
+append="com1=19200,8n1 console=vga,com1 -- rhgb console=tty0 \
+console=ttyS0 root=/dev/sda2"
+
+Getting and Building domU with pv_ops
+=====================================
+
+ 1. get pv_ops tree::
+
+      # git clone http://people.valinux.co.jp/~yamahata/xen-ia64/linux-2.6-xen-ia64.git/
+
+ 2. git branch (if necessary)::
+
+      # cd linux-2.6-xen-ia64/
+      # git checkout -b your_branch origin/xen-ia64-domu-minimal-2008may19
+
+   Note:
+     The current branch is xen-ia64-domu-minimal-2008may19.
+     But you would find the new branch. You can see with
+     "git branch -r" to get the branch lists.
+
+       http://people.valinux.co.jp/~yamahata/xen-ia64/for_eagl/linux-2.6-ia64-pv-ops.git/
+
+     is also available.
+
+     The tree is based on
+
+      git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6 test)
+
+ 3. copy .config for pv_ops of domU::
+
+      # cp arch/ia64/configs/xen_domu_wip_defconfig .config
+
+ 4. make kernel with pv_ops::
+
+      # make oldconfig
+      # make
+
+ 5. install the kernel and initrd::
+
+      # cp vmlinux.gz /boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU
+      # make modules_install
+      # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img \
+        2.6.26-rc3xen-ia64-08941-g1b12161 --builtin mptspi \
+        --builtin mptbase --builtin mptscsih --builtin uhci-hcd \
+        --builtin ohci-hcd --builtin ehci-hcd
+
+Boot DomainU with pv_ops
+========================
+
+ 1. make config of DomU::
+
+     # vi /etc/xen/rhel5
+       kernel = "/boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU"
+       ramdisk = "/boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img"
+       vcpus = 1
+       memory = 512
+       name = "rhel5"
+       disk = [ 'file:/root/rhel5.img,xvda1,w' ]
+       root = "/dev/xvda1 ro"
+       extra= "rhgb console=hvc0"
+
+ 2. After boot xen and dom0, start xend::
+
+	# /etc/init.d/xend start
+
+   ( In the debugging case, `# XEND_DEBUG=1 xend trace_start` )
+
+ 3. start domU::
+
+	# xm create -c rhel5
+
+Reference
+=========
+- Wiki of Xen/IA64 upstream merge
+  http://wiki.xensource.com/xenwiki/XenIA64/UpstreamMerge
+
+Written by Akio Takebe <takebe_akio@jp.fujitsu.com> on 28 May 2008
diff --git a/Documentation/ia64/xen.txt b/Documentation/ia64/xen.txt
deleted file mode 100644
index a12c74ce2773..000000000000
--- a/Documentation/ia64/xen.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-       Recipe for getting/building/running Xen/ia64 with pv_ops
-       --------------------------------------------------------
-
-This recipe describes how to get xen-ia64 source and build it,
-and run domU with pv_ops.
-
-============
-Requirements
-============
-
-  - python
-  - mercurial
-    it (aka "hg") is an open-source source code
-    management software. See the below.
-    http://www.selenic.com/mercurial/wiki/
-  - git
-  - bridge-utils
-
-=================================
-Getting and Building Xen and Dom0
-=================================
-
-  My environment is;
-    Machine  : Tiger4
-    Domain0 OS  : RHEL5
-    DomainU OS  : RHEL5
-
- 1. Download source
-    # hg clone http://xenbits.xensource.com/ext/ia64/xen-unstable.hg
-    # cd xen-unstable.hg
-    # hg clone http://xenbits.xensource.com/ext/ia64/linux-2.6.18-xen.hg
-
- 2. # make world
-
- 3. # make install-tools
-
- 4. copy kernels and xen
-    # cp xen/xen.gz /boot/efi/efi/redhat/
-    # cp build-linux-2.6.18-xen_ia64/vmlinux.gz \
-      /boot/efi/efi/redhat/vmlinuz-2.6.18.8-xen
-
- 5. make initrd for Dom0/DomU
-    # make -C linux-2.6.18-xen.hg ARCH=ia64 modules_install \
-      O=$(pwd)/build-linux-2.6.18-xen_ia64
-    # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6.18.8-xen.img \
-      2.6.18.8-xen --builtin mptspi --builtin mptbase \
-      --builtin mptscsih --builtin uhci-hcd --builtin ohci-hcd \
-      --builtin ehci-hcd
-
-================================
-Making a disk image for guest OS
-================================
-
- 1. make file
-    # dd if=/dev/zero of=/root/rhel5.img bs=1M seek=4096 count=0
-    # mke2fs -F -j /root/rhel5.img
-    # mount -o loop /root/rhel5.img /mnt
-    # cp -ax /{dev,var,etc,usr,bin,sbin,lib} /mnt
-    # mkdir /mnt/{root,proc,sys,home,tmp}
-
-    Note: You may miss some device files. If so, please create them
-    with mknod. Or you can use tar instead of cp.
-
- 2. modify DomU's fstab
-    # vi /mnt/etc/fstab
-       /dev/xvda1  /            ext3    defaults        1 1
-       none        /dev/pts     devpts  gid=5,mode=620  0 0
-       none        /dev/shm     tmpfs   defaults        0 0
-       none        /proc        proc    defaults        0 0
-       none        /sys         sysfs   defaults        0 0
-
- 3. modify inittab
-    set runlevel to 3 to avoid X trying to start
-    # vi /mnt/etc/inittab
-       id:3:initdefault:
-    Start a getty on the hvc0 console
-       X0:2345:respawn:/sbin/mingetty hvc0
-    tty1-6 mingetty can be commented out
-
- 4. add hvc0 into /etc/securetty
-    # vi /mnt/etc/securetty (add hvc0)
-
- 5. umount
-    # umount /mnt
-
-FYI, virt-manager can also make a disk image for guest OS.
-It's GUI tools and easy to make it.
-
-==================
-Boot Xen & Domain0
-==================
-
- 1. replace elilo
-    elilo of RHEL5 can boot Xen and Dom0.
-    If you use old elilo (e.g RHEL4), please download from the below
-    http://elilo.sourceforge.net/cgi-bin/blosxom
-    and copy into /boot/efi/efi/redhat/
-    # cp elilo-3.6-ia64.efi /boot/efi/efi/redhat/elilo.efi
-
- 2. modify elilo.conf (like the below)
-    # vi /boot/efi/efi/redhat/elilo.conf
-     prompt
-     timeout=20
-     default=xen
-     relocatable
-
-     image=vmlinuz-2.6.18.8-xen
-             label=xen
-             vmm=xen.gz
-             initrd=initrd-2.6.18.8-xen.img
-             read-only
-             append=" -- rhgb root=/dev/sda2"
-
-The append options before "--" are for xen hypervisor,
-the options after "--" are for dom0.
-
-FYI, your machine may need console options like
-"com1=19200,8n1 console=vga,com1". For example,
-append="com1=19200,8n1 console=vga,com1 -- rhgb console=tty0 \
-console=ttyS0 root=/dev/sda2"
-
-=====================================
-Getting and Building domU with pv_ops
-=====================================
-
- 1. get pv_ops tree
-    # git clone http://people.valinux.co.jp/~yamahata/xen-ia64/linux-2.6-xen-ia64.git/
-
- 2. git branch (if necessary)
-    # cd linux-2.6-xen-ia64/
-    # git checkout -b your_branch origin/xen-ia64-domu-minimal-2008may19
-    (Note: The current branch is xen-ia64-domu-minimal-2008may19.
-    But you would find the new branch. You can see with
-    "git branch -r" to get the branch lists.
-    http://people.valinux.co.jp/~yamahata/xen-ia64/for_eagl/linux-2.6-ia64-pv-ops.git/
-    is also available. The tree is based on
-    git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6 test)
-
-
- 3. copy .config for pv_ops of domU
-    # cp arch/ia64/configs/xen_domu_wip_defconfig .config
-
- 4. make kernel with pv_ops
-    # make oldconfig
-    # make
-
- 5. install the kernel and initrd
-    # cp vmlinux.gz /boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU
-    # make modules_install
-    # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img \
-      2.6.26-rc3xen-ia64-08941-g1b12161 --builtin mptspi \
-      --builtin mptbase --builtin mptscsih --builtin uhci-hcd \
-      --builtin ohci-hcd --builtin ehci-hcd
-
-========================
-Boot DomainU with pv_ops
-========================
-
- 1. make config of DomU
-   # vi /etc/xen/rhel5
-     kernel = "/boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU"
-     ramdisk = "/boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img"
-     vcpus = 1
-     memory = 512
-     name = "rhel5"
-     disk = [ 'file:/root/rhel5.img,xvda1,w' ]
-     root = "/dev/xvda1 ro"
-     extra= "rhgb console=hvc0"
-
- 2. After boot xen and dom0, start xend
-   # /etc/init.d/xend start
-   ( In the debugging case, # XEND_DEBUG=1 xend trace_start )
-
- 3. start domU
-   # xm create -c rhel5
-
-=========
-Reference
-=========
-- Wiki of Xen/IA64 upstream merge
-  http://wiki.xensource.com/xenwiki/XenIA64/UpstreamMerge
-
-Written by Akio Takebe <takebe_akio@jp.fujitsu.com> on 28 May 2008
diff --git a/Documentation/ide/changelogs.rst b/Documentation/ide/changelogs.rst
new file mode 100644
index 000000000000..fdf9d0fb8027
--- /dev/null
+++ b/Documentation/ide/changelogs.rst
@@ -0,0 +1,17 @@
+Changelog for ide cd
+--------------------
+
+ .. include:: ChangeLog.ide-cd.1994-2004
+    :literal:
+
+Changelog for ide floppy
+------------------------
+
+ .. include:: ChangeLog.ide-floppy.1996-2002
+    :literal:
+
+Changelog for ide tape
+----------------------
+
+ .. include:: ChangeLog.ide-tape.1995-2002
+    :literal:
diff --git a/Documentation/ide/ide-tape.rst b/Documentation/ide/ide-tape.rst
new file mode 100644
index 000000000000..3e061d9c0e38
--- /dev/null
+++ b/Documentation/ide/ide-tape.rst
@@ -0,0 +1,68 @@
+===============================
+IDE ATAPI streaming tape driver
+===============================
+
+This driver is a part of the Linux ide driver.
+
+The driver, in co-operation with ide.c, basically traverses the
+request-list for the block device interface. The character device
+interface, on the other hand, creates new requests, adds them
+to the request-list of the block device, and waits for their completion.
+
+The block device major and minor numbers are determined from the
+tape's relative position in the ide interfaces, as explained in ide.c.
+
+The character device interface consists of the following devices::
+
+  ht0		major 37, minor 0	first  IDE tape, rewind on close.
+  ht1		major 37, minor 1	second IDE tape, rewind on close.
+  ...
+  nht0		major 37, minor 128	first  IDE tape, no rewind on close.
+  nht1		major 37, minor 129	second IDE tape, no rewind on close.
+  ...
+
+The general magnetic tape commands compatible interface, as defined by
+include/linux/mtio.h, is accessible through the character device.
+
+General ide driver configuration options, such as the interrupt-unmask
+flag, can be configured by issuing an ioctl to the block device interface,
+as any other ide device.
+
+Our own ide-tape ioctl's can be issued to either the block device or
+the character device interface.
+
+Maximal throughput with minimal bus load will usually be achieved in the
+following scenario:
+
+     1.	ide-tape is operating in the pipelined operation mode.
+     2.	No buffering is performed by the user backup program.
+
+Testing was done with a 2 GB CONNER CTMA 4000 IDE ATAPI Streaming Tape Drive.
+
+Here are some words from the first releases of hd.c, which are quoted
+in ide.c and apply here as well:
+
+* Special care is recommended.  Have Fun!
+
+Possible improvements
+=====================
+
+1. Support for the ATAPI overlap protocol.
+
+In order to maximize bus throughput, we currently use the DSC
+overlap method which enables ide.c to service requests from the
+other device while the tape is busy executing a command. The
+DSC overlap method involves polling the tape's status register
+for the DSC bit, and servicing the other device while the tape
+isn't ready.
+
+In the current QIC development standard (December 1995),
+it is recommended that new tape drives will *in addition*
+implement the ATAPI overlap protocol, which is used for the
+same purpose - efficient use of the IDE bus, but is interrupt
+driven and thus has much less CPU overhead.
+
+ATAPI overlap is likely to be supported in most new ATAPI
+devices, including new ATAPI cdroms, and thus provides us
+a method by which we can achieve higher throughput when
+sharing a (fast) ATA-2 disk with any (slow) new ATAPI device.
diff --git a/Documentation/ide/ide-tape.txt b/Documentation/ide/ide-tape.txt
deleted file mode 100644
index 3f348a0b21d8..000000000000
--- a/Documentation/ide/ide-tape.txt
+++ /dev/null
@@ -1,65 +0,0 @@
-IDE ATAPI streaming tape driver.
-
-This driver is a part of the Linux ide driver.
-
-The driver, in co-operation with ide.c, basically traverses the
-request-list for the block device interface. The character device
-interface, on the other hand, creates new requests, adds them
-to the request-list of the block device, and waits for their completion.
-
-The block device major and minor numbers are determined from the
-tape's relative position in the ide interfaces, as explained in ide.c.
-
-The character device interface consists of the following devices:
-
-ht0		major 37, minor 0	first  IDE tape, rewind on close.
-ht1		major 37, minor 1	second IDE tape, rewind on close.
-...
-nht0		major 37, minor 128	first  IDE tape, no rewind on close.
-nht1		major 37, minor 129	second IDE tape, no rewind on close.
-...
-
-The general magnetic tape commands compatible interface, as defined by
-include/linux/mtio.h, is accessible through the character device.
-
-General ide driver configuration options, such as the interrupt-unmask
-flag, can be configured by issuing an ioctl to the block device interface,
-as any other ide device.
-
-Our own ide-tape ioctl's can be issued to either the block device or
-the character device interface.
-
-Maximal throughput with minimal bus load will usually be achieved in the
-following scenario:
-
-     1.	ide-tape is operating in the pipelined operation mode.
-     2.	No buffering is performed by the user backup program.
-
-Testing was done with a 2 GB CONNER CTMA 4000 IDE ATAPI Streaming Tape Drive.
-
-Here are some words from the first releases of hd.c, which are quoted
-in ide.c and apply here as well:
-
-| Special care is recommended.  Have Fun!
-
-Possible improvements:
-
-1. Support for the ATAPI overlap protocol.
-
-In order to maximize bus throughput, we currently use the DSC
-overlap method which enables ide.c to service requests from the
-other device while the tape is busy executing a command. The
-DSC overlap method involves polling the tape's status register
-for the DSC bit, and servicing the other device while the tape
-isn't ready.
-
-In the current QIC development standard (December 1995),
-it is recommended that new tape drives will *in addition*
-implement the ATAPI overlap protocol, which is used for the
-same purpose - efficient use of the IDE bus, but is interrupt
-driven and thus has much less CPU overhead.
-
-ATAPI overlap is likely to be supported in most new ATAPI
-devices, including new ATAPI cdroms, and thus provides us
-a method by which we can achieve higher throughput when
-sharing a (fast) ATA-2 disk with any (slow) new ATAPI device.
diff --git a/Documentation/ide/ide.rst b/Documentation/ide/ide.rst
new file mode 100644
index 000000000000..88bdcba92f7d
--- /dev/null
+++ b/Documentation/ide/ide.rst
@@ -0,0 +1,265 @@
+============================================
+Information regarding the Enhanced IDE drive
+============================================
+
+   The hdparm utility can be used to control various IDE features on a
+   running system. It is packaged separately.  Please Look for it on popular
+   linux FTP sites.
+
+-------------------------------------------------------------------------------
+
+.. important::
+
+   BUGGY IDE CHIPSETS CAN CORRUPT DATA!!
+
+    PCI versions of the CMD640 and RZ1000 interfaces are now detected
+    automatically at startup when PCI BIOS support is configured.
+
+    Linux disables the "prefetch" ("readahead") mode of the RZ1000
+    to prevent data corruption possible due to hardware design flaws.
+
+    For the CMD640, linux disables "IRQ unmasking" (hdparm -u1) on any
+    drive for which the "prefetch" mode of the CMD640 is turned on.
+    If "prefetch" is disabled (hdparm -p8), then "IRQ unmasking" can be
+    used again.
+
+    For the CMD640, linux disables "32bit I/O" (hdparm -c1) on any drive
+    for which the "prefetch" mode of the CMD640 is turned off.
+    If "prefetch" is enabled (hdparm -p9), then "32bit I/O" can be
+    used again.
+
+    The CMD640 is also used on some Vesa Local Bus (VLB) cards, and is *NOT*
+    automatically detected by Linux.  For safe, reliable operation with such
+    interfaces, one *MUST* use the "cmd640.probe_vlb" kernel option.
+
+    Use of the "serialize" option is no longer necessary.
+
+-------------------------------------------------------------------------------
+
+Common pitfalls
+===============
+
+- 40-conductor IDE cables are capable of transferring data in DMA modes up to
+  udma2, but no faster.
+
+- If possible devices should be attached to separate channels if they are
+  available. Typically the disk on the first and CD-ROM on the second.
+
+- If you mix devices on the same cable, please consider using similar devices
+  in respect of the data transfer mode they support.
+
+- Even better try to stick to the same vendor and device type on the same
+  cable.
+
+This is the multiple IDE interface driver, as evolved from hd.c
+===============================================================
+
+It supports up to 9 IDE interfaces per default, on one or more IRQs (usually
+14 & 15).  There can be up to two drives per interface, as per the ATA-6 spec.::
+
+  Primary:    ide0, port 0x1f0; major=3;  hda is minor=0; hdb is minor=64
+  Secondary:  ide1, port 0x170; major=22; hdc is minor=0; hdd is minor=64
+  Tertiary:   ide2, port 0x1e8; major=33; hde is minor=0; hdf is minor=64
+  Quaternary: ide3, port 0x168; major=34; hdg is minor=0; hdh is minor=64
+  fifth..     ide4, usually PCI, probed
+  sixth..     ide5, usually PCI, probed
+
+To access devices on interfaces > ide0, device entries please make sure that
+device files for them are present in /dev.  If not, please create such
+entries, by using /dev/MAKEDEV.
+
+This driver automatically probes for most IDE interfaces (including all PCI
+ones), for the drives/geometries attached to those interfaces, and for the IRQ
+lines being used by the interfaces (normally 14, 15 for ide0/ide1).
+
+Any number of interfaces may share a single IRQ if necessary, at a slight
+performance penalty, whether on separate cards or a single VLB card.
+The IDE driver automatically detects and handles this.  However, this may
+or may not be harmful to your hardware.. two or more cards driving the same IRQ
+can potentially burn each other's bus driver, though in practice this
+seldom occurs.  Be careful, and if in doubt, don't do it!
+
+Drives are normally found by auto-probing and/or examining the CMOS/BIOS data.
+For really weird situations, the apparent (fdisk) geometry can also be specified
+on the kernel "command line" using LILO.  The format of such lines is::
+
+	ide_core.chs=[interface_number.device_number]:cyls,heads,sects
+
+or::
+
+	ide_core.cdrom=[interface_number.device_number]
+
+For example::
+
+	ide_core.chs=1.0:1050,32,64  ide_core.cdrom=1.1
+
+The results of successful auto-probing may override the physical geometry/irq
+specified, though the "original" geometry may be retained as the "logical"
+geometry for partitioning purposes (fdisk).
+
+If the auto-probing during boot time confuses a drive (ie. the drive works
+with hd.c but not with ide.c), then an command line option may be specified
+for each drive for which you'd like the drive to skip the hardware
+probe/identification sequence.  For example::
+
+	ide_core.noprobe=0.1
+
+or::
+
+	ide_core.chs=1.0:768,16,32
+	ide_core.noprobe=1.0
+
+Note that when only one IDE device is attached to an interface, it should be
+jumpered as "single" or "master", *not* "slave".  Many folks have had
+"trouble" with cdroms because of this requirement, so the driver now probes
+for both units, though success is more likely when the drive is jumpered
+correctly.
+
+Courtesy of Scott Snyder and others, the driver supports ATAPI cdrom drives
+such as the NEC-260 and the new MITSUMI triple/quad speed drives.
+Such drives will be identified at boot time, just like a hard disk.
+
+If for some reason your cdrom drive is *not* found at boot time, you can force
+the probe to look harder by supplying a kernel command line parameter
+via LILO, such as:::
+
+	ide_core.cdrom=1.0	/* "master" on second interface (hdc) */
+
+or::
+
+	ide_core.cdrom=1.1	/* "slave" on second interface (hdd) */
+
+For example, a GW2000 system might have a hard drive on the primary
+interface (/dev/hda) and an IDE cdrom drive on the secondary interface
+(/dev/hdc).  To mount a CD in the cdrom drive, one would use something like::
+
+	ln -sf /dev/hdc /dev/cdrom
+	mkdir /mnt/cdrom
+	mount /dev/cdrom /mnt/cdrom -t iso9660 -o ro
+
+If, after doing all of the above, mount doesn't work and you see
+errors from the driver (with dmesg) complaining about `status=0xff`,
+this means that the hardware is not responding to the driver's attempts
+to read it.  One of the following is probably the problem:
+
+  - Your hardware is broken.
+
+  - You are using the wrong address for the device, or you have the
+    drive jumpered wrong.  Review the configuration instructions above.
+
+  - Your IDE controller requires some nonstandard initialization sequence
+    before it will work properly.  If this is the case, there will often
+    be a separate MS-DOS driver just for the controller.  IDE interfaces
+    on sound cards usually fall into this category.  Such configurations
+    can often be made to work by first booting MS-DOS, loading the
+    appropriate drivers, and then warm-booting linux (without powering
+    off).  This can be automated using loadlin in the MS-DOS autoexec.
+
+If you always get timeout errors, interrupts from the drive are probably
+not making it to the host.  Check how you have the hardware jumpered
+and make sure it matches what the driver expects (see the configuration
+instructions above).  If you have a PCI system, also check the BIOS
+setup; I've had one report of a system which was shipped with IRQ 15
+disabled by the BIOS.
+
+The kernel is able to execute binaries directly off of the cdrom,
+provided it is mounted with the default block size of 1024 (as above).
+
+Please pass on any feedback on any of this stuff to the maintainer,
+whose address can be found in linux/MAINTAINERS.
+
+The IDE driver is modularized.  The high level disk/CD-ROM/tape/floppy
+drivers can always be compiled as loadable modules, the chipset drivers
+can only be compiled into the kernel, and the core code (ide.c) can be
+compiled as a loadable module provided no chipset support is needed.
+
+When using ide.c as a module in combination with kmod, add::
+
+	alias block-major-3 ide-probe
+
+to a configuration file in /etc/modprobe.d/.
+
+When ide.c is used as a module, you can pass command line parameters to the
+driver using the "options=" keyword to insmod, while replacing any ',' with
+';'.
+
+
+Summary of ide driver parameters for kernel command line
+========================================================
+
+For legacy IDE VLB host drivers (ali14xx/dtc2278/ht6560b/qd65xx/umc8672)
+you need to explicitly enable probing by using "probe" kernel parameter,
+i.e. to enable probing for ALI M14xx chipsets (ali14xx host driver) use:
+
+* "ali14xx.probe" boot option when ali14xx driver is built-in the kernel
+
+* "probe" module parameter when ali14xx driver is compiled as module
+  ("modprobe ali14xx probe")
+
+Also for legacy CMD640 host driver (cmd640) you need to use "probe_vlb"
+kernel paremeter to enable probing for VLB version of the chipset (PCI ones
+are detected automatically).
+
+You also need to use "probe" kernel parameter for ide-4drives driver
+(support for IDE generic chipset with four drives on one port).
+
+To enable support for IDE doublers on Amiga use "doubler" kernel parameter
+for gayle host driver (i.e. "gayle.doubler" if the driver is built-in).
+
+To force ignoring cable detection (this should be needed only if you're using
+short 40-wires cable which cannot be automatically detected - if this is not
+a case please report it as a bug instead) use "ignore_cable" kernel parameter:
+
+* "ide_core.ignore_cable=[interface_number]" boot option if IDE is built-in
+  (i.e. "ide_core.ignore_cable=1" to force ignoring cable for "ide1")
+
+* "ignore_cable=[interface_number]" module parameter (for ide_core module)
+  if IDE is compiled as module
+
+Other kernel parameters for ide_core are:
+
+* "nodma=[interface_number.device_number]" to disallow DMA for a device
+
+* "noflush=[interface_number.device_number]" to disable flush requests
+
+* "nohpa=[interface_number.device_number]" to disable Host Protected Area
+
+* "noprobe=[interface_number.device_number]" to skip probing
+
+* "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit
+
+* "cdrom=[interface_number.device_number]" to force device as a CD-ROM
+
+* "chs=[interface_number.device_number]" to force device as a disk (using CHS)
+
+
+Some Terminology
+================
+
+IDE
+  Integrated Drive Electronics, meaning that each drive has a built-in
+  controller, which is why an "IDE interface card" is not a "controller card".
+
+ATA
+  AT (the old IBM 286 computer) Attachment Interface, a draft American
+  National Standard for connecting hard drives to PCs.  This is the official
+  name for "IDE".
+
+  The latest standards define some enhancements, known as the ATA-6 spec,
+  which grew out of vendor-specific "Enhanced IDE" (EIDE) implementations.
+
+ATAPI
+  ATA Packet Interface, a new protocol for controlling the drives,
+  similar to SCSI protocols, created at the same time as the ATA2 standard.
+  ATAPI is currently used for controlling CDROM, TAPE and FLOPPY (ZIP or
+  LS120/240) devices, removable R/W cartridges, and for high capacity hard disk
+  drives.
+
+mlord@pobox.com
+
+
+Wed Apr 17 22:52:44 CEST 2002 edited by Marcin Dalecki, the current
+maintainer.
+
+Wed Aug 20 22:31:29 CEST 2003 updated ide boot options to current ide.c
+comments at 2.6.0-test4 time. Maciej Soltysiak <solt@dns.toxicfilms.tv>
diff --git a/Documentation/ide/ide.txt b/Documentation/ide/ide.txt
deleted file mode 100644
index 7aca987c23d9..000000000000
--- a/Documentation/ide/ide.txt
+++ /dev/null
@@ -1,256 +0,0 @@
-
-	Information regarding the Enhanced IDE drive in Linux 2.6
-
-==============================================================================
-
-
-   The hdparm utility can be used to control various IDE features on a
-   running system. It is packaged separately.  Please Look for it on popular
-   linux FTP sites.
-
-
-
-***  IMPORTANT NOTICES:  BUGGY IDE CHIPSETS CAN CORRUPT DATA!!
-***  =================
-***  PCI versions of the CMD640 and RZ1000 interfaces are now detected
-***  automatically at startup when PCI BIOS support is configured.
-***
-***  Linux disables the "prefetch" ("readahead") mode of the RZ1000
-***  to prevent data corruption possible due to hardware design flaws.
-***
-***  For the CMD640, linux disables "IRQ unmasking" (hdparm -u1) on any
-***  drive for which the "prefetch" mode of the CMD640 is turned on.
-***  If "prefetch" is disabled (hdparm -p8), then "IRQ unmasking" can be
-***  used again.
-***
-***  For the CMD640, linux disables "32bit I/O" (hdparm -c1) on any drive
-***  for which the "prefetch" mode of the CMD640 is turned off.
-***  If "prefetch" is enabled (hdparm -p9), then "32bit I/O" can be
-***  used again.
-***
-***  The CMD640 is also used on some Vesa Local Bus (VLB) cards, and is *NOT*
-***  automatically detected by Linux.  For safe, reliable operation with such
-***  interfaces, one *MUST* use the "cmd640.probe_vlb" kernel option.
-***
-***  Use of the "serialize" option is no longer necessary.
-
-================================================================================
-Common pitfalls:
-
-- 40-conductor IDE cables are capable of transferring data in DMA modes up to
-  udma2, but no faster.
-
-- If possible devices should be attached to separate channels if they are
-  available. Typically the disk on the first and CD-ROM on the second.
-
-- If you mix devices on the same cable, please consider using similar devices
-  in respect of the data transfer mode they support.
-
-- Even better try to stick to the same vendor and device type on the same
-  cable.
-
-================================================================================
-
-This is the multiple IDE interface driver, as evolved from hd.c.
-
-It supports up to 9 IDE interfaces per default, on one or more IRQs (usually
-14 & 15).  There can be up to two drives per interface, as per the ATA-6 spec.
-
-Primary:    ide0, port 0x1f0; major=3;  hda is minor=0; hdb is minor=64
-Secondary:  ide1, port 0x170; major=22; hdc is minor=0; hdd is minor=64
-Tertiary:   ide2, port 0x1e8; major=33; hde is minor=0; hdf is minor=64
-Quaternary: ide3, port 0x168; major=34; hdg is minor=0; hdh is minor=64
-fifth..     ide4, usually PCI, probed
-sixth..     ide5, usually PCI, probed
-
-To access devices on interfaces > ide0, device entries please make sure that
-device files for them are present in /dev.  If not, please create such
-entries, by using /dev/MAKEDEV.
-
-This driver automatically probes for most IDE interfaces (including all PCI
-ones), for the drives/geometries attached to those interfaces, and for the IRQ
-lines being used by the interfaces (normally 14, 15 for ide0/ide1).
-
-Any number of interfaces may share a single IRQ if necessary, at a slight
-performance penalty, whether on separate cards or a single VLB card.
-The IDE driver automatically detects and handles this.  However, this may
-or may not be harmful to your hardware.. two or more cards driving the same IRQ
-can potentially burn each other's bus driver, though in practice this
-seldom occurs.  Be careful, and if in doubt, don't do it!
-
-Drives are normally found by auto-probing and/or examining the CMOS/BIOS data.
-For really weird situations, the apparent (fdisk) geometry can also be specified
-on the kernel "command line" using LILO.  The format of such lines is:
-
-	ide_core.chs=[interface_number.device_number]:cyls,heads,sects
-or	ide_core.cdrom=[interface_number.device_number]
-
-For example:
-
-	ide_core.chs=1.0:1050,32,64  ide_core.cdrom=1.1
-
-The results of successful auto-probing may override the physical geometry/irq
-specified, though the "original" geometry may be retained as the "logical"
-geometry for partitioning purposes (fdisk).
-
-If the auto-probing during boot time confuses a drive (ie. the drive works
-with hd.c but not with ide.c), then an command line option may be specified
-for each drive for which you'd like the drive to skip the hardware
-probe/identification sequence.  For example:
-
-	ide_core.noprobe=0.1
-or
-	ide_core.chs=1.0:768,16,32
-	ide_core.noprobe=1.0
-
-Note that when only one IDE device is attached to an interface, it should be
-jumpered as "single" or "master", *not* "slave".  Many folks have had
-"trouble" with cdroms because of this requirement, so the driver now probes
-for both units, though success is more likely when the drive is jumpered
-correctly.
-
-Courtesy of Scott Snyder and others, the driver supports ATAPI cdrom drives
-such as the NEC-260 and the new MITSUMI triple/quad speed drives.
-Such drives will be identified at boot time, just like a hard disk.
-
-If for some reason your cdrom drive is *not* found at boot time, you can force
-the probe to look harder by supplying a kernel command line parameter
-via LILO, such as:
-
-	ide_core.cdrom=1.0	/* "master" on second interface (hdc) */
-or
-	ide_core.cdrom=1.1	/* "slave" on second interface (hdd) */
-
-For example, a GW2000 system might have a hard drive on the primary
-interface (/dev/hda) and an IDE cdrom drive on the secondary interface
-(/dev/hdc).  To mount a CD in the cdrom drive, one would use something like:
-
-	ln -sf /dev/hdc /dev/cdrom
-	mkdir /mnt/cdrom
-	mount /dev/cdrom /mnt/cdrom -t iso9660 -o ro
-
-If, after doing all of the above, mount doesn't work and you see
-errors from the driver (with dmesg) complaining about `status=0xff',
-this means that the hardware is not responding to the driver's attempts
-to read it.  One of the following is probably the problem:
-
-  - Your hardware is broken.
-
-  - You are using the wrong address for the device, or you have the
-    drive jumpered wrong.  Review the configuration instructions above.
-
-  - Your IDE controller requires some nonstandard initialization sequence
-    before it will work properly.  If this is the case, there will often
-    be a separate MS-DOS driver just for the controller.  IDE interfaces
-    on sound cards usually fall into this category.  Such configurations
-    can often be made to work by first booting MS-DOS, loading the
-    appropriate drivers, and then warm-booting linux (without powering
-    off).  This can be automated using loadlin in the MS-DOS autoexec.
-
-If you always get timeout errors, interrupts from the drive are probably
-not making it to the host.  Check how you have the hardware jumpered
-and make sure it matches what the driver expects (see the configuration
-instructions above).  If you have a PCI system, also check the BIOS
-setup; I've had one report of a system which was shipped with IRQ 15
-disabled by the BIOS.
-
-The kernel is able to execute binaries directly off of the cdrom,
-provided it is mounted with the default block size of 1024 (as above).
-
-Please pass on any feedback on any of this stuff to the maintainer,
-whose address can be found in linux/MAINTAINERS.
-
-The IDE driver is modularized.  The high level disk/CD-ROM/tape/floppy
-drivers can always be compiled as loadable modules, the chipset drivers
-can only be compiled into the kernel, and the core code (ide.c) can be
-compiled as a loadable module provided no chipset support is needed.
-
-When using ide.c as a module in combination with kmod, add:
-
-	alias block-major-3 ide-probe
-
-to a configuration file in /etc/modprobe.d/.
-
-When ide.c is used as a module, you can pass command line parameters to the
-driver using the "options=" keyword to insmod, while replacing any ',' with
-';'.
-
-
-================================================================================
-
-Summary of ide driver parameters for kernel command line
---------------------------------------------------------
-
-For legacy IDE VLB host drivers (ali14xx/dtc2278/ht6560b/qd65xx/umc8672)
-you need to explicitly enable probing by using "probe" kernel parameter,
-i.e. to enable probing for ALI M14xx chipsets (ali14xx host driver) use:
-
-* "ali14xx.probe" boot option when ali14xx driver is built-in the kernel
-
-* "probe" module parameter when ali14xx driver is compiled as module
-  ("modprobe ali14xx probe")
-
-Also for legacy CMD640 host driver (cmd640) you need to use "probe_vlb"
-kernel paremeter to enable probing for VLB version of the chipset (PCI ones
-are detected automatically).
-
-You also need to use "probe" kernel parameter for ide-4drives driver
-(support for IDE generic chipset with four drives on one port).
-
-To enable support for IDE doublers on Amiga use "doubler" kernel parameter
-for gayle host driver (i.e. "gayle.doubler" if the driver is built-in).
-
-To force ignoring cable detection (this should be needed only if you're using
-short 40-wires cable which cannot be automatically detected - if this is not
-a case please report it as a bug instead) use "ignore_cable" kernel parameter:
-
-* "ide_core.ignore_cable=[interface_number]" boot option if IDE is built-in
-  (i.e. "ide_core.ignore_cable=1" to force ignoring cable for "ide1")
-
-* "ignore_cable=[interface_number]" module parameter (for ide_core module)
-  if IDE is compiled as module
-
-Other kernel parameters for ide_core are:
-
-* "nodma=[interface_number.device_number]" to disallow DMA for a device
-
-* "noflush=[interface_number.device_number]" to disable flush requests
-
-* "nohpa=[interface_number.device_number]" to disable Host Protected Area
-
-* "noprobe=[interface_number.device_number]" to skip probing
-
-* "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit
-
-* "cdrom=[interface_number.device_number]" to force device as a CD-ROM
-
-* "chs=[interface_number.device_number]" to force device as a disk (using CHS)
-
-================================================================================
-
-Some Terminology
-----------------
-IDE = Integrated Drive Electronics, meaning that each drive has a built-in
-controller, which is why an "IDE interface card" is not a "controller card".
-
-ATA = AT (the old IBM 286 computer) Attachment Interface, a draft American
-National Standard for connecting hard drives to PCs.  This is the official
-name for "IDE".
-
-The latest standards define some enhancements, known as the ATA-6 spec,
-which grew out of vendor-specific "Enhanced IDE" (EIDE) implementations.
-
-ATAPI = ATA Packet Interface, a new protocol for controlling the drives,
-similar to SCSI protocols, created at the same time as the ATA2 standard.
-ATAPI is currently used for controlling CDROM, TAPE and FLOPPY (ZIP or
-LS120/240) devices, removable R/W cartridges, and for high capacity hard disk
-drives.
-
-mlord@pobox.com
---
-
-Wed Apr 17 22:52:44 CEST 2002 edited by Marcin Dalecki, the current
-maintainer.
-
-Wed Aug 20 22:31:29 CEST 2003 updated ide boot options to current ide.c
-comments at 2.6.0-test4 time. Maciej Soltysiak <solt@dns.toxicfilms.tv>
diff --git a/Documentation/ide/index.rst b/Documentation/ide/index.rst
new file mode 100644
index 000000000000..813dfe611a31
--- /dev/null
+++ b/Documentation/ide/index.rst
@@ -0,0 +1,21 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==================================
+Integrated Drive Electronics (IDE)
+==================================
+
+.. toctree::
+    :maxdepth: 1
+
+    ide
+    ide-tape
+    warm-plug-howto
+
+    changelogs
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/ide/warm-plug-howto.rst b/Documentation/ide/warm-plug-howto.rst
new file mode 100644
index 000000000000..c245242ef2f1
--- /dev/null
+++ b/Documentation/ide/warm-plug-howto.rst
@@ -0,0 +1,18 @@
+===================
+IDE warm-plug HOWTO
+===================
+
+To warm-plug devices on a port 'idex'::
+
+	# echo -n "1" > /sys/class/ide_port/idex/delete_devices
+
+unplug old device(s) and plug new device(s)::
+
+	# echo -n "1" > /sys/class/ide_port/idex/scan
+
+done
+
+NOTE: please make sure that partitions are unmounted and that there are
+no other active references to devices before doing "delete_devices" step,
+also do not attempt "scan" step on devices currently in use -- otherwise
+results may be unpredictable and lead to data loss if you're unlucky
diff --git a/Documentation/ide/warm-plug-howto.txt b/Documentation/ide/warm-plug-howto.txt
deleted file mode 100644
index 98152bcd515a..000000000000
--- a/Documentation/ide/warm-plug-howto.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-
-IDE warm-plug HOWTO
-===================
-
-To warm-plug devices on a port 'idex':
-
-# echo -n "1" > /sys/class/ide_port/idex/delete_devices
-
-unplug old device(s) and plug new device(s)
-
-# echo -n "1" > /sys/class/ide_port/idex/scan
-
-done
-
-NOTE: please make sure that partitions are unmounted and that there are
-no other active references to devices before doing "delete_devices" step,
-also do not attempt "scan" step on devices currently in use -- otherwise
-results may be unpredictable and lead to data loss if you're unlucky
diff --git a/Documentation/iio/ep93xx_adc.rst b/Documentation/iio/ep93xx_adc.rst
new file mode 100644
index 000000000000..4fd8dea3f6b8
--- /dev/null
+++ b/Documentation/iio/ep93xx_adc.rst
@@ -0,0 +1,40 @@
+==============================
+Cirrus Logic EP93xx ADC driver
+==============================
+
+1. Overview
+===========
+
+The driver is intended to work on both low-end (EP9301, EP9302) devices with
+5-channel ADC and high-end (EP9307, EP9312, EP9315) devices with 10-channel
+touchscreen/ADC module.
+
+2. Channel numbering
+====================
+
+Numbering scheme for channels 0..4 is defined in EP9301 and EP9302 datasheets.
+EP9307, EP9312 and EP9312 have 3 channels more (total 8), but the numbering is
+not defined. So the last three are numbered randomly, let's say.
+
+Assuming ep93xx_adc is IIO device0, you'd find the following entries under
+/sys/bus/iio/devices/iio:device0/:
+
+  +-----------------+---------------+
+  | sysfs entry     | ball/pin name |
+  +=================+===============+
+  | in_voltage0_raw | YM            |
+  +-----------------+---------------+
+  | in_voltage1_raw | SXP           |
+  +-----------------+---------------+
+  | in_voltage2_raw | SXM           |
+  +-----------------+---------------+
+  | in_voltage3_raw | SYP           |
+  +-----------------+---------------+
+  | in_voltage4_raw | SYM           |
+  +-----------------+---------------+
+  | in_voltage5_raw | XP            |
+  +-----------------+---------------+
+  | in_voltage6_raw | XM            |
+  +-----------------+---------------+
+  | in_voltage7_raw | YP            |
+  +-----------------+---------------+
diff --git a/Documentation/iio/ep93xx_adc.txt b/Documentation/iio/ep93xx_adc.txt
deleted file mode 100644
index 23053e7817bd..000000000000
--- a/Documentation/iio/ep93xx_adc.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-Cirrus Logic EP93xx ADC driver.
-
-1. Overview
-
-The driver is intended to work on both low-end (EP9301, EP9302) devices with
-5-channel ADC and high-end (EP9307, EP9312, EP9315) devices with 10-channel
-touchscreen/ADC module.
-
-2. Channel numbering
-
-Numbering scheme for channels 0..4 is defined in EP9301 and EP9302 datasheets.
-EP9307, EP9312 and EP9312 have 3 channels more (total 8), but the numbering is
-not defined. So the last three are numbered randomly, let's say.
-
-Assuming ep93xx_adc is IIO device0, you'd find the following entries under
-/sys/bus/iio/devices/iio:device0/:
-
-  +-----------------+---------------+
-  | sysfs entry     | ball/pin name |
-  +-----------------+---------------+
-  | in_voltage0_raw | YM            |
-  | in_voltage1_raw | SXP           |
-  | in_voltage2_raw | SXM           |
-  | in_voltage3_raw | SYP           |
-  | in_voltage4_raw | SYM           |
-  | in_voltage5_raw | XP            |
-  | in_voltage6_raw | XM            |
-  | in_voltage7_raw | YP            |
-  +-----------------+---------------+
diff --git a/Documentation/iio/iio_configfs.rst b/Documentation/iio/iio_configfs.rst
new file mode 100644
index 000000000000..ecbfdb3afef7
--- /dev/null
+++ b/Documentation/iio/iio_configfs.rst
@@ -0,0 +1,101 @@
+===============================
+Industrial IIO configfs support
+===============================
+
+1. Overview
+===========
+
+Configfs is a filesystem-based manager of kernel objects. IIO uses some
+objects that could be easily configured using configfs (e.g.: devices,
+triggers).
+
+See Documentation/filesystems/configfs/configfs.txt for more information
+about how configfs works.
+
+2. Usage
+========
+
+In order to use configfs support in IIO we need to select it at compile
+time via CONFIG_IIO_CONFIGFS config option.
+
+Then, mount the configfs filesystem (usually under /config directory)::
+
+  $ mkdir /config
+  $ mount -t configfs none /config
+
+At this point, all default IIO groups will be created and can be accessed
+under /config/iio. Next chapters will describe available IIO configuration
+objects.
+
+3. Software triggers
+====================
+
+One of the IIO default configfs groups is the "triggers" group. It is
+automagically accessible when the configfs is mounted and can be found
+under /config/iio/triggers.
+
+IIO software triggers implementation offers support for creating multiple
+trigger types. A new trigger type is usually implemented as a separate
+kernel module following the interface in include/linux/iio/sw_trigger.h::
+
+  /*
+   * drivers/iio/trigger/iio-trig-sample.c
+   * sample kernel module implementing a new trigger type
+   */
+  #include <linux/iio/sw_trigger.h>
+
+
+  static struct iio_sw_trigger *iio_trig_sample_probe(const char *name)
+  {
+	/*
+	 * This allocates and registers an IIO trigger plus other
+	 * trigger type specific initialization.
+	 */
+  }
+
+  static int iio_trig_hrtimer_remove(struct iio_sw_trigger *swt)
+  {
+	/*
+	 * This undoes the actions in iio_trig_sample_probe
+	 */
+  }
+
+  static const struct iio_sw_trigger_ops iio_trig_sample_ops = {
+	.probe		= iio_trig_sample_probe,
+	.remove		= iio_trig_sample_remove,
+  };
+
+  static struct iio_sw_trigger_type iio_trig_sample = {
+	.name = "trig-sample",
+	.owner = THIS_MODULE,
+	.ops = &iio_trig_sample_ops,
+  };
+
+module_iio_sw_trigger_driver(iio_trig_sample);
+
+Each trigger type has its own directory under /config/iio/triggers. Loading
+iio-trig-sample module will create 'trig-sample' trigger type directory
+/config/iio/triggers/trig-sample.
+
+We support the following interrupt sources (trigger types):
+
+	* hrtimer, uses high resolution timers as interrupt source
+
+3.1 Hrtimer triggers creation and destruction
+---------------------------------------------
+
+Loading iio-trig-hrtimer module will register hrtimer trigger types allowing
+users to create hrtimer triggers under /config/iio/triggers/hrtimer.
+
+e.g::
+
+  $ mkdir /config/iio/triggers/hrtimer/instance1
+  $ rmdir /config/iio/triggers/hrtimer/instance1
+
+Each trigger can have one or more attributes specific to the trigger type.
+
+3.2 "hrtimer" trigger types attributes
+--------------------------------------
+
+"hrtimer" trigger type doesn't have any configurable attribute from /config dir.
+It does introduce the sampling_frequency attribute to trigger directory.
diff --git a/Documentation/iio/iio_configfs.txt b/Documentation/iio/iio_configfs.txt
deleted file mode 100644
index 4e5f101837a8..000000000000
--- a/Documentation/iio/iio_configfs.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-Industrial IIO configfs support
-
-1. Overview
-
-Configfs is a filesystem-based manager of kernel objects. IIO uses some
-objects that could be easily configured using configfs (e.g.: devices,
-triggers).
-
-See Documentation/filesystems/configfs/configfs.txt for more information
-about how configfs works.
-
-2. Usage
-
-In order to use configfs support in IIO we need to select it at compile
-time via CONFIG_IIO_CONFIGFS config option.
-
-Then, mount the configfs filesystem (usually under /config directory):
-
-$ mkdir /config
-$ mount -t configfs none /config
-
-At this point, all default IIO groups will be created and can be accessed
-under /config/iio. Next chapters will describe available IIO configuration
-objects.
-
-3. Software triggers
-
-One of the IIO default configfs groups is the "triggers" group. It is
-automagically accessible when the configfs is mounted and can be found
-under /config/iio/triggers.
-
-IIO software triggers implementation offers support for creating multiple
-trigger types. A new trigger type is usually implemented as a separate
-kernel module following the interface in include/linux/iio/sw_trigger.h:
-
-/*
- * drivers/iio/trigger/iio-trig-sample.c
- * sample kernel module implementing a new trigger type
- */
-#include <linux/iio/sw_trigger.h>
-
-
-static struct iio_sw_trigger *iio_trig_sample_probe(const char *name)
-{
-	/*
-	 * This allocates and registers an IIO trigger plus other
-	 * trigger type specific initialization.
-	 */
-}
-
-static int iio_trig_hrtimer_remove(struct iio_sw_trigger *swt)
-{
-	/*
-	 * This undoes the actions in iio_trig_sample_probe
-	 */
-}
-
-static const struct iio_sw_trigger_ops iio_trig_sample_ops = {
-	.probe		= iio_trig_sample_probe,
-	.remove		= iio_trig_sample_remove,
-};
-
-static struct iio_sw_trigger_type iio_trig_sample = {
-	.name = "trig-sample",
-	.owner = THIS_MODULE,
-	.ops = &iio_trig_sample_ops,
-};
-
-module_iio_sw_trigger_driver(iio_trig_sample);
-
-Each trigger type has its own directory under /config/iio/triggers. Loading
-iio-trig-sample module will create 'trig-sample' trigger type directory
-/config/iio/triggers/trig-sample.
-
-We support the following interrupt sources (trigger types):
-	* hrtimer, uses high resolution timers as interrupt source
-
-3.1 Hrtimer triggers creation and destruction
-
-Loading iio-trig-hrtimer module will register hrtimer trigger types allowing
-users to create hrtimer triggers under /config/iio/triggers/hrtimer.
-
-e.g:
-
-$ mkdir /config/iio/triggers/hrtimer/instance1
-$ rmdir /config/iio/triggers/hrtimer/instance1
-
-Each trigger can have one or more attributes specific to the trigger type.
-
-3.2 "hrtimer" trigger types attributes
-
-"hrtimer" trigger type doesn't have any configurable attribute from /config dir.
-It does introduce the sampling_frequency attribute to trigger directory.
diff --git a/Documentation/iio/index.rst b/Documentation/iio/index.rst
new file mode 100644
index 000000000000..58b7a4ebac51
--- /dev/null
+++ b/Documentation/iio/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============
+Industrial I/O
+==============
+
+.. toctree::
+   :maxdepth: 1
+
+   iio_configfs
+
+   ep93xx_adc
diff --git a/Documentation/index.rst b/Documentation/index.rst
index a7566ef62411..2df5a3da563c 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -1,3 +1,4 @@
+
 .. The Linux Kernel documentation master file, created by
    sphinx-quickstart on Fri Feb 12 13:51:46 2016.
    You can adapt this file completely to your liking, but it should at least
@@ -34,6 +35,7 @@ trying to get it to work optimally on a given system.
    :maxdepth: 2
 
    admin-guide/index
+   kbuild/index
 
 Firmware-related documentation
 ------------------------------
@@ -55,6 +57,7 @@ the kernel interface as seen by application developers.
    :maxdepth: 2
 
    userspace-api/index
+   ioctl/index
 
 
 Introduction to kernel development
@@ -75,6 +78,9 @@ merged much easier.
    kernel-hacking/index
    trace/index
    maintainer/index
+   fault-injection/index
+   livepatch/index
+
 
 Kernel API documentation
 ------------------------
@@ -90,8 +96,26 @@ needed).
 
    driver-api/index
    core-api/index
+   locking/index
+   accounting/index
+   block/index
+   cdrom/index
+   ide/index
+   fb/index
+   fpga/index
+   hid/index
+   iio/index
+   infiniband/index
+   leds/index
    media/index
+   netlabel/index
    networking/index
+   pcmcia/index
+   power/index
+   target/index
+   timers/index
+   watchdog/index
+   virtual/index
    input/index
    hwmon/index
    gpu/index
@@ -101,7 +125,11 @@ needed).
    filesystems/index
    vm/index
    bpf/index
+   usb/index
+   PCI/index
    misc-devices/index
+   mic/index
+   scheduler/index
 
 Architecture-specific documentation
 -----------------------------------
@@ -112,9 +140,18 @@ implementation.
 .. toctree::
    :maxdepth: 2
 
-   x86/index
    sh/index
+   arm/index
+   arm64/index
+   ia64/index
+   m68k/index
+   powerpc/index
+   riscv/index
+   s390/index
+   sh/index
+   sparc/index
    x86/index
+   xtensa/index
 
 Filesystem Documentation
 ------------------------
diff --git a/Documentation/infiniband/core_locking.rst b/Documentation/infiniband/core_locking.rst
new file mode 100644
index 000000000000..f34669beb4fe
--- /dev/null
+++ b/Documentation/infiniband/core_locking.rst
@@ -0,0 +1,118 @@
+===========================
+InfiniBand Midlayer Locking
+===========================
+
+  This guide is an attempt to make explicit the locking assumptions
+  made by the InfiniBand midlayer.  It describes the requirements on
+  both low-level drivers that sit below the midlayer and upper level
+  protocols that use the midlayer.
+
+Sleeping and interrupt context
+==============================
+
+  With the following exceptions, a low-level driver implementation of
+  all of the methods in struct ib_device may sleep.  The exceptions
+  are any methods from the list:
+
+    - create_ah
+    - modify_ah
+    - query_ah
+    - destroy_ah
+    - post_send
+    - post_recv
+    - poll_cq
+    - req_notify_cq
+    - map_phys_fmr
+
+  which may not sleep and must be callable from any context.
+
+  The corresponding functions exported to upper level protocol
+  consumers:
+
+    - ib_create_ah
+    - ib_modify_ah
+    - ib_query_ah
+    - ib_destroy_ah
+    - ib_post_send
+    - ib_post_recv
+    - ib_req_notify_cq
+    - ib_map_phys_fmr
+
+  are therefore safe to call from any context.
+
+  In addition, the function
+
+    - ib_dispatch_event
+
+  used by low-level drivers to dispatch asynchronous events through
+  the midlayer is also safe to call from any context.
+
+Reentrancy
+----------
+
+  All of the methods in struct ib_device exported by a low-level
+  driver must be fully reentrant.  The low-level driver is required to
+  perform all synchronization necessary to maintain consistency, even
+  if multiple function calls using the same object are run
+  simultaneously.
+
+  The IB midlayer does not perform any serialization of function calls.
+
+  Because low-level drivers are reentrant, upper level protocol
+  consumers are not required to perform any serialization.  However,
+  some serialization may be required to get sensible results.  For
+  example, a consumer may safely call ib_poll_cq() on multiple CPUs
+  simultaneously.  However, the ordering of the work completion
+  information between different calls of ib_poll_cq() is not defined.
+
+Callbacks
+---------
+
+  A low-level driver must not perform a callback directly from the
+  same callchain as an ib_device method call.  For example, it is not
+  allowed for a low-level driver to call a consumer's completion event
+  handler directly from its post_send method.  Instead, the low-level
+  driver should defer this callback by, for example, scheduling a
+  tasklet to perform the callback.
+
+  The low-level driver is responsible for ensuring that multiple
+  completion event handlers for the same CQ are not called
+  simultaneously.  The driver must guarantee that only one CQ event
+  handler for a given CQ is running at a time.  In other words, the
+  following situation is not allowed::
+
+          CPU1                                    CPU2
+
+    low-level driver ->
+      consumer CQ event callback:
+        /* ... */
+        ib_req_notify_cq(cq, ...);
+                                          low-level driver ->
+        /* ... */                           consumer CQ event callback:
+                                              /* ... */
+        return from CQ event handler
+
+  The context in which completion event and asynchronous event
+  callbacks run is not defined.  Depending on the low-level driver, it
+  may be process context, softirq context, or interrupt context.
+  Upper level protocol consumers may not sleep in a callback.
+
+Hot-plug
+--------
+
+  A low-level driver announces that a device is ready for use by
+  consumers when it calls ib_register_device(), all initialization
+  must be complete before this call.  The device must remain usable
+  until the driver's call to ib_unregister_device() has returned.
+
+  A low-level driver must call ib_register_device() and
+  ib_unregister_device() from process context.  It must not hold any
+  semaphores that could cause deadlock if a consumer calls back into
+  the driver across these calls.
+
+  An upper level protocol consumer may begin using an IB device as
+  soon as the add method of its struct ib_client is called for that
+  device.  A consumer must finish all cleanup and free all resources
+  relating to a device before returning from the remove method.
+
+  A consumer is permitted to sleep in its add and remove methods.
diff --git a/Documentation/infiniband/core_locking.txt b/Documentation/infiniband/core_locking.txt
deleted file mode 100644
index 4b1f36b6ada0..000000000000
--- a/Documentation/infiniband/core_locking.txt
+++ /dev/null
@@ -1,112 +0,0 @@
-INFINIBAND MIDLAYER LOCKING
-
-  This guide is an attempt to make explicit the locking assumptions
-  made by the InfiniBand midlayer.  It describes the requirements on
-  both low-level drivers that sit below the midlayer and upper level
-  protocols that use the midlayer.
-
-Sleeping and interrupt context
-
-  With the following exceptions, a low-level driver implementation of
-  all of the methods in struct ib_device may sleep.  The exceptions
-  are any methods from the list:
-
-    create_ah
-    modify_ah
-    query_ah
-    destroy_ah
-    post_send
-    post_recv
-    poll_cq
-    req_notify_cq
-    map_phys_fmr
-
-  which may not sleep and must be callable from any context.
-
-  The corresponding functions exported to upper level protocol
-  consumers:
-
-    ib_create_ah
-    ib_modify_ah
-    ib_query_ah
-    ib_destroy_ah
-    ib_post_send
-    ib_post_recv
-    ib_req_notify_cq
-    ib_map_phys_fmr
-
-  are therefore safe to call from any context.
-
-  In addition, the function
-
-    ib_dispatch_event
-
-  used by low-level drivers to dispatch asynchronous events through
-  the midlayer is also safe to call from any context.
-
-Reentrancy
-
-  All of the methods in struct ib_device exported by a low-level
-  driver must be fully reentrant.  The low-level driver is required to
-  perform all synchronization necessary to maintain consistency, even
-  if multiple function calls using the same object are run
-  simultaneously.
-
-  The IB midlayer does not perform any serialization of function calls.
-
-  Because low-level drivers are reentrant, upper level protocol
-  consumers are not required to perform any serialization.  However,
-  some serialization may be required to get sensible results.  For
-  example, a consumer may safely call ib_poll_cq() on multiple CPUs
-  simultaneously.  However, the ordering of the work completion
-  information between different calls of ib_poll_cq() is not defined.
-
-Callbacks
-
-  A low-level driver must not perform a callback directly from the
-  same callchain as an ib_device method call.  For example, it is not
-  allowed for a low-level driver to call a consumer's completion event
-  handler directly from its post_send method.  Instead, the low-level
-  driver should defer this callback by, for example, scheduling a
-  tasklet to perform the callback.
-
-  The low-level driver is responsible for ensuring that multiple
-  completion event handlers for the same CQ are not called
-  simultaneously.  The driver must guarantee that only one CQ event
-  handler for a given CQ is running at a time.  In other words, the
-  following situation is not allowed:
-
-        CPU1                                    CPU2
-
-  low-level driver ->
-    consumer CQ event callback:
-      /* ... */
-      ib_req_notify_cq(cq, ...);
-                                        low-level driver ->
-      /* ... */                           consumer CQ event callback:
-                                            /* ... */
-      return from CQ event handler
-
-  The context in which completion event and asynchronous event
-  callbacks run is not defined.  Depending on the low-level driver, it
-  may be process context, softirq context, or interrupt context.
-  Upper level protocol consumers may not sleep in a callback.
-
-Hot-plug
-
-  A low-level driver announces that a device is ready for use by
-  consumers when it calls ib_register_device(), all initialization
-  must be complete before this call.  The device must remain usable
-  until the driver's call to ib_unregister_device() has returned.
-
-  A low-level driver must call ib_register_device() and
-  ib_unregister_device() from process context.  It must not hold any
-  semaphores that could cause deadlock if a consumer calls back into
-  the driver across these calls.
-
-  An upper level protocol consumer may begin using an IB device as
-  soon as the add method of its struct ib_client is called for that
-  device.  A consumer must finish all cleanup and free all resources
-  relating to a device before returning from the remove method.
-
-  A consumer is permitted to sleep in its add and remove methods.
diff --git a/Documentation/infiniband/index.rst b/Documentation/infiniband/index.rst
new file mode 100644
index 000000000000..9cd7615438b9
--- /dev/null
+++ b/Documentation/infiniband/index.rst
@@ -0,0 +1,23 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+InfiniBand
+==========
+
+.. toctree::
+   :maxdepth: 1
+
+   core_locking
+   ipoib
+   opa_vnic
+   sysfs
+   tag_matching
+   user_mad
+   user_verbs
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/infiniband/ipoib.rst b/Documentation/infiniband/ipoib.rst
new file mode 100644
index 000000000000..0dd36154c0c9
--- /dev/null
+++ b/Documentation/infiniband/ipoib.rst
@@ -0,0 +1,115 @@
+==================
+IP over InfiniBand
+==================
+
+  The ib_ipoib driver is an implementation of the IP over InfiniBand
+  protocol as specified by RFC 4391 and 4392, issued by the IETF ipoib
+  working group.  It is a "native" implementation in the sense of
+  setting the interface type to ARPHRD_INFINIBAND and the hardware
+  address length to 20 (earlier proprietary implementations
+  masqueraded to the kernel as ethernet interfaces).
+
+Partitions and P_Keys
+=====================
+
+  When the IPoIB driver is loaded, it creates one interface for each
+  port using the P_Key at index 0.  To create an interface with a
+  different P_Key, write the desired P_Key into the main interface's
+  /sys/class/net/<intf name>/create_child file.  For example::
+
+    echo 0x8001 > /sys/class/net/ib0/create_child
+
+  This will create an interface named ib0.8001 with P_Key 0x8001.  To
+  remove a subinterface, use the "delete_child" file::
+
+    echo 0x8001 > /sys/class/net/ib0/delete_child
+
+  The P_Key for any interface is given by the "pkey" file, and the
+  main interface for a subinterface is in "parent."
+
+  Child interface create/delete can also be done using IPoIB's
+  rtnl_link_ops, where children created using either way behave the same.
+
+Datagram vs Connected modes
+===========================
+
+  The IPoIB driver supports two modes of operation: datagram and
+  connected.  The mode is set and read through an interface's
+  /sys/class/net/<intf name>/mode file.
+
+  In datagram mode, the IB UD (Unreliable Datagram) transport is used
+  and so the interface MTU has is equal to the IB L2 MTU minus the
+  IPoIB encapsulation header (4 bytes).  For example, in a typical IB
+  fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes.
+
+  In connected mode, the IB RC (Reliable Connected) transport is used.
+  Connected mode takes advantage of the connected nature of the IB
+  transport and allows an MTU up to the maximal IP packet size of 64K,
+  which reduces the number of IP packets needed for handling large UDP
+  datagrams, TCP segments, etc and increases the performance for large
+  messages.
+
+  In connected mode, the interface's UD QP is still used for multicast
+  and communication with peers that don't support connected mode. In
+  this case, RX emulation of ICMP PMTU packets is used to cause the
+  networking stack to use the smaller UD MTU for these neighbours.
+
+Stateless offloads
+==================
+
+  If the IB HW supports IPoIB stateless offloads, IPoIB advertises
+  TCP/IP checksum and/or Large Send (LSO) offloading capability to the
+  network stack.
+
+  Large Receive (LRO) offloading is also implemented and may be turned
+  on/off using ethtool calls.  Currently LRO is supported only for
+  checksum offload capable devices.
+
+  Stateless offloads are supported only in datagram mode.
+
+Interrupt moderation
+====================
+
+  If the underlying IB device supports CQ event moderation, one can
+  use ethtool to set interrupt mitigation parameters and thus reduce
+  the overhead incurred by handling interrupts.  The main code path of
+  IPoIB doesn't use events for TX completion signaling so only RX
+  moderation is supported.
+
+Debugging Information
+=====================
+
+  By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set
+  to 'y', tracing messages are compiled into the driver.  They are
+  turned on by setting the module parameters debug_level and
+  mcast_debug_level to 1.  These parameters can be controlled at
+  runtime through files in /sys/module/ib_ipoib/.
+
+  CONFIG_INFINIBAND_IPOIB_DEBUG also enables files in the debugfs
+  virtual filesystem.  By mounting this filesystem, for example with::
+
+    mount -t debugfs none /sys/kernel/debug
+
+  it is possible to get statistics about multicast groups from the
+  files /sys/kernel/debug/ipoib/ib0_mcg and so on.
+
+  The performance impact of this option is negligible, so it
+  is safe to enable this option with debug_level set to 0 for normal
+  operation.
+
+  CONFIG_INFINIBAND_IPOIB_DEBUG_DATA enables even more debug output in
+  the data path when data_debug_level is set to 1.  However, even with
+  the output disabled, enabling this configuration option will affect
+  performance, because it adds tests to the fast path.
+
+References
+==========
+
+  Transmission of IP over InfiniBand (IPoIB) (RFC 4391)
+    http://ietf.org/rfc/rfc4391.txt
+
+  IP over InfiniBand (IPoIB) Architecture (RFC 4392)
+    http://ietf.org/rfc/rfc4392.txt
+
+  IP over InfiniBand: Connected Mode (RFC 4755)
+    http://ietf.org/rfc/rfc4755.txt
diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt
deleted file mode 100644
index 47c1dd9818f2..000000000000
--- a/Documentation/infiniband/ipoib.txt
+++ /dev/null
@@ -1,105 +0,0 @@
-IP OVER INFINIBAND
-
-  The ib_ipoib driver is an implementation of the IP over InfiniBand
-  protocol as specified by RFC 4391 and 4392, issued by the IETF ipoib
-  working group.  It is a "native" implementation in the sense of
-  setting the interface type to ARPHRD_INFINIBAND and the hardware
-  address length to 20 (earlier proprietary implementations
-  masqueraded to the kernel as ethernet interfaces).
-
-Partitions and P_Keys
-
-  When the IPoIB driver is loaded, it creates one interface for each
-  port using the P_Key at index 0.  To create an interface with a
-  different P_Key, write the desired P_Key into the main interface's
-  /sys/class/net/<intf name>/create_child file.  For example:
-
-    echo 0x8001 > /sys/class/net/ib0/create_child
-
-  This will create an interface named ib0.8001 with P_Key 0x8001.  To
-  remove a subinterface, use the "delete_child" file:
-
-    echo 0x8001 > /sys/class/net/ib0/delete_child
-
-  The P_Key for any interface is given by the "pkey" file, and the
-  main interface for a subinterface is in "parent."
-
-  Child interface create/delete can also be done using IPoIB's
-  rtnl_link_ops, where children created using either way behave the same.
-
-Datagram vs Connected modes
-
-  The IPoIB driver supports two modes of operation: datagram and
-  connected.  The mode is set and read through an interface's
-  /sys/class/net/<intf name>/mode file.
-
-  In datagram mode, the IB UD (Unreliable Datagram) transport is used
-  and so the interface MTU has is equal to the IB L2 MTU minus the
-  IPoIB encapsulation header (4 bytes).  For example, in a typical IB
-  fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes.
-
-  In connected mode, the IB RC (Reliable Connected) transport is used.
-  Connected mode takes advantage of the connected nature of the IB
-  transport and allows an MTU up to the maximal IP packet size of 64K,
-  which reduces the number of IP packets needed for handling large UDP
-  datagrams, TCP segments, etc and increases the performance for large
-  messages.
-
-  In connected mode, the interface's UD QP is still used for multicast
-  and communication with peers that don't support connected mode. In
-  this case, RX emulation of ICMP PMTU packets is used to cause the
-  networking stack to use the smaller UD MTU for these neighbours.
-
-Stateless offloads
-
-  If the IB HW supports IPoIB stateless offloads, IPoIB advertises
-  TCP/IP checksum and/or Large Send (LSO) offloading capability to the
-  network stack.
-
-  Large Receive (LRO) offloading is also implemented and may be turned
-  on/off using ethtool calls.  Currently LRO is supported only for
-  checksum offload capable devices.
-
-  Stateless offloads are supported only in datagram mode.  
-
-Interrupt moderation
-
-  If the underlying IB device supports CQ event moderation, one can
-  use ethtool to set interrupt mitigation parameters and thus reduce
-  the overhead incurred by handling interrupts.  The main code path of
-  IPoIB doesn't use events for TX completion signaling so only RX
-  moderation is supported.
-
-Debugging Information
-
-  By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set
-  to 'y', tracing messages are compiled into the driver.  They are
-  turned on by setting the module parameters debug_level and
-  mcast_debug_level to 1.  These parameters can be controlled at
-  runtime through files in /sys/module/ib_ipoib/.
-
-  CONFIG_INFINIBAND_IPOIB_DEBUG also enables files in the debugfs
-  virtual filesystem.  By mounting this filesystem, for example with
-
-    mount -t debugfs none /sys/kernel/debug
-
-  it is possible to get statistics about multicast groups from the
-  files /sys/kernel/debug/ipoib/ib0_mcg and so on.
-
-  The performance impact of this option is negligible, so it
-  is safe to enable this option with debug_level set to 0 for normal
-  operation.
-
-  CONFIG_INFINIBAND_IPOIB_DEBUG_DATA enables even more debug output in
-  the data path when data_debug_level is set to 1.  However, even with
-  the output disabled, enabling this configuration option will affect
-  performance, because it adds tests to the fast path.
-
-References
-
-  Transmission of IP over InfiniBand (IPoIB) (RFC 4391)
-    http://ietf.org/rfc/rfc4391.txt 
-  IP over InfiniBand (IPoIB) Architecture (RFC 4392)
-    http://ietf.org/rfc/rfc4392.txt 
-  IP over InfiniBand: Connected Mode (RFC 4755)
-    http://ietf.org/rfc/rfc4755.txt
diff --git a/Documentation/infiniband/opa_vnic.rst b/Documentation/infiniband/opa_vnic.rst
new file mode 100644
index 000000000000..2f888d9ffec0
--- /dev/null
+++ b/Documentation/infiniband/opa_vnic.rst
@@ -0,0 +1,159 @@
+=================================================================
+Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC)
+=================================================================
+
+Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC) feature
+supports Ethernet functionality over Omni-Path fabric by encapsulating
+the Ethernet packets between HFI nodes.
+
+Architecture
+=============
+The patterns of exchanges of Omni-Path encapsulated Ethernet packets
+involves one or more virtual Ethernet switches overlaid on the Omni-Path
+fabric topology. A subset of HFI nodes on the Omni-Path fabric are
+permitted to exchange encapsulated Ethernet packets across a particular
+virtual Ethernet switch. The virtual Ethernet switches are logical
+abstractions achieved by configuring the HFI nodes on the fabric for
+header generation and processing. In the simplest configuration all HFI
+nodes across the fabric exchange encapsulated Ethernet packets over a
+single virtual Ethernet switch. A virtual Ethernet switch, is effectively
+an independent Ethernet network. The configuration is performed by an
+Ethernet Manager (EM) which is part of the trusted Fabric Manager (FM)
+application. HFI nodes can have multiple VNICs each connected to a
+different virtual Ethernet switch. The below diagram presents a case
+of two virtual Ethernet switches with two HFI nodes::
+
+                               +-------------------+
+                               |      Subnet/      |
+                               |     Ethernet      |
+                               |      Manager      |
+                               +-------------------+
+                                  /          /
+                                /           /
+                              /            /
+                            /             /
+  +-----------------------------+  +------------------------------+
+  |  Virtual Ethernet Switch    |  |  Virtual Ethernet Switch     |
+  |  +---------+    +---------+ |  | +---------+    +---------+   |
+  |  | VPORT   |    |  VPORT  | |  | |  VPORT  |    |  VPORT  |   |
+  +--+---------+----+---------+-+  +-+---------+----+---------+---+
+           |                 \        /                 |
+           |                   \    /                   |
+           |                     \/                     |
+           |                    /  \                    |
+           |                  /      \                  |
+       +-----------+------------+  +-----------+------------+
+       |   VNIC    |    VNIC    |  |    VNIC   |    VNIC    |
+       +-----------+------------+  +-----------+------------+
+       |          HFI           |  |          HFI           |
+       +------------------------+  +------------------------+
+
+
+The Omni-Path encapsulated Ethernet packet format is as described below.
+
+==================== ================================
+Bits                 Field
+==================== ================================
+Quad Word 0:
+0-19                 SLID (lower 20 bits)
+20-30                Length (in Quad Words)
+31                   BECN bit
+32-51                DLID (lower 20 bits)
+52-56                SC (Service Class)
+57-59                RC (Routing Control)
+60                   FECN bit
+61-62                L2 (=10, 16B format)
+63                   LT (=1, Link Transfer Head Flit)
+
+Quad Word 1:
+0-7                  L4 type (=0x78 ETHERNET)
+8-11                 SLID[23:20]
+12-15                DLID[23:20]
+16-31                PKEY
+32-47                Entropy
+48-63                Reserved
+
+Quad Word 2:
+0-15                 Reserved
+16-31                L4 header
+32-63                Ethernet Packet
+
+Quad Words 3 to N-1:
+0-63                 Ethernet packet (pad extended)
+
+Quad Word N (last):
+0-23                 Ethernet packet (pad extended)
+24-55                ICRC
+56-61                Tail
+62-63                LT (=01, Link Transfer Tail Flit)
+==================== ================================
+
+Ethernet packet is padded on the transmit side to ensure that the VNIC OPA
+packet is quad word aligned. The 'Tail' field contains the number of bytes
+padded. On the receive side the 'Tail' field is read and the padding is
+removed (along with ICRC, Tail and OPA header) before passing packet up
+the network stack.
+
+The L4 header field contains the virtual Ethernet switch id the VNIC port
+belongs to. On the receive side, this field is used to de-multiplex the
+received VNIC packets to different VNIC ports.
+
+Driver Design
+==============
+Intel OPA VNIC software design is presented in the below diagram.
+OPA VNIC functionality has a HW dependent component and a HW
+independent component.
+
+The support has been added for IB device to allocate and free the RDMA
+netdev devices. The RDMA netdev supports interfacing with the network
+stack thus creating standard network interfaces. OPA_VNIC is an RDMA
+netdev device type.
+
+The HW dependent VNIC functionality is part of the HFI1 driver. It
+implements the verbs to allocate and free the OPA_VNIC RDMA netdev.
+It involves HW resource allocation/management for VNIC functionality.
+It interfaces with the network stack and implements the required
+net_device_ops functions. It expects Omni-Path encapsulated Ethernet
+packets in the transmit path and provides HW access to them. It strips
+the Omni-Path header from the received packets before passing them up
+the network stack. It also implements the RDMA netdev control operations.
+
+The OPA VNIC module implements the HW independent VNIC functionality.
+It consists of two parts. The VNIC Ethernet Management Agent (VEMA)
+registers itself with IB core as an IB client and interfaces with the
+IB MAD stack. It exchanges the management information with the Ethernet
+Manager (EM) and the VNIC netdev. The VNIC netdev part allocates and frees
+the OPA_VNIC RDMA netdev devices. It overrides the net_device_ops functions
+set by HW dependent VNIC driver where required to accommodate any control
+operation. It also handles the encapsulation of Ethernet packets with an
+Omni-Path header in the transmit path. For each VNIC interface, the
+information required for encapsulation is configured by the EM via VEMA MAD
+interface. It also passes any control information to the HW dependent driver
+by invoking the RDMA netdev control operations::
+
+        +-------------------+ +----------------------+
+        |                   | |       Linux          |
+        |     IB MAD        | |      Network         |
+        |                   | |       Stack          |
+        +-------------------+ +----------------------+
+                 |               |          |
+                 |               |          |
+        +----------------------------+      |
+        |                            |      |
+        |      OPA VNIC Module       |      |
+        |  (OPA VNIC RDMA Netdev     |      |
+        |     & EMA functions)       |      |
+        |                            |      |
+        +----------------------------+      |
+                    |                       |
+                    |                       |
+           +------------------+             |
+           |     IB core      |             |
+           +------------------+             |
+                    |                       |
+                    |                       |
+        +--------------------------------------------+
+        |                                            |
+        |      HFI1 Driver with VNIC support         |
+        |                                            |
+        +--------------------------------------------+
diff --git a/Documentation/infiniband/opa_vnic.txt b/Documentation/infiniband/opa_vnic.txt
deleted file mode 100644
index 282e17be798a..000000000000
--- a/Documentation/infiniband/opa_vnic.txt
+++ /dev/null
@@ -1,153 +0,0 @@
-Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC) feature
-supports Ethernet functionality over Omni-Path fabric by encapsulating
-the Ethernet packets between HFI nodes.
-
-Architecture
-=============
-The patterns of exchanges of Omni-Path encapsulated Ethernet packets
-involves one or more virtual Ethernet switches overlaid on the Omni-Path
-fabric topology. A subset of HFI nodes on the Omni-Path fabric are
-permitted to exchange encapsulated Ethernet packets across a particular
-virtual Ethernet switch. The virtual Ethernet switches are logical
-abstractions achieved by configuring the HFI nodes on the fabric for
-header generation and processing. In the simplest configuration all HFI
-nodes across the fabric exchange encapsulated Ethernet packets over a
-single virtual Ethernet switch. A virtual Ethernet switch, is effectively
-an independent Ethernet network. The configuration is performed by an
-Ethernet Manager (EM) which is part of the trusted Fabric Manager (FM)
-application. HFI nodes can have multiple VNICs each connected to a
-different virtual Ethernet switch. The below diagram presents a case
-of two virtual Ethernet switches with two HFI nodes.
-
-                             +-------------------+
-                             |      Subnet/      |
-                             |     Ethernet      |
-                             |      Manager      |
-                             +-------------------+
-                                /          /
-                              /           /
-                            /            /
-                          /             /
-+-----------------------------+  +------------------------------+
-|  Virtual Ethernet Switch    |  |  Virtual Ethernet Switch     |
-|  +---------+    +---------+ |  | +---------+    +---------+   |
-|  | VPORT   |    |  VPORT  | |  | |  VPORT  |    |  VPORT  |   |
-+--+---------+----+---------+-+  +-+---------+----+---------+---+
-         |                 \        /                 |
-         |                   \    /                   |
-         |                     \/                     |
-         |                    /  \                    |
-         |                  /      \                  |
-     +-----------+------------+  +-----------+------------+
-     |   VNIC    |    VNIC    |  |    VNIC   |    VNIC    |
-     +-----------+------------+  +-----------+------------+
-     |          HFI           |  |          HFI           |
-     +------------------------+  +------------------------+
-
-
-The Omni-Path encapsulated Ethernet packet format is as described below.
-
-Bits          Field
-------------------------------------
-Quad Word 0:
-0-19      SLID (lower 20 bits)
-20-30     Length (in Quad Words)
-31        BECN bit
-32-51     DLID (lower 20 bits)
-52-56     SC (Service Class)
-57-59     RC (Routing Control)
-60        FECN bit
-61-62     L2 (=10, 16B format)
-63        LT (=1, Link Transfer Head Flit)
-
-Quad Word 1:
-0-7       L4 type (=0x78 ETHERNET)
-8-11      SLID[23:20]
-12-15     DLID[23:20]
-16-31     PKEY
-32-47     Entropy
-48-63     Reserved
-
-Quad Word 2:
-0-15      Reserved
-16-31     L4 header
-32-63     Ethernet Packet
-
-Quad Words 3 to N-1:
-0-63      Ethernet packet (pad extended)
-
-Quad Word N (last):
-0-23      Ethernet packet (pad extended)
-24-55     ICRC
-56-61     Tail
-62-63     LT (=01, Link Transfer Tail Flit)
-
-Ethernet packet is padded on the transmit side to ensure that the VNIC OPA
-packet is quad word aligned. The 'Tail' field contains the number of bytes
-padded. On the receive side the 'Tail' field is read and the padding is
-removed (along with ICRC, Tail and OPA header) before passing packet up
-the network stack.
-
-The L4 header field contains the virtual Ethernet switch id the VNIC port
-belongs to. On the receive side, this field is used to de-multiplex the
-received VNIC packets to different VNIC ports.
-
-Driver Design
-==============
-Intel OPA VNIC software design is presented in the below diagram.
-OPA VNIC functionality has a HW dependent component and a HW
-independent component.
-
-The support has been added for IB device to allocate and free the RDMA
-netdev devices. The RDMA netdev supports interfacing with the network
-stack thus creating standard network interfaces. OPA_VNIC is an RDMA
-netdev device type.
-
-The HW dependent VNIC functionality is part of the HFI1 driver. It
-implements the verbs to allocate and free the OPA_VNIC RDMA netdev.
-It involves HW resource allocation/management for VNIC functionality.
-It interfaces with the network stack and implements the required
-net_device_ops functions. It expects Omni-Path encapsulated Ethernet
-packets in the transmit path and provides HW access to them. It strips
-the Omni-Path header from the received packets before passing them up
-the network stack. It also implements the RDMA netdev control operations.
-
-The OPA VNIC module implements the HW independent VNIC functionality.
-It consists of two parts. The VNIC Ethernet Management Agent (VEMA)
-registers itself with IB core as an IB client and interfaces with the
-IB MAD stack. It exchanges the management information with the Ethernet
-Manager (EM) and the VNIC netdev. The VNIC netdev part allocates and frees
-the OPA_VNIC RDMA netdev devices. It overrides the net_device_ops functions
-set by HW dependent VNIC driver where required to accommodate any control
-operation. It also handles the encapsulation of Ethernet packets with an
-Omni-Path header in the transmit path. For each VNIC interface, the
-information required for encapsulation is configured by the EM via VEMA MAD
-interface. It also passes any control information to the HW dependent driver
-by invoking the RDMA netdev control operations.
-
-        +-------------------+ +----------------------+
-        |                   | |       Linux          |
-        |     IB MAD        | |      Network         |
-        |                   | |       Stack          |
-        +-------------------+ +----------------------+
-                 |               |          |
-                 |               |          |
-        +----------------------------+      |
-        |                            |      |
-        |      OPA VNIC Module       |      |
-        |  (OPA VNIC RDMA Netdev     |      |
-        |     & EMA functions)       |      |
-        |                            |      |
-        +----------------------------+      |
-                    |                       |
-                    |                       |
-           +------------------+             |
-           |     IB core      |             |
-           +------------------+             |
-                    |                       |
-                    |                       |
-        +--------------------------------------------+
-        |                                            |
-        |      HFI1 Driver with VNIC support         |
-        |                                            |
-        +--------------------------------------------+
diff --git a/Documentation/infiniband/sysfs.rst b/Documentation/infiniband/sysfs.rst
new file mode 100644
index 000000000000..f0abd6fa48f4
--- /dev/null
+++ b/Documentation/infiniband/sysfs.rst
@@ -0,0 +1,6 @@
+===========
+Sysfs files
+===========
+
+The sysfs interface has moved to
+Documentation/ABI/stable/sysfs-class-infiniband.
diff --git a/Documentation/infiniband/sysfs.txt b/Documentation/infiniband/sysfs.txt
deleted file mode 100644
index 9fab5062f84b..000000000000
--- a/Documentation/infiniband/sysfs.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-SYSFS FILES
-
-The sysfs interface has moved to
-Documentation/ABI/stable/sysfs-class-infiniband.
diff --git a/Documentation/infiniband/tag_matching.rst b/Documentation/infiniband/tag_matching.rst
new file mode 100644
index 000000000000..ef56ea585f92
--- /dev/null
+++ b/Documentation/infiniband/tag_matching.rst
@@ -0,0 +1,69 @@
+==================
+Tag matching logic
+==================
+
+The MPI standard defines a set of rules, known as tag-matching, for matching
+source send operations to destination receives.  The following parameters must
+match the following source and destination parameters:
+
+*	Communicator
+*	User tag - wild card may be specified by the receiver
+*	Source rank – wild car may be specified by the receiver
+*	Destination rank – wild
+
+The ordering rules require that when more than one pair of send and receive
+message envelopes may match, the pair that includes the earliest posted-send
+and the earliest posted-receive is the pair that must be used to satisfy the
+matching operation. However, this doesn’t imply that tags are consumed in
+the order they are created, e.g., a later generated tag may be consumed, if
+earlier tags can’t be used to satisfy the matching rules.
+
+When a message is sent from the sender to the receiver, the communication
+library may attempt to process the operation either after or before the
+corresponding matching receive is posted.  If a matching receive is posted,
+this is an expected message, otherwise it is called an unexpected message.
+Implementations frequently use different matching schemes for these two
+different matching instances.
+
+To keep MPI library memory footprint down, MPI implementations typically use
+two different protocols for this purpose:
+
+1.	The Eager protocol- the complete message is sent when the send is
+processed by the sender. A completion send is received in the send_cq
+notifying that the buffer can be reused.
+
+2.	The Rendezvous Protocol - the sender sends the tag-matching header,
+and perhaps a portion of data when first notifying the receiver. When the
+corresponding buffer is posted, the responder will use the information from
+the header to initiate an RDMA READ operation directly to the matching buffer.
+A fin message needs to be received in order for the buffer to be reused.
+
+Tag matching implementation
+===========================
+
+There are two types of matching objects used, the posted receive list and the
+unexpected message list. The application posts receive buffers through calls
+to the MPI receive routines in the posted receive list and posts send messages
+using the MPI send routines. The head of the posted receive list may be
+maintained by the hardware, with the software expected to shadow this list.
+
+When send is initiated and arrives at the receive side, if there is no
+pre-posted receive for this arriving message, it is passed to the software and
+placed in the unexpected message list. Otherwise the match is processed,
+including rendezvous processing, if appropriate, delivering the data to the
+specified receive buffer. This allows overlapping receive-side MPI tag
+matching with computation.
+
+When a receive-message is posted, the communication library will first check
+the software unexpected message list for a matching receive. If a match is
+found, data is delivered to the user buffer, using a software controlled
+protocol. The UCX implementation uses either an eager or rendezvous protocol,
+depending on data size. If no match is found, the entire pre-posted receive
+list is maintained by the hardware, and there is space to add one more
+pre-posted receive to this list, this receive is passed to the hardware.
+Software is expected to shadow this list, to help with processing MPI cancel
+operations. In addition, because hardware and software are not expected to be
+tightly synchronized with respect to the tag-matching operation, this shadow
+list is used to detect the case that a pre-posted receive is passed to the
+hardware, as the matching unexpected message is being passed from the hardware
+to the software.
diff --git a/Documentation/infiniband/tag_matching.txt b/Documentation/infiniband/tag_matching.txt
deleted file mode 100644
index d2a3bf819226..000000000000
--- a/Documentation/infiniband/tag_matching.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-Tag matching logic
-
-The MPI standard defines a set of rules, known as tag-matching, for matching
-source send operations to destination receives.  The following parameters must
-match the following source and destination parameters:
-*	Communicator
-*	User tag - wild card may be specified by the receiver
-*	Source rank – wild car may be specified by the receiver
-*	Destination rank – wild
-The ordering rules require that when more than one pair of send and receive
-message envelopes may match, the pair that includes the earliest posted-send
-and the earliest posted-receive is the pair that must be used to satisfy the
-matching operation. However, this doesn’t imply that tags are consumed in
-the order they are created, e.g., a later generated tag may be consumed, if
-earlier tags can’t be used to satisfy the matching rules.
-
-When a message is sent from the sender to the receiver, the communication
-library may attempt to process the operation either after or before the
-corresponding matching receive is posted.  If a matching receive is posted,
-this is an expected message, otherwise it is called an unexpected message.
-Implementations frequently use different matching schemes for these two
-different matching instances.
-
-To keep MPI library memory footprint down, MPI implementations typically use
-two different protocols for this purpose:
-
-1.	The Eager protocol- the complete message is sent when the send is
-processed by the sender. A completion send is received in the send_cq
-notifying that the buffer can be reused.
-
-2.	The Rendezvous Protocol - the sender sends the tag-matching header,
-and perhaps a portion of data when first notifying the receiver. When the
-corresponding buffer is posted, the responder will use the information from
-the header to initiate an RDMA READ operation directly to the matching buffer.
-A fin message needs to be received in order for the buffer to be reused.
-
-Tag matching implementation
-
-There are two types of matching objects used, the posted receive list and the
-unexpected message list. The application posts receive buffers through calls
-to the MPI receive routines in the posted receive list and posts send messages
-using the MPI send routines. The head of the posted receive list may be
-maintained by the hardware, with the software expected to shadow this list.
-
-When send is initiated and arrives at the receive side, if there is no
-pre-posted receive for this arriving message, it is passed to the software and
-placed in the unexpected message list. Otherwise the match is processed,
-including rendezvous processing, if appropriate, delivering the data to the
-specified receive buffer. This allows overlapping receive-side MPI tag
-matching with computation.
-
-When a receive-message is posted, the communication library will first check
-the software unexpected message list for a matching receive. If a match is
-found, data is delivered to the user buffer, using a software controlled
-protocol. The UCX implementation uses either an eager or rendezvous protocol,
-depending on data size. If no match is found, the entire pre-posted receive
-list is maintained by the hardware, and there is space to add one more
-pre-posted receive to this list, this receive is passed to the hardware.
-Software is expected to shadow this list, to help with processing MPI cancel
-operations. In addition, because hardware and software are not expected to be
-tightly synchronized with respect to the tag-matching operation, this shadow
-list is used to detect the case that a pre-posted receive is passed to the
-hardware, as the matching unexpected message is being passed from the hardware
-to the software.
diff --git a/Documentation/infiniband/user_mad.rst b/Documentation/infiniband/user_mad.rst
new file mode 100644
index 000000000000..d88abfc0e370
--- /dev/null
+++ b/Documentation/infiniband/user_mad.rst
@@ -0,0 +1,166 @@
+====================
+Userspace MAD access
+====================
+
+Device files
+============
+
+  Each port of each InfiniBand device has a "umad" device and an
+  "issm" device attached.  For example, a two-port HCA will have two
+  umad devices and two issm devices, while a switch will have one
+  device of each type (for switch port 0).
+
+Creating MAD agents
+===================
+
+  A MAD agent can be created by filling in a struct ib_user_mad_reg_req
+  and then calling the IB_USER_MAD_REGISTER_AGENT ioctl on a file
+  descriptor for the appropriate device file.  If the registration
+  request succeeds, a 32-bit id will be returned in the structure.
+  For example::
+
+	struct ib_user_mad_reg_req req = { /* ... */ };
+	ret = ioctl(fd, IB_USER_MAD_REGISTER_AGENT, (char *) &req);
+        if (!ret)
+		my_agent = req.id;
+	else
+		perror("agent register");
+
+  Agents can be unregistered with the IB_USER_MAD_UNREGISTER_AGENT
+  ioctl.  Also, all agents registered through a file descriptor will
+  be unregistered when the descriptor is closed.
+
+  2014
+       a new registration ioctl is now provided which allows additional
+       fields to be provided during registration.
+       Users of this registration call are implicitly setting the use of
+       pkey_index (see below).
+
+Receiving MADs
+==============
+
+  MADs are received using read().  The receive side now supports
+  RMPP. The buffer passed to read() must be at least one
+  struct ib_user_mad + 256 bytes. For example:
+
+  If the buffer passed is not large enough to hold the received
+  MAD (RMPP), the errno is set to ENOSPC and the length of the
+  buffer needed is set in mad.length.
+
+  Example for normal MAD (non RMPP) reads::
+
+	struct ib_user_mad *mad;
+	mad = malloc(sizeof *mad + 256);
+	ret = read(fd, mad, sizeof *mad + 256);
+	if (ret != sizeof mad + 256) {
+		perror("read");
+		free(mad);
+	}
+
+  Example for RMPP reads::
+
+	struct ib_user_mad *mad;
+	mad = malloc(sizeof *mad + 256);
+	ret = read(fd, mad, sizeof *mad + 256);
+	if (ret == -ENOSPC)) {
+		length = mad.length;
+		free(mad);
+		mad = malloc(sizeof *mad + length);
+		ret = read(fd, mad, sizeof *mad + length);
+	}
+	if (ret < 0) {
+		perror("read");
+		free(mad);
+	}
+
+  In addition to the actual MAD contents, the other struct ib_user_mad
+  fields will be filled in with information on the received MAD.  For
+  example, the remote LID will be in mad.lid.
+
+  If a send times out, a receive will be generated with mad.status set
+  to ETIMEDOUT.  Otherwise when a MAD has been successfully received,
+  mad.status will be 0.
+
+  poll()/select() may be used to wait until a MAD can be read.
+
+Sending MADs
+============
+
+  MADs are sent using write().  The agent ID for sending should be
+  filled into the id field of the MAD, the destination LID should be
+  filled into the lid field, and so on.  The send side does support
+  RMPP so arbitrary length MAD can be sent. For example::
+
+	struct ib_user_mad *mad;
+
+	mad = malloc(sizeof *mad + mad_length);
+
+	/* fill in mad->data */
+
+	mad->hdr.id  = my_agent;	/* req.id from agent registration */
+	mad->hdr.lid = my_dest;		/* in network byte order... */
+	/* etc. */
+
+	ret = write(fd, &mad, sizeof *mad + mad_length);
+	if (ret != sizeof *mad + mad_length)
+		perror("write");
+
+Transaction IDs
+===============
+
+  Users of the umad devices can use the lower 32 bits of the
+  transaction ID field (that is, the least significant half of the
+  field in network byte order) in MADs being sent to match
+  request/response pairs.  The upper 32 bits are reserved for use by
+  the kernel and will be overwritten before a MAD is sent.
+
+P_Key Index Handling
+====================
+
+  The old ib_umad interface did not allow setting the P_Key index for
+  MADs that are sent and did not provide a way for obtaining the P_Key
+  index of received MADs.  A new layout for struct ib_user_mad_hdr
+  with a pkey_index member has been defined; however, to preserve binary
+  compatibility with older applications, this new layout will not be used
+  unless one of IB_USER_MAD_ENABLE_PKEY or IB_USER_MAD_REGISTER_AGENT2 ioctl's
+  are called before a file descriptor is used for anything else.
+
+  In September 2008, the IB_USER_MAD_ABI_VERSION will be incremented
+  to 6, the new layout of struct ib_user_mad_hdr will be used by
+  default, and the IB_USER_MAD_ENABLE_PKEY ioctl will be removed.
+
+Setting IsSM Capability Bit
+===========================
+
+  To set the IsSM capability bit for a port, simply open the
+  corresponding issm device file.  If the IsSM bit is already set,
+  then the open call will block until the bit is cleared (or return
+  immediately with errno set to EAGAIN if the O_NONBLOCK flag is
+  passed to open()).  The IsSM bit will be cleared when the issm file
+  is closed.  No read, write or other operations can be performed on
+  the issm file.
+
+/dev files
+==========
+
+  To create the appropriate character device files automatically with
+  udev, a rule like::
+
+    KERNEL=="umad*", NAME="infiniband/%k"
+    KERNEL=="issm*", NAME="infiniband/%k"
+
+  can be used.  This will create device nodes named::
+
+    /dev/infiniband/umad0
+    /dev/infiniband/issm0
+
+  for the first port, and so on.  The InfiniBand device and port
+  associated with these devices can be determined from the files::
+
+    /sys/class/infiniband_mad/umad0/ibdev
+    /sys/class/infiniband_mad/umad0/port
+
+  and::
+
+    /sys/class/infiniband_mad/issm0/ibdev
+    /sys/class/infiniband_mad/issm0/port
diff --git a/Documentation/infiniband/user_mad.txt b/Documentation/infiniband/user_mad.txt
deleted file mode 100644
index 7aca13a54a3a..000000000000
--- a/Documentation/infiniband/user_mad.txt
+++ /dev/null
@@ -1,153 +0,0 @@
-USERSPACE MAD ACCESS
-
-Device files
-
-  Each port of each InfiniBand device has a "umad" device and an
-  "issm" device attached.  For example, a two-port HCA will have two
-  umad devices and two issm devices, while a switch will have one
-  device of each type (for switch port 0).
-
-Creating MAD agents
-
-  A MAD agent can be created by filling in a struct ib_user_mad_reg_req
-  and then calling the IB_USER_MAD_REGISTER_AGENT ioctl on a file
-  descriptor for the appropriate device file.  If the registration
-  request succeeds, a 32-bit id will be returned in the structure.
-  For example:
-
-	struct ib_user_mad_reg_req req = { /* ... */ };
-	ret = ioctl(fd, IB_USER_MAD_REGISTER_AGENT, (char *) &req);
-        if (!ret)
-		my_agent = req.id;
-	else
-		perror("agent register");
-
-  Agents can be unregistered with the IB_USER_MAD_UNREGISTER_AGENT
-  ioctl.  Also, all agents registered through a file descriptor will
-  be unregistered when the descriptor is closed.
-
-  2014 -- a new registration ioctl is now provided which allows additional
-       fields to be provided during registration.
-       Users of this registration call are implicitly setting the use of
-       pkey_index (see below).
-
-Receiving MADs
-
-  MADs are received using read().  The receive side now supports
-  RMPP. The buffer passed to read() must be at least one
-  struct ib_user_mad + 256 bytes. For example:
-
-  If the buffer passed is not large enough to hold the received
-  MAD (RMPP), the errno is set to ENOSPC and the length of the
-  buffer needed is set in mad.length.
-
-  Example for normal MAD (non RMPP) reads:
-	struct ib_user_mad *mad;
-	mad = malloc(sizeof *mad + 256);
-	ret = read(fd, mad, sizeof *mad + 256);
-	if (ret != sizeof mad + 256) {
-		perror("read");
-		free(mad);
-	}
-
-  Example for RMPP reads:
-	struct ib_user_mad *mad;
-	mad = malloc(sizeof *mad + 256);
-	ret = read(fd, mad, sizeof *mad + 256);
-	if (ret == -ENOSPC)) {
-		length = mad.length;
-		free(mad);
-		mad = malloc(sizeof *mad + length);
-		ret = read(fd, mad, sizeof *mad + length);
-	}
-	if (ret < 0) {
-		perror("read");
-		free(mad);
-	}
-
-  In addition to the actual MAD contents, the other struct ib_user_mad
-  fields will be filled in with information on the received MAD.  For
-  example, the remote LID will be in mad.lid.
-
-  If a send times out, a receive will be generated with mad.status set
-  to ETIMEDOUT.  Otherwise when a MAD has been successfully received,
-  mad.status will be 0.
-
-  poll()/select() may be used to wait until a MAD can be read.
-
-Sending MADs
-
-  MADs are sent using write().  The agent ID for sending should be
-  filled into the id field of the MAD, the destination LID should be
-  filled into the lid field, and so on.  The send side does support
-  RMPP so arbitrary length MAD can be sent. For example:
-
-	struct ib_user_mad *mad;
-
-	mad = malloc(sizeof *mad + mad_length);
-
-	/* fill in mad->data */
-
-	mad->hdr.id  = my_agent;	/* req.id from agent registration */
-	mad->hdr.lid = my_dest;		/* in network byte order... */
-	/* etc. */
-
-	ret = write(fd, &mad, sizeof *mad + mad_length);
-	if (ret != sizeof *mad + mad_length)
-		perror("write");
-
-Transaction IDs
-
-  Users of the umad devices can use the lower 32 bits of the
-  transaction ID field (that is, the least significant half of the
-  field in network byte order) in MADs being sent to match
-  request/response pairs.  The upper 32 bits are reserved for use by
-  the kernel and will be overwritten before a MAD is sent.
-
-P_Key Index Handling
-
-  The old ib_umad interface did not allow setting the P_Key index for
-  MADs that are sent and did not provide a way for obtaining the P_Key
-  index of received MADs.  A new layout for struct ib_user_mad_hdr
-  with a pkey_index member has been defined; however, to preserve binary
-  compatibility with older applications, this new layout will not be used
-  unless one of IB_USER_MAD_ENABLE_PKEY or IB_USER_MAD_REGISTER_AGENT2 ioctl's
-  are called before a file descriptor is used for anything else.
-
-  In September 2008, the IB_USER_MAD_ABI_VERSION will be incremented
-  to 6, the new layout of struct ib_user_mad_hdr will be used by
-  default, and the IB_USER_MAD_ENABLE_PKEY ioctl will be removed.
-
-Setting IsSM Capability Bit
-
-  To set the IsSM capability bit for a port, simply open the
-  corresponding issm device file.  If the IsSM bit is already set,
-  then the open call will block until the bit is cleared (or return
-  immediately with errno set to EAGAIN if the O_NONBLOCK flag is
-  passed to open()).  The IsSM bit will be cleared when the issm file
-  is closed.  No read, write or other operations can be performed on
-  the issm file.
-
-/dev files
-
-  To create the appropriate character device files automatically with
-  udev, a rule like
-
-    KERNEL=="umad*", NAME="infiniband/%k"
-    KERNEL=="issm*", NAME="infiniband/%k"
-
-  can be used.  This will create device nodes named
-
-    /dev/infiniband/umad0
-    /dev/infiniband/issm0
-
-  for the first port, and so on.  The InfiniBand device and port
-  associated with these devices can be determined from the files
-
-    /sys/class/infiniband_mad/umad0/ibdev
-    /sys/class/infiniband_mad/umad0/port
-
-  and
-
-    /sys/class/infiniband_mad/issm0/ibdev
-    /sys/class/infiniband_mad/issm0/port
diff --git a/Documentation/infiniband/user_verbs.rst b/Documentation/infiniband/user_verbs.rst
new file mode 100644
index 000000000000..8ddc4b1cfef2
--- /dev/null
+++ b/Documentation/infiniband/user_verbs.rst
@@ -0,0 +1,75 @@
+======================
+Userspace verbs access
+======================
+
+  The ib_uverbs module, built by enabling CONFIG_INFINIBAND_USER_VERBS,
+  enables direct userspace access to IB hardware via "verbs," as
+  described in chapter 11 of the InfiniBand Architecture Specification.
+
+  To use the verbs, the libibverbs library, available from
+  https://github.com/linux-rdma/rdma-core, is required. libibverbs contains a
+  device-independent API for using the ib_uverbs interface.
+  libibverbs also requires appropriate device-dependent kernel and
+  userspace driver for your InfiniBand hardware.  For example, to use
+  a Mellanox HCA, you will need the ib_mthca kernel module and the
+  libmthca userspace driver be installed.
+
+User-kernel communication
+=========================
+
+  Userspace communicates with the kernel for slow path, resource
+  management operations via the /dev/infiniband/uverbsN character
+  devices.  Fast path operations are typically performed by writing
+  directly to hardware registers mmap()ed into userspace, with no
+  system call or context switch into the kernel.
+
+  Commands are sent to the kernel via write()s on these device files.
+  The ABI is defined in drivers/infiniband/include/ib_user_verbs.h.
+  The structs for commands that require a response from the kernel
+  contain a 64-bit field used to pass a pointer to an output buffer.
+  Status is returned to userspace as the return value of the write()
+  system call.
+
+Resource management
+===================
+
+  Since creation and destruction of all IB resources is done by
+  commands passed through a file descriptor, the kernel can keep track
+  of which resources are attached to a given userspace context.  The
+  ib_uverbs module maintains idr tables that are used to translate
+  between kernel pointers and opaque userspace handles, so that kernel
+  pointers are never exposed to userspace and userspace cannot trick
+  the kernel into following a bogus pointer.
+
+  This also allows the kernel to clean up when a process exits and
+  prevent one process from touching another process's resources.
+
+Memory pinning
+==============
+
+  Direct userspace I/O requires that memory regions that are potential
+  I/O targets be kept resident at the same physical address.  The
+  ib_uverbs module manages pinning and unpinning memory regions via
+  get_user_pages() and put_page() calls.  It also accounts for the
+  amount of memory pinned in the process's pinned_vm, and checks that
+  unprivileged processes do not exceed their RLIMIT_MEMLOCK limit.
+
+  Pages that are pinned multiple times are counted each time they are
+  pinned, so the value of pinned_vm may be an overestimate of the
+  number of pages pinned by a process.
+
+/dev files
+==========
+
+  To create the appropriate character device files automatically with
+  udev, a rule like::
+
+    KERNEL=="uverbs*", NAME="infiniband/%k"
+
+  can be used.  This will create device nodes named::
+
+    /dev/infiniband/uverbs0
+
+  and so on.  Since the InfiniBand userspace verbs should be safe for
+  use by non-privileged processes, it may be useful to add an
+  appropriate MODE or GROUP to the udev rule.
diff --git a/Documentation/infiniband/user_verbs.txt b/Documentation/infiniband/user_verbs.txt
deleted file mode 100644
index 47ebf2f80b2b..000000000000
--- a/Documentation/infiniband/user_verbs.txt
+++ /dev/null
@@ -1,69 +0,0 @@
-USERSPACE VERBS ACCESS
-
-  The ib_uverbs module, built by enabling CONFIG_INFINIBAND_USER_VERBS,
-  enables direct userspace access to IB hardware via "verbs," as
-  described in chapter 11 of the InfiniBand Architecture Specification.
-
-  To use the verbs, the libibverbs library, available from
-  https://github.com/linux-rdma/rdma-core, is required. libibverbs contains a
-  device-independent API for using the ib_uverbs interface.
-  libibverbs also requires appropriate device-dependent kernel and
-  userspace driver for your InfiniBand hardware.  For example, to use
-  a Mellanox HCA, you will need the ib_mthca kernel module and the
-  libmthca userspace driver be installed.
-
-User-kernel communication
-
-  Userspace communicates with the kernel for slow path, resource
-  management operations via the /dev/infiniband/uverbsN character
-  devices.  Fast path operations are typically performed by writing
-  directly to hardware registers mmap()ed into userspace, with no
-  system call or context switch into the kernel.
-
-  Commands are sent to the kernel via write()s on these device files.
-  The ABI is defined in drivers/infiniband/include/ib_user_verbs.h.
-  The structs for commands that require a response from the kernel
-  contain a 64-bit field used to pass a pointer to an output buffer.
-  Status is returned to userspace as the return value of the write()
-  system call.
-
-Resource management
-
-  Since creation and destruction of all IB resources is done by
-  commands passed through a file descriptor, the kernel can keep track
-  of which resources are attached to a given userspace context.  The
-  ib_uverbs module maintains idr tables that are used to translate
-  between kernel pointers and opaque userspace handles, so that kernel
-  pointers are never exposed to userspace and userspace cannot trick
-  the kernel into following a bogus pointer.
-
-  This also allows the kernel to clean up when a process exits and
-  prevent one process from touching another process's resources.
-
-Memory pinning
-
-  Direct userspace I/O requires that memory regions that are potential
-  I/O targets be kept resident at the same physical address.  The
-  ib_uverbs module manages pinning and unpinning memory regions via
-  get_user_pages() and put_page() calls.  It also accounts for the
-  amount of memory pinned in the process's pinned_vm, and checks that
-  unprivileged processes do not exceed their RLIMIT_MEMLOCK limit.
-
-  Pages that are pinned multiple times are counted each time they are
-  pinned, so the value of pinned_vm may be an overestimate of the
-  number of pages pinned by a process.
-
-/dev files
-
-  To create the appropriate character device files automatically with
-  udev, a rule like
-
-    KERNEL=="uverbs*", NAME="infiniband/%k"
-
-  can be used.  This will create device nodes named
-
-    /dev/infiniband/uverbs0
-
-  and so on.  Since the InfiniBand userspace verbs should be safe for
-  use by non-privileged processes, it may be useful to add an
-  appropriate MODE or GROUP to the udev rule.
diff --git a/Documentation/input/conf.py b/Documentation/input/conf.py
deleted file mode 100644
index d2352fdc92ed..000000000000
--- a/Documentation/input/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = "The Linux input driver subsystem"
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'linux-input.tex', project,
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/input/input.rst b/Documentation/input/input.rst
index 47f86a4bf16c..0eb61e67a7b7 100644
--- a/Documentation/input/input.rst
+++ b/Documentation/input/input.rst
@@ -188,7 +188,7 @@ LCDs and many other purposes.
 
 The monitor and speaker controls should be easy to add to the hid/input
 interface, but for the UPSs and LCDs it doesn't make much sense. For this,
-the hiddev interface was designed. See Documentation/hid/hiddev.txt
+the hiddev interface was designed. See Documentation/hid/hiddev.rst
 for more information about it.
 
 The usage of the usbhid module is very simple, it takes no parameters,
diff --git a/Documentation/ioctl/botching-up-ioctls.rst b/Documentation/ioctl/botching-up-ioctls.rst
new file mode 100644
index 000000000000..ac697fef3545
--- /dev/null
+++ b/Documentation/ioctl/botching-up-ioctls.rst
@@ -0,0 +1,225 @@
+=================================
+(How to avoid) Botching up ioctls
+=================================
+
+From: http://blog.ffwll.ch/2013/11/botching-up-ioctls.html
+
+By: Daniel Vetter, Copyright © 2013 Intel Corporation
+
+One clear insight kernel graphics hackers gained in the past few years is that
+trying to come up with a unified interface to manage the execution units and
+memory on completely different GPUs is a futile effort. So nowadays every
+driver has its own set of ioctls to allocate memory and submit work to the GPU.
+Which is nice, since there's no more insanity in the form of fake-generic, but
+actually only used once interfaces. But the clear downside is that there's much
+more potential to screw things up.
+
+To avoid repeating all the same mistakes again I've written up some of the
+lessons learned while botching the job for the drm/i915 driver. Most of these
+only cover technicalities and not the big-picture issues like what the command
+submission ioctl exactly should look like. Learning these lessons is probably
+something every GPU driver has to do on its own.
+
+
+Prerequisites
+-------------
+
+First the prerequisites. Without these you have already failed, because you
+will need to add a 32-bit compat layer:
+
+ * Only use fixed sized integers. To avoid conflicts with typedefs in userspace
+   the kernel has special types like __u32, __s64. Use them.
+
+ * Align everything to the natural size and use explicit padding. 32-bit
+   platforms don't necessarily align 64-bit values to 64-bit boundaries, but
+   64-bit platforms do. So we always need padding to the natural size to get
+   this right.
+
+ * Pad the entire struct to a multiple of 64-bits if the structure contains
+   64-bit types - the structure size will otherwise differ on 32-bit versus
+   64-bit. Having a different structure size hurts when passing arrays of
+   structures to the kernel, or if the kernel checks the structure size, which
+   e.g. the drm core does.
+
+ * Pointers are __u64, cast from/to a uintprt_t on the userspace side and
+   from/to a void __user * in the kernel. Try really hard not to delay this
+   conversion or worse, fiddle the raw __u64 through your code since that
+   diminishes the checking tools like sparse can provide. The macro
+   u64_to_user_ptr can be used in the kernel to avoid warnings about integers
+   and pointres of different sizes.
+
+
+Basics
+------
+
+With the joys of writing a compat layer avoided we can take a look at the basic
+fumbles. Neglecting these will make backward and forward compatibility a real
+pain. And since getting things wrong on the first attempt is guaranteed you
+will have a second iteration or at least an extension for any given interface.
+
+ * Have a clear way for userspace to figure out whether your new ioctl or ioctl
+   extension is supported on a given kernel. If you can't rely on old kernels
+   rejecting the new flags/modes or ioctls (since doing that was botched in the
+   past) then you need a driver feature flag or revision number somewhere.
+
+ * Have a plan for extending ioctls with new flags or new fields at the end of
+   the structure. The drm core checks the passed-in size for each ioctl call
+   and zero-extends any mismatches between kernel and userspace. That helps,
+   but isn't a complete solution since newer userspace on older kernels won't
+   notice that the newly added fields at the end get ignored. So this still
+   needs a new driver feature flags.
+
+ * Check all unused fields and flags and all the padding for whether it's 0,
+   and reject the ioctl if that's not the case. Otherwise your nice plan for
+   future extensions is going right down the gutters since someone will submit
+   an ioctl struct with random stack garbage in the yet unused parts. Which
+   then bakes in the ABI that those fields can never be used for anything else
+   but garbage. This is also the reason why you must explicitly pad all
+   structures, even if you never use them in an array - the padding the compiler
+   might insert could contain garbage.
+
+ * Have simple testcases for all of the above.
+
+
+Fun with Error Paths
+--------------------
+
+Nowadays we don't have any excuse left any more for drm drivers being neat
+little root exploits. This means we both need full input validation and solid
+error handling paths - GPUs will die eventually in the oddmost corner cases
+anyway:
+
+ * The ioctl must check for array overflows. Also it needs to check for
+   over/underflows and clamping issues of integer values in general. The usual
+   example is sprite positioning values fed directly into the hardware with the
+   hardware just having 12 bits or so. Works nicely until some odd display
+   server doesn't bother with clamping itself and the cursor wraps around the
+   screen.
+
+ * Have simple testcases for every input validation failure case in your ioctl.
+   Check that the error code matches your expectations. And finally make sure
+   that you only test for one single error path in each subtest by submitting
+   otherwise perfectly valid data. Without this an earlier check might reject
+   the ioctl already and shadow the codepath you actually want to test, hiding
+   bugs and regressions.
+
+ * Make all your ioctls restartable. First X really loves signals and second
+   this will allow you to test 90% of all error handling paths by just
+   interrupting your main test suite constantly with signals. Thanks to X's
+   love for signal you'll get an excellent base coverage of all your error
+   paths pretty much for free for graphics drivers. Also, be consistent with
+   how you handle ioctl restarting - e.g. drm has a tiny drmIoctl helper in its
+   userspace library. The i915 driver botched this with the set_tiling ioctl,
+   now we're stuck forever with some arcane semantics in both the kernel and
+   userspace.
+
+ * If you can't make a given codepath restartable make a stuck task at least
+   killable. GPUs just die and your users won't like you more if you hang their
+   entire box (by means of an unkillable X process). If the state recovery is
+   still too tricky have a timeout or hangcheck safety net as a last-ditch
+   effort in case the hardware has gone bananas.
+
+ * Have testcases for the really tricky corner cases in your error recovery code
+   - it's way too easy to create a deadlock between your hangcheck code and
+   waiters.
+
+
+Time, Waiting and Missing it
+----------------------------
+
+GPUs do most everything asynchronously, so we have a need to time operations and
+wait for outstanding ones. This is really tricky business; at the moment none of
+the ioctls supported by the drm/i915 get this fully right, which means there's
+still tons more lessons to learn here.
+
+ * Use CLOCK_MONOTONIC as your reference time, always. It's what alsa, drm and
+   v4l use by default nowadays. But let userspace know which timestamps are
+   derived from different clock domains like your main system clock (provided
+   by the kernel) or some independent hardware counter somewhere else. Clocks
+   will mismatch if you look close enough, but if performance measuring tools
+   have this information they can at least compensate. If your userspace can
+   get at the raw values of some clocks (e.g. through in-command-stream
+   performance counter sampling instructions) consider exposing those also.
+
+ * Use __s64 seconds plus __u64 nanoseconds to specify time. It's not the most
+   convenient time specification, but it's mostly the standard.
+
+ * Check that input time values are normalized and reject them if not. Note
+   that the kernel native struct ktime has a signed integer for both seconds
+   and nanoseconds, so beware here.
+
+ * For timeouts, use absolute times. If you're a good fellow and made your
+   ioctl restartable relative timeouts tend to be too coarse and can
+   indefinitely extend your wait time due to rounding on each restart.
+   Especially if your reference clock is something really slow like the display
+   frame counter. With a spec lawyer hat on this isn't a bug since timeouts can
+   always be extended - but users will surely hate you if their neat animations
+   starts to stutter due to this.
+
+ * Consider ditching any synchronous wait ioctls with timeouts and just deliver
+   an asynchronous event on a pollable file descriptor. It fits much better
+   into event driven applications' main loop.
+
+ * Have testcases for corner-cases, especially whether the return values for
+   already-completed events, successful waits and timed-out waits are all sane
+   and suiting to your needs.
+
+
+Leaking Resources, Not
+----------------------
+
+A full-blown drm driver essentially implements a little OS, but specialized to
+the given GPU platforms. This means a driver needs to expose tons of handles
+for different objects and other resources to userspace. Doing that right
+entails its own little set of pitfalls:
+
+ * Always attach the lifetime of your dynamically created resources to the
+   lifetime of a file descriptor. Consider using a 1:1 mapping if your resource
+   needs to be shared across processes -  fd-passing over unix domain sockets
+   also simplifies lifetime management for userspace.
+
+ * Always have O_CLOEXEC support.
+
+ * Ensure that you have sufficient insulation between different clients. By
+   default pick a private per-fd namespace which forces any sharing to be done
+   explicitly. Only go with a more global per-device namespace if the objects
+   are truly device-unique. One counterexample in the drm modeset interfaces is
+   that the per-device modeset objects like connectors share a namespace with
+   framebuffer objects, which mostly are not shared at all. A separate
+   namespace, private by default, for framebuffers would have been more
+   suitable.
+
+ * Think about uniqueness requirements for userspace handles. E.g. for most drm
+   drivers it's a userspace bug to submit the same object twice in the same
+   command submission ioctl. But then if objects are shareable userspace needs
+   to know whether it has seen an imported object from a different process
+   already or not. I haven't tried this myself yet due to lack of a new class
+   of objects, but consider using inode numbers on your shared file descriptors
+   as unique identifiers - it's how real files are told apart, too.
+   Unfortunately this requires a full-blown virtual filesystem in the kernel.
+
+
+Last, but not Least
+-------------------
+
+Not every problem needs a new ioctl:
+
+ * Think hard whether you really want a driver-private interface. Of course
+   it's much quicker to push a driver-private interface than engaging in
+   lengthy discussions for a more generic solution. And occasionally doing a
+   private interface to spearhead a new concept is what's required. But in the
+   end, once the generic interface comes around you'll end up maintainer two
+   interfaces. Indefinitely.
+
+ * Consider other interfaces than ioctls. A sysfs attribute is much better for
+   per-device settings, or for child objects with fairly static lifetimes (like
+   output connectors in drm with all the detection override attributes). Or
+   maybe only your testsuite needs this interface, and then debugfs with its
+   disclaimer of not having a stable ABI would be better.
+
+Finally, the name of the game is to get it right on the first attempt, since if
+your driver proves popular and your hardware platforms long-lived then you'll
+be stuck with a given ioctl essentially forever. You can try to deprecate
+horrible ioctls on newer iterations of your hardware, but generally it takes
+years to accomplish this. And then again years until the last user able to
+complain about regressions disappears, too.
diff --git a/Documentation/ioctl/botching-up-ioctls.txt b/Documentation/ioctl/botching-up-ioctls.txt
deleted file mode 100644
index 883fb034bd04..000000000000
--- a/Documentation/ioctl/botching-up-ioctls.txt
+++ /dev/null
@@ -1,224 +0,0 @@
-(How to avoid) Botching up ioctls
-=================================
-
-From: http://blog.ffwll.ch/2013/11/botching-up-ioctls.html
-
-By: Daniel Vetter, Copyright © 2013 Intel Corporation
-
-One clear insight kernel graphics hackers gained in the past few years is that
-trying to come up with a unified interface to manage the execution units and
-memory on completely different GPUs is a futile effort. So nowadays every
-driver has its own set of ioctls to allocate memory and submit work to the GPU.
-Which is nice, since there's no more insanity in the form of fake-generic, but
-actually only used once interfaces. But the clear downside is that there's much
-more potential to screw things up.
-
-To avoid repeating all the same mistakes again I've written up some of the
-lessons learned while botching the job for the drm/i915 driver. Most of these
-only cover technicalities and not the big-picture issues like what the command
-submission ioctl exactly should look like. Learning these lessons is probably
-something every GPU driver has to do on its own.
-
-
-Prerequisites
--------------
-
-First the prerequisites. Without these you have already failed, because you
-will need to add a 32-bit compat layer:
-
- * Only use fixed sized integers. To avoid conflicts with typedefs in userspace
-   the kernel has special types like __u32, __s64. Use them.
-
- * Align everything to the natural size and use explicit padding. 32-bit
-   platforms don't necessarily align 64-bit values to 64-bit boundaries, but
-   64-bit platforms do. So we always need padding to the natural size to get
-   this right.
-
- * Pad the entire struct to a multiple of 64-bits if the structure contains
-   64-bit types - the structure size will otherwise differ on 32-bit versus
-   64-bit. Having a different structure size hurts when passing arrays of
-   structures to the kernel, or if the kernel checks the structure size, which
-   e.g. the drm core does.
-
- * Pointers are __u64, cast from/to a uintprt_t on the userspace side and
-   from/to a void __user * in the kernel. Try really hard not to delay this
-   conversion or worse, fiddle the raw __u64 through your code since that
-   diminishes the checking tools like sparse can provide. The macro
-   u64_to_user_ptr can be used in the kernel to avoid warnings about integers
-   and pointres of different sizes.
-
-
-Basics
-------
-
-With the joys of writing a compat layer avoided we can take a look at the basic
-fumbles. Neglecting these will make backward and forward compatibility a real
-pain. And since getting things wrong on the first attempt is guaranteed you
-will have a second iteration or at least an extension for any given interface.
-
- * Have a clear way for userspace to figure out whether your new ioctl or ioctl
-   extension is supported on a given kernel. If you can't rely on old kernels
-   rejecting the new flags/modes or ioctls (since doing that was botched in the
-   past) then you need a driver feature flag or revision number somewhere.
-
- * Have a plan for extending ioctls with new flags or new fields at the end of
-   the structure. The drm core checks the passed-in size for each ioctl call
-   and zero-extends any mismatches between kernel and userspace. That helps,
-   but isn't a complete solution since newer userspace on older kernels won't
-   notice that the newly added fields at the end get ignored. So this still
-   needs a new driver feature flags.
-
- * Check all unused fields and flags and all the padding for whether it's 0,
-   and reject the ioctl if that's not the case. Otherwise your nice plan for
-   future extensions is going right down the gutters since someone will submit
-   an ioctl struct with random stack garbage in the yet unused parts. Which
-   then bakes in the ABI that those fields can never be used for anything else
-   but garbage. This is also the reason why you must explicitly pad all
-   structures, even if you never use them in an array - the padding the compiler
-   might insert could contain garbage.
-
- * Have simple testcases for all of the above.
-
-
-Fun with Error Paths
---------------------
-
-Nowadays we don't have any excuse left any more for drm drivers being neat
-little root exploits. This means we both need full input validation and solid
-error handling paths - GPUs will die eventually in the oddmost corner cases
-anyway:
-
- * The ioctl must check for array overflows. Also it needs to check for
-   over/underflows and clamping issues of integer values in general. The usual
-   example is sprite positioning values fed directly into the hardware with the
-   hardware just having 12 bits or so. Works nicely until some odd display
-   server doesn't bother with clamping itself and the cursor wraps around the
-   screen.
-
- * Have simple testcases for every input validation failure case in your ioctl.
-   Check that the error code matches your expectations. And finally make sure
-   that you only test for one single error path in each subtest by submitting
-   otherwise perfectly valid data. Without this an earlier check might reject
-   the ioctl already and shadow the codepath you actually want to test, hiding
-   bugs and regressions.
-
- * Make all your ioctls restartable. First X really loves signals and second
-   this will allow you to test 90% of all error handling paths by just
-   interrupting your main test suite constantly with signals. Thanks to X's
-   love for signal you'll get an excellent base coverage of all your error
-   paths pretty much for free for graphics drivers. Also, be consistent with
-   how you handle ioctl restarting - e.g. drm has a tiny drmIoctl helper in its
-   userspace library. The i915 driver botched this with the set_tiling ioctl,
-   now we're stuck forever with some arcane semantics in both the kernel and
-   userspace.
-
- * If you can't make a given codepath restartable make a stuck task at least
-   killable. GPUs just die and your users won't like you more if you hang their
-   entire box (by means of an unkillable X process). If the state recovery is
-   still too tricky have a timeout or hangcheck safety net as a last-ditch
-   effort in case the hardware has gone bananas.
-
- * Have testcases for the really tricky corner cases in your error recovery code
-   - it's way too easy to create a deadlock between your hangcheck code and
-   waiters.
-
-
-Time, Waiting and Missing it
-----------------------------
-
-GPUs do most everything asynchronously, so we have a need to time operations and
-wait for outstanding ones. This is really tricky business; at the moment none of
-the ioctls supported by the drm/i915 get this fully right, which means there's
-still tons more lessons to learn here.
-
- * Use CLOCK_MONOTONIC as your reference time, always. It's what alsa, drm and
-   v4l use by default nowadays. But let userspace know which timestamps are
-   derived from different clock domains like your main system clock (provided
-   by the kernel) or some independent hardware counter somewhere else. Clocks
-   will mismatch if you look close enough, but if performance measuring tools
-   have this information they can at least compensate. If your userspace can
-   get at the raw values of some clocks (e.g. through in-command-stream
-   performance counter sampling instructions) consider exposing those also.
-
- * Use __s64 seconds plus __u64 nanoseconds to specify time. It's not the most
-   convenient time specification, but it's mostly the standard.
-
- * Check that input time values are normalized and reject them if not. Note
-   that the kernel native struct ktime has a signed integer for both seconds
-   and nanoseconds, so beware here.
-
- * For timeouts, use absolute times. If you're a good fellow and made your
-   ioctl restartable relative timeouts tend to be too coarse and can
-   indefinitely extend your wait time due to rounding on each restart.
-   Especially if your reference clock is something really slow like the display
-   frame counter. With a spec lawyer hat on this isn't a bug since timeouts can
-   always be extended - but users will surely hate you if their neat animations
-   starts to stutter due to this.
-
- * Consider ditching any synchronous wait ioctls with timeouts and just deliver
-   an asynchronous event on a pollable file descriptor. It fits much better
-   into event driven applications' main loop.
-
- * Have testcases for corner-cases, especially whether the return values for
-   already-completed events, successful waits and timed-out waits are all sane
-   and suiting to your needs.
-
-
-Leaking Resources, Not
-----------------------
-
-A full-blown drm driver essentially implements a little OS, but specialized to
-the given GPU platforms. This means a driver needs to expose tons of handles
-for different objects and other resources to userspace. Doing that right
-entails its own little set of pitfalls:
-
- * Always attach the lifetime of your dynamically created resources to the
-   lifetime of a file descriptor. Consider using a 1:1 mapping if your resource
-   needs to be shared across processes -  fd-passing over unix domain sockets
-   also simplifies lifetime management for userspace.
-
- * Always have O_CLOEXEC support.
-
- * Ensure that you have sufficient insulation between different clients. By
-   default pick a private per-fd namespace which forces any sharing to be done
-   explicitly. Only go with a more global per-device namespace if the objects
-   are truly device-unique. One counterexample in the drm modeset interfaces is
-   that the per-device modeset objects like connectors share a namespace with
-   framebuffer objects, which mostly are not shared at all. A separate
-   namespace, private by default, for framebuffers would have been more
-   suitable.
-
- * Think about uniqueness requirements for userspace handles. E.g. for most drm
-   drivers it's a userspace bug to submit the same object twice in the same
-   command submission ioctl. But then if objects are shareable userspace needs
-   to know whether it has seen an imported object from a different process
-   already or not. I haven't tried this myself yet due to lack of a new class
-   of objects, but consider using inode numbers on your shared file descriptors
-   as unique identifiers - it's how real files are told apart, too.
-   Unfortunately this requires a full-blown virtual filesystem in the kernel.
-
-
-Last, but not Least
--------------------
-
-Not every problem needs a new ioctl:
-
- * Think hard whether you really want a driver-private interface. Of course
-   it's much quicker to push a driver-private interface than engaging in
-   lengthy discussions for a more generic solution. And occasionally doing a
-   private interface to spearhead a new concept is what's required. But in the
-   end, once the generic interface comes around you'll end up maintainer two
-   interfaces. Indefinitely.
-
- * Consider other interfaces than ioctls. A sysfs attribute is much better for
-   per-device settings, or for child objects with fairly static lifetimes (like
-   output connectors in drm with all the detection override attributes). Or
-   maybe only your testsuite needs this interface, and then debugfs with its
-   disclaimer of not having a stable ABI would be better.
-
-Finally, the name of the game is to get it right on the first attempt, since if
-your driver proves popular and your hardware platforms long-lived then you'll
-be stuck with a given ioctl essentially forever. You can try to deprecate
-horrible ioctls on newer iterations of your hardware, but generally it takes
-years to accomplish this. And then again years until the last user able to
-complain about regressions disappears, too.
diff --git a/Documentation/ioctl/cdrom.rst b/Documentation/ioctl/cdrom.rst
new file mode 100644
index 000000000000..3b4c0506de46
--- /dev/null
+++ b/Documentation/ioctl/cdrom.rst
@@ -0,0 +1,1233 @@
+============================
+Summary of CDROM ioctl calls
+============================
+
+- Edward A. Falk <efalk@google.com>
+
+November, 2004
+
+This document attempts to describe the ioctl(2) calls supported by
+the CDROM layer.  These are by-and-large implemented (as of Linux 2.6)
+in drivers/cdrom/cdrom.c and drivers/block/scsi_ioctl.c
+
+ioctl values are listed in <linux/cdrom.h>.  As of this writing, they
+are as follows:
+
+	======================	===============================================
+	CDROMPAUSE		Pause Audio Operation
+	CDROMRESUME		Resume paused Audio Operation
+	CDROMPLAYMSF		Play Audio MSF (struct cdrom_msf)
+	CDROMPLAYTRKIND		Play Audio Track/index (struct cdrom_ti)
+	CDROMREADTOCHDR		Read TOC header (struct cdrom_tochdr)
+	CDROMREADTOCENTRY	Read TOC entry (struct cdrom_tocentry)
+	CDROMSTOP		Stop the cdrom drive
+	CDROMSTART		Start the cdrom drive
+	CDROMEJECT		Ejects the cdrom media
+	CDROMVOLCTRL		Control output volume (struct cdrom_volctrl)
+	CDROMSUBCHNL		Read subchannel data (struct cdrom_subchnl)
+	CDROMREADMODE2		Read CDROM mode 2 data (2336 Bytes)
+				(struct cdrom_read)
+	CDROMREADMODE1		Read CDROM mode 1 data (2048 Bytes)
+				(struct cdrom_read)
+	CDROMREADAUDIO		(struct cdrom_read_audio)
+	CDROMEJECT_SW		enable(1)/disable(0) auto-ejecting
+	CDROMMULTISESSION	Obtain the start-of-last-session
+				address of multi session disks
+				(struct cdrom_multisession)
+	CDROM_GET_MCN		Obtain the "Universal Product Code"
+				if available (struct cdrom_mcn)
+	CDROM_GET_UPC		Deprecated, use CDROM_GET_MCN instead.
+	CDROMRESET		hard-reset the drive
+	CDROMVOLREAD		Get the drive's volume setting
+				(struct cdrom_volctrl)
+	CDROMREADRAW		read data in raw mode (2352 Bytes)
+				(struct cdrom_read)
+	CDROMREADCOOKED		read data in cooked mode
+	CDROMSEEK		seek msf address
+	CDROMPLAYBLK		scsi-cd only, (struct cdrom_blk)
+	CDROMREADALL		read all 2646 bytes
+	CDROMGETSPINDOWN	return 4-bit spindown value
+	CDROMSETSPINDOWN	set 4-bit spindown value
+	CDROMCLOSETRAY		pendant of CDROMEJECT
+	CDROM_SET_OPTIONS	Set behavior options
+	CDROM_CLEAR_OPTIONS	Clear behavior options
+	CDROM_SELECT_SPEED	Set the CD-ROM speed
+	CDROM_SELECT_DISC	Select disc (for juke-boxes)
+	CDROM_MEDIA_CHANGED	Check is media changed
+	CDROM_DRIVE_STATUS	Get tray position, etc.
+	CDROM_DISC_STATUS	Get disc type, etc.
+	CDROM_CHANGER_NSLOTS	Get number of slots
+	CDROM_LOCKDOOR		lock or unlock door
+	CDROM_DEBUG		Turn debug messages on/off
+	CDROM_GET_CAPABILITY	get capabilities
+	CDROMAUDIOBUFSIZ	set the audio buffer size
+	DVD_READ_STRUCT		Read structure
+	DVD_WRITE_STRUCT	Write structure
+	DVD_AUTH		Authentication
+	CDROM_SEND_PACKET	send a packet to the drive
+	CDROM_NEXT_WRITABLE	get next writable block
+	CDROM_LAST_WRITTEN	get last block written on disc
+	======================	===============================================
+
+
+The information that follows was determined from reading kernel source
+code.  It is likely that some corrections will be made over time.
+
+------------------------------------------------------------------------------
+
+General:
+
+	Unless otherwise specified, all ioctl calls return 0 on success
+	and -1 with errno set to an appropriate value on error.  (Some
+	ioctls return non-negative data values.)
+
+	Unless otherwise specified, all ioctl calls return -1 and set
+	errno to EFAULT on a failed attempt to copy data to or from user
+	address space.
+
+	Individual drivers may return error codes not listed here.
+
+	Unless otherwise specified, all data structures and constants
+	are defined in <linux/cdrom.h>
+
+------------------------------------------------------------------------------
+
+
+CDROMPAUSE
+	Pause Audio Operation
+
+
+	usage::
+
+	  ioctl(fd, CDROMPAUSE, 0);
+
+
+	inputs:
+		none
+
+
+	outputs:
+		none
+
+
+	error return:
+	  - ENOSYS	cd drive not audio-capable.
+
+
+CDROMRESUME
+	Resume paused Audio Operation
+
+
+	usage::
+
+	  ioctl(fd, CDROMRESUME, 0);
+
+
+	inputs:
+		none
+
+
+	outputs:
+		none
+
+
+	error return:
+	  - ENOSYS	cd drive not audio-capable.
+
+
+CDROMPLAYMSF
+	Play Audio MSF
+
+	(struct cdrom_msf)
+
+
+	usage::
+
+	  struct cdrom_msf msf;
+
+	  ioctl(fd, CDROMPLAYMSF, &msf);
+
+	inputs:
+		cdrom_msf structure, describing a segment of music to play
+
+
+	outputs:
+		none
+
+
+	error return:
+	  - ENOSYS	cd drive not audio-capable.
+
+	notes:
+		- MSF stands for minutes-seconds-frames
+		- LBA stands for logical block address
+		- Segment is described as start and end times, where each time
+		  is described as minutes:seconds:frames.
+		  A frame is 1/75 of a second.
+
+
+CDROMPLAYTRKIND
+	Play Audio Track/index
+
+	(struct cdrom_ti)
+
+
+	usage::
+
+	  struct cdrom_ti ti;
+
+	  ioctl(fd, CDROMPLAYTRKIND, &ti);
+
+	inputs:
+		cdrom_ti structure, describing a segment of music to play
+
+
+	outputs:
+		none
+
+
+	error return:
+	  - ENOSYS	cd drive not audio-capable.
+
+	notes:
+		- Segment is described as start and end times, where each time
+		  is described as a track and an index.
+
+
+
+CDROMREADTOCHDR
+	Read TOC header
+
+	(struct cdrom_tochdr)
+
+
+	usage::
+
+	  cdrom_tochdr header;
+
+	  ioctl(fd, CDROMREADTOCHDR, &header);
+
+	inputs:
+		cdrom_tochdr structure
+
+
+	outputs:
+		cdrom_tochdr structure
+
+
+	error return:
+	  - ENOSYS	cd drive not audio-capable.
+
+
+
+CDROMREADTOCENTRY
+	Read TOC entry
+
+	(struct cdrom_tocentry)
+
+
+	usage::
+
+	  struct cdrom_tocentry entry;
+
+	  ioctl(fd, CDROMREADTOCENTRY, &entry);
+
+	inputs:
+		cdrom_tocentry structure
+
+
+	outputs:
+		cdrom_tocentry structure
+
+
+	error return:
+	  - ENOSYS	cd drive not audio-capable.
+	  - EINVAL	entry.cdte_format not CDROM_MSF or CDROM_LBA
+	  - EINVAL	requested track out of bounds
+	  - EIO		I/O error reading TOC
+
+	notes:
+		- TOC stands for Table Of Contents
+		- MSF stands for minutes-seconds-frames
+		- LBA stands for logical block address
+
+
+
+CDROMSTOP
+	Stop the cdrom drive
+
+
+	usage::
+
+	  ioctl(fd, CDROMSTOP, 0);
+
+
+	inputs:
+		none
+
+
+	outputs:
+		none
+
+
+	error return:
+	  - ENOSYS	cd drive not audio-capable.
+
+	notes:
+	  - Exact interpretation of this ioctl depends on the device,
+	    but most seem to spin the drive down.
+
+
+CDROMSTART
+	Start the cdrom drive
+
+
+	usage::
+
+	  ioctl(fd, CDROMSTART, 0);
+
+
+	inputs:
+		none
+
+
+	outputs:
+		none
+
+
+	error return:
+	  - ENOSYS	cd drive not audio-capable.
+
+	notes:
+	  - Exact interpretation of this ioctl depends on the device,
+	    but most seem to spin the drive up and/or close the tray.
+	    Other devices ignore the ioctl completely.
+
+
+CDROMEJECT
+	- Ejects the cdrom media
+
+
+	usage::
+
+	  ioctl(fd, CDROMEJECT, 0);
+
+
+	inputs:
+		none
+
+
+	outputs:
+		none
+
+
+	error returns:
+	  - ENOSYS	cd drive not capable of ejecting
+	  - EBUSY	other processes are accessing drive, or door is locked
+
+	notes:
+		- See CDROM_LOCKDOOR, below.
+
+
+
+
+CDROMCLOSETRAY
+	pendant of CDROMEJECT
+
+
+	usage::
+
+	  ioctl(fd, CDROMCLOSETRAY, 0);
+
+
+	inputs:
+		none
+
+
+	outputs:
+		none
+
+
+	error returns:
+	  - ENOSYS	cd drive not capable of closing the tray
+	  - EBUSY	other processes are accessing drive, or door is locked
+
+	notes:
+		- See CDROM_LOCKDOOR, below.
+
+
+
+
+CDROMVOLCTRL
+	Control output volume (struct cdrom_volctrl)
+
+
+	usage::
+
+	  struct cdrom_volctrl volume;
+
+	  ioctl(fd, CDROMVOLCTRL, &volume);
+
+	inputs:
+		cdrom_volctrl structure containing volumes for up to 4
+		channels.
+
+	outputs:
+		none
+
+
+	error return:
+	  - ENOSYS	cd drive not audio-capable.
+
+
+
+CDROMVOLREAD
+	Get the drive's volume setting
+
+	(struct cdrom_volctrl)
+
+
+	usage::
+
+	  struct cdrom_volctrl volume;
+
+	  ioctl(fd, CDROMVOLREAD, &volume);
+
+	inputs:
+		none
+
+
+	outputs:
+		The current volume settings.
+
+
+	error return:
+	  - ENOSYS	cd drive not audio-capable.
+
+
+
+CDROMSUBCHNL
+	Read subchannel data
+
+	(struct cdrom_subchnl)
+
+
+	usage::
+
+	  struct cdrom_subchnl q;
+
+	  ioctl(fd, CDROMSUBCHNL, &q);
+
+	inputs:
+		cdrom_subchnl structure
+
+
+	outputs:
+		cdrom_subchnl structure
+
+
+	error return:
+	  - ENOSYS	cd drive not audio-capable.
+	  - EINVAL	format not CDROM_MSF or CDROM_LBA
+
+	notes:
+		- Format is converted to CDROM_MSF or CDROM_LBA
+		  as per user request on return
+
+
+
+CDROMREADRAW
+	read data in raw mode (2352 Bytes)
+
+	(struct cdrom_read)
+
+	usage::
+
+	  union {
+
+	    struct cdrom_msf msf;		/* input */
+	    char buffer[CD_FRAMESIZE_RAW];	/* return */
+	  } arg;
+	  ioctl(fd, CDROMREADRAW, &arg);
+
+	inputs:
+		cdrom_msf structure indicating an address to read.
+
+		Only the start values are significant.
+
+	outputs:
+		Data written to address provided by user.
+
+
+	error return:
+	  - EINVAL	address less than 0, or msf less than 0:2:0
+	  - ENOMEM	out of memory
+
+	notes:
+		- As of 2.6.8.1, comments in <linux/cdrom.h> indicate that this
+		  ioctl accepts a cdrom_read structure, but actual source code
+		  reads a cdrom_msf structure and writes a buffer of data to
+		  the same address.
+
+		- MSF values are converted to LBA values via this formula::
+
+		    lba = (((m * CD_SECS) + s) * CD_FRAMES + f) - CD_MSF_OFFSET;
+
+
+
+
+CDROMREADMODE1
+	Read CDROM mode 1 data (2048 Bytes)
+
+	(struct cdrom_read)
+
+	notes:
+		Identical to CDROMREADRAW except that block size is
+		CD_FRAMESIZE (2048) bytes
+
+
+
+CDROMREADMODE2
+	Read CDROM mode 2 data (2336 Bytes)
+
+	(struct cdrom_read)
+
+	notes:
+		Identical to CDROMREADRAW except that block size is
+		CD_FRAMESIZE_RAW0 (2336) bytes
+
+
+
+CDROMREADAUDIO
+	(struct cdrom_read_audio)
+
+	usage::
+
+	  struct cdrom_read_audio ra;
+
+	  ioctl(fd, CDROMREADAUDIO, &ra);
+
+	inputs:
+		cdrom_read_audio structure containing read start
+		point and length
+
+	outputs:
+		audio data, returned to buffer indicated by ra
+
+
+	error return:
+	  - EINVAL	format not CDROM_MSF or CDROM_LBA
+	  - EINVAL	nframes not in range [1 75]
+	  - ENXIO	drive has no queue (probably means invalid fd)
+	  - ENOMEM	out of memory
+
+
+CDROMEJECT_SW
+	enable(1)/disable(0) auto-ejecting
+
+
+	usage::
+
+	  int val;
+
+	  ioctl(fd, CDROMEJECT_SW, val);
+
+	inputs:
+		Flag specifying auto-eject flag.
+
+
+	outputs:
+		none
+
+
+	error return:
+	  - ENOSYS	Drive is not capable of ejecting.
+	  - EBUSY	Door is locked
+
+
+
+
+CDROMMULTISESSION
+	Obtain the start-of-last-session address of multi session disks
+
+	(struct cdrom_multisession)
+
+	usage::
+
+	  struct cdrom_multisession ms_info;
+
+	  ioctl(fd, CDROMMULTISESSION, &ms_info);
+
+	inputs:
+		cdrom_multisession structure containing desired
+
+	  format.
+
+	outputs:
+		cdrom_multisession structure is filled with last_session
+		information.
+
+	error return:
+	  - EINVAL	format not CDROM_MSF or CDROM_LBA
+
+
+CDROM_GET_MCN
+	Obtain the "Universal Product Code"
+	if available
+
+	(struct cdrom_mcn)
+
+
+	usage::
+
+	  struct cdrom_mcn mcn;
+
+	  ioctl(fd, CDROM_GET_MCN, &mcn);
+
+	inputs:
+		none
+
+
+	outputs:
+		Universal Product Code
+
+
+	error return:
+	  - ENOSYS	Drive is not capable of reading MCN data.
+
+	notes:
+		- Source code comments state::
+
+		    The following function is implemented, although very few
+		    audio discs give Universal Product Code information, which
+		    should just be the Medium Catalog Number on the box.  Note,
+		    that the way the code is written on the CD is /not/ uniform
+		    across all discs!
+
+
+
+
+CDROM_GET_UPC
+	CDROM_GET_MCN  (deprecated)
+
+
+	Not implemented, as of 2.6.8.1
+
+
+
+CDROMRESET
+	hard-reset the drive
+
+
+	usage::
+
+	  ioctl(fd, CDROMRESET, 0);
+
+
+	inputs:
+		none
+
+
+	outputs:
+		none
+
+
+	error return:
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - ENOSYS	Drive is not capable of resetting.
+
+
+
+
+CDROMREADCOOKED
+	read data in cooked mode
+
+
+	usage::
+
+	  u8 buffer[CD_FRAMESIZE]
+
+	  ioctl(fd, CDROMREADCOOKED, buffer);
+
+	inputs:
+		none
+
+
+	outputs:
+		2048 bytes of data, "cooked" mode.
+
+
+	notes:
+		Not implemented on all drives.
+
+
+
+
+
+CDROMREADALL
+	read all 2646 bytes
+
+
+	Same as CDROMREADCOOKED, but reads 2646 bytes.
+
+
+
+CDROMSEEK
+	seek msf address
+
+
+	usage::
+
+	  struct cdrom_msf msf;
+
+	  ioctl(fd, CDROMSEEK, &msf);
+
+	inputs:
+		MSF address to seek to.
+
+
+	outputs:
+		none
+
+
+
+
+CDROMPLAYBLK
+	scsi-cd only
+
+	(struct cdrom_blk)
+
+
+	usage::
+
+	  struct cdrom_blk blk;
+
+	  ioctl(fd, CDROMPLAYBLK, &blk);
+
+	inputs:
+		Region to play
+
+
+	outputs:
+		none
+
+
+
+
+CDROMGETSPINDOWN
+	usage::
+
+	  char spindown;
+
+	  ioctl(fd, CDROMGETSPINDOWN, &spindown);
+
+	inputs:
+		none
+
+
+	outputs:
+		The value of the current 4-bit spindown value.
+
+
+
+
+
+CDROMSETSPINDOWN
+	usage::
+
+	  char spindown
+
+	  ioctl(fd, CDROMSETSPINDOWN, &spindown);
+
+	inputs:
+		4-bit value used to control spindown (TODO: more detail here)
+
+
+	outputs:
+		none
+
+
+
+
+
+
+CDROM_SET_OPTIONS
+	Set behavior options
+
+
+	usage::
+
+	  int options;
+
+	  ioctl(fd, CDROM_SET_OPTIONS, options);
+
+	inputs:
+		New values for drive options.  The logical 'or' of:
+
+	    ==============      ==================================
+	    CDO_AUTO_CLOSE	close tray on first open(2)
+	    CDO_AUTO_EJECT	open tray on last release
+	    CDO_USE_FFLAGS	use O_NONBLOCK information on open
+	    CDO_LOCK		lock tray on open files
+	    CDO_CHECK_TYPE	check type on open for data
+	    ==============      ==================================
+
+	outputs:
+		Returns the resulting options settings in the
+		ioctl return value.  Returns -1 on error.
+
+	error return:
+	  - ENOSYS	selected option(s) not supported by drive.
+
+
+
+
+CDROM_CLEAR_OPTIONS
+	Clear behavior options
+
+
+	Same as CDROM_SET_OPTIONS, except that selected options are
+	turned off.
+
+
+
+CDROM_SELECT_SPEED
+	Set the CD-ROM speed
+
+
+	usage::
+
+	  int speed;
+
+	  ioctl(fd, CDROM_SELECT_SPEED, speed);
+
+	inputs:
+		New drive speed.
+
+
+	outputs:
+		none
+
+
+	error return:
+	  - ENOSYS	speed selection not supported by drive.
+
+
+
+CDROM_SELECT_DISC
+	Select disc (for juke-boxes)
+
+
+	usage::
+
+	  int disk;
+
+	  ioctl(fd, CDROM_SELECT_DISC, disk);
+
+	inputs:
+		Disk to load into drive.
+
+
+	outputs:
+		none
+
+
+	error return:
+	  - EINVAL	Disk number beyond capacity of drive
+
+
+
+CDROM_MEDIA_CHANGED
+	Check is media changed
+
+
+	usage::
+
+	  int slot;
+
+	  ioctl(fd, CDROM_MEDIA_CHANGED, slot);
+
+	inputs:
+		Slot number to be tested, always zero except for jukeboxes.
+
+		May also be special values CDSL_NONE or CDSL_CURRENT
+
+	outputs:
+		Ioctl return value is 0 or 1 depending on whether the media
+
+	  has been changed, or -1 on error.
+
+	error returns:
+	  - ENOSYS	Drive can't detect media change
+	  - EINVAL	Slot number beyond capacity of drive
+	  - ENOMEM	Out of memory
+
+
+
+CDROM_DRIVE_STATUS
+	Get tray position, etc.
+
+
+	usage::
+
+	  int slot;
+
+	  ioctl(fd, CDROM_DRIVE_STATUS, slot);
+
+	inputs:
+		Slot number to be tested, always zero except for jukeboxes.
+
+		May also be special values CDSL_NONE or CDSL_CURRENT
+
+	outputs:
+		Ioctl return value will be one of the following values
+
+	  from <linux/cdrom.h>:
+
+	    =================== ==========================
+	    CDS_NO_INFO		Information not available.
+	    CDS_NO_DISC
+	    CDS_TRAY_OPEN
+	    CDS_DRIVE_NOT_READY
+	    CDS_DISC_OK
+	    -1			error
+	    =================== ==========================
+
+	error returns:
+	  - ENOSYS	Drive can't detect drive status
+	  - EINVAL	Slot number beyond capacity of drive
+	  - ENOMEM	Out of memory
+
+
+
+
+CDROM_DISC_STATUS
+	Get disc type, etc.
+
+
+	usage::
+
+	  ioctl(fd, CDROM_DISC_STATUS, 0);
+
+
+	inputs:
+		none
+
+
+	outputs:
+		Ioctl return value will be one of the following values
+
+	  from <linux/cdrom.h>:
+
+	    - CDS_NO_INFO
+	    - CDS_AUDIO
+	    - CDS_MIXED
+	    - CDS_XA_2_2
+	    - CDS_XA_2_1
+	    - CDS_DATA_1
+
+	error returns:
+		none at present
+
+	notes:
+	    - Source code comments state::
+
+
+		Ok, this is where problems start.  The current interface for
+		the CDROM_DISC_STATUS ioctl is flawed.  It makes the false
+		assumption that CDs are all CDS_DATA_1 or all CDS_AUDIO, etc.
+		Unfortunately, while this is often the case, it is also
+		very common for CDs to have some tracks with data, and some
+		tracks with audio.	Just because I feel like it, I declare
+		the following to be the best way to cope.  If the CD has
+		ANY data tracks on it, it will be returned as a data CD.
+		If it has any XA tracks, I will return it as that.	Now I
+		could simplify this interface by combining these returns with
+		the above, but this more clearly demonstrates the problem
+		with the current interface.  Too bad this wasn't designed
+		to use bitmasks...	       -Erik
+
+		Well, now we have the option CDS_MIXED: a mixed-type CD.
+		User level programmers might feel the ioctl is not very
+		useful.
+				---david
+
+
+
+
+CDROM_CHANGER_NSLOTS
+	Get number of slots
+
+
+	usage::
+
+	  ioctl(fd, CDROM_CHANGER_NSLOTS, 0);
+
+
+	inputs:
+		none
+
+
+	outputs:
+		The ioctl return value will be the number of slots in a
+		CD changer.  Typically 1 for non-multi-disk devices.
+
+	error returns:
+		none
+
+
+
+CDROM_LOCKDOOR
+	lock or unlock door
+
+
+	usage::
+
+	  int lock;
+
+	  ioctl(fd, CDROM_LOCKDOOR, lock);
+
+	inputs:
+		Door lock flag, 1=lock, 0=unlock
+
+
+	outputs:
+		none
+
+
+	error returns:
+	  - EDRIVE_CANT_DO_THIS
+
+				Door lock function not supported.
+	  - EBUSY
+
+				Attempt to unlock when multiple users
+				have the drive open and not CAP_SYS_ADMIN
+
+	notes:
+		As of 2.6.8.1, the lock flag is a global lock, meaning that
+		all CD drives will be locked or unlocked together.  This is
+		probably a bug.
+
+		The EDRIVE_CANT_DO_THIS value is defined in <linux/cdrom.h>
+		and is currently (2.6.8.1) the same as EOPNOTSUPP
+
+
+
+CDROM_DEBUG
+	Turn debug messages on/off
+
+
+	usage::
+
+	  int debug;
+
+	  ioctl(fd, CDROM_DEBUG, debug);
+
+	inputs:
+		Cdrom debug flag, 0=disable, 1=enable
+
+
+	outputs:
+		The ioctl return value will be the new debug flag.
+
+
+	error return:
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+
+
+
+CDROM_GET_CAPABILITY
+	get capabilities
+
+
+	usage::
+
+	  ioctl(fd, CDROM_GET_CAPABILITY, 0);
+
+
+	inputs:
+		none
+
+
+	outputs:
+		The ioctl return value is the current device capability
+		flags.  See CDC_CLOSE_TRAY, CDC_OPEN_TRAY, etc.
+
+
+
+CDROMAUDIOBUFSIZ
+	set the audio buffer size
+
+
+	usage::
+
+	  int arg;
+
+	  ioctl(fd, CDROMAUDIOBUFSIZ, val);
+
+	inputs:
+		New audio buffer size
+
+
+	outputs:
+		The ioctl return value is the new audio buffer size, or -1
+		on error.
+
+	error return:
+	  - ENOSYS	Not supported by this driver.
+
+	notes:
+		Not supported by all drivers.
+
+
+
+
+DVD_READ_STRUCT			Read structure
+
+	usage::
+
+	  dvd_struct s;
+
+	  ioctl(fd, DVD_READ_STRUCT, &s);
+
+	inputs:
+		dvd_struct structure, containing:
+
+	    =================== ==========================================
+	    type		specifies the information desired, one of
+				DVD_STRUCT_PHYSICAL, DVD_STRUCT_COPYRIGHT,
+				DVD_STRUCT_DISCKEY, DVD_STRUCT_BCA,
+				DVD_STRUCT_MANUFACT
+	    physical.layer_num	desired layer, indexed from 0
+	    copyright.layer_num	desired layer, indexed from 0
+	    disckey.agid
+	    =================== ==========================================
+
+	outputs:
+		dvd_struct structure, containing:
+
+	    =================== ================================
+	    physical		for type == DVD_STRUCT_PHYSICAL
+	    copyright		for type == DVD_STRUCT_COPYRIGHT
+	    disckey.value	for type == DVD_STRUCT_DISCKEY
+	    bca.{len,value}	for type == DVD_STRUCT_BCA
+	    manufact.{len,valu}	for type == DVD_STRUCT_MANUFACT
+	    =================== ================================
+
+	error returns:
+	  - EINVAL	physical.layer_num exceeds number of layers
+	  - EIO		Received invalid response from drive
+
+
+
+DVD_WRITE_STRUCT		Write structure
+
+	Not implemented, as of 2.6.8.1
+
+
+
+DVD_AUTH			Authentication
+
+	usage::
+
+	  dvd_authinfo ai;
+
+	  ioctl(fd, DVD_AUTH, &ai);
+
+	inputs:
+		dvd_authinfo structure.  See <linux/cdrom.h>
+
+
+	outputs:
+		dvd_authinfo structure.
+
+
+	error return:
+	  - ENOTTY	ai.type not recognized.
+
+
+
+CDROM_SEND_PACKET
+	send a packet to the drive
+
+
+	usage::
+
+	  struct cdrom_generic_command cgc;
+
+	  ioctl(fd, CDROM_SEND_PACKET, &cgc);
+
+	inputs:
+		cdrom_generic_command structure containing the packet to send.
+
+
+	outputs:
+		none
+
+	  cdrom_generic_command structure containing results.
+
+	error return:
+	  - EIO
+
+			command failed.
+	  - EPERM
+
+			Operation not permitted, either because a
+			write command was attempted on a drive which
+			is opened read-only, or because the command
+			requires CAP_SYS_RAWIO
+	  - EINVAL
+
+			cgc.data_direction not set
+
+
+
+CDROM_NEXT_WRITABLE
+	get next writable block
+
+
+	usage::
+
+	  long next;
+
+	  ioctl(fd, CDROM_NEXT_WRITABLE, &next);
+
+	inputs:
+		none
+
+
+	outputs:
+		The next writable block.
+
+
+	notes:
+		If the device does not support this ioctl directly, the
+
+	  ioctl will return CDROM_LAST_WRITTEN + 7.
+
+
+
+CDROM_LAST_WRITTEN
+	get last block written on disc
+
+
+	usage::
+
+	  long last;
+
+	  ioctl(fd, CDROM_LAST_WRITTEN, &last);
+
+	inputs:
+		none
+
+
+	outputs:
+		The last block written on disc
+
+
+	notes:
+		If the device does not support this ioctl directly, the
+		result is derived from the disc's table of contents.  If the
+		table of contents can't be read, this ioctl returns an
+		error.
diff --git a/Documentation/ioctl/cdrom.txt b/Documentation/ioctl/cdrom.txt
deleted file mode 100644
index a4d62a9d6771..000000000000
--- a/Documentation/ioctl/cdrom.txt
+++ /dev/null
@@ -1,967 +0,0 @@
-		Summary of CDROM ioctl calls.
-		============================
-
-		Edward A. Falk <efalk@google.com>
-
-		November, 2004
-
-This document attempts to describe the ioctl(2) calls supported by
-the CDROM layer.  These are by-and-large implemented (as of Linux 2.6)
-in drivers/cdrom/cdrom.c and drivers/block/scsi_ioctl.c
-
-ioctl values are listed in <linux/cdrom.h>.  As of this writing, they
-are as follows:
-
-	CDROMPAUSE		Pause Audio Operation
-	CDROMRESUME		Resume paused Audio Operation
-	CDROMPLAYMSF		Play Audio MSF (struct cdrom_msf)
-	CDROMPLAYTRKIND		Play Audio Track/index (struct cdrom_ti)
-	CDROMREADTOCHDR		Read TOC header (struct cdrom_tochdr)
-	CDROMREADTOCENTRY	Read TOC entry (struct cdrom_tocentry)
-	CDROMSTOP		Stop the cdrom drive
-	CDROMSTART		Start the cdrom drive
-	CDROMEJECT		Ejects the cdrom media
-	CDROMVOLCTRL		Control output volume (struct cdrom_volctrl)
-	CDROMSUBCHNL		Read subchannel data (struct cdrom_subchnl)
-	CDROMREADMODE2		Read CDROM mode 2 data (2336 Bytes)
-					   (struct cdrom_read)
-	CDROMREADMODE1		Read CDROM mode 1 data (2048 Bytes)
-					   (struct cdrom_read)
-	CDROMREADAUDIO		(struct cdrom_read_audio)
-	CDROMEJECT_SW		enable(1)/disable(0) auto-ejecting
-	CDROMMULTISESSION	Obtain the start-of-last-session
-				  address of multi session disks
-				  (struct cdrom_multisession)
-	CDROM_GET_MCN		Obtain the "Universal Product Code"
-				   if available (struct cdrom_mcn)
-	CDROM_GET_UPC		Deprecated, use CDROM_GET_MCN instead.
-	CDROMRESET		hard-reset the drive
-	CDROMVOLREAD		Get the drive's volume setting
-					  (struct cdrom_volctrl)
-	CDROMREADRAW		read data in raw mode (2352 Bytes)
-					   (struct cdrom_read)
-	CDROMREADCOOKED		read data in cooked mode
-	CDROMSEEK		seek msf address
-	CDROMPLAYBLK		scsi-cd only, (struct cdrom_blk)
-	CDROMREADALL		read all 2646 bytes
-	CDROMGETSPINDOWN	return 4-bit spindown value
-	CDROMSETSPINDOWN	set 4-bit spindown value
-	CDROMCLOSETRAY		pendant of CDROMEJECT
-	CDROM_SET_OPTIONS	Set behavior options
-	CDROM_CLEAR_OPTIONS	Clear behavior options
-	CDROM_SELECT_SPEED	Set the CD-ROM speed
-	CDROM_SELECT_DISC	Select disc (for juke-boxes)
-	CDROM_MEDIA_CHANGED	Check is media changed
-	CDROM_DRIVE_STATUS	Get tray position, etc.
-	CDROM_DISC_STATUS	Get disc type, etc.
-	CDROM_CHANGER_NSLOTS	Get number of slots
-	CDROM_LOCKDOOR		lock or unlock door
-	CDROM_DEBUG		Turn debug messages on/off
-	CDROM_GET_CAPABILITY	get capabilities
-	CDROMAUDIOBUFSIZ	set the audio buffer size
-	DVD_READ_STRUCT		Read structure
-	DVD_WRITE_STRUCT	Write structure
-	DVD_AUTH		Authentication
-	CDROM_SEND_PACKET	send a packet to the drive
-	CDROM_NEXT_WRITABLE	get next writable block
-	CDROM_LAST_WRITTEN	get last block written on disc
-
-
-The information that follows was determined from reading kernel source
-code.  It is likely that some corrections will be made over time.
-
-
-
-
-
-
-
-General:
-
-	Unless otherwise specified, all ioctl calls return 0 on success
-	and -1 with errno set to an appropriate value on error.  (Some
-	ioctls return non-negative data values.)
-
-	Unless otherwise specified, all ioctl calls return -1 and set
-	errno to EFAULT on a failed attempt to copy data to or from user
-	address space.
-
-	Individual drivers may return error codes not listed here.
-
-	Unless otherwise specified, all data structures and constants
-	are defined in <linux/cdrom.h>
-
-
-
-
-CDROMPAUSE			Pause Audio Operation
-
-	usage:
-
-	  ioctl(fd, CDROMPAUSE, 0);
-
-	inputs:		none
-
-	outputs:	none
-
-	error return:
-	  ENOSYS	cd drive not audio-capable.
-
-
-CDROMRESUME			Resume paused Audio Operation
-
-	usage:
-
-	  ioctl(fd, CDROMRESUME, 0);
-
-	inputs:		none
-
-	outputs:	none
-
-	error return:
-	  ENOSYS	cd drive not audio-capable.
-
-
-CDROMPLAYMSF			Play Audio MSF (struct cdrom_msf)
-
-	usage:
-
-	  struct cdrom_msf msf;
-	  ioctl(fd, CDROMPLAYMSF, &msf);
-
-	inputs:
-	  cdrom_msf structure, describing a segment of music to play
-
-	outputs:	none
-
-	error return:
-	  ENOSYS	cd drive not audio-capable.
-
-	notes:
-	  MSF stands for minutes-seconds-frames
-	  LBA stands for logical block address
-
-	  Segment is described as start and end times, where each time
-	  is described as minutes:seconds:frames.  A frame is 1/75 of
-	  a second.
-
-
-CDROMPLAYTRKIND			Play Audio Track/index (struct cdrom_ti)
-
-	usage:
-
-	  struct cdrom_ti ti;
-	  ioctl(fd, CDROMPLAYTRKIND, &ti);
-
-	inputs:
-	  cdrom_ti structure, describing a segment of music to play
-
-	outputs:	none
-
-	error return:
-	  ENOSYS	cd drive not audio-capable.
-
-	notes:
-	  Segment is described as start and end times, where each time
-	  is described as a track and an index.
-
-
-
-CDROMREADTOCHDR			Read TOC header (struct cdrom_tochdr)
-
-	usage:
-
-	  cdrom_tochdr header;
-	  ioctl(fd, CDROMREADTOCHDR, &header);
-
-	inputs:
-	  cdrom_tochdr structure
-
-	outputs:
-	  cdrom_tochdr structure
-
-	error return:
-	  ENOSYS	cd drive not audio-capable.
-
-
-
-CDROMREADTOCENTRY		Read TOC entry (struct cdrom_tocentry)
-
-	usage:
-
-	  struct cdrom_tocentry entry;
-	  ioctl(fd, CDROMREADTOCENTRY, &entry);
-
-	inputs:
-	  cdrom_tocentry structure
-
-	outputs:
-	  cdrom_tocentry structure
-
-	error return:
-	  ENOSYS	cd drive not audio-capable.
-	  EINVAL	entry.cdte_format not CDROM_MSF or CDROM_LBA
-	  EINVAL	requested track out of bounds
-	  EIO		I/O error reading TOC
-
-	notes:
-	  TOC stands for Table Of Contents
-	  MSF stands for minutes-seconds-frames
-	  LBA stands for logical block address
-
-
-
-CDROMSTOP			Stop the cdrom drive
-
-	usage:
-
-	  ioctl(fd, CDROMSTOP, 0);
-
-	inputs:		none
-
-	outputs:	none
-
-	error return:
-	  ENOSYS	cd drive not audio-capable.
-
-	notes:
-	  Exact interpretation of this ioctl depends on the device,
-	  but most seem to spin the drive down.
-
-
-CDROMSTART			Start the cdrom drive
-
-	usage:
-
-	  ioctl(fd, CDROMSTART, 0);
-
-	inputs:		none
-
-	outputs:	none
-
-	error return:
-	  ENOSYS	cd drive not audio-capable.
-
-	notes:
-	  Exact interpretation of this ioctl depends on the device,
-	  but most seem to spin the drive up and/or close the tray.
-	  Other devices ignore the ioctl completely.
-
-
-CDROMEJECT			Ejects the cdrom media
-
-	usage:
-
-	  ioctl(fd, CDROMEJECT, 0);
-
-	inputs:		none
-
-	outputs:	none
-
-	error returns:
-	  ENOSYS	cd drive not capable of ejecting
-	  EBUSY		other processes are accessing drive, or door is locked
-
-	notes:
-	  See CDROM_LOCKDOOR, below.
-
-
-
-CDROMCLOSETRAY			pendant of CDROMEJECT
-
-	usage:
-
-	  ioctl(fd, CDROMCLOSETRAY, 0);
-
-	inputs:		none
-
-	outputs:	none
-
-	error returns:
-	  ENOSYS	cd drive not capable of closing the tray
-	  EBUSY		other processes are accessing drive, or door is locked
-
-	notes:
-	  See CDROM_LOCKDOOR, below.
-
-
-
-CDROMVOLCTRL			Control output volume (struct cdrom_volctrl)
-
-	usage:
-
-	  struct cdrom_volctrl volume;
-	  ioctl(fd, CDROMVOLCTRL, &volume);
-
-	inputs:
-	  cdrom_volctrl structure containing volumes for up to 4
-	  channels.
-
-	outputs:	none
-
-	error return:
-	  ENOSYS	cd drive not audio-capable.
-
-
-
-CDROMVOLREAD			Get the drive's volume setting
-					  (struct cdrom_volctrl)
-
-	usage:
-
-	  struct cdrom_volctrl volume;
-	  ioctl(fd, CDROMVOLREAD, &volume);
-
-	inputs:		none
-
-	outputs:
-	  The current volume settings.
-
-	error return:
-	  ENOSYS	cd drive not audio-capable.
-
-
-
-CDROMSUBCHNL			Read subchannel data (struct cdrom_subchnl)
-
-	usage:
-
-	  struct cdrom_subchnl q;
-	  ioctl(fd, CDROMSUBCHNL, &q);
-
-	inputs:
-	  cdrom_subchnl structure
-
-	outputs:
-	  cdrom_subchnl structure
-
-	error return:
-	  ENOSYS	cd drive not audio-capable.
-	  EINVAL	format not CDROM_MSF or CDROM_LBA
-
-	notes:
-	  Format is converted to CDROM_MSF or CDROM_LBA
-	  as per user request on return
-
-
-
-CDROMREADRAW			read data in raw mode (2352 Bytes)
-					   (struct cdrom_read)
-
-	usage:
-
-	  union {
-	    struct cdrom_msf msf;		/* input */
-	    char buffer[CD_FRAMESIZE_RAW];	/* return */
-	  } arg;
-	  ioctl(fd, CDROMREADRAW, &arg);
-
-	inputs:
-	  cdrom_msf structure indicating an address to read.
-	  Only the start values are significant.
-
-	outputs:
-	  Data written to address provided by user.
-
-	error return:
-	  EINVAL	address less than 0, or msf less than 0:2:0
-	  ENOMEM	out of memory
-
-	notes:
-	  As of 2.6.8.1, comments in <linux/cdrom.h> indicate that this
-	  ioctl accepts a cdrom_read structure, but actual source code
-	  reads a cdrom_msf structure and writes a buffer of data to
-	  the same address.
-
-	  MSF values are converted to LBA values via this formula:
-
-	    lba = (((m * CD_SECS) + s) * CD_FRAMES + f) - CD_MSF_OFFSET;
-
-
-
-
-CDROMREADMODE1			Read CDROM mode 1 data (2048 Bytes)
-					   (struct cdrom_read)
-
-	notes:
-	  Identical to CDROMREADRAW except that block size is
-	  CD_FRAMESIZE (2048) bytes
-
-
-
-CDROMREADMODE2			Read CDROM mode 2 data (2336 Bytes)
-					   (struct cdrom_read)
-
-	notes:
-	  Identical to CDROMREADRAW except that block size is
-	  CD_FRAMESIZE_RAW0 (2336) bytes
-
-
-
-CDROMREADAUDIO			(struct cdrom_read_audio)
-
-	usage:
-
-	  struct cdrom_read_audio ra;
-	  ioctl(fd, CDROMREADAUDIO, &ra);
-
-	inputs:
-	  cdrom_read_audio structure containing read start
-	  point and length
-
-	outputs:
-	  audio data, returned to buffer indicated by ra
-
-	error return:
-	  EINVAL	format not CDROM_MSF or CDROM_LBA
-	  EINVAL	nframes not in range [1 75]
-	  ENXIO		drive has no queue (probably means invalid fd)
-	  ENOMEM	out of memory
-
-
-CDROMEJECT_SW			enable(1)/disable(0) auto-ejecting
-
-	usage:
-
-	  int val;
-	  ioctl(fd, CDROMEJECT_SW, val);
-
-	inputs:
-	  Flag specifying auto-eject flag.
-
-	outputs:	none
-
-	error return:
-	  ENOSYS	Drive is not capable of ejecting.
-	  EBUSY		Door is locked
-
-
-
-
-CDROMMULTISESSION		Obtain the start-of-last-session
-				  address of multi session disks
-				  (struct cdrom_multisession)
-	usage:
-
-	  struct cdrom_multisession ms_info;
-	  ioctl(fd, CDROMMULTISESSION, &ms_info);
-
-	inputs:
-	  cdrom_multisession structure containing desired
-	  format.
-
-	outputs:
-	  cdrom_multisession structure is filled with last_session
-	  information.
-
-	error return:
-	  EINVAL	format not CDROM_MSF or CDROM_LBA
-
-
-CDROM_GET_MCN			Obtain the "Universal Product Code"
-				   if available (struct cdrom_mcn)
-
-	usage:
-
-	  struct cdrom_mcn mcn;
-	  ioctl(fd, CDROM_GET_MCN, &mcn);
-
-	inputs:		none
-
-	outputs:
-	  Universal Product Code
-
-	error return:
-	  ENOSYS	Drive is not capable of reading MCN data.
-
-	notes:
-	  Source code comments state:
-
-	    The following function is implemented, although very few
-	    audio discs give Universal Product Code information, which
-	    should just be the Medium Catalog Number on the box.  Note,
-	    that the way the code is written on the CD is /not/ uniform
-	    across all discs!
-
-
-
-
-CDROM_GET_UPC			CDROM_GET_MCN  (deprecated)
-
-	Not implemented, as of 2.6.8.1
-
-
-
-CDROMRESET			hard-reset the drive
-
-	usage:
-
-	  ioctl(fd, CDROMRESET, 0);
-
-	inputs:		none
-
-	outputs:	none
-
-	error return:
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  ENOSYS	Drive is not capable of resetting.
-
-
-
-
-CDROMREADCOOKED			read data in cooked mode
-
-	usage:
-
-	  u8 buffer[CD_FRAMESIZE]
-	  ioctl(fd, CDROMREADCOOKED, buffer);
-
-	inputs:		none
-
-	outputs:
-	  2048 bytes of data, "cooked" mode.
-
-	notes:
-	  Not implemented on all drives.
-
-
-
-
-CDROMREADALL			read all 2646 bytes
-
-	Same as CDROMREADCOOKED, but reads 2646 bytes.
-
-
-
-CDROMSEEK			seek msf address
-
-	usage:
-
-	  struct cdrom_msf msf;
-	  ioctl(fd, CDROMSEEK, &msf);
-
-	inputs:
-	  MSF address to seek to.
-
-	outputs:	none
-
-
-
-CDROMPLAYBLK			scsi-cd only, (struct cdrom_blk)
-
-	usage:
-
-	  struct cdrom_blk blk;
-	  ioctl(fd, CDROMPLAYBLK, &blk);
-
-	inputs:
-	  Region to play
-
-	outputs:	none
-
-
-
-CDROMGETSPINDOWN
-
-	usage:
-
-	  char spindown;
-	  ioctl(fd, CDROMGETSPINDOWN, &spindown);
-
-	inputs:		none
-
-	outputs:
-	  The value of the current 4-bit spindown value.
-
-
-
-
-CDROMSETSPINDOWN
-
-	usage:
-
-	  char spindown
-	  ioctl(fd, CDROMSETSPINDOWN, &spindown);
-
-	inputs:
-	  4-bit value used to control spindown (TODO: more detail here)
-
-	outputs:	none
-
-
-
-
-
-CDROM_SET_OPTIONS		Set behavior options
-
-	usage:
-
-	  int options;
-	  ioctl(fd, CDROM_SET_OPTIONS, options);
-
-	inputs:
-	  New values for drive options.  The logical 'or' of:
-	    CDO_AUTO_CLOSE	close tray on first open(2)
-	    CDO_AUTO_EJECT	open tray on last release
-	    CDO_USE_FFLAGS	use O_NONBLOCK information on open
-	    CDO_LOCK		lock tray on open files
-	    CDO_CHECK_TYPE	check type on open for data
-
-	outputs:
-	  Returns the resulting options settings in the
-	  ioctl return value.  Returns -1 on error.
-
-	error return:
-	  ENOSYS	selected option(s) not supported by drive.
-
-
-
-
-CDROM_CLEAR_OPTIONS		Clear behavior options
-
-	Same as CDROM_SET_OPTIONS, except that selected options are
-	turned off.
-
-
-
-CDROM_SELECT_SPEED		Set the CD-ROM speed
-
-	usage:
-
-	  int speed;
-	  ioctl(fd, CDROM_SELECT_SPEED, speed);
-
-	inputs:
-	  New drive speed.
-
-	outputs:	none
-
-	error return:
-	  ENOSYS	speed selection not supported by drive.
-
-
-
-CDROM_SELECT_DISC		Select disc (for juke-boxes)
-
-	usage:
-
-	  int disk;
-	  ioctl(fd, CDROM_SELECT_DISC, disk);
-
-	inputs:
-	  Disk to load into drive.
-
-	outputs:	none
-
-	error return:
-	  EINVAL	Disk number beyond capacity of drive
-
-
-
-CDROM_MEDIA_CHANGED		Check is media changed
-
-	usage:
-
-	  int slot;
-	  ioctl(fd, CDROM_MEDIA_CHANGED, slot);
-
-	inputs:
-	  Slot number to be tested, always zero except for jukeboxes.
-	  May also be special values CDSL_NONE or CDSL_CURRENT
-
-	outputs:
-	  Ioctl return value is 0 or 1 depending on whether the media
-	  has been changed, or -1 on error.
-
-	error returns:
-	  ENOSYS	Drive can't detect media change
-	  EINVAL	Slot number beyond capacity of drive
-	  ENOMEM	Out of memory
-
-
-
-CDROM_DRIVE_STATUS		Get tray position, etc.
-
-	usage:
-
-	  int slot;
-	  ioctl(fd, CDROM_DRIVE_STATUS, slot);
-
-	inputs:
-	  Slot number to be tested, always zero except for jukeboxes.
-	  May also be special values CDSL_NONE or CDSL_CURRENT
-
-	outputs:
-	  Ioctl return value will be one of the following values
-	  from <linux/cdrom.h>:
-
-	    CDS_NO_INFO		Information not available.
-	    CDS_NO_DISC
-	    CDS_TRAY_OPEN
-	    CDS_DRIVE_NOT_READY
-	    CDS_DISC_OK
-	    -1			error
-
-	error returns:
-	  ENOSYS	Drive can't detect drive status
-	  EINVAL	Slot number beyond capacity of drive
-	  ENOMEM	Out of memory
-
-
-
-
-CDROM_DISC_STATUS		Get disc type, etc.
-
-	usage:
-
-	  ioctl(fd, CDROM_DISC_STATUS, 0);
-
-	inputs:		none
-
-	outputs:
-	  Ioctl return value will be one of the following values
-	  from <linux/cdrom.h>:
-	    CDS_NO_INFO
-	    CDS_AUDIO
-	    CDS_MIXED
-	    CDS_XA_2_2
-	    CDS_XA_2_1
-	    CDS_DATA_1
-
-	error returns:	none at present
-
-	notes:
-	  Source code comments state:
-
-	    Ok, this is where problems start.  The current interface for
-	    the CDROM_DISC_STATUS ioctl is flawed.  It makes the false
-	    assumption that CDs are all CDS_DATA_1 or all CDS_AUDIO, etc.
-	    Unfortunately, while this is often the case, it is also
-	    very common for CDs to have some tracks with data, and some
-	    tracks with audio.	Just because I feel like it, I declare
-	    the following to be the best way to cope.  If the CD has
-	    ANY data tracks on it, it will be returned as a data CD.
-	    If it has any XA tracks, I will return it as that.	Now I
-	    could simplify this interface by combining these returns with
-	    the above, but this more clearly demonstrates the problem
-	    with the current interface.  Too bad this wasn't designed
-	    to use bitmasks...	       -Erik
-
-	    Well, now we have the option CDS_MIXED: a mixed-type CD.
-	    User level programmers might feel the ioctl is not very
-	    useful.
-			---david
-
-
-
-
-CDROM_CHANGER_NSLOTS		Get number of slots
-
-	usage:
-
-	  ioctl(fd, CDROM_CHANGER_NSLOTS, 0);
-
-	inputs:		none
-
-	outputs:
-	  The ioctl return value will be the number of slots in a
-	  CD changer.  Typically 1 for non-multi-disk devices.
-
-	error returns:	none
-
-
-
-CDROM_LOCKDOOR			lock or unlock door
-
-	usage:
-
-	  int lock;
-	  ioctl(fd, CDROM_LOCKDOOR, lock);
-
-	inputs:
-	  Door lock flag, 1=lock, 0=unlock
-
-	outputs:	none
-
-	error returns:
-	  EDRIVE_CANT_DO_THIS	Door lock function not supported.
-	  EBUSY			Attempt to unlock when multiple users
-	  			have the drive open and not CAP_SYS_ADMIN
-
-	notes:
-	  As of 2.6.8.1, the lock flag is a global lock, meaning that
-	  all CD drives will be locked or unlocked together.  This is
-	  probably a bug.
-
-	  The EDRIVE_CANT_DO_THIS value is defined in <linux/cdrom.h>
-	  and is currently (2.6.8.1) the same as EOPNOTSUPP
-
-
-
-CDROM_DEBUG			Turn debug messages on/off
-
-	usage:
-
-	  int debug;
-	  ioctl(fd, CDROM_DEBUG, debug);
-
-	inputs:
-	  Cdrom debug flag, 0=disable, 1=enable
-
-	outputs:
-	  The ioctl return value will be the new debug flag.
-
-	error return:
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-
-
-
-CDROM_GET_CAPABILITY		get capabilities
-
-	usage:
-
-	  ioctl(fd, CDROM_GET_CAPABILITY, 0);
-
-	inputs:		none
-
-	outputs:
-	  The ioctl return value is the current device capability
-	  flags.  See CDC_CLOSE_TRAY, CDC_OPEN_TRAY, etc.
-
-
-
-CDROMAUDIOBUFSIZ		set the audio buffer size
-
-	usage:
-
-	  int arg;
-	  ioctl(fd, CDROMAUDIOBUFSIZ, val);
-
-	inputs:
-	  New audio buffer size
-
-	outputs:
-	  The ioctl return value is the new audio buffer size, or -1
-	  on error.
-
-	error return:
-	  ENOSYS	Not supported by this driver.
-
-	notes:
-	  Not supported by all drivers.
-
-
-
-DVD_READ_STRUCT			Read structure
-
-	usage:
-
-	  dvd_struct s;
-	  ioctl(fd, DVD_READ_STRUCT, &s);
-
-	inputs:
-	  dvd_struct structure, containing:
-	    type		specifies the information desired, one of
-	    			DVD_STRUCT_PHYSICAL, DVD_STRUCT_COPYRIGHT,
-				DVD_STRUCT_DISCKEY, DVD_STRUCT_BCA,
-				DVD_STRUCT_MANUFACT
-	    physical.layer_num	desired layer, indexed from 0
-	    copyright.layer_num	desired layer, indexed from 0
-	    disckey.agid
-
-	outputs:
-	  dvd_struct structure, containing:
-	    physical		for type == DVD_STRUCT_PHYSICAL
-	    copyright		for type == DVD_STRUCT_COPYRIGHT
-	    disckey.value	for type == DVD_STRUCT_DISCKEY
-	    bca.{len,value}	for type == DVD_STRUCT_BCA
-	    manufact.{len,valu}	for type == DVD_STRUCT_MANUFACT
-
-	error returns:
-	  EINVAL	physical.layer_num exceeds number of layers
-	  EIO		Received invalid response from drive
-
-
-
-DVD_WRITE_STRUCT		Write structure
-
-	Not implemented, as of 2.6.8.1
-
-
-
-DVD_AUTH			Authentication
-
-	usage:
-
-	  dvd_authinfo ai;
-	  ioctl(fd, DVD_AUTH, &ai);
-
-	inputs:
-	  dvd_authinfo structure.  See <linux/cdrom.h>
-
-	outputs:
-	  dvd_authinfo structure.
-
-	error return:
-	  ENOTTY	ai.type not recognized.
-
-
-
-CDROM_SEND_PACKET		send a packet to the drive
-
-	usage:
-
-	  struct cdrom_generic_command cgc;
-	  ioctl(fd, CDROM_SEND_PACKET, &cgc);
-
-	inputs:
-	  cdrom_generic_command structure containing the packet to send.
-
-	outputs:	none
-	  cdrom_generic_command structure containing results.
-
-	error return:
-	  EIO		command failed.
-	  EPERM		Operation not permitted, either because a
-			write command was attempted on a drive which
-			is opened read-only, or because the command
-			requires CAP_SYS_RAWIO
-	  EINVAL	cgc.data_direction not set
-
-
-
-CDROM_NEXT_WRITABLE		get next writable block
-
-	usage:
-
-	  long next;
-	  ioctl(fd, CDROM_NEXT_WRITABLE, &next);
-
-	inputs:		none
-
-	outputs:
-	  The next writable block.
-
-	notes:
-	  If the device does not support this ioctl directly, the
-	  ioctl will return CDROM_LAST_WRITTEN + 7.
-
-
-
-CDROM_LAST_WRITTEN		get last block written on disc
-
-	usage:
-
-	  long last;
-	  ioctl(fd, CDROM_LAST_WRITTEN, &last);
-
-	inputs:		none
-
-	outputs:
-	  The last block written on disc
-
-	notes:
-	  If the device does not support this ioctl directly, the
-	  result is derived from the disc's table of contents.  If the
-	  table of contents can't be read, this ioctl returns an
-	  error.
diff --git a/Documentation/ioctl/hdio.rst b/Documentation/ioctl/hdio.rst
new file mode 100644
index 000000000000..e822e3dff176
--- /dev/null
+++ b/Documentation/ioctl/hdio.rst
@@ -0,0 +1,1342 @@
+==============================
+Summary of `HDIO_` ioctl calls
+==============================
+
+- Edward A. Falk <efalk@google.com>
+
+November, 2004
+
+This document attempts to describe the ioctl(2) calls supported by
+the HD/IDE layer.  These are by-and-large implemented (as of Linux 2.6)
+in drivers/ide/ide.c and drivers/block/scsi_ioctl.c
+
+ioctl values are listed in <linux/hdreg.h>.  As of this writing, they
+are as follows:
+
+    ioctls that pass argument pointers to user space:
+
+	=======================	=======================================
+	HDIO_GETGEO		get device geometry
+	HDIO_GET_UNMASKINTR	get current unmask setting
+	HDIO_GET_MULTCOUNT	get current IDE blockmode setting
+	HDIO_GET_QDMA		get use-qdma flag
+	HDIO_SET_XFER		set transfer rate via proc
+	HDIO_OBSOLETE_IDENTITY	OBSOLETE, DO NOT USE
+	HDIO_GET_KEEPSETTINGS	get keep-settings-on-reset flag
+	HDIO_GET_32BIT		get current io_32bit setting
+	HDIO_GET_NOWERR		get ignore-write-error flag
+	HDIO_GET_DMA		get use-dma flag
+	HDIO_GET_NICE		get nice flags
+	HDIO_GET_IDENTITY	get IDE identification info
+	HDIO_GET_WCACHE		get write cache mode on|off
+	HDIO_GET_ACOUSTIC	get acoustic value
+	HDIO_GET_ADDRESS	get sector addressing mode
+	HDIO_GET_BUSSTATE	get the bus state of the hwif
+	HDIO_TRISTATE_HWIF	execute a channel tristate
+	HDIO_DRIVE_RESET	execute a device reset
+	HDIO_DRIVE_TASKFILE	execute raw taskfile
+	HDIO_DRIVE_TASK		execute task and special drive command
+	HDIO_DRIVE_CMD		execute a special drive command
+	HDIO_DRIVE_CMD_AEB	HDIO_DRIVE_TASK
+	=======================	=======================================
+
+    ioctls that pass non-pointer values:
+
+	=======================	=======================================
+	HDIO_SET_MULTCOUNT	change IDE blockmode
+	HDIO_SET_UNMASKINTR	permit other irqs during I/O
+	HDIO_SET_KEEPSETTINGS	keep ioctl settings on reset
+	HDIO_SET_32BIT		change io_32bit flags
+	HDIO_SET_NOWERR		change ignore-write-error flag
+	HDIO_SET_DMA		change use-dma flag
+	HDIO_SET_PIO_MODE	reconfig interface to new speed
+	HDIO_SCAN_HWIF		register and (re)scan interface
+	HDIO_SET_NICE		set nice flags
+	HDIO_UNREGISTER_HWIF	unregister interface
+	HDIO_SET_WCACHE		change write cache enable-disable
+	HDIO_SET_ACOUSTIC	change acoustic behavior
+	HDIO_SET_BUSSTATE	set the bus state of the hwif
+	HDIO_SET_QDMA		change use-qdma flag
+	HDIO_SET_ADDRESS	change lba addressing modes
+
+	HDIO_SET_IDE_SCSI	Set scsi emulation mode on/off
+	HDIO_SET_SCSI_IDE	not implemented yet
+	=======================	=======================================
+
+
+The information that follows was determined from reading kernel source
+code.  It is likely that some corrections will be made over time.
+
+------------------------------------------------------------------------------
+
+General:
+
+	Unless otherwise specified, all ioctl calls return 0 on success
+	and -1 with errno set to an appropriate value on error.
+
+	Unless otherwise specified, all ioctl calls return -1 and set
+	errno to EFAULT on a failed attempt to copy data to or from user
+	address space.
+
+	Unless otherwise specified, all data structures and constants
+	are defined in <linux/hdreg.h>
+
+------------------------------------------------------------------------------
+
+HDIO_GETGEO
+	get device geometry
+
+
+	usage::
+
+	  struct hd_geometry geom;
+
+	  ioctl(fd, HDIO_GETGEO, &geom);
+
+
+	inputs:
+		none
+
+
+
+	outputs:
+		hd_geometry structure containing:
+
+
+	    =========	==================================
+	    heads	number of heads
+	    sectors	number of sectors/track
+	    cylinders	number of cylinders, mod 65536
+	    start	starting sector of this partition.
+	    =========	==================================
+
+
+	error returns:
+	  - EINVAL
+
+			if the device is not a disk drive or floppy drive,
+			or if the user passes a null pointer
+
+
+	notes:
+		Not particularly useful with modern disk drives, whose geometry
+		is a polite fiction anyway.  Modern drives are addressed
+		purely by sector number nowadays (lba addressing), and the
+		drive geometry is an abstraction which is actually subject
+		to change.  Currently (as of Nov 2004), the geometry values
+		are the "bios" values -- presumably the values the drive had
+		when Linux first booted.
+
+		In addition, the cylinders field of the hd_geometry is an
+		unsigned short, meaning that on most architectures, this
+		ioctl will not return a meaningful value on drives with more
+		than 65535 tracks.
+
+		The start field is unsigned long, meaning that it will not
+		contain a meaningful value for disks over 219 Gb in size.
+
+
+
+
+HDIO_GET_UNMASKINTR
+	get current unmask setting
+
+
+	usage::
+
+	  long val;
+
+	  ioctl(fd, HDIO_GET_UNMASKINTR, &val);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		The value of the drive's current unmask setting
+
+
+
+
+
+HDIO_SET_UNMASKINTR
+	permit other irqs during I/O
+
+
+	usage::
+
+	  unsigned long val;
+
+	  ioctl(fd, HDIO_SET_UNMASKINTR, val);
+
+	inputs:
+		New value for unmask flag
+
+
+
+	outputs:
+		none
+
+
+
+	error return:
+	  - EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - EINVAL	value out of range [0 1]
+	  - EBUSY	Controller busy
+
+
+
+
+HDIO_GET_MULTCOUNT
+	get current IDE blockmode setting
+
+
+	usage::
+
+	  long val;
+
+	  ioctl(fd, HDIO_GET_MULTCOUNT, &val);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		The value of the current IDE block mode setting.  This
+		controls how many sectors the drive will transfer per
+		interrupt.
+
+
+
+HDIO_SET_MULTCOUNT
+	change IDE blockmode
+
+
+	usage::
+
+	  int val;
+
+	  ioctl(fd, HDIO_SET_MULTCOUNT, val);
+
+	inputs:
+		New value for IDE block mode setting.  This controls how many
+		sectors the drive will transfer per interrupt.
+
+	outputs:
+		none
+
+
+
+	error return:
+	  - EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - EINVAL	value out of range supported by disk.
+	  - EBUSY	Controller busy or blockmode already set.
+	  - EIO		Drive did not accept new block mode.
+
+	notes:
+	  Source code comments read::
+
+	    This is tightly woven into the driver->do_special cannot
+	    touch.  DON'T do it again until a total personality rewrite
+	    is committed.
+
+	  If blockmode has already been set, this ioctl will fail with
+	  -EBUSY
+
+
+
+HDIO_GET_QDMA
+	get use-qdma flag
+
+
+	Not implemented, as of 2.6.8.1
+
+
+
+HDIO_SET_XFER
+	set transfer rate via proc
+
+
+	Not implemented, as of 2.6.8.1
+
+
+
+HDIO_OBSOLETE_IDENTITY
+	OBSOLETE, DO NOT USE
+
+
+	Same as HDIO_GET_IDENTITY (see below), except that it only
+	returns the first 142 bytes of drive identity information.
+
+
+
+HDIO_GET_IDENTITY
+	get IDE identification info
+
+
+	usage::
+
+	  unsigned char identity[512];
+
+	  ioctl(fd, HDIO_GET_IDENTITY, identity);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		ATA drive identity information.  For full description, see
+		the IDENTIFY DEVICE and IDENTIFY PACKET DEVICE commands in
+		the ATA specification.
+
+	error returns:
+	  - EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
+	  - ENOMSG	IDENTIFY DEVICE information not available
+
+	notes:
+		Returns information that was obtained when the drive was
+		probed.  Some of this information is subject to change, and
+		this ioctl does not re-probe the drive to update the
+		information.
+
+		This information is also available from /proc/ide/hdX/identify
+
+
+
+HDIO_GET_KEEPSETTINGS
+	get keep-settings-on-reset flag
+
+
+	usage::
+
+	  long val;
+
+	  ioctl(fd, HDIO_GET_KEEPSETTINGS, &val);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		The value of the current "keep settings" flag
+
+
+
+	notes:
+		When set, indicates that kernel should restore settings
+		after a drive reset.
+
+
+
+HDIO_SET_KEEPSETTINGS
+	keep ioctl settings on reset
+
+
+	usage::
+
+	  long val;
+
+	  ioctl(fd, HDIO_SET_KEEPSETTINGS, val);
+
+	inputs:
+		New value for keep_settings flag
+
+
+
+	outputs:
+		none
+
+
+
+	error return:
+	  - EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - EINVAL	value out of range [0 1]
+	  - EBUSY		Controller busy
+
+
+
+HDIO_GET_32BIT
+	get current io_32bit setting
+
+
+	usage::
+
+	  long val;
+
+	  ioctl(fd, HDIO_GET_32BIT, &val);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		The value of the current io_32bit setting
+
+
+
+	notes:
+		0=16-bit, 1=32-bit, 2,3 = 32bit+sync
+
+
+
+
+
+HDIO_GET_NOWERR
+	get ignore-write-error flag
+
+
+	usage::
+
+	  long val;
+
+	  ioctl(fd, HDIO_GET_NOWERR, &val);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		The value of the current ignore-write-error flag
+
+
+
+
+
+HDIO_GET_DMA
+	get use-dma flag
+
+
+	usage::
+
+	  long val;
+
+	  ioctl(fd, HDIO_GET_DMA, &val);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		The value of the current use-dma flag
+
+
+
+
+
+HDIO_GET_NICE
+	get nice flags
+
+
+	usage::
+
+	  long nice;
+
+	  ioctl(fd, HDIO_GET_NICE, &nice);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		The drive's "nice" values.
+
+
+
+	notes:
+		Per-drive flags which determine when the system will give more
+		bandwidth to other devices sharing the same IDE bus.
+
+		See <linux/hdreg.h>, near symbol IDE_NICE_DSC_OVERLAP.
+
+
+
+
+HDIO_SET_NICE
+	set nice flags
+
+
+	usage::
+
+	  unsigned long nice;
+
+	  ...
+	  ioctl(fd, HDIO_SET_NICE, nice);
+
+	inputs:
+		bitmask of nice flags.
+
+
+
+	outputs:
+		none
+
+
+
+	error returns:
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - EPERM	Flags other than DSC_OVERLAP and NICE_1 set.
+	  - EPERM	DSC_OVERLAP specified but not supported by drive
+
+	notes:
+		This ioctl sets the DSC_OVERLAP and NICE_1 flags from values
+		provided by the user.
+
+		Nice flags are listed in <linux/hdreg.h>, starting with
+		IDE_NICE_DSC_OVERLAP.  These values represent shifts.
+
+
+
+
+
+HDIO_GET_WCACHE
+	get write cache mode on|off
+
+
+	usage::
+
+	  long val;
+
+	  ioctl(fd, HDIO_GET_WCACHE, &val);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		The value of the current write cache mode
+
+
+
+
+
+HDIO_GET_ACOUSTIC
+	get acoustic value
+
+
+	usage::
+
+	  long val;
+
+	  ioctl(fd, HDIO_GET_ACOUSTIC, &val);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		The value of the current acoustic settings
+
+
+
+	notes:
+		See HDIO_SET_ACOUSTIC
+
+
+
+
+
+HDIO_GET_ADDRESS
+	usage::
+
+
+	  long val;
+
+	  ioctl(fd, HDIO_GET_ADDRESS, &val);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		The value of the current addressing mode:
+
+	    =  ===================
+	    0  28-bit
+	    1  48-bit
+	    2  48-bit doing 28-bit
+	    3  64-bit
+	    =  ===================
+
+
+
+HDIO_GET_BUSSTATE
+	get the bus state of the hwif
+
+
+	usage::
+
+	  long state;
+
+	  ioctl(fd, HDIO_SCAN_HWIF, &state);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		Current power state of the IDE bus.  One of BUSSTATE_OFF,
+		BUSSTATE_ON, or BUSSTATE_TRISTATE
+
+	error returns:
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+
+
+
+
+HDIO_SET_BUSSTATE
+	set the bus state of the hwif
+
+
+	usage::
+
+	  int state;
+
+	  ...
+	  ioctl(fd, HDIO_SCAN_HWIF, state);
+
+	inputs:
+		Desired IDE power state.  One of BUSSTATE_OFF, BUSSTATE_ON,
+		or BUSSTATE_TRISTATE
+
+	outputs:
+		none
+
+
+
+	error returns:
+	  - EACCES	Access denied:  requires CAP_SYS_RAWIO
+	  - EOPNOTSUPP	Hardware interface does not support bus power control
+
+
+
+
+HDIO_TRISTATE_HWIF
+	execute a channel tristate
+
+
+	Not implemented, as of 2.6.8.1.  See HDIO_SET_BUSSTATE
+
+
+
+HDIO_DRIVE_RESET
+	execute a device reset
+
+
+	usage::
+
+	  int args[3]
+
+	  ...
+	  ioctl(fd, HDIO_DRIVE_RESET, args);
+
+	inputs:
+		none
+
+
+
+	outputs:
+		none
+
+
+
+	error returns:
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - ENXIO	No such device:	phy dead or ctl_addr == 0
+	  - EIO		I/O error:	reset timed out or hardware error
+
+	notes:
+
+	  - Execute a reset on the device as soon as the current IO
+	    operation has completed.
+
+	  - Executes an ATAPI soft reset if applicable, otherwise
+	    executes an ATA soft reset on the controller.
+
+
+
+HDIO_DRIVE_TASKFILE
+	execute raw taskfile
+
+
+	Note:
+		If you don't have a copy of the ANSI ATA specification
+		handy, you should probably ignore this ioctl.
+
+	- Execute an ATA disk command directly by writing the "taskfile"
+	  registers of the drive.  Requires ADMIN and RAWIO access
+	  privileges.
+
+	usage::
+
+	  struct {
+
+	    ide_task_request_t req_task;
+	    u8 outbuf[OUTPUT_SIZE];
+	    u8 inbuf[INPUT_SIZE];
+	  } task;
+	  memset(&task.req_task, 0, sizeof(task.req_task));
+	  task.req_task.out_size = sizeof(task.outbuf);
+	  task.req_task.in_size = sizeof(task.inbuf);
+	  ...
+	  ioctl(fd, HDIO_DRIVE_TASKFILE, &task);
+	  ...
+
+	inputs:
+
+	  (See below for details on memory area passed to ioctl.)
+
+	  ============	===================================================
+	  io_ports[8]	values to be written to taskfile registers
+	  hob_ports[8]	high-order bytes, for extended commands.
+	  out_flags	flags indicating which registers are valid
+	  in_flags	flags indicating which registers should be returned
+	  data_phase	see below
+	  req_cmd	command type to be executed
+	  out_size	size of output buffer
+	  outbuf	buffer of data to be transmitted to disk
+	  inbuf		buffer of data to be received from disk (see [1])
+	  ============	===================================================
+
+	outputs:
+
+	  ===========	====================================================
+	  io_ports[]	values returned in the taskfile registers
+	  hob_ports[]	high-order bytes, for extended commands.
+	  out_flags	flags indicating which registers are valid (see [2])
+	  in_flags	flags indicating which registers should be returned
+	  outbuf	buffer of data to be transmitted to disk (see [1])
+	  inbuf		buffer of data to be received from disk
+	  ===========	====================================================
+
+	error returns:
+	  - EACCES	CAP_SYS_ADMIN or CAP_SYS_RAWIO privilege not set.
+	  - ENOMSG	Device is not a disk drive.
+	  - ENOMEM	Unable to allocate memory for task
+	  - EFAULT	req_cmd == TASKFILE_IN_OUT (not implemented as of 2.6.8)
+	  - EPERM
+
+			req_cmd == TASKFILE_MULTI_OUT and drive
+			multi-count not yet set.
+	  - EIO		Drive failed the command.
+
+	notes:
+
+	  [1] READ THE FOLLOWING NOTES *CAREFULLY*.  THIS IOCTL IS
+	  FULL OF GOTCHAS.  Extreme caution should be used with using
+	  this ioctl.  A mistake can easily corrupt data or hang the
+	  system.
+
+	  [2] Both the input and output buffers are copied from the
+	  user and written back to the user, even when not used.
+
+	  [3] If one or more bits are set in out_flags and in_flags is
+	  zero, the following values are used for in_flags.all and
+	  written back into in_flags on completion.
+
+	   * IDE_TASKFILE_STD_IN_FLAGS | (IDE_HOB_STD_IN_FLAGS << 8)
+	     if LBA48 addressing is enabled for the drive
+	   * IDE_TASKFILE_STD_IN_FLAGS
+	     if CHS/LBA28
+
+	  The association between in_flags.all and each enable
+	  bitfield flips depending on endianness; fortunately, TASKFILE
+	  only uses inflags.b.data bit and ignores all other bits.
+	  The end result is that, on any endian machines, it has no
+	  effect other than modifying in_flags on completion.
+
+	  [4] The default value of SELECT is (0xa0|DEV_bit|LBA_bit)
+	  except for four drives per port chipsets.  For four drives
+	  per port chipsets, it's (0xa0|DEV_bit|LBA_bit) for the first
+	  pair and (0x80|DEV_bit|LBA_bit) for the second pair.
+
+	  [5] The argument to the ioctl is a pointer to a region of
+	  memory containing a ide_task_request_t structure, followed
+	  by an optional buffer of data to be transmitted to the
+	  drive, followed by an optional buffer to receive data from
+	  the drive.
+
+	  Command is passed to the disk drive via the ide_task_request_t
+	  structure, which contains these fields:
+
+	    ============	===============================================
+	    io_ports[8]		values for the taskfile registers
+	    hob_ports[8]	high-order bytes, for extended commands
+	    out_flags		flags indicating which entries in the
+				io_ports[] and hob_ports[] arrays
+				contain valid values.  Type ide_reg_valid_t.
+	    in_flags		flags indicating which entries in the
+				io_ports[] and hob_ports[] arrays
+				are expected to contain valid values
+				on return.
+	    data_phase		See below
+	    req_cmd		Command type, see below
+	    out_size		output (user->drive) buffer size, bytes
+	    in_size		input (drive->user) buffer size, bytes
+	    ============	===============================================
+
+	  When out_flags is zero, the following registers are loaded.
+
+	    ============	===============================================
+	    HOB_FEATURE		If the drive supports LBA48
+	    HOB_NSECTOR		If the drive supports LBA48
+	    HOB_SECTOR		If the drive supports LBA48
+	    HOB_LCYL		If the drive supports LBA48
+	    HOB_HCYL		If the drive supports LBA48
+	    FEATURE
+	    NSECTOR
+	    SECTOR
+	    LCYL
+	    HCYL
+	    SELECT		First, masked with 0xE0 if LBA48, 0xEF
+				otherwise; then, or'ed with the default
+				value of SELECT.
+	    ============	===============================================
+
+	  If any bit in out_flags is set, the following registers are loaded.
+
+	    ============	===============================================
+	    HOB_DATA		If out_flags.b.data is set.  HOB_DATA will
+				travel on DD8-DD15 on little endian machines
+				and on DD0-DD7 on big endian machines.
+	    DATA		If out_flags.b.data is set.  DATA will
+				travel on DD0-DD7 on little endian machines
+				and on DD8-DD15 on big endian machines.
+	    HOB_NSECTOR		If out_flags.b.nsector_hob is set
+	    HOB_SECTOR		If out_flags.b.sector_hob is set
+	    HOB_LCYL		If out_flags.b.lcyl_hob is set
+	    HOB_HCYL		If out_flags.b.hcyl_hob is set
+	    FEATURE		If out_flags.b.feature is set
+	    NSECTOR		If out_flags.b.nsector is set
+	    SECTOR		If out_flags.b.sector is set
+	    LCYL		If out_flags.b.lcyl is set
+	    HCYL		If out_flags.b.hcyl is set
+	    SELECT		Or'ed with the default value of SELECT and
+				loaded regardless of out_flags.b.select.
+	    ============	===============================================
+
+	  Taskfile registers are read back from the drive into
+	  {io|hob}_ports[] after the command completes iff one of the
+	  following conditions is met; otherwise, the original values
+	  will be written back, unchanged.
+
+	    1. The drive fails the command (EIO).
+	    2. One or more than one bits are set in out_flags.
+	    3. The requested data_phase is TASKFILE_NO_DATA.
+
+	    ============	===============================================
+	    HOB_DATA		If in_flags.b.data is set.  It will contain
+				DD8-DD15 on little endian machines and
+				DD0-DD7 on big endian machines.
+	    DATA		If in_flags.b.data is set.  It will contain
+				DD0-DD7 on little endian machines and
+				DD8-DD15 on big endian machines.
+	    HOB_FEATURE		If the drive supports LBA48
+	    HOB_NSECTOR		If the drive supports LBA48
+	    HOB_SECTOR		If the drive supports LBA48
+	    HOB_LCYL		If the drive supports LBA48
+	    HOB_HCYL		If the drive supports LBA48
+	    NSECTOR
+	    SECTOR
+	    LCYL
+	    HCYL
+	    ============	===============================================
+
+	  The data_phase field describes the data transfer to be
+	  performed.  Value is one of:
+
+	    ===================        ========================================
+	    TASKFILE_IN
+	    TASKFILE_MULTI_IN
+	    TASKFILE_OUT
+	    TASKFILE_MULTI_OUT
+	    TASKFILE_IN_OUT
+	    TASKFILE_IN_DMA
+	    TASKFILE_IN_DMAQ		== IN_DMA (queueing not supported)
+	    TASKFILE_OUT_DMA
+	    TASKFILE_OUT_DMAQ		== OUT_DMA (queueing not supported)
+	    TASKFILE_P_IN		unimplemented
+	    TASKFILE_P_IN_DMA		unimplemented
+	    TASKFILE_P_IN_DMAQ		unimplemented
+	    TASKFILE_P_OUT		unimplemented
+	    TASKFILE_P_OUT_DMA		unimplemented
+	    TASKFILE_P_OUT_DMAQ		unimplemented
+	    ===================        ========================================
+
+	  The req_cmd field classifies the command type.  It may be
+	  one of:
+
+	    ========================    =======================================
+	    IDE_DRIVE_TASK_NO_DATA
+	    IDE_DRIVE_TASK_SET_XFER	unimplemented
+	    IDE_DRIVE_TASK_IN
+	    IDE_DRIVE_TASK_OUT		unimplemented
+	    IDE_DRIVE_TASK_RAW_WRITE
+	    ========================    =======================================
+
+	  [6] Do not access {in|out}_flags->all except for resetting
+	  all the bits.  Always access individual bit fields.  ->all
+	  value will flip depending on endianness.  For the same
+	  reason, do not use IDE_{TASKFILE|HOB}_STD_{OUT|IN}_FLAGS
+	  constants defined in hdreg.h.
+
+
+
+HDIO_DRIVE_CMD
+	execute a special drive command
+
+
+	Note:  If you don't have a copy of the ANSI ATA specification
+	handy, you should probably ignore this ioctl.
+
+	usage::
+
+	  u8 args[4+XFER_SIZE];
+
+	  ...
+	  ioctl(fd, HDIO_DRIVE_CMD, args);
+
+	inputs:
+	    Commands other than WIN_SMART:
+
+	    =======     =======
+	    args[0]	COMMAND
+	    args[1]	NSECTOR
+	    args[2]	FEATURE
+	    args[3]	NSECTOR
+	    =======     =======
+
+	    WIN_SMART:
+
+	    =======     =======
+	    args[0]	COMMAND
+	    args[1]	SECTOR
+	    args[2]	FEATURE
+	    args[3]	NSECTOR
+	    =======     =======
+
+	outputs:
+		args[] buffer is filled with register values followed by any
+
+
+	  data returned by the disk.
+
+	    ========	====================================================
+	    args[0]	status
+	    args[1]	error
+	    args[2]	NSECTOR
+	    args[3]	undefined
+	    args[4+]	NSECTOR * 512 bytes of data returned by the command.
+	    ========	====================================================
+
+	error returns:
+	  - EACCES	Access denied:  requires CAP_SYS_RAWIO
+	  - ENOMEM	Unable to allocate memory for task
+	  - EIO		Drive reports error
+
+	notes:
+
+	  [1] For commands other than WIN_SMART, args[1] should equal
+	  args[3].  SECTOR, LCYL and HCYL are undefined.  For
+	  WIN_SMART, 0x4f and 0xc2 are loaded into LCYL and HCYL
+	  respectively.  In both cases SELECT will contain the default
+	  value for the drive.  Please refer to HDIO_DRIVE_TASKFILE
+	  notes for the default value of SELECT.
+
+	  [2] If NSECTOR value is greater than zero and the drive sets
+	  DRQ when interrupting for the command, NSECTOR * 512 bytes
+	  are read from the device into the area following NSECTOR.
+	  In the above example, the area would be
+	  args[4..4+XFER_SIZE].  16bit PIO is used regardless of
+	  HDIO_SET_32BIT setting.
+
+	  [3] If COMMAND == WIN_SETFEATURES && FEATURE == SETFEATURES_XFER
+	  && NSECTOR >= XFER_SW_DMA_0 && the drive supports any DMA
+	  mode, IDE driver will try to tune the transfer mode of the
+	  drive accordingly.
+
+
+
+HDIO_DRIVE_TASK
+	execute task and special drive command
+
+
+	Note:  If you don't have a copy of the ANSI ATA specification
+	handy, you should probably ignore this ioctl.
+
+	usage::
+
+	  u8 args[7];
+
+	  ...
+	  ioctl(fd, HDIO_DRIVE_TASK, args);
+
+	inputs:
+	    Taskfile register values:
+
+	    =======	=======
+	    args[0]	COMMAND
+	    args[1]	FEATURE
+	    args[2]	NSECTOR
+	    args[3]	SECTOR
+	    args[4]	LCYL
+	    args[5]	HCYL
+	    args[6]	SELECT
+	    =======	=======
+
+	outputs:
+	    Taskfile register values:
+
+
+	    =======	=======
+	    args[0]	status
+	    args[1]	error
+	    args[2]	NSECTOR
+	    args[3]	SECTOR
+	    args[4]	LCYL
+	    args[5]	HCYL
+	    args[6]	SELECT
+	    =======	=======
+
+	error returns:
+	  - EACCES	Access denied:  requires CAP_SYS_RAWIO
+	  - ENOMEM	Unable to allocate memory for task
+	  - ENOMSG	Device is not a disk drive.
+	  - EIO		Drive failed the command.
+
+	notes:
+
+	  [1] DEV bit (0x10) of SELECT register is ignored and the
+	  appropriate value for the drive is used.  All other bits
+	  are used unaltered.
+
+
+
+HDIO_DRIVE_CMD_AEB
+	HDIO_DRIVE_TASK
+
+
+	Not implemented, as of 2.6.8.1
+
+
+
+HDIO_SET_32BIT
+	change io_32bit flags
+
+
+	usage::
+
+	  int val;
+
+	  ioctl(fd, HDIO_SET_32BIT, val);
+
+	inputs:
+		New value for io_32bit flag
+
+
+
+	outputs:
+		none
+
+
+
+	error return:
+	  - EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - EINVAL	value out of range [0 3]
+	  - EBUSY	Controller busy
+
+
+
+
+HDIO_SET_NOWERR
+	change ignore-write-error flag
+
+
+	usage::
+
+	  int val;
+
+	  ioctl(fd, HDIO_SET_NOWERR, val);
+
+	inputs:
+		New value for ignore-write-error flag.  Used for ignoring
+
+
+	  WRERR_STAT
+
+	outputs:
+		none
+
+
+
+	error return:
+	  - EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - EINVAL	value out of range [0 1]
+	  - EBUSY		Controller busy
+
+
+
+HDIO_SET_DMA
+	change use-dma flag
+
+
+	usage::
+
+	  long val;
+
+	  ioctl(fd, HDIO_SET_DMA, val);
+
+	inputs:
+		New value for use-dma flag
+
+
+
+	outputs:
+		none
+
+
+
+	error return:
+	  - EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - EINVAL	value out of range [0 1]
+	  - EBUSY	Controller busy
+
+
+
+HDIO_SET_PIO_MODE
+	reconfig interface to new speed
+
+
+	usage::
+
+	  long val;
+
+	  ioctl(fd, HDIO_SET_PIO_MODE, val);
+
+	inputs:
+		New interface speed.
+
+
+
+	outputs:
+		none
+
+
+
+	error return:
+	  - EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - EINVAL	value out of range [0 255]
+	  - EBUSY	Controller busy
+
+
+
+HDIO_SCAN_HWIF
+	register and (re)scan interface
+
+
+	usage::
+
+	  int args[3]
+
+	  ...
+	  ioctl(fd, HDIO_SCAN_HWIF, args);
+
+	inputs:
+
+	  =======	=========================
+	  args[0]	io address to probe
+
+
+	  args[1]	control address to probe
+	  args[2]	irq number
+	  =======	=========================
+
+	outputs:
+		none
+
+
+
+	error returns:
+	  - EACCES	Access denied:  requires CAP_SYS_RAWIO
+	  - EIO		Probe failed.
+
+	notes:
+		This ioctl initializes the addresses and irq for a disk
+		controller, probes for drives, and creates /proc/ide
+		interfaces as appropriate.
+
+
+
+HDIO_UNREGISTER_HWIF
+	unregister interface
+
+
+	usage::
+
+	  int index;
+
+	  ioctl(fd, HDIO_UNREGISTER_HWIF, index);
+
+	inputs:
+		index		index of hardware interface to unregister
+
+
+
+	outputs:
+		none
+
+
+
+	error returns:
+	  - EACCES	Access denied:  requires CAP_SYS_RAWIO
+
+	notes:
+		This ioctl removes a hardware interface from the kernel.
+
+		Currently (2.6.8) this ioctl silently fails if any drive on
+		the interface is busy.
+
+
+
+HDIO_SET_WCACHE
+	change write cache enable-disable
+
+
+	usage::
+
+	  int val;
+
+	  ioctl(fd, HDIO_SET_WCACHE, val);
+
+	inputs:
+		New value for write cache enable
+
+
+
+	outputs:
+		none
+
+
+
+	error return:
+	  - EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - EINVAL	value out of range [0 1]
+	  - EBUSY	Controller busy
+
+
+
+HDIO_SET_ACOUSTIC
+	change acoustic behavior
+
+
+	usage::
+
+	  int val;
+
+	  ioctl(fd, HDIO_SET_ACOUSTIC, val);
+
+	inputs:
+		New value for drive acoustic settings
+
+
+
+	outputs:
+		none
+
+
+
+	error return:
+	  - EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - EINVAL	value out of range [0 254]
+	  - EBUSY	Controller busy
+
+
+
+HDIO_SET_QDMA
+	change use-qdma flag
+
+
+	Not implemented, as of 2.6.8.1
+
+
+
+HDIO_SET_ADDRESS
+	change lba addressing modes
+
+
+	usage::
+
+	  int val;
+
+	  ioctl(fd, HDIO_SET_ADDRESS, val);
+
+	inputs:
+		New value for addressing mode
+
+	    =   ===================
+	    0   28-bit
+	    1   48-bit
+	    2   48-bit doing 28-bit
+	    =   ===================
+
+	outputs:
+		none
+
+
+
+	error return:
+	  - EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - EINVAL	value out of range [0 2]
+	  - EBUSY		Controller busy
+	  - EIO		Drive does not support lba48 mode.
+
+
+HDIO_SET_IDE_SCSI
+	usage::
+
+
+	  long val;
+
+	  ioctl(fd, HDIO_SET_IDE_SCSI, val);
+
+	inputs:
+		New value for scsi emulation mode (?)
+
+
+
+	outputs:
+		none
+
+
+
+	error return:
+	  - EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
+	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
+	  - EINVAL	value out of range [0 1]
+	  - EBUSY	Controller busy
+
+
+
+HDIO_SET_SCSI_IDE
+	Not implemented, as of 2.6.8.1
diff --git a/Documentation/ioctl/hdio.txt b/Documentation/ioctl/hdio.txt
deleted file mode 100644
index 18eb98c44ffe..000000000000
--- a/Documentation/ioctl/hdio.txt
+++ /dev/null
@@ -1,1071 +0,0 @@
-		Summary of HDIO_ ioctl calls.
-		============================
-
-		Edward A. Falk <efalk@google.com>
-
-		November, 2004
-
-This document attempts to describe the ioctl(2) calls supported by
-the HD/IDE layer.  These are by-and-large implemented (as of Linux 2.6)
-in drivers/ide/ide.c and drivers/block/scsi_ioctl.c
-
-ioctl values are listed in <linux/hdreg.h>.  As of this writing, they
-are as follows:
-
-    ioctls that pass argument pointers to user space:
-
-	HDIO_GETGEO		get device geometry
-	HDIO_GET_UNMASKINTR	get current unmask setting
-	HDIO_GET_MULTCOUNT	get current IDE blockmode setting
-	HDIO_GET_QDMA		get use-qdma flag
-	HDIO_SET_XFER		set transfer rate via proc
-	HDIO_OBSOLETE_IDENTITY	OBSOLETE, DO NOT USE
-	HDIO_GET_KEEPSETTINGS	get keep-settings-on-reset flag
-	HDIO_GET_32BIT		get current io_32bit setting
-	HDIO_GET_NOWERR		get ignore-write-error flag
-	HDIO_GET_DMA		get use-dma flag
-	HDIO_GET_NICE		get nice flags
-	HDIO_GET_IDENTITY	get IDE identification info
-	HDIO_GET_WCACHE		get write cache mode on|off
-	HDIO_GET_ACOUSTIC	get acoustic value
-	HDIO_GET_ADDRESS	get sector addressing mode
-	HDIO_GET_BUSSTATE	get the bus state of the hwif
-	HDIO_TRISTATE_HWIF	execute a channel tristate
-	HDIO_DRIVE_RESET	execute a device reset
-	HDIO_DRIVE_TASKFILE	execute raw taskfile
-	HDIO_DRIVE_TASK		execute task and special drive command
-	HDIO_DRIVE_CMD		execute a special drive command
-	HDIO_DRIVE_CMD_AEB	HDIO_DRIVE_TASK
-
-    ioctls that pass non-pointer values:
-
-	HDIO_SET_MULTCOUNT	change IDE blockmode
-	HDIO_SET_UNMASKINTR	permit other irqs during I/O
-	HDIO_SET_KEEPSETTINGS	keep ioctl settings on reset
-	HDIO_SET_32BIT		change io_32bit flags
-	HDIO_SET_NOWERR		change ignore-write-error flag
-	HDIO_SET_DMA		change use-dma flag
-	HDIO_SET_PIO_MODE	reconfig interface to new speed
-	HDIO_SCAN_HWIF		register and (re)scan interface
-	HDIO_SET_NICE		set nice flags
-	HDIO_UNREGISTER_HWIF	unregister interface
-	HDIO_SET_WCACHE		change write cache enable-disable
-	HDIO_SET_ACOUSTIC	change acoustic behavior
-	HDIO_SET_BUSSTATE	set the bus state of the hwif
-	HDIO_SET_QDMA		change use-qdma flag
-	HDIO_SET_ADDRESS	change lba addressing modes
-
-	HDIO_SET_IDE_SCSI	Set scsi emulation mode on/off
-	HDIO_SET_SCSI_IDE	not implemented yet
-
-
-The information that follows was determined from reading kernel source
-code.  It is likely that some corrections will be made over time.
-
-
-
-
-
-
-
-General:
-
-	Unless otherwise specified, all ioctl calls return 0 on success
-	and -1 with errno set to an appropriate value on error.
-
-	Unless otherwise specified, all ioctl calls return -1 and set
-	errno to EFAULT on a failed attempt to copy data to or from user
-	address space.
-
-	Unless otherwise specified, all data structures and constants
-	are defined in <linux/hdreg.h>
-
-
-
-HDIO_GETGEO			get device geometry
-
-	usage:
-
-	  struct hd_geometry geom;
-	  ioctl(fd, HDIO_GETGEO, &geom);
-
-
-	inputs:		none
-
-	outputs:
-
-	  hd_geometry structure containing:
-
-	    heads	number of heads
-	    sectors	number of sectors/track
-	    cylinders	number of cylinders, mod 65536
-	    start	starting sector of this partition.
-
-
-	error returns:
-	  EINVAL	if the device is not a disk drive or floppy drive,
-	  		or if the user passes a null pointer
-
-
-	notes:
-
-	  Not particularly useful with modern disk drives, whose geometry
-	  is a polite fiction anyway.  Modern drives are addressed
-	  purely by sector number nowadays (lba addressing), and the
-	  drive geometry is an abstraction which is actually subject
-	  to change.  Currently (as of Nov 2004), the geometry values
-	  are the "bios" values -- presumably the values the drive had
-	  when Linux first booted.
-
-	  In addition, the cylinders field of the hd_geometry is an
-	  unsigned short, meaning that on most architectures, this
-	  ioctl will not return a meaningful value on drives with more
-	  than 65535 tracks.
-
-	  The start field is unsigned long, meaning that it will not
-	  contain a meaningful value for disks over 219 Gb in size.
-
-
-
-
-HDIO_GET_UNMASKINTR		get current unmask setting
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_GET_UNMASKINTR, &val);
-
-	inputs:		none
-
-	outputs:
-	  The value of the drive's current unmask setting
-
-
-
-HDIO_SET_UNMASKINTR		permit other irqs during I/O
-
-	usage:
-
-	  unsigned long val;
-	  ioctl(fd, HDIO_SET_UNMASKINTR, val);
-
-	inputs:
-	  New value for unmask flag
-
-	outputs:	none
-
-	error return:
-	  EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  EINVAL	value out of range [0 1]
-	  EBUSY		Controller busy
-
-
-
-
-HDIO_GET_MULTCOUNT		get current IDE blockmode setting
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_GET_MULTCOUNT, &val);
-
-	inputs:		none
-
-	outputs:
-	  The value of the current IDE block mode setting.  This
-	  controls how many sectors the drive will transfer per
-	  interrupt.
-
-
-
-HDIO_SET_MULTCOUNT		change IDE blockmode
-
-	usage:
-
-	  int val;
-	  ioctl(fd, HDIO_SET_MULTCOUNT, val);
-
-	inputs:
-	  New value for IDE block mode setting.  This controls how many
-	  sectors the drive will transfer per interrupt.
-
-	outputs:	none
-
-	error return:
-	  EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  EINVAL	value out of range supported by disk.
-	  EBUSY		Controller busy or blockmode already set.
-	  EIO		Drive did not accept new block mode.
-
-	notes:
-
-	  Source code comments read:
-
-	    This is tightly woven into the driver->do_special cannot
-	    touch.  DON'T do it again until a total personality rewrite
-	    is committed.
-
-	  If blockmode has already been set, this ioctl will fail with
-	  EBUSY
-
-
-
-HDIO_GET_QDMA			get use-qdma flag
-
-	Not implemented, as of 2.6.8.1
-
-
-
-HDIO_SET_XFER			set transfer rate via proc
-
-	Not implemented, as of 2.6.8.1
-
-
-
-HDIO_OBSOLETE_IDENTITY		OBSOLETE, DO NOT USE
-
-	Same as HDIO_GET_IDENTITY (see below), except that it only
-	returns the first 142 bytes of drive identity information.
-
-
-
-HDIO_GET_IDENTITY		get IDE identification info
-
-	usage:
-
-	  unsigned char identity[512];
-	  ioctl(fd, HDIO_GET_IDENTITY, identity);
-
-	inputs:		none
-
-	outputs:
-
-	  ATA drive identity information.  For full description, see
-	  the IDENTIFY DEVICE and IDENTIFY PACKET DEVICE commands in
-	  the ATA specification.
-
-	error returns:
-	  EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
-	  ENOMSG	IDENTIFY DEVICE information not available
-
-	notes:
-
-	  Returns information that was obtained when the drive was
-	  probed.  Some of this information is subject to change, and
-	  this ioctl does not re-probe the drive to update the
-	  information.
-
-	  This information is also available from /proc/ide/hdX/identify
-
-
-
-HDIO_GET_KEEPSETTINGS		get keep-settings-on-reset flag
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_GET_KEEPSETTINGS, &val);
-
-	inputs:		none
-
-	outputs:
-	  The value of the current "keep settings" flag
-
-	notes:
-
-	  When set, indicates that kernel should restore settings
-	  after a drive reset.
-
-
-
-HDIO_SET_KEEPSETTINGS		keep ioctl settings on reset
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_SET_KEEPSETTINGS, val);
-
-	inputs:
-	  New value for keep_settings flag
-
-	outputs:	none
-
-	error return:
-	  EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  EINVAL	value out of range [0 1]
-	  EBUSY		Controller busy
-
-
-
-HDIO_GET_32BIT			get current io_32bit setting
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_GET_32BIT, &val);
-
-	inputs:		none
-
-	outputs:
-	  The value of the current io_32bit setting
-
-	notes:
-
-	  0=16-bit, 1=32-bit, 2,3 = 32bit+sync
-
-
-
-HDIO_GET_NOWERR			get ignore-write-error flag
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_GET_NOWERR, &val);
-
-	inputs:		none
-
-	outputs:
-	  The value of the current ignore-write-error flag
-
-
-
-HDIO_GET_DMA			get use-dma flag
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_GET_DMA, &val);
-
-	inputs:		none
-
-	outputs:
-	  The value of the current use-dma flag
-
-
-
-HDIO_GET_NICE			get nice flags
-
-	usage:
-
-	  long nice;
-	  ioctl(fd, HDIO_GET_NICE, &nice);
-
-	inputs:		none
-
-	outputs:
-
-	  The drive's "nice" values.
-
-	notes:
-
-	  Per-drive flags which determine when the system will give more
-	  bandwidth to other devices sharing the same IDE bus.
-	  See <linux/hdreg.h>, near symbol IDE_NICE_DSC_OVERLAP.
-
-
-
-
-HDIO_SET_NICE			set nice flags
-
-	usage:
-
-	  unsigned long nice;
-	  ...
-	  ioctl(fd, HDIO_SET_NICE, nice);
-
-	inputs:
-	  bitmask of nice flags.
-
-	outputs:	none
-
-	error returns:
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  EPERM		Flags other than DSC_OVERLAP and NICE_1 set.
-	  EPERM		DSC_OVERLAP specified but not supported by drive
-
-	notes:
-
-	  This ioctl sets the DSC_OVERLAP and NICE_1 flags from values
-	  provided by the user.
-
-	  Nice flags are listed in <linux/hdreg.h>, starting with
-	  IDE_NICE_DSC_OVERLAP.  These values represent shifts.
-
-
-
-
-
-HDIO_GET_WCACHE			get write cache mode on|off
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_GET_WCACHE, &val);
-
-	inputs:		none
-
-	outputs:
-	  The value of the current write cache mode
-
-
-
-HDIO_GET_ACOUSTIC		get acoustic value
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_GET_ACOUSTIC, &val);
-
-	inputs:		none
-
-	outputs:
-	  The value of the current acoustic settings
-
-	notes:
-
-	  See HDIO_SET_ACOUSTIC
-
-
-
-HDIO_GET_ADDRESS
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_GET_ADDRESS, &val);
-
-	inputs:		none
-
-	outputs:
-	  The value of the current addressing mode:
-	    0 = 28-bit
-	    1 = 48-bit
-	    2 = 48-bit doing 28-bit
-	    3 = 64-bit
-
-
-
-HDIO_GET_BUSSTATE		get the bus state of the hwif
-
-	usage:
-
-	  long state;
-	  ioctl(fd, HDIO_SCAN_HWIF, &state);
-
-	inputs:		none
-
-	outputs:
-	  Current power state of the IDE bus.  One of BUSSTATE_OFF,
-	  BUSSTATE_ON, or BUSSTATE_TRISTATE
-
-	error returns:
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-
-
-
-
-HDIO_SET_BUSSTATE		set the bus state of the hwif
-
-	usage:
-
-	  int state;
-	  ...
-	  ioctl(fd, HDIO_SCAN_HWIF, state);
-
-	inputs:
-	  Desired IDE power state.  One of BUSSTATE_OFF, BUSSTATE_ON,
-	  or BUSSTATE_TRISTATE
-
-	outputs:	none
-
-	error returns:
-	  EACCES	Access denied:  requires CAP_SYS_RAWIO
-	  EOPNOTSUPP	Hardware interface does not support bus power control
-
-
-
-
-HDIO_TRISTATE_HWIF		execute a channel tristate
-
-	Not implemented, as of 2.6.8.1.  See HDIO_SET_BUSSTATE
-
-
-
-HDIO_DRIVE_RESET		execute a device reset
-
-	usage:
-
-	  int args[3]
-	  ...
-	  ioctl(fd, HDIO_DRIVE_RESET, args);
-
-	inputs:		none
-
-	outputs:	none
-
-	error returns:
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  ENXIO		No such device:	phy dead or ctl_addr == 0
-	  EIO		I/O error:	reset timed out or hardware error
-
-	notes:
-
-	  Execute a reset on the device as soon as the current IO
-	  operation has completed.
-
-	  Executes an ATAPI soft reset if applicable, otherwise
-	  executes an ATA soft reset on the controller.
-
-
-
-HDIO_DRIVE_TASKFILE		execute raw taskfile
-
-	Note:  If you don't have a copy of the ANSI ATA specification
-	handy, you should probably ignore this ioctl.
-
-	Execute an ATA disk command directly by writing the "taskfile"
-	registers of the drive.  Requires ADMIN and RAWIO access
-	privileges.
-
-	usage:
-
-	  struct {
-	    ide_task_request_t req_task;
-	    u8 outbuf[OUTPUT_SIZE];
-	    u8 inbuf[INPUT_SIZE];
-	  } task;
-	  memset(&task.req_task, 0, sizeof(task.req_task));
-	  task.req_task.out_size = sizeof(task.outbuf);
-	  task.req_task.in_size = sizeof(task.inbuf);
-	  ...
-	  ioctl(fd, HDIO_DRIVE_TASKFILE, &task);
-	  ...
-
-	inputs:
-
-	  (See below for details on memory area passed to ioctl.)
-
-	  io_ports[8]	values to be written to taskfile registers
-	  hob_ports[8]	high-order bytes, for extended commands.
-	  out_flags	flags indicating which registers are valid
-	  in_flags	flags indicating which registers should be returned
-	  data_phase	see below
-	  req_cmd	command type to be executed
-	  out_size	size of output buffer
-	  outbuf	buffer of data to be transmitted to disk
-	  inbuf		buffer of data to be received from disk (see [1])
-
-	outputs:
-
-	  io_ports[]	values returned in the taskfile registers
-	  hob_ports[]	high-order bytes, for extended commands.
-	  out_flags	flags indicating which registers are valid (see [2])
-	  in_flags	flags indicating which registers should be returned
-	  outbuf	buffer of data to be transmitted to disk (see [1])
-	  inbuf		buffer of data to be received from disk
-
-	error returns:
-	  EACCES	CAP_SYS_ADMIN or CAP_SYS_RAWIO privilege not set.
-	  ENOMSG	Device is not a disk drive.
-	  ENOMEM	Unable to allocate memory for task
-	  EFAULT	req_cmd == TASKFILE_IN_OUT (not implemented as of 2.6.8)
-	  EPERM		req_cmd == TASKFILE_MULTI_OUT and drive
-	  		multi-count not yet set.
-	  EIO		Drive failed the command.
-
-	notes:
-
-	  [1] READ THE FOLLOWING NOTES *CAREFULLY*.  THIS IOCTL IS
-	  FULL OF GOTCHAS.  Extreme caution should be used with using
-	  this ioctl.  A mistake can easily corrupt data or hang the
-	  system.
-
-	  [2] Both the input and output buffers are copied from the
-	  user and written back to the user, even when not used.
-
-	  [3] If one or more bits are set in out_flags and in_flags is
-	  zero, the following values are used for in_flags.all and
-	  written back into in_flags on completion.
-
-	   * IDE_TASKFILE_STD_IN_FLAGS | (IDE_HOB_STD_IN_FLAGS << 8)
-	     if LBA48 addressing is enabled for the drive
-	   * IDE_TASKFILE_STD_IN_FLAGS
-	     if CHS/LBA28
-
-	  The association between in_flags.all and each enable
-	  bitfield flips depending on endianness; fortunately, TASKFILE
-	  only uses inflags.b.data bit and ignores all other bits.
-	  The end result is that, on any endian machines, it has no
-	  effect other than modifying in_flags on completion.
-
-	  [4] The default value of SELECT is (0xa0|DEV_bit|LBA_bit)
-	  except for four drives per port chipsets.  For four drives
-	  per port chipsets, it's (0xa0|DEV_bit|LBA_bit) for the first
-	  pair and (0x80|DEV_bit|LBA_bit) for the second pair.
-
-	  [5] The argument to the ioctl is a pointer to a region of
-	  memory containing a ide_task_request_t structure, followed
-	  by an optional buffer of data to be transmitted to the
-	  drive, followed by an optional buffer to receive data from
-	  the drive.
-
-	  Command is passed to the disk drive via the ide_task_request_t
-	  structure, which contains these fields:
-
-	    io_ports[8]		values for the taskfile registers
-	    hob_ports[8]	high-order bytes, for extended commands
-	    out_flags		flags indicating which entries in the
-	    			io_ports[] and hob_ports[] arrays
-				contain valid values.  Type ide_reg_valid_t.
-	    in_flags		flags indicating which entries in the
-	    			io_ports[] and hob_ports[] arrays
-				are expected to contain valid values
-				on return.
-	    data_phase		See below
-	    req_cmd		Command type, see below
-	    out_size		output (user->drive) buffer size, bytes
-	    in_size		input (drive->user) buffer size, bytes
-
-	  When out_flags is zero, the following registers are loaded.
-
-	    HOB_FEATURE		If the drive supports LBA48
-	    HOB_NSECTOR		If the drive supports LBA48
-	    HOB_SECTOR		If the drive supports LBA48
-	    HOB_LCYL		If the drive supports LBA48
-	    HOB_HCYL		If the drive supports LBA48
-	    FEATURE
-	    NSECTOR
-	    SECTOR
-	    LCYL
-	    HCYL
-	    SELECT		First, masked with 0xE0 if LBA48, 0xEF
-				otherwise; then, or'ed with the default
-				value of SELECT.
-
-	  If any bit in out_flags is set, the following registers are loaded.
-
-	    HOB_DATA		If out_flags.b.data is set.  HOB_DATA will
-				travel on DD8-DD15 on little endian machines
-				and on DD0-DD7 on big endian machines.
-	    DATA		If out_flags.b.data is set.  DATA will
-				travel on DD0-DD7 on little endian machines
-				and on DD8-DD15 on big endian machines.
-	    HOB_NSECTOR		If out_flags.b.nsector_hob is set
-	    HOB_SECTOR		If out_flags.b.sector_hob is set
-	    HOB_LCYL		If out_flags.b.lcyl_hob is set
-	    HOB_HCYL		If out_flags.b.hcyl_hob is set
-	    FEATURE		If out_flags.b.feature is set
-	    NSECTOR		If out_flags.b.nsector is set
-	    SECTOR		If out_flags.b.sector is set
-	    LCYL		If out_flags.b.lcyl is set
-	    HCYL		If out_flags.b.hcyl is set
-	    SELECT		Or'ed with the default value of SELECT and
-				loaded regardless of out_flags.b.select.
-
-	  Taskfile registers are read back from the drive into
-	  {io|hob}_ports[] after the command completes iff one of the
-	  following conditions is met; otherwise, the original values
-	  will be written back, unchanged.
-
-	    1. The drive fails the command (EIO).
-	    2. One or more than one bits are set in out_flags.
-	    3. The requested data_phase is TASKFILE_NO_DATA.
-
-	    HOB_DATA		If in_flags.b.data is set.  It will contain
-				DD8-DD15 on little endian machines and
-				DD0-DD7 on big endian machines.
-	    DATA		If in_flags.b.data is set.  It will contain
-				DD0-DD7 on little endian machines and
-				DD8-DD15 on big endian machines.
-	    HOB_FEATURE		If the drive supports LBA48
-	    HOB_NSECTOR		If the drive supports LBA48
-	    HOB_SECTOR		If the drive supports LBA48
-	    HOB_LCYL		If the drive supports LBA48
-	    HOB_HCYL		If the drive supports LBA48
-	    NSECTOR
-	    SECTOR
-	    LCYL
-	    HCYL
-
-	  The data_phase field describes the data transfer to be
-	  performed.  Value is one of:
-
-	    TASKFILE_IN
-	    TASKFILE_MULTI_IN
-	    TASKFILE_OUT
-	    TASKFILE_MULTI_OUT
-	    TASKFILE_IN_OUT
-	    TASKFILE_IN_DMA
-	    TASKFILE_IN_DMAQ		== IN_DMA (queueing not supported)
-	    TASKFILE_OUT_DMA
-	    TASKFILE_OUT_DMAQ		== OUT_DMA (queueing not supported)
-	    TASKFILE_P_IN		unimplemented
-	    TASKFILE_P_IN_DMA		unimplemented
-	    TASKFILE_P_IN_DMAQ		unimplemented
-	    TASKFILE_P_OUT		unimplemented
-	    TASKFILE_P_OUT_DMA		unimplemented
-	    TASKFILE_P_OUT_DMAQ		unimplemented
-
-	  The req_cmd field classifies the command type.  It may be
-	  one of:
-
-	    IDE_DRIVE_TASK_NO_DATA
-	    IDE_DRIVE_TASK_SET_XFER	unimplemented
-	    IDE_DRIVE_TASK_IN
-	    IDE_DRIVE_TASK_OUT		unimplemented
-	    IDE_DRIVE_TASK_RAW_WRITE
-
-	  [6] Do not access {in|out}_flags->all except for resetting
-	  all the bits.  Always access individual bit fields.  ->all
-	  value will flip depending on endianness.  For the same
-	  reason, do not use IDE_{TASKFILE|HOB}_STD_{OUT|IN}_FLAGS
-	  constants defined in hdreg.h.
-
-
-
-HDIO_DRIVE_CMD			execute a special drive command
-
-	Note:  If you don't have a copy of the ANSI ATA specification
-	handy, you should probably ignore this ioctl.
-
-	usage:
-
-	  u8 args[4+XFER_SIZE];
-	  ...
-	  ioctl(fd, HDIO_DRIVE_CMD, args);
-
-	inputs:
-
-	  Commands other than WIN_SMART
-	    args[0]	COMMAND
-	    args[1]	NSECTOR
-	    args[2]	FEATURE
-	    args[3]	NSECTOR
-
-	  WIN_SMART
-	    args[0]	COMMAND
-	    args[1]	SECTOR
-	    args[2]	FEATURE
-	    args[3]	NSECTOR
-
-	outputs:
-
-	  args[] buffer is filled with register values followed by any
-	  data returned by the disk.
-	    args[0]	status
-	    args[1]	error
-	    args[2]	NSECTOR
-	    args[3]	undefined
-	    args[4+]	NSECTOR * 512 bytes of data returned by the command.
-
-	error returns:
-	  EACCES	Access denied:  requires CAP_SYS_RAWIO
-	  ENOMEM	Unable to allocate memory for task
-	  EIO		Drive reports error
-
-	notes:
-
-	  [1] For commands other than WIN_SMART, args[1] should equal
-	  args[3].  SECTOR, LCYL and HCYL are undefined.  For
-	  WIN_SMART, 0x4f and 0xc2 are loaded into LCYL and HCYL
-	  respectively.  In both cases SELECT will contain the default
-	  value for the drive.  Please refer to HDIO_DRIVE_TASKFILE
-	  notes for the default value of SELECT.
-
-	  [2] If NSECTOR value is greater than zero and the drive sets
-	  DRQ when interrupting for the command, NSECTOR * 512 bytes
-	  are read from the device into the area following NSECTOR.
-	  In the above example, the area would be
-	  args[4..4+XFER_SIZE].  16bit PIO is used regardless of
-	  HDIO_SET_32BIT setting.
-
-	  [3] If COMMAND == WIN_SETFEATURES && FEATURE == SETFEATURES_XFER
-	  && NSECTOR >= XFER_SW_DMA_0 && the drive supports any DMA
-	  mode, IDE driver will try to tune the transfer mode of the
-	  drive accordingly.
-
-
-
-HDIO_DRIVE_TASK			execute task and special drive command
-
-	Note:  If you don't have a copy of the ANSI ATA specification
-	handy, you should probably ignore this ioctl.
-
-	usage:
-
-	  u8 args[7];
-	  ...
-	  ioctl(fd, HDIO_DRIVE_TASK, args);
-
-	inputs:
-
-	  Taskfile register values:
-	    args[0]	COMMAND
-	    args[1]	FEATURE
-	    args[2]	NSECTOR
-	    args[3]	SECTOR
-	    args[4]	LCYL
-	    args[5]	HCYL
-	    args[6]	SELECT
-
-	outputs:
-
-	  Taskfile register values:
-	    args[0]	status
-	    args[1]	error
-	    args[2]	NSECTOR
-	    args[3]	SECTOR
-	    args[4]	LCYL
-	    args[5]	HCYL
-	    args[6]	SELECT
-
-	error returns:
-	  EACCES	Access denied:  requires CAP_SYS_RAWIO
-	  ENOMEM	Unable to allocate memory for task
-	  ENOMSG	Device is not a disk drive.
-	  EIO		Drive failed the command.
-
-	notes:
-
-	  [1] DEV bit (0x10) of SELECT register is ignored and the
-	  appropriate value for the drive is used.  All other bits
-	  are used unaltered.
-
-
-
-HDIO_DRIVE_CMD_AEB		HDIO_DRIVE_TASK
-
-	Not implemented, as of 2.6.8.1
-
-
-
-HDIO_SET_32BIT			change io_32bit flags
-
-	usage:
-
-	  int val;
-	  ioctl(fd, HDIO_SET_32BIT, val);
-
-	inputs:
-	  New value for io_32bit flag
-
-	outputs:	none
-
-	error return:
-	  EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  EINVAL	value out of range [0 3]
-	  EBUSY		Controller busy
-
-
-
-
-HDIO_SET_NOWERR			change ignore-write-error flag
-
-	usage:
-
-	  int val;
-	  ioctl(fd, HDIO_SET_NOWERR, val);
-
-	inputs:
-	  New value for ignore-write-error flag.  Used for ignoring
-	  WRERR_STAT
-
-	outputs:	none
-
-	error return:
-	  EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  EINVAL	value out of range [0 1]
-	  EBUSY		Controller busy
-
-
-
-HDIO_SET_DMA			change use-dma flag
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_SET_DMA, val);
-
-	inputs:
-	  New value for use-dma flag
-
-	outputs:	none
-
-	error return:
-	  EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  EINVAL	value out of range [0 1]
-	  EBUSY		Controller busy
-
-
-
-HDIO_SET_PIO_MODE		reconfig interface to new speed
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_SET_PIO_MODE, val);
-
-	inputs:
-	  New interface speed.
-
-	outputs:	none
-
-	error return:
-	  EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  EINVAL	value out of range [0 255]
-	  EBUSY		Controller busy
-
-
-
-HDIO_SCAN_HWIF			register and (re)scan interface
-
-	usage:
-
-	  int args[3]
-	  ...
-	  ioctl(fd, HDIO_SCAN_HWIF, args);
-
-	inputs:
-	  args[0]	io address to probe
-	  args[1]	control address to probe
-	  args[2]	irq number
-
-	outputs:	none
-
-	error returns:
-	  EACCES	Access denied:  requires CAP_SYS_RAWIO
-	  EIO		Probe failed.
-
-	notes:
-
-	  This ioctl initializes the addresses and irq for a disk
-	  controller, probes for drives, and creates /proc/ide
-	  interfaces as appropriate.
-
-
-
-HDIO_UNREGISTER_HWIF		unregister interface
-
-	usage:
-
-	  int index;
-	  ioctl(fd, HDIO_UNREGISTER_HWIF, index);
-
-	inputs:
-	  index		index of hardware interface to unregister
-
-	outputs:	none
-
-	error returns:
-	  EACCES	Access denied:  requires CAP_SYS_RAWIO
-
-	notes:
-
-	  This ioctl removes a hardware interface from the kernel.
-
-	  Currently (2.6.8) this ioctl silently fails if any drive on
-	  the interface is busy.
-
-
-
-HDIO_SET_WCACHE			change write cache enable-disable
-
-	usage:
-
-	  int val;
-	  ioctl(fd, HDIO_SET_WCACHE, val);
-
-	inputs:
-	  New value for write cache enable
-
-	outputs:	none
-
-	error return:
-	  EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  EINVAL	value out of range [0 1]
-	  EBUSY		Controller busy
-
-
-
-HDIO_SET_ACOUSTIC		change acoustic behavior
-
-	usage:
-
-	  int val;
-	  ioctl(fd, HDIO_SET_ACOUSTIC, val);
-
-	inputs:
-	  New value for drive acoustic settings
-
-	outputs:	none
-
-	error return:
-	  EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  EINVAL	value out of range [0 254]
-	  EBUSY		Controller busy
-
-
-
-HDIO_SET_QDMA			change use-qdma flag
-
-	Not implemented, as of 2.6.8.1
-
-
-
-HDIO_SET_ADDRESS		change lba addressing modes
-
-	usage:
-
-	  int val;
-	  ioctl(fd, HDIO_SET_ADDRESS, val);
-
-	inputs:
-	  New value for addressing mode
-	    0 = 28-bit
-	    1 = 48-bit
-	    2 = 48-bit doing 28-bit
-
-	outputs:	none
-
-	error return:
-	  EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  EINVAL	value out of range [0 2]
-	  EBUSY		Controller busy
-	  EIO		Drive does not support lba48 mode.
-
-
-HDIO_SET_IDE_SCSI
-
-	usage:
-
-	  long val;
-	  ioctl(fd, HDIO_SET_IDE_SCSI, val);
-
-	inputs:
-	  New value for scsi emulation mode (?)
-
-	outputs:	none
-
-	error return:
-	  EINVAL	(bdev != bdev->bd_contains) (not sure what this means)
-	  EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  EINVAL	value out of range [0 1]
-	  EBUSY		Controller busy
-
-
-
-HDIO_SET_SCSI_IDE
-
-	Not implemented, as of 2.6.8.1
-
-
diff --git a/Documentation/ioctl/index.rst b/Documentation/ioctl/index.rst
new file mode 100644
index 000000000000..0f0a857f6615
--- /dev/null
+++ b/Documentation/ioctl/index.rst
@@ -0,0 +1,16 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======
+IOCTLs
+======
+
+.. toctree::
+   :maxdepth: 1
+
+   ioctl-number
+
+   botching-up-ioctls
+   ioctl-decoding
+
+   cdrom
+   hdio
diff --git a/Documentation/ioctl/ioctl-decoding.rst b/Documentation/ioctl/ioctl-decoding.rst
new file mode 100644
index 000000000000..380d6bb3e3ea
--- /dev/null
+++ b/Documentation/ioctl/ioctl-decoding.rst
@@ -0,0 +1,31 @@
+==============================
+Decoding an IOCTL Magic Number
+==============================
+
+To decode a hex IOCTL code:
+
+Most architectures use this generic format, but check
+include/ARCH/ioctl.h for specifics, e.g. powerpc
+uses 3 bits to encode read/write and 13 bits for size.
+
+ ====== ==================================
+ bits   meaning
+ ====== ==================================
+ 31-30	00 - no parameters: uses _IO macro
+	10 - read: _IOR
+	01 - write: _IOW
+	11 - read/write: _IOWR
+
+ 29-16	size of arguments
+
+ 15-8	ascii character supposedly
+	unique to each driver
+
+ 7-0	function #
+ ====== ==================================
+
+
+So for example 0x82187201 is a read with arg length of 0x218,
+character 'r' function 1. Grepping the source reveals this is::
+
+	#define VFAT_IOCTL_READDIR_BOTH         _IOR('r', 1, struct dirent [2])
diff --git a/Documentation/ioctl/ioctl-decoding.txt b/Documentation/ioctl/ioctl-decoding.txt
deleted file mode 100644
index e35efb0cec2e..000000000000
--- a/Documentation/ioctl/ioctl-decoding.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-To decode a hex IOCTL code:
-
-Most architectures use this generic format, but check
-include/ARCH/ioctl.h for specifics, e.g. powerpc
-uses 3 bits to encode read/write and 13 bits for size.
-
- bits    meaning
- 31-30	00 - no parameters: uses _IO macro
-	10 - read: _IOR
-	01 - write: _IOW
-	11 - read/write: _IOWR
-
- 29-16	size of arguments
-
- 15-8	ascii character supposedly
-	unique to each driver
-
- 7-0	function #
-
-
-So for example 0x82187201 is a read with arg length of 0x218,
-character 'r' function 1. Grepping the source reveals this is:
-
-#define VFAT_IOCTL_READDIR_BOTH         _IOR('r', 1, struct dirent [2])
diff --git a/Documentation/ioctl/ioctl-number.rst b/Documentation/ioctl/ioctl-number.rst
new file mode 100644
index 000000000000..7f8dcae7a230
--- /dev/null
+++ b/Documentation/ioctl/ioctl-number.rst
@@ -0,0 +1,361 @@
+=============
+Ioctl Numbers
+=============
+
+19 October 1999
+
+Michael Elizabeth Chastain
+<mec@shout.net>
+
+If you are adding new ioctl's to the kernel, you should use the _IO
+macros defined in <linux/ioctl.h>:
+
+    ====== == ============================================
+    _IO    an ioctl with no parameters
+    _IOW   an ioctl with write parameters (copy_from_user)
+    _IOR   an ioctl with read parameters  (copy_to_user)
+    _IOWR  an ioctl with both write and read parameters.
+    ====== == ============================================
+
+'Write' and 'read' are from the user's point of view, just like the
+system calls 'write' and 'read'.  For example, a SET_FOO ioctl would
+be _IOW, although the kernel would actually read data from user space;
+a GET_FOO ioctl would be _IOR, although the kernel would actually write
+data to user space.
+
+The first argument to _IO, _IOW, _IOR, or _IOWR is an identifying letter
+or number from the table below.  Because of the large number of drivers,
+many drivers share a partial letter with other drivers.
+
+If you are writing a driver for a new device and need a letter, pick an
+unused block with enough room for expansion: 32 to 256 ioctl commands.
+You can register the block by patching this file and submitting the
+patch to Linus Torvalds.  Or you can e-mail me at <mec@shout.net> and
+I'll register one for you.
+
+The second argument to _IO, _IOW, _IOR, or _IOWR is a sequence number
+to distinguish ioctls from each other.  The third argument to _IOW,
+_IOR, or _IOWR is the type of the data going into the kernel or coming
+out of the kernel (e.g.  'int' or 'struct foo').  NOTE!  Do NOT use
+sizeof(arg) as the third argument as this results in your ioctl thinking
+it passes an argument of type size_t.
+
+Some devices use their major number as the identifier; this is OK, as
+long as it is unique.  Some devices are irregular and don't follow any
+convention at all.
+
+Following this convention is good because:
+
+(1) Keeping the ioctl's globally unique helps error checking:
+    if a program calls an ioctl on the wrong device, it will get an
+    error rather than some unexpected behaviour.
+
+(2) The 'strace' build procedure automatically finds ioctl numbers
+    defined with _IO, _IOW, _IOR, or _IOWR.
+
+(3) 'strace' can decode numbers back into useful names when the
+    numbers are unique.
+
+(4) People looking for ioctls can grep for them more easily when
+    this convention is used to define the ioctl numbers.
+
+(5) When following the convention, the driver code can use generic
+    code to copy the parameters between user and kernel space.
+
+This table lists ioctls visible from user land for Linux/x86.  It contains
+most drivers up to 2.6.31, but I know I am missing some.  There has been
+no attempt to list non-X86 architectures or ioctls from drivers/staging/.
+
+====  =====  ======================================================= ================================================================
+Code  Seq#    Include File                                           Comments
+      (hex)
+====  =====  ======================================================= ================================================================
+0x00  00-1F  linux/fs.h                                              conflict!
+0x00  00-1F  scsi/scsi_ioctl.h                                       conflict!
+0x00  00-1F  linux/fb.h                                              conflict!
+0x00  00-1F  linux/wavefront.h                                       conflict!
+0x02  all    linux/fd.h
+0x03  all    linux/hdreg.h
+0x04  D2-DC  linux/umsdos_fs.h                                       Dead since 2.6.11, but don't reuse these.
+0x06  all    linux/lp.h
+0x09  all    linux/raid/md_u.h
+0x10  00-0F  drivers/char/s390/vmcp.h
+0x10  10-1F  arch/s390/include/uapi/sclp_ctl.h
+0x10  20-2F  arch/s390/include/uapi/asm/hypfs.h
+0x12  all    linux/fs.h
+             linux/blkpg.h
+0x1b  all                                                            InfiniBand Subsystem
+                                                                     <http://infiniband.sourceforge.net/>
+0x20  all    drivers/cdrom/cm206.h
+0x22  all    scsi/sg.h
+'!'   00-1F  uapi/linux/seccomp.h
+'#'   00-3F                                                          IEEE 1394 Subsystem
+                                                                     Block for the entire subsystem
+'$'   00-0F  linux/perf_counter.h, linux/perf_event.h
+'%'   00-0F  include/uapi/linux/stm.h                                System Trace Module subsystem
+                                                                     <mailto:alexander.shishkin@linux.intel.com>
+'&'   00-07  drivers/firewire/nosy-user.h
+'1'   00-1F  linux/timepps.h                                         PPS kit from Ulrich Windl
+                                                                     <ftp://ftp.de.kernel.org/pub/linux/daemons/ntp/PPS/>
+'2'   01-04  linux/i2o.h
+'3'   00-0F  drivers/s390/char/raw3270.h                             conflict!
+'3'   00-1F  linux/suspend_ioctls.h,                                 conflict!
+             kernel/power/user.c
+'8'   all                                                            SNP8023 advanced NIC card
+                                                                     <mailto:mcr@solidum.com>
+';'   64-7F  linux/vfio.h
+'@'   00-0F  linux/radeonfb.h                                        conflict!
+'@'   00-0F  drivers/video/aty/aty128fb.c                            conflict!
+'A'   00-1F  linux/apm_bios.h                                        conflict!
+'A'   00-0F  linux/agpgart.h,                                        conflict!
+             drivers/char/agp/compat_ioctl.h
+'A'   00-7F  sound/asound.h                                          conflict!
+'B'   00-1F  linux/cciss_ioctl.h                                     conflict!
+'B'   00-0F  include/linux/pmu.h                                     conflict!
+'B'   C0-FF  advanced bbus                                           <mailto:maassen@uni-freiburg.de>
+'C'   all    linux/soundcard.h                                       conflict!
+'C'   01-2F  linux/capi.h                                            conflict!
+'C'   F0-FF  drivers/net/wan/cosa.h                                  conflict!
+'D'   all    arch/s390/include/asm/dasd.h
+'D'   40-5F  drivers/scsi/dpt/dtpi_ioctl.h
+'D'   05     drivers/scsi/pmcraid.h
+'E'   all    linux/input.h                                           conflict!
+'E'   00-0F  xen/evtchn.h                                            conflict!
+'F'   all    linux/fb.h                                              conflict!
+'F'   01-02  drivers/scsi/pmcraid.h                                  conflict!
+'F'   20     drivers/video/fsl-diu-fb.h                              conflict!
+'F'   20     drivers/video/intelfb/intelfb.h                         conflict!
+'F'   20     linux/ivtvfb.h                                          conflict!
+'F'   20     linux/matroxfb.h                                        conflict!
+'F'   20     drivers/video/aty/atyfb_base.c                          conflict!
+'F'   00-0F  video/da8xx-fb.h                                        conflict!
+'F'   80-8F  linux/arcfb.h                                           conflict!
+'F'   DD     video/sstfb.h                                           conflict!
+'G'   00-3F  drivers/misc/sgi-gru/grulib.h                           conflict!
+'G'   00-0F  linux/gigaset_dev.h                                     conflict!
+'H'   00-7F  linux/hiddev.h                                          conflict!
+'H'   00-0F  linux/hidraw.h                                          conflict!
+'H'   01     linux/mei.h                                             conflict!
+'H'   02     linux/mei.h                                             conflict!
+'H'   03     linux/mei.h                                             conflict!
+'H'   00-0F  sound/asound.h                                          conflict!
+'H'   20-40  sound/asound_fm.h                                       conflict!
+'H'   80-8F  sound/sfnt_info.h                                       conflict!
+'H'   10-8F  sound/emu10k1.h                                         conflict!
+'H'   10-1F  sound/sb16_csp.h                                        conflict!
+'H'   10-1F  sound/hda_hwdep.h                                       conflict!
+'H'   40-4F  sound/hdspm.h                                           conflict!
+'H'   40-4F  sound/hdsp.h                                            conflict!
+'H'   90     sound/usb/usx2y/usb_stream.h
+'H'   A0     uapi/linux/usb/cdc-wdm.h
+'H'   C0-F0  net/bluetooth/hci.h                                     conflict!
+'H'   C0-DF  net/bluetooth/hidp/hidp.h                               conflict!
+'H'   C0-DF  net/bluetooth/cmtp/cmtp.h                               conflict!
+'H'   C0-DF  net/bluetooth/bnep/bnep.h                               conflict!
+'H'   F1     linux/hid-roccat.h                                      <mailto:erazor_de@users.sourceforge.net>
+'H'   F8-FA  sound/firewire.h
+'I'   all    linux/isdn.h                                            conflict!
+'I'   00-0F  drivers/isdn/divert/isdn_divert.h                       conflict!
+'I'   40-4F  linux/mISDNif.h                                         conflict!
+'J'   00-1F  drivers/scsi/gdth_ioctl.h
+'K'   all    linux/kd.h
+'L'   00-1F  linux/loop.h                                            conflict!
+'L'   10-1F  drivers/scsi/mpt3sas/mpt3sas_ctl.h                      conflict!
+'L'   20-2F  linux/lightnvm.h
+'L'   E0-FF  linux/ppdd.h                                            encrypted disk device driver
+                                                                     <http://linux01.gwdg.de/~alatham/ppdd.html>
+'M'   all    linux/soundcard.h                                       conflict!
+'M'   01-16  mtd/mtd-abi.h                                           conflict!
+      and    drivers/mtd/mtdchar.c
+'M'   01-03  drivers/scsi/megaraid/megaraid_sas.h
+'M'   00-0F  drivers/video/fsl-diu-fb.h                              conflict!
+'N'   00-1F  drivers/usb/scanner.h
+'N'   40-7F  drivers/block/nvme.c
+'O'   00-06  mtd/ubi-user.h                                          UBI
+'P'   all    linux/soundcard.h                                       conflict!
+'P'   60-6F  sound/sscape_ioctl.h                                    conflict!
+'P'   00-0F  drivers/usb/class/usblp.c                               conflict!
+'P'   01-09  drivers/misc/pci_endpoint_test.c                        conflict!
+'Q'   all    linux/soundcard.h
+'R'   00-1F  linux/random.h                                          conflict!
+'R'   01     linux/rfkill.h                                          conflict!
+'R'   C0-DF  net/bluetooth/rfcomm.h
+'S'   all    linux/cdrom.h                                           conflict!
+'S'   80-81  scsi/scsi_ioctl.h                                       conflict!
+'S'   82-FF  scsi/scsi.h                                             conflict!
+'S'   00-7F  sound/asequencer.h                                      conflict!
+'T'   all    linux/soundcard.h                                       conflict!
+'T'   00-AF  sound/asound.h                                          conflict!
+'T'   all    arch/x86/include/asm/ioctls.h                           conflict!
+'T'   C0-DF  linux/if_tun.h                                          conflict!
+'U'   all    sound/asound.h                                          conflict!
+'U'   00-CF  linux/uinput.h                                          conflict!
+'U'   00-EF  linux/usbdevice_fs.h
+'U'   C0-CF  drivers/bluetooth/hci_uart.h
+'V'   all    linux/vt.h                                              conflict!
+'V'   all    linux/videodev2.h                                       conflict!
+'V'   C0     linux/ivtvfb.h                                          conflict!
+'V'   C0     linux/ivtv.h                                            conflict!
+'V'   C0     media/davinci/vpfe_capture.h                            conflict!
+'V'   C0     media/si4713.h                                          conflict!
+'W'   00-1F  linux/watchdog.h                                        conflict!
+'W'   00-1F  linux/wanrouter.h                                       conflict! (pre 3.9)
+'W'   00-3F  sound/asound.h                                          conflict!
+'W'   40-5F  drivers/pci/switch/switchtec.c
+'X'   all    fs/xfs/xfs_fs.h,                                        conflict!
+             fs/xfs/linux-2.6/xfs_ioctl32.h,
+             include/linux/falloc.h,
+             linux/fs.h,
+'X'   all    fs/ocfs2/ocfs_fs.h                                      conflict!
+'X'   01     linux/pktcdvd.h                                         conflict!
+'Y'   all    linux/cyclades.h
+'Z'   14-15  drivers/message/fusion/mptctl.h
+'['   00-3F  linux/usb/tmc.h                                         USB Test and Measurement Devices
+                                                                     <mailto:gregkh@linuxfoundation.org>
+'a'   all    linux/atm*.h, linux/sonet.h                             ATM on linux
+                                                                     <http://lrcwww.epfl.ch/>
+'a'   00-0F  drivers/crypto/qat/qat_common/adf_cfg_common.h          conflict! qat driver
+'b'   00-FF                                                          conflict! bit3 vme host bridge
+                                                                     <mailto:natalia@nikhefk.nikhef.nl>
+'c'   all    linux/cm4000_cs.h                                       conflict!
+'c'   00-7F  linux/comstats.h                                        conflict!
+'c'   00-7F  linux/coda.h                                            conflict!
+'c'   00-1F  linux/chio.h                                            conflict!
+'c'   80-9F  arch/s390/include/asm/chsc.h                            conflict!
+'c'   A0-AF  arch/x86/include/asm/msr.h conflict!
+'d'   00-FF  linux/char/drm/drm.h                                    conflict!
+'d'   02-40  pcmcia/ds.h                                             conflict!
+'d'   F0-FF  linux/digi1.h
+'e'   all    linux/digi1.h                                           conflict!
+'f'   00-1F  linux/ext2_fs.h                                         conflict!
+'f'   00-1F  linux/ext3_fs.h                                         conflict!
+'f'   00-0F  fs/jfs/jfs_dinode.h                                     conflict!
+'f'   00-0F  fs/ext4/ext4.h                                          conflict!
+'f'   00-0F  linux/fs.h                                              conflict!
+'f'   00-0F  fs/ocfs2/ocfs2_fs.h                                     conflict!
+'g'   00-0F  linux/usb/gadgetfs.h
+'g'   20-2F  linux/usb/g_printer.h
+'h'   00-7F                                                          conflict! Charon filesystem
+                                                                     <mailto:zapman@interlan.net>
+'h'   00-1F  linux/hpet.h                                            conflict!
+'h'   80-8F  fs/hfsplus/ioctl.c
+'i'   00-3F  linux/i2o-dev.h                                         conflict!
+'i'   0B-1F  linux/ipmi.h                                            conflict!
+'i'   80-8F  linux/i8k.h
+'j'   00-3F  linux/joystick.h
+'k'   00-0F  linux/spi/spidev.h                                      conflict!
+'k'   00-05  video/kyro.h                                            conflict!
+'k'   10-17  linux/hsi/hsi_char.h                                    HSI character device
+'l'   00-3F  linux/tcfs_fs.h                                         transparent cryptographic file system
+                                                                     <http://web.archive.org/web/%2A/http://mikonos.dia.unisa.it/tcfs>
+'l'   40-7F  linux/udf_fs_i.h                                        in development:
+                                                                     <http://sourceforge.net/projects/linux-udf/>
+'m'   00-09  linux/mmtimer.h                                         conflict!
+'m'   all    linux/mtio.h                                            conflict!
+'m'   all    linux/soundcard.h                                       conflict!
+'m'   all    linux/synclink.h                                        conflict!
+'m'   00-19  drivers/message/fusion/mptctl.h                         conflict!
+'m'   00     drivers/scsi/megaraid/megaraid_ioctl.h                  conflict!
+'n'   00-7F  linux/ncp_fs.h and fs/ncpfs/ioctl.c
+'n'   80-8F  uapi/linux/nilfs2_api.h                                 NILFS2
+'n'   E0-FF  linux/matroxfb.h                                        matroxfb
+'o'   00-1F  fs/ocfs2/ocfs2_fs.h                                     OCFS2
+'o'   00-03  mtd/ubi-user.h                                          conflict! (OCFS2 and UBI overlaps)
+'o'   40-41  mtd/ubi-user.h                                          UBI
+'o'   01-A1  `linux/dvb/*.h`                                         DVB
+'p'   00-0F  linux/phantom.h                                         conflict! (OpenHaptics needs this)
+'p'   00-1F  linux/rtc.h                                             conflict!
+'p'   00-3F  linux/mc146818rtc.h                                     conflict!
+'p'   40-7F  linux/nvram.h
+'p'   80-9F  linux/ppdev.h                                           user-space parport
+                                                                     <mailto:tim@cyberelk.net>
+'p'   A1-A5  linux/pps.h                                             LinuxPPS
+                                                                     <mailto:giometti@linux.it>
+'q'   00-1F  linux/serio.h
+'q'   80-FF  linux/telephony.h                                       Internet PhoneJACK, Internet LineJACK
+             linux/ixjuser.h                                         <http://web.archive.org/web/%2A/http://www.quicknet.net>
+'r'   00-1F  linux/msdos_fs.h and fs/fat/dir.c
+'s'   all    linux/cdk.h
+'t'   00-7F  linux/ppp-ioctl.h
+'t'   80-8F  linux/isdn_ppp.h
+'t'   90-91  linux/toshiba.h                                         toshiba and toshiba_acpi SMM
+'u'   00-1F  linux/smb_fs.h                                          gone
+'u'   20-3F  linux/uvcvideo.h                                        USB video class host driver
+'u'   40-4f  linux/udmabuf.h                                         userspace dma-buf misc device
+'v'   00-1F  linux/ext2_fs.h                                         conflict!
+'v'   00-1F  linux/fs.h                                              conflict!
+'v'   00-0F  linux/sonypi.h                                          conflict!
+'v'   00-0F  media/v4l2-subdev.h                                     conflict!
+'v'   C0-FF  linux/meye.h                                            conflict!
+'w'   all                                                            CERN SCI driver
+'y'   00-1F                                                          packet based user level communications
+                                                                     <mailto:zapman@interlan.net>
+'z'   00-3F                                                          CAN bus card conflict!
+                                                                     <mailto:hdstich@connectu.ulm.circular.de>
+'z'   40-7F                                                          CAN bus card conflict!
+                                                                     <mailto:oe@port.de>
+'z'   10-4F  drivers/s390/crypto/zcrypt_api.h                        conflict!
+'|'   00-7F  linux/media.h
+0x80  00-1F  linux/fb.h
+0x89  00-06  arch/x86/include/asm/sockios.h
+0x89  0B-DF  linux/sockios.h
+0x89  E0-EF  linux/sockios.h                                         SIOCPROTOPRIVATE range
+0x89  E0-EF  linux/dn.h                                              PROTOPRIVATE range
+0x89  F0-FF  linux/sockios.h                                         SIOCDEVPRIVATE range
+0x8B  all    linux/wireless.h
+0x8C  00-3F                                                          WiNRADiO driver
+                                                                     <http://www.winradio.com.au/>
+0x90  00     drivers/cdrom/sbpcd.h
+0x92  00-0F  drivers/usb/mon/mon_bin.c
+0x93  60-7F  linux/auto_fs.h
+0x94  all    fs/btrfs/ioctl.h                                        Btrfs filesystem
+             and linux/fs.h                                          some lifted to vfs/generic
+0x97  00-7F  fs/ceph/ioctl.h                                         Ceph file system
+0x99  00-0F                                                          537-Addinboard driver
+                                                                     <mailto:buk@buks.ipn.de>
+0xA0  all    linux/sdp/sdp.h                                         Industrial Device Project
+                                                                     <mailto:kenji@bitgate.com>
+0xA1  0      linux/vtpm_proxy.h                                      TPM Emulator Proxy Driver
+0xA3  80-8F                                                          Port ACL  in development:
+                                                                     <mailto:tlewis@mindspring.com>
+0xA3  90-9F  linux/dtlk.h
+0xA4  00-1F  uapi/linux/tee.h                                        Generic TEE subsystem
+0xAA  00-3F  linux/uapi/linux/userfaultfd.h
+0xAB  00-1F  linux/nbd.h
+0xAC  00-1F  linux/raw.h
+0xAD  00                                                             Netfilter device in development:
+                                                                     <mailto:rusty@rustcorp.com.au>
+0xAE  all    linux/kvm.h                                             Kernel-based Virtual Machine
+                                                                     <mailto:kvm@vger.kernel.org>
+0xAF  00-1F  linux/fsl_hypervisor.h                                  Freescale hypervisor
+0xB0  all                                                            RATIO devices in development:
+                                                                     <mailto:vgo@ratio.de>
+0xB1  00-1F                                                          PPPoX
+                                                                     <mailto:mostrows@styx.uwaterloo.ca>
+0xB3  00     linux/mmc/ioctl.h
+0xB4  00-0F  linux/gpio.h                                            <mailto:linux-gpio@vger.kernel.org>
+0xB5  00-0F  uapi/linux/rpmsg.h                                      <mailto:linux-remoteproc@vger.kernel.org>
+0xB6  all    linux/fpga-dfl.h
+0xC0  00-0F  linux/usb/iowarrior.h
+0xCA  00-0F  uapi/misc/cxl.h
+0xCA  10-2F  uapi/misc/ocxl.h
+0xCA  80-BF  uapi/scsi/cxlflash_ioctl.h
+0xCB  00-1F                                                          CBM serial IEC bus in development:
+                                                                     <mailto:michael.klein@puffin.lb.shuttle.de>
+0xCC  00-0F  drivers/misc/ibmvmc.h                                   pseries VMC driver
+0xCD  01     linux/reiserfs_fs.h
+0xCF  02     fs/cifs/ioctl.c
+0xDB  00-0F  drivers/char/mwave/mwavepub.h
+0xDD  00-3F                                                          ZFCP device driver see drivers/s390/scsi/
+                                                                     <mailto:aherrman@de.ibm.com>
+0xE5  00-3F  linux/fuse.h
+0xEC  00-01  drivers/platform/chrome/cros_ec_dev.h                   ChromeOS EC driver
+0xF3  00-3F  drivers/usb/misc/sisusbvga/sisusb.h                     sisfb (in development)
+                                                                     <mailto:thomas@winischhofer.net>
+0xF4  00-1F  video/mbxfb.h                                           mbxfb
+                                                                     <mailto:raph@8d.com>
+0xF6  all                                                            LTTng Linux Trace Toolkit Next Generation
+                                                                     <mailto:mathieu.desnoyers@efficios.com>
+0xFD  all    linux/dm-ioctl.h
+0xFE  all    linux/isst_if.h
+====  =====  ======================================================= ================================================================
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
deleted file mode 100644
index c9558146ac58..000000000000
--- a/Documentation/ioctl/ioctl-number.txt
+++ /dev/null
@@ -1,350 +0,0 @@
-Ioctl Numbers
-19 October 1999
-Michael Elizabeth Chastain
-<mec@shout.net>
-
-If you are adding new ioctl's to the kernel, you should use the _IO
-macros defined in <linux/ioctl.h>:
-
-    _IO    an ioctl with no parameters
-    _IOW   an ioctl with write parameters (copy_from_user)
-    _IOR   an ioctl with read parameters  (copy_to_user)
-    _IOWR  an ioctl with both write and read parameters.
-
-'Write' and 'read' are from the user's point of view, just like the
-system calls 'write' and 'read'.  For example, a SET_FOO ioctl would
-be _IOW, although the kernel would actually read data from user space;
-a GET_FOO ioctl would be _IOR, although the kernel would actually write
-data to user space.
-
-The first argument to _IO, _IOW, _IOR, or _IOWR is an identifying letter
-or number from the table below.  Because of the large number of drivers,
-many drivers share a partial letter with other drivers.
-
-If you are writing a driver for a new device and need a letter, pick an
-unused block with enough room for expansion: 32 to 256 ioctl commands.
-You can register the block by patching this file and submitting the
-patch to Linus Torvalds.  Or you can e-mail me at <mec@shout.net> and
-I'll register one for you.
-
-The second argument to _IO, _IOW, _IOR, or _IOWR is a sequence number
-to distinguish ioctls from each other.  The third argument to _IOW,
-_IOR, or _IOWR is the type of the data going into the kernel or coming
-out of the kernel (e.g.  'int' or 'struct foo').  NOTE!  Do NOT use
-sizeof(arg) as the third argument as this results in your ioctl thinking
-it passes an argument of type size_t.
-
-Some devices use their major number as the identifier; this is OK, as
-long as it is unique.  Some devices are irregular and don't follow any
-convention at all.
-
-Following this convention is good because:
-
-(1) Keeping the ioctl's globally unique helps error checking:
-    if a program calls an ioctl on the wrong device, it will get an
-    error rather than some unexpected behaviour.
-
-(2) The 'strace' build procedure automatically finds ioctl numbers
-    defined with _IO, _IOW, _IOR, or _IOWR.
-
-(3) 'strace' can decode numbers back into useful names when the
-    numbers are unique.
-
-(4) People looking for ioctls can grep for them more easily when
-    this convention is used to define the ioctl numbers.
-
-(5) When following the convention, the driver code can use generic
-    code to copy the parameters between user and kernel space.
-
-This table lists ioctls visible from user land for Linux/x86.  It contains
-most drivers up to 2.6.31, but I know I am missing some.  There has been
-no attempt to list non-X86 architectures or ioctls from drivers/staging/.
-
-Code  Seq#(hex)	Include File		Comments
-========================================================
-0x00	00-1F	linux/fs.h		conflict!
-0x00	00-1F	scsi/scsi_ioctl.h	conflict!
-0x00	00-1F	linux/fb.h		conflict!
-0x00	00-1F	linux/wavefront.h	conflict!
-0x02	all	linux/fd.h
-0x03	all	linux/hdreg.h
-0x04	D2-DC	linux/umsdos_fs.h	Dead since 2.6.11, but don't reuse these.
-0x06	all	linux/lp.h
-0x09	all	linux/raid/md_u.h
-0x10	00-0F	drivers/char/s390/vmcp.h
-0x10	10-1F	arch/s390/include/uapi/sclp_ctl.h
-0x10	20-2F	arch/s390/include/uapi/asm/hypfs.h
-0x12	all	linux/fs.h
-		linux/blkpg.h
-0x1b	all	InfiniBand Subsystem	<http://infiniband.sourceforge.net/>
-0x20	all	drivers/cdrom/cm206.h
-0x22	all	scsi/sg.h
-'!'	00-1F	uapi/linux/seccomp.h
-'#'	00-3F	IEEE 1394 Subsystem	Block for the entire subsystem
-'$'	00-0F	linux/perf_counter.h, linux/perf_event.h
-'%'	00-0F	include/uapi/linux/stm.h
-					System Trace Module subsystem
-					<mailto:alexander.shishkin@linux.intel.com>
-'&'	00-07	drivers/firewire/nosy-user.h
-'1'	00-1F	<linux/timepps.h>	PPS kit from Ulrich Windl
-					<ftp://ftp.de.kernel.org/pub/linux/daemons/ntp/PPS/>
-'2'	01-04	linux/i2o.h
-'3'	00-0F	drivers/s390/char/raw3270.h	conflict!
-'3'	00-1F	linux/suspend_ioctls.h	conflict!
-		and kernel/power/user.c
-'8'	all				SNP8023 advanced NIC card
-					<mailto:mcr@solidum.com>
-';'	64-7F	linux/vfio.h
-'@'	00-0F	linux/radeonfb.h	conflict!
-'@'	00-0F	drivers/video/aty/aty128fb.c	conflict!
-'A'	00-1F	linux/apm_bios.h	conflict!
-'A'	00-0F	linux/agpgart.h		conflict!
-		and drivers/char/agp/compat_ioctl.h
-'A'	00-7F	sound/asound.h		conflict!
-'B'	00-1F	linux/cciss_ioctl.h	conflict!
-'B'	00-0F	include/linux/pmu.h	conflict!
-'B'	C0-FF				advanced bbus
-					<mailto:maassen@uni-freiburg.de>
-'C'	all	linux/soundcard.h	conflict!
-'C'	01-2F	linux/capi.h		conflict!
-'C'	F0-FF	drivers/net/wan/cosa.h	conflict!
-'D'	all	arch/s390/include/asm/dasd.h
-'D'	40-5F	drivers/scsi/dpt/dtpi_ioctl.h
-'D'	05	drivers/scsi/pmcraid.h
-'E'	all	linux/input.h		conflict!
-'E'	00-0F	xen/evtchn.h		conflict!
-'F'	all	linux/fb.h		conflict!
-'F'	01-02	drivers/scsi/pmcraid.h	conflict!
-'F'	20	drivers/video/fsl-diu-fb.h	conflict!
-'F'	20	drivers/video/intelfb/intelfb.h	conflict!
-'F'	20	linux/ivtvfb.h		conflict!
-'F'	20	linux/matroxfb.h	conflict!
-'F'	20	drivers/video/aty/atyfb_base.c	conflict!
-'F'	00-0F	video/da8xx-fb.h	conflict!
-'F'	80-8F	linux/arcfb.h		conflict!
-'F'	DD	video/sstfb.h		conflict!
-'G'	00-3F	drivers/misc/sgi-gru/grulib.h	conflict!
-'G'	00-0F	linux/gigaset_dev.h	conflict!
-'H'	00-7F	linux/hiddev.h		conflict!
-'H'	00-0F	linux/hidraw.h		conflict!
-'H'	01	linux/mei.h		conflict!
-'H'	02	linux/mei.h		conflict!
-'H'	03	linux/mei.h		conflict!
-'H'	00-0F	sound/asound.h		conflict!
-'H'	20-40	sound/asound_fm.h	conflict!
-'H'	80-8F	sound/sfnt_info.h	conflict!
-'H'	10-8F	sound/emu10k1.h		conflict!
-'H'	10-1F	sound/sb16_csp.h	conflict!
-'H'	10-1F	sound/hda_hwdep.h	conflict!
-'H'	40-4F	sound/hdspm.h		conflict!
-'H'	40-4F	sound/hdsp.h		conflict!
-'H'	90	sound/usb/usx2y/usb_stream.h
-'H'	A0	uapi/linux/usb/cdc-wdm.h
-'H'	C0-F0	net/bluetooth/hci.h	conflict!
-'H'	C0-DF	net/bluetooth/hidp/hidp.h	conflict!
-'H'	C0-DF	net/bluetooth/cmtp/cmtp.h	conflict!
-'H'	C0-DF	net/bluetooth/bnep/bnep.h	conflict!
-'H'	F1	linux/hid-roccat.h	<mailto:erazor_de@users.sourceforge.net>
-'H'	F8-FA	sound/firewire.h
-'I'	all	linux/isdn.h		conflict!
-'I'	00-0F	drivers/isdn/divert/isdn_divert.h	conflict!
-'I'	40-4F	linux/mISDNif.h		conflict!
-'J'	00-1F	drivers/scsi/gdth_ioctl.h
-'K'	all	linux/kd.h
-'L'	00-1F	linux/loop.h		conflict!
-'L'	10-1F	drivers/scsi/mpt3sas/mpt3sas_ctl.h	conflict!
-'L'	20-2F	linux/lightnvm.h
-'L'	E0-FF	linux/ppdd.h		encrypted disk device driver
-					<http://linux01.gwdg.de/~alatham/ppdd.html>
-'M'	all	linux/soundcard.h	conflict!
-'M'	01-16	mtd/mtd-abi.h		conflict!
-		and drivers/mtd/mtdchar.c
-'M'	01-03	drivers/scsi/megaraid/megaraid_sas.h
-'M'	00-0F	drivers/video/fsl-diu-fb.h	conflict!
-'N'	00-1F	drivers/usb/scanner.h
-'N'	40-7F	drivers/block/nvme.c
-'O'     00-06   mtd/ubi-user.h		UBI
-'P'	all	linux/soundcard.h	conflict!
-'P'	60-6F	sound/sscape_ioctl.h	conflict!
-'P'	00-0F	drivers/usb/class/usblp.c	conflict!
-'P'	01-09	drivers/misc/pci_endpoint_test.c	conflict!
-'Q'	all	linux/soundcard.h
-'R'	00-1F	linux/random.h		conflict!
-'R'	01	linux/rfkill.h		conflict!
-'R'	C0-DF	net/bluetooth/rfcomm.h
-'S'	all	linux/cdrom.h		conflict!
-'S'	80-81	scsi/scsi_ioctl.h	conflict!
-'S'	82-FF	scsi/scsi.h		conflict!
-'S'	00-7F	sound/asequencer.h	conflict!
-'T'	all	linux/soundcard.h	conflict!
-'T'	00-AF	sound/asound.h		conflict!
-'T'	all	arch/x86/include/asm/ioctls.h	conflict!
-'T'	C0-DF	linux/if_tun.h		conflict!
-'U'	all	sound/asound.h		conflict!
-'U'	00-CF	linux/uinput.h		conflict!
-'U'	00-EF	linux/usbdevice_fs.h
-'U'	C0-CF	drivers/bluetooth/hci_uart.h
-'V'	all	linux/vt.h		conflict!
-'V'	all	linux/videodev2.h	conflict!
-'V'	C0	linux/ivtvfb.h		conflict!
-'V'	C0	linux/ivtv.h		conflict!
-'V'	C0	media/davinci/vpfe_capture.h	conflict!
-'V'	C0	media/si4713.h		conflict!
-'W'	00-1F	linux/watchdog.h	conflict!
-'W'	00-1F	linux/wanrouter.h	conflict!		(pre 3.9)
-'W'	00-3F	sound/asound.h		conflict!
-'W'	40-5F   drivers/pci/switch/switchtec.c
-'X'	all	fs/xfs/xfs_fs.h		conflict!
-		and fs/xfs/linux-2.6/xfs_ioctl32.h
-		and include/linux/falloc.h
-		and linux/fs.h
-'X'	all	fs/ocfs2/ocfs_fs.h	conflict!
-'X'	01	linux/pktcdvd.h		conflict!
-'Y'	all	linux/cyclades.h
-'Z'	14-15	drivers/message/fusion/mptctl.h
-'['	00-3F	linux/usb/tmc.h		USB Test and Measurement Devices
-					<mailto:gregkh@linuxfoundation.org>
-'a'	all	linux/atm*.h, linux/sonet.h	ATM on linux
-					<http://lrcwww.epfl.ch/>
-'a'	00-0F	drivers/crypto/qat/qat_common/adf_cfg_common.h	conflict! qat driver
-'b'	00-FF				conflict! bit3 vme host bridge
-					<mailto:natalia@nikhefk.nikhef.nl>
-'c'	all	linux/cm4000_cs.h	conflict!
-'c'	00-7F	linux/comstats.h	conflict!
-'c'	00-7F	linux/coda.h		conflict!
-'c'	00-1F	linux/chio.h		conflict!
-'c'	80-9F	arch/s390/include/asm/chsc.h	conflict!
-'c'	A0-AF   arch/x86/include/asm/msr.h	conflict!
-'d'	00-FF	linux/char/drm/drm.h	conflict!
-'d'	02-40	pcmcia/ds.h		conflict!
-'d'	F0-FF	linux/digi1.h
-'e'	all	linux/digi1.h		conflict!
-'f'	00-1F	linux/ext2_fs.h		conflict!
-'f'	00-1F	linux/ext3_fs.h		conflict!
-'f'	00-0F	fs/jfs/jfs_dinode.h	conflict!
-'f'	00-0F	fs/ext4/ext4.h		conflict!
-'f'	00-0F	linux/fs.h		conflict!
-'f'	00-0F	fs/ocfs2/ocfs2_fs.h	conflict!
-'g'	00-0F	linux/usb/gadgetfs.h
-'g'	20-2F	linux/usb/g_printer.h
-'h'	00-7F				conflict! Charon filesystem
-					<mailto:zapman@interlan.net>
-'h'	00-1F	linux/hpet.h		conflict!
-'h'	80-8F	fs/hfsplus/ioctl.c
-'i'	00-3F	linux/i2o-dev.h		conflict!
-'i'	0B-1F	linux/ipmi.h		conflict!
-'i'	80-8F	linux/i8k.h
-'j'	00-3F	linux/joystick.h
-'k'	00-0F	linux/spi/spidev.h	conflict!
-'k'	00-05	video/kyro.h		conflict!
-'k'	10-17	linux/hsi/hsi_char.h	HSI character device
-'l'	00-3F	linux/tcfs_fs.h		transparent cryptographic file system
-					<http://web.archive.org/web/*/http://mikonos.dia.unisa.it/tcfs>
-'l'	40-7F	linux/udf_fs_i.h	in development:
-					<http://sourceforge.net/projects/linux-udf/>
-'m'	00-09	linux/mmtimer.h		conflict!
-'m'	all	linux/mtio.h		conflict!
-'m'	all	linux/soundcard.h	conflict!
-'m'	all	linux/synclink.h	conflict!
-'m'	00-19	drivers/message/fusion/mptctl.h	conflict!
-'m'	00	drivers/scsi/megaraid/megaraid_ioctl.h	conflict!
-'n'	00-7F	linux/ncp_fs.h and fs/ncpfs/ioctl.c
-'n'	80-8F	uapi/linux/nilfs2_api.h	NILFS2
-'n'	E0-FF	linux/matroxfb.h	matroxfb
-'o'	00-1F	fs/ocfs2/ocfs2_fs.h	OCFS2
-'o'     00-03   mtd/ubi-user.h		conflict! (OCFS2 and UBI overlaps)
-'o'     40-41   mtd/ubi-user.h		UBI
-'o'     01-A1   linux/dvb/*.h		DVB
-'p'	00-0F	linux/phantom.h		conflict! (OpenHaptics needs this)
-'p'	00-1F	linux/rtc.h		conflict!
-'p'	00-3F	linux/mc146818rtc.h	conflict!
-'p'	40-7F	linux/nvram.h
-'p'	80-9F	linux/ppdev.h		user-space parport
-					<mailto:tim@cyberelk.net>
-'p'	A1-A5	linux/pps.h		LinuxPPS
-					<mailto:giometti@linux.it>
-'q'	00-1F	linux/serio.h
-'q'	80-FF	linux/telephony.h	Internet PhoneJACK, Internet LineJACK
-		linux/ixjuser.h		<http://web.archive.org/web/*/http://www.quicknet.net>
-'r'	00-1F	linux/msdos_fs.h and fs/fat/dir.c
-'s'	all	linux/cdk.h
-'t'	00-7F	linux/ppp-ioctl.h
-'t'	80-8F	linux/isdn_ppp.h
-'t'	90-91	linux/toshiba.h		toshiba and toshiba_acpi SMM
-'u'	00-1F	linux/smb_fs.h		gone
-'u'	20-3F	linux/uvcvideo.h	USB video class host driver
-'u'	40-4f	linux/udmabuf.h		userspace dma-buf misc device
-'v'	00-1F	linux/ext2_fs.h		conflict!
-'v'	00-1F	linux/fs.h		conflict!
-'v'	00-0F	linux/sonypi.h		conflict!
-'v'	00-0F	media/v4l2-subdev.h	conflict!
-'v'	C0-FF	linux/meye.h		conflict!
-'w'	all				CERN SCI driver
-'y'	00-1F				packet based user level communications
-					<mailto:zapman@interlan.net>
-'z'	00-3F				CAN bus card	conflict!
-					<mailto:hdstich@connectu.ulm.circular.de>
-'z'	40-7F				CAN bus card	conflict!
-					<mailto:oe@port.de>
-'z'	10-4F	drivers/s390/crypto/zcrypt_api.h	conflict!
-'|'	00-7F	linux/media.h
-0x80	00-1F	linux/fb.h
-0x89	00-06	arch/x86/include/asm/sockios.h
-0x89	0B-DF	linux/sockios.h
-0x89	E0-EF	linux/sockios.h		SIOCPROTOPRIVATE range
-0x89	E0-EF	linux/dn.h		PROTOPRIVATE range
-0x89	F0-FF	linux/sockios.h		SIOCDEVPRIVATE range
-0x8B	all	linux/wireless.h
-0x8C	00-3F				WiNRADiO driver
-					<http://www.winradio.com.au/>
-0x90	00	drivers/cdrom/sbpcd.h
-0x92	00-0F	drivers/usb/mon/mon_bin.c
-0x93	60-7F	linux/auto_fs.h
-0x94	all	fs/btrfs/ioctl.h	Btrfs filesystem
-		and linux/fs.h		some lifted to vfs/generic
-0x97	00-7F	fs/ceph/ioctl.h		Ceph file system
-0x99	00-0F				537-Addinboard driver
-					<mailto:buk@buks.ipn.de>
-0xA0	all	linux/sdp/sdp.h		Industrial Device Project
-					<mailto:kenji@bitgate.com>
-0xA1	0	linux/vtpm_proxy.h	TPM Emulator Proxy Driver
-0xA3	80-8F	Port ACL		in development:
-					<mailto:tlewis@mindspring.com>
-0xA3	90-9F	linux/dtlk.h
-0xA4	00-1F	uapi/linux/tee.h	Generic TEE subsystem
-0xAA	00-3F	linux/uapi/linux/userfaultfd.h
-0xAB	00-1F	linux/nbd.h
-0xAC	00-1F	linux/raw.h
-0xAD	00	Netfilter device	in development:
-					<mailto:rusty@rustcorp.com.au>
-0xAE	all	linux/kvm.h		Kernel-based Virtual Machine
-					<mailto:kvm@vger.kernel.org>
-0xAF	00-1F	linux/fsl_hypervisor.h	Freescale hypervisor
-0xB0	all	RATIO devices		in development:
-					<mailto:vgo@ratio.de>
-0xB1	00-1F	PPPoX			<mailto:mostrows@styx.uwaterloo.ca>
-0xB3	00	linux/mmc/ioctl.h
-0xB4	00-0F	linux/gpio.h		<mailto:linux-gpio@vger.kernel.org>
-0xB5	00-0F	uapi/linux/rpmsg.h	<mailto:linux-remoteproc@vger.kernel.org>
-0xB6	all	linux/fpga-dfl.h
-0xC0	00-0F	linux/usb/iowarrior.h
-0xCA	00-0F	uapi/misc/cxl.h
-0xCA	10-2F	uapi/misc/ocxl.h
-0xCA	80-BF	uapi/scsi/cxlflash_ioctl.h
-0xCB	00-1F	CBM serial IEC bus	in development:
-					<mailto:michael.klein@puffin.lb.shuttle.de>
-0xCC	00-0F	drivers/misc/ibmvmc.h    pseries VMC driver
-0xCD	01	linux/reiserfs_fs.h
-0xCF	02	fs/cifs/ioctl.c
-0xDB	00-0F	drivers/char/mwave/mwavepub.h
-0xDD	00-3F	ZFCP device driver	see drivers/s390/scsi/
-					<mailto:aherrman@de.ibm.com>
-0xE5	00-3F	linux/fuse.h
-0xEC	00-01	drivers/platform/chrome/cros_ec_dev.h	ChromeOS EC driver
-0xF3	00-3F	drivers/usb/misc/sisusbvga/sisusb.h	sisfb (in development)
-					<mailto:thomas@winischhofer.net>
-0xF4	00-1F	video/mbxfb.h		mbxfb
-					<mailto:raph@8d.com>
-0xF6	all	LTTng			Linux Trace Toolkit Next Generation
-					<mailto:mathieu.desnoyers@efficios.com>
-0xFD	all	linux/dm-ioctl.h
diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt
deleted file mode 100644
index 49df45f90e8a..000000000000
--- a/Documentation/iostats.txt
+++ /dev/null
@@ -1,193 +0,0 @@
-=====================
-I/O statistics fields
-=====================
-
-Since 2.4.20 (and some versions before, with patches), and 2.5.45,
-more extensive disk statistics have been introduced to help measure disk
-activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do
-the work for you, but in case you are interested in creating your own
-tools, the fields are explained here.
-
-In 2.4 now, the information is found as additional fields in
-``/proc/partitions``.  In 2.6 and upper, the same information is found in two
-places: one is in the file ``/proc/diskstats``, and the other is within
-the sysfs file system, which must be mounted in order to obtain
-the information. Throughout this document we'll assume that sysfs
-is mounted on ``/sys``, although of course it may be mounted anywhere.
-Both ``/proc/diskstats`` and sysfs use the same source for the information
-and so should not differ.
-
-Here are examples of these different formats::
-
-   2.4:
-      3     0   39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
-      3     1    9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030
-
-   2.6+ sysfs:
-      446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
-      35486    38030    38030    38030
-
-   2.6+ diskstats:
-      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
-      3    1   hda1 35486 38030 38030 38030
-
-   4.18+ diskstats:
-      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0
-
-On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
-a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
-
-The advantage of one over the other is that the sysfs choice works well
-if you are watching a known, small set of disks.  ``/proc/diskstats`` may
-be a better choice if you are watching a large number of disks because
-you'll avoid the overhead of 50, 100, or 500 or more opens/closes with
-each snapshot of your disk statistics.
-
-In 2.4, the statistics fields are those after the device name. In
-the above example, the first field of statistics would be 446216.
-By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll
-find just the eleven fields, beginning with 446216.  If you look at
-``/proc/diskstats``, the eleven fields will be preceded by the major and
-minor device numbers, and device name.  Each of these formats provides
-eleven fields of statistics, each meaning exactly the same things.
-All fields except field 9 are cumulative since boot.  Field 9 should
-go to zero as I/Os complete; all others only increase (unless they
-overflow and wrap).  Yes, these are (32-bit or 64-bit) unsigned long
-(native word size) numbers, and on a very busy or long-lived system they
-may wrap. Applications should be prepared to deal with that; unless
-your observations are measured in large numbers of minutes or hours,
-they should not wrap twice before you notice them.
-
-Each set of stats only applies to the indicated device; if you want
-system-wide stats you'll have to find all the devices and sum them all up.
-
-Field  1 -- # of reads completed
-    This is the total number of reads completed successfully.
-
-Field  2 -- # of reads merged, field 6 -- # of writes merged
-    Reads and writes which are adjacent to each other may be merged for
-    efficiency.  Thus two 4K reads may become one 8K read before it is
-    ultimately handed to the disk, and so it will be counted (and queued)
-    as only one I/O.  This field lets you know how often this was done.
-
-Field  3 -- # of sectors read
-    This is the total number of sectors read successfully.
-
-Field  4 -- # of milliseconds spent reading
-    This is the total number of milliseconds spent by all reads (as
-    measured from __make_request() to end_that_request_last()).
-
-Field  5 -- # of writes completed
-    This is the total number of writes completed successfully.
-
-Field  6 -- # of writes merged
-    See the description of field 2.
-
-Field  7 -- # of sectors written
-    This is the total number of sectors written successfully.
-
-Field  8 -- # of milliseconds spent writing
-    This is the total number of milliseconds spent by all writes (as
-    measured from __make_request() to end_that_request_last()).
-
-Field  9 -- # of I/Os currently in progress
-    The only field that should go to zero. Incremented as requests are
-    given to appropriate struct request_queue and decremented as they finish.
-
-Field 10 -- # of milliseconds spent doing I/Os
-    This field increases so long as field 9 is nonzero.
-
-Field 11 -- weighted # of milliseconds spent doing I/Os
-    This field is incremented at each I/O start, I/O completion, I/O
-    merge, or read of these stats by the number of I/Os in progress
-    (field 9) times the number of milliseconds spent doing I/O since the
-    last update of this field.  This can provide an easy measure of both
-    I/O completion time and the backlog that may be accumulating.
-
-Field 12 -- # of discards completed
-    This is the total number of discards completed successfully.
-
-Field 13 -- # of discards merged
-    See the description of field 2
-
-Field 14 -- # of sectors discarded
-    This is the total number of sectors discarded successfully.
-
-Field 15 -- # of milliseconds spent discarding
-    This is the total number of milliseconds spent by all discards (as
-    measured from __make_request() to end_that_request_last()).
-
-To avoid introducing performance bottlenecks, no locks are held while
-modifying these counters.  This implies that minor inaccuracies may be
-introduced when changes collide, so (for instance) adding up all the
-read I/Os issued per partition should equal those made to the disks ...
-but due to the lack of locking it may only be very close.
-
-In 2.6+, there are counters for each CPU, which make the lack of locking
-almost a non-issue.  When the statistics are read, the per-CPU counters
-are summed (possibly overflowing the unsigned long variable they are
-summed to) and the result given to the user.  There is no convenient
-user interface for accessing the per-CPU counters themselves.
-
-Disks vs Partitions
--------------------
-
-There were significant changes between 2.4 and 2.6+ in the I/O subsystem.
-As a result, some statistic information disappeared. The translation from
-a disk address relative to a partition to the disk address relative to
-the host disk happens much earlier.  All merges and timings now happen
-at the disk level rather than at both the disk and partition level as
-in 2.4.  Consequently, you'll see a different statistics output on 2.6+ for
-partitions from that for disks.  There are only *four* fields available
-for partitions on 2.6+ machines.  This is reflected in the examples above.
-
-Field  1 -- # of reads issued
-    This is the total number of reads issued to this partition.
-
-Field  2 -- # of sectors read
-    This is the total number of sectors requested to be read from this
-    partition.
-
-Field  3 -- # of writes issued
-    This is the total number of writes issued to this partition.
-
-Field  4 -- # of sectors written
-    This is the total number of sectors requested to be written to
-    this partition.
-
-Note that since the address is translated to a disk-relative one, and no
-record of the partition-relative address is kept, the subsequent success
-or failure of the read cannot be attributed to the partition.  In other
-words, the number of reads for partitions is counted slightly before time
-of queuing for partitions, and at completion for whole disks.  This is
-a subtle distinction that is probably uninteresting for most cases.
-
-More significant is the error induced by counting the numbers of
-reads/writes before merges for partitions and after for disks. Since a
-typical workload usually contains a lot of successive and adjacent requests,
-the number of reads/writes issued can be several times higher than the
-number of reads/writes completed.
-
-In 2.6.25, the full statistic set is again available for partitions and
-disk and partition statistics are consistent again. Since we still don't
-keep record of the partition-relative address, an operation is attributed to
-the partition which contains the first sector of the request after the
-eventual merges. As requests can be merged across partition, this could lead
-to some (probably insignificant) inaccuracy.
-
-Additional notes
-----------------
-
-In 2.6+, sysfs is not mounted by default.  If your distribution of
-Linux hasn't added it already, here's the line you'll want to add to
-your ``/etc/fstab``::
-
-	none /sys sysfs defaults 0 0
-
-
-In 2.6+, all disk statistics were removed from ``/proc/stat``.  In 2.4, they
-appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in
-``/proc/stat`` take a very different format from those in ``/proc/partitions``
-(see proc(5), if your system has it.)
-
--- ricklind@us.ibm.com
diff --git a/Documentation/isdn/HiSax.cert b/Documentation/isdn/HiSax.cert
deleted file mode 100644
index f2a6fcb8efee..000000000000
--- a/Documentation/isdn/HiSax.cert
+++ /dev/null
@@ -1,96 +0,0 @@
------BEGIN PGP SIGNED MESSAGE-----
-
-First:
-
-    HiSax is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-However, if you wish to modify the HiSax sources, please note the following:
-
-HiSax has passed the ITU approval test suite with ELSA Quickstep ISDN cards
-and Eicon Technology Diva 2.01 PCI card.
-The certification is only valid for the combination of the tested software
-version and the tested hardware. Any changes to the HiSax source code may
-therefore affect the certification.
-
-Additional ITU approval tests have been carried out for all generic cards
-using Colognechip single chip solutions HFC-S PCI A for PCI cards as well
-as HFC-S USB based USB ISDN ta adapters.
-These tests included all layers 1-3 and as well all functional tests for 
-the layer 1. Because all hardware based on these chips are complete ISDN
-solutions in one chip all cards and USB-TAs using these chips are to be
-regarded as approved for those tests. Some additional electrical tests
-of the layer 1 which are independent of the driver and related to a
-special hardware used will be regarded as approved if at least one 
-solution has been tested including those electrical tests. So if cards 
-or tas have been completely approved for any other os, the approval
-for those electrical tests is valid for linux, too.
-Please send any questions regarding this drivers or approval abouts to 
-werner@isdn-development.de 
-Additional information and the type approval documents will be found
-shortly on the Colognechip website www.colognechip.com 
-
-If you change the main files of the HiSax ISDN stack, the certification will
-become invalid. Because in most countries it is illegal to connect
-unapproved ISDN equipment to the public network, I have to guarantee that
-changes in HiSax do not affect the certification.
-
-In order to make a valid certification apparent to the user, I have built in
-some validation checks that are made during the make process. The HiSax main
-files are protected by md5 checksums and the md5sum file is pgp signed by
-myself:
-
-KeyID 1024/FF992F6D 1997/01/16 Karsten Keil <kkeil@suse.de>
-Key fingerprint = 92 6B F7 58 EE 86 28 C8  C4 1A E6 DC 39 89 F2 AA
-
-Only if the checksums are OK, and the signature of the file
-"drivers/isdn/hisax/md5sums.asc" match, is the certification valid; a
-message confirming this is then displayed during the hisax init process.
-
-The affected files are:
-
-drivers/isdn/hisax/isac.c
-drivers/isdn/hisax/isdnl1.c
-drivers/isdn/hisax/isdnl2.c
-drivers/isdn/hisax/isdnl3.c
-drivers/isdn/hisax/tei.c
-drivers/isdn/hisax/callc.c
-drivers/isdn/hisax/l3dss1.c
-drivers/isdn/hisax/l3_1tr6.c
-drivers/isdn/hisax/cert.c
-drivers/isdn/hisax/elsa.c
-drivers/isdn/hisax/diva.c
-drivers/isdn/hisax/hfc_pci.c
-
-Please send any changes, bugfixes and patches to me rather than implementing
-them directly into the HiSax sources.
-
-This does not reduce your rights granted by the GNU General Public License.
-If you wish to change the sources, go ahead; but note that then the
-certification is invalid even if you use one of the approved cards.
-
-Here are the certification registration numbers for ELSA Quickstep cards:
-German   D133361J CETECOM ICT Services GmbH 0682
-European D133362J CETECOM ICT Services GmbH 0682
-
-
-Karsten Keil
-keil@isdn4linux.de
-
------BEGIN PGP SIGNATURE-----
-Version: 2.6.3i
-Charset: noconv
-
-iQCVAwUBOFAwqTpxHvX/mS9tAQFI2QP9GLDK2iy/KBhwReE3F7LeO+tVhffTVZ3a
-20q5/z/WcIg/pnH0uTkl2UgDXBFXYl45zJyDGNpAposIFmT+Edd14o7Vj1w/BBdn
-Y+5rBmJf+gyBu61da5d6bv0lpymwRa/um+ri+ilYnZ/XPfg5JKhdjGSBCJuJAElM
-d2jFbTrsMYw=
-=LNf9
------END PGP SIGNATURE-----
diff --git a/Documentation/isdn/INTERFACE b/Documentation/isdn/INTERFACE
deleted file mode 100644
index 5df17e5b25c8..000000000000
--- a/Documentation/isdn/INTERFACE
+++ /dev/null
@@ -1,759 +0,0 @@
-$Id: INTERFACE,v 1.15.8.2 2001/03/13 16:17:07 kai Exp $
-
-Description of the Interface between Linklevel and Hardwarelevel
-  of isdn4linux:
-
-
-  The Communication between Linklevel (LL) and Hardwarelevel (HL)
-  is based on the struct isdn_if (defined in isdnif.h).
-
-  An HL-driver can register itself at LL by calling the function
-  register_isdn() with a pointer to that struct. Prior to that, it has
-  to preset some of the fields of isdn_if. The LL sets the rest of
-  the fields. All further communication is done via callbacks using
-  the function-pointers defined in isdn_if.
-
-  Changes/Version numbering:
-
-  During development of the ISDN subsystem, several changes have been
-  made to the interface. Before it went into kernel, the package
-  had a unique version number. The last version, distributed separately
-  was 0.7.4. When the subsystem went into kernel, every functional unit
-  got a separate version number. These numbers are shown at initialization,
-  separated by slashes:
-
-     c.c/t.t/n.n/p.p/a.a/v.v
-
-  where
-
-   c.c is the revision of the common code.
-   t.t is the revision of the tty related code.
-   n.n is the revision of the network related code.
-   p.p is the revision of the ppp related code.
-   a.a is the revision of the audio related code.
-   v.v is the revision of the V.110 related code.
-
-  Changes in this document are marked with '***CHANGEx' where x representing
-  the version number. If that number starts with 0, it refers to the old,
-  separately distributed package. If it starts with one of the letters
-  above, it refers to the revision of the corresponding module. 
-  ***CHANGEIx refers to the revision number of the isdnif.h  
-
-1. Description of the fields of isdn_if:
-
-  int channels;
-
-    This field has to be set by the HL-driver to the number of channels
-    supported prior to calling register_isdn(). Upon return of the call,
-    the LL puts an id there, which has to be used by the HL-driver when
-    invoking the other callbacks.
-
-  int maxbufsize;
-
-    ***CHANGE0.6: New since this version.
-
-    Also to be preset by the HL-driver. With this value the HL-driver
-    tells the LL the maximum size of a data-packet it will accept. 
-
-  unsigned long features;
-
-    To be preset by the HL-driver. Using this field, the HL-driver
-    announces the features supported. At the moment this is limited to
-    report the supported layer2 and layer3-protocols. For setting this
-    field the constants ISDN_FEATURE..., declared in isdnif.h have to be
-    used.
-
-    ***CHANGE0.7.1: The line type (1TR6, EDSS1) has to be set.
-
-  unsigned short hl_hdrlen;
-
-    ***CHANGE0.7.4: New field.
-
-    To be preset by the HL-driver, if it supports sk_buff's. The driver
-    should put here the amount of additional space needed in sk_buff's for
-    its internal purposes. Drivers not supporting sk_buff's should 
-    initialize this field to 0.
-
-  void (*rcvcallb_skb)(int, int, struct sk_buff *)
-
-    ***CHANGE0.7.4: New field.
-
-    This field will be set by LL. The HL-driver delivers received data-
-    packets by calling this function. Upon calling, the HL-driver must
-    already have its private data pulled off the head of the sk_buff.
-
-    Parameter:
-      int              driver-Id
-      int              Channel-number locally to the driver. (starting with 0)
-      struct sk_buff * Pointer to sk_buff, containing received data.
-
-  int (*statcallb)(isdn_ctrl*);
-
-    This field will be set by LL. This function has to be called by the
-    HL-driver for signaling status-changes or other events to the LL.
-
-    Parameter:
-      isdn_ctrl*
-
-      The struct isdn_ctrl also defined in isdn_if. The exact meanings of its
-      fields are described together with the descriptions of the possible
-      events. Here is only a short description of the fields:
-
-        driver  = driver Id.
-        command = event-type. (one of the constants ISDN_STAT_...)
-        arg     = depends on event-type.
-        num     = depends on event-type.
-
-    Returnvalue:
-      0 on success, else -1
-
-  int (*command)(isdn_ctrl*);
-
-    This field has to be preset by the HL-driver. It points to a function,
-    to be called by LL to perform functions like dialing, B-channel
-    setup, etc. The exact meaning of the parameters is described with the
-    descriptions of the possible commands.
-
-    Parameter:
-      isdn_ctrl*
-        driver  = driver-Id
-        command = command to perform. (one of the constants ISDN_CMD_...)
-        arg     = depends on command.
-        num     = depends on command.
-    
-    Returnvalue:
-      >=0 on success, else error-code (-ENODEV etc.)
-
-  int (*writebuf_skb)(int, int, int, struct sk_buff *)
-
-    ***CHANGE0.7.4: New field.
-    ***CHANGEI.1.21: New field.
-
-    This field has to be preset by the HL-driver. The given function will
-    be called by the LL for delivering data to be send via B-Channel.
-
- 
-    Parameter:
-      int              driver-Id ***CHANGE0.7.4: New parameter.
-      int              channel-number locally to the HL-driver. (starts with 0)
-      int	       ack ***ChangeI1.21: New parameter
-		       If this is !0, the driver has to signal the delivery
-		       by sending an ISDN_STAT_BSENT. If this is 0, the driver
-		       MUST NOT send an ISDN_STAT_BSENT.
-      struct sk_buff * Pointer to sk_buff containing data to be send via
-                       B-channel.
-
-    Returnvalue:
-      Length of data accepted on success, else error-code (-EINVAL on
-      oversized packets etc.)
-
-  int (*writecmd)(u_char*, int, int, int, int);
-
-    This field has to be preset by the HL-driver. The given function will be
-    called to perform write-requests on /dev/isdnctrl (i.e. sending commands
-    to the card) The data-format is hardware-specific. This function is
-    intended for debugging only. It is not necessary for normal operation
-    and never will be called by the tty-emulation- or network-code. If
-    this function is not supported, the driver has to set NULL here.
-
-    Parameter:
-      u_char* pointer to data.
-      int     length of data.
-      int     flag: 0 = call from within kernel-space. (HL-driver must use
-                        memcpy, may NOT use schedule())
-                    1 = call from user-space. (HL-driver must use
-                        memcpy_fromfs, use of schedule() allowed)
-      int     driver-Id.
-      int     channel-number locally to the HL-driver. (starts with 0)
-
-***CHANGEI1.14: The driver-Id and channel-number are new since this revision.
-
-    Returnvalue:
-      Length of data accepted on success, else error-code (-EINVAL etc.)
-
-  int (*readstat)(u_char*, int, int, int, int);
-
-    This field has to be preset by the HL-driver. The given function will be
-    called to perform read-requests on /dev/isdnctrl (i.e. reading replies
-    from the card) The data-format is hardware-specific. This function is
-    intended for debugging only. It is not necessary for normal operation
-    and never will be called by the tty-emulation- or network-code. If
-    this function is not supported, the driver has to set NULL here.
-
-    Parameter:
-      u_char* pointer to data.
-      int     length of data.
-      int     flag: 0 = call from within kernel-space. (HL-driver must use
-                        memcpy, may NOT use schedule())
-                    1 = call from user-space. (HL-driver must use
-                        memcpy_fromfs, use of schedule() allowed)
-      int     driver-Id.
-      int     channel-number locally to the HL-driver. (starts with 0)
-
-***CHANGEI1.14: The driver-Id and channel-number are new since this revision.
-
-    Returnvalue:
-      Length of data on success, else error-code (-EINVAL etc.)
-
-  char id[20];
-       ***CHANGE0.7: New since this version.
-
-   This string has to be preset by the HL-driver. Its purpose is for
-   identification of the driver by the user. Eg.: it is shown in the
-   status-info of /dev/isdninfo. Furthermore it is used as Id for binding
-   net-interfaces to a specific channel. If a string of length zero is
-   given, upon return, isdn4linux will replace it by a generic name. (line0,
-   line1 etc.) It is recommended to make this string configurable during
-   module-load-time. (copy a global variable to this string.) For doing that,
-   modules 1.2.8 or newer are necessary.
-
-2. Description of the commands, a HL-driver has to support:
-
-   All commands will be performed by calling the function command() described
-   above from within the LL. The field command of the struct-parameter will
-   contain the desired command, the field driver is always set to the
-   appropriate driver-Id.
-
-   Until now, the following commands are defined:
-
-***CHANGEI1.34: The parameter "num" has been replaced by a union "parm" containing
-                the old "num" and a new setup_type struct used for ISDN_CMD_DIAL
-                and ISDN_STAT_ICALL callback.
-
-   ISDN_CMD_IOCTL:
-
-     This command is intended for performing ioctl-calls for configuring
-     hardware or similar purposes (setting port-addresses, loading firmware
-     etc.) For this purpose, in the LL all ioctl-calls with an argument
-     >= IIOCDRVCTL (0x100) will be handed transparently to this
-     function after subtracting 0x100 and placing the result in arg.
-     Example:
-       If a userlevel-program calls ioctl(0x101,...) the function gets
-       called with the field command set to 1.
-
-     Parameter:
-       driver   = driver-Id.
-       command  = ISDN_CMD_IOCTL
-       arg      = Original ioctl-cmd - IIOCDRVCTL
-       parm.num = first bytes filled with (unsigned long)arg
-   
-     Returnvalue:
-       Depending on driver.
-
-  
-  ISDN_CMD_DIAL:
-
-    This command is used to tell the HL-driver it should dial a given
-    number.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_DIAL
-      arg         = channel-number locally to the driver. (starting with 0)
-      
-      parm.setup.phone  = An ASCII-String containing the number to dial.
-      parm.setup.eazmsn = An ASCII-Sting containing the own EAZ or MSN.
-      parm.setup.si1    = The Service-Indicator.
-      parm.setup.si2    = Additional Service-Indicator.
-
-                    If the Line has been designed as SPV (a special german
-                    feature, meaning semi-leased-line) the phone has to
-                    start with an "S".
-      ***CHANGE0.6: In previous versions the EAZ has been given in the
-                    highbyte of arg.
-    ***CHANGE0.7.1: New since this version: ServiceIndicator and AddInfo.
-
-  ISDN_CMD_ACCEPTD:
-
-    With this command, the HL-driver is told to accept a D-Channel-setup.
-    (Response to an incoming call)
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_ACCEPTD
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_CMD_ACCEPTB:
-
-    With this command, the HL-driver is told to perform a B-Channel-setup.
-    (after establishing D-Channel-Connection)
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_ACCEPTB
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_CMD_HANGUP:
-
-    With this command, the HL-driver is told to hangup (B-Channel if
-    established first, then D-Channel). This command is also used for
-    actively rejecting an incoming call.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_HANGUP
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_CMD_CLREAZ:
-
-    With this command, the HL-driver is told not to signal incoming
-    calls to the LL.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_CLREAZ
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_CMD_SETEAZ:
-
-    With this command, the HL-driver is told to signal incoming calls for
-    the given EAZs/MSNs to the LL.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETEAZ
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing the desired EAZ's/MSN's
-                    (comma-separated). If an empty String is given, the
-                    HL-driver should respond to ALL incoming calls,
-                    regardless of the destination-address.
-      ***CHANGE0.6: New since this version the "empty-string"-feature.
-
-  ISDN_CMD_GETEAZ: (currently unused)
-
-    With this command, the HL-driver is told to report the current setting
-    given with ISDN_CMD_SETEAZ.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_GETEAZ
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing the current EAZ's/MSN's
-
-  ISDN_CMD_SETSIL: (currently unused)
-
-    With this command, the HL-driver is told to signal only incoming
-    calls with the given Service-Indicators.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETSIL
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing the desired Service-Indicators.
-
-  ISDN_CMD_GETSIL: (currently unused)
-
-    With this command, the HL-driver is told to return the current
-    Service-Indicators it will respond to.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETSIL
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing the current Service-Indicators.
-
-  ISDN_CMD_SETL2:
-
-    With this command, the HL-driver is told to select the given Layer-2-
-    protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or
-    ISDN_CMD_ACCEPTD.
-
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETL2
-      arg         = channel-number locally to the driver. (starting with 0)
-                    logical or'ed with (protocol-Id << 8)
-                    protocol-Id is one of the constants ISDN_PROTO_L2...
-      parm        = unused.
-
-  ISDN_CMD_GETL2: (currently unused)
-
-    With this command, the HL-driver is told to return the current
-    setting of the Layer-2-protocol.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_GETL2
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-    Returnvalue:
-      current protocol-Id (one of the constants ISDN_L2_PROTO)
-
-  ISDN_CMD_SETL3:
-
-    With this command, the HL-driver is told to select the given Layer-3-
-    protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or
-    ISDN_CMD_ACCEPTD.
-
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETL3
-      arg         = channel-number locally to the driver. (starting with 0)
-                    logical or'ed with (protocol-Id << 8)
-                    protocol-Id is one of the constants ISDN_PROTO_L3...
-      parm.fax    = Pointer to T30_s fax struct. (fax usage only)
-
-  ISDN_CMD_GETL2: (currently unused)
-
-    With this command, the HL-driver is told to return the current
-    setting of the Layer-3-protocol.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_GETL3
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-    Returnvalue:
-      current protocol-Id (one of the constants ISDN_L3_PROTO)
-
-  ISDN_CMD_PROCEED: 
-
-    With this command, the HL-driver is told to proceed with a incoming call.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_PROCEED
-      arg         = channel-number locally to the driver. (starting with 0)
-      setup.eazmsn= empty string or string send as uus1 in DSS1 with 
-                    PROCEED message
-
-  ISDN_CMD_ALERT: 
-
-    With this command, the HL-driver is told to alert a proceeding call.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_ALERT
-      arg         = channel-number locally to the driver. (starting with 0)
-      setup.eazmsn= empty string or string send as uus1 in DSS1 with 
-                    ALERT message
-
-  ISDN_CMD_REDIR: 
-
-    With this command, the HL-driver is told to redirect a call in proceeding
-    or alerting state.  
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_REDIR
-      arg         = channel-number locally to the driver. (starting with 0)
-      setup.eazmsn= empty string or string send as uus1 in DSS1 protocol
-      setup.screen= screening indicator
-      setup.phone = redirected to party number
-
-  ISDN_CMD_PROT_IO:
-
-    With this call, the LL-driver invokes protocol specific features through
-    the LL.
-    The call is not implicitely bound to a connection.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_CMD_PROT_IO
-      arg         = The lower 8 Bits define the addressed protocol as defined
-                    in ISDN_PTYPE..., the upper bits are used to differentiate
-                    the protocol specific CMD.  
-      
-      para        = protocol and function specific. See isdnif.h for detail.
-
-
-  ISDN_CMD_FAXCMD:
-
-    With this command the HL-driver receives a fax sub-command.
-    For details refer to INTERFACE.fax
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_FAXCMD
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-
-3. Description of the events to be signaled by the HL-driver to the LL.
-
-  All status-changes are signaled via calling the previously described
-  function statcallb(). The field command of the struct isdn_cmd has
-  to be set by the HL-driver with the appropriate Status-Id (event-number).
-  The field arg has to be set to the channel-number (locally to the driver,
-  starting with 0) to which this event applies. (Exception: STAVAIL-event)
-
-  Until now, the following Status-Ids are defined:
-
-  ISDN_STAT_AVAIL:
-
-    With this call, the HL-driver signals the availability of new data
-    for readstat(). Used only for debugging-purposes, see description
-    of readstat().
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_STAVAIL
-      arg         = length of available data.
-      parm        = unused.
-
-  ISDN_STAT_ICALL:
-  ISDN_STAT_ICALLW:
-
-    With this call, the HL-driver signals an incoming call to the LL.
-    If ICALLW is signalled the incoming call is a waiting call without
-    a available B-chan.
-
-    Parameter:
-      driver            = driver-Id
-      command           = ISDN_STAT_ICALL
-      arg               = channel-number, locally to the driver. (starting with 0)
-      para.setup.phone  = Callernumber.
-      para.setup.eazmsn = CalledNumber.
-      para.setup.si1    = Service Indicator.
-      para.setup.si2    = Additional Service Indicator.
-      para.setup.plan   = octet 3 from Calling party number Information Element.
-      para.setup.screen = octet 3a from Calling party number Information Element.
-
-    Return:
-      0           = No device matching this call.
-      1           = At least one device matching this call (RING on ttyI).
-                    HL-driver may send ALERTING on the D-channel in this case.
-      2           = Call will be rejected.
-      3           = Incoming called party number is currently incomplete.
-                    Additional digits are required. 
-                    Used for signalling with PtP connections.
-      4	          = Call will be held in a proceeding state 
-                    (HL driver sends PROCEEDING)
-                    Used when a user space prog needs time to interpret a call
-		    para.setup.eazmsn may be filled with an uus1 message of
-		    30 octets maximum. Empty string if no uus. 
-      5           = Call will be actively deflected to another party
-                    Only available in DSS1/EURO protocol
-		    para.setup.phone must be set to destination party number
-		    para.setup.eazmsn may be filled with an uus1 message of
-		    30 octets maximum. Empty string if no uus. 
-      -1          = An error happened. (Invalid parameters for example.)
-  The keypad support now is included in the dial command.	        
-
-
-  ISDN_STAT_RUN:
-
-    With this call, the HL-driver signals availability of the ISDN-card.
-    (after initializing, loading firmware)
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_RUN
-      arg         = unused.
-      parm        = unused.
-
-  ISDN_STAT_STOP:
-
-    With this call, the HL-driver signals unavailability of the ISDN-card.
-    (before unloading, while resetting/reconfiguring the card)
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_STOP
-      arg         = unused.
-      parm        = unused.
-
-  ISDN_STAT_DCONN:
-
-   With this call, the HL-driver signals the successful establishment of
-   a D-Channel-connection. (Response to ISDN_CMD_ACCEPTD or ISDN_CMD_DIAL)
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_DCONN
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_STAT_BCONN:
-
-   With this call, the HL-driver signals the successful establishment of
-   a B-Channel-connection. (Response to ISDN_CMD_ACCEPTB or because the
-   remote-station has initiated establishment)
-
-   The HL driver should call this when the logical l2/l3 protocol 
-   connection on top of the physical B-channel is established.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_BCONN
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing type of connection (for analog
-		    modem only). This will be appended to the CONNECT message
-		    e.g. 14400/V.32bis
-
-  ISDN_STAT_DHUP:
-
-   With this call, the HL-driver signals the shutdown of a
-   D-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP,
-   or caused by a remote-hangup or if the remote-station has actively
-   rejected a call.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_DHUP
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_STAT_BHUP:
-
-   With this call, the HL-driver signals the shutdown of a
-   B-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP,
-   or caused by a remote-hangup.
-
-   The HL driver should call this as soon as the logical l2/l3 protocol 
-   connection on top of the physical B-channel is released.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_BHUP
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_STAT_CINF:
-
-   With this call, the HL-driver delivers charge-unit information to the
-   LL.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_CINF
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.num    = ASCII string containing charge-units (digits only).
-
-  ISDN_STAT_LOAD: (currently unused)
-
-  ISDN_STAT_UNLOAD:
-
-   With this call, the HL-driver signals that it will be unloaded now. This
-   tells the LL to release all corresponding data-structures.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_UNLOAD
-      arg         = unused.
-      parm        = unused.
-
-  ISDN_STAT_BSENT:
-
-    With this call the HL-driver signals the delivery of a data-packet.
-    This callback is used by the network-interfaces only, tty-Emulation
-    does not need this call.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_BSENT
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.length = ***CHANGEI.1.21: New field.
-		    the driver has to set this to the original length
-		    of the skb at the time of receiving it from the linklevel.
-
-  ISDN_STAT_NODCH:
-
-    With this call, the driver has to respond to a prior ISDN_CMD_DIAL, if
-    no D-Channel is available.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_NODCH
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_STAT_ADDCH: 
-
-    This call is for HL-drivers, which are unable to check card-type
-    or numbers of supported channels before they have loaded any firmware
-    using ioctl. Those HL-driver simply set the channel-parameter to a
-    minimum channel-number when registering, and later if they know
-    the real amount, perform this call, allocating additional channels.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_ADDCH
-      arg         = number of channels to be added.
-      parm        = unused.
-
-  ISDN_STAT_CAUSE:
-
-    With this call, the HL-driver delivers CAUSE-messages to the LL.
-    Currently the LL does not use this messages. Their contents is simply
-    logged via kernel-messages. Therefore, currently the format of the
-    messages is completely free. However they should be printable.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_NODCH
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.num    = ASCII string containing CAUSE-message.
-
-  ISDN_STAT_DISPLAY:
-
-    With this call, the HL-driver delivers DISPLAY-messages to the LL.
-    Currently the LL does not use this messages. 
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_DISPLAY
-      arg         = channel-number, locally to the driver. (starting with 0)
-      para.display= string containing DISPLAY-message.
-
-  ISDN_STAT_PROT:
-
-    With this call, the HL-driver delivers protocol specific infos to the LL.
-    The call is not implicitely bound to a connection.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_PROT
-      arg         = The lower 8 Bits define the addressed protocol as defined
-                    in ISDN_PTYPE..., the upper bits are used to differentiate
-                    the protocol specific STAT.  
-      
-      para        = protocol and function specific. See isdnif.h for detail.
-
-  ISDN_STAT_DISCH:
-
-    With this call, the HL-driver signals the LL to disable or enable the
-    use of supplied channel and driver.
-    The call may be used to reduce the available number of B-channels after
-    loading the driver. The LL has to ignore a disabled channel when searching
-    for free channels. The HL driver itself never delivers STAT callbacks for
-    disabled channels. 	    
-    The LL returns a nonzero code if the operation was not successful or the
-    selected channel is actually regarded as busy.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_DISCH
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.num[0] = 0 if channel shall be disabled, else enabled.
-
-  ISDN_STAT_L1ERR:
-
-    ***CHANGEI1.21 new status message.
-    A signal can be sent to the linklevel if an Layer1-error results in
-    packet-loss on receive or send. The field errcode of the cmd.parm
-    union describes the error more precisely.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_L1ERR
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.errcode= ISDN_STAT_L1ERR_SEND:     Packet lost while sending.
-		    ISDN_STAT_L1ERR_RECV:     Packet lost while receiving.
-  ISDN_STAT_FAXIND:
-
-    With this call the HL-driver signals a fax sub-command to the LL.
-    For details refer to INTERFACE.fax
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_STAT_FAXIND
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
diff --git a/Documentation/isdn/INTERFACE.fax b/Documentation/isdn/INTERFACE.fax
deleted file mode 100644
index 9c8c6d914ec7..000000000000
--- a/Documentation/isdn/INTERFACE.fax
+++ /dev/null
@@ -1,163 +0,0 @@
-$Id: INTERFACE.fax,v 1.2 2000/08/06 09:22:50 armin Exp $
-
-
-Description of the fax-subinterface between linklevel and hardwarelevel of 
-  isdn4linux. 
-
-  The communication between linklevel (LL) and hardwarelevel (HL) for fax
-  is based on the struct T30_s (defined in isdnif.h).
-  This struct is allocated in the LL.  
-  In order to use fax, the LL provides the pointer to this struct with the 
-  command ISDN_CMD_SETL3 (parm.fax). This pointer expires in case of hangup 
-  and when a new channel to a new connection is assigned. 
-
-
-Data handling:
-  In send-mode the HL-driver has to handle the <DLE> codes and the bit-order 
-  conversion by itself. 
-  In receive-mode the LL-driver takes care of the bit-order conversion
-  (specified by +FBOR)
-
-Structure T30_s description:
-
-  This structure stores the values (set by AT-commands), the remote-
-  capability-values and the command-codes between LL and HL.
-
-  If the HL-driver receives ISDN_CMD_FAXCMD, all needed information
-  is in this struct set by the LL.
-  To signal information to the LL, the HL-driver has to set the 
-  parameters and use ISDN_STAT_FAXIND.
-  (Please refer to INTERFACE)
-
-Structure T30_s:
-
-  All members are 8-bit unsigned (__u8)
-
-  -  resolution     
-  -  rate
-  -  width
-  -  length
-  -  compression
-  -  ecm
-  -  binary
-  -  scantime
-  -  id[]
-  Local faxmachine's parameters, set by +FDIS, +FDCS, +FLID, ...
-
-  -  r_resolution
-  -  r_rate
-  -  r_width
-  -  r_length
-  -  r_compression
-  -  r_ecm
-  -  r_binary
-  -  r_scantime
-  -  r_id[]
-  Remote faxmachine's parameters. To be set by HL-driver.
-
-  -  phase      
-  Defines the actual state of fax connection. Set by HL or LL
-  depending on progress and type of connection.
-  If the phase changes because of an AT command, the LL driver
-  changes this value. Otherwise the HL-driver takes care of it, but
-  only necessary on call establishment (from IDLE to PHASE_A).
-  (one of the constants ISDN_FAX_PHASE_[IDLE,A,B,C,D,E])
-
-  -  direction
-  Defines outgoing/send or incoming/receive connection.
-  (ISDN_TTY_FAX_CONN_[IN,OUT])
-
-  -  code
-  Commands from LL to HL; possible constants : 
-      ISDN_TTY_FAX_DR        signals +FDR command to HL
-
-      ISDN_TTY_FAX_DT        signals +FDT command to HL 
-
-      ISDN_TTY_FAX_ET        signals +FET command to HL
-
-
-  Other than that the "code" is set with the hangup-code value at
-  the end of connection for the +FHNG message.
-        
-  -  r_code 
-  Commands from HL to LL; possible constants :
-      ISDN_TTY_FAX_CFR       output of +FCFR message. 
-
-      ISDN_TTY_FAX_RID       output of remote ID set in r_id[]
-                             (+FCSI/+FTSI on send/receive)
-
-      ISDN_TTY_FAX_DCS       output of +FDCS and CONNECT message,
-                             switching to phase C.
-
-      ISDN_TTY_FAX_ET        signals end of data,
-                             switching to phase D.
-
-      ISDN_TTY_FAX_FCON      signals the established, outgoing connection,
-                             switching to phase B.
-
-      ISDN_TTY_FAX_FCON_I    signals the established, incoming connection,
-                             switching to phase B.
-
-      ISDN_TTY_FAX_DIS       output of +FDIS message and values.
-
-      ISDN_TTY_FAX_SENT      signals that all data has been sent 
-                             and <DLE><ETX> is acknowledged,
-                             OK message will be sent.
-
-      ISDN_TTY_FAX_PTS       signals a msg-confirmation (page sent successful),
-                             depending on fet value:
-                             0: output OK message (more pages follow)
-                             1: switching to phase B (next document)
-
-      ISDN_TTY_FAX_TRAIN_OK  output of +FDCS and OK message (for receive mode).
-
-      ISDN_TTY_FAX_EOP       signals end of data in receive mode,
-                             switching to phase D.
-
-      ISDN_TTY_FAX_HNG       output of the +FHNG and value set by code and
-                             OK message, switching to phase E.
-
-
-  -  badlin
-  Value of +FBADLIN  
-
-  -  badmul
-  Value of +FBADMUL
-
-  -  bor
-  Value of +FBOR
-
-  -  fet
-  Value of +FET command in send-mode.
-  Set by HL in receive-mode for +FET message.
-
-  -  pollid[]  
-  ID-string, set by +FCIG
-
-  -  cq
-  Value of +FCQ
-
-  -  cr
-  Value of +FCR
-
-  -  ctcrty
-  Value of +FCTCRTY
-
-  -  minsp
-  Value of +FMINSP
-
-  -  phcto
-  Value of +FPHCTO
-
-  -  rel
-  Value of +FREL
-
-  -  nbc
-  Value of +FNBC (0,1)
-  (+FNBC is not a known class 2 fax command, I added this to change the
-   automatic "best capabilities" connection in the eicon HL-driver)
-
-  
-Armin
-mac@melware.de
-
diff --git a/Documentation/isdn/README b/Documentation/isdn/README
deleted file mode 100644
index 74bd2bdb455b..000000000000
--- a/Documentation/isdn/README
+++ /dev/null
@@ -1,599 +0,0 @@
-README for the ISDN-subsystem
-
-1. Preface
-
-  1.1 Introduction
-
-  This README describes how to set up and how to use the different parts
-  of the ISDN-subsystem.
-
-  For using the ISDN-subsystem, some additional userlevel programs are
-  necessary. Those programs and some contributed utilities are available
-  at
-
-   ftp.isdn4linux.de
-
-   /pub/isdn4linux/isdn4k-utils-<VersionNumber>.tar.gz
-
-
-  We also have set up a mailing-list:
-
-   The isdn4linux-project originates in Germany, and therefore by historical
-   reasons, the mailing-list's primary language is german. However mails
-   written in english have been welcome all the time.
-
-   to subscribe: write a email to majordomo@listserv.isdn4linux.de,
-   Subject irrelevant, in the message body:
-   subscribe isdn4linux <your_email_address>
-
-   To write to the mailing-list, write to isdn4linux@listserv.isdn4linux.de
-
-   This mailinglist is bidirectionally gated to the newsgroup
-
-     de.alt.comm.isdn4linux
-
-  There is also a well maintained FAQ in English available at
-     https://www.mhessler.de/i4lfaq/
-  It can be viewed online, or downloaded in sgml/text/html format.
-  The FAQ can also be viewed online at
-     https://www.isdn4linux.de/faq/i4lfaq.html
-  or downloaded from
-     ftp://ftp.isdn4linux.de/pub/isdn4linux/FAQ/
-
-  1.1 Technical details
-
-  In the following Text, the terms MSN and EAZ are used.
-
-  MSN is the abbreviation for (M)ultiple(S)ubscriber(N)umber, and applies
-  to Euro(EDSS1)-type lines. Usually it is simply the phone number.
-
-  EAZ is the abbreviation of (E)ndgeraete(A)uswahl(Z)iffer and
-  applies to German 1TR6-type lines. This is a one-digit string,
-  simply appended to the base phone number
-
-  The internal handling is nearly identical, so replace the appropriate
-  term to that one, which applies to your local ISDN-environment.
-
-  When the link-level-module isdn.o is loaded, it supports up to 16
-  low-level-modules with up to 64 channels. (The number 64 is arbitrarily
-  chosen and can be configured at compile-time --ISDN_MAX in isdn.h).
-  A low-level-driver can register itself through an interface (which is
-  defined in isdnif.h) and gets assigned a slot.
-  The following char-devices are made available for each channel:
-
-  A raw-control-device with the following functions:
-     write: raw D-channel-messages (format: depends on driver).
-     read:  raw D-channel-messages (format: depends on driver).
-     ioctl: depends on driver, i.e. for the ICN-driver, the base-address of
-            the ports and the shared memory on the card can be set and read
-            also the boot-code and the protocol software can be loaded into
-            the card.
-
-   O N L Y !!!  for debugging (no locking against other devices):
-   One raw-data-device with the following functions:
-     write: data to B-channel.
-     read:  data from B-channel.
-
-   In addition the following devices are made available:
-
-   128 tty-devices (64 cuix and 64 ttyIx) with integrated modem-emulator:
-   The functionality is almost the same as that of a serial device
-   (the line-discs are handled by the kernel), which lets you run
-   SLIP, CSLIP and asynchronous PPP through the devices. We have tested
-   Seyon, minicom, CSLIP (uri-dip) PPP, mgetty, XCept and Hylafax. 
-
-   The modem-emulation supports the following:
-           1.3.1 Commands:
-
-               ATA      Answer incoming call.
-               ATD<No.> Dial, the number may contain:
-                        [0-9] and [,#.*WPT-S]
-                        the latter are ignored until 'S'.
-                        The 'S' must precede the number, if
-                        the line is a SPV (German 1TR6).
-               ATE0     Echo off.
-               ATE1     Echo on (default).
-               ATH      Hang-up.
-               ATH1     Off hook (ignored).
-               ATH0     Hang-up.
-               ATI      Return "ISDN for Linux...".
-               ATI0        "
-               ATI1        "
-               ATI2     Report of last connection.
-               ATO      On line (data mode).
-               ATQ0     Enable result codes (default).
-               ATQ1     Disable result codes (default).
-               ATSx=y   Set register x to y.
-               ATSx?    Show contents of register x.
-               ATV0     Numeric responses.
-               ATV1     English responses (default).
-               ATZ      Load registers and EAZ/MSN from Profile.
-               AT&Bx    Set Send-Packet-size to x (max. 4000)
-                        The real packet-size may be limited by the
-                        low-level-driver used. e.g. the HiSax-Module-
-                        limit is 2000. You will get NO Error-Message,
-                        if you set it to higher values, because at the
-                        time of giving this command the corresponding
-                        driver may not be selected (see "Automatic
-                        Assignment") however the size of outgoing packets
-                        will be limited correctly.
-               AT&D0    Ignore DTR
-               AT&D2    DTR-low-edge: Hang up and return to
-                        command mode (default).
-               AT&D3    Same as AT&D2 but also resets all registers.
-               AT&Ex    Set the EAZ/MSN for this channel to x.
-               AT&F     Reset all registers and profile to "factory-defaults"
-               AT&Lx    Set list of phone numbers to listen on.  x is a
-                        list of wildcard patterns separated by semicolon.
-                        If this is set, it has precedence over the MSN set
-                        by AT&E.
-               AT&Rx    Select V.110 bitrate adaption.
-                        This command enables V.110 protocol with 9600 baud
-                        (x=9600), 19200 baud (x=19200) or 38400 baud
-                        (x=38400). A value of x=0 disables V.110 switching
-                        back to default X.75. This command sets the following
-                        Registers:
-                          Reg 14 (Layer-2 protocol):
-                            x = 0:     0
-                            x = 9600:  7
-                            x = 19200: 8
-                            x = 38400: 9
-                          Reg 18.2 = 1
-                          Reg 19 (Additional Service Indicator):
-                            x = 0:       0
-                            x = 9600:  197
-                            x = 19200: 199
-                            x = 38400: 198
-                          Note on value in Reg 19:
-                            There is _NO_ common convention for 38400 baud.
-                            The value 198 is chosen arbitrarily. Users
-                            _MUST_ negotiate this value before establishing
-                            a connection.
-               AT&Sx    Set window-size (x = 1..8) (not yet implemented)
-               AT&V     Show all settings.
-               AT&W0    Write registers and EAZ/MSN to profile. See also
-                        iprofd (5.c in this README).
-               AT&X0    BTX-mode and T.70-mode off (default)
-               AT&X1    BTX-mode on. (S13.1=1, S13.5=0 S14=0, S16=7, S18=7, S19=0)
-               AT&X2    T.70-mode on. (S13.1=1, S13.5=1, S14=0, S16=7, S18=7, S19=0)
-               AT+Rx    Resume a suspended call with CallID x (x = 1,2,3...)
-               AT+Sx    Suspend a call with CallID x (x = 1,2,3...)
-
-           For voice-mode commands refer to README.audio
-
-           1.3.2 Escape sequence:
-               During a connection, the emulation reacts just like
-               a normal modem to the escape sequence <DELAY>+++<DELAY>.
-               (The escape character - default '+' - can be set in the
-               register 2).
-               The DELAY must at least be 1.5 seconds long and delay
-               between the escape characters must not exceed 0.5 seconds.
-
-           1.3.3 Registers:
-
-              Nr.  Default  Description
-              0    0        Answer on ring number.
-                            (no auto-answer if S0=0).
-              1    0        Count of rings.
-              2    43       Escape character.
-                            (a value >= 128 disables the escape sequence).
-              3    13       Carriage return character (ASCII).
-              4    10       Line feed character (ASCII).
-              5    8        Backspace character (ASCII).
-              6    3        Delay in seconds before dialing.
-              7    60       Wait for carrier.
-              8    2        Pause time for comma (ignored)
-              9    6        Carrier detect time (ignored)
-             10    7        Carrier loss to disconnect time (ignored).
-             11    70       Touch tone timing (ignored).
-             12    69       Bit coded register:
-                            Bit 0:    0 = Suppress response messages.
-                                      1 = Show response messages.
-                            Bit 1:    0 = English response messages.
-                                      1 = Numeric response messages.
-                            Bit 2:    0 = Echo off.
-                                      1 = Echo on.
-                            Bit 3     0 = DCD always on.
-                                      1 = DCD follows carrier.
-                            Bit 4     0 = CTS follows RTS
-                                      1 = Ignore RTS, CTS always on.
-                            Bit 5     0 = return to command mode on DTR low.
-                                      1 = Same as 0 but also resets all
-                                          registers.
-                                      See also register 13, bit 2
-                            Bit 6     0 = DSR always on.
-                                      1 = DSR only on if channel is available.
-                            Bit 7     0 = Cisco-PPP-flag-hack off (default).
-                                      1 = Cisco-PPP-flag-hack on.
-             13   0         Bit coded register:
-                            Bit 0:    0 = Use delayed tty-send-algorithm
-                                      1 = Direct tty-send.
-                            Bit 1:    0 = T.70 protocol (Only for BTX!) off
-                                      1 = T.70 protocol (Only for BTX!) on
-                            Bit 2:    0 = Don't hangup on DTR low.
-                                      1 = Hangup on DTR low.
-                            Bit 3:    0 = Standard response messages
-                                      1 = Extended response messages
-                            Bit 4:    0 = CALLER NUMBER before every RING.
-                                      1 = CALLER NUMBER after first RING.
-                            Bit 5:    0 = T.70 extended protocol off
-                                      1 = T.70 extended protocol on
-                            Bit 6:    0 = Special RUNG Message off
-                                      1 = Special RUNG Message on
-                                          "RUNG" is delivered on a ttyI, if
-                                          an incoming call happened (RING) and
-                                          the remote party hung up before any
-                                          local ATA was given.
-			    Bit 7:    0 = Don't show display messages from net
-                                      1 = Show display messages from net
-				          (S12 Bit 1 must be 0 too)      
-             14   0         Layer-2 protocol:
-                                      0 = X75/LAPB with I-frames
-                                      1 = X75/LAPB with UI-frames
-                                      2 = X75/LAPB with BUI-frames
-                                      3 = HDLC
-                                      4 = Transparent (audio)
-                                      7 = V.110, 9600 baud
-                                      8 = V.110, 19200 baud
-                                      9 = V.110, 38400 baud
-                                     10 = Analog Modem (only if hardware supports this)
-                                     11 = Fax G3 (only if hardware supports this)
-             15   0         Layer-3 protocol:
-                                      0 = transparent
-                                      1 = transparent with audio features (e.g. DSP)
-                                      2 = Fax G3 Class 2 commands (S14 has to be set to 11)
-                                      3 = Fax G3 Class 1 commands (S14 has to be set to 11)
-             16   250       Send-Packet-size/16
-             17   8         Window-size (not yet implemented)
-             18   4         Bit coded register, Service-Octet-1 to accept,
-                            or to be used on dialout:
-                            Bit 0:    Service 1 (audio) when set.
-                            Bit 1:    Service 5 (BTX) when set.
-                            Bit 2:    Service 7 (data) when set.
-                            Note: It is possible to set more than one
-                                  bit. In this case, on incoming calls
-                                  the selected services are accepted,
-                                  and if the service is "audio", the
-                                  Layer-2-protocol is automatically
-                                  changed to 4 regardless of the setting
-                                  of register 14. On outgoing calls,
-                                  the most significant 1-bit is chosen to
-                                  select the outgoing service octet.
-             19   0         Service-Octet-2
-             20   0         Bit coded register (readonly)
-                            Service-Octet-1 of last call.
-                            Bit mapping is the same as register 18
-             21   0         Bit coded register (readonly)
-                            Set on incoming call (during RING) to
-                            octet 3 of calling party number IE (Numbering plan)
-                            See section 4.5.10 of ITU Q.931
-             22   0         Bit coded register (readonly)
-                            Set on incoming call (during RING) to
-                            octet 3a of calling party number IE (Screening info)
-                            See section 4.5.10 of ITU Q.931
-             23   0         Bit coded register:
-                            Bit 0:    0 = Add CPN to RING message off
-                                      1 = Add CPN to RING message on
-                            Bit 1:    0 = Add CPN to FCON message off
-                                      1 = Add CPN to FCON message on
-                            Bit 2:    0 = Add CDN to RING/FCON message off
-                                      1 = Add CDN to RING/FCON message on
-
-  Last but not least a (at the moment fairly primitive) device to request
-  the line-status (/dev/isdninfo) is made available.
-
-  Automatic assignment of devices to lines:
-
-  All inactive physical lines are listening to all EAZs for incoming
-  calls and are NOT assigned to a specific tty or network interface.
-  When an incoming call is detected, the driver looks first for a network
-  interface and then for an opened tty which:
-
-  1. is configured for the same EAZ.
-  2. has the same protocol settings for the B-channel.
-  3. (only for network interfaces if the security flag is set)
-     contains the caller number in its access list.
-  4. Either the channel is not bound exclusively to another Net-interface, or
-     it is bound AND the other checks apply to exactly this interface.
-     (For usage of the bind-features, refer to the isdnctrl-man-page)
-
-  Only when a matching interface or tty is found is the call accepted
-  and the "connection" between the low-level-layer and the link-level-layer
-  is established and kept until the end of the connection.
-  In all other cases no connection is established. Isdn4linux can be
-  configured to either do NOTHING in this case (which is useful, if
-  other, external devices with the same EAZ/MSN are connected to the bus)
-  or to reject the call actively. (isdnctrl busreject ...)
-
-  For an outgoing call, the inactive physical lines are searched.
-  The call is placed on the first physical line, which supports the
-  requested protocols for the B-channel. If a net-interface, however
-  is pre-bound to a channel, this channel is used directly.
-
-  This makes it possible to configure several network interfaces and ttys
-  for one EAZ, if the network interfaces are set to secure operation.
-  If an incoming call matches one network interface, it gets connected to it.
-  If another incoming call for the same EAZ arrives, which does not match
-  a network interface, the first tty gets a "RING" and so on.
-
-2 System prerequisites:
-
-  ATTENTION!
-
-  Always use the latest module utilities. The current version is
-  named in Documentation/Changes. Some old versions of insmod
-  are not capable of setting the driver-Ids correctly.
-
-3. Lowlevel-driver configuration.
-
-   Configuration depends on how the drivers are built. See the
-   README.<yourDriver> for information on driver-specific setup.
-
-4. Device-inodes
-
-   The major and minor numbers and their names are described in
-   Documentation/admin-guide/devices.rst. The major numbers are:
-
-     43 for the ISDN-tty's.
-     44 for the ISDN-callout-tty's.
-     45 for control/info/debug devices.
-
-5. Application
-
-   a) For some card-types, firmware has to be loaded into the cards, before
-      proceeding with device-independent setup. See README.<yourDriver>
-      for how to do that.
-
-   b) If you only intend to use ttys, you are nearly ready now.
-
-   c) If you want to have really permanent "Modem"-settings on disk, you
-      can start the daemon iprofd. Give it a path to a file at the command-
-      line. It will store the profile-settings in this file every time
-      an AT&W0 is performed on any ISDN-tty. If the file already exists,
-      all profiles are initialized from this file. If you want to unload
-      any of the modules, kill iprofd first.
-
-   d) For networking, continue: Create an interface:
-       isdnctrl addif isdn0
-
-   e) Set the EAZ (or MSN for Euro-ISDN):
-       isdnctrl eaz isdn0 2
-
-     (For 1TR6 a single digit is allowed, for Euro-ISDN the number is your
-      real MSN e.g.: Phone-Number)
-
-   f) Set the number for outgoing calls on the interface:
-       isdnctrl addphone isdn0 out 1234567
-       ... (this can be executed more than once, all assigned numbers are
-            tried in order)
-      and the number(s) for incoming calls:
-       isdnctrl addphone isdn0 in 1234567
-
-   g) Set the timeout for hang-up:
-       isdnctrl huptimeout isdn0 <timeout_in_seconds>
-
-   h) additionally you may activate charge-hang-up (= Hang up before
-      next charge-info, this only works, if your isdn-provider transmits
-      the charge-info during and after the connection):
-       isdnctrl chargehup isdn0 on
-
-   i) Set the dial mode of the interface:
-       isdnctrl dialmode isdn0 auto
-      "off" means that you (or the system) cannot make any connection
-        (neither incoming or outgoing connections are possible). Use
-        this if you want to be sure that no connections will be made.
-      "auto" means that the interface is in auto-dial mode, and will
-        attempt to make a connection whenever a network data packet needs
-        the interface's link. Note that this can cause unexpected dialouts,
-        and lead to a high phone bill! Some daemons or other pc's that use
-        this interface can cause this.
-        Incoming connections are also possible.
-      "manual" is a dial mode created to prevent the unexpected dialouts.
-        In this mode, the interface will never make any connections on its
-        own. You must explicitly initiate a connection with "isdnctrl dial
-        isdn0". However, after an idle time of no traffic as configured for
-	the huptimeout value with isdnctrl, the connection _will_ be ended.
-	If you don't want any automatic hangup, set the huptimeout value to 0.
-        "manual" is the default.
-
-   j) Setup the interface with ifconfig as usual, and set a route to it.
-
-   k) (optional) If you run X11 and have Tcl/Tk-wish version 4.0, you can use
-     the script tools/tcltk/isdnmon. You can add actions for line-status
-     changes. See the comments at the beginning of the script for how to
-     do that. There are other tty-based tools in the tools-subdirectory
-     contributed by Michael Knigge (imon), Volker Götz (imontty) and
-     Andreas Kool (isdnmon).
-
-   l) For initial testing, you can set the verbose-level to 2 (default: 0).
-      Then all incoming calls are logged, even if they are not addressed
-      to one of the configured net-interfaces:
-      isdnctrl verbose 2
-
-  Now you are ready! A ping to the set address should now result in an
-  automatic dial-out (look at syslog kernel-messages).
-  The phone numbers and EAZs can be assigned at any time with isdnctrl.
-  You can add as many interfaces as you like with addif following the
-  directions above. Of course, there may be some limitations. But we have
-  tested as many as 20 interfaces without any problem. However, if you
-  don't give an interface name to addif, the  kernel will assign a name
-  which starts with "eth". The number of "eth"-interfaces is limited by
-  the kernel.
-
-5. Additional options for isdnctrl:
-
-   "isdnctrl secure <InterfaceName> on"
-   Only incoming calls, for which the caller-id is listed in the access
-   list of the interface are accepted. You can add caller-id's With the
-   command "isdnctrl addphone <InterfaceName> in <caller-id>"
-   Euro-ISDN does not transmit the leading '0' of the caller-id for an
-   incoming call, therefore you should configure it accordingly.
-   If the real number for the dialout e.g. is "09311234567" the number
-   to configure here is "9311234567". The pattern-match function
-   works similar to the shell mechanism.
-
-     ?     one arbitrary digit
-     *     zero or arbitrary many digits
-     [123] one of the digits in the list
-     [1-5] one digit between '1' and '5'
-           a '^' as the first character in a list inverts the list
-
-
-   "isdnctrl secure <InterfaceName> off"
-   Switch off secure operation (default).
-
-   "isdnctrl ihup <InterfaceName> [on|off]"
-   Switch the hang-up-timer for incoming calls on or off.
-
-   "isdnctrl eaz <InterfaceName>"
-   Returns the EAZ of an interface.
-
-   "isdnctrl delphone <InterfaceName> in|out <number>"
-   Deletes a number from one of the access-lists of the interface.
-
-   "isdnctrl delif <InterfaceName>"
-   Removes the interface (and possible slaves) from the kernel.
-   (You have to unregister it with "ifconfig <InterfaceName> down" before).
-
-   "isdnctrl callback <InterfaceName> [on|off]"
-   Switches an interface to callback-mode. In this mode, an incoming call
-   will be rejected and after this the remote-station will be called. If
-   you test this feature by using ping, some routers will re-dial very
-   quickly, so that the callback from isdn4linux may not be recognized.
-   In this case use ping with the option -i <sec> to increase the interval
-   between echo-packets.
-
-   "isdnctrl cbdelay <InterfaceName> [seconds]"
-   Sets the delay (default 5 sec) between an incoming call and start of
-   dialing when callback is enabled.
-
-   "isdnctrl cbhup <InterfaceName> [on|off]"
-   This enables (default) or disables an active hangup (reject) when getting an
-   incoming call for an interface which is configured for callback.
-
-   "isdnctrl encap <InterfaceName> <EncapType>"
-   Selects the type of packet-encapsulation. The encapsulation can be changed
-   only while an interface is down.
-
-   At the moment the following values are supported:
-
-   rawip    (Default) Selects raw-IP-encapsulation. This means, MAC-headers
-            are stripped off.
-   ip       IP with type-field. Same as IP but the type-field of the MAC-header
-            is preserved.
-   x25iface X.25 interface encapsulation (first byte semantics as defined in
-            ../networking/x25-iface.txt). Use this for running the linux
-            X.25 network protocol stack (AF_X25 sockets) on top of isdn.
-   cisco-h  A special-mode for communicating with a Cisco, which is configured
-            to do "hdlc"
-   ethernet No stripping. Packets are sent with full MAC-header.
-            The Ethernet-address of the interface is faked, from its
-            IP-address: fc:fc:i1:i2:i3:i4, where i1-4 are the IP-addr.-values.
-   syncppp  Synchronous PPP
-
-   uihdlc   HDLC with UI-frame-header (for use with DOS ISPA, option -h1)
-
-
-   NOTE:    x25iface encapsulation is currently experimental. Please
-            read README.x25 for further details
-
-
-   Watching packets, using standard-tcpdump will fail for all encapsulations
-   except ethernet because tcpdump does not know how to handle packets
-   without MAC-header. A patch for tcpdump is included in the utility-package
-   mentioned above.
-
-   "isdnctrl l2_prot <InterfaceName> <L2-ProtocolName>"
-   Selects a layer-2-protocol.
-   (With the ICN-driver and the HiSax-driver, "x75i" and "hdlc" is available.
-   With other drivers, "x75ui", "x75bui", "x25dte", "x25dce" may be
-   possible too. See README.x25 for x25 related l2 protocols.)
-
-   isdnctrl l3_prot <InterfaceName> <L3-ProtocolName>
-   The same for layer-3. (At the moment only "trans" is allowed)
-
-   "isdnctrl list <InterfaceName>"
-   Shows all parameters of an interface and the charge-info.
-   Try "all" as the interface name.
-
-   "isdnctrl hangup <InterfaceName>"
-   Forces hangup of an interface.
-
-   "isdnctrl bind <InterfaceName> <DriverId>,<ChannelNumber> [exclusive]"
-   If you are using more than one ISDN card, it is sometimes necessary to
-   dial out using a specific card or even preserve a specific channel for
-   dialout of a specific net-interface. This can be done with the above
-   command. Replace <DriverId> by whatever you assigned while loading the
-   module. The <ChannelNumber> is counted from zero. The upper limit
-   depends on the card used. At the moment no card supports more than
-   2 channels, so the upper limit is one.
-
-   "isdnctrl unbind <InterfaceName>"
-   unbinds a previously bound interface.
-
-   "isdnctrl busreject <DriverId> on|off"
-   If switched on, isdn4linux replies a REJECT to incoming calls, it
-   cannot match to any configured interface.
-   If switched off, nothing happens in this case.
-   You normally should NOT enable this feature, if the ISDN adapter is not
-   the only device connected to the S0-bus. Otherwise it could happen that
-   isdn4linux rejects an incoming call, which belongs to another device on
-   the bus.
-
-   "isdnctrl addslave <InterfaceName> <SlaveName>
-   Creates a slave interface for channel-bundling. Slave interfaces are
-   not seen by the kernel, but their ISDN-part can be configured with
-   isdnctrl as usual. (Phone numbers, EAZ/MSN, timeouts etc.) If more
-   than two channels are to be bundled, feel free to create as many as you
-   want. InterfaceName must be a real interface, NOT a slave. Slave interfaces
-   start dialing, if the master interface resp. the previous slave interface
-   has a load of more than 7000 cps. They hangup if the load goes under 7000
-   cps, according to their "huptimeout"-parameter.
-
-   "isdnctrl sdelay <InterfaceName> secs."
-   This sets the minimum time an Interface has to be fully loaded, until
-   it sends a dial-request to its slave.
-
-   "isdnctrl dial <InterfaceName>"
-   Forces an interface to start dialing even if no packets are to be
-   transferred.
-
-   "isdnctrl mapping <DriverId> MSN0,MSN1,MSN2,...MSN9"
-   This installs a mapping table for EAZ<->MSN-mapping for a single line.
-   Missing MSN's have to be given as "-" or can be omitted, if at the end
-   of the commandline.
-   With this command, it's now possible to have an interface listening to
-   mixed 1TR6- and Euro-Type lines. In this case, the interface has to be
-   configured to a 1TR6-type EAZ (one digit). The mapping is also valid
-   for tty-emulation. Seen from the interface/tty-level the mapping
-   CAN be used, however it's possible to use single tty's/interfaces with
-   real MSN's (more digits) also, in which case the mapping will be ignored.
-   Here is an example:
-
-   You have a 1TR6-type line with base-nr. 1234567 and a Euro-line with
-   MSN's 987654, 987655 and 987656. The DriverId for the Euro-line is "EURO".
-
-   isdnctrl mapping EURO -,987654,987655,987656,-,987655
-   ...
-   isdnctrl eaz isdn0 1      # listen on 12345671(1tr6) and 987654(euro)
-   ...
-   isdnctrl eaz isdn1 4      # listen on 12345674(1tr6) only.
-   ...
-   isdnctrl eaz isdn2 987654 # listen on 987654(euro) only.
-
-   Same scheme is used with AT&E...  at the tty's.
-
-6. If you want to write a new low-level-driver, you are welcome.
-   The interface to the link-level-module is described in the file INTERFACE.
-   If the interface should be expanded for any reason, don't do it
-   on your own, send me a mail containing the proposed changes and
-   some reasoning about them.
-   If other drivers will not be affected, I will include the changes
-   in the next release.
-   For developers only, there is a second mailing-list. Write to me
-   (fritz@isdn4linux.de), if you want to join that list.
-
-Have fun!
-
- -Fritz
-
diff --git a/Documentation/isdn/README.FAQ b/Documentation/isdn/README.FAQ
deleted file mode 100644
index e5dd1addacdd..000000000000
--- a/Documentation/isdn/README.FAQ
+++ /dev/null
@@ -1,26 +0,0 @@
-
-The FAQ for isdn4linux
-======================
-
-Please note that there is a big FAQ available in the isdn4k-utils.
-You find it in:
- isdn4k-utils/FAQ/i4lfaq.sgml
-
-In case you just want to see the FAQ online, or download the newest version,
-you can have a look at my website:
-https://www.mhessler.de/i4lfaq/ (view + download)
-or:
-https://www.isdn4linux.de/faq/4lfaq.html (view)
-
-As the extension tells, the FAQ is in SGML format, and you can convert it
-into text/html/... format by using the sgml2txt/sgml2html/... tools.
-Alternatively, you can also do a 'configure; make all' in the FAQ directory.
-
-
-Please have a look at the FAQ before posting anything in the Mailinglist,
-or the newsgroup!
-
-
-Matthias Hessler
-hessler@isdn4linux.de
-
diff --git a/Documentation/isdn/README.HiSax b/Documentation/isdn/README.HiSax
deleted file mode 100644
index b1a573cf4472..000000000000
--- a/Documentation/isdn/README.HiSax
+++ /dev/null
@@ -1,659 +0,0 @@
-HiSax is a Linux hardware-level driver for passive ISDN cards with Siemens
-chipset (ISAC_S 2085/2086/2186, HSCX SAB 82525). It is based on the Teles
-driver from Jan den Ouden.
-It is meant to be used with isdn4linux, an ISDN link-level module for Linux
-written by Fritz Elfert.
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-
-Supported cards
----------------
-
-Teles 8.0/16.0/16.3 and compatible ones
-Teles 16.3c
-Teles S0/PCMCIA
-Teles PCI
-Teles S0Box
-Creatix S0Box
-Creatix PnP S0
-Compaq ISDN S0 ISA card
-AVM A1 (Fritz, Teledat 150)
-AVM Fritz PCMCIA
-AVM Fritz PnP
-AVM Fritz PCI
-ELSA Microlink PCC-16, PCF, PCF-Pro, PCC-8
-ELSA Quickstep 1000
-ELSA Quickstep 1000PCI
-ELSA Quickstep 3000 (same settings as QS1000)
-ELSA Quickstep 3000PCI
-ELSA PCMCIA
-ITK ix1-micro Rev.2
-Eicon Diva 2.0 ISA and PCI (S0 and U interface, no PRO version)
-Eicon Diva 2.01 ISA and PCI
-Eicon Diva 2.02 PCI
-Eicon Diva Piccola
-ASUSCOM NETWORK INC. ISDNLink 128K PC adapter (order code I-IN100-ST-D)
-Dynalink IS64PH (OEM version of ASUSCOM NETWORK INC. ISDNLink 128K adapter)
-PCBIT-DP (OEM version of ASUSCOM NETWORK INC. ISDNLink)
-HFC-2BS0 based cards (TeleInt SA1)
-Sedlbauer Speed Card (Speed Win, Teledat 100, PCI, Fax+)
-Sedlbauer Speed Star/Speed Star2 (PCMCIA)
-Sedlbauer ISDN-Controller PC/104
-USR Sportster internal TA (compatible Stollmann tina-pp V3)
-USR internal TA PCI
-ith Kommunikationstechnik GmbH MIC 16 ISA card
-Traverse Technologie NETjet PCI S0 card and NETspider U card
-Ovislink ISDN sc100-p card (NETjet driver)
-Dr. Neuhaus Niccy PnP/PCI
-Siemens I-Surf 1.0
-Siemens I-Surf 2.0 (with IPAC, try type 12 asuscom) 
-ACER P10
-HST Saphir
-Berkom Telekom A4T
-Scitel Quadro
-Gazel ISDN cards
-HFC-PCI based cards
-Winbond W6692 based cards
-HFC-S+, HFC-SP/PCMCIA cards
-formula-n enternow
-Gerdes Power ISDN
-
-Note: PCF, PCF-Pro: up to now, only the ISDN part is supported
-      PCC-8: not tested yet
-      Eicon.Diehl Diva U interface not tested
-
-If you know other passive cards with the Siemens chipset, please let me know.
-You can combine any card, if there is no conflict between the resources
-(io, mem, irq).
-
-
-Configuring the driver
-----------------------
-
-The HiSax driver can either be built directly into the kernel or as a module.
-It can be configured using the command line feature while loading the kernel
-with LILO or LOADLIN or, if built as a module, using insmod/modprobe with
-parameters.
-There is also some config needed before you compile the kernel and/or
-modules. It is included in the normal "make [menu]config" target at the
-kernel. Don't forget it, especially to select the right D-channel protocol.
-
-Please note: In older versions of the HiSax driver, all PnP cards
-needed to be configured with isapnp and worked only with the HiSax
-driver used as a module.
-
-In the current version, HiSax will automatically use the in-kernel
-ISAPnP support, provided you selected it during kernel configuration
-(CONFIG_ISAPNP), if you don't give the io=, irq= command line parameters.
-
-The affected card types are: 4,7,12,14,19,27-30
-
-a) when built as a module
--------------------------
-
-insmod/modprobe  hisax.o \
-  io=iobase irq=IRQ mem=membase type=card_type \
-  protocol=D_channel_protocol id=idstring
-
-or, if several cards are installed:
-
-insmod/modprobe hisax.o \
-  io=iobase1,iobase2,... irq=IRQ1,IRQ2,... mem=membase1,membase2,... \
-  type=card_type1,card_type2,... \
-  protocol=D_channel_protocol1,D_channel_protocol2,... \
-  id=idstring1%idstring2 ...
-
-where "iobaseN" represents the I/O base address of the Nth card, "membaseN"
-the memory base address of the Nth card, etc.
-
-The reason for the delimiter "%" being used in the idstrings is that ","
-won't work with the current modules package.
-
-The parameters may be specified in any order. For example, the "io"
-parameter may precede the "irq" parameter, or vice versa. If several
-cards are installed, the ordering within the comma separated parameter
-lists must of course be consistent.
-
-Only parameters applicable to the card type need to be specified. For
-example, the Teles 16.3 card is not memory-mapped, so the "mem"
-parameter may be omitted for this card. Sometimes it may be necessary
-to specify a dummy parameter, however. This is the case when there is
-a card of a different type later in the list that needs a parameter
-which the preceding card does not. For instance, if a Teles 16.0 card
-is listed after a Teles 16.3 card, a dummy memory base parameter of 0
-must be specified for the 16.3. Instead of a dummy value, the parameter
-can also be skipped by simply omitting the value. For example:
-mem=,0xd0000. See example 6 below.
-
-The parameter for the D-Channel protocol may be omitted if you selected the
-correct one during kernel config. Valid values are "1" for German 1TR6,
-"2" for EDSS1 (Euro ISDN), "3" for leased lines (no D-Channel) and "4"
-for US NI1.
-With US NI1 you have to include your SPID into the MSN setting in the form
-<MSN>:<SPID> for example (your phonenumber is 1234 your SPID 5678):
-AT&E1234:5678                       on ttyI interfaces
-isdnctrl eaz ippp0 1234:5678        on network devices
-
-The Creatix/Teles PnP cards use io1= and io2= instead of io= for specifying
-the I/O addresses of the ISAC and HSCX chips, respectively.
-
-Card types:
-
-    Type                Required parameters (in addition to type and protocol)
-
-    1   Teles 16.0               irq, mem, io
-    2   Teles  8.0               irq, mem
-    3   Teles 16.3 (non PnP)     irq, io
-    4   Creatix/Teles PnP        irq, io0 (ISAC), io1 (HSCX)
-    5   AVM A1 (Fritz)           irq, io
-    6   ELSA PCC/PCF cards       io or nothing for autodetect (the iobase is
-                                 required only if you have more than one ELSA
-                                 card in your PC)
-    7   ELSA Quickstep 1000      irq, io  (from isapnp setup)
-    8   Teles 16.3 PCMCIA     	 irq, io
-    9   ITK ix1-micro Rev.2      irq, io
-   10   ELSA PCMCIA		 irq, io  (set with card manager)
-   11   Eicon.Diehl Diva ISA PnP irq, io
-   11   Eicon.Diehl Diva PCI     no parameter
-   12   ASUS COM ISDNLink        irq, io  (from isapnp setup)
-   13   HFC-2BS0 based cards     irq, io
-   14   Teles 16.3c PnP          irq, io
-   15   Sedlbauer Speed Card     irq, io
-   15   Sedlbauer PC/104         irq, io
-   15   Sedlbauer Speed PCI	 no parameter
-   16   USR Sportster internal   irq, io
-   17   MIC card                 irq, io
-   18   ELSA Quickstep 1000PCI   no parameter
-   19   Compaq ISDN S0 ISA card  irq, io0, io1, io (from isapnp setup io=IO2)
-   20   NETjet PCI card          no parameter
-   21   Teles PCI                no parameter
-   22   Sedlbauer Speed Star (PCMCIA) irq, io (set with card manager)
-   24   Dr. Neuhaus Niccy PnP    irq, io0, io1 (from isapnp setup)
-   24   Dr. Neuhaus Niccy PCI    no parameter
-   25   Teles S0Box              irq, io (of the used lpt port)
-   26   AVM A1 PCMCIA (Fritz!)   irq, io (set with card manager)
-   27   AVM PnP (Fritz!PnP)      irq, io  (from isapnp setup)
-   27   AVM PCI (Fritz!PCI)      no parameter
-   28   Sedlbauer Speed Fax+     irq, io (from isapnp setup)
-   29	Siemens I-Surf 1.0       irq, io, memory (from isapnp setup)   
-   30	ACER P10                 irq, io (from isapnp setup)   
-   31	HST Saphir               irq, io
-   32	Telekom A4T              none
-   33	Scitel Quadro		 subcontroller (4*S0, subctrl 1...4)
-   34	Gazel ISDN cards (ISA)   irq,io
-   34	Gazel ISDN cards (PCI)   none
-   35	HFC 2BDS0 PCI            none
-   36	W6692 based PCI cards    none
-   37	HFC 2BDS0 S+, SP         irq,io 
-   38	NETspider U PCI card     none
-   39	HFC 2BDS0 SP/PCMCIA      irq,io (set with cardmgr)
-   40   hotplug interface
-   41   Formula-n enter:now PCI  none
-
-At the moment IRQ sharing is only possible with PCI cards. Please make sure
-that your IRQ is free and enabled for ISA use.
-
-
-Examples for module loading
-
-1. Teles 16.3, Euro ISDN, I/O base 280 hex, IRQ 10
-   modprobe hisax type=3 protocol=2 io=0x280 irq=10
-
-2. Teles 16.0, 1TR6 ISDN, I/O base d80 hex, IRQ 5, Memory d0000 hex
-   modprobe hisax protocol=1 type=1 io=0xd80 mem=0xd0000 irq=5
-
-3. Fritzcard, Euro ISDN, I/O base 340 hex, IRQ 10 and ELSA PCF, Euro ISDN
-   modprobe hisax type=5,6 protocol=2,2 io=0x340 irq=10 id=Fritz%Elsa
-
-4. Any ELSA PCC/PCF card, Euro ISDN
-   modprobe hisax type=6 protocol=2
-
-5. Teles 16.3 PnP, Euro ISDN, with isapnp configured
-   isapnp config:  (INT 0 (IRQ 10 (MODE +E)))
- 		   (IO 0 (BASE 0x0580))
-                   (IO 1 (BASE 0x0180))
-   modprobe hisax type=4 protocol=2 irq=10 io0=0x580 io1=0x180
-
-   In the current version of HiSax, you can instead simply use
-
-   modprobe hisax type=4 protocol=2
-
-   if you configured your kernel for ISAPnP. Don't run isapnp in
-   this case!
-
-6. Teles 16.3, Euro ISDN, I/O base 280 hex, IRQ 12 and
-   Teles 16.0, 1TR6, IRQ 5, Memory d0000 hex
-   modprobe hisax type=3,1 protocol=2,1 io=0x280 mem=0,0xd0000
-
-   Please note the dummy 0 memory address for the Teles 16.3, used as a
-   placeholder as described above, in the last example.
-
-7. Teles PCMCIA, Euro ISDN, I/O base 180 hex, IRQ 15 (default values)
-   modprobe hisax type=8 protocol=2 io=0x180 irq=15
-
-
-b) using LILO/LOADLIN, with the driver compiled directly into the kernel
-------------------------------------------------------------------------
-
-hisax=typ1,dp1,pa_1,pb_1,pc_1[,typ2,dp2,pa_2 ... \
-      typn,dpn,pa_n,pb_n,pc_n][,idstring1[,idstring2,...,idstringn]]
-
-where
-     typ1 = type of 1st card (default depends on kernel settings)
-     dp1  = D-Channel protocol of 1st card. 1=1TR6, 2=EDSS1, 3=leased
-     pa_1 = 1st parameter (depending on the type of the card)
-     pb_1 = 2nd parameter (    "     "   "   "   "   "   "  )
-     pc_1 = 3rd parameter (    "     "   "   "   "   "   "  )
-
-     typ2,dp2,pa_2,pb_2,pc_2 = Parameters of the second card (defaults: none)
-     typn,dpn,pa_n,pb_n,pc_n = Parameters of the n'th card (up to 16 cards are
-                                                                     supported)
-
-     idstring = Driver ID for accessing the particular card with utility
-                programs and for identification when using a line monitor
-                (default: "HiSax")
-
-                Note: the ID string must start with an alphabetical character!
-
-Card types:
-
-type
-    1 	Teles 16.0     	        pa=irq  pb=membase  pc=iobase
-    2 	Teles  8.0              pa=irq  pb=membase
-    3 	Teles 16.3              pa=irq  pb=iobase
-    4 	Creatix/Teles PNP     	ONLY WORKS AS A MODULE !
-    5 	AVM A1 (Fritz)          pa=irq  pb=iobase
-    6 	ELSA PCC/PCF cards      pa=iobase or nothing for autodetect
-    7   ELSA Quickstep 1000     ONLY WORKS AS A MODULE !
-    8   Teles S0 PCMCIA         pa=irq  pb=iobase
-    9   ITK ix1-micro Rev.2     pa=irq  pb=iobase
-   10   ELSA PCMCIA             pa=irq, pb=io  (set with card manager)
-   11   Eicon.Diehl Diva ISAPnP ONLY WORKS AS A MODULE !
-   11   Eicon.Diehl Diva PCI    no parameter
-   12   ASUS COM ISDNLink       ONLY WORKS AS A MODULE !
-   13	HFC-2BS0 based cards    pa=irq  pb=io
-   14   Teles 16.3c PnP         ONLY WORKS AS A MODULE !
-   15	Sedlbauer Speed Card    pa=irq  pb=io (Speed Win only as module !)
-   15   Sedlbauer PC/104        pa=irq  pb=io
-   15   Sedlbauer Speed PCI	no parameter
-   16   USR Sportster internal  pa=irq  pb=io
-   17   MIC card                pa=irq  pb=io
-   18   ELSA Quickstep 1000PCI  no parameter
-   19   Compaq ISDN S0 ISA card ONLY WORKS AS A MODULE !
-   20   NETjet PCI card         no parameter
-   21   Teles PCI               no parameter
-   22   Sedlbauer Speed Star (PCMCIA)  pa=irq, pb=io  (set with card manager)
-   24   Dr. Neuhaus Niccy PnP   ONLY WORKS AS A MODULE !
-   24   Dr. Neuhaus Niccy PCI   no parameter
-   25   Teles S0Box             pa=irq, pb=io (of the used lpt port)
-   26   AVM A1 PCMCIA (Fritz!)  pa=irq, pb=io (set with card manager)
-   27   AVM PnP (Fritz!PnP)     ONLY WORKS AS A MODULE !
-   27   AVM PCI (Fritz!PCI)     no parameter
-   28   Sedlbauer Speed Fax+    ONLY WORKS AS A MODULE !
-   29	Siemens I-Surf 1.0      ONLY WORKS AS A MODULE !
-   30	ACER P10                ONLY WORKS AS A MODULE !
-   31	HST Saphir              pa=irq, pb=io
-   32	Telekom A4T             no parameter
-   33	Scitel Quadro		subcontroller (4*S0, subctrl 1...4)
-   34	Gazel ISDN cards (ISA)  pa=irq, pb=io
-   34	Gazel ISDN cards (PCI)  no parameter
-   35	HFC 2BDS0 PCI           no parameter
-   36	W6692 based PCI cards   none
-   37	HFC 2BDS0 S+,SP/PCMCIA  ONLY WORKS AS A MODULE !
-   38	NETspider U PCI card    none
-   39	HFC 2BDS0 SP/PCMCIA     ONLY WORKS AS A MODULE !
-   40   hotplug interface	ONLY WORKS AS A MODULE !
-   41   Formula-n enter:now PCI none
-
-Running the driver
-------------------
-
-When you insmod isdn.o and hisax.o (or with the in-kernel version, during
-boot time), a few lines should appear in your syslog. Look for something like:
-
-Apr 13 21:01:59 kke01 kernel: HiSax: Driver for Siemens chip set ISDN cards
-Apr 13 21:01:59 kke01 kernel: HiSax: Version 2.9
-Apr 13 21:01:59 kke01 kernel: HiSax: Revisions 1.14/1.9/1.10/1.25/1.8
-Apr 13 21:01:59 kke01 kernel: HiSax: Total 1 card defined
-Apr 13 21:01:59 kke01 kernel: HiSax: Card 1 Protocol EDSS1 Id=HiSax1 (0)
-Apr 13 21:01:59 kke01 kernel: HiSax: Elsa driver Rev. 1.13
-...
-Apr 13 21:01:59 kke01 kernel: Elsa: PCF-Pro found at 0x360 Rev.:C IRQ 10
-Apr 13 21:01:59 kke01 kernel: Elsa: timer OK; resetting card
-Apr 13 21:01:59 kke01 kernel: Elsa: HSCX version A: V2.1  B: V2.1
-Apr 13 21:01:59 kke01 kernel: Elsa: ISAC 2086/2186 V1.1
-...
-Apr 13 21:01:59 kke01 kernel: HiSax: DSS1 Rev. 1.14
-Apr 13 21:01:59 kke01 kernel: HiSax: 2 channels added
-
-This means that the card is ready for use.
-Cabling problems or line-downs are not detected, and only some ELSA cards can
-detect the S0 power.
-
-Remember that, according to the new strategy for accessing low-level drivers
-from within isdn4linux, you should also define a driver ID while doing
-insmod: Simply append hisax_id=<SomeString> to the insmod command line. This
-string MUST NOT start with a digit or a small 'x'!
-
-At this point you can run a 'cat /dev/isdnctrl0' and view debugging messages.
-
-At the moment, debugging messages are enabled with the hisaxctrl tool:
-
-    hisaxctrl <DriverId> DebugCmd <debugging_flags>
-
-<DriverId> default is HiSax, if you didn't specify one.
-
-DebugCmd is  1  for generic debugging
-            11  for layer 1 development debugging
-            13  for layer 3 development debugging
-
-where <debugging_flags> is the integer sum of the following debugging
-options you wish enabled:
-
-With DebugCmd set to 1:
-
-   0x0001  Link-level <--> hardware-level communication
-   0x0002  Top state machine
-   0x0004  D-Channel Frames for isdnlog
-   0x0008  D-Channel Q.921
-   0x0010  B-Channel X.75
-   0x0020  D-Channel l2
-   0x0040  B-Channel l2
-   0x0080  D-Channel link state debugging
-   0x0100  B-Channel link state debugging
-   0x0200  TEI debug
-   0x0400  LOCK debug in callc.c
-   0x0800  More paranoid debug in callc.c (not for normal use)
-   0x1000  D-Channel l1 state debugging
-   0x2000  B-Channel l1 state debugging
-
-With DebugCmd set to 11:
-
-   0x0001  Warnings (default: on)
-   0x0002  IRQ status
-   0x0004  ISAC
-   0x0008  ISAC FIFO
-   0x0010  HSCX
-   0x0020  HSCX FIFO (attention: full B-Channel output!)
-   0x0040  D-Channel LAPD frame types
-   0x0080  IPAC debug
-   0x0100  HFC receive debug
-   0x0200  ISAC monitor debug
-   0x0400  D-Channel frames for isdnlog (set with 1 0x4 too)
-   0x0800  D-Channel message verbose
-
-With DebugCmd set to 13:
-
-         1  Warnings (default: on)
-         2  l3 protocol descriptor errors
-         4  l3 state machine
-         8  charge info debugging (1TR6)
-
-For example, 'hisaxctrl HiSax 1 0x3ff' enables full generic debugging.
-
-Because of some obscure problems with some switch equipment, the delay
-between the CONNECT message and sending the first data on the B-channel is now
-configurable with
-
-hisaxctrl <DriverId> 2 <delay>
-<delay> in ms Value between 50 and 800 ms is recommended.
-
-Downloading Firmware
---------------------
-At the moment, the Sedlbauer speed fax+ is the only card, which
-needs to download firmware.
-The firmware is downloaded with the hisaxctrl tool:
-
-    hisaxctrl <DriverId> 9 <firmware_filename>
-
-<DriverId> default is HiSax, if you didn't specify one,
-
-where <firmware_filename> is the filename of the firmware file.
-
-For example, 'hisaxctrl HiSax 9 ISAR.BIN' downloads the firmware for
-ISAR based cards (like the Sedlbauer speed fax+).
-
-Warning
--------
-HiSax is a work in progress and may crash your machine.
-For certification look at HiSax.cert file.
-
-Limitations
------------
-At this time, HiSax only works on Euro ISDN lines and German 1TR6 lines.
-For leased lines see appendix.
-
-Bugs
-----
-If you find any, please let me know.
-
-
-Thanks
-------
-Special thanks to:
-
-        Emil Stephan for the name HiSax which is a mix of HSCX and ISAC.
-
-        Fritz Elfert, Jan den Ouden, Michael Hipp, Michael Wein,
-        Andreas Kool, Pekka Sarnila, Sim Yskes, Johan Myrre'en,
-	Klaus-Peter Nischke (ITK AG), Christof Petig, Werner Fehn (ELSA GmbH),
-	Volker Schmidt
-	Edgar Toernig and Marcus Niemann for the Sedlbauer driver
-	Stephan von Krawczynski
-	Juergen Quade for the Leased Line part
-	Klaus Lichtenwalder (Klaus.Lichtenwalder@WebForum.DE), for ELSA PCMCIA support
-	Enrik Berkhan (enrik@starfleet.inka.de) for S0BOX specific stuff
-	Ton van Rosmalen for Teles PCI
-	Petr Novak <petr.novak@i.cz> for Winbond W6692 support
-	Werner Cornelius <werner@isdn4linux.de> for HFC-PCI, HFC-S(+/P) and supplementary services support
-        and more people who are hunting bugs. (If I forgot somebody, please
-	send me a mail).
-
-        Firma ELSA GmbH
-        Firma Eicon.Diehl GmbH
-        Firma Dynalink NL
-	Firma ASUSCOM NETWORK INC. Taiwan
-	Firma S.u.S.E
-	Firma ith Kommunikationstechnik GmbH
-	Firma Traverse Technologie Australia
-	Firma Medusa GmbH  (www.medusa.de).
-	Firma Quant-X Austria for sponsoring a DEC Alpha board+CPU
-	Firma Cologne Chip Designs GmbH
-
-        My girl friend and partner in life Ute for her patience with me.
-
-
-Enjoy,
-
-Karsten Keil
-keil@isdn4linux.de
-
-
-Appendix: Teles PCMCIA driver
------------------------------
-
-See
-   http://www.linux.no/teles_cs.txt 
-for instructions.
-
-Appendix: Linux and ISDN-leased lines
--------------------------------------
-
-Original from Juergen Quade, new version KKe.
-
-Attention NEW VERSION, the old leased line syntax won't work !!!
-
-You can use HiSax to connect your Linux-Box via an ISDN leased line
-to e.g. the Internet:
-
-1. Build a kernel which includes the HiSax driver either as a module
-   or as part of the kernel.
-     cd /usr/src/linux
-     make menuconfig
-     <ISDN subsystem - ISDN support -- HiSax>
-     make clean; make zImage; make modules; make modules_install
-2. Install the new kernel
-     cp /usr/src/linux/arch/x86/boot/zImage /etc/kernel/linux.isdn
-     vi /etc/lilo.conf
-     <add new kernel in the bootable image section>
-     lilo
-3. in case the hisax driver is a "fixed" part of the kernel, configure
-   the driver with lilo:
-     vi /etc/lilo.conf
-     <add HiSax driver parameter in the global section (see below)>
-     lilo
-   Your lilo.conf _might_ look like the following:
-
-	# LILO configuration-file
-	# global section
-    # teles 16.0 on IRQ=5, MEM=0xd8000, PORT=0xd80
-	append="hisax=1,3,5,0xd8000,0xd80,HiSax"
-    # teles 16.3 (non pnp) on IRQ=15, PORT=0xd80
-	# append="hisax=3,3,5,0xd8000,0xd80,HiSax"
-	boot=/dev/sda
-	compact        # faster, but won't work on all systems.
-	linear
-	read-only
-	prompt
-	timeout=100
-	vga = normal    # force sane state
-	# Linux bootable partition config begins
-	image = /etc/kernel/linux.isdn
-	root = /dev/sda1
-	label = linux.isdn
-	#
-	image = /etc/kernel/linux-2.0.30
-	root = /dev/sda1
-	label = linux.secure
-
-   In the line starting with "append" you have to adapt the parameters
-   according to your card (see above in this file)
-
-3. boot the new linux.isdn kernel
-4. start the ISDN subsystem:
-   a) load - if necessary - the modules (depends, whether you compiled
-      the ISDN driver as module or not)
-      According to the type of card you have to specify the necessary
-      driver parameter (irq, io, mem, type, protocol).
-      For the leased line the protocol is "3". See the table above for
-      the parameters, which you have to specify depending on your card.
-   b) configure i4l
-      /sbin/isdnctrl addif isdn0
-      # EAZ  1 -- B1 channel   2 --B2 channel
-      /sbin/isdnctrl eaz isdn0 1
-      /sbin/isdnctrl secure isdn0 on
-      /sbin/isdnctrl huptimeout isdn0 0
-      /sbin/isdnctrl l2_prot isdn0 hdlc
-      # Attention you must not set an outgoing number !!! This won't work !!!
-      # The incoming number is LEASED0 for the first card, LEASED1 for the
-      # second and so on.
-      /sbin/isdnctrl addphone isdn0 in LEASED0
-      # Here is no need to bind the channel.
-   c) in case the remote partner is a CISCO:
-      /sbin/isdnctrl encap isdn0 cisco-h
-   d) configure the interface
-      /sbin/ifconfig isdn0 ${LOCAL_IP} pointopoint ${REMOTE_IP}
-   e) set the routes
-      /sbin/route add -host ${REMOTE_IP} isdn0
-      /sbin/route add default gw ${REMOTE_IP}
-   f) switch the card into leased mode for each used B-channel
-      /sbin/hisaxctrl HiSax 5 1
-
-Remarks:
-a) Use state of the art isdn4k-utils
-
-Here an example script:
-#!/bin/sh
-# Start/Stop ISDN leased line connection
-
-I4L_AS_MODULE=yes
-I4L_REMOTE_IS_CISCO=no
-I4L_MODULE_PARAMS="type=16 io=0x268 irq=7 "
-I4L_DEBUG=no
-I4L_LEASED_128K=yes
-LOCAL_IP=192.168.1.1
-REMOTE_IP=192.168.2.1
-
-case "$1" in
-    start)
-	echo "Starting ISDN ..."
-        if [ ${I4L_AS_MODULE} = "yes" ]; then
-		echo "loading modules..."
-		/sbin/modprobe hisax ${I4L_MODULE_PARAMS}
-	fi
-	# configure interface
-	/sbin/isdnctrl addif isdn0
-	/sbin/isdnctrl secure isdn0 on
-	if [ ${I4L_DEBUG} = "yes" ]; then
-		/sbin/isdnctrl verbose 7
-		/sbin/hisaxctrl HiSax 1 0xffff
-		/sbin/hisaxctrl HiSax 11 0xff
-		cat  /dev/isdnctrl >/tmp/lea.log &
-	fi
-	if [ ${I4L_REMOTE_IS_CISCO} = "yes" ]; then
-		/sbin/isdnctrl encap isdn0 cisco-h
-	fi
-	/sbin/isdnctrl huptimeout isdn0 0
-	# B-CHANNEL 1
-	/sbin/isdnctrl eaz isdn0 1
-	/sbin/isdnctrl l2_prot isdn0 hdlc
-	# 1. card
-	/sbin/isdnctrl addphone isdn0 in LEASED0
-        if [ ${I4L_LEASED_128K} = "yes" ]; then
-		/sbin/isdnctrl addslave isdn0 isdn0s
-		/sbin/isdnctrl secure isdn0s on
-		/sbin/isdnctrl huptimeout isdn0s 0
-		# B-CHANNEL 2
-		/sbin/isdnctrl eaz isdn0s 2
-		/sbin/isdnctrl l2_prot isdn0s hdlc
-		# 1. card
-		/sbin/isdnctrl addphone isdn0s in LEASED0
-		if [ ${I4L_REMOTE_IS_CISCO} = "yes" ]; then
-			/sbin/isdnctrl encap isdn0s cisco-h
-		fi
-	fi
-	/sbin/isdnctrl dialmode isdn0 manual
-	# configure tcp/ip
-	/sbin/ifconfig isdn0 ${LOCAL_IP} pointopoint ${REMOTE_IP}
-	/sbin/route add -host ${REMOTE_IP} isdn0
-	/sbin/route add default gw ${REMOTE_IP}
-	# switch to leased mode
-	# B-CHANNEL 1
-	/sbin/hisaxctrl HiSax 5 1
-        if [ ${I4L_LEASED_128K} = "yes" ]; then
-		# B-CHANNEL 2
-		sleep 10; /* Wait for master */
-		/sbin/hisaxctrl HiSax 5 2
-	fi
-	;;
-    stop)
-	/sbin/ifconfig isdn0 down
-	/sbin/isdnctrl delif isdn0
-	if [ ${I4L_DEBUG} = "yes" ]; then
-		killall cat
-	fi
-	if [ ${I4L_AS_MODULE} = "yes" ]; then
-		/sbin/rmmod hisax
-		/sbin/rmmod isdn
-		/sbin/rmmod ppp
-		/sbin/rmmod slhc
-	fi
-	;;
-    *)
-	echo "Usage: $0 {start|stop}"
-	exit 1
-esac
-exit 0
diff --git a/Documentation/isdn/README.audio b/Documentation/isdn/README.audio
deleted file mode 100644
index 8ebca19290d9..000000000000
--- a/Documentation/isdn/README.audio
+++ /dev/null
@@ -1,138 +0,0 @@
-$Id: README.audio,v 1.8 1999/07/11 17:17:29 armin Exp $
-
-ISDN subsystem for Linux.
-  Description of audio mode.
-
-When enabled during kernel configuration, the tty emulator of the ISDN
-subsystem is capable of a reduced set of commands to support audio.
-This document describes the commands supported and the format of
-audio data.
-
-Commands for enabling/disabling audio mode:
-
-        AT+FCLASS=8      Enable audio mode.
-                         This affects the following registers:
-                           S18: Bits 0 and 2 are set.
-                           S16: Set to 48 and any further change to
-                                larger values is blocked.
-        AT+FCLASS=0      Disable audio mode.
-                         Register 18 is set to 4.
-        AT+FCLASS=?      Show possible modes.
-        AT+FCLASS?       Report current mode (0 or 8).
-
-Commands supported in audio mode:
-
-All audio mode commands have one of the following forms:
-
-        AT+Vxx?          Show current setting.
-        AT+Vxx=?         Show possible settings.
-        AT+Vxx=v         Set simple parameter.
-        AT+Vxx=v,v ...   Set complex parameter.
-
-where xx is a two-character code and v are alphanumerical parameters.
-The following commands are supported:
-
-        AT+VNH=x         Auto hangup setting. NO EFFECT, supported
-                         for compatibility only.
-        AT+VNH?          Always reporting "1"
-        AT+VNH=?         Always reporting "1"
-
-        AT+VIP           Reset all audio parameters.
-
-        AT+VLS=x         Line select. x is one of the following:
-                           0 = No device.
-                           2 = Phone line.
-        AT+VLS=?         Always reporting "0,2"
-        AT+VLS?          Show current line.
-
-        AT+VRX           Start recording. Emulator responds with
-                         CONNECT and starts sending audio data to
-                         the application. See below for data format
-
-        AT+VSD=x,y       Set silence-detection parameters.
-                         Possible parameters:
-                           x = 0 ... 31  sensitivity threshold level.
-                                         (default 0 , deactivated)
-                           y = 0 ... 255 range of interval in units
-                                         of 0.1 second. (default 70)
-        AT+VSD=?         Report possible parameters.
-        AT+VSD?          Show current parameters.
-
-        AT+VDD=x,y       Set DTMF-detection parameters.
-                         Only possible if online and during this connection.
-                         Possible parameters:
-                           x = 0 ... 15  sensitivity threshold level.
-                                         (default 0 , I4L soft-decode)
-                                         (1-15 soft-decode off, hardware on)
-                           y = 0 ... 255 tone duration in units of 5ms.
-                                         Not for I4L soft decode (default 8, 40ms)
-        AT+VDD=?         Report possible parameters.
-        AT+VDD?          Show current parameters.
-
-        AT+VSM=x         Select audio data format.
-                         Possible parameters:
-                           2 = ADPCM-2
-                           3 = ADPCM-3
-                           4 = ADPCM-4
-                           5 = aLAW
-                           6 = uLAW
-        AT+VSM=?         Show possible audio formats.
-
-        AT+VTX           Start audio playback. Emulator responds
-                         with CONNECT and starts sending audio data
-                         received from the application via phone line.
-General behavior and description of data formats/protocol.
-    when a connection is made:
-
-      On incoming calls, if the application responds to a RING
-      with ATA, depending on the calling service, the emulator
-      responds with either CONNECT (data call) or VCON (voice call).
-      
-      On outgoing voice calls, the emulator responds with VCON
-      upon connection setup.
-
-  Audio recording.
-
-    When receiving audio data, a kind of bisync protocol is used.
-    Upon AT+VRX command, the emulator responds with CONNECT, and
-    starts sending audio data to the application. There are several
-    escape sequences defined, all using DLE (0x10) as Escape char:
-
-    <DLE><ETX>              End of audio data. (i.e. caused by a
-                            hangup of the remote side) Emulator stops
-                            recording, responding with VCON.
-    <DLE><DC4>		    Abort recording, (send by appl.) Emulator
-			    stops recording, sends DLE,ETX.
-    <DLE><DLE>              Escape sequence for DLE in data stream.
-    <DLE>0                  Touchtone "0" received.
-         ...
-    <DLE>9                  Touchtone "9" received.
-    <DLE>#                  Touchtone "#" received.
-    <DLE>*                  Touchtone "*" received.
-    <DLE>A                  Touchtone "A" received.
-    <DLE>B                  Touchtone "B" received.
-    <DLE>C                  Touchtone "C" received.
-    <DLE>D                  Touchtone "D" received.
-
-    <DLE>q                  quiet. Silence detected after non-silence.
-    <DLE>s                  silence. Silence detected from the
-                            start of recording.
-
-    Currently unsupported DLE sequences:
-
-    <DLE>c                  FAX calling tone received.
-    <DLE>b                  busy tone received.
-
-  Audio playback.
-
-    When sending audio data, upon AT+VTX command, emulator responds with
-    CONNECT, and starts transferring data from application to the phone line.
-    The same DLE sequences apply to this mode.
-
-  Full-Duplex-Audio:
-
-    When _both_ commands for recording and playback are given in _one_
-    AT-command-line (i.e.: "AT+VTX+VRX"), full-duplex-mode is selected.
-	In this mode, the only way to stop recording is sending <DLE><DC4>
-    and the only way to stop playback is to send <DLE><ETX>.
-
diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap
deleted file mode 100644
index a76d74845a4c..000000000000
--- a/Documentation/isdn/README.concap
+++ /dev/null
@@ -1,259 +0,0 @@
-Description of the "concap" encapsulation protocol interface
-============================================================
-
-The "concap" interface is intended to be used by network device
-drivers that need to process an encapsulation protocol. 
-It is assumed that the protocol interacts with a linux network device by
-- data transmission
-- connection control (establish, release)
-Thus, the mnemonic: "CONnection CONtrolling eNCAPsulation Protocol".
-
-This is currently only used inside the isdn subsystem. But it might
-also be useful to other kinds of network devices. Thus, if you want
-to suggest changes that improve usability or performance of the
-interface, please let me know. I'm willing to include them in future
-releases (even if I needed to adapt the current isdn code to the
-changed interface).
-
-
-Why is this useful?
-===================
-
-The encapsulation protocol used on top of WAN connections or permanent
-point-to-point links are frequently chosen upon bilateral agreement.
-Thus, a device driver for a certain type of hardware must support
-several different encapsulation protocols at once.
-
-The isdn device driver did already support several different
-encapsulation protocols. The encapsulation protocol is configured by a
-user space utility (isdnctrl). The isdn network interface code then
-uses several case statements which select appropriate actions
-depending on the currently configured encapsulation protocol.
-
-In contrast, LAN network interfaces always used a single encapsulation
-protocol which is unique to the hardware type of the interface. The LAN
-encapsulation is usually done by just sticking a header on the data. Thus,
-traditional linux network device drivers used to process the
-encapsulation protocol directly (usually by just providing a hard_header()
-method in the device structure) using some hardware type specific support
-functions. This is simple, direct and efficient. But it doesn't fit all
-the requirements for complex WAN encapsulations. 
-
-
-   The configurability of the encapsulation protocol to be used
-   makes isdn network interfaces more flexible, but also much more
-   complex than traditional lan network interfaces.
-
-
-Many Encapsulation protocols used on top of WAN connections will not just
-stick a header on the data. They also might need to set up or release
-the WAN connection. They also might want to send other data for their
-private purpose over the wire, e.g. ppp does a lot of link level
-negotiation before the first piece of user data can be transmitted.
-Such encapsulation protocols for WAN devices are typically more complex
-than encapsulation protocols for lan devices. Thus, network interface
-code for typical WAN devices also tends to be more complex.
-
-
-In order to support Linux' x25 PLP implementation on top of
-isdn network interfaces I could have introduced yet another branch to
-the various case statements inside drivers/isdn/isdn_net.c.
-This eventually made isdn_net.c even more complex. In addition, it made
-isdn_net.c harder to maintain. Thus, by identifying an abstract
-interface between the network interface code and the encapsulation
-protocol, complexity could be reduced and maintainability could be
-increased.
-
-
-Likewise, a similar encapsulation protocol will frequently be needed by
-several different interfaces of even different hardware type, e.g. the
-synchronous ppp implementation used by the isdn driver and the
-asynchronous ppp implementation used by the ppp driver have a lot of
-similar code in them. By cleanly separating the encapsulation protocol
-from the hardware specific interface stuff such code could be shared
-better in future.
-
-
-When operating over dial-up-connections (e.g. telephone lines via modem,
-non-permanent virtual circuits of wide area networks, ISDN) many
-encapsulation protocols will need to control the connection. Therefore,
-some basic connection control primitives are supported. The type and
-semantics of the connection (i.e the ISO layer where connection service
-is provided) is outside our scope and might be different depending on
-the encapsulation protocol used, e.g. for a ppp module using our service
-on top of a modem connection a connect_request will result in dialing
-a (somewhere else configured) remote phone number. For an X25-interface
-module (LAPB semantics, as defined in Documentation/networking/x25-iface.txt)
-a connect_request will ask for establishing a reliable lapb
-datalink connection.
-
-
-The encapsulation protocol currently provides the following
-service primitives to the network device.
-
-- create a new encapsulation protocol instance
-- delete encapsulation protocol instance and free all its resources
-- initialize (open) the encapsulation protocol instance for use.
-- deactivate (close) an encapsulation protocol instance.
-- process (xmit) data handed down by upper protocol layer
-- receive data from lower (hardware) layer
-- process connect indication from lower (hardware) layer
-- process disconnect indication from lower (hardware) layer
-
-
-The network interface driver accesses those primitives via callbacks
-provided by the encapsulation protocol instance within a
-struct concap_proto_ops.
-
-struct concap_proto_ops{
-
-	/* create a new encapsulation protocol instance of same type */
-	struct concap_proto *  (*proto_new) (void);
-
-	/* delete encapsulation protocol instance and free all its resources.
-	   cprot may no longer be referenced after calling this */
-	void (*proto_del)(struct concap_proto *cprot);
-
-	/* initialize the protocol's data. To be called at interface startup
-	   or when the device driver resets the interface. All services of the
-	   encapsulation protocol may be used after this*/
-	int (*restart)(struct concap_proto *cprot, 
-		       struct net_device *ndev,
-		       struct concap_device_ops *dops);
-
-	/* deactivate an encapsulation protocol instance. The encapsulation
-	   protocol may not call any *dops methods after this. */
-	int (*close)(struct concap_proto *cprot);
-
-	/* process a frame handed down to us by upper layer */
-	int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb);
-
-	/* to be called for each data entity received from lower layer*/ 
-	int (*data_ind)(struct concap_proto *cprot, struct sk_buff *skb);
-
-	/* to be called when a connection was set up/down.
-	   Protocols that don't process these primitives might fill in
-	   dummy methods here */
-	int (*connect_ind)(struct concap_proto *cprot);
-	int (*disconn_ind)(struct concap_proto *cprot);
-};
-
-
-The data structures are defined in the header file include/linux/concap.h.
-
-
-A Network interface using encapsulation protocols must also provide
-some service primitives to the encapsulation protocol:
-
-- request data being submitted by lower layer (device hardware) 
-- request a connection being set up by lower layer 
-- request a connection being released by lower layer
-
-The encapsulation protocol accesses those primitives via callbacks
-provided by the network interface within a struct concap_device_ops.
-
-struct concap_device_ops{
-
-	/* to request data be submitted by device */ 
-	int (*data_req)(struct concap_proto *, struct sk_buff *);
-
-	/* Control methods must be set to NULL by devices which do not
-	   support connection control. */
-	/* to request a connection be set up */ 
-	int (*connect_req)(struct concap_proto *);
-
-	/* to request a connection be released */
-	int (*disconn_req)(struct concap_proto *);	
-};
-
-The network interface does not explicitly provide a receive service
-because the encapsulation protocol directly calls netif_rx(). 
-
-
-
-
-An encapsulation protocol itself is actually the
-struct concap_proto{
-	struct net_device *net_dev;		/* net device using our service  */
-	struct concap_device_ops *dops; /* callbacks provided by device */
- 	struct concap_proto_ops  *pops; /* callbacks provided by us */
-	int flags;
-	void *proto_data;               /* protocol specific private data, to
-					   be accessed via *pops methods only*/
-	/*
-	  :
-	  whatever 
-	  :
-	  */
-};
-
-Most of this is filled in when the device requests the protocol to 
-be reset (opend). The network interface must provide the net_dev and
-dops pointers. Other concap_proto members should be considered private
-data that are only accessed by the pops callback functions. Likewise,
-a concap proto should access the network device's private data
-only by means of the callbacks referred to by the dops pointer.
-
-
-A possible extended device structure which uses the connection controlling
-encapsulation services could look like this:
-
-struct concap_device{
-	struct net_device net_dev;
-	struct my_priv  /* device->local stuff */
-			/* the my_priv struct might contain a 
-			   struct concap_device_ops *dops;
-	                   to provide the device specific callbacks
-			*/
-	struct concap_proto *cprot;        /* callbacks provided by protocol */
-};
-
-
-
-Misc Thoughts
-=============
-
-The concept of the concap proto might help to reuse protocol code and
-reduce the complexity of certain network interface implementations.
-The trade off is that it introduces yet another procedure call layer
-when processing the protocol. This has of course some impact on
-performance. However, typically the concap interface will be used by
-devices attached to slow lines (like telephone, isdn, leased synchronous
-lines). For such slow lines, the overhead is probably negligible.
-This might no longer hold for certain high speed WAN links (like
-ATM).
-
-
-If general linux network interfaces explicitly supported concap
-protocols (e.g. by a member struct concap_proto* in struct net_device)
-then the interface of the service function could be changed
-by passing a pointer of type (struct net_device*) instead of
-type (struct concap_proto*). Doing so would make many of the service
-functions compatible to network device support functions.
-
-e.g. instead of the concap protocol's service function
-
-  int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb);
-
-we could have
-
-  int (*encap_and_xmit)(struct net_device *ndev, struct sk_buff *skb);
-
-As this is compatible to the dev->hard_start_xmit() method, the device
-driver could directly register the concap protocol's encap_and_xmit()
-function as its hard_start_xmit() method. This would eliminate one
-procedure call layer.
-
-
-The device's data request function could also be defined as
- 
-  int (*data_req)(struct net_device *ndev, struct sk_buff *skb);
-
-This might even allow for some protocol stacking. And the network
-interface might even register the same data_req() function directly
-as its hard_start_xmit() method when a zero layer encapsulation
-protocol is configured. Thus, eliminating the performance penalty
-of the concap interface when a trivial concap protocol is used.
-Nevertheless, the device remains able to support encapsulation
-protocol configuration.
-
diff --git a/Documentation/isdn/README.diversion b/Documentation/isdn/README.diversion
deleted file mode 100644
index bddcd5fb86ff..000000000000
--- a/Documentation/isdn/README.diversion
+++ /dev/null
@@ -1,127 +0,0 @@
-The isdn diversion services are a supporting module working together with
-the isdn4linux and the HiSax module for passive cards. 
-Active cards, TAs and cards using a own or other driver than the HiSax 
-module need to be adapted to the HL<->LL interface described in a separate 
-document. The diversion services may be used with all cards supported by 
-the HiSax driver.
-The diversion kernel interface and controlling tool divertctrl were written
-by Werner Cornelius (werner@isdn4linux.de or werner@titro.de) under the
-GNU General Public License.
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-Table of contents
-=================
-
-1. Features of the i4l diversion services 
-   (Or what can the i4l diversion services do for me)
-
-2. Required hard- and software
-
-3. Compiling, installing and loading/unloading the module  
-   Tracing calling and diversion information 
-
-4. Tracing calling and diversion information
- 
-5. Format of the divert device ASCII output
- 
-
-1. Features of the i4l diversion services 
-   (Or what can the i4l diversion services do for me)
-
-   The i4l diversion services offers call forwarding and logging normally 
-   only supported by isdn phones. Incoming calls may be diverted 
-   unconditionally (CFU), when not reachable (CFNR) or on busy condition 
-   (CFB). 
-   The diversions may be invoked statically in the providers exchange
-   as normally done by isdn phones. In this case all incoming calls
-   with a special (or all) service identifiers are forwarded if the 
-   forwarding reason is met. Activated static services may also be 
-   interrogated (queried).
-   The i4l diversion services additionally offers a dynamic version of
-   call forwarding which is not preprogrammed inside the providers exchange
-   but dynamically activated by i4l.
-   In this case all incoming calls are checked by rules that may be
-   compared to the mechanism of ipfwadm or ipchains. If a given rule matches
-   the checking process is finished and the rule matching will be applied
-   to the call.
-   The rules include primary and secondary service identifiers, called 
-   number and subaddress, callers number and subaddress and whether the rule
-   matches to all filtered calls or only those when all B-channel resources
-   are exhausted.
-   Actions that may be invoked by a rule are ignore, proceed, reject, 
-   direct divert or delayed divert of a call.
-   All incoming calls matching a rule except the ignore rule a reported and
-   logged as ASCII via the proc filesystem (/proc/net/isdn/divert). If proceed
-   is selected the call will be held in a proceeding state (without ringing)
-   for a certain amount of time to let an external program or client decide
-   how to handle the call. 
-            
-
-2. Required hard- and software
-   
-   For using the i4l diversion services the isdn line must be of a EURO/DSS1
-   type. Additionally the i4l services only work together with the HiSax 
-   driver for passive isdn cards. All HiSax supported cards may be used for
-   the diversion purposes.
-   The static diversion services require the provider having static services
-   CFU, CFNR, CFB activated on an MSN-line. The static services may not be 
-   used on a point-to-point connection. Further the static services are only
-   available in some countries (for example germany). Countries requiring the 
-   keypad protocol for activating static diversions (like the netherlands) are
-   not supported but may use the tty devices for this purpose.
-   The dynamic diversion services may be used in all countries if the provider
-   enables the feature CF (call forwarding). This should work on both MSN- and
-   point-to-point lines.
-   To add and delete rules the additional divertctrl program is needed. This
-   program is part of the isdn4kutils package.   
-
-3. Compiling, installing and loading/unloading the module  
-   Tracing calling and diversion information 
-
-
-   To compile the i4l code with diversion support you need to say yes to the 
-   DSS1 diversion services when selecting the i4l options in the kernel 
-   config (menuconfig or config).
-   After having properly activated a make modules and make modules_install all
-   required modules will be correctly installed in the needed modules dirs.
-   As the diversion services are currently not included in the scripts of most
-   standard distributions you will have to add a "insmod dss1_divert" after
-   having loaded the global isdn module.
-   The module can be loaded without any command line parameters.
-   If the module is actually loaded and active may be checked with a 
-   "cat /proc/modules" or "ls /proc/net/isdn/divert". The divert file is 
-   dynamically created by the diversion module and removed when the module is
-   unloaded.
-
-
-4. Tracing calling and diversion information
- 
-   You also may put a "cat /proc/net/isdn/divert" in the background with the
-   output redirected to a file. Then all actions of the module are logged.
-   The divert file in the proc system may be opened more than once, so in 
-   conjunction with inetd and a small remote client on other machines inside
-   your network incoming calls and reactions by the module may be shown on 
-   every listening machine. 
-   If a call is reported as proceeding an external program or client may 
-   specify during a certain amount of time (normally 4 to 10 seconds) what
-   to do with that call.      
-   To unload the module all open files to the device in the proc system must
-   be closed. Otherwise the module (and isdn.o) may not be unloaded. 
-
-5. Format of the divert device ASCII output
- 
-   To be done later
-
diff --git a/Documentation/isdn/README.fax b/Documentation/isdn/README.fax
deleted file mode 100644
index 5314958a8a6e..000000000000
--- a/Documentation/isdn/README.fax
+++ /dev/null
@@ -1,45 +0,0 @@
-
-Fax with isdn4linux
-===================
-
-When enabled during kernel configuration, the tty emulator
-of the ISDN subsystem is capable of the Fax Class 2 commands.
-
-This only makes sense under the following conditions :
-
-- You need the commands as dummy, because you are using
-  hylafax (with patch) for AVM capi.
-- You want to use the fax capabilities of your isdn-card.
-  (supported cards are listed below)
-
-
-NOTE: This implementation does *not* support fax with passive
-      ISDN-cards (known as softfax). The low-level driver of
-      the ISDN-card and/or the card itself must support this.
-
-
-Supported ISDN-Cards
---------------------
-
-Eicon DIVA Server BRI/PCI
-	- full support with both B-channels.
-
-Eicon DIVA Server 4BRI/PCI
-	- full support with all B-channels.
-
-Eicon DIVA Server PRI/PCI
-	- full support on amount of B-channels
-		depending on DSPs on board.
-
-
-
-The command set is known as Class 2 (not Class 2.0) and
-can be activated by AT+FCLASS=2
-
-
-The interface between the link-level-module and the hardware-level driver
-is described in the files INTERFACE.fax and INTERFACE.
-
-Armin
-mac@melware.de
-
diff --git a/Documentation/isdn/README.gigaset b/Documentation/isdn/README.gigaset
index 9b1ce277ca3d..f6184b637182 100644
--- a/Documentation/isdn/README.gigaset
+++ b/Documentation/isdn/README.gigaset
@@ -48,9 +48,8 @@ GigaSet 307x Device Driver
 
 1.2. Software
      --------
-     The driver works with the Kernel CAPI subsystem as well as the old
-     ISDN4Linux subsystem, so it can be used with any software which is able
-     to use CAPI 2.0 or ISDN4Linux for ISDN connections (voice or data).
+     The driver works with the Kernel CAPI subsystem and can be used with any
+     software which is able to use CAPI 2.0 for ISDN connections (voice or data).
 
      There are some user space tools available at
      https://sourceforge.net/projects/gigaset307x/
@@ -92,7 +91,7 @@ GigaSet 307x Device Driver
 	gigaset	 	debug	   debug level (see section 3.2.)
 
 			startmode  initial operation mode (see section 2.5.):
-	bas_gigaset )		   1=ISDN4linux/CAPI (default), 0=Unimodem
+	bas_gigaset )		   1=CAPI (default), 0=Unimodem
 	ser_gigaset )
 	usb_gigaset )	cidmode    initial Call-ID mode setting (see section
 				   2.5.): 1=on (default), 0=off
@@ -154,18 +153,10 @@ GigaSet 307x Device Driver
 
 2.3. CAPI
      ----
-     If the driver is compiled with CAPI support (kernel configuration option
-     GIGASET_CAPI) the devices will show up as CAPI controllers as soon as the
-     corresponding driver module is loaded, and can then be used with CAPI 2.0
-     kernel and user space applications. For user space access, the module
-     capi.ko must be loaded.
-
-     Legacy ISDN4Linux applications are supported via the capidrv
-     compatibility driver. The kernel module capidrv.ko must be loaded
-     explicitly with the command
-        modprobe capidrv
-     if needed, and cannot be unloaded again without unloading the driver
-     first. (These are limitations of capidrv.)
+     The devices will show up as CAPI controllers as soon as the
+     corresponding driver module is loaded, and can then be used with
+     CAPI 2.0 kernel and user space applications. For user space access,
+     the module capi.ko must be loaded.
 
      Most distributions handle loading and unloading of the various CAPI
      modules automatically via the command capiinit(1) from the capi4k-utils
@@ -173,16 +164,6 @@ GigaSet 307x Device Driver
      Gigaset drivers because it doesn't support more than one module per
      driver.
 
-2.4. ISDN4Linux
-     ----------
-     If the driver is compiled without CAPI support (native ISDN4Linux
-     variant), it registers the device with the legacy ISDN4Linux subsystem
-     after loading the module. It can then be used with ISDN4Linux
-     applications only. Most distributions provide some configuration utility
-     for setting up that subsystem. Otherwise you can use some HOWTOs like
-         http://www.linuxhaven.de/dlhp/HOWTO/DE-ISDN-HOWTO-5.html
-
-
 2.5. Unimodem mode
      -------------
      In this mode the device works like a modem connected to a serial port
@@ -281,8 +262,7 @@ GigaSet 307x Device Driver
      number. Dialing "***" (three asterisks) calls all extensions
      simultaneously (global call).
 
-     This holds for both CAPI 2.0 and ISDN4Linux applications. Unimodem mode
-     does not support internal calls.
+     Unimodem mode does not support internal calls.
 
 2.8. Unregistered Wireless Devices (M101/M105)
      -----------------------------------------
diff --git a/Documentation/isdn/README.hfc-pci b/Documentation/isdn/README.hfc-pci
deleted file mode 100644
index e8a4ef0226e8..000000000000
--- a/Documentation/isdn/README.hfc-pci
+++ /dev/null
@@ -1,41 +0,0 @@
-The driver for the HFC-PCI and HFC-PCI-A chips from CCD may be used
-for many OEM cards using this chips.
-Additionally the driver has a special feature which makes it possible
-to read the echo-channel of the isdn bus. So all frames in both directions
-may be logged.
-When the echo logging feature is used the number of available B-channels
-for a HFC-PCI card is reduced to 1. Of course this is only relevant to
-the card, not to the isdn line.
-To activate the echo mode the following ioctls must be entered:
-
-hisaxctrl <driver/cardname> 10 1
-
-This reduces the available channels to 1. There must not be open connections
-through this card when entering the command.
-And then:
-
-hisaxctrl <driver/cardname> 12 1
-
-This enables the echo mode. If Hex logging is activated the isdnctrlx 
-devices show a output with a line beginning of HEX: for the providers
-exchange and ECHO: for isdn devices sending to the provider.
-
-If more than one HFC-PCI cards are installed, a specific card may be selected
-at the hisax module load command line. Supply the load command with the desired
-IO-address of the desired card. 
-Example:
-There tree cards installed in your machine at IO-base addresses 0xd000, 0xd400 
-and 0xdc00
-If you want to use the card at 0xd400 standalone you should supply the insmod
-or depmod with type=35 io=0xd400.
-If you want to use all three cards, but the order needs to be at 0xdc00,0xd400,
-0xd000 you may give the parameters type=35,35,35 io=0xdc00,0xd400,0xd00 
-Then the desired card will be the initialised in the desired order.
-If the io parameter is used the io addresses of all used cards should be 
-supplied else the parameter is assumed 0 and a auto search for a free card is
-invoked which may not give the wanted result. 
-
-Comments and reports to werner@isdn4linux.de or werner@isdn-development.de
-
-
-
diff --git a/Documentation/isdn/README.syncppp b/Documentation/isdn/README.syncppp
deleted file mode 100644
index 27d260095cce..000000000000
--- a/Documentation/isdn/README.syncppp
+++ /dev/null
@@ -1,58 +0,0 @@
-Some additional information for setting up a syncPPP
-connection using network interfaces.
----------------------------------------------------------------
-
-You need one thing beside the isdn4linux package:
-
-  a patched pppd .. (I called it ipppd to show the difference)
-
-Compiling isdn4linux with sync PPP:
------------------------------------
-To compile isdn4linux with the sync PPP part, you have
-to answer the appropriate question when doing a "make config"
-Don't forget to load the slhc.o
-module before the isdn.o module, if VJ-compression support
-is not compiled into your kernel. (e.g if you have no PPP or
-CSLIP in the kernel)
-
-Using isdn4linux with sync PPP:
--------------------------------
-Sync PPP is just another encapsulation for isdn4linux. The
-name to enable sync PPP encapsulation is 'syncppp' .. e.g:
-
-  /sbin/isdnctrl encap ippp0 syncppp
-
-The name of the interface is here 'ippp0'. You need 
-one interface with the name 'ippp0' to saturate the
-ipppd, which checks the ppp version via this interface.
-Currently, all devices must have the name ipppX where
-'X' is a decimal value.
-
-To set up a PPP connection you need the ipppd .. You must start 
-the ipppd once after installing the modules. The ipppd 
-communicates with the isdn4linux link-level driver using the
-/dev/ippp0 to /dev/ippp15 devices. One ipppd can handle
-all devices at once. If you want to use two PPP connections
-at the same time, you have to connect the ipppd to two
-devices .. and so on. 
-I've implemented one additional option for the ipppd:
- 'useifip' will get (if set to not 0.0.0.0) the IP address 
- for the negotiation from the attached network-interface. 
-(also: ipppd will try to negotiate pointopoint IP as remote IP)
-You must disable BSD-compression, this implementation can't
-handle compressed packets.
-
-Check the etc/rc.isdn.syncppp in the isdn4kernel-util package
-for an example setup script.
-
-To use the MPPP stuff, you must configure a slave device
-with isdn4linux. Now call the ipppd with the '+mp' option.
-To increase the number of links, you must use the
-'addlink' option of the isdnctrl tool. (rc.isdn.syncppp.MPPP is
-an example script)
-
-enjoy it,
-    michael
-     
-
-
diff --git a/Documentation/isdn/README.x25 b/Documentation/isdn/README.x25
deleted file mode 100644
index e561a77c4e22..000000000000
--- a/Documentation/isdn/README.x25
+++ /dev/null
@@ -1,184 +0,0 @@
-  
-X.25 support within isdn4linux
-==============================
-
-This is alpha/beta test code. Use it completely at your own risk.
-As new versions appear, the stuff described here might suddenly change
-or become invalid without notice.
-
-Keep in mind:
-
-You are using several new parts of the 2.2.x kernel series which
-have not been tested in a large scale. Therefore, you might encounter
-more bugs as usual.
-
-- If you connect to an X.25 neighbour not operated by yourself, ASK the
-  other side first. Be prepared that bugs in the protocol implementation
-  might result in problems.
-
-- This implementation has never wiped out my whole hard disk yet. But as
-  this is experimental code, don't blame me if that happened to you.
-  Backing up important data will never harm.
-
-- Monitor your isdn connections while using this software. This should
-  prevent you from undesired phone bills in case of driver problems.
-  
- 
-
-
-How to configure the kernel
-===========================
- 
-The ITU-T (former CCITT) X.25 network protocol layer has been implemented
-in the Linux source tree since version 2.1.16. The isdn subsystem might be 
-useful to run X.25 on top of ISDN. If you want to try it, select
-
-   "CCITT X.25 Packet Layer"
-
-from the networking options as well as
-
-   "ISDN Support" and "X.25 PLP on Top of ISDN"
-
-from the ISDN subsystem options when you configure your kernel for
-compilation. You currently also need to enable
-"Prompt for development and/or incomplete code/drivers" from the
-"Code maturity level options" menu. For the x25trace utility to work
-you also need to enable "Packet socket".
-
-For local testing it is also recommended to enable the isdnloop driver
-from the isdn subsystem's configuration menu.
-
-For testing, it is recommended that all isdn drivers and the X.25 PLP
-protocol are compiled as loadable modules. Like this, you can recover
-from certain errors by simply unloading and reloading the modules.
-
-
-
-What's it for? How to use it?
-=============================
-
-X.25 on top of isdn might be useful with two different scenarios:
-
-- You might want to access a public X.25 data network from your Linux box.
-  You can use i4l if you were physically connected to the X.25 switch
-  by an ISDN B-channel (leased line as well as dial up connection should
-  work).
-
-  This corresponds to ITU-T recommendation X.31 Case A (circuit-mode
-  access to PSPDN [packet switched public data network]).
-
-  NOTE: X.31 also covers a Case B (access to PSPDN via virtual
-  circuit / packet mode service). The latter mode (which in theory
-  also allows using the D-channel) is not supported by isdn4linux.
-  It should however be possible to establish such packet mode connections
-  with certain active isdn cards provided that the firmware supports X.31
-  and the driver exports this functionality to the user. Currently, 
-  the AVM B1 driver is the only driver which does so. (It should be
-  possible to access D-channel X.31 with active AVM cards using the
-  CAPI interface of the AVM-B1 driver).
-
-- Or you might want to operate certain ISDN teleservices on your linux
-  box. A lot of those teleservices run on top of the ISO-8208
-  (DTE-DTE mode) network layer protocol. ISO-8208 is essentially the
-  same as ITU-T X.25.
-
-  Popular candidates of such teleservices are EUROfile transfer or any
-  teleservice applying ITU-T recommendation T.90.
-
-To use the X.25 protocol on top of isdn, just create an isdn network
-interface as usual, configure your own and/or peer's ISDN numbers,
-and choose x25iface encapsulation by
-
-   isdnctrl encap <iface-name> x25iface.
-
-Once encap is set like this, the device can be used by the X.25 packet layer.
-
-All the stuff needed for X.25 is implemented inside the isdn link
-level (mainly isdn_net.c and some new source files). Thus, it should
-work with every existing HL driver. I was able to successfully open X.25
-connections on top of the isdnloop driver and the hisax driver.
-"x25iface"-encapsulation bypasses demand dialing. Dialing will be
-initiated when the upper (X.25 packet) layer requests the lapb datalink to
-be established. But hangup timeout is still active. Whenever a hangup
-occurs, all existing X.25 connections on that link will be cleared
-It is recommended to use sufficiently large hangup-timeouts for the
-isdn interfaces.
-
-
-In order to set up a conforming protocol stack you also need to
-specify the proper l2_prot parameter:
-
-To operate in ISO-8208  X.25 DTE-DTE mode, use
-
-   isdnctrl l2_prot <iface-name> x75i
-
-To access an X.25 network switch via isdn (your linux box is the DTE), use
-
-   isdnctrl l2_prot <iface-name> x25dte
-
-To mimic an X.25 network switch (DCE side of the connection), use
-
-   isdnctrl l2_prot <iface-name> x25dce
-
-However, x25dte or x25dce is currently not supported by any real HL
-level driver. The main difference between x75i and x25dte/dce is that
-x25d[tc]e uses fixed lap_b addresses. With x75i, the side which
-initiates the isdn connection uses the DTE's lap_b address while the
-called side used the DCE's lap_b address. Thus, l2_prot x75i might
-probably work if you access a public X.25 network as long as the
-corresponding isdn connection is set up by you. At least one test
-was successful to connect via isdn4linux to an X.25 switch using this
-trick. At the switch side, a terminal adapter X.21 was used to connect
-it to the isdn.
-
-
-How to set up a test installation?
-==================================
-
-To test X.25 on top of isdn, you need to get
-
-- a recent version of the "isdnctrl" program that supports setting the new
-  X.25 specific parameters.
-
-- the x25-utils-2.X package from 
-  ftp://ftp.hes.iki.fi/pub/ham/linux/ax25/x25utils-*
-  (don't confuse the x25-utils with the ax25-utils)
-
-- an application program that uses linux PF_X25 sockets (some are
-  contained in the x25-util package).
-
-Before compiling the user level utilities make sure that the compiler/
-preprocessor will fetch the proper kernel header files of this kernel
-source tree. Either make /usr/include/linux a symbolic link pointing to 
-this kernel's include/linux directory or set the appropriate compiler flags.
-
-When all drivers and interfaces are loaded and configured you need to
-ifconfig the network interfaces up and add X.25-routes to them. Use
-the usual ifconfig tool.
-
-ifconfig <iface-name> up
-
-But a special x25route tool (distributed with the x25-util package)
-is needed to set up X.25 routes. I.e. 
-
-x25route add 01 <iface-name>
-
-will cause all x.25 connections to the destination X.25-address
-"01" to be routed to your created isdn network interface.
-
-There are currently no real X.25 applications available. However, for
-tests, the x25-utils package contains a modified version of telnet
-and telnetd that uses X.25 sockets instead of tcp/ip sockets. You can
-use those for your first tests. Furthermore, you might check
-ftp://ftp.hamburg.pop.de/pub/LOCAL/linux/i4l-eft/ which contains some
-alpha-test implementation ("eftp4linux") of the EUROfile transfer
-protocol.
-
-The scripts distributed with the eftp4linux test releases might also
-provide useful examples for setting up X.25 on top of isdn.
-
-The x25-utility package also contains an x25trace tool that can be
-used to monitor X.25 packets received by the network interfaces.
-The /proc/net/x25* files also contain useful information. 
-
-- Henner
diff --git a/Documentation/isdn/syncPPP.FAQ b/Documentation/isdn/syncPPP.FAQ
deleted file mode 100644
index 3257a4bc0786..000000000000
--- a/Documentation/isdn/syncPPP.FAQ
+++ /dev/null
@@ -1,224 +0,0 @@
-simple isdn4linux PPP FAQ .. to be continued .. not 'debugged' 
--------------------------------------------------------------------
-
-Q01: what's pppd, ipppd, syncPPP, asyncPPP ??
-Q02: error message "this system lacks PPP support"
-Q03: strange information using 'ifconfig'
-Q04: MPPP?? What's that and how can I use it ...
-Q05: I tried MPPP but it doesn't work 
-Q06: can I use asynchronous PPP encapsulation with network devices
-Q07: A SunISDN machine can't connect to my i4l system
-Q08: I wanna talk to several machines, which need different configs
-Q09: Starting the ipppd, I get only error messages from i4l
-Q10: I wanna use dynamic IP address assignment 
-Q11: I can't connect. How can I check where the problem is.
-Q12: How can I reduce login delay? 
-
--------------------------------------------------------------------
-
-Q01: pppd, ipppd, syncPPP, asyncPPP .. what is that ?
-   what should I use?
-A: The pppd is for asynchronous PPP .. asynchronous means
-   here, the framing is character based. (e.g when
-   using ttyI* or tty* devices)
-
-   The ipppd handles PPP packets coming in HDLC
-   frames (bit based protocol) ... The PPP driver
-   in isdn4linux pushes all IP packets direct
-   to the network layer and all PPP protocol
-   frames to the /dev/ippp* device. 
-   So, the ipppd is a simple external network
-   protocol handler.
-
-   If you login into a remote machine using the
-   /dev/ttyI* devices and then enable PPP on the
-   remote terminal server -> use the 'old' pppd
-
-   If your remote side immediately starts to send
-   frames ... you probably connect to a 
-   syncPPP machine .. use the network device part
-   of isdn4linux with the 'syncppp' encapsulation
-   and make sure, that the ipppd is running and 
-   connected to at least one /dev/ippp*. Check the 
-   isdn4linux manual on how to configure a network device.
-
---
-
-Q02: when I start the ipppd .. I only get the
-   error message "this system lacks PPP support"
-A: check that at least the device 'ippp0' exists.
-   (you can check this e.g with the program 'ifconfig')
-   The ipppd NEEDS this device under THIS name .. 
-   If this device doesn't exists, use:
-	isdnctrl addif ippp0
-	isdnctrl encap ippp0 syncppp
-	... (see isdn4linux doc for more) ...
-A: Maybe you have compiled the ipppd with another
-   kernel source tree than the kernel you currently
-   run ... 
-
---
-
-Q03: when I list the netdevices with ifconfig I see, that
-   my ISDN interface has a HWaddr and IRQ=0 and Base 
-   address = 0 
-A: The device is a fake ethernet device .. ignore IRQ and baseaddr
-   You need the HWaddr only for ethernet encapsulation.
-   
---
-
-Q04: MPPP?? What's that and how can I use it ...
-
-A: MPPP or MP or MPP (Warning: MP is also an 
-   acronym for 'Multi Processor') stands for
-   Multi Point to Point and means bundling
-   of several channels to one logical stream.
-   To enable MPPP negotiation you must call the
-   ipppd with the '+mp' option. 
-   You must also configure a slave device for
-   every additional channel. (see the i4l manual
-   for more)
-   To use channel bundling you must first activate
-   the 'master' or initial call. Now you can add 
-   the slave channels with the command:
-       isdnctrl addlink <device>
-   e.g:
-       isdnctrl addlink ippp0
-   This is different from other encapsulations of
-   isdn4linux! With syncPPP, there is no automatic
-   activation of slave devices.
-
---
-
-Q05: I tried MPPP but it doesn't work .. the ipppd
-   writes in the debug log something like:
-   .. rcvd [0][proto=0x3d] c0 00 00 00 80 fd 01 01 00 0a ...
-   .. sent [0][LCP ProtRej id=0x2 00 3d c0 00 00 00 80 fd 01 ...
-
-A: you forgot to compile MPPP/RFC1717 support into the
-   ISDN Subsystem. Recompile with this option enabled.
-
---
-
-Q06: can I use asynchronous PPP encapsulation
-   over the network interface of isdn4linux ..
-
-A: No .. that's not possible .. Use the standard
-   PPP package over the /dev/ttyI* devices. You
-   must not use the ipppd for this.
-   
---
-
-Q07: A SunISDN machine tries to connect my i4l system,
-   which doesn't work.
-   Checking the debug log I just saw garbage like:
-!![ ... fill in the line ... ]!!
-
-A: The Sun tries to talk asynchronous PPP ... i4l
-   can't understand this ... try to use the ttyI*
-   devices with the standard PPP/pppd package
-
-A: (from Alexanter Strauss: )
-!![ ... fill in mail ]!!
-
---
-
-Q08: I wanna talk to remote machines, which need
-   a different configuration. The only way
-   I found to do this is to kill the ipppd and
-   start a new one with another config to connect
-   to the second machine. 
-
-A: you must bind a network interface explicitly to
-   an ippp device, where you can connect a (for this
-   interface) individually configured ipppd.
-
---
-
-Q09: When I start the ipppd I only get error messages
-   from the i4l driver .. 
-
-A: When starting, the ipppd calls functions which may 
-   trigger a network packet. (e.g gethostbyname()).
-   Without the ipppd (at this moment, it is not
-   fully started) we can't handle this network request.
-   Try to configure hostnames necessary for the ipppd
-   in your local /etc/hosts file or in a way, that
-   your system can resolve it without using an
-   isdn/ippp network-interface.
-
---
-
-Q10: I wanna use dynamic IP address assignment ... How 
-   must I configure the network device.
-
-A: At least you must have a route which forwards
-   a packet to the ippp network-interface to trigger
-   the dial-on-demand.
-   A default route to the ippp-interface will work.
-   Now you must choose a dummy IP address for your
-   interface.
-   If for some reason you can't set the default
-   route to the ippp interface, you may take any 
-   address of the subnet from which you expect your
-   dynamic IP number and set a 'network route' for
-   this subnet to the ippp interface.
-   To allow overriding of the dummy address you
-   must call the ipppd with the 'ipcp-accept-local' option.
-
-A: You must know, how the ipppd gets the addresses it wanna
-   configure. If you don't give any option, the ipppd
-   tries to negotiate the local host address!
-   With the option 'noipdefault' it requests an address
-   from the remote machine. With 'useifip' it gets the
-   addresses from the net interface. Or you set the address
-   on the option line with the <a.b.c.d:e.f.g.h> option.
-   Note: the IP address of the remote machine must be configured
-   locally or the remote machine must send it in an IPCP request.
-   If your side doesn't know the IP address after negotiation, it
-   closes the connection!
-   You must allow overriding of address with the 'ipcp-accept-*'
-   options, if you have set your own or the remote address 
-   explicitly.
-
-A: Maybe you try these options .. e.g:   
-
-    /sbin/ipppd :$REMOTE noipdefault /dev/ippp0
-
-   where REMOTE must be the address of the remote machine (the
-   machine, which gives you your address)
-
---
-
-Q11: I can't connect. How can I check where the problem is.
-
-A: A good help log is the debug output from the ipppd...
-   Check whether you can find there:
-   - only a few LCP-conf-req SENT messages (less then 10)
-     and then a Term-REQ:
-     -> check whether your ISDN card is well configured
-        it seems, that your machine doesn't dial
-        (IRQ,IO,Proto, etc problems)
-        Configure your ISDN card to print debug messages and
-        check the /dev/isdnctrl output next time. There
-        you can see, whether there is activity on the card/line.
-   - there are at least a few RECV messages in the log:
-     -> fine: your card is dialing and your remote machine
-        tries to talk with you. Maybe only a missing 
-        authentication. Check your ipppd configuration again.
-   - the ipppd exits for some reason:
-     -> not good ... check /var/adm/syslog and /var/adm/daemon.
-        Could be a bug in the ipppd.
-
---
-
-Q12: How can I reduce login delay?
-
-A: Log a login session ('debug' log) and check which options 
-  your remote side rejects. Next time configure your ipppd
-  to not negotiate these options. Another 'side effect' is, that
-  this increases redundancy. (e.g your remote side is buggy and
-  rejects options in a wrong way).
-
-
-
diff --git a/Documentation/kbuild/headers_install.rst b/Documentation/kbuild/headers_install.rst
new file mode 100644
index 000000000000..f6c6b74a609c
--- /dev/null
+++ b/Documentation/kbuild/headers_install.rst
@@ -0,0 +1,44 @@
+=============================================
+Exporting kernel headers for use by userspace
+=============================================
+
+The "make headers_install" command exports the kernel's header files in a
+form suitable for use by userspace programs.
+
+The linux kernel's exported header files describe the API for user space
+programs attempting to use kernel services.  These kernel header files are
+used by the system's C library (such as glibc or uClibc) to define available
+system calls, as well as constants and structures to be used with these
+system calls.  The C library's header files include the kernel header files
+from the "linux" subdirectory.  The system's libc headers are usually
+installed at the default location /usr/include and the kernel headers in
+subdirectories under that (most notably /usr/include/linux and
+/usr/include/asm).
+
+Kernel headers are backwards compatible, but not forwards compatible.  This
+means that a program built against a C library using older kernel headers
+should run on a newer kernel (although it may not have access to new
+features), but a program built against newer kernel headers may not work on an
+older kernel.
+
+The "make headers_install" command can be run in the top level directory of the
+kernel source code (or using a standard out-of-tree build).  It takes two
+optional arguments::
+
+  make headers_install ARCH=i386 INSTALL_HDR_PATH=/usr
+
+ARCH indicates which architecture to produce headers for, and defaults to the
+current architecture.  The linux/asm directory of the exported kernel headers
+is platform-specific, to see a complete list of supported architectures use
+the command::
+
+  ls -d include/asm-* | sed 's/.*-//'
+
+INSTALL_HDR_PATH indicates where to install the headers. It defaults to
+"./usr".
+
+An 'include' directory is automatically created inside INSTALL_HDR_PATH and
+headers are installed in 'INSTALL_HDR_PATH/include'.
+
+The kernel header export infrastructure is maintained by David Woodhouse
+<dwmw2@infradead.org>.
diff --git a/Documentation/kbuild/headers_install.txt b/Documentation/kbuild/headers_install.txt
deleted file mode 100644
index f0153adb95e2..000000000000
--- a/Documentation/kbuild/headers_install.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-Exporting kernel headers for use by userspace
-=============================================
-
-The "make headers_install" command exports the kernel's header files in a
-form suitable for use by userspace programs.
-
-The linux kernel's exported header files describe the API for user space
-programs attempting to use kernel services.  These kernel header files are
-used by the system's C library (such as glibc or uClibc) to define available
-system calls, as well as constants and structures to be used with these
-system calls.  The C library's header files include the kernel header files
-from the "linux" subdirectory.  The system's libc headers are usually
-installed at the default location /usr/include and the kernel headers in
-subdirectories under that (most notably /usr/include/linux and
-/usr/include/asm).
-
-Kernel headers are backwards compatible, but not forwards compatible.  This
-means that a program built against a C library using older kernel headers
-should run on a newer kernel (although it may not have access to new
-features), but a program built against newer kernel headers may not work on an
-older kernel.
-
-The "make headers_install" command can be run in the top level directory of the
-kernel source code (or using a standard out-of-tree build).  It takes two
-optional arguments:
-
-  make headers_install ARCH=i386 INSTALL_HDR_PATH=/usr
-
-ARCH indicates which architecture to produce headers for, and defaults to the
-current architecture.  The linux/asm directory of the exported kernel headers
-is platform-specific, to see a complete list of supported architectures use
-the command:
-
-  ls -d include/asm-* | sed 's/.*-//'
-
-INSTALL_HDR_PATH indicates where to install the headers. It defaults to
-"./usr".
-
-An 'include' directory is automatically created inside INSTALL_HDR_PATH and
-headers are installed in 'INSTALL_HDR_PATH/include'.
-
-The command "make headers_install_all" exports headers for all architectures
-simultaneously.  (This is mostly of interest to distribution maintainers,
-who create an architecture-independent tarball from the resulting include
-directory.)  You also can use HDR_ARCH_LIST to specify list of architectures.
-Remember to provide the appropriate linux/asm directory via "mv" or "ln -s"
-before building a C library with headers exported this way.
-
-The kernel header export infrastructure is maintained by David Woodhouse
-<dwmw2@infradead.org>.
diff --git a/Documentation/kbuild/index.rst b/Documentation/kbuild/index.rst
new file mode 100644
index 000000000000..e323a3f2cc81
--- /dev/null
+++ b/Documentation/kbuild/index.rst
@@ -0,0 +1,27 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================
+Kernel Build System
+===================
+
+.. toctree::
+    :maxdepth: 1
+
+    kconfig-language
+    kconfig-macro-language
+
+    kbuild
+    kconfig
+    makefiles
+    modules
+
+    headers_install
+
+    issues
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/kbuild/issues.rst b/Documentation/kbuild/issues.rst
new file mode 100644
index 000000000000..bdab01f733f6
--- /dev/null
+++ b/Documentation/kbuild/issues.rst
@@ -0,0 +1,15 @@
+================
+Recursion issues
+================
+
+issue #1
+--------
+
+.. literalinclude:: Kconfig.recursion-issue-01
+   :language: kconfig
+
+issue #2
+--------
+
+.. literalinclude:: Kconfig.recursion-issue-02
+   :language: kconfig
diff --git a/Documentation/kbuild/kbuild.rst b/Documentation/kbuild/kbuild.rst
new file mode 100644
index 000000000000..61b2181ed3ea
--- /dev/null
+++ b/Documentation/kbuild/kbuild.rst
@@ -0,0 +1,274 @@
+======
+Kbuild
+======
+
+
+Output files
+============
+
+modules.order
+-------------
+This file records the order in which modules appear in Makefiles. This
+is used by modprobe to deterministically resolve aliases that match
+multiple modules.
+
+modules.builtin
+---------------
+This file lists all modules that are built into the kernel. This is used
+by modprobe to not fail when trying to load something builtin.
+
+modules.builtin.modinfo
+-----------------------
+This file contains modinfo from all modules that are built into the kernel.
+Unlike modinfo of a separate module, all fields are prefixed with module name.
+
+
+Environment variables
+=====================
+
+KCPPFLAGS
+---------
+Additional options to pass when preprocessing. The preprocessing options
+will be used in all cases where kbuild does preprocessing including
+building C files and assembler files.
+
+KAFLAGS
+-------
+Additional options to the assembler (for built-in and modules).
+
+AFLAGS_MODULE
+-------------
+Additional assembler options for modules.
+
+AFLAGS_KERNEL
+-------------
+Additional assembler options for built-in.
+
+KCFLAGS
+-------
+Additional options to the C compiler (for built-in and modules).
+
+CFLAGS_KERNEL
+-------------
+Additional options for $(CC) when used to compile
+code that is compiled as built-in.
+
+CFLAGS_MODULE
+-------------
+Additional module specific options to use for $(CC).
+
+LDFLAGS_MODULE
+--------------
+Additional options used for $(LD) when linking modules.
+
+HOSTCFLAGS
+----------
+Additional flags to be passed to $(HOSTCC) when building host programs.
+
+HOSTCXXFLAGS
+------------
+Additional flags to be passed to $(HOSTCXX) when building host programs.
+
+HOSTLDFLAGS
+-----------
+Additional flags to be passed when linking host programs.
+
+HOSTLDLIBS
+----------
+Additional libraries to link against when building host programs.
+
+KBUILD_KCONFIG
+--------------
+Set the top-level Kconfig file to the value of this environment
+variable.  The default name is "Kconfig".
+
+KBUILD_VERBOSE
+--------------
+Set the kbuild verbosity. Can be assigned same values as "V=...".
+
+See make help for the full list.
+
+Setting "V=..." takes precedence over KBUILD_VERBOSE.
+
+KBUILD_EXTMOD
+-------------
+Set the directory to look for the kernel source when building external
+modules.
+
+Setting "M=..." takes precedence over KBUILD_EXTMOD.
+
+KBUILD_OUTPUT
+-------------
+Specify the output directory when building the kernel.
+
+The output directory can also be specified using "O=...".
+
+Setting "O=..." takes precedence over KBUILD_OUTPUT.
+
+KBUILD_DEBARCH
+--------------
+For the deb-pkg target, allows overriding the normal heuristics deployed by
+deb-pkg. Normally deb-pkg attempts to guess the right architecture based on
+the UTS_MACHINE variable, and on some architectures also the kernel config.
+The value of KBUILD_DEBARCH is assumed (not checked) to be a valid Debian
+architecture.
+
+ARCH
+----
+Set ARCH to the architecture to be built.
+
+In most cases the name of the architecture is the same as the
+directory name found in the arch/ directory.
+
+But some architectures such as x86 and sparc have aliases.
+
+- x86: i386 for 32 bit, x86_64 for 64 bit
+- sh: sh for 32 bit, sh64 for 64 bit
+- sparc: sparc32 for 32 bit, sparc64 for 64 bit
+
+CROSS_COMPILE
+-------------
+Specify an optional fixed part of the binutils filename.
+CROSS_COMPILE can be a part of the filename or the full path.
+
+CROSS_COMPILE is also used for ccache in some setups.
+
+CF
+--
+Additional options for sparse.
+
+CF is often used on the command-line like this::
+
+    make CF=-Wbitwise C=2
+
+INSTALL_PATH
+------------
+INSTALL_PATH specifies where to place the updated kernel and system map
+images. Default is /boot, but you can set it to other values.
+
+INSTALLKERNEL
+-------------
+Install script called when using "make install".
+The default name is "installkernel".
+
+The script will be called with the following arguments:
+
+   - $1 - kernel version
+   - $2 - kernel image file
+   - $3 - kernel map file
+   - $4 - default install path (use root directory if blank)
+
+The implementation of "make install" is architecture specific
+and it may differ from the above.
+
+INSTALLKERNEL is provided to enable the possibility to
+specify a custom installer when cross compiling a kernel.
+
+MODLIB
+------
+Specify where to install modules.
+The default value is::
+
+     $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE)
+
+The value can be overridden in which case the default value is ignored.
+
+INSTALL_MOD_PATH
+----------------
+INSTALL_MOD_PATH specifies a prefix to MODLIB for module directory
+relocations required by build roots.  This is not defined in the
+makefile but the argument can be passed to make if needed.
+
+INSTALL_MOD_STRIP
+-----------------
+INSTALL_MOD_STRIP, if defined, will cause modules to be
+stripped after they are installed.  If INSTALL_MOD_STRIP is '1', then
+the default option --strip-debug will be used.  Otherwise,
+INSTALL_MOD_STRIP value will be used as the options to the strip command.
+
+INSTALL_HDR_PATH
+----------------
+INSTALL_HDR_PATH specifies where to install user space headers when
+executing "make headers_*".
+
+The default value is::
+
+    $(objtree)/usr
+
+$(objtree) is the directory where output files are saved.
+The output directory is often set using "O=..." on the commandline.
+
+The value can be overridden in which case the default value is ignored.
+
+KBUILD_ABS_SRCTREE
+--------------------------------------------------
+Kbuild uses a relative path to point to the tree when possible. For instance,
+when building in the source tree, the source tree path is '.'
+
+Setting this flag requests Kbuild to use absolute path to the source tree.
+There are some useful cases to do so, like when generating tag files with
+absolute path entries etc.
+
+KBUILD_SIGN_PIN
+---------------
+This variable allows a passphrase or PIN to be passed to the sign-file
+utility when signing kernel modules, if the private key requires such.
+
+KBUILD_MODPOST_WARN
+-------------------
+KBUILD_MODPOST_WARN can be set to avoid errors in case of undefined
+symbols in the final module linking stage. It changes such errors
+into warnings.
+
+KBUILD_MODPOST_NOFINAL
+----------------------
+KBUILD_MODPOST_NOFINAL can be set to skip the final link of modules.
+This is solely useful to speed up test compiles.
+
+KBUILD_EXTRA_SYMBOLS
+--------------------
+For modules that use symbols from other modules.
+See more details in modules.txt.
+
+ALLSOURCE_ARCHS
+---------------
+For tags/TAGS/cscope targets, you can specify more than one arch
+to be included in the databases, separated by blank space. E.g.::
+
+    $ make ALLSOURCE_ARCHS="x86 mips arm" tags
+
+To get all available archs you can also specify all. E.g.::
+
+    $ make ALLSOURCE_ARCHS=all tags
+
+KBUILD_ENABLE_EXTRA_GCC_CHECKS
+------------------------------
+If enabled over the make command line with "W=1", it turns on additional
+gcc -W... options for more extensive build-time checking.
+
+KBUILD_BUILD_TIMESTAMP
+----------------------
+Setting this to a date string overrides the timestamp used in the
+UTS_VERSION definition (uname -v in the running kernel). The value has to
+be a string that can be passed to date -d. The default value
+is the output of the date command at one point during build.
+
+KBUILD_BUILD_USER, KBUILD_BUILD_HOST
+------------------------------------
+These two variables allow to override the user@host string displayed during
+boot and in /proc/version. The default value is the output of the commands
+whoami and host, respectively.
+
+KBUILD_LDS
+----------
+The linker script with full path. Assigned by the top-level Makefile.
+
+KBUILD_VMLINUX_OBJS
+-------------------
+All object files for vmlinux. They are linked to vmlinux in the same
+order as listed in KBUILD_VMLINUX_OBJS.
+
+KBUILD_VMLINUX_LIBS
+-------------------
+All .a "lib" files for vmlinux. KBUILD_VMLINUX_OBJS and KBUILD_VMLINUX_LIBS
+together specify all the object files used to link vmlinux.
diff --git a/Documentation/kbuild/kbuild.txt b/Documentation/kbuild/kbuild.txt
deleted file mode 100644
index 9c230ea71963..000000000000
--- a/Documentation/kbuild/kbuild.txt
+++ /dev/null
@@ -1,248 +0,0 @@
-Output files
-
-modules.order
---------------------------------------------------
-This file records the order in which modules appear in Makefiles. This
-is used by modprobe to deterministically resolve aliases that match
-multiple modules.
-
-modules.builtin
---------------------------------------------------
-This file lists all modules that are built into the kernel. This is used
-by modprobe to not fail when trying to load something builtin.
-
-modules.builtin.modinfo
---------------------------------------------------
-This file contains modinfo from all modules that are built into the kernel.
-Unlike modinfo of a separate module, all fields are prefixed with module name.
-
-
-Environment variables
-
-KCPPFLAGS
---------------------------------------------------
-Additional options to pass when preprocessing. The preprocessing options
-will be used in all cases where kbuild does preprocessing including
-building C files and assembler files.
-
-KAFLAGS
---------------------------------------------------
-Additional options to the assembler (for built-in and modules).
-
-AFLAGS_MODULE
---------------------------------------------------
-Additional module specific options to use for $(AS).
-
-AFLAGS_KERNEL
---------------------------------------------------
-Additional options for $(AS) when used for assembler
-code for code that is compiled as built-in.
-
-KCFLAGS
---------------------------------------------------
-Additional options to the C compiler (for built-in and modules).
-
-CFLAGS_KERNEL
---------------------------------------------------
-Additional options for $(CC) when used to compile
-code that is compiled as built-in.
-
-CFLAGS_MODULE
---------------------------------------------------
-Additional module specific options to use for $(CC).
-
-LDFLAGS_MODULE
---------------------------------------------------
-Additional options used for $(LD) when linking modules.
-
-HOSTCFLAGS
---------------------------------------------------
-Additional flags to be passed to $(HOSTCC) when building host programs.
-
-HOSTCXXFLAGS
---------------------------------------------------
-Additional flags to be passed to $(HOSTCXX) when building host programs.
-
-HOSTLDFLAGS
---------------------------------------------------
-Additional flags to be passed when linking host programs.
-
-HOSTLDLIBS
---------------------------------------------------
-Additional libraries to link against when building host programs.
-
-KBUILD_KCONFIG
---------------------------------------------------
-Set the top-level Kconfig file to the value of this environment
-variable.  The default name is "Kconfig".
-
-KBUILD_VERBOSE
---------------------------------------------------
-Set the kbuild verbosity. Can be assigned same values as "V=...".
-See make help for the full list.
-Setting "V=..." takes precedence over KBUILD_VERBOSE.
-
-KBUILD_EXTMOD
---------------------------------------------------
-Set the directory to look for the kernel source when building external
-modules.
-Setting "M=..." takes precedence over KBUILD_EXTMOD.
-
-KBUILD_OUTPUT
---------------------------------------------------
-Specify the output directory when building the kernel.
-The output directory can also be specified using "O=...".
-Setting "O=..." takes precedence over KBUILD_OUTPUT.
-
-KBUILD_DEBARCH
---------------------------------------------------
-For the deb-pkg target, allows overriding the normal heuristics deployed by
-deb-pkg. Normally deb-pkg attempts to guess the right architecture based on
-the UTS_MACHINE variable, and on some architectures also the kernel config.
-The value of KBUILD_DEBARCH is assumed (not checked) to be a valid Debian
-architecture.
-
-ARCH
---------------------------------------------------
-Set ARCH to the architecture to be built.
-In most cases the name of the architecture is the same as the
-directory name found in the arch/ directory.
-But some architectures such as x86 and sparc have aliases.
-x86: i386 for 32 bit, x86_64 for 64 bit
-sh: sh for 32 bit, sh64 for 64 bit
-sparc: sparc32 for 32 bit, sparc64 for 64 bit
-
-CROSS_COMPILE
---------------------------------------------------
-Specify an optional fixed part of the binutils filename.
-CROSS_COMPILE can be a part of the filename or the full path.
-
-CROSS_COMPILE is also used for ccache in some setups.
-
-CF
---------------------------------------------------
-Additional options for sparse.
-CF is often used on the command-line like this:
-
-    make CF=-Wbitwise C=2
-
-INSTALL_PATH
---------------------------------------------------
-INSTALL_PATH specifies where to place the updated kernel and system map
-images. Default is /boot, but you can set it to other values.
-
-INSTALLKERNEL
---------------------------------------------------
-Install script called when using "make install".
-The default name is "installkernel".
-
-The script will be called with the following arguments:
-    $1 - kernel version
-    $2 - kernel image file
-    $3 - kernel map file
-    $4 - default install path (use root directory if blank)
-
-The implementation of "make install" is architecture specific
-and it may differ from the above.
-
-INSTALLKERNEL is provided to enable the possibility to
-specify a custom installer when cross compiling a kernel.
-
-MODLIB
---------------------------------------------------
-Specify where to install modules.
-The default value is:
-
-     $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE)
-
-The value can be overridden in which case the default value is ignored.
-
-INSTALL_MOD_PATH
---------------------------------------------------
-INSTALL_MOD_PATH specifies a prefix to MODLIB for module directory
-relocations required by build roots.  This is not defined in the
-makefile but the argument can be passed to make if needed.
-
-INSTALL_MOD_STRIP
---------------------------------------------------
-INSTALL_MOD_STRIP, if defined, will cause modules to be
-stripped after they are installed.  If INSTALL_MOD_STRIP is '1', then
-the default option --strip-debug will be used.  Otherwise,
-INSTALL_MOD_STRIP value will be used as the options to the strip command.
-
-INSTALL_HDR_PATH
---------------------------------------------------
-INSTALL_HDR_PATH specifies where to install user space headers when
-executing "make headers_*".
-The default value is:
-
-    $(objtree)/usr
-
-$(objtree) is the directory where output files are saved.
-The output directory is often set using "O=..." on the commandline.
-
-The value can be overridden in which case the default value is ignored.
-
-KBUILD_SIGN_PIN
---------------------------------------------------
-This variable allows a passphrase or PIN to be passed to the sign-file
-utility when signing kernel modules, if the private key requires such.
-
-KBUILD_MODPOST_WARN
---------------------------------------------------
-KBUILD_MODPOST_WARN can be set to avoid errors in case of undefined
-symbols in the final module linking stage. It changes such errors
-into warnings.
-
-KBUILD_MODPOST_NOFINAL
---------------------------------------------------
-KBUILD_MODPOST_NOFINAL can be set to skip the final link of modules.
-This is solely useful to speed up test compiles.
-
-KBUILD_EXTRA_SYMBOLS
---------------------------------------------------
-For modules that use symbols from other modules.
-See more details in modules.txt.
-
-ALLSOURCE_ARCHS
---------------------------------------------------
-For tags/TAGS/cscope targets, you can specify more than one arch
-to be included in the databases, separated by blank space. E.g.:
-
-    $ make ALLSOURCE_ARCHS="x86 mips arm" tags
-
-To get all available archs you can also specify all. E.g.:
-
-    $ make ALLSOURCE_ARCHS=all tags
-
-KBUILD_ENABLE_EXTRA_GCC_CHECKS
---------------------------------------------------
-If enabled over the make command line with "W=1", it turns on additional
-gcc -W... options for more extensive build-time checking.
-
-KBUILD_BUILD_TIMESTAMP
---------------------------------------------------
-Setting this to a date string overrides the timestamp used in the
-UTS_VERSION definition (uname -v in the running kernel). The value has to
-be a string that can be passed to date -d. The default value
-is the output of the date command at one point during build.
-
-KBUILD_BUILD_USER, KBUILD_BUILD_HOST
---------------------------------------------------
-These two variables allow to override the user@host string displayed during
-boot and in /proc/version. The default value is the output of the commands
-whoami and host, respectively.
-
-KBUILD_LDS
---------------------------------------------------
-The linker script with full path. Assigned by the top-level Makefile.
-
-KBUILD_VMLINUX_OBJS
---------------------------------------------------
-All object files for vmlinux. They are linked to vmlinux in the same
-order as listed in KBUILD_VMLINUX_OBJS.
-
-KBUILD_VMLINUX_LIBS
---------------------------------------------------
-All .a "lib" files for vmlinux. KBUILD_VMLINUX_OBJS and KBUILD_VMLINUX_LIBS
-together specify all the object files used to link vmlinux.
diff --git a/Documentation/kbuild/kconfig-language.rst b/Documentation/kbuild/kconfig-language.rst
new file mode 100644
index 000000000000..74bef19f69f0
--- /dev/null
+++ b/Documentation/kbuild/kconfig-language.rst
@@ -0,0 +1,701 @@
+================
+Kconfig Language
+================
+
+Introduction
+------------
+
+The configuration database is a collection of configuration options
+organized in a tree structure::
+
+	+- Code maturity level options
+	|  +- Prompt for development and/or incomplete code/drivers
+	+- General setup
+	|  +- Networking support
+	|  +- System V IPC
+	|  +- BSD Process Accounting
+	|  +- Sysctl support
+	+- Loadable module support
+	|  +- Enable loadable module support
+	|     +- Set version information on all module symbols
+	|     +- Kernel module loader
+	+- ...
+
+Every entry has its own dependencies. These dependencies are used
+to determine the visibility of an entry. Any child entry is only
+visible if its parent entry is also visible.
+
+Menu entries
+------------
+
+Most entries define a config option; all other entries help to organize
+them. A single configuration option is defined like this::
+
+  config MODVERSIONS
+	bool "Set version information on all module symbols"
+	depends on MODULES
+	help
+	  Usually, modules have to be recompiled whenever you switch to a new
+	  kernel.  ...
+
+Every line starts with a key word and can be followed by multiple
+arguments.  "config" starts a new config entry. The following lines
+define attributes for this config option. Attributes can be the type of
+the config option, input prompt, dependencies, help text and default
+values. A config option can be defined multiple times with the same
+name, but every definition can have only a single input prompt and the
+type must not conflict.
+
+Menu attributes
+---------------
+
+A menu entry can have a number of attributes. Not all of them are
+applicable everywhere (see syntax).
+
+- type definition: "bool"/"tristate"/"string"/"hex"/"int"
+
+  Every config option must have a type. There are only two basic types:
+  tristate and string; the other types are based on these two. The type
+  definition optionally accepts an input prompt, so these two examples
+  are equivalent::
+
+	bool "Networking support"
+
+  and::
+
+	bool
+	prompt "Networking support"
+
+- input prompt: "prompt" <prompt> ["if" <expr>]
+
+  Every menu entry can have at most one prompt, which is used to display
+  to the user. Optionally dependencies only for this prompt can be added
+  with "if".
+
+- default value: "default" <expr> ["if" <expr>]
+
+  A config option can have any number of default values. If multiple
+  default values are visible, only the first defined one is active.
+  Default values are not limited to the menu entry where they are
+  defined. This means the default can be defined somewhere else or be
+  overridden by an earlier definition.
+  The default value is only assigned to the config symbol if no other
+  value was set by the user (via the input prompt above). If an input
+  prompt is visible the default value is presented to the user and can
+  be overridden by him.
+  Optionally, dependencies only for this default value can be added with
+  "if".
+
+ The default value deliberately defaults to 'n' in order to avoid bloating the
+ build. With few exceptions, new config options should not change this. The
+ intent is for "make oldconfig" to add as little as possible to the config from
+ release to release.
+
+ Note:
+	Things that merit "default y/m" include:
+
+	a) A new Kconfig option for something that used to always be built
+	   should be "default y".
+
+	b) A new gatekeeping Kconfig option that hides/shows other Kconfig
+	   options (but does not generate any code of its own), should be
+	   "default y" so people will see those other options.
+
+	c) Sub-driver behavior or similar options for a driver that is
+	   "default n". This allows you to provide sane defaults.
+
+	d) Hardware or infrastructure that everybody expects, such as CONFIG_NET
+	   or CONFIG_BLOCK. These are rare exceptions.
+
+- type definition + default value::
+
+	"def_bool"/"def_tristate" <expr> ["if" <expr>]
+
+  This is a shorthand notation for a type definition plus a value.
+  Optionally dependencies for this default value can be added with "if".
+
+- dependencies: "depends on" <expr>
+
+  This defines a dependency for this menu entry. If multiple
+  dependencies are defined, they are connected with '&&'. Dependencies
+  are applied to all other options within this menu entry (which also
+  accept an "if" expression), so these two examples are equivalent::
+
+	bool "foo" if BAR
+	default y if BAR
+
+  and::
+
+	depends on BAR
+	bool "foo"
+	default y
+
+- reverse dependencies: "select" <symbol> ["if" <expr>]
+
+  While normal dependencies reduce the upper limit of a symbol (see
+  below), reverse dependencies can be used to force a lower limit of
+  another symbol. The value of the current menu symbol is used as the
+  minimal value <symbol> can be set to. If <symbol> is selected multiple
+  times, the limit is set to the largest selection.
+  Reverse dependencies can only be used with boolean or tristate
+  symbols.
+
+  Note:
+	select should be used with care. select will force
+	a symbol to a value without visiting the dependencies.
+	By abusing select you are able to select a symbol FOO even
+	if FOO depends on BAR that is not set.
+	In general use select only for non-visible symbols
+	(no prompts anywhere) and for symbols with no dependencies.
+	That will limit the usefulness but on the other hand avoid
+	the illegal configurations all over.
+
+- weak reverse dependencies: "imply" <symbol> ["if" <expr>]
+
+  This is similar to "select" as it enforces a lower limit on another
+  symbol except that the "implied" symbol's value may still be set to n
+  from a direct dependency or with a visible prompt.
+
+  Given the following example::
+
+    config FOO
+	tristate
+	imply BAZ
+
+    config BAZ
+	tristate
+	depends on BAR
+
+  The following values are possible:
+
+	===		===		=============	==============
+	FOO		BAR		BAZ's default	choice for BAZ
+	===		===		=============	==============
+	n		y		n		N/m/y
+	m		y		m		M/y/n
+	y		y		y		Y/n
+	y		n		*		N
+	===		===		=============	==============
+
+  This is useful e.g. with multiple drivers that want to indicate their
+  ability to hook into a secondary subsystem while allowing the user to
+  configure that subsystem out without also having to unset these drivers.
+
+- limiting menu display: "visible if" <expr>
+
+  This attribute is only applicable to menu blocks, if the condition is
+  false, the menu block is not displayed to the user (the symbols
+  contained there can still be selected by other symbols, though). It is
+  similar to a conditional "prompt" attribute for individual menu
+  entries. Default value of "visible" is true.
+
+- numerical ranges: "range" <symbol> <symbol> ["if" <expr>]
+
+  This allows to limit the range of possible input values for int
+  and hex symbols. The user can only input a value which is larger than
+  or equal to the first symbol and smaller than or equal to the second
+  symbol.
+
+- help text: "help" or "---help---"
+
+  This defines a help text. The end of the help text is determined by
+  the indentation level, this means it ends at the first line which has
+  a smaller indentation than the first line of the help text.
+  "---help---" and "help" do not differ in behaviour, "---help---" is
+  used to help visually separate configuration logic from help within
+  the file as an aid to developers.
+
+- misc options: "option" <symbol>[=<value>]
+
+  Various less common options can be defined via this option syntax,
+  which can modify the behaviour of the menu entry and its config
+  symbol. These options are currently possible:
+
+  - "defconfig_list"
+    This declares a list of default entries which can be used when
+    looking for the default configuration (which is used when the main
+    .config doesn't exists yet.)
+
+  - "modules"
+    This declares the symbol to be used as the MODULES symbol, which
+    enables the third modular state for all config symbols.
+    At most one symbol may have the "modules" option set.
+
+  - "allnoconfig_y"
+    This declares the symbol as one that should have the value y when
+    using "allnoconfig". Used for symbols that hide other symbols.
+
+Menu dependencies
+-----------------
+
+Dependencies define the visibility of a menu entry and can also reduce
+the input range of tristate symbols. The tristate logic used in the
+expressions uses one more state than normal boolean logic to express the
+module state. Dependency expressions have the following syntax::
+
+  <expr> ::= <symbol>                           (1)
+           <symbol> '=' <symbol>                (2)
+           <symbol> '!=' <symbol>               (3)
+           <symbol1> '<' <symbol2>              (4)
+           <symbol1> '>' <symbol2>              (4)
+           <symbol1> '<=' <symbol2>             (4)
+           <symbol1> '>=' <symbol2>             (4)
+           '(' <expr> ')'                       (5)
+           '!' <expr>                           (6)
+           <expr> '&&' <expr>                   (7)
+           <expr> '||' <expr>                   (8)
+
+Expressions are listed in decreasing order of precedence.
+
+(1) Convert the symbol into an expression. Boolean and tristate symbols
+    are simply converted into the respective expression values. All
+    other symbol types result in 'n'.
+(2) If the values of both symbols are equal, it returns 'y',
+    otherwise 'n'.
+(3) If the values of both symbols are equal, it returns 'n',
+    otherwise 'y'.
+(4) If value of <symbol1> is respectively lower, greater, lower-or-equal,
+    or greater-or-equal than value of <symbol2>, it returns 'y',
+    otherwise 'n'.
+(5) Returns the value of the expression. Used to override precedence.
+(6) Returns the result of (2-/expr/).
+(7) Returns the result of min(/expr/, /expr/).
+(8) Returns the result of max(/expr/, /expr/).
+
+An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2
+respectively for calculations). A menu entry becomes visible when its
+expression evaluates to 'm' or 'y'.
+
+There are two types of symbols: constant and non-constant symbols.
+Non-constant symbols are the most common ones and are defined with the
+'config' statement. Non-constant symbols consist entirely of alphanumeric
+characters or underscores.
+Constant symbols are only part of expressions. Constant symbols are
+always surrounded by single or double quotes. Within the quote, any
+other character is allowed and the quotes can be escaped using '\'.
+
+Menu structure
+--------------
+
+The position of a menu entry in the tree is determined in two ways. First
+it can be specified explicitly::
+
+  menu "Network device support"
+	depends on NET
+
+  config NETDEVICES
+	...
+
+  endmenu
+
+All entries within the "menu" ... "endmenu" block become a submenu of
+"Network device support". All subentries inherit the dependencies from
+the menu entry, e.g. this means the dependency "NET" is added to the
+dependency list of the config option NETDEVICES.
+
+The other way to generate the menu structure is done by analyzing the
+dependencies. If a menu entry somehow depends on the previous entry, it
+can be made a submenu of it. First, the previous (parent) symbol must
+be part of the dependency list and then one of these two conditions
+must be true:
+
+- the child entry must become invisible, if the parent is set to 'n'
+- the child entry must only be visible, if the parent is visible::
+
+    config MODULES
+	bool "Enable loadable module support"
+
+    config MODVERSIONS
+	bool "Set version information on all module symbols"
+	depends on MODULES
+
+    comment "module support disabled"
+	depends on !MODULES
+
+MODVERSIONS directly depends on MODULES, this means it's only visible if
+MODULES is different from 'n'. The comment on the other hand is only
+visible when MODULES is set to 'n'.
+
+
+Kconfig syntax
+--------------
+
+The configuration file describes a series of menu entries, where every
+line starts with a keyword (except help texts). The following keywords
+end a menu entry:
+
+- config
+- menuconfig
+- choice/endchoice
+- comment
+- menu/endmenu
+- if/endif
+- source
+
+The first five also start the definition of a menu entry.
+
+config::
+
+	"config" <symbol>
+	<config options>
+
+This defines a config symbol <symbol> and accepts any of above
+attributes as options.
+
+menuconfig::
+
+	"menuconfig" <symbol>
+	<config options>
+
+This is similar to the simple config entry above, but it also gives a
+hint to front ends, that all suboptions should be displayed as a
+separate list of options. To make sure all the suboptions will really
+show up under the menuconfig entry and not outside of it, every item
+from the <config options> list must depend on the menuconfig symbol.
+In practice, this is achieved by using one of the next two constructs::
+
+  (1):
+  menuconfig M
+  if M
+      config C1
+      config C2
+  endif
+
+  (2):
+  menuconfig M
+  config C1
+      depends on M
+  config C2
+      depends on M
+
+In the following examples (3) and (4), C1 and C2 still have the M
+dependency, but will not appear under menuconfig M anymore, because
+of C0, which doesn't depend on M::
+
+  (3):
+  menuconfig M
+      config C0
+  if M
+      config C1
+      config C2
+  endif
+
+  (4):
+  menuconfig M
+  config C0
+  config C1
+      depends on M
+  config C2
+      depends on M
+
+choices::
+
+	"choice" [symbol]
+	<choice options>
+	<choice block>
+	"endchoice"
+
+This defines a choice group and accepts any of the above attributes as
+options. A choice can only be of type bool or tristate.  If no type is
+specified for a choice, its type will be determined by the type of
+the first choice element in the group or remain unknown if none of the
+choice elements have a type specified, as well.
+
+While a boolean choice only allows a single config entry to be
+selected, a tristate choice also allows any number of config entries
+to be set to 'm'. This can be used if multiple drivers for a single
+hardware exists and only a single driver can be compiled/loaded into
+the kernel, but all drivers can be compiled as modules.
+
+A choice accepts another option "optional", which allows to set the
+choice to 'n' and no entry needs to be selected.
+If no [symbol] is associated with a choice, then you can not have multiple
+definitions of that choice. If a [symbol] is associated to the choice,
+then you may define the same choice (i.e. with the same entries) in another
+place.
+
+comment::
+
+	"comment" <prompt>
+	<comment options>
+
+This defines a comment which is displayed to the user during the
+configuration process and is also echoed to the output files. The only
+possible options are dependencies.
+
+menu::
+
+	"menu" <prompt>
+	<menu options>
+	<menu block>
+	"endmenu"
+
+This defines a menu block, see "Menu structure" above for more
+information. The only possible options are dependencies and "visible"
+attributes.
+
+if::
+
+	"if" <expr>
+	<if block>
+	"endif"
+
+This defines an if block. The dependency expression <expr> is appended
+to all enclosed menu entries.
+
+source::
+
+	"source" <prompt>
+
+This reads the specified configuration file. This file is always parsed.
+
+mainmenu::
+
+	"mainmenu" <prompt>
+
+This sets the config program's title bar if the config program chooses
+to use it. It should be placed at the top of the configuration, before any
+other statement.
+
+'#' Kconfig source file comment:
+
+An unquoted '#' character anywhere in a source file line indicates
+the beginning of a source file comment.  The remainder of that line
+is a comment.
+
+
+Kconfig hints
+-------------
+This is a collection of Kconfig tips, most of which aren't obvious at
+first glance and most of which have become idioms in several Kconfig
+files.
+
+Adding common features and make the usage configurable
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+It is a common idiom to implement a feature/functionality that are
+relevant for some architectures but not all.
+The recommended way to do so is to use a config variable named HAVE_*
+that is defined in a common Kconfig file and selected by the relevant
+architectures.
+An example is the generic IOMAP functionality.
+
+We would in lib/Kconfig see::
+
+  # Generic IOMAP is used to ...
+  config HAVE_GENERIC_IOMAP
+
+  config GENERIC_IOMAP
+	depends on HAVE_GENERIC_IOMAP && FOO
+
+And in lib/Makefile we would see::
+
+	obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
+
+For each architecture using the generic IOMAP functionality we would see::
+
+  config X86
+	select ...
+	select HAVE_GENERIC_IOMAP
+	select ...
+
+Note: we use the existing config option and avoid creating a new
+config variable to select HAVE_GENERIC_IOMAP.
+
+Note: the use of the internal config variable HAVE_GENERIC_IOMAP, it is
+introduced to overcome the limitation of select which will force a
+config option to 'y' no matter the dependencies.
+The dependencies are moved to the symbol GENERIC_IOMAP and we avoid the
+situation where select forces a symbol equals to 'y'.
+
+Adding features that need compiler support
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There are several features that need compiler support. The recommended way
+to describe the dependency on the compiler feature is to use "depends on"
+followed by a test macro::
+
+  config STACKPROTECTOR
+	bool "Stack Protector buffer overflow detection"
+	depends on $(cc-option,-fstack-protector)
+	...
+
+If you need to expose a compiler capability to makefiles and/or C source files,
+`CC_HAS_` is the recommended prefix for the config option::
+
+  config CC_HAS_STACKPROTECTOR_NONE
+	def_bool $(cc-option,-fno-stack-protector)
+
+Build as module only
+~~~~~~~~~~~~~~~~~~~~
+To restrict a component build to module-only, qualify its config symbol
+with "depends on m".  E.g.::
+
+  config FOO
+	depends on BAR && m
+
+limits FOO to module (=m) or disabled (=n).
+
+Kconfig recursive dependency limitations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you've hit the Kconfig error: "recursive dependency detected" you've run
+into a recursive dependency issue with Kconfig, a recursive dependency can be
+summarized as a circular dependency. The kconfig tools need to ensure that
+Kconfig files comply with specified configuration requirements. In order to do
+that kconfig must determine the values that are possible for all Kconfig
+symbols, this is currently not possible if there is a circular relation
+between two or more Kconfig symbols. For more details refer to the "Simple
+Kconfig recursive issue" subsection below. Kconfig does not do recursive
+dependency resolution; this has a few implications for Kconfig file writers.
+We'll first explain why this issues exists and then provide an example
+technical limitation which this brings upon Kconfig developers. Eager
+developers wishing to try to address this limitation should read the next
+subsections.
+
+Simple Kconfig recursive issue
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Read: Documentation/kbuild/Kconfig.recursion-issue-01
+
+Test with::
+
+  make KBUILD_KCONFIG=Documentation/kbuild/Kconfig.recursion-issue-01 allnoconfig
+
+Cumulative Kconfig recursive issue
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Read: Documentation/kbuild/Kconfig.recursion-issue-02
+
+Test with::
+
+  make KBUILD_KCONFIG=Documentation/kbuild/Kconfig.recursion-issue-02 allnoconfig
+
+Practical solutions to kconfig recursive issue
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Developers who run into the recursive Kconfig issue have two options
+at their disposal. We document them below and also provide a list of
+historical issues resolved through these different solutions.
+
+  a) Remove any superfluous "select FOO" or "depends on FOO"
+  b) Match dependency semantics:
+
+	b1) Swap all "select FOO" to "depends on FOO" or,
+
+	b2) Swap all "depends on FOO" to "select FOO"
+
+The resolution to a) can be tested with the sample Kconfig file
+Documentation/kbuild/Kconfig.recursion-issue-01 through the removal
+of the "select CORE" from CORE_BELL_A_ADVANCED as that is implicit already
+since CORE_BELL_A depends on CORE. At times it may not be possible to remove
+some dependency criteria, for such cases you can work with solution b).
+
+The two different resolutions for b) can be tested in the sample Kconfig file
+Documentation/kbuild/Kconfig.recursion-issue-02.
+
+Below is a list of examples of prior fixes for these types of recursive issues;
+all errors appear to involve one or more select's and one or more "depends on".
+
+============    ===================================
+commit          fix
+============    ===================================
+06b718c01208    select A -> depends on A
+c22eacfe82f9    depends on A -> depends on B
+6a91e854442c    select A -> depends on A
+118c565a8f2e    select A -> select B
+f004e5594705    select A -> depends on A
+c7861f37b4c6    depends on A -> (null)
+80c69915e5fb    select A -> (null)              (1)
+c2218e26c0d0    select A -> depends on A        (1)
+d6ae99d04e1c    select A -> depends on A
+95ca19cf8cbf    select A -> depends on A
+8f057d7bca54    depends on A -> (null)
+8f057d7bca54    depends on A -> select A
+a0701f04846e    select A -> depends on A
+0c8b92f7f259    depends on A -> (null)
+e4e9e0540928    select A -> depends on A        (2)
+7453ea886e87    depends on A > (null)           (1)
+7b1fff7e4fdf    select A -> depends on A
+86c747d2a4f0    select A -> depends on A
+d9f9ab51e55e    select A -> depends on A
+0c51a4d8abd6    depends on A -> select A        (3)
+e98062ed6dc4    select A -> depends on A        (3)
+91e5d284a7f1    select A -> (null)
+============    ===================================
+
+(1) Partial (or no) quote of error.
+(2) That seems to be the gist of that fix.
+(3) Same error.
+
+Future kconfig work
+~~~~~~~~~~~~~~~~~~~
+
+Work on kconfig is welcomed on both areas of clarifying semantics and on
+evaluating the use of a full SAT solver for it. A full SAT solver can be
+desirable to enable more complex dependency mappings and / or queries,
+for instance on possible use case for a SAT solver could be that of handling
+the current known recursive dependency issues. It is not known if this would
+address such issues but such evaluation is desirable. If support for a full SAT
+solver proves too complex or that it cannot address recursive dependency issues
+Kconfig should have at least clear and well defined semantics which also
+addresses and documents limitations or requirements such as the ones dealing
+with recursive dependencies.
+
+Further work on both of these areas is welcomed on Kconfig. We elaborate
+on both of these in the next two subsections.
+
+Semantics of Kconfig
+~~~~~~~~~~~~~~~~~~~~
+
+The use of Kconfig is broad, Linux is now only one of Kconfig's users:
+one study has completed a broad analysis of Kconfig use in 12 projects [0]_.
+Despite its widespread use, and although this document does a reasonable job
+in documenting basic Kconfig syntax a more precise definition of Kconfig
+semantics is welcomed. One project deduced Kconfig semantics through
+the use of the xconfig configurator [1]_. Work should be done to confirm if
+the deduced semantics matches our intended Kconfig design goals.
+
+Having well defined semantics can be useful for tools for practical
+evaluation of depenencies, for instance one such use known case was work to
+express in boolean abstraction of the inferred semantics of Kconfig to
+translate Kconfig logic into boolean formulas and run a SAT solver on this to
+find dead code / features (always inactive), 114 dead features were found in
+Linux using this methodology [1]_ (Section 8: Threats to validity).
+
+Confirming this could prove useful as Kconfig stands as one of the the leading
+industrial variability modeling languages [1]_ [2]_. Its study would help
+evaluate practical uses of such languages, their use was only theoretical
+and real world requirements were not well understood. As it stands though
+only reverse engineering techniques have been used to deduce semantics from
+variability modeling languages such as Kconfig [3]_.
+
+.. [0] http://www.eng.uwaterloo.ca/~shshe/kconfig_semantics.pdf
+.. [1] http://gsd.uwaterloo.ca/sites/default/files/vm-2013-berger.pdf
+.. [2] http://gsd.uwaterloo.ca/sites/default/files/ase241-berger_0.pdf
+.. [3] http://gsd.uwaterloo.ca/sites/default/files/icse2011.pdf
+
+Full SAT solver for Kconfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Although SAT solvers [4]_ haven't yet been used by Kconfig directly, as noted
+in the previous subsection, work has been done however to express in boolean
+abstraction the inferred semantics of Kconfig to translate Kconfig logic into
+boolean formulas and run a SAT solver on it [5]_. Another known related project
+is CADOS [6]_ (former VAMOS [7]_) and the tools, mainly undertaker [8]_, which
+has been introduced first with [9]_.  The basic concept of undertaker is to
+exract variability models from Kconfig, and put them together with a
+propositional formula extracted from CPP #ifdefs and build-rules into a SAT
+solver in order to find dead code, dead files, and dead symbols. If using a SAT
+solver is desirable on Kconfig one approach would be to evaluate repurposing
+such efforts somehow on Kconfig. There is enough interest from mentors of
+existing projects to not only help advise how to integrate this work upstream
+but also help maintain it long term. Interested developers should visit:
+
+http://kernelnewbies.org/KernelProjects/kconfig-sat
+
+.. [4] http://www.cs.cornell.edu/~sabhar/chapters/SATSolvers-KR-Handbook.pdf
+.. [5] http://gsd.uwaterloo.ca/sites/default/files/vm-2013-berger.pdf
+.. [6] https://cados.cs.fau.de
+.. [7] https://vamos.cs.fau.de
+.. [8] https://undertaker.cs.fau.de
+.. [9] https://www4.cs.fau.de/Publications/2011/tartler_11_eurosys.pdf
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt
deleted file mode 100644
index 864e740811da..000000000000
--- a/Documentation/kbuild/kconfig-language.txt
+++ /dev/null
@@ -1,669 +0,0 @@
-Introduction
-------------
-
-The configuration database is a collection of configuration options
-organized in a tree structure:
-
-	+- Code maturity level options
-	|  +- Prompt for development and/or incomplete code/drivers
-	+- General setup
-	|  +- Networking support
-	|  +- System V IPC
-	|  +- BSD Process Accounting
-	|  +- Sysctl support
-	+- Loadable module support
-	|  +- Enable loadable module support
-	|     +- Set version information on all module symbols
-	|     +- Kernel module loader
-	+- ...
-
-Every entry has its own dependencies. These dependencies are used
-to determine the visibility of an entry. Any child entry is only
-visible if its parent entry is also visible.
-
-Menu entries
-------------
-
-Most entries define a config option; all other entries help to organize
-them. A single configuration option is defined like this:
-
-config MODVERSIONS
-	bool "Set version information on all module symbols"
-	depends on MODULES
-	help
-	  Usually, modules have to be recompiled whenever you switch to a new
-	  kernel.  ...
-
-Every line starts with a key word and can be followed by multiple
-arguments.  "config" starts a new config entry. The following lines
-define attributes for this config option. Attributes can be the type of
-the config option, input prompt, dependencies, help text and default
-values. A config option can be defined multiple times with the same
-name, but every definition can have only a single input prompt and the
-type must not conflict.
-
-Menu attributes
----------------
-
-A menu entry can have a number of attributes. Not all of them are
-applicable everywhere (see syntax).
-
-- type definition: "bool"/"tristate"/"string"/"hex"/"int"
-  Every config option must have a type. There are only two basic types:
-  tristate and string; the other types are based on these two. The type
-  definition optionally accepts an input prompt, so these two examples
-  are equivalent:
-
-	bool "Networking support"
-  and
-	bool
-	prompt "Networking support"
-
-- input prompt: "prompt" <prompt> ["if" <expr>]
-  Every menu entry can have at most one prompt, which is used to display
-  to the user. Optionally dependencies only for this prompt can be added
-  with "if".
-
-- default value: "default" <expr> ["if" <expr>]
-  A config option can have any number of default values. If multiple
-  default values are visible, only the first defined one is active.
-  Default values are not limited to the menu entry where they are
-  defined. This means the default can be defined somewhere else or be
-  overridden by an earlier definition.
-  The default value is only assigned to the config symbol if no other
-  value was set by the user (via the input prompt above). If an input
-  prompt is visible the default value is presented to the user and can
-  be overridden by him.
-  Optionally, dependencies only for this default value can be added with
-  "if".
-
- The default value deliberately defaults to 'n' in order to avoid bloating the
- build. With few exceptions, new config options should not change this. The
- intent is for "make oldconfig" to add as little as possible to the config from
- release to release.
-
- Note:
-	Things that merit "default y/m" include:
-
-	a) A new Kconfig option for something that used to always be built
-	   should be "default y".
-
-	b) A new gatekeeping Kconfig option that hides/shows other Kconfig
-	   options (but does not generate any code of its own), should be
-	   "default y" so people will see those other options.
-
-	c) Sub-driver behavior or similar options for a driver that is
-	   "default n". This allows you to provide sane defaults.
-
-	d) Hardware or infrastructure that everybody expects, such as CONFIG_NET
-	   or CONFIG_BLOCK. These are rare exceptions.
-
-- type definition + default value:
-	"def_bool"/"def_tristate" <expr> ["if" <expr>]
-  This is a shorthand notation for a type definition plus a value.
-  Optionally dependencies for this default value can be added with "if".
-
-- dependencies: "depends on" <expr>
-  This defines a dependency for this menu entry. If multiple
-  dependencies are defined, they are connected with '&&'. Dependencies
-  are applied to all other options within this menu entry (which also
-  accept an "if" expression), so these two examples are equivalent:
-
-	bool "foo" if BAR
-	default y if BAR
-  and
-	depends on BAR
-	bool "foo"
-	default y
-
-- reverse dependencies: "select" <symbol> ["if" <expr>]
-  While normal dependencies reduce the upper limit of a symbol (see
-  below), reverse dependencies can be used to force a lower limit of
-  another symbol. The value of the current menu symbol is used as the
-  minimal value <symbol> can be set to. If <symbol> is selected multiple
-  times, the limit is set to the largest selection.
-  Reverse dependencies can only be used with boolean or tristate
-  symbols.
-  Note:
-	select should be used with care. select will force
-	a symbol to a value without visiting the dependencies.
-	By abusing select you are able to select a symbol FOO even
-	if FOO depends on BAR that is not set.
-	In general use select only for non-visible symbols
-	(no prompts anywhere) and for symbols with no dependencies.
-	That will limit the usefulness but on the other hand avoid
-	the illegal configurations all over.
-
-- weak reverse dependencies: "imply" <symbol> ["if" <expr>]
-  This is similar to "select" as it enforces a lower limit on another
-  symbol except that the "implied" symbol's value may still be set to n
-  from a direct dependency or with a visible prompt.
-
-  Given the following example:
-
-  config FOO
-	tristate
-	imply BAZ
-
-  config BAZ
-	tristate
-	depends on BAR
-
-  The following values are possible:
-
-	FOO		BAR		BAZ's default	choice for BAZ
-	---		---		-------------	--------------
-	n		y		n		N/m/y
-	m		y		m		M/y/n
-	y		y		y		Y/n
-	y		n		*		N
-
-  This is useful e.g. with multiple drivers that want to indicate their
-  ability to hook into a secondary subsystem while allowing the user to
-  configure that subsystem out without also having to unset these drivers.
-
-- limiting menu display: "visible if" <expr>
-  This attribute is only applicable to menu blocks, if the condition is
-  false, the menu block is not displayed to the user (the symbols
-  contained there can still be selected by other symbols, though). It is
-  similar to a conditional "prompt" attribute for individual menu
-  entries. Default value of "visible" is true.
-
-- numerical ranges: "range" <symbol> <symbol> ["if" <expr>]
-  This allows to limit the range of possible input values for int
-  and hex symbols. The user can only input a value which is larger than
-  or equal to the first symbol and smaller than or equal to the second
-  symbol.
-
-- help text: "help" or "---help---"
-  This defines a help text. The end of the help text is determined by
-  the indentation level, this means it ends at the first line which has
-  a smaller indentation than the first line of the help text.
-  "---help---" and "help" do not differ in behaviour, "---help---" is
-  used to help visually separate configuration logic from help within
-  the file as an aid to developers.
-
-- misc options: "option" <symbol>[=<value>]
-  Various less common options can be defined via this option syntax,
-  which can modify the behaviour of the menu entry and its config
-  symbol. These options are currently possible:
-
-  - "defconfig_list"
-    This declares a list of default entries which can be used when
-    looking for the default configuration (which is used when the main
-    .config doesn't exists yet.)
-
-  - "modules"
-    This declares the symbol to be used as the MODULES symbol, which
-    enables the third modular state for all config symbols.
-    At most one symbol may have the "modules" option set.
-
-  - "allnoconfig_y"
-    This declares the symbol as one that should have the value y when
-    using "allnoconfig". Used for symbols that hide other symbols.
-
-Menu dependencies
------------------
-
-Dependencies define the visibility of a menu entry and can also reduce
-the input range of tristate symbols. The tristate logic used in the
-expressions uses one more state than normal boolean logic to express the
-module state. Dependency expressions have the following syntax:
-
-<expr> ::= <symbol>                             (1)
-           <symbol> '=' <symbol>                (2)
-           <symbol> '!=' <symbol>               (3)
-           <symbol1> '<' <symbol2>              (4)
-           <symbol1> '>' <symbol2>              (4)
-           <symbol1> '<=' <symbol2>             (4)
-           <symbol1> '>=' <symbol2>             (4)
-           '(' <expr> ')'                       (5)
-           '!' <expr>                           (6)
-           <expr> '&&' <expr>                   (7)
-           <expr> '||' <expr>                   (8)
-
-Expressions are listed in decreasing order of precedence. 
-
-(1) Convert the symbol into an expression. Boolean and tristate symbols
-    are simply converted into the respective expression values. All
-    other symbol types result in 'n'.
-(2) If the values of both symbols are equal, it returns 'y',
-    otherwise 'n'.
-(3) If the values of both symbols are equal, it returns 'n',
-    otherwise 'y'.
-(4) If value of <symbol1> is respectively lower, greater, lower-or-equal,
-    or greater-or-equal than value of <symbol2>, it returns 'y',
-    otherwise 'n'.
-(5) Returns the value of the expression. Used to override precedence.
-(6) Returns the result of (2-/expr/).
-(7) Returns the result of min(/expr/, /expr/).
-(8) Returns the result of max(/expr/, /expr/).
-
-An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2
-respectively for calculations). A menu entry becomes visible when its
-expression evaluates to 'm' or 'y'.
-
-There are two types of symbols: constant and non-constant symbols.
-Non-constant symbols are the most common ones and are defined with the
-'config' statement. Non-constant symbols consist entirely of alphanumeric
-characters or underscores.
-Constant symbols are only part of expressions. Constant symbols are
-always surrounded by single or double quotes. Within the quote, any
-other character is allowed and the quotes can be escaped using '\'.
-
-Menu structure
---------------
-
-The position of a menu entry in the tree is determined in two ways. First
-it can be specified explicitly:
-
-menu "Network device support"
-	depends on NET
-
-config NETDEVICES
-	...
-
-endmenu
-
-All entries within the "menu" ... "endmenu" block become a submenu of
-"Network device support". All subentries inherit the dependencies from
-the menu entry, e.g. this means the dependency "NET" is added to the
-dependency list of the config option NETDEVICES.
-
-The other way to generate the menu structure is done by analyzing the
-dependencies. If a menu entry somehow depends on the previous entry, it
-can be made a submenu of it. First, the previous (parent) symbol must
-be part of the dependency list and then one of these two conditions
-must be true:
-- the child entry must become invisible, if the parent is set to 'n'
-- the child entry must only be visible, if the parent is visible
-
-config MODULES
-	bool "Enable loadable module support"
-
-config MODVERSIONS
-	bool "Set version information on all module symbols"
-	depends on MODULES
-
-comment "module support disabled"
-	depends on !MODULES
-
-MODVERSIONS directly depends on MODULES, this means it's only visible if
-MODULES is different from 'n'. The comment on the other hand is only
-visible when MODULES is set to 'n'.
-
-
-Kconfig syntax
---------------
-
-The configuration file describes a series of menu entries, where every
-line starts with a keyword (except help texts). The following keywords
-end a menu entry:
-- config
-- menuconfig
-- choice/endchoice
-- comment
-- menu/endmenu
-- if/endif
-- source
-The first five also start the definition of a menu entry.
-
-config:
-
-	"config" <symbol>
-	<config options>
-
-This defines a config symbol <symbol> and accepts any of above
-attributes as options.
-
-menuconfig:
-	"menuconfig" <symbol>
-	<config options>
-
-This is similar to the simple config entry above, but it also gives a
-hint to front ends, that all suboptions should be displayed as a
-separate list of options. To make sure all the suboptions will really
-show up under the menuconfig entry and not outside of it, every item
-from the <config options> list must depend on the menuconfig symbol.
-In practice, this is achieved by using one of the next two constructs:
-
-(1):
-menuconfig M
-if M
-    config C1
-    config C2
-endif
-
-(2):
-menuconfig M
-config C1
-    depends on M
-config C2
-    depends on M
-
-In the following examples (3) and (4), C1 and C2 still have the M
-dependency, but will not appear under menuconfig M anymore, because
-of C0, which doesn't depend on M:
-
-(3):
-menuconfig M
-    config C0
-if M
-    config C1
-    config C2
-endif
-
-(4):
-menuconfig M
-config C0
-config C1
-    depends on M
-config C2
-    depends on M
-
-choices:
-
-	"choice" [symbol]
-	<choice options>
-	<choice block>
-	"endchoice"
-
-This defines a choice group and accepts any of the above attributes as
-options. A choice can only be of type bool or tristate.  If no type is
-specified for a choice, its type will be determined by the type of
-the first choice element in the group or remain unknown if none of the
-choice elements have a type specified, as well.
-
-While a boolean choice only allows a single config entry to be
-selected, a tristate choice also allows any number of config entries
-to be set to 'm'. This can be used if multiple drivers for a single
-hardware exists and only a single driver can be compiled/loaded into
-the kernel, but all drivers can be compiled as modules.
-
-A choice accepts another option "optional", which allows to set the
-choice to 'n' and no entry needs to be selected.
-If no [symbol] is associated with a choice, then you can not have multiple
-definitions of that choice. If a [symbol] is associated to the choice,
-then you may define the same choice (i.e. with the same entries) in another
-place.
-
-comment:
-
-	"comment" <prompt>
-	<comment options>
-
-This defines a comment which is displayed to the user during the
-configuration process and is also echoed to the output files. The only
-possible options are dependencies.
-
-menu:
-
-	"menu" <prompt>
-	<menu options>
-	<menu block>
-	"endmenu"
-
-This defines a menu block, see "Menu structure" above for more
-information. The only possible options are dependencies and "visible"
-attributes.
-
-if:
-
-	"if" <expr>
-	<if block>
-	"endif"
-
-This defines an if block. The dependency expression <expr> is appended
-to all enclosed menu entries.
-
-source:
-
-	"source" <prompt>
-
-This reads the specified configuration file. This file is always parsed.
-
-mainmenu:
-
-	"mainmenu" <prompt>
-
-This sets the config program's title bar if the config program chooses
-to use it. It should be placed at the top of the configuration, before any
-other statement.
-
-'#' Kconfig source file comment:
-
-An unquoted '#' character anywhere in a source file line indicates
-the beginning of a source file comment.  The remainder of that line
-is a comment.
-
-
-Kconfig hints
--------------
-This is a collection of Kconfig tips, most of which aren't obvious at
-first glance and most of which have become idioms in several Kconfig
-files.
-
-Adding common features and make the usage configurable
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-It is a common idiom to implement a feature/functionality that are
-relevant for some architectures but not all.
-The recommended way to do so is to use a config variable named HAVE_*
-that is defined in a common Kconfig file and selected by the relevant
-architectures.
-An example is the generic IOMAP functionality.
-
-We would in lib/Kconfig see:
-
-# Generic IOMAP is used to ...
-config HAVE_GENERIC_IOMAP
-
-config GENERIC_IOMAP
-	depends on HAVE_GENERIC_IOMAP && FOO
-
-And in lib/Makefile we would see:
-obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
-
-For each architecture using the generic IOMAP functionality we would see:
-
-config X86
-	select ...
-	select HAVE_GENERIC_IOMAP
-	select ...
-
-Note: we use the existing config option and avoid creating a new
-config variable to select HAVE_GENERIC_IOMAP.
-
-Note: the use of the internal config variable HAVE_GENERIC_IOMAP, it is
-introduced to overcome the limitation of select which will force a
-config option to 'y' no matter the dependencies.
-The dependencies are moved to the symbol GENERIC_IOMAP and we avoid the
-situation where select forces a symbol equals to 'y'.
-
-Adding features that need compiler support
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-There are several features that need compiler support. The recommended way
-to describe the dependency on the compiler feature is to use "depends on"
-followed by a test macro.
-
-config STACKPROTECTOR
-	bool "Stack Protector buffer overflow detection"
-	depends on $(cc-option,-fstack-protector)
-	...
-
-If you need to expose a compiler capability to makefiles and/or C source files,
-CC_HAS_ is the recommended prefix for the config option.
-
-config CC_HAS_STACKPROTECTOR_NONE
-	def_bool $(cc-option,-fno-stack-protector)
-
-Build as module only
-~~~~~~~~~~~~~~~~~~~~
-To restrict a component build to module-only, qualify its config symbol
-with "depends on m".  E.g.:
-
-config FOO
-	depends on BAR && m
-
-limits FOO to module (=m) or disabled (=n).
-
-Kconfig recursive dependency limitations
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If you've hit the Kconfig error: "recursive dependency detected" you've run
-into a recursive dependency issue with Kconfig, a recursive dependency can be
-summarized as a circular dependency. The kconfig tools need to ensure that
-Kconfig files comply with specified configuration requirements. In order to do
-that kconfig must determine the values that are possible for all Kconfig
-symbols, this is currently not possible if there is a circular relation
-between two or more Kconfig symbols. For more details refer to the "Simple
-Kconfig recursive issue" subsection below. Kconfig does not do recursive
-dependency resolution; this has a few implications for Kconfig file writers.
-We'll first explain why this issues exists and then provide an example
-technical limitation which this brings upon Kconfig developers. Eager
-developers wishing to try to address this limitation should read the next
-subsections.
-
-Simple Kconfig recursive issue
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Read: Documentation/kbuild/Kconfig.recursion-issue-01
-
-Test with:
-
-make KBUILD_KCONFIG=Documentation/kbuild/Kconfig.recursion-issue-01 allnoconfig
-
-Cumulative Kconfig recursive issue
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Read: Documentation/kbuild/Kconfig.recursion-issue-02
-
-Test with:
-
-make KBUILD_KCONFIG=Documentation/kbuild/Kconfig.recursion-issue-02 allnoconfig
-
-Practical solutions to kconfig recursive issue
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Developers who run into the recursive Kconfig issue have two options
-at their disposal. We document them below and also provide a list of
-historical issues resolved through these different solutions.
-
-  a) Remove any superfluous "select FOO" or "depends on FOO"
-  b) Match dependency semantics:
-	b1) Swap all "select FOO" to "depends on FOO" or,
-	b2) Swap all "depends on FOO" to "select FOO"
-
-The resolution to a) can be tested with the sample Kconfig file
-Documentation/kbuild/Kconfig.recursion-issue-01 through the removal
-of the "select CORE" from CORE_BELL_A_ADVANCED as that is implicit already
-since CORE_BELL_A depends on CORE. At times it may not be possible to remove
-some dependency criteria, for such cases you can work with solution b).
-
-The two different resolutions for b) can be tested in the sample Kconfig file
-Documentation/kbuild/Kconfig.recursion-issue-02.
-
-Below is a list of examples of prior fixes for these types of recursive issues;
-all errors appear to involve one or more select's and one or more "depends on".
-
-commit          fix
-======          ===
-06b718c01208    select A -> depends on A
-c22eacfe82f9    depends on A -> depends on B
-6a91e854442c    select A -> depends on A
-118c565a8f2e    select A -> select B
-f004e5594705    select A -> depends on A
-c7861f37b4c6    depends on A -> (null)
-80c69915e5fb    select A -> (null)              (1)
-c2218e26c0d0    select A -> depends on A        (1)
-d6ae99d04e1c    select A -> depends on A
-95ca19cf8cbf    select A -> depends on A
-8f057d7bca54    depends on A -> (null)
-8f057d7bca54    depends on A -> select A
-a0701f04846e    select A -> depends on A
-0c8b92f7f259    depends on A -> (null)
-e4e9e0540928    select A -> depends on A        (2)
-7453ea886e87    depends on A > (null)           (1)
-7b1fff7e4fdf    select A -> depends on A
-86c747d2a4f0    select A -> depends on A
-d9f9ab51e55e    select A -> depends on A
-0c51a4d8abd6    depends on A -> select A        (3)
-e98062ed6dc4    select A -> depends on A        (3)
-91e5d284a7f1    select A -> (null)
-
-(1) Partial (or no) quote of error.
-(2) That seems to be the gist of that fix.
-(3) Same error.
-
-Future kconfig work
-~~~~~~~~~~~~~~~~~~~
-
-Work on kconfig is welcomed on both areas of clarifying semantics and on
-evaluating the use of a full SAT solver for it. A full SAT solver can be
-desirable to enable more complex dependency mappings and / or queries,
-for instance on possible use case for a SAT solver could be that of handling
-the current known recursive dependency issues. It is not known if this would
-address such issues but such evaluation is desirable. If support for a full SAT
-solver proves too complex or that it cannot address recursive dependency issues
-Kconfig should have at least clear and well defined semantics which also
-addresses and documents limitations or requirements such as the ones dealing
-with recursive dependencies.
-
-Further work on both of these areas is welcomed on Kconfig. We elaborate
-on both of these in the next two subsections.
-
-Semantics of Kconfig
-~~~~~~~~~~~~~~~~~~~~
-
-The use of Kconfig is broad, Linux is now only one of Kconfig's users:
-one study has completed a broad analysis of Kconfig use in 12 projects [0].
-Despite its widespread use, and although this document does a reasonable job
-in documenting basic Kconfig syntax a more precise definition of Kconfig
-semantics is welcomed. One project deduced Kconfig semantics through
-the use of the xconfig configurator [1]. Work should be done to confirm if
-the deduced semantics matches our intended Kconfig design goals.
-
-Having well defined semantics can be useful for tools for practical
-evaluation of depenencies, for instance one such use known case was work to
-express in boolean abstraction of the inferred semantics of Kconfig to
-translate Kconfig logic into boolean formulas and run a SAT solver on this to
-find dead code / features (always inactive), 114 dead features were found in
-Linux using this methodology [1] (Section 8: Threats to validity).
-
-Confirming this could prove useful as Kconfig stands as one of the the leading
-industrial variability modeling languages [1] [2]. Its study would help
-evaluate practical uses of such languages, their use was only theoretical
-and real world requirements were not well understood. As it stands though
-only reverse engineering techniques have been used to deduce semantics from
-variability modeling languages such as Kconfig [3].
-
-[0] http://www.eng.uwaterloo.ca/~shshe/kconfig_semantics.pdf
-[1] http://gsd.uwaterloo.ca/sites/default/files/vm-2013-berger.pdf
-[2] http://gsd.uwaterloo.ca/sites/default/files/ase241-berger_0.pdf
-[3] http://gsd.uwaterloo.ca/sites/default/files/icse2011.pdf
-
-Full SAT solver for Kconfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Although SAT solvers [0] haven't yet been used by Kconfig directly, as noted in
-the previous subsection, work has been done however to express in boolean
-abstraction the inferred semantics of Kconfig to translate Kconfig logic into
-boolean formulas and run a SAT solver on it [1]. Another known related project
-is CADOS [2] (former VAMOS [3]) and the tools, mainly undertaker [4], which has
-been introduced first with [5].  The basic concept of undertaker is to exract
-variability models from Kconfig, and put them together with a propositional
-formula extracted from CPP #ifdefs and build-rules into a SAT solver in order
-to find dead code, dead files, and dead symbols. If using a SAT solver is
-desirable on Kconfig one approach would be to evaluate repurposing such efforts
-somehow on Kconfig. There is enough interest from mentors of existing projects
-to not only help advise how to integrate this work upstream but also help
-maintain it long term. Interested developers should visit:
-
-http://kernelnewbies.org/KernelProjects/kconfig-sat
-
-[0] http://www.cs.cornell.edu/~sabhar/chapters/SATSolvers-KR-Handbook.pdf
-[1] http://gsd.uwaterloo.ca/sites/default/files/vm-2013-berger.pdf
-[2] https://cados.cs.fau.de
-[3] https://vamos.cs.fau.de
-[4] https://undertaker.cs.fau.de
-[5] https://www4.cs.fau.de/Publications/2011/tartler_11_eurosys.pdf
diff --git a/Documentation/kbuild/kconfig-macro-language.rst b/Documentation/kbuild/kconfig-macro-language.rst
new file mode 100644
index 000000000000..35b3263b7e40
--- /dev/null
+++ b/Documentation/kbuild/kconfig-macro-language.rst
@@ -0,0 +1,247 @@
+======================
+Kconfig macro language
+======================
+
+Concept
+-------
+
+The basic idea was inspired by Make. When we look at Make, we notice sort of
+two languages in one. One language describes dependency graphs consisting of
+targets and prerequisites. The other is a macro language for performing textual
+substitution.
+
+There is clear distinction between the two language stages. For example, you
+can write a makefile like follows::
+
+    APP := foo
+    SRC := foo.c
+    CC := gcc
+
+    $(APP): $(SRC)
+            $(CC) -o $(APP) $(SRC)
+
+The macro language replaces the variable references with their expanded form,
+and handles as if the source file were input like follows::
+
+    foo: foo.c
+            gcc -o foo foo.c
+
+Then, Make analyzes the dependency graph and determines the targets to be
+updated.
+
+The idea is quite similar in Kconfig - it is possible to describe a Kconfig
+file like this::
+
+    CC := gcc
+
+    config CC_HAS_FOO
+            def_bool $(shell, $(srctree)/scripts/gcc-check-foo.sh $(CC))
+
+The macro language in Kconfig processes the source file into the following
+intermediate::
+
+    config CC_HAS_FOO
+            def_bool y
+
+Then, Kconfig moves onto the evaluation stage to resolve inter-symbol
+dependency as explained in kconfig-language.txt.
+
+
+Variables
+---------
+
+Like in Make, a variable in Kconfig works as a macro variable.  A macro
+variable is expanded "in place" to yield a text string that may then be
+expanded further. To get the value of a variable, enclose the variable name in
+$( ). The parentheses are required even for single-letter variable names; $X is
+a syntax error. The curly brace form as in ${CC} is not supported either.
+
+There are two types of variables: simply expanded variables and recursively
+expanded variables.
+
+A simply expanded variable is defined using the := assignment operator. Its
+righthand side is expanded immediately upon reading the line from the Kconfig
+file.
+
+A recursively expanded variable is defined using the = assignment operator.
+Its righthand side is simply stored as the value of the variable without
+expanding it in any way. Instead, the expansion is performed when the variable
+is used.
+
+There is another type of assignment operator; += is used to append text to a
+variable. The righthand side of += is expanded immediately if the lefthand
+side was originally defined as a simple variable. Otherwise, its evaluation is
+deferred.
+
+The variable reference can take parameters, in the following form::
+
+  $(name,arg1,arg2,arg3)
+
+You can consider the parameterized reference as a function. (more precisely,
+"user-defined function" in contrast to "built-in function" listed below).
+
+Useful functions must be expanded when they are used since the same function is
+expanded differently if different parameters are passed. Hence, a user-defined
+function is defined using the = assignment operator. The parameters are
+referenced within the body definition with $(1), $(2), etc.
+
+In fact, recursively expanded variables and user-defined functions are the same
+internally. (In other words, "variable" is "function with zero argument".)
+When we say "variable" in a broad sense, it includes "user-defined function".
+
+
+Built-in functions
+------------------
+
+Like Make, Kconfig provides several built-in functions. Every function takes a
+particular number of arguments.
+
+In Make, every built-in function takes at least one argument. Kconfig allows
+zero argument for built-in functions, such as $(fileno), $(lineno). You could
+consider those as "built-in variable", but it is just a matter of how we call
+it after all. Let's say "built-in function" here to refer to natively supported
+functionality.
+
+Kconfig currently supports the following built-in functions.
+
+ - $(shell,command)
+
+  The "shell" function accepts a single argument that is expanded and passed
+  to a subshell for execution. The standard output of the command is then read
+  and returned as the value of the function. Every newline in the output is
+  replaced with a space. Any trailing newlines are deleted. The standard error
+  is not returned, nor is any program exit status.
+
+ - $(info,text)
+
+  The "info" function takes a single argument and prints it to stdout.
+  It evaluates to an empty string.
+
+ - $(warning-if,condition,text)
+
+  The "warning-if" function takes two arguments. If the condition part is "y",
+  the text part is sent to stderr. The text is prefixed with the name of the
+  current Kconfig file and the current line number.
+
+ - $(error-if,condition,text)
+
+  The "error-if" function is similar to "warning-if", but it terminates the
+  parsing immediately if the condition part is "y".
+
+ - $(filename)
+
+  The 'filename' takes no argument, and $(filename) is expanded to the file
+  name being parsed.
+
+ - $(lineno)
+
+  The 'lineno' takes no argument, and $(lineno) is expanded to the line number
+  being parsed.
+
+
+Make vs Kconfig
+---------------
+
+Kconfig adopts Make-like macro language, but the function call syntax is
+slightly different.
+
+A function call in Make looks like this::
+
+  $(func-name arg1,arg2,arg3)
+
+The function name and the first argument are separated by at least one
+whitespace. Then, leading whitespaces are trimmed from the first argument,
+while whitespaces in the other arguments are kept. You need to use a kind of
+trick to start the first parameter with spaces. For example, if you want
+to make "info" function print "  hello", you can write like follows::
+
+  empty :=
+  space := $(empty) $(empty)
+  $(info $(space)$(space)hello)
+
+Kconfig uses only commas for delimiters, and keeps all whitespaces in the
+function call. Some people prefer putting a space after each comma delimiter::
+
+  $(func-name, arg1, arg2, arg3)
+
+In this case, "func-name" will receive " arg1", " arg2", " arg3". The presence
+of leading spaces may matter depending on the function. The same applies to
+Make - for example, $(subst .c, .o, $(sources)) is a typical mistake; it
+replaces ".c" with " .o".
+
+In Make, a user-defined function is referenced by using a built-in function,
+'call', like this::
+
+    $(call my-func,arg1,arg2,arg3)
+
+Kconfig invokes user-defined functions and built-in functions in the same way.
+The omission of 'call' makes the syntax shorter.
+
+In Make, some functions treat commas verbatim instead of argument separators.
+For example, $(shell echo hello, world) runs the command "echo hello, world".
+Likewise, $(info hello, world) prints "hello, world" to stdout. You could say
+this is _useful_ inconsistency.
+
+In Kconfig, for simpler implementation and grammatical consistency, commas that
+appear in the $( ) context are always delimiters. It means::
+
+  $(shell, echo hello, world)
+
+is an error because it is passing two parameters where the 'shell' function
+accepts only one. To pass commas in arguments, you can use the following trick::
+
+  comma := ,
+  $(shell, echo hello$(comma) world)
+
+
+Caveats
+-------
+
+A variable (or function) cannot be expanded across tokens. So, you cannot use
+a variable as a shorthand for an expression that consists of multiple tokens.
+The following works::
+
+    RANGE_MIN := 1
+    RANGE_MAX := 3
+
+    config FOO
+            int "foo"
+            range $(RANGE_MIN) $(RANGE_MAX)
+
+But, the following does not work::
+
+    RANGES := 1 3
+
+    config FOO
+            int "foo"
+            range $(RANGES)
+
+A variable cannot be expanded to any keyword in Kconfig.  The following does
+not work::
+
+    MY_TYPE := tristate
+
+    config FOO
+            $(MY_TYPE) "foo"
+            default y
+
+Obviously from the design, $(shell command) is expanded in the textual
+substitution phase. You cannot pass symbols to the 'shell' function.
+
+The following does not work as expected::
+
+    config ENDIAN_FLAG
+            string
+            default "-mbig-endian" if CPU_BIG_ENDIAN
+            default "-mlittle-endian" if CPU_LITTLE_ENDIAN
+
+    config CC_HAS_ENDIAN_FLAG
+            def_bool $(shell $(srctree)/scripts/gcc-check-flag ENDIAN_FLAG)
+
+Instead, you can do like follows so that any function call is statically
+expanded::
+
+    config CC_HAS_ENDIAN_FLAG
+            bool
+            default $(shell $(srctree)/scripts/gcc-check-flag -mbig-endian) if CPU_BIG_ENDIAN
+            default $(shell $(srctree)/scripts/gcc-check-flag -mlittle-endian) if CPU_LITTLE_ENDIAN
diff --git a/Documentation/kbuild/kconfig-macro-language.txt b/Documentation/kbuild/kconfig-macro-language.txt
deleted file mode 100644
index 07da2ea68dce..000000000000
--- a/Documentation/kbuild/kconfig-macro-language.txt
+++ /dev/null
@@ -1,242 +0,0 @@
-Concept
--------
-
-The basic idea was inspired by Make. When we look at Make, we notice sort of
-two languages in one. One language describes dependency graphs consisting of
-targets and prerequisites. The other is a macro language for performing textual
-substitution.
-
-There is clear distinction between the two language stages. For example, you
-can write a makefile like follows:
-
-    APP := foo
-    SRC := foo.c
-    CC := gcc
-
-    $(APP): $(SRC)
-            $(CC) -o $(APP) $(SRC)
-
-The macro language replaces the variable references with their expanded form,
-and handles as if the source file were input like follows:
-
-    foo: foo.c
-            gcc -o foo foo.c
-
-Then, Make analyzes the dependency graph and determines the targets to be
-updated.
-
-The idea is quite similar in Kconfig - it is possible to describe a Kconfig
-file like this:
-
-    CC := gcc
-
-    config CC_HAS_FOO
-            def_bool $(shell, $(srctree)/scripts/gcc-check-foo.sh $(CC))
-
-The macro language in Kconfig processes the source file into the following
-intermediate:
-
-    config CC_HAS_FOO
-            def_bool y
-
-Then, Kconfig moves onto the evaluation stage to resolve inter-symbol
-dependency as explained in kconfig-language.txt.
-
-
-Variables
----------
-
-Like in Make, a variable in Kconfig works as a macro variable.  A macro
-variable is expanded "in place" to yield a text string that may then be
-expanded further. To get the value of a variable, enclose the variable name in
-$( ). The parentheses are required even for single-letter variable names; $X is
-a syntax error. The curly brace form as in ${CC} is not supported either.
-
-There are two types of variables: simply expanded variables and recursively
-expanded variables.
-
-A simply expanded variable is defined using the := assignment operator. Its
-righthand side is expanded immediately upon reading the line from the Kconfig
-file.
-
-A recursively expanded variable is defined using the = assignment operator.
-Its righthand side is simply stored as the value of the variable without
-expanding it in any way. Instead, the expansion is performed when the variable
-is used.
-
-There is another type of assignment operator; += is used to append text to a
-variable. The righthand side of += is expanded immediately if the lefthand
-side was originally defined as a simple variable. Otherwise, its evaluation is
-deferred.
-
-The variable reference can take parameters, in the following form:
-
-  $(name,arg1,arg2,arg3)
-
-You can consider the parameterized reference as a function. (more precisely,
-"user-defined function" in contrast to "built-in function" listed below).
-
-Useful functions must be expanded when they are used since the same function is
-expanded differently if different parameters are passed. Hence, a user-defined
-function is defined using the = assignment operator. The parameters are
-referenced within the body definition with $(1), $(2), etc.
-
-In fact, recursively expanded variables and user-defined functions are the same
-internally. (In other words, "variable" is "function with zero argument".)
-When we say "variable" in a broad sense, it includes "user-defined function".
-
-
-Built-in functions
-------------------
-
-Like Make, Kconfig provides several built-in functions. Every function takes a
-particular number of arguments.
-
-In Make, every built-in function takes at least one argument. Kconfig allows
-zero argument for built-in functions, such as $(fileno), $(lineno). You could
-consider those as "built-in variable", but it is just a matter of how we call
-it after all. Let's say "built-in function" here to refer to natively supported
-functionality.
-
-Kconfig currently supports the following built-in functions.
-
- - $(shell,command)
-
-  The "shell" function accepts a single argument that is expanded and passed
-  to a subshell for execution. The standard output of the command is then read
-  and returned as the value of the function. Every newline in the output is
-  replaced with a space. Any trailing newlines are deleted. The standard error
-  is not returned, nor is any program exit status.
-
- - $(info,text)
-
-  The "info" function takes a single argument and prints it to stdout.
-  It evaluates to an empty string.
-
- - $(warning-if,condition,text)
-
-  The "warning-if" function takes two arguments. If the condition part is "y",
-  the text part is sent to stderr. The text is prefixed with the name of the
-  current Kconfig file and the current line number.
-
- - $(error-if,condition,text)
-
-  The "error-if" function is similar to "warning-if", but it terminates the
-  parsing immediately if the condition part is "y".
-
- - $(filename)
-
-  The 'filename' takes no argument, and $(filename) is expanded to the file
-  name being parsed.
-
- - $(lineno)
-
-  The 'lineno' takes no argument, and $(lineno) is expanded to the line number
-  being parsed.
-
-
-Make vs Kconfig
----------------
-
-Kconfig adopts Make-like macro language, but the function call syntax is
-slightly different.
-
-A function call in Make looks like this:
-
-  $(func-name arg1,arg2,arg3)
-
-The function name and the first argument are separated by at least one
-whitespace. Then, leading whitespaces are trimmed from the first argument,
-while whitespaces in the other arguments are kept. You need to use a kind of
-trick to start the first parameter with spaces. For example, if you want
-to make "info" function print "  hello", you can write like follows:
-
-  empty :=
-  space := $(empty) $(empty)
-  $(info $(space)$(space)hello)
-
-Kconfig uses only commas for delimiters, and keeps all whitespaces in the
-function call. Some people prefer putting a space after each comma delimiter:
-
-  $(func-name, arg1, arg2, arg3)
-
-In this case, "func-name" will receive " arg1", " arg2", " arg3". The presence
-of leading spaces may matter depending on the function. The same applies to
-Make - for example, $(subst .c, .o, $(sources)) is a typical mistake; it
-replaces ".c" with " .o".
-
-In Make, a user-defined function is referenced by using a built-in function,
-'call', like this:
-
-    $(call my-func,arg1,arg2,arg3)
-
-Kconfig invokes user-defined functions and built-in functions in the same way.
-The omission of 'call' makes the syntax shorter.
-
-In Make, some functions treat commas verbatim instead of argument separators.
-For example, $(shell echo hello, world) runs the command "echo hello, world".
-Likewise, $(info hello, world) prints "hello, world" to stdout. You could say
-this is _useful_ inconsistency.
-
-In Kconfig, for simpler implementation and grammatical consistency, commas that
-appear in the $( ) context are always delimiters. It means
-
-  $(shell, echo hello, world)
-
-is an error because it is passing two parameters where the 'shell' function
-accepts only one. To pass commas in arguments, you can use the following trick:
-
-  comma := ,
-  $(shell, echo hello$(comma) world)
-
-
-Caveats
--------
-
-A variable (or function) cannot be expanded across tokens. So, you cannot use
-a variable as a shorthand for an expression that consists of multiple tokens.
-The following works:
-
-    RANGE_MIN := 1
-    RANGE_MAX := 3
-
-    config FOO
-            int "foo"
-            range $(RANGE_MIN) $(RANGE_MAX)
-
-But, the following does not work:
-
-    RANGES := 1 3
-
-    config FOO
-            int "foo"
-            range $(RANGES)
-
-A variable cannot be expanded to any keyword in Kconfig.  The following does
-not work:
-
-    MY_TYPE := tristate
-
-    config FOO
-            $(MY_TYPE) "foo"
-            default y
-
-Obviously from the design, $(shell command) is expanded in the textual
-substitution phase. You cannot pass symbols to the 'shell' function.
-The following does not work as expected.
-
-    config ENDIAN_FLAG
-            string
-            default "-mbig-endian" if CPU_BIG_ENDIAN
-            default "-mlittle-endian" if CPU_LITTLE_ENDIAN
-
-    config CC_HAS_ENDIAN_FLAG
-            def_bool $(shell $(srctree)/scripts/gcc-check-flag ENDIAN_FLAG)
-
-Instead, you can do like follows so that any function call is statically
-expanded.
-
-    config CC_HAS_ENDIAN_FLAG
-            bool
-            default $(shell $(srctree)/scripts/gcc-check-flag -mbig-endian) if CPU_BIG_ENDIAN
-            default $(shell $(srctree)/scripts/gcc-check-flag -mlittle-endian) if CPU_LITTLE_ENDIAN
diff --git a/Documentation/kbuild/kconfig.rst b/Documentation/kbuild/kconfig.rst
new file mode 100644
index 000000000000..a9a855f894b3
--- /dev/null
+++ b/Documentation/kbuild/kconfig.rst
@@ -0,0 +1,304 @@
+===================
+Kconfig make config
+===================
+
+This file contains some assistance for using `make *config`.
+
+Use "make help" to list all of the possible configuration targets.
+
+The xconfig ('qconf'), menuconfig ('mconf'), and nconfig ('nconf')
+programs also have embedded help text.  Be sure to check that for
+navigation, search, and other general help text.
+
+General
+-------
+
+New kernel releases often introduce new config symbols.  Often more
+important, new kernel releases may rename config symbols.  When
+this happens, using a previously working .config file and running
+"make oldconfig" won't necessarily produce a working new kernel
+for you, so you may find that you need to see what NEW kernel
+symbols have been introduced.
+
+To see a list of new config symbols, use::
+
+	cp user/some/old.config .config
+	make listnewconfig
+
+and the config program will list any new symbols, one per line.
+
+Alternatively, you can use the brute force method::
+
+	make oldconfig
+	scripts/diffconfig .config.old .config | less
+
+----------------------------------------------------------------------
+
+Environment variables for `*config`
+
+KCONFIG_CONFIG
+--------------
+This environment variable can be used to specify a default kernel config
+file name to override the default name of ".config".
+
+KCONFIG_OVERWRITECONFIG
+-----------------------
+If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not
+break symlinks when .config is a symlink to somewhere else.
+
+`CONFIG_`
+---------
+If you set `CONFIG_` in the environment, Kconfig will prefix all symbols
+with its value when saving the configuration, instead of using the default,
+`CONFIG_`.
+
+----------------------------------------------------------------------
+
+Environment variables for '{allyes/allmod/allno/rand}config'
+
+KCONFIG_ALLCONFIG
+-----------------
+(partially based on lkml email from/by Rob Landley, re: miniconfig)
+
+--------------------------------------------------
+
+The allyesconfig/allmodconfig/allnoconfig/randconfig variants can also
+use the environment variable KCONFIG_ALLCONFIG as a flag or a filename
+that contains config symbols that the user requires to be set to a
+specific value.  If KCONFIG_ALLCONFIG is used without a filename where
+KCONFIG_ALLCONFIG == "" or KCONFIG_ALLCONFIG == "1", `make *config`
+checks for a file named "all{yes/mod/no/def/random}.config"
+(corresponding to the `*config` command that was used) for symbol values
+that are to be forced.  If this file is not found, it checks for a
+file named "all.config" to contain forced values.
+
+This enables you to create "miniature" config (miniconfig) or custom
+config files containing just the config symbols that you are interested
+in.  Then the kernel config system generates the full .config file,
+including symbols of your miniconfig file.
+
+This 'KCONFIG_ALLCONFIG' file is a config file which contains
+(usually a subset of all) preset config symbols.  These variable
+settings are still subject to normal dependency checks.
+
+Examples::
+
+	KCONFIG_ALLCONFIG=custom-notebook.config make allnoconfig
+
+or::
+
+	KCONFIG_ALLCONFIG=mini.config make allnoconfig
+
+or::
+
+	make KCONFIG_ALLCONFIG=mini.config allnoconfig
+
+These examples will disable most options (allnoconfig) but enable or
+disable the options that are explicitly listed in the specified
+mini-config files.
+
+----------------------------------------------------------------------
+
+Environment variables for 'randconfig'
+
+KCONFIG_SEED
+------------
+You can set this to the integer value used to seed the RNG, if you want
+to somehow debug the behaviour of the kconfig parser/frontends.
+If not set, the current time will be used.
+
+KCONFIG_PROBABILITY
+-------------------
+This variable can be used to skew the probabilities. This variable can
+be unset or empty, or set to three different formats:
+
+    =======================     ==================  =====================
+	KCONFIG_PROBABILITY     y:n split           y:m:n split
+    =======================     ==================  =====================
+	unset or empty          50  : 50            33  : 33  : 34
+	N                        N  : 100-N         N/2 : N/2 : 100-N
+    [1] N:M                     N+M : 100-(N+M)      N  :  M  : 100-(N+M)
+    [2] N:M:L                    N  : 100-N          M  :  L  : 100-(M+L)
+    =======================     ==================  =====================
+
+where N, M and L are integers (in base 10) in the range [0,100], and so
+that:
+
+    [1] N+M is in the range [0,100]
+
+    [2] M+L is in the range [0,100]
+
+Examples::
+
+	KCONFIG_PROBABILITY=10
+		10% of booleans will be set to 'y', 90% to 'n'
+		5% of tristates will be set to 'y', 5% to 'm', 90% to 'n'
+	KCONFIG_PROBABILITY=15:25
+		40% of booleans will be set to 'y', 60% to 'n'
+		15% of tristates will be set to 'y', 25% to 'm', 60% to 'n'
+	KCONFIG_PROBABILITY=10:15:15
+		10% of booleans will be set to 'y', 90% to 'n'
+		15% of tristates will be set to 'y', 15% to 'm', 70% to 'n'
+
+----------------------------------------------------------------------
+
+Environment variables for 'syncconfig'
+
+KCONFIG_NOSILENTUPDATE
+----------------------
+If this variable has a non-blank value, it prevents silent kernel
+config updates (requires explicit updates).
+
+KCONFIG_AUTOCONFIG
+------------------
+This environment variable can be set to specify the path & name of the
+"auto.conf" file.  Its default value is "include/config/auto.conf".
+
+KCONFIG_TRISTATE
+----------------
+This environment variable can be set to specify the path & name of the
+"tristate.conf" file.  Its default value is "include/config/tristate.conf".
+
+KCONFIG_AUTOHEADER
+------------------
+This environment variable can be set to specify the path & name of the
+"autoconf.h" (header) file.
+Its default value is "include/generated/autoconf.h".
+
+
+----------------------------------------------------------------------
+
+menuconfig
+----------
+
+SEARCHING for CONFIG symbols
+
+Searching in menuconfig:
+
+	The Search function searches for kernel configuration symbol
+	names, so you have to know something close to what you are
+	looking for.
+
+	Example::
+
+		/hotplug
+		This lists all config symbols that contain "hotplug",
+		e.g., HOTPLUG_CPU, MEMORY_HOTPLUG.
+
+	For search help, enter / followed by TAB-TAB (to highlight
+	<Help>) and Enter.  This will tell you that you can also use
+	regular expressions (regexes) in the search string, so if you
+	are not interested in MEMORY_HOTPLUG, you could try::
+
+		/^hotplug
+
+	When searching, symbols are sorted thus:
+
+	  - first, exact matches, sorted alphabetically (an exact match
+	    is when the search matches the complete symbol name);
+	  - then, other matches, sorted alphabetically.
+
+	For example: ^ATH.K matches:
+
+	    ATH5K ATH9K ATH5K_AHB ATH5K_DEBUG [...] ATH6KL ATH6KL_DEBUG
+	    [...] ATH9K_AHB ATH9K_BTCOEX_SUPPORT ATH9K_COMMON [...]
+
+	of which only ATH5K and ATH9K match exactly and so are sorted
+	first (and in alphabetical order), then come all other symbols,
+	sorted in alphabetical order.
+
+----------------------------------------------------------------------
+
+User interface options for 'menuconfig'
+
+MENUCONFIG_COLOR
+----------------
+It is possible to select different color themes using the variable
+MENUCONFIG_COLOR.  To select a theme use::
+
+	make MENUCONFIG_COLOR=<theme> menuconfig
+
+Available themes are::
+
+  - mono       => selects colors suitable for monochrome displays
+  - blackbg    => selects a color scheme with black background
+  - classic    => theme with blue background. The classic look
+  - bluetitle  => a LCD friendly version of classic. (default)
+
+MENUCONFIG_MODE
+---------------
+This mode shows all sub-menus in one large tree.
+
+Example::
+
+	make MENUCONFIG_MODE=single_menu menuconfig
+
+----------------------------------------------------------------------
+
+nconfig
+-------
+
+nconfig is an alternate text-based configurator.  It lists function
+keys across the bottom of the terminal (window) that execute commands.
+You can also just use the corresponding numeric key to execute the
+commands unless you are in a data entry window.  E.g., instead of F6
+for Save, you can just press 6.
+
+Use F1 for Global help or F3 for the Short help menu.
+
+Searching in nconfig:
+
+	You can search either in the menu entry "prompt" strings
+	or in the configuration symbols.
+
+	Use / to begin a search through the menu entries.  This does
+	not support regular expressions.  Use <Down> or <Up> for
+	Next hit and Previous hit, respectively.  Use <Esc> to
+	terminate the search mode.
+
+	F8 (SymSearch) searches the configuration symbols for the
+	given string or regular expression (regex).
+
+NCONFIG_MODE
+------------
+This mode shows all sub-menus in one large tree.
+
+Example::
+
+	make NCONFIG_MODE=single_menu nconfig
+
+----------------------------------------------------------------------
+
+xconfig
+-------
+
+Searching in xconfig:
+
+	The Search function searches for kernel configuration symbol
+	names, so you have to know something close to what you are
+	looking for.
+
+	Example::
+
+		Ctrl-F hotplug
+
+	or::
+
+		Menu: File, Search, hotplug
+
+	lists all config symbol entries that contain "hotplug" in
+	the symbol name.  In this Search dialog, you may change the
+	config setting for any of the entries that are not grayed out.
+	You can also enter a different search string without having
+	to return to the main menu.
+
+
+----------------------------------------------------------------------
+
+gconfig
+-------
+
+Searching in gconfig:
+
+	There is no search command in gconfig.  However, gconfig does
+	have several different viewing choices, modes, and options.
diff --git a/Documentation/kbuild/kconfig.txt b/Documentation/kbuild/kconfig.txt
deleted file mode 100644
index 68c82914c0f3..000000000000
--- a/Documentation/kbuild/kconfig.txt
+++ /dev/null
@@ -1,272 +0,0 @@
-This file contains some assistance for using "make *config".
-
-Use "make help" to list all of the possible configuration targets.
-
-The xconfig ('qconf'), menuconfig ('mconf'), and nconfig ('nconf')
-programs also have embedded help text.  Be sure to check that for
-navigation, search, and other general help text.
-
-======================================================================
-General
---------------------------------------------------
-
-New kernel releases often introduce new config symbols.  Often more
-important, new kernel releases may rename config symbols.  When
-this happens, using a previously working .config file and running
-"make oldconfig" won't necessarily produce a working new kernel
-for you, so you may find that you need to see what NEW kernel
-symbols have been introduced.
-
-To see a list of new config symbols, use
-
-	cp user/some/old.config .config
-	make listnewconfig
-
-and the config program will list any new symbols, one per line.
-
-Alternatively, you can use the brute force method:
-
-	make oldconfig
-	scripts/diffconfig .config.old .config | less
-
-______________________________________________________________________
-Environment variables for '*config'
-
-KCONFIG_CONFIG
---------------------------------------------------
-This environment variable can be used to specify a default kernel config
-file name to override the default name of ".config".
-
-KCONFIG_OVERWRITECONFIG
---------------------------------------------------
-If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not
-break symlinks when .config is a symlink to somewhere else.
-
-CONFIG_
---------------------------------------------------
-If you set CONFIG_ in the environment, Kconfig will prefix all symbols
-with its value when saving the configuration, instead of using the default,
-"CONFIG_".
-
-______________________________________________________________________
-Environment variables for '{allyes/allmod/allno/rand}config'
-
-KCONFIG_ALLCONFIG
---------------------------------------------------
-(partially based on lkml email from/by Rob Landley, re: miniconfig)
---------------------------------------------------
-The allyesconfig/allmodconfig/allnoconfig/randconfig variants can also
-use the environment variable KCONFIG_ALLCONFIG as a flag or a filename
-that contains config symbols that the user requires to be set to a
-specific value.  If KCONFIG_ALLCONFIG is used without a filename where
-KCONFIG_ALLCONFIG == "" or KCONFIG_ALLCONFIG == "1", "make *config"
-checks for a file named "all{yes/mod/no/def/random}.config"
-(corresponding to the *config command that was used) for symbol values
-that are to be forced.  If this file is not found, it checks for a
-file named "all.config" to contain forced values.
-
-This enables you to create "miniature" config (miniconfig) or custom
-config files containing just the config symbols that you are interested
-in.  Then the kernel config system generates the full .config file,
-including symbols of your miniconfig file.
-
-This 'KCONFIG_ALLCONFIG' file is a config file which contains
-(usually a subset of all) preset config symbols.  These variable
-settings are still subject to normal dependency checks.
-
-Examples:
-	KCONFIG_ALLCONFIG=custom-notebook.config make allnoconfig
-or
-	KCONFIG_ALLCONFIG=mini.config make allnoconfig
-or
-	make KCONFIG_ALLCONFIG=mini.config allnoconfig
-
-These examples will disable most options (allnoconfig) but enable or
-disable the options that are explicitly listed in the specified
-mini-config files.
-
-______________________________________________________________________
-Environment variables for 'randconfig'
-
-KCONFIG_SEED
---------------------------------------------------
-You can set this to the integer value used to seed the RNG, if you want
-to somehow debug the behaviour of the kconfig parser/frontends.
-If not set, the current time will be used.
-
-KCONFIG_PROBABILITY
---------------------------------------------------
-This variable can be used to skew the probabilities. This variable can
-be unset or empty, or set to three different formats:
-	KCONFIG_PROBABILITY     y:n split           y:m:n split
-	-----------------------------------------------------------------
-	unset or empty          50  : 50            33  : 33  : 34
-	N                        N  : 100-N         N/2 : N/2 : 100-N
-    [1] N:M                     N+M : 100-(N+M)      N  :  M  : 100-(N+M)
-    [2] N:M:L                    N  : 100-N          M  :  L  : 100-(M+L)
-
-where N, M and L are integers (in base 10) in the range [0,100], and so
-that:
-    [1] N+M is in the range [0,100]
-    [2] M+L is in the range [0,100]
-
-Examples:
-	KCONFIG_PROBABILITY=10
-		10% of booleans will be set to 'y', 90% to 'n'
-		5% of tristates will be set to 'y', 5% to 'm', 90% to 'n'
-	KCONFIG_PROBABILITY=15:25
-		40% of booleans will be set to 'y', 60% to 'n'
-		15% of tristates will be set to 'y', 25% to 'm', 60% to 'n'
-	KCONFIG_PROBABILITY=10:15:15
-		10% of booleans will be set to 'y', 90% to 'n'
-		15% of tristates will be set to 'y', 15% to 'm', 70% to 'n'
-
-______________________________________________________________________
-Environment variables for 'syncconfig'
-
-KCONFIG_NOSILENTUPDATE
---------------------------------------------------
-If this variable has a non-blank value, it prevents silent kernel
-config updates (requires explicit updates).
-
-KCONFIG_AUTOCONFIG
---------------------------------------------------
-This environment variable can be set to specify the path & name of the
-"auto.conf" file.  Its default value is "include/config/auto.conf".
-
-KCONFIG_TRISTATE
---------------------------------------------------
-This environment variable can be set to specify the path & name of the
-"tristate.conf" file.  Its default value is "include/config/tristate.conf".
-
-KCONFIG_AUTOHEADER
---------------------------------------------------
-This environment variable can be set to specify the path & name of the
-"autoconf.h" (header) file.
-Its default value is "include/generated/autoconf.h".
-
-
-======================================================================
-menuconfig
---------------------------------------------------
-
-SEARCHING for CONFIG symbols
-
-Searching in menuconfig:
-
-	The Search function searches for kernel configuration symbol
-	names, so you have to know something close to what you are
-	looking for.
-
-	Example:
-		/hotplug
-		This lists all config symbols that contain "hotplug",
-		e.g., HOTPLUG_CPU, MEMORY_HOTPLUG.
-
-	For search help, enter / followed by TAB-TAB (to highlight
-	<Help>) and Enter.  This will tell you that you can also use
-	regular expressions (regexes) in the search string, so if you
-	are not interested in MEMORY_HOTPLUG, you could try
-
-		/^hotplug
-
-	When searching, symbols are sorted thus:
-	  - first, exact matches, sorted alphabetically (an exact match
-	    is when the search matches the complete symbol name);
-	  - then, other matches, sorted alphabetically.
-	For example: ^ATH.K matches:
-	    ATH5K ATH9K ATH5K_AHB ATH5K_DEBUG [...] ATH6KL ATH6KL_DEBUG
-	    [...] ATH9K_AHB ATH9K_BTCOEX_SUPPORT ATH9K_COMMON [...]
-	of which only ATH5K and ATH9K match exactly and so are sorted
-	first (and in alphabetical order), then come all other symbols,
-	sorted in alphabetical order.
-
-______________________________________________________________________
-User interface options for 'menuconfig'
-
-MENUCONFIG_COLOR
---------------------------------------------------
-It is possible to select different color themes using the variable
-MENUCONFIG_COLOR.  To select a theme use:
-
-	make MENUCONFIG_COLOR=<theme> menuconfig
-
-Available themes are:
-  mono       => selects colors suitable for monochrome displays
-  blackbg    => selects a color scheme with black background
-  classic    => theme with blue background. The classic look
-  bluetitle  => a LCD friendly version of classic. (default)
-
-MENUCONFIG_MODE
---------------------------------------------------
-This mode shows all sub-menus in one large tree.
-
-Example:
-	make MENUCONFIG_MODE=single_menu menuconfig
-
-
-======================================================================
-nconfig
---------------------------------------------------
-
-nconfig is an alternate text-based configurator.  It lists function
-keys across the bottom of the terminal (window) that execute commands.
-You can also just use the corresponding numeric key to execute the
-commands unless you are in a data entry window.  E.g., instead of F6
-for Save, you can just press 6.
-
-Use F1 for Global help or F3 for the Short help menu.
-
-Searching in nconfig:
-
-	You can search either in the menu entry "prompt" strings
-	or in the configuration symbols.
-
-	Use / to begin a search through the menu entries.  This does
-	not support regular expressions.  Use <Down> or <Up> for
-	Next hit and Previous hit, respectively.  Use <Esc> to
-	terminate the search mode.
-
-	F8 (SymSearch) searches the configuration symbols for the
-	given string or regular expression (regex).
-
-NCONFIG_MODE
---------------------------------------------------
-This mode shows all sub-menus in one large tree.
-
-Example:
-	make NCONFIG_MODE=single_menu nconfig
-
-
-======================================================================
-xconfig
---------------------------------------------------
-
-Searching in xconfig:
-
-	The Search function searches for kernel configuration symbol
-	names, so you have to know something close to what you are
-	looking for.
-
-	Example:
-		Ctrl-F hotplug
-	or
-		Menu: File, Search, hotplug
-
-	lists all config symbol entries that contain "hotplug" in
-	the symbol name.  In this Search dialog, you may change the
-	config setting for any of the entries that are not grayed out.
-	You can also enter a different search string without having
-	to return to the main menu.
-
-
-======================================================================
-gconfig
---------------------------------------------------
-
-Searching in gconfig:
-
-	There is no search command in gconfig.  However, gconfig does
-	have several different viewing choices, modes, and options.
-
-###
diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst
new file mode 100644
index 000000000000..f4f0f7ffde2b
--- /dev/null
+++ b/Documentation/kbuild/makefiles.rst
@@ -0,0 +1,1522 @@
+======================
+Linux Kernel Makefiles
+======================
+
+This document describes the Linux kernel Makefiles.
+
+.. Table of Contents
+
+	=== 1 Overview
+	=== 2 Who does what
+	=== 3 The kbuild files
+	   --- 3.1 Goal definitions
+	   --- 3.2 Built-in object goals - obj-y
+	   --- 3.3 Loadable module goals - obj-m
+	   --- 3.4 Objects which export symbols
+	   --- 3.5 Library file goals - lib-y
+	   --- 3.6 Descending down in directories
+	   --- 3.7 Compilation flags
+	   --- 3.8 Command line dependency
+	   --- 3.9 Dependency tracking
+	   --- 3.10 Special Rules
+	   --- 3.11 $(CC) support functions
+	   --- 3.12 $(LD) support functions
+
+	=== 4 Host Program support
+	   --- 4.1 Simple Host Program
+	   --- 4.2 Composite Host Programs
+	   --- 4.3 Using C++ for host programs
+	   --- 4.4 Controlling compiler options for host programs
+	   --- 4.5 When host programs are actually built
+	   --- 4.6 Using hostprogs-$(CONFIG_FOO)
+
+	=== 5 Kbuild clean infrastructure
+
+	=== 6 Architecture Makefiles
+	   --- 6.1 Set variables to tweak the build to the architecture
+	   --- 6.2 Add prerequisites to archheaders:
+	   --- 6.3 Add prerequisites to archprepare:
+	   --- 6.4 List directories to visit when descending
+	   --- 6.5 Architecture-specific boot images
+	   --- 6.6 Building non-kbuild targets
+	   --- 6.7 Commands useful for building a boot image
+	   --- 6.8 Custom kbuild commands
+	   --- 6.9 Preprocessing linker scripts
+	   --- 6.10 Generic header files
+	   --- 6.11 Post-link pass
+
+	=== 7 Kbuild syntax for exported headers
+		--- 7.1 no-export-headers
+		--- 7.2 generic-y
+		--- 7.3 generated-y
+		--- 7.4 mandatory-y
+
+	=== 8 Kbuild Variables
+	=== 9 Makefile language
+	=== 10 Credits
+	=== 11 TODO
+
+1 Overview
+==========
+
+The Makefiles have five parts::
+
+	Makefile		the top Makefile.
+	.config			the kernel configuration file.
+	arch/$(ARCH)/Makefile	the arch Makefile.
+	scripts/Makefile.*	common rules etc. for all kbuild Makefiles.
+	kbuild Makefiles	there are about 500 of these.
+
+The top Makefile reads the .config file, which comes from the kernel
+configuration process.
+
+The top Makefile is responsible for building two major products: vmlinux
+(the resident kernel image) and modules (any module files).
+It builds these goals by recursively descending into the subdirectories of
+the kernel source tree.
+The list of subdirectories which are visited depends upon the kernel
+configuration. The top Makefile textually includes an arch Makefile
+with the name arch/$(ARCH)/Makefile. The arch Makefile supplies
+architecture-specific information to the top Makefile.
+
+Each subdirectory has a kbuild Makefile which carries out the commands
+passed down from above. The kbuild Makefile uses information from the
+.config file to construct various file lists used by kbuild to build
+any built-in or modular targets.
+
+scripts/Makefile.* contains all the definitions/rules etc. that
+are used to build the kernel based on the kbuild makefiles.
+
+
+2 Who does what
+===============
+
+People have four different relationships with the kernel Makefiles.
+
+*Users* are people who build kernels.  These people type commands such as
+"make menuconfig" or "make".  They usually do not read or edit
+any kernel Makefiles (or any other source files).
+
+*Normal developers* are people who work on features such as device
+drivers, file systems, and network protocols.  These people need to
+maintain the kbuild Makefiles for the subsystem they are
+working on.  In order to do this effectively, they need some overall
+knowledge about the kernel Makefiles, plus detailed knowledge about the
+public interface for kbuild.
+
+*Arch developers* are people who work on an entire architecture, such
+as sparc or ia64.  Arch developers need to know about the arch Makefile
+as well as kbuild Makefiles.
+
+*Kbuild developers* are people who work on the kernel build system itself.
+These people need to know about all aspects of the kernel Makefiles.
+
+This document is aimed towards normal developers and arch developers.
+
+
+3 The kbuild files
+==================
+
+Most Makefiles within the kernel are kbuild Makefiles that use the
+kbuild infrastructure. This chapter introduces the syntax used in the
+kbuild makefiles.
+The preferred name for the kbuild files are 'Makefile' but 'Kbuild' can
+be used and if both a 'Makefile' and a 'Kbuild' file exists, then the 'Kbuild'
+file will be used.
+
+Section 3.1 "Goal definitions" is a quick intro, further chapters provide
+more details, with real examples.
+
+3.1 Goal definitions
+--------------------
+
+	Goal definitions are the main part (heart) of the kbuild Makefile.
+	These lines define the files to be built, any special compilation
+	options, and any subdirectories to be entered recursively.
+
+	The most simple kbuild makefile contains one line:
+
+	Example::
+
+		obj-y += foo.o
+
+	This tells kbuild that there is one object in that directory, named
+	foo.o. foo.o will be built from foo.c or foo.S.
+
+	If foo.o shall be built as a module, the variable obj-m is used.
+	Therefore the following pattern is often used:
+
+	Example::
+
+		obj-$(CONFIG_FOO) += foo.o
+
+	$(CONFIG_FOO) evaluates to either y (for built-in) or m (for module).
+	If CONFIG_FOO is neither y nor m, then the file will not be compiled
+	nor linked.
+
+3.2 Built-in object goals - obj-y
+---------------------------------
+
+	The kbuild Makefile specifies object files for vmlinux
+	in the $(obj-y) lists.  These lists depend on the kernel
+	configuration.
+
+	Kbuild compiles all the $(obj-y) files.  It then calls
+	"$(AR) rcSTP" to merge these files into one built-in.a file.
+	This is a thin archive without a symbol table. It will be later
+	linked into vmlinux by scripts/link-vmlinux.sh
+
+	The order of files in $(obj-y) is significant.  Duplicates in
+	the lists are allowed: the first instance will be linked into
+	built-in.a and succeeding instances will be ignored.
+
+	Link order is significant, because certain functions
+	(module_init() / __initcall) will be called during boot in the
+	order they appear. So keep in mind that changing the link
+	order may e.g. change the order in which your SCSI
+	controllers are detected, and thus your disks are renumbered.
+
+	Example::
+
+		#drivers/isdn/i4l/Makefile
+		# Makefile for the kernel ISDN subsystem and device drivers.
+		# Each configuration option enables a list of files.
+		obj-$(CONFIG_ISDN_I4L)         += isdn.o
+		obj-$(CONFIG_ISDN_PPP_BSDCOMP) += isdn_bsdcomp.o
+
+3.3 Loadable module goals - obj-m
+---------------------------------
+
+	$(obj-m) specifies object files which are built as loadable
+	kernel modules.
+
+	A module may be built from one source file or several source
+	files. In the case of one source file, the kbuild makefile
+	simply adds the file to $(obj-m).
+
+	Example::
+
+		#drivers/isdn/i4l/Makefile
+		obj-$(CONFIG_ISDN_PPP_BSDCOMP) += isdn_bsdcomp.o
+
+	Note: In this example $(CONFIG_ISDN_PPP_BSDCOMP) evaluates to 'm'
+
+	If a kernel module is built from several source files, you specify
+	that you want to build a module in the same way as above; however,
+	kbuild needs to know which object files you want to build your
+	module from, so you have to tell it by setting a $(<module_name>-y)
+	variable.
+
+	Example::
+
+		#drivers/isdn/i4l/Makefile
+		obj-$(CONFIG_ISDN_I4L) += isdn.o
+		isdn-y := isdn_net_lib.o isdn_v110.o isdn_common.o
+
+	In this example, the module name will be isdn.o. Kbuild will
+	compile the objects listed in $(isdn-y) and then run
+	"$(LD) -r" on the list of these files to generate isdn.o.
+
+	Due to kbuild recognizing $(<module_name>-y) for composite objects,
+	you can use the value of a `CONFIG_` symbol to optionally include an
+	object file as part of a composite object.
+
+	Example::
+
+		#fs/ext2/Makefile
+	        obj-$(CONFIG_EXT2_FS) += ext2.o
+		ext2-y := balloc.o dir.o file.o ialloc.o inode.o ioctl.o \
+			  namei.o super.o symlink.o
+	        ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o \
+						xattr_trusted.o
+
+	In this example, xattr.o, xattr_user.o and xattr_trusted.o are only
+	part of the composite object ext2.o if $(CONFIG_EXT2_FS_XATTR)
+	evaluates to 'y'.
+
+	Note: Of course, when you are building objects into the kernel,
+	the syntax above will also work. So, if you have CONFIG_EXT2_FS=y,
+	kbuild will build an ext2.o file for you out of the individual
+	parts and then link this into built-in.a, as you would expect.
+
+3.4 Objects which export symbols
+--------------------------------
+
+	No special notation is required in the makefiles for
+	modules exporting symbols.
+
+3.5 Library file goals - lib-y
+------------------------------
+
+	Objects listed with obj-* are used for modules, or
+	combined in a built-in.a for that specific directory.
+	There is also the possibility to list objects that will
+	be included in a library, lib.a.
+	All objects listed with lib-y are combined in a single
+	library for that directory.
+	Objects that are listed in obj-y and additionally listed in
+	lib-y will not be included in the library, since they will
+	be accessible anyway.
+	For consistency, objects listed in lib-m will be included in lib.a.
+
+	Note that the same kbuild makefile may list files to be built-in
+	and to be part of a library. Therefore the same directory
+	may contain both a built-in.a and a lib.a file.
+
+	Example::
+
+		#arch/x86/lib/Makefile
+		lib-y    := delay.o
+
+	This will create a library lib.a based on delay.o. For kbuild to
+	actually recognize that there is a lib.a being built, the directory
+	shall be listed in libs-y.
+
+	See also "6.4 List directories to visit when descending".
+
+	Use of lib-y is normally restricted to `lib/` and `arch/*/lib`.
+
+3.6 Descending down in directories
+----------------------------------
+
+	A Makefile is only responsible for building objects in its own
+	directory. Files in subdirectories should be taken care of by
+	Makefiles in these subdirs. The build system will automatically
+	invoke make recursively in subdirectories, provided you let it know of
+	them.
+
+	To do so, obj-y and obj-m are used.
+	ext2 lives in a separate directory, and the Makefile present in fs/
+	tells kbuild to descend down using the following assignment.
+
+	Example::
+
+		#fs/Makefile
+		obj-$(CONFIG_EXT2_FS) += ext2/
+
+	If CONFIG_EXT2_FS is set to either 'y' (built-in) or 'm' (modular)
+	the corresponding obj- variable will be set, and kbuild will descend
+	down in the ext2 directory.
+	Kbuild only uses this information to decide that it needs to visit
+	the directory, it is the Makefile in the subdirectory that
+	specifies what is modular and what is built-in.
+
+	It is good practice to use a `CONFIG_` variable when assigning directory
+	names. This allows kbuild to totally skip the directory if the
+	corresponding `CONFIG_` option is neither 'y' nor 'm'.
+
+3.7 Compilation flags
+---------------------
+
+    ccflags-y, asflags-y and ldflags-y
+	These three flags apply only to the kbuild makefile in which they
+	are assigned. They are used for all the normal cc, as and ld
+	invocations happening during a recursive build.
+	Note: Flags with the same behaviour were previously named:
+	EXTRA_CFLAGS, EXTRA_AFLAGS and EXTRA_LDFLAGS.
+	They are still supported but their usage is deprecated.
+
+	ccflags-y specifies options for compiling with $(CC).
+
+	Example::
+
+		# drivers/acpi/acpica/Makefile
+		ccflags-y			:= -Os -D_LINUX -DBUILDING_ACPICA
+		ccflags-$(CONFIG_ACPI_DEBUG)	+= -DACPI_DEBUG_OUTPUT
+
+	This variable is necessary because the top Makefile owns the
+	variable $(KBUILD_CFLAGS) and uses it for compilation flags for the
+	entire tree.
+
+	asflags-y specifies assembler options.
+
+	Example::
+
+		#arch/sparc/kernel/Makefile
+		asflags-y := -ansi
+
+	ldflags-y specifies options for linking with $(LD).
+
+	Example::
+
+		#arch/cris/boot/compressed/Makefile
+		ldflags-y += -T $(srctree)/$(src)/decompress_$(arch-y).lds
+
+    subdir-ccflags-y, subdir-asflags-y
+	The two flags listed above are similar to ccflags-y and asflags-y.
+	The difference is that the subdir- variants have effect for the kbuild
+	file where they are present and all subdirectories.
+	Options specified using subdir-* are added to the commandline before
+	the options specified using the non-subdir variants.
+
+	Example::
+
+		subdir-ccflags-y := -Werror
+
+    CFLAGS_$@, AFLAGS_$@
+	CFLAGS_$@ and AFLAGS_$@ only apply to commands in current
+	kbuild makefile.
+
+	$(CFLAGS_$@) specifies per-file options for $(CC).  The $@
+	part has a literal value which specifies the file that it is for.
+
+	Example::
+
+		# drivers/scsi/Makefile
+		CFLAGS_aha152x.o =   -DAHA152X_STAT -DAUTOCONF
+		CFLAGS_gdth.o    = # -DDEBUG_GDTH=2 -D__SERIAL__ -D__COM2__ \
+				     -DGDTH_STATISTICS
+
+	These two lines specify compilation flags for aha152x.o and gdth.o.
+
+	$(AFLAGS_$@) is a similar feature for source files in assembly
+	languages.
+
+	Example::
+
+		# arch/arm/kernel/Makefile
+		AFLAGS_head.o        := -DTEXT_OFFSET=$(TEXT_OFFSET)
+		AFLAGS_crunch-bits.o := -Wa,-mcpu=ep9312
+		AFLAGS_iwmmxt.o      := -Wa,-mcpu=iwmmxt
+
+
+3.9 Dependency tracking
+-----------------------
+
+	Kbuild tracks dependencies on the following:
+
+	1) All prerequisite files (both `*.c` and `*.h`)
+	2) `CONFIG_` options used in all prerequisite files
+	3) Command-line used to compile target
+
+	Thus, if you change an option to $(CC) all affected files will
+	be re-compiled.
+
+3.10 Special Rules
+------------------
+
+	Special rules are used when the kbuild infrastructure does
+	not provide the required support. A typical example is
+	header files generated during the build process.
+	Another example are the architecture-specific Makefiles which
+	need special rules to prepare boot images etc.
+
+	Special rules are written as normal Make rules.
+	Kbuild is not executing in the directory where the Makefile is
+	located, so all special rules shall provide a relative
+	path to prerequisite files and target files.
+
+	Two variables are used when defining special rules:
+
+	$(src)
+	    $(src) is a relative path which points to the directory
+	    where the Makefile is located. Always use $(src) when
+	    referring to files located in the src tree.
+
+	$(obj)
+	    $(obj) is a relative path which points to the directory
+	    where the target is saved. Always use $(obj) when
+	    referring to generated files.
+
+	    Example::
+
+		#drivers/scsi/Makefile
+		$(obj)/53c8xx_d.h: $(src)/53c7,8xx.scr $(src)/script_asm.pl
+			$(CPP) -DCHIP=810 - < $< | ... $(src)/script_asm.pl
+
+	    This is a special rule, following the normal syntax
+	    required by make.
+
+	    The target file depends on two prerequisite files. References
+	    to the target file are prefixed with $(obj), references
+	    to prerequisites are referenced with $(src) (because they are not
+	    generated files).
+
+	$(kecho)
+	    echoing information to user in a rule is often a good practice
+	    but when execution "make -s" one does not expect to see any output
+	    except for warnings/errors.
+	    To support this kbuild defines $(kecho) which will echo out the
+	    text following $(kecho) to stdout except if "make -s" is used.
+
+	Example::
+
+		#arch/blackfin/boot/Makefile
+		$(obj)/vmImage: $(obj)/vmlinux.gz
+			$(call if_changed,uimage)
+			@$(kecho) 'Kernel: $@ is ready'
+
+
+3.11 $(CC) support functions
+----------------------------
+
+	The kernel may be built with several different versions of
+	$(CC), each supporting a unique set of features and options.
+	kbuild provides basic support to check for valid options for $(CC).
+	$(CC) is usually the gcc compiler, but other alternatives are
+	available.
+
+    as-option
+	as-option is used to check if $(CC) -- when used to compile
+	assembler (`*.S`) files -- supports the given option. An optional
+	second option may be specified if the first option is not supported.
+
+	Example::
+
+		#arch/sh/Makefile
+		cflags-y += $(call as-option,-Wa$(comma)-isa=$(isa-y),)
+
+	In the above example, cflags-y will be assigned the option
+	-Wa$(comma)-isa=$(isa-y) if it is supported by $(CC).
+	The second argument is optional, and if supplied will be used
+	if first argument is not supported.
+
+    cc-ldoption
+	cc-ldoption is used to check if $(CC) when used to link object files
+	supports the given option.  An optional second option may be
+	specified if first option are not supported.
+
+	Example::
+
+		#arch/x86/kernel/Makefile
+		vsyscall-flags += $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+
+	In the above example, vsyscall-flags will be assigned the option
+	-Wl$(comma)--hash-style=sysv if it is supported by $(CC).
+	The second argument is optional, and if supplied will be used
+	if first argument is not supported.
+
+    as-instr
+	as-instr checks if the assembler reports a specific instruction
+	and then outputs either option1 or option2
+	C escapes are supported in the test instruction
+	Note: as-instr-option uses KBUILD_AFLAGS for assembler options
+
+    cc-option
+	cc-option is used to check if $(CC) supports a given option, and if
+	not supported to use an optional second option.
+
+	Example::
+
+		#arch/x86/Makefile
+		cflags-y += $(call cc-option,-march=pentium-mmx,-march=i586)
+
+	In the above example, cflags-y will be assigned the option
+	-march=pentium-mmx if supported by $(CC), otherwise -march=i586.
+	The second argument to cc-option is optional, and if omitted,
+	cflags-y will be assigned no value if first option is not supported.
+	Note: cc-option uses KBUILD_CFLAGS for $(CC) options
+
+   cc-option-yn
+	cc-option-yn is used to check if gcc supports a given option
+	and return 'y' if supported, otherwise 'n'.
+
+	Example::
+
+		#arch/ppc/Makefile
+		biarch := $(call cc-option-yn, -m32)
+		aflags-$(biarch) += -a32
+		cflags-$(biarch) += -m32
+
+	In the above example, $(biarch) is set to y if $(CC) supports the -m32
+	option. When $(biarch) equals 'y', the expanded variables $(aflags-y)
+	and $(cflags-y) will be assigned the values -a32 and -m32,
+	respectively.
+	Note: cc-option-yn uses KBUILD_CFLAGS for $(CC) options
+
+    cc-disable-warning
+	cc-disable-warning checks if gcc supports a given warning and returns
+	the commandline switch to disable it. This special function is needed,
+	because gcc 4.4 and later accept any unknown -Wno-* option and only
+	warn about it if there is another warning in the source file.
+
+	Example::
+
+		KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
+
+	In the above example, -Wno-unused-but-set-variable will be added to
+	KBUILD_CFLAGS only if gcc really accepts it.
+
+    cc-ifversion
+	cc-ifversion tests the version of $(CC) and equals the fourth parameter
+	if version expression is true, or the fifth (if given) if the version
+	expression is false.
+
+	Example::
+
+		#fs/reiserfs/Makefile
+		ccflags-y := $(call cc-ifversion, -lt, 0402, -O1)
+
+	In this example, ccflags-y will be assigned the value -O1 if the
+	$(CC) version is less than 4.2.
+	cc-ifversion takes all the shell operators:
+	-eq, -ne, -lt, -le, -gt, and -ge
+	The third parameter may be a text as in this example, but it may also
+	be an expanded variable or a macro.
+
+    cc-cross-prefix
+	cc-cross-prefix is used to check if there exists a $(CC) in path with
+	one of the listed prefixes. The first prefix where there exist a
+	prefix$(CC) in the PATH is returned - and if no prefix$(CC) is found
+	then nothing is returned.
+	Additional prefixes are separated by a single space in the
+	call of cc-cross-prefix.
+	This functionality is useful for architecture Makefiles that try
+	to set CROSS_COMPILE to well-known values but may have several
+	values to select between.
+	It is recommended only to try to set CROSS_COMPILE if it is a cross
+	build (host arch is different from target arch). And if CROSS_COMPILE
+	is already set then leave it with the old value.
+
+	Example::
+
+		#arch/m68k/Makefile
+		ifneq ($(SUBARCH),$(ARCH))
+		        ifeq ($(CROSS_COMPILE),)
+		               CROSS_COMPILE := $(call cc-cross-prefix, m68k-linux-gnu-)
+			endif
+		endif
+
+3.12 $(LD) support functions
+----------------------------
+
+    ld-option
+	ld-option is used to check if $(LD) supports the supplied option.
+	ld-option takes two options as arguments.
+	The second argument is an optional option that can be used if the
+	first option is not supported by $(LD).
+
+	Example::
+
+		#Makefile
+		LDFLAGS_vmlinux += $(call ld-option, -X)
+
+
+4 Host Program support
+======================
+
+Kbuild supports building executables on the host for use during the
+compilation stage.
+Two steps are required in order to use a host executable.
+
+The first step is to tell kbuild that a host program exists. This is
+done utilising the variable hostprogs-y.
+
+The second step is to add an explicit dependency to the executable.
+This can be done in two ways. Either add the dependency in a rule,
+or utilise the variable $(always).
+Both possibilities are described in the following.
+
+4.1 Simple Host Program
+-----------------------
+
+	In some cases there is a need to compile and run a program on the
+	computer where the build is running.
+	The following line tells kbuild that the program bin2hex shall be
+	built on the build host.
+
+	Example::
+
+		hostprogs-y := bin2hex
+
+	Kbuild assumes in the above example that bin2hex is made from a single
+	c-source file named bin2hex.c located in the same directory as
+	the Makefile.
+
+4.2 Composite Host Programs
+---------------------------
+
+	Host programs can be made up based on composite objects.
+	The syntax used to define composite objects for host programs is
+	similar to the syntax used for kernel objects.
+	$(<executable>-objs) lists all objects used to link the final
+	executable.
+
+	Example::
+
+		#scripts/lxdialog/Makefile
+		hostprogs-y   := lxdialog
+		lxdialog-objs := checklist.o lxdialog.o
+
+	Objects with extension .o are compiled from the corresponding .c
+	files. In the above example, checklist.c is compiled to checklist.o
+	and lxdialog.c is compiled to lxdialog.o.
+
+	Finally, the two .o files are linked to the executable, lxdialog.
+	Note: The syntax <executable>-y is not permitted for host-programs.
+
+4.3 Using C++ for host programs
+-------------------------------
+
+	kbuild offers support for host programs written in C++. This was
+	introduced solely to support kconfig, and is not recommended
+	for general use.
+
+	Example::
+
+		#scripts/kconfig/Makefile
+		hostprogs-y   := qconf
+		qconf-cxxobjs := qconf.o
+
+	In the example above the executable is composed of the C++ file
+	qconf.cc - identified by $(qconf-cxxobjs).
+
+	If qconf is composed of a mixture of .c and .cc files, then an
+	additional line can be used to identify this.
+
+	Example::
+
+		#scripts/kconfig/Makefile
+		hostprogs-y   := qconf
+		qconf-cxxobjs := qconf.o
+		qconf-objs    := check.o
+
+4.4 Controlling compiler options for host programs
+--------------------------------------------------
+
+	When compiling host programs, it is possible to set specific flags.
+	The programs will always be compiled utilising $(HOSTCC) passed
+	the options specified in $(KBUILD_HOSTCFLAGS).
+	To set flags that will take effect for all host programs created
+	in that Makefile, use the variable HOST_EXTRACFLAGS.
+
+	Example::
+
+		#scripts/lxdialog/Makefile
+		HOST_EXTRACFLAGS += -I/usr/include/ncurses
+
+	To set specific flags for a single file the following construction
+	is used:
+
+	Example::
+
+		#arch/ppc64/boot/Makefile
+		HOSTCFLAGS_piggyback.o := -DKERNELBASE=$(KERNELBASE)
+
+	It is also possible to specify additional options to the linker.
+
+	Example::
+
+		#scripts/kconfig/Makefile
+		HOSTLDLIBS_qconf := -L$(QTDIR)/lib
+
+	When linking qconf, it will be passed the extra option
+	"-L$(QTDIR)/lib".
+
+4.5 When host programs are actually built
+-----------------------------------------
+
+	Kbuild will only build host-programs when they are referenced
+	as a prerequisite.
+	This is possible in two ways:
+
+	(1) List the prerequisite explicitly in a special rule.
+
+	Example::
+
+		#drivers/pci/Makefile
+		hostprogs-y := gen-devlist
+		$(obj)/devlist.h: $(src)/pci.ids $(obj)/gen-devlist
+			( cd $(obj); ./gen-devlist ) < $<
+
+	The target $(obj)/devlist.h will not be built before
+	$(obj)/gen-devlist is updated. Note that references to
+	the host programs in special rules must be prefixed with $(obj).
+
+	(2) Use $(always)
+
+	When there is no suitable special rule, and the host program
+	shall be built when a makefile is entered, the $(always)
+	variable shall be used.
+
+	Example::
+
+		#scripts/lxdialog/Makefile
+		hostprogs-y   := lxdialog
+		always        := $(hostprogs-y)
+
+	This will tell kbuild to build lxdialog even if not referenced in
+	any rule.
+
+4.6 Using hostprogs-$(CONFIG_FOO)
+---------------------------------
+
+	A typical pattern in a Kbuild file looks like this:
+
+	Example::
+
+		#scripts/Makefile
+		hostprogs-$(CONFIG_KALLSYMS) += kallsyms
+
+	Kbuild knows about both 'y' for built-in and 'm' for module.
+	So if a config symbol evaluates to 'm', kbuild will still build
+	the binary. In other words, Kbuild handles hostprogs-m exactly
+	like hostprogs-y. But only hostprogs-y is recommended to be used
+	when no CONFIG symbols are involved.
+
+5 Kbuild clean infrastructure
+=============================
+
+"make clean" deletes most generated files in the obj tree where the kernel
+is compiled. This includes generated files such as host programs.
+Kbuild knows targets listed in $(hostprogs-y), $(hostprogs-m), $(always),
+$(extra-y) and $(targets). They are all deleted during "make clean".
+Files matching the patterns "*.[oas]", "*.ko", plus some additional files
+generated by kbuild are deleted all over the kernel src tree when
+"make clean" is executed.
+
+Additional files can be specified in kbuild makefiles by use of $(clean-files).
+
+	Example::
+
+		#lib/Makefile
+		clean-files := crc32table.h
+
+When executing "make clean", the file "crc32table.h" will be deleted.
+Kbuild will assume files to be in the same relative directory as the
+Makefile, except if prefixed with $(objtree).
+
+To delete a directory hierarchy use:
+
+	Example::
+
+		#scripts/package/Makefile
+		clean-dirs := $(objtree)/debian/
+
+This will delete the directory debian in the toplevel directory, including all
+subdirectories.
+
+To exclude certain files from make clean, use the $(no-clean-files) variable.
+This is only a special case used in the top level Kbuild file:
+
+	Example::
+
+		#Kbuild
+		no-clean-files := $(bounds-file) $(offsets-file)
+
+Usually kbuild descends down in subdirectories due to "obj-* := dir/",
+but in the architecture makefiles where the kbuild infrastructure
+is not sufficient this sometimes needs to be explicit.
+
+	Example::
+
+		#arch/x86/boot/Makefile
+		subdir- := compressed/
+
+The above assignment instructs kbuild to descend down in the
+directory compressed/ when "make clean" is executed.
+
+To support the clean infrastructure in the Makefiles that build the
+final bootimage there is an optional target named archclean:
+
+	Example::
+
+		#arch/x86/Makefile
+		archclean:
+			$(Q)$(MAKE) $(clean)=arch/x86/boot
+
+When "make clean" is executed, make will descend down in arch/x86/boot,
+and clean as usual. The Makefile located in arch/x86/boot/ may use
+the subdir- trick to descend further down.
+
+Note 1: arch/$(ARCH)/Makefile cannot use "subdir-", because that file is
+included in the top level makefile, and the kbuild infrastructure
+is not operational at that point.
+
+Note 2: All directories listed in core-y, libs-y, drivers-y and net-y will
+be visited during "make clean".
+
+6 Architecture Makefiles
+========================
+
+The top level Makefile sets up the environment and does the preparation,
+before starting to descend down in the individual directories.
+The top level makefile contains the generic part, whereas
+arch/$(ARCH)/Makefile contains what is required to set up kbuild
+for said architecture.
+To do so, arch/$(ARCH)/Makefile sets up a number of variables and defines
+a few targets.
+
+When kbuild executes, the following steps are followed (roughly):
+
+1) Configuration of the kernel => produce .config
+2) Store kernel version in include/linux/version.h
+3) Updating all other prerequisites to the target prepare:
+   - Additional prerequisites are specified in arch/$(ARCH)/Makefile
+4) Recursively descend down in all directories listed in
+   init-* core* drivers-* net-* libs-* and build all targets.
+   - The values of the above variables are expanded in arch/$(ARCH)/Makefile.
+5) All object files are then linked and the resulting file vmlinux is
+   located at the root of the obj tree.
+   The very first objects linked are listed in head-y, assigned by
+   arch/$(ARCH)/Makefile.
+6) Finally, the architecture-specific part does any required post processing
+   and builds the final bootimage.
+   - This includes building boot records
+   - Preparing initrd images and the like
+
+
+6.1 Set variables to tweak the build to the architecture
+--------------------------------------------------------
+
+    LDFLAGS
+	Generic $(LD) options
+
+	Flags used for all invocations of the linker.
+	Often specifying the emulation is sufficient.
+
+	Example::
+
+		#arch/s390/Makefile
+		LDFLAGS         := -m elf_s390
+
+	Note: ldflags-y can be used to further customise
+	the flags used. See chapter 3.7.
+
+    LDFLAGS_vmlinux
+	Options for $(LD) when linking vmlinux
+
+	LDFLAGS_vmlinux is used to specify additional flags to pass to
+	the linker when linking the final vmlinux image.
+	LDFLAGS_vmlinux uses the LDFLAGS_$@ support.
+
+	Example::
+
+		#arch/x86/Makefile
+		LDFLAGS_vmlinux := -e stext
+
+    OBJCOPYFLAGS
+	objcopy flags
+
+	When $(call if_changed,objcopy) is used to translate a .o file,
+	the flags specified in OBJCOPYFLAGS will be used.
+	$(call if_changed,objcopy) is often used to generate raw binaries on
+	vmlinux.
+
+	Example::
+
+		#arch/s390/Makefile
+		OBJCOPYFLAGS := -O binary
+
+		#arch/s390/boot/Makefile
+		$(obj)/image: vmlinux FORCE
+			$(call if_changed,objcopy)
+
+	In this example, the binary $(obj)/image is a binary version of
+	vmlinux. The usage of $(call if_changed,xxx) will be described later.
+
+    KBUILD_AFLAGS
+	Assembler flags
+
+	Default value - see top level Makefile
+	Append or modify as required per architecture.
+
+	Example::
+
+		#arch/sparc64/Makefile
+		KBUILD_AFLAGS += -m64 -mcpu=ultrasparc
+
+    KBUILD_CFLAGS
+	$(CC) compiler flags
+
+	Default value - see top level Makefile
+	Append or modify as required per architecture.
+
+	Often, the KBUILD_CFLAGS variable depends on the configuration.
+
+	Example::
+
+		#arch/x86/boot/compressed/Makefile
+		cflags-$(CONFIG_X86_32) := -march=i386
+		cflags-$(CONFIG_X86_64) := -mcmodel=small
+		KBUILD_CFLAGS += $(cflags-y)
+
+	Many arch Makefiles dynamically run the target C compiler to
+	probe supported options::
+
+		#arch/x86/Makefile
+
+		...
+		cflags-$(CONFIG_MPENTIUMII)     += $(call cc-option,\
+						-march=pentium2,-march=i686)
+		...
+		# Disable unit-at-a-time mode ...
+		KBUILD_CFLAGS += $(call cc-option,-fno-unit-at-a-time)
+		...
+
+
+	The first example utilises the trick that a config option expands
+	to 'y' when selected.
+
+    KBUILD_AFLAGS_KERNEL
+	Assembler options specific for built-in
+
+	$(KBUILD_AFLAGS_KERNEL) contains extra C compiler flags used to compile
+	resident kernel code.
+
+    KBUILD_AFLAGS_MODULE
+	Assembler options specific for modules
+
+	$(KBUILD_AFLAGS_MODULE) is used to add arch-specific options that
+	are used for assembler.
+
+	From commandline AFLAGS_MODULE shall be used (see kbuild.txt).
+
+    KBUILD_CFLAGS_KERNEL
+	$(CC) options specific for built-in
+
+	$(KBUILD_CFLAGS_KERNEL) contains extra C compiler flags used to compile
+	resident kernel code.
+
+    KBUILD_CFLAGS_MODULE
+	Options for $(CC) when building modules
+
+	$(KBUILD_CFLAGS_MODULE) is used to add arch-specific options that
+	are used for $(CC).
+	From commandline CFLAGS_MODULE shall be used (see kbuild.txt).
+
+    KBUILD_LDFLAGS_MODULE
+	Options for $(LD) when linking modules
+
+	$(KBUILD_LDFLAGS_MODULE) is used to add arch-specific options
+	used when linking modules. This is often a linker script.
+
+	From commandline LDFLAGS_MODULE shall be used (see kbuild.txt).
+
+    KBUILD_ARFLAGS   Options for $(AR) when creating archives
+
+	$(KBUILD_ARFLAGS) set by the top level Makefile to "D" (deterministic
+	mode) if this option is supported by $(AR).
+
+    ARCH_CPPFLAGS, ARCH_AFLAGS, ARCH_CFLAGS   Overrides the kbuild defaults
+
+	These variables are appended to the KBUILD_CPPFLAGS,
+	KBUILD_AFLAGS, and KBUILD_CFLAGS, respectively, after the
+	top-level Makefile has set any other flags. This provides a
+	means for an architecture to override the defaults.
+
+
+6.2 Add prerequisites to archheaders
+------------------------------------
+
+	The archheaders: rule is used to generate header files that
+	may be installed into user space by "make header_install".
+
+	It is run before "make archprepare" when run on the
+	architecture itself.
+
+
+6.3 Add prerequisites to archprepare
+------------------------------------
+
+	The archprepare: rule is used to list prerequisites that need to be
+	built before starting to descend down in the subdirectories.
+	This is usually used for header files containing assembler constants.
+
+	Example::
+
+		#arch/arm/Makefile
+		archprepare: maketools
+
+	In this example, the file target maketools will be processed
+	before descending down in the subdirectories.
+	See also chapter XXX-TODO that describe how kbuild supports
+	generating offset header files.
+
+
+6.4 List directories to visit when descending
+---------------------------------------------
+
+	An arch Makefile cooperates with the top Makefile to define variables
+	which specify how to build the vmlinux file.  Note that there is no
+	corresponding arch-specific section for modules; the module-building
+	machinery is all architecture-independent.
+
+
+	head-y, init-y, core-y, libs-y, drivers-y, net-y
+	    $(head-y) lists objects to be linked first in vmlinux.
+
+	    $(libs-y) lists directories where a lib.a archive can be located.
+
+	    The rest list directories where a built-in.a object file can be
+	    located.
+
+	    $(init-y) objects will be located after $(head-y).
+
+	    Then the rest follows in this order:
+
+		$(core-y), $(libs-y), $(drivers-y) and $(net-y).
+
+	    The top level Makefile defines values for all generic directories,
+	    and arch/$(ARCH)/Makefile only adds architecture-specific
+	    directories.
+
+	    Example::
+
+		#arch/sparc64/Makefile
+		core-y += arch/sparc64/kernel/
+		libs-y += arch/sparc64/prom/ arch/sparc64/lib/
+		drivers-$(CONFIG_OPROFILE)  += arch/sparc64/oprofile/
+
+
+6.5 Architecture-specific boot images
+-------------------------------------
+
+	An arch Makefile specifies goals that take the vmlinux file, compress
+	it, wrap it in bootstrapping code, and copy the resulting files
+	somewhere. This includes various kinds of installation commands.
+	The actual goals are not standardized across architectures.
+
+	It is common to locate any additional processing in a boot/
+	directory below arch/$(ARCH)/.
+
+	Kbuild does not provide any smart way to support building a
+	target specified in boot/. Therefore arch/$(ARCH)/Makefile shall
+	call make manually to build a target in boot/.
+
+	The recommended approach is to include shortcuts in
+	arch/$(ARCH)/Makefile, and use the full path when calling down
+	into the arch/$(ARCH)/boot/Makefile.
+
+	Example::
+
+		#arch/x86/Makefile
+		boot := arch/x86/boot
+		bzImage: vmlinux
+			$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
+
+	"$(Q)$(MAKE) $(build)=<dir>" is the recommended way to invoke
+	make in a subdirectory.
+
+	There are no rules for naming architecture-specific targets,
+	but executing "make help" will list all relevant targets.
+	To support this, $(archhelp) must be defined.
+
+	Example::
+
+		#arch/x86/Makefile
+		define archhelp
+		  echo  '* bzImage      - Image (arch/$(ARCH)/boot/bzImage)'
+		endif
+
+	When make is executed without arguments, the first goal encountered
+	will be built. In the top level Makefile the first goal present
+	is all:.
+	An architecture shall always, per default, build a bootable image.
+	In "make help", the default goal is highlighted with a '*'.
+	Add a new prerequisite to all: to select a default goal different
+	from vmlinux.
+
+	Example::
+
+		#arch/x86/Makefile
+		all: bzImage
+
+	When "make" is executed without arguments, bzImage will be built.
+
+6.6 Building non-kbuild targets
+-------------------------------
+
+    extra-y
+	extra-y specifies additional targets created in the current
+	directory, in addition to any targets specified by `obj-*`.
+
+	Listing all targets in extra-y is required for two purposes:
+
+	1) Enable kbuild to check changes in command lines
+
+	   - When $(call if_changed,xxx) is used
+
+	2) kbuild knows what files to delete during "make clean"
+
+	Example::
+
+		#arch/x86/kernel/Makefile
+		extra-y := head.o init_task.o
+
+	In this example, extra-y is used to list object files that
+	shall be built, but shall not be linked as part of built-in.a.
+
+    header-test-y
+
+	header-test-y specifies headers (*.h) in the current directory that
+	should be compile tested to ensure they are self-contained,
+	i.e. compilable as standalone units. If CONFIG_HEADER_TEST is enabled,
+	this builds them as part of extra-y.
+
+    header-test-pattern-y
+
+	This works as a weaker version of header-test-y, and accepts wildcard
+	patterns. The typical usage is:
+
+		  header-test-pattern-y += *.h
+
+	This specifies all the files that matches to '*.h' in the current
+	directory, but the files in 'header-test-' are excluded.
+
+6.7 Commands useful for building a boot image
+---------------------------------------------
+
+    Kbuild provides a few macros that are useful when building a
+    boot image.
+
+    if_changed
+	if_changed is the infrastructure used for the following commands.
+
+	Usage::
+
+		target: source(s) FORCE
+			$(call if_changed,ld/objcopy/gzip/...)
+
+	When the rule is evaluated, it is checked to see if any files
+	need an update, or the command line has changed since the last
+	invocation. The latter will force a rebuild if any options
+	to the executable have changed.
+	Any target that utilises if_changed must be listed in $(targets),
+	otherwise the command line check will fail, and the target will
+	always be built.
+	Assignments to $(targets) are without $(obj)/ prefix.
+	if_changed may be used in conjunction with custom commands as
+	defined in 6.8 "Custom kbuild commands".
+
+	Note: It is a typical mistake to forget the FORCE prerequisite.
+	Another common pitfall is that whitespace is sometimes
+	significant; for instance, the below will fail (note the extra space
+	after the comma)::
+
+		target: source(s) FORCE
+
+	**WRONG!**	$(call if_changed, ld/objcopy/gzip/...)
+
+        Note:
+	      if_changed should not be used more than once per target.
+              It stores the executed command in a corresponding .cmd
+
+        file and multiple calls would result in overwrites and
+        unwanted results when the target is up to date and only the
+        tests on changed commands trigger execution of commands.
+
+    ld
+	Link target. Often, LDFLAGS_$@ is used to set specific options to ld.
+
+	Example::
+
+		#arch/x86/boot/Makefile
+		LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary
+		LDFLAGS_setup    := -Ttext 0x0 -s --oformat binary -e begtext
+
+		targets += setup setup.o bootsect bootsect.o
+		$(obj)/setup $(obj)/bootsect: %: %.o FORCE
+			$(call if_changed,ld)
+
+	In this example, there are two possible targets, requiring different
+	options to the linker. The linker options are specified using the
+	LDFLAGS_$@ syntax - one for each potential target.
+	$(targets) are assigned all potential targets, by which kbuild knows
+	the targets and will:
+
+		1) check for commandline changes
+		2) delete target during make clean
+
+	The ": %: %.o" part of the prerequisite is a shorthand that
+	frees us from listing the setup.o and bootsect.o files.
+
+	Note:
+	      It is a common mistake to forget the "targets :=" assignment,
+	      resulting in the target file being recompiled for no
+	      obvious reason.
+
+    objcopy
+	Copy binary. Uses OBJCOPYFLAGS usually specified in
+	arch/$(ARCH)/Makefile.
+	OBJCOPYFLAGS_$@ may be used to set additional options.
+
+    gzip
+	Compress target. Use maximum compression to compress target.
+
+	Example::
+
+		#arch/x86/boot/compressed/Makefile
+		$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
+			$(call if_changed,gzip)
+
+    dtc
+	Create flattened device tree blob object suitable for linking
+	into vmlinux. Device tree blobs linked into vmlinux are placed
+	in an init section in the image. Platform code *must* copy the
+	blob to non-init memory prior to calling unflatten_device_tree().
+
+	To use this command, simply add `*.dtb` into obj-y or targets, or make
+	some other target depend on `%.dtb`
+
+	A central rule exists to create `$(obj)/%.dtb` from `$(src)/%.dts`;
+	architecture Makefiles do no need to explicitly write out that rule.
+
+	Example::
+
+		targets += $(dtb-y)
+		DTC_FLAGS ?= -p 1024
+
+6.8 Custom kbuild commands
+--------------------------
+
+	When kbuild is executing with KBUILD_VERBOSE=0, then only a shorthand
+	of a command is normally displayed.
+	To enable this behaviour for custom commands kbuild requires
+	two variables to be set::
+
+		quiet_cmd_<command>	- what shall be echoed
+		      cmd_<command>	- the command to execute
+
+	Example::
+
+		#
+		quiet_cmd_image = BUILD   $@
+		      cmd_image = $(obj)/tools/build $(BUILDFLAGS) \
+		                                     $(obj)/vmlinux.bin > $@
+
+		targets += bzImage
+		$(obj)/bzImage: $(obj)/vmlinux.bin $(obj)/tools/build FORCE
+			$(call if_changed,image)
+			@echo 'Kernel: $@ is ready'
+
+	When updating the $(obj)/bzImage target, the line:
+
+		BUILD    arch/x86/boot/bzImage
+
+	will be displayed with "make KBUILD_VERBOSE=0".
+
+
+--- 6.9 Preprocessing linker scripts
+
+	When the vmlinux image is built, the linker script
+	arch/$(ARCH)/kernel/vmlinux.lds is used.
+	The script is a preprocessed variant of the file vmlinux.lds.S
+	located in the same directory.
+	kbuild knows .lds files and includes a rule `*lds.S` -> `*lds`.
+
+	Example::
+
+		#arch/x86/kernel/Makefile
+		always := vmlinux.lds
+
+		#Makefile
+		export CPPFLAGS_vmlinux.lds += -P -C -U$(ARCH)
+
+	The assignment to $(always) is used to tell kbuild to build the
+	target vmlinux.lds.
+	The assignment to $(CPPFLAGS_vmlinux.lds) tells kbuild to use the
+	specified options when building the target vmlinux.lds.
+
+	When building the `*.lds` target, kbuild uses the variables::
+
+		KBUILD_CPPFLAGS	: Set in top-level Makefile
+		cppflags-y	: May be set in the kbuild makefile
+		CPPFLAGS_$(@F)  : Target-specific flags.
+				Note that the full filename is used in this
+				assignment.
+
+	The kbuild infrastructure for `*lds` files is used in several
+	architecture-specific files.
+
+6.10 Generic header files
+-------------------------
+
+	The directory include/asm-generic contains the header files
+	that may be shared between individual architectures.
+	The recommended approach how to use a generic header file is
+	to list the file in the Kbuild file.
+	See "7.2 generic-y" for further info on syntax etc.
+
+6.11 Post-link pass
+-------------------
+
+	If the file arch/xxx/Makefile.postlink exists, this makefile
+	will be invoked for post-link objects (vmlinux and modules.ko)
+	for architectures to run post-link passes on. Must also handle
+	the clean target.
+
+	This pass runs after kallsyms generation. If the architecture
+	needs to modify symbol locations, rather than manipulate the
+	kallsyms, it may be easier to add another postlink target for
+	.tmp_vmlinux? targets to be called from link-vmlinux.sh.
+
+	For example, powerpc uses this to check relocation sanity of
+	the linked vmlinux file.
+
+7 Kbuild syntax for exported headers
+------------------------------------
+
+The kernel includes a set of headers that is exported to userspace.
+Many headers can be exported as-is but other headers require a
+minimal pre-processing before they are ready for user-space.
+The pre-processing does:
+
+- drop kernel-specific annotations
+- drop include of compiler.h
+- drop all sections that are kernel internal (guarded by `ifdef __KERNEL__`)
+
+All headers under include/uapi/, include/generated/uapi/,
+arch/<arch>/include/uapi/ and arch/<arch>/include/generated/uapi/
+are exported.
+
+A Kbuild file may be defined under arch/<arch>/include/uapi/asm/ and
+arch/<arch>/include/asm/ to list asm files coming from asm-generic.
+See subsequent chapter for the syntax of the Kbuild file.
+
+7.1 no-export-headers
+---------------------
+
+	no-export-headers is essentially used by include/uapi/linux/Kbuild to
+	avoid exporting specific headers (e.g. kvm.h) on architectures that do
+	not support it. It should be avoided as much as possible.
+
+7.2 generic-y
+-------------
+
+	If an architecture uses a verbatim copy of a header from
+	include/asm-generic then this is listed in the file
+	arch/$(ARCH)/include/asm/Kbuild like this:
+
+		Example::
+
+			#arch/x86/include/asm/Kbuild
+			generic-y += termios.h
+			generic-y += rtc.h
+
+	During the prepare phase of the build a wrapper include
+	file is generated in the directory::
+
+		arch/$(ARCH)/include/generated/asm
+
+	When a header is exported where the architecture uses
+	the generic header a similar wrapper is generated as part
+	of the set of exported headers in the directory::
+
+		usr/include/asm
+
+	The generated wrapper will in both cases look like the following:
+
+		Example: termios.h::
+
+			#include <asm-generic/termios.h>
+
+7.3 generated-y
+---------------
+
+	If an architecture generates other header files alongside generic-y
+	wrappers, generated-y specifies them.
+
+	This prevents them being treated as stale asm-generic wrappers and
+	removed.
+
+		Example::
+
+			#arch/x86/include/asm/Kbuild
+			generated-y += syscalls_32.h
+
+7.4 mandatory-y
+---------------
+
+	mandatory-y is essentially used by include/(uapi/)asm-generic/Kbuild
+	to define the minimum set of ASM headers that all architectures must have.
+
+	This works like optional generic-y. If a mandatory header is missing
+	in arch/$(ARCH)/include/(uapi/)/asm, Kbuild will automatically generate
+	a wrapper of the asm-generic one.
+
+	The convention is to list one subdir per line and
+	preferably in alphabetic order.
+
+8 Kbuild Variables
+==================
+
+The top Makefile exports the following variables:
+
+    VERSION, PATCHLEVEL, SUBLEVEL, EXTRAVERSION
+	These variables define the current kernel version.  A few arch
+	Makefiles actually use these values directly; they should use
+	$(KERNELRELEASE) instead.
+
+	$(VERSION), $(PATCHLEVEL), and $(SUBLEVEL) define the basic
+	three-part version number, such as "2", "4", and "0".  These three
+	values are always numeric.
+
+	$(EXTRAVERSION) defines an even tinier sublevel for pre-patches
+	or additional patches.	It is usually some non-numeric string
+	such as "-pre4", and is often blank.
+
+    KERNELRELEASE
+	$(KERNELRELEASE) is a single string such as "2.4.0-pre4", suitable
+	for constructing installation directory names or showing in
+	version strings.  Some arch Makefiles use it for this purpose.
+
+    ARCH
+	This variable defines the target architecture, such as "i386",
+	"arm", or "sparc". Some kbuild Makefiles test $(ARCH) to
+	determine which files to compile.
+
+	By default, the top Makefile sets $(ARCH) to be the same as the
+	host system architecture.  For a cross build, a user may
+	override the value of $(ARCH) on the command line::
+
+	    make ARCH=m68k ...
+
+
+    INSTALL_PATH
+	This variable defines a place for the arch Makefiles to install
+	the resident kernel image and System.map file.
+	Use this for architecture-specific install targets.
+
+    INSTALL_MOD_PATH, MODLIB
+	$(INSTALL_MOD_PATH) specifies a prefix to $(MODLIB) for module
+	installation.  This variable is not defined in the Makefile but
+	may be passed in by the user if desired.
+
+	$(MODLIB) specifies the directory for module installation.
+	The top Makefile defines $(MODLIB) to
+	$(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE).  The user may
+	override this value on the command line if desired.
+
+    INSTALL_MOD_STRIP
+	If this variable is specified, it will cause modules to be stripped
+	after they are installed.  If INSTALL_MOD_STRIP is '1', then the
+	default option --strip-debug will be used.  Otherwise, the
+	INSTALL_MOD_STRIP value will be used as the option(s) to the strip
+	command.
+
+
+9 Makefile language
+===================
+
+The kernel Makefiles are designed to be run with GNU Make.  The Makefiles
+use only the documented features of GNU Make, but they do use many
+GNU extensions.
+
+GNU Make supports elementary list-processing functions.  The kernel
+Makefiles use a novel style of list building and manipulation with few
+"if" statements.
+
+GNU Make has two assignment operators, ":=" and "=".  ":=" performs
+immediate evaluation of the right-hand side and stores an actual string
+into the left-hand side.  "=" is like a formula definition; it stores the
+right-hand side in an unevaluated form and then evaluates this form each
+time the left-hand side is used.
+
+There are some cases where "=" is appropriate.  Usually, though, ":="
+is the right choice.
+
+10 Credits
+==========
+
+- Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net>
+- Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de>
+- Updates by Sam Ravnborg <sam@ravnborg.org>
+- Language QA by Jan Engelhardt <jengelh@gmx.de>
+
+11 TODO
+=======
+
+- Describe how kbuild supports shipped files with _shipped.
+- Generating offset header files.
+- Add more variables to section 7?
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
deleted file mode 100644
index 03c065855eaf..000000000000
--- a/Documentation/kbuild/makefiles.txt
+++ /dev/null
@@ -1,1383 +0,0 @@
-Linux Kernel Makefiles
-
-This document describes the Linux kernel Makefiles.
-
-=== Table of Contents
-
-	=== 1 Overview
-	=== 2 Who does what
-	=== 3 The kbuild files
-	   --- 3.1 Goal definitions
-	   --- 3.2 Built-in object goals - obj-y
-	   --- 3.3 Loadable module goals - obj-m
-	   --- 3.4 Objects which export symbols
-	   --- 3.5 Library file goals - lib-y
-	   --- 3.6 Descending down in directories
-	   --- 3.7 Compilation flags
-	   --- 3.8 Command line dependency
-	   --- 3.9 Dependency tracking
-	   --- 3.10 Special Rules
-	   --- 3.11 $(CC) support functions
-	   --- 3.12 $(LD) support functions
-
-	=== 4 Host Program support
-	   --- 4.1 Simple Host Program
-	   --- 4.2 Composite Host Programs
-	   --- 4.3 Using C++ for host programs
-	   --- 4.4 Controlling compiler options for host programs
-	   --- 4.5 When host programs are actually built
-	   --- 4.6 Using hostprogs-$(CONFIG_FOO)
-
-	=== 5 Kbuild clean infrastructure
-
-	=== 6 Architecture Makefiles
-	   --- 6.1 Set variables to tweak the build to the architecture
-	   --- 6.2 Add prerequisites to archheaders:
-	   --- 6.3 Add prerequisites to archprepare:
-	   --- 6.4 List directories to visit when descending
-	   --- 6.5 Architecture-specific boot images
-	   --- 6.6 Building non-kbuild targets
-	   --- 6.7 Commands useful for building a boot image
-	   --- 6.8 Custom kbuild commands
-	   --- 6.9 Preprocessing linker scripts
-	   --- 6.10 Generic header files
-	   --- 6.11 Post-link pass
-
-	=== 7 Kbuild syntax for exported headers
-		--- 7.1 no-export-headers
-		--- 7.2 generic-y
-		--- 7.3 generated-y
-		--- 7.4 mandatory-y
-
-	=== 8 Kbuild Variables
-	=== 9 Makefile language
-	=== 10 Credits
-	=== 11 TODO
-
-=== 1 Overview
-
-The Makefiles have five parts:
-
-	Makefile		the top Makefile.
-	.config			the kernel configuration file.
-	arch/$(ARCH)/Makefile	the arch Makefile.
-	scripts/Makefile.*	common rules etc. for all kbuild Makefiles.
-	kbuild Makefiles	there are about 500 of these.
-
-The top Makefile reads the .config file, which comes from the kernel
-configuration process.
-
-The top Makefile is responsible for building two major products: vmlinux
-(the resident kernel image) and modules (any module files).
-It builds these goals by recursively descending into the subdirectories of
-the kernel source tree.
-The list of subdirectories which are visited depends upon the kernel
-configuration. The top Makefile textually includes an arch Makefile
-with the name arch/$(ARCH)/Makefile. The arch Makefile supplies
-architecture-specific information to the top Makefile.
-
-Each subdirectory has a kbuild Makefile which carries out the commands
-passed down from above. The kbuild Makefile uses information from the
-.config file to construct various file lists used by kbuild to build
-any built-in or modular targets.
-
-scripts/Makefile.* contains all the definitions/rules etc. that
-are used to build the kernel based on the kbuild makefiles.
-
-
-=== 2 Who does what
-
-People have four different relationships with the kernel Makefiles.
-
-*Users* are people who build kernels.  These people type commands such as
-"make menuconfig" or "make".  They usually do not read or edit
-any kernel Makefiles (or any other source files).
-
-*Normal developers* are people who work on features such as device
-drivers, file systems, and network protocols.  These people need to
-maintain the kbuild Makefiles for the subsystem they are
-working on.  In order to do this effectively, they need some overall
-knowledge about the kernel Makefiles, plus detailed knowledge about the
-public interface for kbuild.
-
-*Arch developers* are people who work on an entire architecture, such
-as sparc or ia64.  Arch developers need to know about the arch Makefile
-as well as kbuild Makefiles.
-
-*Kbuild developers* are people who work on the kernel build system itself.
-These people need to know about all aspects of the kernel Makefiles.
-
-This document is aimed towards normal developers and arch developers.
-
-
-=== 3 The kbuild files
-
-Most Makefiles within the kernel are kbuild Makefiles that use the
-kbuild infrastructure. This chapter introduces the syntax used in the
-kbuild makefiles.
-The preferred name for the kbuild files are 'Makefile' but 'Kbuild' can
-be used and if both a 'Makefile' and a 'Kbuild' file exists, then the 'Kbuild'
-file will be used.
-
-Section 3.1 "Goal definitions" is a quick intro, further chapters provide
-more details, with real examples.
-
---- 3.1 Goal definitions
-
-	Goal definitions are the main part (heart) of the kbuild Makefile.
-	These lines define the files to be built, any special compilation
-	options, and any subdirectories to be entered recursively.
-
-	The most simple kbuild makefile contains one line:
-
-	Example:
-		obj-y += foo.o
-
-	This tells kbuild that there is one object in that directory, named
-	foo.o. foo.o will be built from foo.c or foo.S.
-
-	If foo.o shall be built as a module, the variable obj-m is used.
-	Therefore the following pattern is often used:
-
-	Example:
-		obj-$(CONFIG_FOO) += foo.o
-
-	$(CONFIG_FOO) evaluates to either y (for built-in) or m (for module).
-	If CONFIG_FOO is neither y nor m, then the file will not be compiled
-	nor linked.
-
---- 3.2 Built-in object goals - obj-y
-
-	The kbuild Makefile specifies object files for vmlinux
-	in the $(obj-y) lists.  These lists depend on the kernel
-	configuration.
-
-	Kbuild compiles all the $(obj-y) files.  It then calls
-	"$(AR) rcSTP" to merge these files into one built-in.a file.
-	This is a thin archive without a symbol table. It will be later
-	linked into vmlinux by scripts/link-vmlinux.sh
-
-	The order of files in $(obj-y) is significant.  Duplicates in
-	the lists are allowed: the first instance will be linked into
-	built-in.a and succeeding instances will be ignored.
-
-	Link order is significant, because certain functions
-	(module_init() / __initcall) will be called during boot in the
-	order they appear. So keep in mind that changing the link
-	order may e.g. change the order in which your SCSI
-	controllers are detected, and thus your disks are renumbered.
-
-	Example:
-		#drivers/isdn/i4l/Makefile
-		# Makefile for the kernel ISDN subsystem and device drivers.
-		# Each configuration option enables a list of files.
-		obj-$(CONFIG_ISDN_I4L)         += isdn.o
-		obj-$(CONFIG_ISDN_PPP_BSDCOMP) += isdn_bsdcomp.o
-
---- 3.3 Loadable module goals - obj-m
-
-	$(obj-m) specifies object files which are built as loadable
-	kernel modules.
-
-	A module may be built from one source file or several source
-	files. In the case of one source file, the kbuild makefile
-	simply adds the file to $(obj-m).
-
-	Example:
-		#drivers/isdn/i4l/Makefile
-		obj-$(CONFIG_ISDN_PPP_BSDCOMP) += isdn_bsdcomp.o
-
-	Note: In this example $(CONFIG_ISDN_PPP_BSDCOMP) evaluates to 'm'
-
-	If a kernel module is built from several source files, you specify
-	that you want to build a module in the same way as above; however,
-	kbuild needs to know which object files you want to build your
-	module from, so you have to tell it by setting a $(<module_name>-y)
-	variable.
-
-	Example:
-		#drivers/isdn/i4l/Makefile
-		obj-$(CONFIG_ISDN_I4L) += isdn.o
-		isdn-y := isdn_net_lib.o isdn_v110.o isdn_common.o
-
-	In this example, the module name will be isdn.o. Kbuild will
-	compile the objects listed in $(isdn-y) and then run
-	"$(LD) -r" on the list of these files to generate isdn.o.
-
-	Due to kbuild recognizing $(<module_name>-y) for composite objects,
-	you can use the value of a CONFIG_ symbol to optionally include an
-	object file as part of a composite object.
-
-	Example:
-		#fs/ext2/Makefile
-	        obj-$(CONFIG_EXT2_FS) += ext2.o
-		ext2-y := balloc.o dir.o file.o ialloc.o inode.o ioctl.o \
-			  namei.o super.o symlink.o
-	        ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o \
-						xattr_trusted.o
-
-	In this example, xattr.o, xattr_user.o and xattr_trusted.o are only
-	part of the composite object ext2.o if $(CONFIG_EXT2_FS_XATTR)
-	evaluates to 'y'.
-
-	Note: Of course, when you are building objects into the kernel,
-	the syntax above will also work. So, if you have CONFIG_EXT2_FS=y,
-	kbuild will build an ext2.o file for you out of the individual
-	parts and then link this into built-in.a, as you would expect.
-
---- 3.4 Objects which export symbols
-
-	No special notation is required in the makefiles for
-	modules exporting symbols.
-
---- 3.5 Library file goals - lib-y
-
-	Objects listed with obj-* are used for modules, or
-	combined in a built-in.a for that specific directory.
-	There is also the possibility to list objects that will
-	be included in a library, lib.a.
-	All objects listed with lib-y are combined in a single
-	library for that directory.
-	Objects that are listed in obj-y and additionally listed in
-	lib-y will not be included in the library, since they will
-	be accessible anyway.
-	For consistency, objects listed in lib-m will be included in lib.a.
-
-	Note that the same kbuild makefile may list files to be built-in
-	and to be part of a library. Therefore the same directory
-	may contain both a built-in.a and a lib.a file.
-
-	Example:
-		#arch/x86/lib/Makefile
-		lib-y    := delay.o
-
-	This will create a library lib.a based on delay.o. For kbuild to
-	actually recognize that there is a lib.a being built, the directory
-	shall be listed in libs-y.
-	See also "6.4 List directories to visit when descending".
-
-	Use of lib-y is normally restricted to lib/ and arch/*/lib.
-
---- 3.6 Descending down in directories
-
-	A Makefile is only responsible for building objects in its own
-	directory. Files in subdirectories should be taken care of by
-	Makefiles in these subdirs. The build system will automatically
-	invoke make recursively in subdirectories, provided you let it know of
-	them.
-
-	To do so, obj-y and obj-m are used.
-	ext2 lives in a separate directory, and the Makefile present in fs/
-	tells kbuild to descend down using the following assignment.
-
-	Example:
-		#fs/Makefile
-		obj-$(CONFIG_EXT2_FS) += ext2/
-
-	If CONFIG_EXT2_FS is set to either 'y' (built-in) or 'm' (modular)
-	the corresponding obj- variable will be set, and kbuild will descend
-	down in the ext2 directory.
-	Kbuild only uses this information to decide that it needs to visit
-	the directory, it is the Makefile in the subdirectory that
-	specifies what is modular and what is built-in.
-
-	It is good practice to use a CONFIG_ variable when assigning directory
-	names. This allows kbuild to totally skip the directory if the
-	corresponding CONFIG_ option is neither 'y' nor 'm'.
-
---- 3.7 Compilation flags
-
-    ccflags-y, asflags-y and ldflags-y
-	These three flags apply only to the kbuild makefile in which they
-	are assigned. They are used for all the normal cc, as and ld
-	invocations happening during a recursive build.
-	Note: Flags with the same behaviour were previously named:
-	EXTRA_CFLAGS, EXTRA_AFLAGS and EXTRA_LDFLAGS.
-	They are still supported but their usage is deprecated.
-
-	ccflags-y specifies options for compiling with $(CC).
-
-	Example:
-		# drivers/acpi/acpica/Makefile
-		ccflags-y			:= -Os -D_LINUX -DBUILDING_ACPICA
-		ccflags-$(CONFIG_ACPI_DEBUG)	+= -DACPI_DEBUG_OUTPUT
-
-	This variable is necessary because the top Makefile owns the
-	variable $(KBUILD_CFLAGS) and uses it for compilation flags for the
-	entire tree.
-
-	asflags-y specifies options for assembling with $(AS).
-
-	Example:
-		#arch/sparc/kernel/Makefile
-		asflags-y := -ansi
-
-	ldflags-y specifies options for linking with $(LD).
-
-	Example:
-		#arch/cris/boot/compressed/Makefile
-		ldflags-y += -T $(srctree)/$(src)/decompress_$(arch-y).lds
-
-    subdir-ccflags-y, subdir-asflags-y
-	The two flags listed above are similar to ccflags-y and asflags-y.
-	The difference is that the subdir- variants have effect for the kbuild
-	file where they are present and all subdirectories.
-	Options specified using subdir-* are added to the commandline before
-	the options specified using the non-subdir variants.
-
-	Example:
-		subdir-ccflags-y := -Werror
-
-    CFLAGS_$@, AFLAGS_$@
-
-	CFLAGS_$@ and AFLAGS_$@ only apply to commands in current
-	kbuild makefile.
-
-	$(CFLAGS_$@) specifies per-file options for $(CC).  The $@
-	part has a literal value which specifies the file that it is for.
-
-	Example:
-		# drivers/scsi/Makefile
-		CFLAGS_aha152x.o =   -DAHA152X_STAT -DAUTOCONF
-		CFLAGS_gdth.o    = # -DDEBUG_GDTH=2 -D__SERIAL__ -D__COM2__ \
-				     -DGDTH_STATISTICS
-
-	These two lines specify compilation flags for aha152x.o and gdth.o.
-
-	$(AFLAGS_$@) is a similar feature for source files in assembly
-	languages.
-
-	Example:
-		# arch/arm/kernel/Makefile
-		AFLAGS_head.o        := -DTEXT_OFFSET=$(TEXT_OFFSET)
-		AFLAGS_crunch-bits.o := -Wa,-mcpu=ep9312
-		AFLAGS_iwmmxt.o      := -Wa,-mcpu=iwmmxt
-
-
---- 3.9 Dependency tracking
-
-	Kbuild tracks dependencies on the following:
-	1) All prerequisite files (both *.c and *.h)
-	2) CONFIG_ options used in all prerequisite files
-	3) Command-line used to compile target
-
-	Thus, if you change an option to $(CC) all affected files will
-	be re-compiled.
-
---- 3.10 Special Rules
-
-	Special rules are used when the kbuild infrastructure does
-	not provide the required support. A typical example is
-	header files generated during the build process.
-	Another example are the architecture-specific Makefiles which
-	need special rules to prepare boot images etc.
-
-	Special rules are written as normal Make rules.
-	Kbuild is not executing in the directory where the Makefile is
-	located, so all special rules shall provide a relative
-	path to prerequisite files and target files.
-
-	Two variables are used when defining special rules:
-
-    $(src)
-	$(src) is a relative path which points to the directory
-	where the Makefile is located. Always use $(src) when
-	referring to files located in the src tree.
-
-    $(obj)
-	$(obj) is a relative path which points to the directory
-	where the target is saved. Always use $(obj) when
-	referring to generated files.
-
-	Example:
-		#drivers/scsi/Makefile
-		$(obj)/53c8xx_d.h: $(src)/53c7,8xx.scr $(src)/script_asm.pl
-			$(CPP) -DCHIP=810 - < $< | ... $(src)/script_asm.pl
-
-	This is a special rule, following the normal syntax
-	required by make.
-	The target file depends on two prerequisite files. References
-	to the target file are prefixed with $(obj), references
-	to prerequisites are referenced with $(src) (because they are not
-	generated files).
-
-    $(kecho)
-	echoing information to user in a rule is often a good practice
-	but when execution "make -s" one does not expect to see any output
-	except for warnings/errors.
-	To support this kbuild defines $(kecho) which will echo out the
-	text following $(kecho) to stdout except if "make -s" is used.
-
-	Example:
-		#arch/blackfin/boot/Makefile
-		$(obj)/vmImage: $(obj)/vmlinux.gz
-			$(call if_changed,uimage)
-			@$(kecho) 'Kernel: $@ is ready'
-
-
---- 3.11 $(CC) support functions
-
-	The kernel may be built with several different versions of
-	$(CC), each supporting a unique set of features and options.
-	kbuild provides basic support to check for valid options for $(CC).
-	$(CC) is usually the gcc compiler, but other alternatives are
-	available.
-
-    as-option
-	as-option is used to check if $(CC) -- when used to compile
-	assembler (*.S) files -- supports the given option. An optional
-	second option may be specified if the first option is not supported.
-
-	Example:
-		#arch/sh/Makefile
-		cflags-y += $(call as-option,-Wa$(comma)-isa=$(isa-y),)
-
-	In the above example, cflags-y will be assigned the option
-	-Wa$(comma)-isa=$(isa-y) if it is supported by $(CC).
-	The second argument is optional, and if supplied will be used
-	if first argument is not supported.
-
-    cc-ldoption
-	cc-ldoption is used to check if $(CC) when used to link object files
-	supports the given option.  An optional second option may be
-	specified if first option are not supported.
-
-	Example:
-		#arch/x86/kernel/Makefile
-		vsyscall-flags += $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
-
-	In the above example, vsyscall-flags will be assigned the option
-	-Wl$(comma)--hash-style=sysv if it is supported by $(CC).
-	The second argument is optional, and if supplied will be used
-	if first argument is not supported.
-
-    as-instr
-	as-instr checks if the assembler reports a specific instruction
-	and then outputs either option1 or option2
-	C escapes are supported in the test instruction
-	Note: as-instr-option uses KBUILD_AFLAGS for $(AS) options
-
-    cc-option
-	cc-option is used to check if $(CC) supports a given option, and if
-	not supported to use an optional second option.
-
-	Example:
-		#arch/x86/Makefile
-		cflags-y += $(call cc-option,-march=pentium-mmx,-march=i586)
-
-	In the above example, cflags-y will be assigned the option
-	-march=pentium-mmx if supported by $(CC), otherwise -march=i586.
-	The second argument to cc-option is optional, and if omitted,
-	cflags-y will be assigned no value if first option is not supported.
-	Note: cc-option uses KBUILD_CFLAGS for $(CC) options
-
-   cc-option-yn
-	cc-option-yn is used to check if gcc supports a given option
-	and return 'y' if supported, otherwise 'n'.
-
-	Example:
-		#arch/ppc/Makefile
-		biarch := $(call cc-option-yn, -m32)
-		aflags-$(biarch) += -a32
-		cflags-$(biarch) += -m32
-
-	In the above example, $(biarch) is set to y if $(CC) supports the -m32
-	option. When $(biarch) equals 'y', the expanded variables $(aflags-y)
-	and $(cflags-y) will be assigned the values -a32 and -m32,
-	respectively.
-	Note: cc-option-yn uses KBUILD_CFLAGS for $(CC) options
-
-    cc-disable-warning
-	cc-disable-warning checks if gcc supports a given warning and returns
-	the commandline switch to disable it. This special function is needed,
-	because gcc 4.4 and later accept any unknown -Wno-* option and only
-	warn about it if there is another warning in the source file.
-
-	Example:
-		KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
-
-	In the above example, -Wno-unused-but-set-variable will be added to
-	KBUILD_CFLAGS only if gcc really accepts it.
-
-    cc-ifversion
-	cc-ifversion tests the version of $(CC) and equals the fourth parameter
-	if version expression is true, or the fifth (if given) if the version
-	expression is false.
-
-	Example:
-		#fs/reiserfs/Makefile
-		ccflags-y := $(call cc-ifversion, -lt, 0402, -O1)
-
-	In this example, ccflags-y will be assigned the value -O1 if the
-	$(CC) version is less than 4.2.
-	cc-ifversion takes all the shell operators:
-	-eq, -ne, -lt, -le, -gt, and -ge
-	The third parameter may be a text as in this example, but it may also
-	be an expanded variable or a macro.
-
-    cc-cross-prefix
-	cc-cross-prefix is used to check if there exists a $(CC) in path with
-	one of the listed prefixes. The first prefix where there exist a
-	prefix$(CC) in the PATH is returned - and if no prefix$(CC) is found
-	then nothing is returned.
-	Additional prefixes are separated by a single space in the
-	call of cc-cross-prefix.
-	This functionality is useful for architecture Makefiles that try
-	to set CROSS_COMPILE to well-known values but may have several
-	values to select between.
-	It is recommended only to try to set CROSS_COMPILE if it is a cross
-	build (host arch is different from target arch). And if CROSS_COMPILE
-	is already set then leave it with the old value.
-
-	Example:
-		#arch/m68k/Makefile
-		ifneq ($(SUBARCH),$(ARCH))
-		        ifeq ($(CROSS_COMPILE),)
-		               CROSS_COMPILE := $(call cc-cross-prefix, m68k-linux-gnu-)
-			endif
-		endif
-
---- 3.12 $(LD) support functions
-
-    ld-option
-	ld-option is used to check if $(LD) supports the supplied option.
-	ld-option takes two options as arguments.
-	The second argument is an optional option that can be used if the
-	first option is not supported by $(LD).
-
-	Example:
-		#Makefile
-		LDFLAGS_vmlinux += $(call ld-option, -X)
-
-
-=== 4 Host Program support
-
-Kbuild supports building executables on the host for use during the
-compilation stage.
-Two steps are required in order to use a host executable.
-
-The first step is to tell kbuild that a host program exists. This is
-done utilising the variable hostprogs-y.
-
-The second step is to add an explicit dependency to the executable.
-This can be done in two ways. Either add the dependency in a rule,
-or utilise the variable $(always).
-Both possibilities are described in the following.
-
---- 4.1 Simple Host Program
-
-	In some cases there is a need to compile and run a program on the
-	computer where the build is running.
-	The following line tells kbuild that the program bin2hex shall be
-	built on the build host.
-
-	Example:
-		hostprogs-y := bin2hex
-
-	Kbuild assumes in the above example that bin2hex is made from a single
-	c-source file named bin2hex.c located in the same directory as
-	the Makefile.
-
---- 4.2 Composite Host Programs
-
-	Host programs can be made up based on composite objects.
-	The syntax used to define composite objects for host programs is
-	similar to the syntax used for kernel objects.
-	$(<executable>-objs) lists all objects used to link the final
-	executable.
-
-	Example:
-		#scripts/lxdialog/Makefile
-		hostprogs-y   := lxdialog
-		lxdialog-objs := checklist.o lxdialog.o
-
-	Objects with extension .o are compiled from the corresponding .c
-	files. In the above example, checklist.c is compiled to checklist.o
-	and lxdialog.c is compiled to lxdialog.o.
-	Finally, the two .o files are linked to the executable, lxdialog.
-	Note: The syntax <executable>-y is not permitted for host-programs.
-
---- 4.3 Using C++ for host programs
-
-	kbuild offers support for host programs written in C++. This was
-	introduced solely to support kconfig, and is not recommended
-	for general use.
-
-	Example:
-		#scripts/kconfig/Makefile
-		hostprogs-y   := qconf
-		qconf-cxxobjs := qconf.o
-
-	In the example above the executable is composed of the C++ file
-	qconf.cc - identified by $(qconf-cxxobjs).
-
-	If qconf is composed of a mixture of .c and .cc files, then an
-	additional line can be used to identify this.
-
-	Example:
-		#scripts/kconfig/Makefile
-		hostprogs-y   := qconf
-		qconf-cxxobjs := qconf.o
-		qconf-objs    := check.o
-
---- 4.4 Controlling compiler options for host programs
-
-	When compiling host programs, it is possible to set specific flags.
-	The programs will always be compiled utilising $(HOSTCC) passed
-	the options specified in $(KBUILD_HOSTCFLAGS).
-	To set flags that will take effect for all host programs created
-	in that Makefile, use the variable HOST_EXTRACFLAGS.
-
-	Example:
-		#scripts/lxdialog/Makefile
-		HOST_EXTRACFLAGS += -I/usr/include/ncurses
-
-	To set specific flags for a single file the following construction
-	is used:
-
-	Example:
-		#arch/ppc64/boot/Makefile
-		HOSTCFLAGS_piggyback.o := -DKERNELBASE=$(KERNELBASE)
-
-	It is also possible to specify additional options to the linker.
-
-	Example:
-		#scripts/kconfig/Makefile
-		HOSTLDLIBS_qconf := -L$(QTDIR)/lib
-
-	When linking qconf, it will be passed the extra option
-	"-L$(QTDIR)/lib".
-
---- 4.5 When host programs are actually built
-
-	Kbuild will only build host-programs when they are referenced
-	as a prerequisite.
-	This is possible in two ways:
-
-	(1) List the prerequisite explicitly in a special rule.
-
-	Example:
-		#drivers/pci/Makefile
-		hostprogs-y := gen-devlist
-		$(obj)/devlist.h: $(src)/pci.ids $(obj)/gen-devlist
-			( cd $(obj); ./gen-devlist ) < $<
-
-	The target $(obj)/devlist.h will not be built before
-	$(obj)/gen-devlist is updated. Note that references to
-	the host programs in special rules must be prefixed with $(obj).
-
-	(2) Use $(always)
-	When there is no suitable special rule, and the host program
-	shall be built when a makefile is entered, the $(always)
-	variable shall be used.
-
-	Example:
-		#scripts/lxdialog/Makefile
-		hostprogs-y   := lxdialog
-		always        := $(hostprogs-y)
-
-	This will tell kbuild to build lxdialog even if not referenced in
-	any rule.
-
---- 4.6 Using hostprogs-$(CONFIG_FOO)
-
-	A typical pattern in a Kbuild file looks like this:
-
-	Example:
-		#scripts/Makefile
-		hostprogs-$(CONFIG_KALLSYMS) += kallsyms
-
-	Kbuild knows about both 'y' for built-in and 'm' for module.
-	So if a config symbol evaluates to 'm', kbuild will still build
-	the binary. In other words, Kbuild handles hostprogs-m exactly
-	like hostprogs-y. But only hostprogs-y is recommended to be used
-	when no CONFIG symbols are involved.
-
-=== 5 Kbuild clean infrastructure
-
-"make clean" deletes most generated files in the obj tree where the kernel
-is compiled. This includes generated files such as host programs.
-Kbuild knows targets listed in $(hostprogs-y), $(hostprogs-m), $(always),
-$(extra-y) and $(targets). They are all deleted during "make clean".
-Files matching the patterns "*.[oas]", "*.ko", plus some additional files
-generated by kbuild are deleted all over the kernel src tree when
-"make clean" is executed.
-
-Additional files can be specified in kbuild makefiles by use of $(clean-files).
-
-	Example:
-		#lib/Makefile
-		clean-files := crc32table.h
-
-When executing "make clean", the file "crc32table.h" will be deleted.
-Kbuild will assume files to be in the same relative directory as the
-Makefile, except if prefixed with $(objtree).
-
-To delete a directory hierarchy use:
-
-	Example:
-		#scripts/package/Makefile
-		clean-dirs := $(objtree)/debian/
-
-This will delete the directory debian in the toplevel directory, including all
-subdirectories.
-
-To exclude certain files from make clean, use the $(no-clean-files) variable.
-This is only a special case used in the top level Kbuild file:
-
-	Example:
-		#Kbuild
-		no-clean-files := $(bounds-file) $(offsets-file)
-
-Usually kbuild descends down in subdirectories due to "obj-* := dir/",
-but in the architecture makefiles where the kbuild infrastructure
-is not sufficient this sometimes needs to be explicit.
-
-	Example:
-		#arch/x86/boot/Makefile
-		subdir- := compressed/
-
-The above assignment instructs kbuild to descend down in the
-directory compressed/ when "make clean" is executed.
-
-To support the clean infrastructure in the Makefiles that build the
-final bootimage there is an optional target named archclean:
-
-	Example:
-		#arch/x86/Makefile
-		archclean:
-			$(Q)$(MAKE) $(clean)=arch/x86/boot
-
-When "make clean" is executed, make will descend down in arch/x86/boot,
-and clean as usual. The Makefile located in arch/x86/boot/ may use
-the subdir- trick to descend further down.
-
-Note 1: arch/$(ARCH)/Makefile cannot use "subdir-", because that file is
-included in the top level makefile, and the kbuild infrastructure
-is not operational at that point.
-
-Note 2: All directories listed in core-y, libs-y, drivers-y and net-y will
-be visited during "make clean".
-
-=== 6 Architecture Makefiles
-
-The top level Makefile sets up the environment and does the preparation,
-before starting to descend down in the individual directories.
-The top level makefile contains the generic part, whereas
-arch/$(ARCH)/Makefile contains what is required to set up kbuild
-for said architecture.
-To do so, arch/$(ARCH)/Makefile sets up a number of variables and defines
-a few targets.
-
-When kbuild executes, the following steps are followed (roughly):
-1) Configuration of the kernel => produce .config
-2) Store kernel version in include/linux/version.h
-3) Updating all other prerequisites to the target prepare:
-   - Additional prerequisites are specified in arch/$(ARCH)/Makefile
-4) Recursively descend down in all directories listed in
-   init-* core* drivers-* net-* libs-* and build all targets.
-   - The values of the above variables are expanded in arch/$(ARCH)/Makefile.
-5) All object files are then linked and the resulting file vmlinux is
-   located at the root of the obj tree.
-   The very first objects linked are listed in head-y, assigned by
-   arch/$(ARCH)/Makefile.
-6) Finally, the architecture-specific part does any required post processing
-   and builds the final bootimage.
-   - This includes building boot records
-   - Preparing initrd images and the like
-
-
---- 6.1 Set variables to tweak the build to the architecture
-
-    LDFLAGS		Generic $(LD) options
-
-	Flags used for all invocations of the linker.
-	Often specifying the emulation is sufficient.
-
-	Example:
-		#arch/s390/Makefile
-		LDFLAGS         := -m elf_s390
-	Note: ldflags-y can be used to further customise
-	the flags used. See chapter 3.7.
-
-    LDFLAGS_vmlinux	Options for $(LD) when linking vmlinux
-
-	LDFLAGS_vmlinux is used to specify additional flags to pass to
-	the linker when linking the final vmlinux image.
-	LDFLAGS_vmlinux uses the LDFLAGS_$@ support.
-
-	Example:
-		#arch/x86/Makefile
-		LDFLAGS_vmlinux := -e stext
-
-    OBJCOPYFLAGS	objcopy flags
-
-	When $(call if_changed,objcopy) is used to translate a .o file,
-	the flags specified in OBJCOPYFLAGS will be used.
-	$(call if_changed,objcopy) is often used to generate raw binaries on
-	vmlinux.
-
-	Example:
-		#arch/s390/Makefile
-		OBJCOPYFLAGS := -O binary
-
-		#arch/s390/boot/Makefile
-		$(obj)/image: vmlinux FORCE
-			$(call if_changed,objcopy)
-
-	In this example, the binary $(obj)/image is a binary version of
-	vmlinux. The usage of $(call if_changed,xxx) will be described later.
-
-    KBUILD_AFLAGS		$(AS) assembler flags
-
-	Default value - see top level Makefile
-	Append or modify as required per architecture.
-
-	Example:
-		#arch/sparc64/Makefile
-		KBUILD_AFLAGS += -m64 -mcpu=ultrasparc
-
-    KBUILD_CFLAGS		$(CC) compiler flags
-
-	Default value - see top level Makefile
-	Append or modify as required per architecture.
-
-	Often, the KBUILD_CFLAGS variable depends on the configuration.
-
-	Example:
-		#arch/x86/boot/compressed/Makefile
-		cflags-$(CONFIG_X86_32) := -march=i386
-		cflags-$(CONFIG_X86_64) := -mcmodel=small
-		KBUILD_CFLAGS += $(cflags-y)
-
-	Many arch Makefiles dynamically run the target C compiler to
-	probe supported options:
-
-		#arch/x86/Makefile
-
-		...
-		cflags-$(CONFIG_MPENTIUMII)     += $(call cc-option,\
-						-march=pentium2,-march=i686)
-		...
-		# Disable unit-at-a-time mode ...
-		KBUILD_CFLAGS += $(call cc-option,-fno-unit-at-a-time)
-		...
-
-
-	The first example utilises the trick that a config option expands
-	to 'y' when selected.
-
-    KBUILD_AFLAGS_KERNEL	$(AS) options specific for built-in
-
-	$(KBUILD_AFLAGS_KERNEL) contains extra C compiler flags used to compile
-	resident kernel code.
-
-    KBUILD_AFLAGS_MODULE   Options for $(AS) when building modules
-
-	$(KBUILD_AFLAGS_MODULE) is used to add arch-specific options that
-	are used for $(AS).
-	From commandline AFLAGS_MODULE shall be used (see kbuild.txt).
-
-    KBUILD_CFLAGS_KERNEL	$(CC) options specific for built-in
-
-	$(KBUILD_CFLAGS_KERNEL) contains extra C compiler flags used to compile
-	resident kernel code.
-
-    KBUILD_CFLAGS_MODULE   Options for $(CC) when building modules
-
-	$(KBUILD_CFLAGS_MODULE) is used to add arch-specific options that
-	are used for $(CC).
-	From commandline CFLAGS_MODULE shall be used (see kbuild.txt).
-
-    KBUILD_LDFLAGS_MODULE   Options for $(LD) when linking modules
-
-	$(KBUILD_LDFLAGS_MODULE) is used to add arch-specific options
-	used when linking modules. This is often a linker script.
-	From commandline LDFLAGS_MODULE shall be used (see kbuild.txt).
-
-    KBUILD_ARFLAGS   Options for $(AR) when creating archives
-
-	$(KBUILD_ARFLAGS) set by the top level Makefile to "D" (deterministic
-	mode) if this option is supported by $(AR).
-
-    ARCH_CPPFLAGS, ARCH_AFLAGS, ARCH_CFLAGS   Overrides the kbuild defaults
-
-	These variables are appended to the KBUILD_CPPFLAGS,
-	KBUILD_AFLAGS, and KBUILD_CFLAGS, respectively, after the
-	top-level Makefile has set any other flags. This provides a
-	means for an architecture to override the defaults.
-
-
---- 6.2 Add prerequisites to archheaders:
-
-	The archheaders: rule is used to generate header files that
-	may be installed into user space by "make header_install" or
-	"make headers_install_all".  In order to support
-	"make headers_install_all", this target has to be able to run
-	on an unconfigured tree, or a tree configured for another
-	architecture.
-
-	It is run before "make archprepare" when run on the
-	architecture itself.
-
-
---- 6.3 Add prerequisites to archprepare:
-
-	The archprepare: rule is used to list prerequisites that need to be
-	built before starting to descend down in the subdirectories.
-	This is usually used for header files containing assembler constants.
-
-		Example:
-		#arch/arm/Makefile
-		archprepare: maketools
-
-	In this example, the file target maketools will be processed
-	before descending down in the subdirectories.
-	See also chapter XXX-TODO that describe how kbuild supports
-	generating offset header files.
-
-
---- 6.4 List directories to visit when descending
-
-	An arch Makefile cooperates with the top Makefile to define variables
-	which specify how to build the vmlinux file.  Note that there is no
-	corresponding arch-specific section for modules; the module-building
-	machinery is all architecture-independent.
-
-
-    head-y, init-y, core-y, libs-y, drivers-y, net-y
-
-	$(head-y) lists objects to be linked first in vmlinux.
-	$(libs-y) lists directories where a lib.a archive can be located.
-	The rest list directories where a built-in.a object file can be
-	located.
-
-	$(init-y) objects will be located after $(head-y).
-	Then the rest follows in this order:
-	$(core-y), $(libs-y), $(drivers-y) and $(net-y).
-
-	The top level Makefile defines values for all generic directories,
-	and arch/$(ARCH)/Makefile only adds architecture-specific directories.
-
-	Example:
-		#arch/sparc64/Makefile
-		core-y += arch/sparc64/kernel/
-		libs-y += arch/sparc64/prom/ arch/sparc64/lib/
-		drivers-$(CONFIG_OPROFILE)  += arch/sparc64/oprofile/
-
-
---- 6.5 Architecture-specific boot images
-
-	An arch Makefile specifies goals that take the vmlinux file, compress
-	it, wrap it in bootstrapping code, and copy the resulting files
-	somewhere. This includes various kinds of installation commands.
-	The actual goals are not standardized across architectures.
-
-	It is common to locate any additional processing in a boot/
-	directory below arch/$(ARCH)/.
-
-	Kbuild does not provide any smart way to support building a
-	target specified in boot/. Therefore arch/$(ARCH)/Makefile shall
-	call make manually to build a target in boot/.
-
-	The recommended approach is to include shortcuts in
-	arch/$(ARCH)/Makefile, and use the full path when calling down
-	into the arch/$(ARCH)/boot/Makefile.
-
-	Example:
-		#arch/x86/Makefile
-		boot := arch/x86/boot
-		bzImage: vmlinux
-			$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
-
-	"$(Q)$(MAKE) $(build)=<dir>" is the recommended way to invoke
-	make in a subdirectory.
-
-	There are no rules for naming architecture-specific targets,
-	but executing "make help" will list all relevant targets.
-	To support this, $(archhelp) must be defined.
-
-	Example:
-		#arch/x86/Makefile
-		define archhelp
-		  echo  '* bzImage      - Image (arch/$(ARCH)/boot/bzImage)'
-		endif
-
-	When make is executed without arguments, the first goal encountered
-	will be built. In the top level Makefile the first goal present
-	is all:.
-	An architecture shall always, per default, build a bootable image.
-	In "make help", the default goal is highlighted with a '*'.
-	Add a new prerequisite to all: to select a default goal different
-	from vmlinux.
-
-	Example:
-		#arch/x86/Makefile
-		all: bzImage
-
-	When "make" is executed without arguments, bzImage will be built.
-
---- 6.6 Building non-kbuild targets
-
-    extra-y
-
-	extra-y specifies additional targets created in the current
-	directory, in addition to any targets specified by obj-*.
-
-	Listing all targets in extra-y is required for two purposes:
-	1) Enable kbuild to check changes in command lines
-	   - When $(call if_changed,xxx) is used
-	2) kbuild knows what files to delete during "make clean"
-
-	Example:
-		#arch/x86/kernel/Makefile
-		extra-y := head.o init_task.o
-
-	In this example, extra-y is used to list object files that
-	shall be built, but shall not be linked as part of built-in.a.
-
-
---- 6.7 Commands useful for building a boot image
-
-	Kbuild provides a few macros that are useful when building a
-	boot image.
-
-    if_changed
-
-	if_changed is the infrastructure used for the following commands.
-
-	Usage:
-		target: source(s) FORCE
-			$(call if_changed,ld/objcopy/gzip/...)
-
-	When the rule is evaluated, it is checked to see if any files
-	need an update, or the command line has changed since the last
-	invocation. The latter will force a rebuild if any options
-	to the executable have changed.
-	Any target that utilises if_changed must be listed in $(targets),
-	otherwise the command line check will fail, and the target will
-	always be built.
-	Assignments to $(targets) are without $(obj)/ prefix.
-	if_changed may be used in conjunction with custom commands as
-	defined in 6.8 "Custom kbuild commands".
-
-	Note: It is a typical mistake to forget the FORCE prerequisite.
-	Another common pitfall is that whitespace is sometimes
-	significant; for instance, the below will fail (note the extra space
-	after the comma):
-		target: source(s) FORCE
-	#WRONG!#	$(call if_changed, ld/objcopy/gzip/...)
-
-        Note: if_changed should not be used more than once per target.
-              It stores the executed command in a corresponding .cmd
-        file and multiple calls would result in overwrites and
-        unwanted results when the target is up to date and only the
-        tests on changed commands trigger execution of commands.
-
-    ld
-	Link target. Often, LDFLAGS_$@ is used to set specific options to ld.
-
-	Example:
-		#arch/x86/boot/Makefile
-		LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary
-		LDFLAGS_setup    := -Ttext 0x0 -s --oformat binary -e begtext
-
-		targets += setup setup.o bootsect bootsect.o
-		$(obj)/setup $(obj)/bootsect: %: %.o FORCE
-			$(call if_changed,ld)
-
-	In this example, there are two possible targets, requiring different
-	options to the linker. The linker options are specified using the
-	LDFLAGS_$@ syntax - one for each potential target.
-	$(targets) are assigned all potential targets, by which kbuild knows
-	the targets and will:
-		1) check for commandline changes
-		2) delete target during make clean
-
-	The ": %: %.o" part of the prerequisite is a shorthand that
-	frees us from listing the setup.o and bootsect.o files.
-	Note: It is a common mistake to forget the "targets :=" assignment,
-	      resulting in the target file being recompiled for no
-	      obvious reason.
-
-    objcopy
-	Copy binary. Uses OBJCOPYFLAGS usually specified in
-	arch/$(ARCH)/Makefile.
-	OBJCOPYFLAGS_$@ may be used to set additional options.
-
-    gzip
-	Compress target. Use maximum compression to compress target.
-
-	Example:
-		#arch/x86/boot/compressed/Makefile
-		$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
-			$(call if_changed,gzip)
-
-    dtc
-	Create flattened device tree blob object suitable for linking
-	into vmlinux. Device tree blobs linked into vmlinux are placed
-	in an init section in the image. Platform code *must* copy the
-	blob to non-init memory prior to calling unflatten_device_tree().
-
-	To use this command, simply add *.dtb into obj-y or targets, or make
-	some other target depend on %.dtb
-
-	A central rule exists to create $(obj)/%.dtb from $(src)/%.dts;
-	architecture Makefiles do no need to explicitly write out that rule.
-
-	Example:
-		targets += $(dtb-y)
-		DTC_FLAGS ?= -p 1024
-
---- 6.8 Custom kbuild commands
-
-	When kbuild is executing with KBUILD_VERBOSE=0, then only a shorthand
-	of a command is normally displayed.
-	To enable this behaviour for custom commands kbuild requires
-	two variables to be set:
-	quiet_cmd_<command>	- what shall be echoed
-	      cmd_<command>	- the command to execute
-
-	Example:
-		#
-		quiet_cmd_image = BUILD   $@
-		      cmd_image = $(obj)/tools/build $(BUILDFLAGS) \
-		                                     $(obj)/vmlinux.bin > $@
-
-		targets += bzImage
-		$(obj)/bzImage: $(obj)/vmlinux.bin $(obj)/tools/build FORCE
-			$(call if_changed,image)
-			@echo 'Kernel: $@ is ready'
-
-	When updating the $(obj)/bzImage target, the line
-
-	BUILD    arch/x86/boot/bzImage
-
-	will be displayed with "make KBUILD_VERBOSE=0".
-
-
---- 6.9 Preprocessing linker scripts
-
-	When the vmlinux image is built, the linker script
-	arch/$(ARCH)/kernel/vmlinux.lds is used.
-	The script is a preprocessed variant of the file vmlinux.lds.S
-	located in the same directory.
-	kbuild knows .lds files and includes a rule *lds.S -> *lds.
-
-	Example:
-		#arch/x86/kernel/Makefile
-		always := vmlinux.lds
-
-		#Makefile
-		export CPPFLAGS_vmlinux.lds += -P -C -U$(ARCH)
-
-	The assignment to $(always) is used to tell kbuild to build the
-	target vmlinux.lds.
-	The assignment to $(CPPFLAGS_vmlinux.lds) tells kbuild to use the
-	specified options when building the target vmlinux.lds.
-
-	When building the *.lds target, kbuild uses the variables:
-	KBUILD_CPPFLAGS	: Set in top-level Makefile
-	cppflags-y	: May be set in the kbuild makefile
-	CPPFLAGS_$(@F)  : Target-specific flags.
-	                  Note that the full filename is used in this
-	                  assignment.
-
-	The kbuild infrastructure for *lds files is used in several
-	architecture-specific files.
-
---- 6.10 Generic header files
-
-	The directory include/asm-generic contains the header files
-	that may be shared between individual architectures.
-	The recommended approach how to use a generic header file is
-	to list the file in the Kbuild file.
-	See "7.2 generic-y" for further info on syntax etc.
-
---- 6.11 Post-link pass
-
-	If the file arch/xxx/Makefile.postlink exists, this makefile
-	will be invoked for post-link objects (vmlinux and modules.ko)
-	for architectures to run post-link passes on. Must also handle
-	the clean target.
-
-	This pass runs after kallsyms generation. If the architecture
-	needs to modify symbol locations, rather than manipulate the
-	kallsyms, it may be easier to add another postlink target for
-	.tmp_vmlinux? targets to be called from link-vmlinux.sh.
-
-	For example, powerpc uses this to check relocation sanity of
-	the linked vmlinux file.
-
-=== 7 Kbuild syntax for exported headers
-
-The kernel includes a set of headers that is exported to userspace.
-Many headers can be exported as-is but other headers require a
-minimal pre-processing before they are ready for user-space.
-The pre-processing does:
-- drop kernel-specific annotations
-- drop include of compiler.h
-- drop all sections that are kernel internal (guarded by ifdef __KERNEL__)
-
-All headers under include/uapi/, include/generated/uapi/,
-arch/<arch>/include/uapi/ and arch/<arch>/include/generated/uapi/
-are exported.
-
-A Kbuild file may be defined under arch/<arch>/include/uapi/asm/ and
-arch/<arch>/include/asm/ to list asm files coming from asm-generic.
-See subsequent chapter for the syntax of the Kbuild file.
-
---- 7.1 no-export-headers
-
-	no-export-headers is essentially used by include/uapi/linux/Kbuild to
-	avoid exporting specific headers (e.g. kvm.h) on architectures that do
-	not support it. It should be avoided as much as possible.
-
---- 7.2 generic-y
-
-	If an architecture uses a verbatim copy of a header from
-	include/asm-generic then this is listed in the file
-	arch/$(ARCH)/include/asm/Kbuild like this:
-
-		Example:
-			#arch/x86/include/asm/Kbuild
-			generic-y += termios.h
-			generic-y += rtc.h
-
-	During the prepare phase of the build a wrapper include
-	file is generated in the directory:
-
-		arch/$(ARCH)/include/generated/asm
-
-	When a header is exported where the architecture uses
-	the generic header a similar wrapper is generated as part
-	of the set of exported headers in the directory:
-
-		usr/include/asm
-
-	The generated wrapper will in both cases look like the following:
-
-		Example: termios.h
-			#include <asm-generic/termios.h>
-
---- 7.3 generated-y
-
-	If an architecture generates other header files alongside generic-y
-	wrappers, generated-y specifies them.
-
-	This prevents them being treated as stale asm-generic wrappers and
-	removed.
-
-		Example:
-			#arch/x86/include/asm/Kbuild
-			generated-y += syscalls_32.h
-
---- 7.4 mandatory-y
-
-	mandatory-y is essentially used by include/(uapi/)asm-generic/Kbuild
-	to define the minimum set of ASM headers that all architectures must have.
-
-	This works like optional generic-y. If a mandatory header is missing
-	in arch/$(ARCH)/include/(uapi/)/asm, Kbuild will automatically generate
-	a wrapper of the asm-generic one.
-
-	The convention is to list one subdir per line and
-	preferably in alphabetic order.
-
-=== 8 Kbuild Variables
-
-The top Makefile exports the following variables:
-
-    VERSION, PATCHLEVEL, SUBLEVEL, EXTRAVERSION
-
-	These variables define the current kernel version.  A few arch
-	Makefiles actually use these values directly; they should use
-	$(KERNELRELEASE) instead.
-
-	$(VERSION), $(PATCHLEVEL), and $(SUBLEVEL) define the basic
-	three-part version number, such as "2", "4", and "0".  These three
-	values are always numeric.
-
-	$(EXTRAVERSION) defines an even tinier sublevel for pre-patches
-	or additional patches.	It is usually some non-numeric string
-	such as "-pre4", and is often blank.
-
-    KERNELRELEASE
-
-	$(KERNELRELEASE) is a single string such as "2.4.0-pre4", suitable
-	for constructing installation directory names or showing in
-	version strings.  Some arch Makefiles use it for this purpose.
-
-    ARCH
-
-	This variable defines the target architecture, such as "i386",
-	"arm", or "sparc". Some kbuild Makefiles test $(ARCH) to
-	determine which files to compile.
-
-	By default, the top Makefile sets $(ARCH) to be the same as the
-	host system architecture.  For a cross build, a user may
-	override the value of $(ARCH) on the command line:
-
-	    make ARCH=m68k ...
-
-
-    INSTALL_PATH
-
-	This variable defines a place for the arch Makefiles to install
-	the resident kernel image and System.map file.
-	Use this for architecture-specific install targets.
-
-    INSTALL_MOD_PATH, MODLIB
-
-	$(INSTALL_MOD_PATH) specifies a prefix to $(MODLIB) for module
-	installation.  This variable is not defined in the Makefile but
-	may be passed in by the user if desired.
-
-	$(MODLIB) specifies the directory for module installation.
-	The top Makefile defines $(MODLIB) to
-	$(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE).  The user may
-	override this value on the command line if desired.
-
-    INSTALL_MOD_STRIP
-
-	If this variable is specified, it will cause modules to be stripped
-	after they are installed.  If INSTALL_MOD_STRIP is '1', then the
-	default option --strip-debug will be used.  Otherwise, the
-	INSTALL_MOD_STRIP value will be used as the option(s) to the strip
-	command.
-
-
-=== 9 Makefile language
-
-The kernel Makefiles are designed to be run with GNU Make.  The Makefiles
-use only the documented features of GNU Make, but they do use many
-GNU extensions.
-
-GNU Make supports elementary list-processing functions.  The kernel
-Makefiles use a novel style of list building and manipulation with few
-"if" statements.
-
-GNU Make has two assignment operators, ":=" and "=".  ":=" performs
-immediate evaluation of the right-hand side and stores an actual string
-into the left-hand side.  "=" is like a formula definition; it stores the
-right-hand side in an unevaluated form and then evaluates this form each
-time the left-hand side is used.
-
-There are some cases where "=" is appropriate.  Usually, though, ":="
-is the right choice.
-
-=== 10 Credits
-
-Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net>
-Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de>
-Updates by Sam Ravnborg <sam@ravnborg.org>
-Language QA by Jan Engelhardt <jengelh@gmx.de>
-
-=== 11 TODO
-
-- Describe how kbuild supports shipped files with _shipped.
-- Generating offset header files.
-- Add more variables to section 7?
-
-
-
diff --git a/Documentation/kbuild/modules.rst b/Documentation/kbuild/modules.rst
new file mode 100644
index 000000000000..24e763482650
--- /dev/null
+++ b/Documentation/kbuild/modules.rst
@@ -0,0 +1,571 @@
+=========================
+Building External Modules
+=========================
+
+This document describes how to build an out-of-tree kernel module.
+
+.. Table of Contents
+
+	=== 1 Introduction
+	=== 2 How to Build External Modules
+	   --- 2.1 Command Syntax
+	   --- 2.2 Options
+	   --- 2.3 Targets
+	   --- 2.4 Building Separate Files
+	=== 3. Creating a Kbuild File for an External Module
+	   --- 3.1 Shared Makefile
+	   --- 3.2 Separate Kbuild file and Makefile
+	   --- 3.3 Binary Blobs
+	   --- 3.4 Building Multiple Modules
+	=== 4. Include Files
+	   --- 4.1 Kernel Includes
+	   --- 4.2 Single Subdirectory
+	   --- 4.3 Several Subdirectories
+	=== 5. Module Installation
+	   --- 5.1 INSTALL_MOD_PATH
+	   --- 5.2 INSTALL_MOD_DIR
+	=== 6. Module Versioning
+	   --- 6.1 Symbols From the Kernel (vmlinux + modules)
+	   --- 6.2 Symbols and External Modules
+	   --- 6.3 Symbols From Another External Module
+	=== 7. Tips & Tricks
+	   --- 7.1 Testing for CONFIG_FOO_BAR
+
+
+
+1. Introduction
+===============
+
+"kbuild" is the build system used by the Linux kernel. Modules must use
+kbuild to stay compatible with changes in the build infrastructure and
+to pick up the right flags to "gcc." Functionality for building modules
+both in-tree and out-of-tree is provided. The method for building
+either is similar, and all modules are initially developed and built
+out-of-tree.
+
+Covered in this document is information aimed at developers interested
+in building out-of-tree (or "external") modules. The author of an
+external module should supply a makefile that hides most of the
+complexity, so one only has to type "make" to build the module. This is
+easily accomplished, and a complete example will be presented in
+section 3.
+
+
+2. How to Build External Modules
+================================
+
+To build external modules, you must have a prebuilt kernel available
+that contains the configuration and header files used in the build.
+Also, the kernel must have been built with modules enabled. If you are
+using a distribution kernel, there will be a package for the kernel you
+are running provided by your distribution.
+
+An alternative is to use the "make" target "modules_prepare." This will
+make sure the kernel contains the information required. The target
+exists solely as a simple way to prepare a kernel source tree for
+building external modules.
+
+NOTE: "modules_prepare" will not build Module.symvers even if
+CONFIG_MODVERSIONS is set; therefore, a full kernel build needs to be
+executed to make module versioning work.
+
+2.1 Command Syntax
+==================
+
+	The command to build an external module is::
+
+		$ make -C <path_to_kernel_src> M=$PWD
+
+	The kbuild system knows that an external module is being built
+	due to the "M=<dir>" option given in the command.
+
+	To build against the running kernel use::
+
+		$ make -C /lib/modules/`uname -r`/build M=$PWD
+
+	Then to install the module(s) just built, add the target
+	"modules_install" to the command::
+
+		$ make -C /lib/modules/`uname -r`/build M=$PWD modules_install
+
+2.2 Options
+===========
+
+	($KDIR refers to the path of the kernel source directory.)
+
+	make -C $KDIR M=$PWD
+
+	-C $KDIR
+		The directory where the kernel source is located.
+		"make" will actually change to the specified directory
+		when executing and will change back when finished.
+
+	M=$PWD
+		Informs kbuild that an external module is being built.
+		The value given to "M" is the absolute path of the
+		directory where the external module (kbuild file) is
+		located.
+
+2.3 Targets
+===========
+
+	When building an external module, only a subset of the "make"
+	targets are available.
+
+	make -C $KDIR M=$PWD [target]
+
+	The default will build the module(s) located in the current
+	directory, so a target does not need to be specified. All
+	output files will also be generated in this directory. No
+	attempts are made to update the kernel source, and it is a
+	precondition that a successful "make" has been executed for the
+	kernel.
+
+	modules
+		The default target for external modules. It has the
+		same functionality as if no target was specified. See
+		description above.
+
+	modules_install
+		Install the external module(s). The default location is
+		/lib/modules/<kernel_release>/extra/, but a prefix may
+		be added with INSTALL_MOD_PATH (discussed in section 5).
+
+	clean
+		Remove all generated files in the module directory only.
+
+	help
+		List the available targets for external modules.
+
+2.4 Building Separate Files
+===========================
+
+	It is possible to build single files that are part of a module.
+	This works equally well for the kernel, a module, and even for
+	external modules.
+
+	Example (The module foo.ko, consist of bar.o and baz.o)::
+
+		make -C $KDIR M=$PWD bar.lst
+		make -C $KDIR M=$PWD baz.o
+		make -C $KDIR M=$PWD foo.ko
+		make -C $KDIR M=$PWD ./
+
+
+3. Creating a Kbuild File for an External Module
+================================================
+
+In the last section we saw the command to build a module for the
+running kernel. The module is not actually built, however, because a
+build file is required. Contained in this file will be the name of
+the module(s) being built, along with the list of requisite source
+files. The file may be as simple as a single line::
+
+	obj-m := <module_name>.o
+
+The kbuild system will build <module_name>.o from <module_name>.c,
+and, after linking, will result in the kernel module <module_name>.ko.
+The above line can be put in either a "Kbuild" file or a "Makefile."
+When the module is built from multiple sources, an additional line is
+needed listing the files::
+
+	<module_name>-y := <src1>.o <src2>.o ...
+
+NOTE: Further documentation describing the syntax used by kbuild is
+located in Documentation/kbuild/makefiles.rst.
+
+The examples below demonstrate how to create a build file for the
+module 8123.ko, which is built from the following files::
+
+	8123_if.c
+	8123_if.h
+	8123_pci.c
+	8123_bin.o_shipped	<= Binary blob
+
+--- 3.1 Shared Makefile
+
+	An external module always includes a wrapper makefile that
+	supports building the module using "make" with no arguments.
+	This target is not used by kbuild; it is only for convenience.
+	Additional functionality, such as test targets, can be included
+	but should be filtered out from kbuild due to possible name
+	clashes.
+
+	Example 1::
+
+		--> filename: Makefile
+		ifneq ($(KERNELRELEASE),)
+		# kbuild part of makefile
+		obj-m  := 8123.o
+		8123-y := 8123_if.o 8123_pci.o 8123_bin.o
+
+		else
+		# normal makefile
+		KDIR ?= /lib/modules/`uname -r`/build
+
+		default:
+			$(MAKE) -C $(KDIR) M=$$PWD
+
+		# Module specific targets
+		genbin:
+			echo "X" > 8123_bin.o_shipped
+
+		endif
+
+	The check for KERNELRELEASE is used to separate the two parts
+	of the makefile. In the example, kbuild will only see the two
+	assignments, whereas "make" will see everything except these
+	two assignments. This is due to two passes made on the file:
+	the first pass is by the "make" instance run on the command
+	line; the second pass is by the kbuild system, which is
+	initiated by the parameterized "make" in the default target.
+
+3.2 Separate Kbuild File and Makefile
+-------------------------------------
+
+	In newer versions of the kernel, kbuild will first look for a
+	file named "Kbuild," and only if that is not found, will it
+	then look for a makefile. Utilizing a "Kbuild" file allows us
+	to split up the makefile from example 1 into two files:
+
+	Example 2::
+
+		--> filename: Kbuild
+		obj-m  := 8123.o
+		8123-y := 8123_if.o 8123_pci.o 8123_bin.o
+
+		--> filename: Makefile
+		KDIR ?= /lib/modules/`uname -r`/build
+
+		default:
+			$(MAKE) -C $(KDIR) M=$$PWD
+
+		# Module specific targets
+		genbin:
+			echo "X" > 8123_bin.o_shipped
+
+	The split in example 2 is questionable due to the simplicity of
+	each file; however, some external modules use makefiles
+	consisting of several hundred lines, and here it really pays
+	off to separate the kbuild part from the rest.
+
+	The next example shows a backward compatible version.
+
+	Example 3::
+
+		--> filename: Kbuild
+		obj-m  := 8123.o
+		8123-y := 8123_if.o 8123_pci.o 8123_bin.o
+
+		--> filename: Makefile
+		ifneq ($(KERNELRELEASE),)
+		# kbuild part of makefile
+		include Kbuild
+
+		else
+		# normal makefile
+		KDIR ?= /lib/modules/`uname -r`/build
+
+		default:
+			$(MAKE) -C $(KDIR) M=$$PWD
+
+		# Module specific targets
+		genbin:
+			echo "X" > 8123_bin.o_shipped
+
+		endif
+
+	Here the "Kbuild" file is included from the makefile. This
+	allows an older version of kbuild, which only knows of
+	makefiles, to be used when the "make" and kbuild parts are
+	split into separate files.
+
+3.3 Binary Blobs
+----------------
+
+	Some external modules need to include an object file as a blob.
+	kbuild has support for this, but requires the blob file to be
+	named <filename>_shipped. When the kbuild rules kick in, a copy
+	of <filename>_shipped is created with _shipped stripped off,
+	giving us <filename>. This shortened filename can be used in
+	the assignment to the module.
+
+	Throughout this section, 8123_bin.o_shipped has been used to
+	build the kernel module 8123.ko; it has been included as
+	8123_bin.o::
+
+		8123-y := 8123_if.o 8123_pci.o 8123_bin.o
+
+	Although there is no distinction between the ordinary source
+	files and the binary file, kbuild will pick up different rules
+	when creating the object file for the module.
+
+3.4 Building Multiple Modules
+=============================
+
+	kbuild supports building multiple modules with a single build
+	file. For example, if you wanted to build two modules, foo.ko
+	and bar.ko, the kbuild lines would be::
+
+		obj-m := foo.o bar.o
+		foo-y := <foo_srcs>
+		bar-y := <bar_srcs>
+
+	It is that simple!
+
+
+4. Include Files
+================
+
+Within the kernel, header files are kept in standard locations
+according to the following rule:
+
+	* If the header file only describes the internal interface of a
+	  module, then the file is placed in the same directory as the
+	  source files.
+	* If the header file describes an interface used by other parts
+	  of the kernel that are located in different directories, then
+	  the file is placed in include/linux/.
+
+	  NOTE:
+	      There are two notable exceptions to this rule: larger
+	      subsystems have their own directory under include/, such as
+	      include/scsi; and architecture specific headers are located
+	      under arch/$(ARCH)/include/.
+
+4.1 Kernel Includes
+-------------------
+
+	To include a header file located under include/linux/, simply
+	use::
+
+		#include <linux/module.h>
+
+	kbuild will add options to "gcc" so the relevant directories
+	are searched.
+
+4.2 Single Subdirectory
+-----------------------
+
+	External modules tend to place header files in a separate
+	include/ directory where their source is located, although this
+	is not the usual kernel style. To inform kbuild of the
+	directory, use either ccflags-y or CFLAGS_<filename>.o.
+
+	Using the example from section 3, if we moved 8123_if.h to a
+	subdirectory named include, the resulting kbuild file would
+	look like::
+
+		--> filename: Kbuild
+		obj-m := 8123.o
+
+		ccflags-y := -Iinclude
+		8123-y := 8123_if.o 8123_pci.o 8123_bin.o
+
+	Note that in the assignment there is no space between -I and
+	the path. This is a limitation of kbuild: there must be no
+	space present.
+
+4.3 Several Subdirectories
+--------------------------
+
+	kbuild can handle files that are spread over several directories.
+	Consider the following example::
+
+		.
+		|__ src
+		|   |__ complex_main.c
+		|   |__ hal
+		|	|__ hardwareif.c
+		|	|__ include
+		|	    |__ hardwareif.h
+		|__ include
+		|__ complex.h
+
+	To build the module complex.ko, we then need the following
+	kbuild file::
+
+		--> filename: Kbuild
+		obj-m := complex.o
+		complex-y := src/complex_main.o
+		complex-y += src/hal/hardwareif.o
+
+		ccflags-y := -I$(src)/include
+		ccflags-y += -I$(src)/src/hal/include
+
+	As you can see, kbuild knows how to handle object files located
+	in other directories. The trick is to specify the directory
+	relative to the kbuild file's location. That being said, this
+	is NOT recommended practice.
+
+	For the header files, kbuild must be explicitly told where to
+	look. When kbuild executes, the current directory is always the
+	root of the kernel tree (the argument to "-C") and therefore an
+	absolute path is needed. $(src) provides the absolute path by
+	pointing to the directory where the currently executing kbuild
+	file is located.
+
+
+5. Module Installation
+======================
+
+Modules which are included in the kernel are installed in the
+directory:
+
+	/lib/modules/$(KERNELRELEASE)/kernel/
+
+And external modules are installed in:
+
+	/lib/modules/$(KERNELRELEASE)/extra/
+
+5.1 INSTALL_MOD_PATH
+--------------------
+
+	Above are the default directories but as always some level of
+	customization is possible. A prefix can be added to the
+	installation path using the variable INSTALL_MOD_PATH::
+
+		$ make INSTALL_MOD_PATH=/frodo modules_install
+		=> Install dir: /frodo/lib/modules/$(KERNELRELEASE)/kernel/
+
+	INSTALL_MOD_PATH may be set as an ordinary shell variable or,
+	as shown above, can be specified on the command line when
+	calling "make." This has effect when installing both in-tree
+	and out-of-tree modules.
+
+5.2 INSTALL_MOD_DIR
+-------------------
+
+	External modules are by default installed to a directory under
+	/lib/modules/$(KERNELRELEASE)/extra/, but you may wish to
+	locate modules for a specific functionality in a separate
+	directory. For this purpose, use INSTALL_MOD_DIR to specify an
+	alternative name to "extra."::
+
+		$ make INSTALL_MOD_DIR=gandalf -C $KDIR \
+		       M=$PWD modules_install
+		=> Install dir: /lib/modules/$(KERNELRELEASE)/gandalf/
+
+
+6. Module Versioning
+====================
+
+Module versioning is enabled by the CONFIG_MODVERSIONS tag, and is used
+as a simple ABI consistency check. A CRC value of the full prototype
+for an exported symbol is created. When a module is loaded/used, the
+CRC values contained in the kernel are compared with similar values in
+the module; if they are not equal, the kernel refuses to load the
+module.
+
+Module.symvers contains a list of all exported symbols from a kernel
+build.
+
+6.1 Symbols From the Kernel (vmlinux + modules)
+-----------------------------------------------
+
+	During a kernel build, a file named Module.symvers will be
+	generated. Module.symvers contains all exported symbols from
+	the kernel and compiled modules. For each symbol, the
+	corresponding CRC value is also stored.
+
+	The syntax of the Module.symvers file is::
+
+		<CRC>	    <Symbol>	       <module>
+
+		0x2d036834  scsi_remove_host   drivers/scsi/scsi_mod
+
+	For a kernel build without CONFIG_MODVERSIONS enabled, the CRC
+	would read 0x00000000.
+
+	Module.symvers serves two purposes:
+
+	1) It lists all exported symbols from vmlinux and all modules.
+	2) It lists the CRC if CONFIG_MODVERSIONS is enabled.
+
+6.2 Symbols and External Modules
+--------------------------------
+
+	When building an external module, the build system needs access
+	to the symbols from the kernel to check if all external symbols
+	are defined. This is done in the MODPOST step. modpost obtains
+	the symbols by reading Module.symvers from the kernel source
+	tree. If a Module.symvers file is present in the directory
+	where the external module is being built, this file will be
+	read too. During the MODPOST step, a new Module.symvers file
+	will be written containing all exported symbols that were not
+	defined in the kernel.
+
+--- 6.3 Symbols From Another External Module
+
+	Sometimes, an external module uses exported symbols from
+	another external module. kbuild needs to have full knowledge of
+	all symbols to avoid spitting out warnings about undefined
+	symbols. Three solutions exist for this situation.
+
+	NOTE: The method with a top-level kbuild file is recommended
+	but may be impractical in certain situations.
+
+	Use a top-level kbuild file
+		If you have two modules, foo.ko and bar.ko, where
+		foo.ko needs symbols from bar.ko, you can use a
+		common top-level kbuild file so both modules are
+		compiled in the same build. Consider the following
+		directory layout::
+
+			./foo/ <= contains foo.ko
+			./bar/ <= contains bar.ko
+
+		The top-level kbuild file would then look like::
+
+			#./Kbuild (or ./Makefile):
+				obj-y := foo/ bar/
+
+		And executing::
+
+			$ make -C $KDIR M=$PWD
+
+		will then do the expected and compile both modules with
+		full knowledge of symbols from either module.
+
+	Use an extra Module.symvers file
+		When an external module is built, a Module.symvers file
+		is generated containing all exported symbols which are
+		not defined in the kernel. To get access to symbols
+		from bar.ko, copy the Module.symvers file from the
+		compilation of bar.ko to the directory where foo.ko is
+		built. During the module build, kbuild will read the
+		Module.symvers file in the directory of the external
+		module, and when the build is finished, a new
+		Module.symvers file is created containing the sum of
+		all symbols defined and not part of the kernel.
+
+	Use "make" variable KBUILD_EXTRA_SYMBOLS
+		If it is impractical to copy Module.symvers from
+		another module, you can assign a space separated list
+		of files to KBUILD_EXTRA_SYMBOLS in your build file.
+		These files will be loaded by modpost during the
+		initialization of its symbol tables.
+
+
+7. Tips & Tricks
+================
+
+7.1 Testing for CONFIG_FOO_BAR
+------------------------------
+
+	Modules often need to check for certain `CONFIG_` options to
+	decide if a specific feature is included in the module. In
+	kbuild this is done by referencing the `CONFIG_` variable
+	directly::
+
+		#fs/ext2/Makefile
+		obj-$(CONFIG_EXT2_FS) += ext2.o
+
+		ext2-y := balloc.o bitmap.o dir.o
+		ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o
+
+	External modules have traditionally used "grep" to check for
+	specific `CONFIG_` settings directly in .config. This usage is
+	broken. As introduced before, external modules should use
+	kbuild for building and can therefore use the same methods as
+	in-tree modules when testing for `CONFIG_` definitions.
diff --git a/Documentation/kbuild/modules.txt b/Documentation/kbuild/modules.txt
deleted file mode 100644
index 80295c613e37..000000000000
--- a/Documentation/kbuild/modules.txt
+++ /dev/null
@@ -1,541 +0,0 @@
-Building External Modules
-
-This document describes how to build an out-of-tree kernel module.
-
-=== Table of Contents
-
-	=== 1 Introduction
-	=== 2 How to Build External Modules
-	   --- 2.1 Command Syntax
-	   --- 2.2 Options
-	   --- 2.3 Targets
-	   --- 2.4 Building Separate Files
-	=== 3. Creating a Kbuild File for an External Module
-	   --- 3.1 Shared Makefile
-	   --- 3.2 Separate Kbuild file and Makefile
-	   --- 3.3 Binary Blobs
-	   --- 3.4 Building Multiple Modules
-	=== 4. Include Files
-	   --- 4.1 Kernel Includes
-	   --- 4.2 Single Subdirectory
-	   --- 4.3 Several Subdirectories
-	=== 5. Module Installation
-	   --- 5.1 INSTALL_MOD_PATH
-	   --- 5.2 INSTALL_MOD_DIR
-	=== 6. Module Versioning
-	   --- 6.1 Symbols From the Kernel (vmlinux + modules)
-	   --- 6.2 Symbols and External Modules
-	   --- 6.3 Symbols From Another External Module
-	=== 7. Tips & Tricks
-	   --- 7.1 Testing for CONFIG_FOO_BAR
-
-
-
-=== 1. Introduction
-
-"kbuild" is the build system used by the Linux kernel. Modules must use
-kbuild to stay compatible with changes in the build infrastructure and
-to pick up the right flags to "gcc." Functionality for building modules
-both in-tree and out-of-tree is provided. The method for building
-either is similar, and all modules are initially developed and built
-out-of-tree.
-
-Covered in this document is information aimed at developers interested
-in building out-of-tree (or "external") modules. The author of an
-external module should supply a makefile that hides most of the
-complexity, so one only has to type "make" to build the module. This is
-easily accomplished, and a complete example will be presented in
-section 3.
-
-
-=== 2. How to Build External Modules
-
-To build external modules, you must have a prebuilt kernel available
-that contains the configuration and header files used in the build.
-Also, the kernel must have been built with modules enabled. If you are
-using a distribution kernel, there will be a package for the kernel you
-are running provided by your distribution.
-
-An alternative is to use the "make" target "modules_prepare." This will
-make sure the kernel contains the information required. The target
-exists solely as a simple way to prepare a kernel source tree for
-building external modules.
-
-NOTE: "modules_prepare" will not build Module.symvers even if
-CONFIG_MODVERSIONS is set; therefore, a full kernel build needs to be
-executed to make module versioning work.
-
---- 2.1 Command Syntax
-
-	The command to build an external module is:
-
-		$ make -C <path_to_kernel_src> M=$PWD
-
-	The kbuild system knows that an external module is being built
-	due to the "M=<dir>" option given in the command.
-
-	To build against the running kernel use:
-
-		$ make -C /lib/modules/`uname -r`/build M=$PWD
-
-	Then to install the module(s) just built, add the target
-	"modules_install" to the command:
-
-		$ make -C /lib/modules/`uname -r`/build M=$PWD modules_install
-
---- 2.2 Options
-
-	($KDIR refers to the path of the kernel source directory.)
-
-	make -C $KDIR M=$PWD
-
-	-C $KDIR
-		The directory where the kernel source is located.
-		"make" will actually change to the specified directory
-		when executing and will change back when finished.
-
-	M=$PWD
-		Informs kbuild that an external module is being built.
-		The value given to "M" is the absolute path of the
-		directory where the external module (kbuild file) is
-		located.
-
---- 2.3 Targets
-
-	When building an external module, only a subset of the "make"
-	targets are available.
-
-	make -C $KDIR M=$PWD [target]
-
-	The default will build the module(s) located in the current
-	directory, so a target does not need to be specified. All
-	output files will also be generated in this directory. No
-	attempts are made to update the kernel source, and it is a
-	precondition that a successful "make" has been executed for the
-	kernel.
-
-	modules
-		The default target for external modules. It has the
-		same functionality as if no target was specified. See
-		description above.
-
-	modules_install
-		Install the external module(s). The default location is
-		/lib/modules/<kernel_release>/extra/, but a prefix may
-		be added with INSTALL_MOD_PATH (discussed in section 5).
-
-	clean
-		Remove all generated files in the module directory only.
-
-	help
-		List the available targets for external modules.
-
---- 2.4 Building Separate Files
-
-	It is possible to build single files that are part of a module.
-	This works equally well for the kernel, a module, and even for
-	external modules.
-
-	Example (The module foo.ko, consist of bar.o and baz.o):
-		make -C $KDIR M=$PWD bar.lst
-		make -C $KDIR M=$PWD baz.o
-		make -C $KDIR M=$PWD foo.ko
-		make -C $KDIR M=$PWD ./
-
-
-=== 3. Creating a Kbuild File for an External Module
-
-In the last section we saw the command to build a module for the
-running kernel. The module is not actually built, however, because a
-build file is required. Contained in this file will be the name of
-the module(s) being built, along with the list of requisite source
-files. The file may be as simple as a single line:
-
-	obj-m := <module_name>.o
-
-The kbuild system will build <module_name>.o from <module_name>.c,
-and, after linking, will result in the kernel module <module_name>.ko.
-The above line can be put in either a "Kbuild" file or a "Makefile."
-When the module is built from multiple sources, an additional line is
-needed listing the files:
-
-	<module_name>-y := <src1>.o <src2>.o ...
-
-NOTE: Further documentation describing the syntax used by kbuild is
-located in Documentation/kbuild/makefiles.txt.
-
-The examples below demonstrate how to create a build file for the
-module 8123.ko, which is built from the following files:
-
-	8123_if.c
-	8123_if.h
-	8123_pci.c
-	8123_bin.o_shipped	<= Binary blob
-
---- 3.1 Shared Makefile
-
-	An external module always includes a wrapper makefile that
-	supports building the module using "make" with no arguments.
-	This target is not used by kbuild; it is only for convenience.
-	Additional functionality, such as test targets, can be included
-	but should be filtered out from kbuild due to possible name
-	clashes.
-
-	Example 1:
-		--> filename: Makefile
-		ifneq ($(KERNELRELEASE),)
-		# kbuild part of makefile
-		obj-m  := 8123.o
-		8123-y := 8123_if.o 8123_pci.o 8123_bin.o
-
-		else
-		# normal makefile
-		KDIR ?= /lib/modules/`uname -r`/build
-
-		default:
-			$(MAKE) -C $(KDIR) M=$$PWD
-
-		# Module specific targets
-		genbin:
-			echo "X" > 8123_bin.o_shipped
-
-		endif
-
-	The check for KERNELRELEASE is used to separate the two parts
-	of the makefile. In the example, kbuild will only see the two
-	assignments, whereas "make" will see everything except these
-	two assignments. This is due to two passes made on the file:
-	the first pass is by the "make" instance run on the command
-	line; the second pass is by the kbuild system, which is
-	initiated by the parameterized "make" in the default target.
-
---- 3.2 Separate Kbuild File and Makefile
-
-	In newer versions of the kernel, kbuild will first look for a
-	file named "Kbuild," and only if that is not found, will it
-	then look for a makefile. Utilizing a "Kbuild" file allows us
-	to split up the makefile from example 1 into two files:
-
-	Example 2:
-		--> filename: Kbuild
-		obj-m  := 8123.o
-		8123-y := 8123_if.o 8123_pci.o 8123_bin.o
-
-		--> filename: Makefile
-		KDIR ?= /lib/modules/`uname -r`/build
-
-		default:
-			$(MAKE) -C $(KDIR) M=$$PWD
-
-		# Module specific targets
-		genbin:
-			echo "X" > 8123_bin.o_shipped
-
-	The split in example 2 is questionable due to the simplicity of
-	each file; however, some external modules use makefiles
-	consisting of several hundred lines, and here it really pays
-	off to separate the kbuild part from the rest.
-
-	The next example shows a backward compatible version.
-
-	Example 3:
-		--> filename: Kbuild
-		obj-m  := 8123.o
-		8123-y := 8123_if.o 8123_pci.o 8123_bin.o
-
-		--> filename: Makefile
-		ifneq ($(KERNELRELEASE),)
-		# kbuild part of makefile
-		include Kbuild
-
-		else
-		# normal makefile
-		KDIR ?= /lib/modules/`uname -r`/build
-
-		default:
-			$(MAKE) -C $(KDIR) M=$$PWD
-
-		# Module specific targets
-		genbin:
-			echo "X" > 8123_bin.o_shipped
-
-		endif
-
-	Here the "Kbuild" file is included from the makefile. This
-	allows an older version of kbuild, which only knows of
-	makefiles, to be used when the "make" and kbuild parts are
-	split into separate files.
-
---- 3.3 Binary Blobs
-
-	Some external modules need to include an object file as a blob.
-	kbuild has support for this, but requires the blob file to be
-	named <filename>_shipped. When the kbuild rules kick in, a copy
-	of <filename>_shipped is created with _shipped stripped off,
-	giving us <filename>. This shortened filename can be used in
-	the assignment to the module.
-
-	Throughout this section, 8123_bin.o_shipped has been used to
-	build the kernel module 8123.ko; it has been included as
-	8123_bin.o.
-
-		8123-y := 8123_if.o 8123_pci.o 8123_bin.o
-
-	Although there is no distinction between the ordinary source
-	files and the binary file, kbuild will pick up different rules
-	when creating the object file for the module.
-
---- 3.4 Building Multiple Modules
-
-	kbuild supports building multiple modules with a single build
-	file. For example, if you wanted to build two modules, foo.ko
-	and bar.ko, the kbuild lines would be:
-
-		obj-m := foo.o bar.o
-		foo-y := <foo_srcs>
-		bar-y := <bar_srcs>
-
-	It is that simple!
-
-
-=== 4. Include Files
-
-Within the kernel, header files are kept in standard locations
-according to the following rule:
-
-	* If the header file only describes the internal interface of a
-	  module, then the file is placed in the same directory as the
-	  source files.
-	* If the header file describes an interface used by other parts
-	  of the kernel that are located in different directories, then
-	  the file is placed in include/linux/.
-
-	  NOTE: There are two notable exceptions to this rule: larger
-	  subsystems have their own directory under include/, such as
-	  include/scsi; and architecture specific headers are located
-	  under arch/$(ARCH)/include/.
-
---- 4.1 Kernel Includes
-
-	To include a header file located under include/linux/, simply
-	use:
-
-		#include <linux/module.h>
-
-	kbuild will add options to "gcc" so the relevant directories
-	are searched.
-
---- 4.2 Single Subdirectory
-
-	External modules tend to place header files in a separate
-	include/ directory where their source is located, although this
-	is not the usual kernel style. To inform kbuild of the
-	directory, use either ccflags-y or CFLAGS_<filename>.o.
-
-	Using the example from section 3, if we moved 8123_if.h to a
-	subdirectory named include, the resulting kbuild file would
-	look like:
-
-		--> filename: Kbuild
-		obj-m := 8123.o
-
-		ccflags-y := -Iinclude
-		8123-y := 8123_if.o 8123_pci.o 8123_bin.o
-
-	Note that in the assignment there is no space between -I and
-	the path. This is a limitation of kbuild: there must be no
-	space present.
-
---- 4.3 Several Subdirectories
-
-	kbuild can handle files that are spread over several directories.
-	Consider the following example:
-
-	.
-	|__ src
-	|   |__ complex_main.c
-	|   |__ hal
-	|	|__ hardwareif.c
-	|	|__ include
-	|	    |__ hardwareif.h
-	|__ include
-	    |__ complex.h
-
-	To build the module complex.ko, we then need the following
-	kbuild file:
-
-		--> filename: Kbuild
-		obj-m := complex.o
-		complex-y := src/complex_main.o
-		complex-y += src/hal/hardwareif.o
-
-		ccflags-y := -I$(src)/include
-		ccflags-y += -I$(src)/src/hal/include
-
-	As you can see, kbuild knows how to handle object files located
-	in other directories. The trick is to specify the directory
-	relative to the kbuild file's location. That being said, this
-	is NOT recommended practice.
-
-	For the header files, kbuild must be explicitly told where to
-	look. When kbuild executes, the current directory is always the
-	root of the kernel tree (the argument to "-C") and therefore an
-	absolute path is needed. $(src) provides the absolute path by
-	pointing to the directory where the currently executing kbuild
-	file is located.
-
-
-=== 5. Module Installation
-
-Modules which are included in the kernel are installed in the
-directory:
-
-	/lib/modules/$(KERNELRELEASE)/kernel/
-
-And external modules are installed in:
-
-	/lib/modules/$(KERNELRELEASE)/extra/
-
---- 5.1 INSTALL_MOD_PATH
-
-	Above are the default directories but as always some level of
-	customization is possible. A prefix can be added to the
-	installation path using the variable INSTALL_MOD_PATH:
-
-		$ make INSTALL_MOD_PATH=/frodo modules_install
-		=> Install dir: /frodo/lib/modules/$(KERNELRELEASE)/kernel/
-
-	INSTALL_MOD_PATH may be set as an ordinary shell variable or,
-	as shown above, can be specified on the command line when
-	calling "make." This has effect when installing both in-tree
-	and out-of-tree modules.
-
---- 5.2 INSTALL_MOD_DIR
-
-	External modules are by default installed to a directory under
-	/lib/modules/$(KERNELRELEASE)/extra/, but you may wish to
-	locate modules for a specific functionality in a separate
-	directory. For this purpose, use INSTALL_MOD_DIR to specify an
-	alternative name to "extra."
-
-		$ make INSTALL_MOD_DIR=gandalf -C $KDIR \
-		       M=$PWD modules_install
-		=> Install dir: /lib/modules/$(KERNELRELEASE)/gandalf/
-
-
-=== 6. Module Versioning
-
-Module versioning is enabled by the CONFIG_MODVERSIONS tag, and is used
-as a simple ABI consistency check. A CRC value of the full prototype
-for an exported symbol is created. When a module is loaded/used, the
-CRC values contained in the kernel are compared with similar values in
-the module; if they are not equal, the kernel refuses to load the
-module.
-
-Module.symvers contains a list of all exported symbols from a kernel
-build.
-
---- 6.1 Symbols From the Kernel (vmlinux + modules)
-
-	During a kernel build, a file named Module.symvers will be
-	generated. Module.symvers contains all exported symbols from
-	the kernel and compiled modules. For each symbol, the
-	corresponding CRC value is also stored.
-
-	The syntax of the Module.symvers file is:
-		<CRC>	    <Symbol>	       <module>
-
-		0x2d036834  scsi_remove_host   drivers/scsi/scsi_mod
-
-	For a kernel build without CONFIG_MODVERSIONS enabled, the CRC
-	would read 0x00000000.
-
-	Module.symvers serves two purposes:
-	1) It lists all exported symbols from vmlinux and all modules.
-	2) It lists the CRC if CONFIG_MODVERSIONS is enabled.
-
---- 6.2 Symbols and External Modules
-
-	When building an external module, the build system needs access
-	to the symbols from the kernel to check if all external symbols
-	are defined. This is done in the MODPOST step. modpost obtains
-	the symbols by reading Module.symvers from the kernel source
-	tree. If a Module.symvers file is present in the directory
-	where the external module is being built, this file will be
-	read too. During the MODPOST step, a new Module.symvers file
-	will be written containing all exported symbols that were not
-	defined in the kernel.
-
---- 6.3 Symbols From Another External Module
-
-	Sometimes, an external module uses exported symbols from
-	another external module. kbuild needs to have full knowledge of
-	all symbols to avoid spitting out warnings about undefined
-	symbols. Three solutions exist for this situation.
-
-	NOTE: The method with a top-level kbuild file is recommended
-	but may be impractical in certain situations.
-
-	Use a top-level kbuild file
-		If you have two modules, foo.ko and bar.ko, where
-		foo.ko needs symbols from bar.ko, you can use a
-		common top-level kbuild file so both modules are
-		compiled in the same build. Consider the following
-		directory layout:
-
-		./foo/ <= contains foo.ko
-		./bar/ <= contains bar.ko
-
-		The top-level kbuild file would then look like:
-
-		#./Kbuild (or ./Makefile):
-			obj-y := foo/ bar/
-
-		And executing
-
-			$ make -C $KDIR M=$PWD
-
-		will then do the expected and compile both modules with
-		full knowledge of symbols from either module.
-
-	Use an extra Module.symvers file
-		When an external module is built, a Module.symvers file
-		is generated containing all exported symbols which are
-		not defined in the kernel. To get access to symbols
-		from bar.ko, copy the Module.symvers file from the
-		compilation of bar.ko to the directory where foo.ko is
-		built. During the module build, kbuild will read the
-		Module.symvers file in the directory of the external
-		module, and when the build is finished, a new
-		Module.symvers file is created containing the sum of
-		all symbols defined and not part of the kernel.
-
-	Use "make" variable KBUILD_EXTRA_SYMBOLS
-		If it is impractical to copy Module.symvers from
-		another module, you can assign a space separated list
-		of files to KBUILD_EXTRA_SYMBOLS in your build file.
-		These files will be loaded by modpost during the
-		initialization of its symbol tables.
-
-
-=== 7. Tips & Tricks
-
---- 7.1 Testing for CONFIG_FOO_BAR
-
-	Modules often need to check for certain CONFIG_ options to
-	decide if a specific feature is included in the module. In
-	kbuild this is done by referencing the CONFIG_ variable
-	directly.
-
-		#fs/ext2/Makefile
-		obj-$(CONFIG_EXT2_FS) += ext2.o
-
-		ext2-y := balloc.o bitmap.o dir.o
-		ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o
-
-	External modules have traditionally used "grep" to check for
-	specific CONFIG_ settings directly in .config. This usage is
-	broken. As introduced before, external modules should use
-	kbuild for building and can therefore use the same methods as
-	in-tree modules when testing for CONFIG_ definitions.
-
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
deleted file mode 100644
index 51814450a7f8..000000000000
--- a/Documentation/kdump/kdump.txt
+++ /dev/null
@@ -1,509 +0,0 @@
-================================================================
-Documentation for Kdump - The kexec-based Crash Dumping Solution
-================================================================
-
-This document includes overview, setup and installation, and analysis
-information.
-
-Overview
-========
-
-Kdump uses kexec to quickly boot to a dump-capture kernel whenever a
-dump of the system kernel's memory needs to be taken (for example, when
-the system panics). The system kernel's memory image is preserved across
-the reboot and is accessible to the dump-capture kernel.
-
-You can use common commands, such as cp and scp, to copy the
-memory image to a dump file on the local disk, or across the network to
-a remote system.
-
-Kdump and kexec are currently supported on the x86, x86_64, ppc64, ia64,
-s390x, arm and arm64 architectures.
-
-When the system kernel boots, it reserves a small section of memory for
-the dump-capture kernel. This ensures that ongoing Direct Memory Access
-(DMA) from the system kernel does not corrupt the dump-capture kernel.
-The kexec -p command loads the dump-capture kernel into this reserved
-memory.
-
-On x86 machines, the first 640 KB of physical memory is needed to boot,
-regardless of where the kernel loads. Therefore, kexec backs up this
-region just before rebooting into the dump-capture kernel.
-
-Similarly on PPC64 machines first 32KB of physical memory is needed for
-booting regardless of where the kernel is loaded and to support 64K page
-size kexec backs up the first 64KB memory.
-
-For s390x, when kdump is triggered, the crashkernel region is exchanged
-with the region [0, crashkernel region size] and then the kdump kernel
-runs in [0, crashkernel region size]. Therefore no relocatable kernel is
-needed for s390x.
-
-All of the necessary information about the system kernel's core image is
-encoded in the ELF format, and stored in a reserved area of memory
-before a crash. The physical address of the start of the ELF header is
-passed to the dump-capture kernel through the elfcorehdr= boot
-parameter. Optionally the size of the ELF header can also be passed
-when using the elfcorehdr=[size[KMG]@]offset[KMG] syntax.
-
-
-With the dump-capture kernel, you can access the memory image through
-/proc/vmcore. This exports the dump as an ELF-format file that you can
-write out using file copy commands such as cp or scp. Further, you can
-use analysis tools such as the GNU Debugger (GDB) and the Crash tool to
-debug the dump file. This method ensures that the dump pages are correctly
-ordered.
-
-
-Setup and Installation
-======================
-
-Install kexec-tools
--------------------
-
-1) Login as the root user.
-
-2) Download the kexec-tools user-space package from the following URL:
-
-http://kernel.org/pub/linux/utils/kernel/kexec/kexec-tools.tar.gz
-
-This is a symlink to the latest version.
-
-The latest kexec-tools git tree is available at:
-
-git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git
-and
-http://www.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git
-
-There is also a gitweb interface available at
-http://www.kernel.org/git/?p=utils/kernel/kexec/kexec-tools.git
-
-More information about kexec-tools can be found at
-http://horms.net/projects/kexec/
-
-3) Unpack the tarball with the tar command, as follows:
-
-   tar xvpzf kexec-tools.tar.gz
-
-4) Change to the kexec-tools directory, as follows:
-
-   cd kexec-tools-VERSION
-
-5) Configure the package, as follows:
-
-   ./configure
-
-6) Compile the package, as follows:
-
-   make
-
-7) Install the package, as follows:
-
-   make install
-
-
-Build the system and dump-capture kernels
------------------------------------------
-There are two possible methods of using Kdump.
-
-1) Build a separate custom dump-capture kernel for capturing the
-   kernel core dump.
-
-2) Or use the system kernel binary itself as dump-capture kernel and there is
-   no need to build a separate dump-capture kernel. This is possible
-   only with the architectures which support a relocatable kernel. As
-   of today, i386, x86_64, ppc64, ia64, arm and arm64 architectures support
-   relocatable kernel.
-
-Building a relocatable kernel is advantageous from the point of view that
-one does not have to build a second kernel for capturing the dump. But
-at the same time one might want to build a custom dump capture kernel
-suitable to his needs.
-
-Following are the configuration setting required for system and
-dump-capture kernels for enabling kdump support.
-
-System kernel config options
-----------------------------
-
-1) Enable "kexec system call" in "Processor type and features."
-
-   CONFIG_KEXEC=y
-
-2) Enable "sysfs file system support" in "Filesystem" -> "Pseudo
-   filesystems." This is usually enabled by default.
-
-   CONFIG_SYSFS=y
-
-   Note that "sysfs file system support" might not appear in the "Pseudo
-   filesystems" menu if "Configure standard kernel features (for small
-   systems)" is not enabled in "General Setup." In this case, check the
-   .config file itself to ensure that sysfs is turned on, as follows:
-
-   grep 'CONFIG_SYSFS' .config
-
-3) Enable "Compile the kernel with debug info" in "Kernel hacking."
-
-   CONFIG_DEBUG_INFO=Y
-
-   This causes the kernel to be built with debug symbols. The dump
-   analysis tools require a vmlinux with debug symbols in order to read
-   and analyze a dump file.
-
-Dump-capture kernel config options (Arch Independent)
------------------------------------------------------
-
-1) Enable "kernel crash dumps" support under "Processor type and
-   features":
-
-   CONFIG_CRASH_DUMP=y
-
-2) Enable "/proc/vmcore support" under "Filesystems" -> "Pseudo filesystems".
-
-   CONFIG_PROC_VMCORE=y
-   (CONFIG_PROC_VMCORE is set by default when CONFIG_CRASH_DUMP is selected.)
-
-Dump-capture kernel config options (Arch Dependent, i386 and x86_64)
---------------------------------------------------------------------
-
-1) On i386, enable high memory support under "Processor type and
-   features":
-
-   CONFIG_HIGHMEM64G=y
-   or
-   CONFIG_HIGHMEM4G
-
-2) On i386 and x86_64, disable symmetric multi-processing support
-   under "Processor type and features":
-
-   CONFIG_SMP=n
-
-   (If CONFIG_SMP=y, then specify maxcpus=1 on the kernel command line
-   when loading the dump-capture kernel, see section "Load the Dump-capture
-   Kernel".)
-
-3) If one wants to build and use a relocatable kernel,
-   Enable "Build a relocatable kernel" support under "Processor type and
-   features"
-
-   CONFIG_RELOCATABLE=y
-
-4) Use a suitable value for "Physical address where the kernel is
-   loaded" (under "Processor type and features"). This only appears when
-   "kernel crash dumps" is enabled. A suitable value depends upon
-   whether kernel is relocatable or not.
-
-   If you are using a relocatable kernel use CONFIG_PHYSICAL_START=0x100000
-   This will compile the kernel for physical address 1MB, but given the fact
-   kernel is relocatable, it can be run from any physical address hence
-   kexec boot loader will load it in memory region reserved for dump-capture
-   kernel.
-
-   Otherwise it should be the start of memory region reserved for
-   second kernel using boot parameter "crashkernel=Y@X". Here X is
-   start of memory region reserved for dump-capture kernel.
-   Generally X is 16MB (0x1000000). So you can set
-   CONFIG_PHYSICAL_START=0x1000000
-
-5) Make and install the kernel and its modules. DO NOT add this kernel
-   to the boot loader configuration files.
-
-Dump-capture kernel config options (Arch Dependent, ppc64)
-----------------------------------------------------------
-
-1) Enable "Build a kdump crash kernel" support under "Kernel" options:
-
-   CONFIG_CRASH_DUMP=y
-
-2)   Enable "Build a relocatable kernel" support
-
-   CONFIG_RELOCATABLE=y
-
-   Make and install the kernel and its modules.
-
-Dump-capture kernel config options (Arch Dependent, ia64)
-----------------------------------------------------------
-
-- No specific options are required to create a dump-capture kernel
-  for ia64, other than those specified in the arch independent section
-  above. This means that it is possible to use the system kernel
-  as a dump-capture kernel if desired.
-
-  The crashkernel region can be automatically placed by the system
-  kernel at run time. This is done by specifying the base address as 0,
-  or omitting it all together.
-
-  crashkernel=256M@0
-  or
-  crashkernel=256M
-
-  If the start address is specified, note that the start address of the
-  kernel will be aligned to 64Mb, so if the start address is not then
-  any space below the alignment point will be wasted.
-
-Dump-capture kernel config options (Arch Dependent, arm)
-----------------------------------------------------------
-
--   To use a relocatable kernel,
-    Enable "AUTO_ZRELADDR" support under "Boot" options:
-
-    AUTO_ZRELADDR=y
-
-Dump-capture kernel config options (Arch Dependent, arm64)
-----------------------------------------------------------
-
-- Please note that kvm of the dump-capture kernel will not be enabled
-  on non-VHE systems even if it is configured. This is because the CPU
-  will not be reset to EL2 on panic.
-
-Extended crashkernel syntax
-===========================
-
-While the "crashkernel=size[@offset]" syntax is sufficient for most
-configurations, sometimes it's handy to have the reserved memory dependent
-on the value of System RAM -- that's mostly for distributors that pre-setup
-the kernel command line to avoid a unbootable system after some memory has
-been removed from the machine.
-
-The syntax is:
-
-    crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset]
-    range=start-[end]
-
-For example:
-
-    crashkernel=512M-2G:64M,2G-:128M
-
-This would mean:
-
-    1) if the RAM is smaller than 512M, then don't reserve anything
-       (this is the "rescue" case)
-    2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M
-    3) if the RAM size is larger than 2G, then reserve 128M
-
-
-
-Boot into System Kernel
-=======================
-
-1) Update the boot loader (such as grub, yaboot, or lilo) configuration
-   files as necessary.
-
-2) Boot the system kernel with the boot parameter "crashkernel=Y@X",
-   where Y specifies how much memory to reserve for the dump-capture kernel
-   and X specifies the beginning of this reserved memory. For example,
-   "crashkernel=64M@16M" tells the system kernel to reserve 64 MB of memory
-   starting at physical address 0x01000000 (16MB) for the dump-capture kernel.
-
-   On x86 and x86_64, use "crashkernel=64M@16M".
-
-   On ppc64, use "crashkernel=128M@32M".
-
-   On ia64, 256M@256M is a generous value that typically works.
-   The region may be automatically placed on ia64, see the
-   dump-capture kernel config option notes above.
-   If use sparse memory, the size should be rounded to GRANULE boundaries.
-
-   On s390x, typically use "crashkernel=xxM". The value of xx is dependent
-   on the memory consumption of the kdump system. In general this is not
-   dependent on the memory size of the production system.
-
-   On arm, the use of "crashkernel=Y@X" is no longer necessary; the
-   kernel will automatically locate the crash kernel image within the
-   first 512MB of RAM if X is not given.
-
-   On arm64, use "crashkernel=Y[@X]".  Note that the start address of
-   the kernel, X if explicitly specified, must be aligned to 2MiB (0x200000).
-
-Load the Dump-capture Kernel
-============================
-
-After booting to the system kernel, dump-capture kernel needs to be
-loaded.
-
-Based on the architecture and type of image (relocatable or not), one
-can choose to load the uncompressed vmlinux or compressed bzImage/vmlinuz
-of dump-capture kernel. Following is the summary.
-
-For i386 and x86_64:
-	- Use vmlinux if kernel is not relocatable.
-	- Use bzImage/vmlinuz if kernel is relocatable.
-For ppc64:
-	- Use vmlinux
-For ia64:
-	- Use vmlinux or vmlinuz.gz
-For s390x:
-	- Use image or bzImage
-For arm:
-	- Use zImage
-For arm64:
-	- Use vmlinux or Image
-
-If you are using an uncompressed vmlinux image then use following command
-to load dump-capture kernel.
-
-   kexec -p <dump-capture-kernel-vmlinux-image> \
-   --initrd=<initrd-for-dump-capture-kernel> --args-linux \
-   --append="root=<root-dev> <arch-specific-options>"
-
-If you are using a compressed bzImage/vmlinuz, then use following command
-to load dump-capture kernel.
-
-   kexec -p <dump-capture-kernel-bzImage> \
-   --initrd=<initrd-for-dump-capture-kernel> \
-   --append="root=<root-dev> <arch-specific-options>"
-
-If you are using a compressed zImage, then use following command
-to load dump-capture kernel.
-
-   kexec --type zImage -p <dump-capture-kernel-bzImage> \
-   --initrd=<initrd-for-dump-capture-kernel> \
-   --dtb=<dtb-for-dump-capture-kernel> \
-   --append="root=<root-dev> <arch-specific-options>"
-
-If you are using an uncompressed Image, then use following command
-to load dump-capture kernel.
-
-   kexec -p <dump-capture-kernel-Image> \
-   --initrd=<initrd-for-dump-capture-kernel> \
-   --append="root=<root-dev> <arch-specific-options>"
-
-Please note, that --args-linux does not need to be specified for ia64.
-It is planned to make this a no-op on that architecture, but for now
-it should be omitted
-
-Following are the arch specific command line options to be used while
-loading dump-capture kernel.
-
-For i386, x86_64 and ia64:
-	"1 irqpoll maxcpus=1 reset_devices"
-
-For ppc64:
-	"1 maxcpus=1 noirqdistrib reset_devices"
-
-For s390x:
-	"1 maxcpus=1 cgroup_disable=memory"
-
-For arm:
-	"1 maxcpus=1 reset_devices"
-
-For arm64:
-	"1 maxcpus=1 reset_devices"
-
-Notes on loading the dump-capture kernel:
-
-* By default, the ELF headers are stored in ELF64 format to support
-  systems with more than 4GB memory. On i386, kexec automatically checks if
-  the physical RAM size exceeds the 4 GB limit and if not, uses ELF32.
-  So, on non-PAE systems, ELF32 is always used.
-
-  The --elf32-core-headers option can be used to force the generation of ELF32
-  headers. This is necessary because GDB currently cannot open vmcore files
-  with ELF64 headers on 32-bit systems.
-
-* The "irqpoll" boot parameter reduces driver initialization failures
-  due to shared interrupts in the dump-capture kernel.
-
-* You must specify <root-dev> in the format corresponding to the root
-  device name in the output of mount command.
-
-* Boot parameter "1" boots the dump-capture kernel into single-user
-  mode without networking. If you want networking, use "3".
-
-* We generally don' have to bring up a SMP kernel just to capture the
-  dump. Hence generally it is useful either to build a UP dump-capture
-  kernel or specify maxcpus=1 option while loading dump-capture kernel.
-  Note, though maxcpus always works, you had better replace it with
-  nr_cpus to save memory if supported by the current ARCH, such as x86.
-
-* You should enable multi-cpu support in dump-capture kernel if you intend
-  to use multi-thread programs with it, such as parallel dump feature of
-  makedumpfile. Otherwise, the multi-thread program may have a great
-  performance degradation. To enable multi-cpu support, you should bring up an
-  SMP dump-capture kernel and specify maxcpus/nr_cpus, disable_cpu_apicid=[X]
-  options while loading it.
-
-* For s390x there are two kdump modes: If a ELF header is specified with
-  the elfcorehdr= kernel parameter, it is used by the kdump kernel as it
-  is done on all other architectures. If no elfcorehdr= kernel parameter is
-  specified, the s390x kdump kernel dynamically creates the header. The
-  second mode has the advantage that for CPU and memory hotplug, kdump has
-  not to be reloaded with kexec_load().
-
-* For s390x systems with many attached devices the "cio_ignore" kernel
-  parameter should be used for the kdump kernel in order to prevent allocation
-  of kernel memory for devices that are not relevant for kdump. The same
-  applies to systems that use SCSI/FCP devices. In that case the
-  "allow_lun_scan" zfcp module parameter should be set to zero before
-  setting FCP devices online.
-
-Kernel Panic
-============
-
-After successfully loading the dump-capture kernel as previously
-described, the system will reboot into the dump-capture kernel if a
-system crash is triggered.  Trigger points are located in panic(),
-die(), die_nmi() and in the sysrq handler (ALT-SysRq-c).
-
-The following conditions will execute a crash trigger point:
-
-If a hard lockup is detected and "NMI watchdog" is configured, the system
-will boot into the dump-capture kernel ( die_nmi() ).
-
-If die() is called, and it happens to be a thread with pid 0 or 1, or die()
-is called inside interrupt context or die() is called and panic_on_oops is set,
-the system will boot into the dump-capture kernel.
-
-On powerpc systems when a soft-reset is generated, die() is called by all cpus
-and the system will boot into the dump-capture kernel.
-
-For testing purposes, you can trigger a crash by using "ALT-SysRq-c",
-"echo c > /proc/sysrq-trigger" or write a module to force the panic.
-
-Write Out the Dump File
-=======================
-
-After the dump-capture kernel is booted, write out the dump file with
-the following command:
-
-   cp /proc/vmcore <dump-file>
-
-
-Analysis
-========
-
-Before analyzing the dump image, you should reboot into a stable kernel.
-
-You can do limited analysis using GDB on the dump file copied out of
-/proc/vmcore. Use the debug vmlinux built with -g and run the following
-command:
-
-   gdb vmlinux <dump-file>
-
-Stack trace for the task on processor 0, register display, and memory
-display work fine.
-
-Note: GDB cannot analyze core files generated in ELF64 format for x86.
-On systems with a maximum of 4GB of memory, you can generate
-ELF32-format headers using the --elf32-core-headers kernel option on the
-dump kernel.
-
-You can also use the Crash utility to analyze dump files in Kdump
-format. Crash is available on Dave Anderson's site at the following URL:
-
-   http://people.redhat.com/~anderson/
-
-Trigger Kdump on WARN()
-=======================
-
-The kernel parameter, panic_on_warn, calls panic() in all WARN() paths.  This
-will cause a kdump to occur at the panic() call.  In cases where a user wants
-to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1
-to achieve the same behaviour.
-
-Contact
-=======
-
-Vivek Goyal (vgoyal@redhat.com)
-Maneesh Soni (maneesh@in.ibm.com)
-
diff --git a/Documentation/kdump/vmcoreinfo.txt b/Documentation/kdump/vmcoreinfo.txt
deleted file mode 100644
index bb94a4bd597a..000000000000
--- a/Documentation/kdump/vmcoreinfo.txt
+++ /dev/null
@@ -1,495 +0,0 @@
-================================================================
-			VMCOREINFO
-================================================================
-
-===========
-What is it?
-===========
-
-VMCOREINFO is a special ELF note section. It contains various
-information from the kernel like structure size, page size, symbol
-values, field offsets, etc. These data are packed into an ELF note
-section and used by user-space tools like crash and makedumpfile to
-analyze a kernel's memory layout.
-
-================
-Common variables
-================
-
-init_uts_ns.name.release
-------------------------
-
-The version of the Linux kernel. Used to find the corresponding source
-code from which the kernel has been built. For example, crash uses it to
-find the corresponding vmlinux in order to process vmcore.
-
-PAGE_SIZE
----------
-
-The size of a page. It is the smallest unit of data used by the memory
-management facilities. It is usually 4096 bytes of size and a page is
-aligned on 4096 bytes. Used for computing page addresses.
-
-init_uts_ns
------------
-
-The UTS namespace which is used to isolate two specific elements of the
-system that relate to the uname(2) system call. It is named after the
-data structure used to store information returned by the uname(2) system
-call.
-
-User-space tools can get the kernel name, host name, kernel release
-number, kernel version, architecture name and OS type from it.
-
-node_online_map
----------------
-
-An array node_states[N_ONLINE] which represents the set of online nodes
-in a system, one bit position per node number. Used to keep track of
-which nodes are in the system and online.
-
-swapper_pg_dir
--------------
-
-The global page directory pointer of the kernel. Used to translate
-virtual to physical addresses.
-
-_stext
-------
-
-Defines the beginning of the text section. In general, _stext indicates
-the kernel start address. Used to convert a virtual address from the
-direct kernel map to a physical address.
-
-vmap_area_list
---------------
-
-Stores the virtual area list. makedumpfile gets the vmalloc start value
-from this variable and its value is necessary for vmalloc translation.
-
-mem_map
--------
-
-Physical addresses are translated to struct pages by treating them as
-an index into the mem_map array. Right-shifting a physical address
-PAGE_SHIFT bits converts it into a page frame number which is an index
-into that mem_map array.
-
-Used to map an address to the corresponding struct page.
-
-contig_page_data
-----------------
-
-Makedumpfile gets the pglist_data structure from this symbol, which is
-used to describe the memory layout.
-
-User-space tools use this to exclude free pages when dumping memory.
-
-mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
---------------------------------------------------------------------------
-
-The address of the mem_section array, its length, structure size, and
-the section_mem_map offset.
-
-It exists in the sparse memory mapping model, and it is also somewhat
-similar to the mem_map variable, both of them are used to translate an
-address.
-
-page
-----
-
-The size of a page structure. struct page is an important data structure
-and it is widely used to compute contiguous memory.
-
-pglist_data
------------
-
-The size of a pglist_data structure. This value is used to check if the
-pglist_data structure is valid. It is also used for checking the memory
-type.
-
-zone
-----
-
-The size of a zone structure. This value is used to check if the zone
-structure has been found. It is also used for excluding free pages.
-
-free_area
----------
-
-The size of a free_area structure. It indicates whether the free_area
-structure is valid or not. Useful when excluding free pages.
-
-list_head
----------
-
-The size of a list_head structure. Used when iterating lists in a
-post-mortem analysis session.
-
-nodemask_t
-----------
-
-The size of a nodemask_t type. Used to compute the number of online
-nodes.
-
-(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|
-       compound_order|compound_head)
--------------------------------------------------------------------
-
-User-space tools compute their values based on the offset of these
-variables. The variables are used when excluding unnecessary pages.
-
-(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_
-              spanned_pages|node_id)
--------------------------------------------------------------------
-
-On NUMA machines, each NUMA node has a pg_data_t to describe its memory
-layout. On UMA machines there is a single pglist_data which describes the
-whole memory.
-
-These values are used to check the memory type and to compute the
-virtual address for memory map.
-
-(zone, free_area|vm_stat|spanned_pages)
----------------------------------------
-
-Each node is divided into a number of blocks called zones which
-represent ranges within memory. A zone is described by a structure zone.
-
-User-space tools compute required values based on the offset of these
-variables.
-
-(free_area, free_list)
-----------------------
-
-Offset of the free_list's member. This value is used to compute the number
-of free pages.
-
-Each zone has a free_area structure array called free_area[MAX_ORDER].
-The free_list represents a linked list of free page blocks.
-
-(list_head, next|prev)
-----------------------
-
-Offsets of the list_head's members. list_head is used to define a
-circular linked list. User-space tools need these in order to traverse
-lists.
-
-(vmap_area, va_start|list)
---------------------------
-
-Offsets of the vmap_area's members. They carry vmalloc-specific
-information. Makedumpfile gets the start address of the vmalloc region
-from this.
-
-(zone.free_area, MAX_ORDER)
----------------------------
-
-Free areas descriptor. User-space tools use this value to iterate the
-free_area ranges. MAX_ORDER is used by the zone buddy allocator.
-
-log_first_idx
--------------
-
-Index of the first record stored in the buffer log_buf. Used by
-user-space tools to read the strings in the log_buf.
-
-log_buf
--------
-
-Console output is written to the ring buffer log_buf at index
-log_first_idx. Used to get the kernel log.
-
-log_buf_len
------------
-
-log_buf's length.
-
-clear_idx
----------
-
-The index that the next printk() record to read after the last clear
-command. It indicates the first record after the last SYSLOG_ACTION
-_CLEAR, like issued by 'dmesg -c'. Used by user-space tools to dump
-the dmesg log.
-
-log_next_idx
-------------
-
-The index of the next record to store in the buffer log_buf. Used to
-compute the index of the current buffer position.
-
-printk_log
-----------
-
-The size of a structure printk_log. Used to compute the size of
-messages, and extract dmesg log. It encapsulates header information for
-log_buf, such as timestamp, syslog level, etc.
-
-(printk_log, ts_nsec|len|text_len|dict_len)
--------------------------------------------
-
-It represents field offsets in struct printk_log. User space tools
-parse it and check whether the values of printk_log's members have been
-changed.
-
-(free_area.free_list, MIGRATE_TYPES)
-------------------------------------
-
-The number of migrate types for pages. The free_list is described by the
-array. Used by tools to compute the number of free pages.
-
-NR_FREE_PAGES
--------------
-
-On linux-2.6.21 or later, the number of free pages is in
-vm_stat[NR_FREE_PAGES]. Used to get the number of free pages.
-
-PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision
-|PG_head_mask|PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)
-|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline)
------------------------------------------------------------------
-
-Page attributes. These flags are used to filter various unnecessary for
-dumping pages.
-
-HUGETLB_PAGE_DTOR
------------------
-
-The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile
-excludes these pages.
-
-======
-x86_64
-======
-
-phys_base
----------
-
-Used to convert the virtual address of an exported kernel symbol to its
-corresponding physical address.
-
-init_top_pgt
-------------
-
-Used to walk through the whole page table and convert virtual addresses
-to physical addresses. The init_top_pgt is somewhat similar to
-swapper_pg_dir, but it is only used in x86_64.
-
-pgtable_l5_enabled
-------------------
-
-User-space tools need to know whether the crash kernel was in 5-level
-paging mode.
-
-node_data
----------
-
-This is a struct pglist_data array and stores all NUMA nodes
-information. Makedumpfile gets the pglist_data structure from it.
-
-(node_data, MAX_NUMNODES)
--------------------------
-
-The maximum number of nodes in system.
-
-KERNELOFFSET
-------------
-
-The kernel randomization offset. Used to compute the page offset. If
-KASLR is disabled, this value is zero.
-
-KERNEL_IMAGE_SIZE
------------------
-
-Currently unused by Makedumpfile. Used to compute the module virtual
-address by Crash.
-
-sme_mask
---------
-
-AMD-specific with SME support: it indicates the secure memory encryption
-mask. Makedumpfile tools need to know whether the crash kernel was
-encrypted. If SME is enabled in the first kernel, the crash kernel's
-page table entries (pgd/pud/pmd/pte) contain the memory encryption
-mask. This is used to remove the SME mask and obtain the true physical
-address.
-
-Currently, sme_mask stores the value of the C-bit position. If needed,
-additional SME-relevant info can be placed in that variable.
-
-For example:
-[ misc	        ][ enc bit  ][ other misc SME info       ]
-0000_0000_0000_0000_1000_0000_0000_0000_0000_0000_..._0000
-63   59   55   51   47   43   39   35   31   27   ... 3
-
-======
-x86_32
-======
-
-X86_PAE
--------
-
-Denotes whether physical address extensions are enabled. It has the cost
-of a higher page table lookup overhead, and also consumes more page
-table space per process. Used to check whether PAE was enabled in the
-crash kernel when converting virtual addresses to physical addresses.
-
-====
-ia64
-====
-
-pgdat_list|(pgdat_list, MAX_NUMNODES)
--------------------------------------
-
-pg_data_t array storing all NUMA nodes information. MAX_NUMNODES
-indicates the number of the nodes.
-
-node_memblk|(node_memblk, NR_NODE_MEMBLKS)
-------------------------------------------
-
-List of node memory chunks. Filled when parsing the SRAT table to obtain
-information about memory nodes. NR_NODE_MEMBLKS indicates the number of
-node memory chunks.
-
-These values are used to compute the number of nodes the crashed kernel used.
-
-node_memblk_s|(node_memblk_s, start_paddr)|(node_memblk_s, size)
-----------------------------------------------------------------
-
-The size of a struct node_memblk_s and the offsets of the
-node_memblk_s's members. Used to compute the number of nodes.
-
-PGTABLE_3|PGTABLE_4
--------------------
-
-User-space tools need to know whether the crash kernel was in 3-level or
-4-level paging mode. Used to distinguish the page table.
-
-=====
-ARM64
-=====
-
-VA_BITS
--------
-
-The maximum number of bits for virtual addresses. Used to compute the
-virtual memory ranges.
-
-kimage_voffset
---------------
-
-The offset between the kernel virtual and physical mappings. Used to
-translate virtual to physical addresses.
-
-PHYS_OFFSET
------------
-
-Indicates the physical address of the start of memory. Similar to
-kimage_voffset, which is used to translate virtual to physical
-addresses.
-
-KERNELOFFSET
-------------
-
-The kernel randomization offset. Used to compute the page offset. If
-KASLR is disabled, this value is zero.
-
-====
-arm
-====
-
-ARM_LPAE
---------
-
-It indicates whether the crash kernel supports large physical address
-extensions. Used to translate virtual to physical addresses.
-
-====
-s390
-====
-
-lowcore_ptr
-----------
-
-An array with a pointer to the lowcore of every CPU. Used to print the
-psw and all registers information.
-
-high_memory
------------
-
-Used to get the vmalloc_start address from the high_memory symbol.
-
-(lowcore_ptr, NR_CPUS)
-----------------------
-
-The maximum number of CPUs.
-
-=======
-powerpc
-=======
-
-
-node_data|(node_data, MAX_NUMNODES)
------------------------------------
-
-See above.
-
-contig_page_data
-----------------
-
-See above.
-
-vmemmap_list
-------------
-
-The vmemmap_list maintains the entire vmemmap physical mapping. Used
-to get vmemmap list count and populated vmemmap regions info. If the
-vmemmap address translation information is stored in the crash kernel,
-it is used to translate vmemmap kernel virtual addresses.
-
-mmu_vmemmap_psize
------------------
-
-The size of a page. Used to translate virtual to physical addresses.
-
-mmu_psize_defs
---------------
-
-Page size definitions, i.e. 4k, 64k, or 16M.
-
-Used to make vtop translations.
-
-vmemmap_backing|(vmemmap_backing, list)|(vmemmap_backing, phys)|
-(vmemmap_backing, virt_addr)
-----------------------------------------------------------------
-
-The vmemmap virtual address space management does not have a traditional
-page table to track which virtual struct pages are backed by a physical
-mapping. The virtual to physical mappings are tracked in a simple linked
-list format.
-
-User-space tools need to know the offset of list, phys and virt_addr
-when computing the count of vmemmap regions.
-
-mmu_psize_def|(mmu_psize_def, shift)
-------------------------------------
-
-The size of a struct mmu_psize_def and the offset of mmu_psize_def's
-member.
-
-Used in vtop translations.
-
-==
-sh
-==
-
-node_data|(node_data, MAX_NUMNODES)
------------------------------------
-
-See above.
-
-X2TLB
------
-
-Indicates whether the crashed kernel enabled SH extended mode.
diff --git a/Documentation/kernel-hacking/conf.py b/Documentation/kernel-hacking/conf.py
deleted file mode 100644
index 3d8acf0f33ad..000000000000
--- a/Documentation/kernel-hacking/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = "Kernel Hacking Guides"
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'kernel-hacking.tex', project,
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/kernel-hacking/hacking.rst b/Documentation/kernel-hacking/hacking.rst
index d824e4feaff3..5891a701a159 100644
--- a/Documentation/kernel-hacking/hacking.rst
+++ b/Documentation/kernel-hacking/hacking.rst
@@ -718,7 +718,7 @@ make a neat patch, there's administrative work to be done:
 -  Usually you want a configuration option for your kernel hack. Edit
    ``Kconfig`` in the appropriate directory. The Config language is
    simple to use by cut and paste, and there's complete documentation in
-   ``Documentation/kbuild/kconfig-language.txt``.
+   ``Documentation/kbuild/kconfig-language.rst``.
 
    In your description of the option, make sure you address both the
    expert user and the user who knows nothing about your feature.
@@ -728,7 +728,7 @@ make a neat patch, there's administrative work to be done:
 
 -  Edit the ``Makefile``: the CONFIG variables are exported here so you
    can usually just add a "obj-$(CONFIG_xxx) += xxx.o" line. The syntax
-   is documented in ``Documentation/kbuild/makefiles.txt``.
+   is documented in ``Documentation/kbuild/makefiles.rst``.
 
 -  Put yourself in ``CREDITS`` if you've done something noteworthy,
    usually beyond a single file (your name should be at the top of the
diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst
index 519673df0e82..a8518ac0d31d 100644
--- a/Documentation/kernel-hacking/locking.rst
+++ b/Documentation/kernel-hacking/locking.rst
@@ -451,7 +451,7 @@ to protect the cache and all the objects within it. Here's the code::
             if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
                     return -ENOMEM;
 
-            strlcpy(obj->name, name, sizeof(obj->name));
+            strscpy(obj->name, name, sizeof(obj->name));
             obj->id = id;
             obj->popularity = 0;
 
@@ -660,7 +660,7 @@ Here is the code::
      }
 
     @@ -63,6 +94,7 @@
-             strlcpy(obj->name, name, sizeof(obj->name));
+             strscpy(obj->name, name, sizeof(obj->name));
              obj->id = id;
              obj->popularity = 0;
     +        obj->refcnt = 1; /* The cache holds a reference */
@@ -774,7 +774,7 @@ the lock is no longer used to protect the reference count itself.
      }
 
     @@ -94,7 +76,7 @@
-             strlcpy(obj->name, name, sizeof(obj->name));
+             strscpy(obj->name, name, sizeof(obj->name));
              obj->id = id;
              obj->popularity = 0;
     -        obj->refcnt = 1; /* The cache holds a reference */
@@ -1364,7 +1364,7 @@ Futex API reference
 Further reading
 ===============
 
--  ``Documentation/locking/spinlocks.txt``: Linus Torvalds' spinlocking
+-  ``Documentation/locking/spinlocks.rst``: Linus Torvalds' spinlocking
    tutorial in the kernel sources.
 
 -  Unix Systems for Modern Architectures: Symmetric Multiprocessing and
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
deleted file mode 100644
index 23b0c8b20cd1..000000000000
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ /dev/null
@@ -1,356 +0,0 @@
-==========================================
-Reducing OS jitter due to per-cpu kthreads
-==========================================
-
-This document lists per-CPU kthreads in the Linux kernel and presents
-options to control their OS jitter.  Note that non-per-CPU kthreads are
-not listed here.  To reduce OS jitter from non-per-CPU kthreads, bind
-them to a "housekeeping" CPU dedicated to such work.
-
-References
-==========
-
--	Documentation/IRQ-affinity.txt:  Binding interrupts to sets of CPUs.
-
--	Documentation/cgroup-v1:  Using cgroups to bind tasks to sets of CPUs.
-
--	man taskset:  Using the taskset command to bind tasks to sets
-	of CPUs.
-
--	man sched_setaffinity:  Using the sched_setaffinity() system
-	call to bind tasks to sets of CPUs.
-
--	/sys/devices/system/cpu/cpuN/online:  Control CPU N's hotplug state,
-	writing "0" to offline and "1" to online.
-
--	In order to locate kernel-generated OS jitter on CPU N:
-
-		cd /sys/kernel/debug/tracing
-		echo 1 > max_graph_depth # Increase the "1" for more detail
-		echo function_graph > current_tracer
-		# run workload
-		cat per_cpu/cpuN/trace
-
-kthreads
-========
-
-Name:
-  ehca_comp/%u
-
-Purpose:
-  Periodically process Infiniband-related work.
-
-To reduce its OS jitter, do any of the following:
-
-1.	Don't use eHCA Infiniband hardware, instead choosing hardware
-	that does not require per-CPU kthreads.  This will prevent these
-	kthreads from being created in the first place.  (This will
-	work for most people, as this hardware, though important, is
-	relatively old and is produced in relatively low unit volumes.)
-2.	Do all eHCA-Infiniband-related work on other CPUs, including
-	interrupts.
-3.	Rework the eHCA driver so that its per-CPU kthreads are
-	provisioned only on selected CPUs.
-
-
-Name:
-  irq/%d-%s
-
-Purpose:
-  Handle threaded interrupts.
-
-To reduce its OS jitter, do the following:
-
-1.	Use irq affinity to force the irq threads to execute on
-	some other CPU.
-
-Name:
-  kcmtpd_ctr_%d
-
-Purpose:
-  Handle Bluetooth work.
-
-To reduce its OS jitter, do one of the following:
-
-1.	Don't use Bluetooth, in which case these kthreads won't be
-	created in the first place.
-2.	Use irq affinity to force Bluetooth-related interrupts to
-	occur on some other CPU and furthermore initiate all
-	Bluetooth activity on some other CPU.
-
-Name:
-  ksoftirqd/%u
-
-Purpose:
-  Execute softirq handlers when threaded or when under heavy load.
-
-To reduce its OS jitter, each softirq vector must be handled
-separately as follows:
-
-TIMER_SOFTIRQ
--------------
-
-Do all of the following:
-
-1.	To the extent possible, keep the CPU out of the kernel when it
-	is non-idle, for example, by avoiding system calls and by forcing
-	both kernel threads and interrupts to execute elsewhere.
-2.	Build with CONFIG_HOTPLUG_CPU=y.  After boot completes, force
-	the CPU offline, then bring it back online.  This forces
-	recurring timers to migrate elsewhere.	If you are concerned
-	with multiple CPUs, force them all offline before bringing the
-	first one back online.  Once you have onlined the CPUs in question,
-	do not offline any other CPUs, because doing so could force the
-	timer back onto one of the CPUs in question.
-
-NET_TX_SOFTIRQ and NET_RX_SOFTIRQ
----------------------------------
-
-Do all of the following:
-
-1.	Force networking interrupts onto other CPUs.
-2.	Initiate any network I/O on other CPUs.
-3.	Once your application has started, prevent CPU-hotplug operations
-	from being initiated from tasks that might run on the CPU to
-	be de-jittered.  (It is OK to force this CPU offline and then
-	bring it back online before you start your application.)
-
-BLOCK_SOFTIRQ
--------------
-
-Do all of the following:
-
-1.	Force block-device interrupts onto some other CPU.
-2.	Initiate any block I/O on other CPUs.
-3.	Once your application has started, prevent CPU-hotplug operations
-	from being initiated from tasks that might run on the CPU to
-	be de-jittered.  (It is OK to force this CPU offline and then
-	bring it back online before you start your application.)
-
-IRQ_POLL_SOFTIRQ
-----------------
-
-Do all of the following:
-
-1.	Force block-device interrupts onto some other CPU.
-2.	Initiate any block I/O and block-I/O polling on other CPUs.
-3.	Once your application has started, prevent CPU-hotplug operations
-	from being initiated from tasks that might run on the CPU to
-	be de-jittered.  (It is OK to force this CPU offline and then
-	bring it back online before you start your application.)
-
-TASKLET_SOFTIRQ
----------------
-
-Do one or more of the following:
-
-1.	Avoid use of drivers that use tasklets.  (Such drivers will contain
-	calls to things like tasklet_schedule().)
-2.	Convert all drivers that you must use from tasklets to workqueues.
-3.	Force interrupts for drivers using tasklets onto other CPUs,
-	and also do I/O involving these drivers on other CPUs.
-
-SCHED_SOFTIRQ
--------------
-
-Do all of the following:
-
-1.	Avoid sending scheduler IPIs to the CPU to be de-jittered,
-	for example, ensure that at most one runnable kthread is present
-	on that CPU.  If a thread that expects to run on the de-jittered
-	CPU awakens, the scheduler will send an IPI that can result in
-	a subsequent SCHED_SOFTIRQ.
-2.	CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered
-	is marked as an adaptive-ticks CPU using the "nohz_full="
-	boot parameter.  This reduces the number of scheduler-clock
-	interrupts that the de-jittered CPU receives, minimizing its
-	chances of being selected to do the load balancing work that
-	runs in SCHED_SOFTIRQ context.
-3.	To the extent possible, keep the CPU out of the kernel when it
-	is non-idle, for example, by avoiding system calls and by
-	forcing both kernel threads and interrupts to execute elsewhere.
-	This further reduces the number of scheduler-clock interrupts
-	received by the de-jittered CPU.
-
-HRTIMER_SOFTIRQ
----------------
-
-Do all of the following:
-
-1.	To the extent possible, keep the CPU out of the kernel when it
-	is non-idle.  For example, avoid system calls and force both
-	kernel threads and interrupts to execute elsewhere.
-2.	Build with CONFIG_HOTPLUG_CPU=y.  Once boot completes, force the
-	CPU offline, then bring it back online.  This forces recurring
-	timers to migrate elsewhere.  If you are concerned with multiple
-	CPUs, force them all offline before bringing the first one
-	back online.  Once you have onlined the CPUs in question, do not
-	offline any other CPUs, because doing so could force the timer
-	back onto one of the CPUs in question.
-
-RCU_SOFTIRQ
------------
-
-Do at least one of the following:
-
-1.	Offload callbacks and keep the CPU in either dyntick-idle or
-	adaptive-ticks state by doing all of the following:
-
-	a.	CONFIG_NO_HZ_FULL=y and ensure that the CPU to be
-		de-jittered is marked as an adaptive-ticks CPU using the
-		"nohz_full=" boot parameter.  Bind the rcuo kthreads to
-		housekeeping CPUs, which can tolerate OS jitter.
-	b.	To the extent possible, keep the CPU out of the kernel
-		when it is non-idle, for example, by avoiding system
-		calls and by forcing both kernel threads and interrupts
-		to execute elsewhere.
-
-2.	Enable RCU to do its processing remotely via dyntick-idle by
-	doing all of the following:
-
-	a.	Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
-	b.	Ensure that the CPU goes idle frequently, allowing other
-		CPUs to detect that it has passed through an RCU quiescent
-		state.	If the kernel is built with CONFIG_NO_HZ_FULL=y,
-		userspace execution also allows other CPUs to detect that
-		the CPU in question has passed through a quiescent state.
-	c.	To the extent possible, keep the CPU out of the kernel
-		when it is non-idle, for example, by avoiding system
-		calls and by forcing both kernel threads and interrupts
-		to execute elsewhere.
-
-Name:
-  kworker/%u:%d%s (cpu, id, priority)
-
-Purpose:
-  Execute workqueue requests
-
-To reduce its OS jitter, do any of the following:
-
-1.	Run your workload at a real-time priority, which will allow
-	preempting the kworker daemons.
-2.	A given workqueue can be made visible in the sysfs filesystem
-	by passing the WQ_SYSFS to that workqueue's alloc_workqueue().
-	Such a workqueue can be confined to a given subset of the
-	CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs
-	files.	The set of WQ_SYSFS workqueues can be displayed using
-	"ls sys/devices/virtual/workqueue".  That said, the workqueues
-	maintainer would like to caution people against indiscriminately
-	sprinkling WQ_SYSFS across all the workqueues.	The reason for
-	caution is that it is easy to add WQ_SYSFS, but because sysfs is
-	part of the formal user/kernel API, it can be nearly impossible
-	to remove it, even if its addition was a mistake.
-3.	Do any of the following needed to avoid jitter that your
-	application cannot tolerate:
-
-	a.	Build your kernel with CONFIG_SLUB=y rather than
-		CONFIG_SLAB=y, thus avoiding the slab allocator's periodic
-		use of each CPU's workqueues to run its cache_reap()
-		function.
-	b.	Avoid using oprofile, thus avoiding OS jitter from
-		wq_sync_buffer().
-	c.	Limit your CPU frequency so that a CPU-frequency
-		governor is not required, possibly enlisting the aid of
-		special heatsinks or other cooling technologies.  If done
-		correctly, and if you CPU architecture permits, you should
-		be able to build your kernel with CONFIG_CPU_FREQ=n to
-		avoid the CPU-frequency governor periodically running
-		on each CPU, including cs_dbs_timer() and od_dbs_timer().
-
-		WARNING:  Please check your CPU specifications to
-		make sure that this is safe on your particular system.
-	d.	As of v3.18, Christoph Lameter's on-demand vmstat workers
-		commit prevents OS jitter due to vmstat_update() on
-		CONFIG_SMP=y systems.  Before v3.18, is not possible
-		to entirely get rid of the OS jitter, but you can
-		decrease its frequency by writing a large value to
-		/proc/sys/vm/stat_interval.  The default value is HZ,
-		for an interval of one second.	Of course, larger values
-		will make your virtual-memory statistics update more
-		slowly.  Of course, you can also run your workload at
-		a real-time priority, thus preempting vmstat_update(),
-		but if your workload is CPU-bound, this is a bad idea.
-		However, there is an RFC patch from Christoph Lameter
-		(based on an earlier one from Gilad Ben-Yossef) that
-		reduces or even eliminates vmstat overhead for some
-		workloads at https://lkml.org/lkml/2013/9/4/379.
-	e.	Boot with "elevator=noop" to avoid workqueue use by
-		the block layer.
-	f.	If running on high-end powerpc servers, build with
-		CONFIG_PPC_RTAS_DAEMON=n.  This prevents the RTAS
-		daemon from running on each CPU every second or so.
-		(This will require editing Kconfig files and will defeat
-		this platform's RAS functionality.)  This avoids jitter
-		due to the rtas_event_scan() function.
-		WARNING:  Please check your CPU specifications to
-		make sure that this is safe on your particular system.
-	g.	If running on Cell Processor, build your kernel with
-		CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from
-		spu_gov_work().
-		WARNING:  Please check your CPU specifications to
-		make sure that this is safe on your particular system.
-	h.	If running on PowerMAC, build your kernel with
-		CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
-		avoiding OS jitter from rackmeter_do_timer().
-
-Name:
-  rcuc/%u
-
-Purpose:
-  Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
-
-To reduce its OS jitter, do at least one of the following:
-
-1.	Build the kernel with CONFIG_PREEMPT=n.  This prevents these
-	kthreads from being created in the first place, and also obviates
-	the need for RCU priority boosting.  This approach is feasible
-	for workloads that do not require high degrees of responsiveness.
-2.	Build the kernel with CONFIG_RCU_BOOST=n.  This prevents these
-	kthreads from being created in the first place.  This approach
-	is feasible only if your workload never requires RCU priority
-	boosting, for example, if you ensure frequent idle time on all
-	CPUs that might execute within the kernel.
-3.	Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs=
-	boot parameter offloading RCU callbacks from all CPUs susceptible
-	to OS jitter.  This approach prevents the rcuc/%u kthreads from
-	having any work to do, so that they are never awakened.
-4.	Ensure that the CPU never enters the kernel, and, in particular,
-	avoid initiating any CPU hotplug operations on this CPU.  This is
-	another way of preventing any callbacks from being queued on the
-	CPU, again preventing the rcuc/%u kthreads from having any work
-	to do.
-
-Name:
-  rcuop/%d and rcuos/%d
-
-Purpose:
-  Offload RCU callbacks from the corresponding CPU.
-
-To reduce its OS jitter, do at least one of the following:
-
-1.	Use affinity, cgroups, or other mechanism to force these kthreads
-	to execute on some other CPU.
-2.	Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these
-	kthreads from being created in the first place.  However, please
-	note that this will not eliminate OS jitter, but will instead
-	shift it to RCU_SOFTIRQ.
-
-Name:
-  watchdog/%u
-
-Purpose:
-  Detect software lockups on each CPU.
-
-To reduce its OS jitter, do at least one of the following:
-
-1.	Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
-	kthreads from being created in the first place.
-2.	Boot with "nosoftlockup=0", which will also prevent these kthreads
-	from being created.  Other related watchdog and softlockup boot
-	parameters may be found in Documentation/admin-guide/kernel-parameters.rst
-	and Documentation/watchdog/watchdog-parameters.txt.
-3.	Echo a zero to /proc/sys/kernel/watchdog to disable the
-	watchdog timer.
-4.	Echo a large number of /proc/sys/kernel/watchdog_thresh in
-	order to reduce the frequency of OS jitter due to the watchdog
-	timer down to a level that is acceptable for your workload.
diff --git a/Documentation/laptops/asus-laptop.txt b/Documentation/laptops/asus-laptop.txt
deleted file mode 100644
index 5f2858712aa0..000000000000
--- a/Documentation/laptops/asus-laptop.txt
+++ /dev/null
@@ -1,257 +0,0 @@
-Asus Laptop Extras
-
-Version 0.1
-August 6, 2009
-
-Corentin Chary <corentincj@iksaif.net>
-http://acpi4asus.sf.net/
-
- This driver provides support for extra features of ACPI-compatible ASUS laptops.
- It may also support some MEDION, JVC or VICTOR laptops (such as MEDION 9675 or
- VICTOR XP7210 for example). It makes all the extra buttons generate input
- events (like keyboards).
- On some models adds support for changing the display brightness and output,
- switching the LCD backlight on and off, and most importantly, allows you to
- blink those fancy LEDs intended for reporting mail and wireless status.
-
-This driver supercedes the old asus_acpi driver.
-
-Requirements
-------------
-
-  Kernel 2.6.X sources, configured for your computer, with ACPI support.
-  You also need CONFIG_INPUT and CONFIG_ACPI.
-
-Status
-------
-
- The features currently supported are the following (see below for
- detailed description):
-
- - Fn key combinations
- - Bluetooth enable and disable
- - Wlan enable and disable
- - GPS enable and disable
- - Video output switching
- - Ambient Light Sensor on and off
- - LED control
- - LED Display control
- - LCD brightness control
- - LCD on and off
-
- A compatibility table by model and feature is maintained on the web
- site, http://acpi4asus.sf.net/.
-
-Usage
------
-
-  Try "modprobe asus-laptop". Check your dmesg (simply type dmesg). You should
-  see some lines like this :
-
-      Asus Laptop Extras version 0.42
-        L2D model detected.
-
-  If it is not the output you have on your laptop, send it (and the laptop's
-  DSDT) to me.
-
-  That's all, now, all the events generated by the hotkeys of your laptop
-  should be reported via netlink events. You can check with
-  "acpi_genl monitor" (part of the acpica project).
-
-  Hotkeys are also reported as input keys (like keyboards) you can check
-  which key are supported using "xev" under X11.
-
-  You can get information on the version of your DSDT table by reading the
-  /sys/devices/platform/asus-laptop/infos entry. If you have a question or a
-  bug report to do, please include the output of this entry.
-
-LEDs
-----
-
-  You can modify LEDs be echoing values to /sys/class/leds/asus::*/brightness :
-    echo 1 >  /sys/class/leds/asus::mail/brightness
-  will switch the mail LED on.
-  You can also know if they are on/off by reading their content and use
-  kernel triggers like disk-activity or heartbeat.
-
-Backlight
----------
-
-  You can control lcd backlight power and brightness with
-  /sys/class/backlight/asus-laptop/. Brightness Values are between 0 and 15.
-
-Wireless devices
----------------
-
-  You can turn the internal Bluetooth adapter on/off with the bluetooth entry
-  (only on models with Bluetooth). This usually controls the associated LED.
-  Same for Wlan adapter.
-
-Display switching
------------------
-
-  Note: the display switching code is currently considered EXPERIMENTAL.
-
-  Switching works for the following models:
-    L3800C
-    A2500H
-    L5800C
-    M5200N
-    W1000N (albeit with some glitches)
-    M6700R
-    A6JC
-    F3J
-
-  Switching doesn't work for the following:
-    M3700N
-    L2X00D (locks the laptop under certain conditions)
-
-  To switch the displays, echo values from 0 to 15 to
-  /sys/devices/platform/asus-laptop/display. The significance of those values
-  is as follows:
-
-  +-------+-----+-----+-----+-----+-----+
-  | Bin   | Val | DVI | TV  | CRT | LCD |
-  +-------+-----+-----+-----+-----+-----+
-  + 0000  +   0 +     +     +     +     +
-  +-------+-----+-----+-----+-----+-----+
-  + 0001  +   1 +     +     +     +  X  +
-  +-------+-----+-----+-----+-----+-----+
-  + 0010  +   2 +     +     +  X  +     +
-  +-------+-----+-----+-----+-----+-----+
-  + 0011  +   3 +     +     +  X  +  X  +
-  +-------+-----+-----+-----+-----+-----+
-  + 0100  +   4 +     +  X  +     +     +
-  +-------+-----+-----+-----+-----+-----+
-  + 0101  +   5 +     +  X  +     + X   +
-  +-------+-----+-----+-----+-----+-----+
-  + 0110  +   6 +     +  X  +  X  +     +
-  +-------+-----+-----+-----+-----+-----+
-  + 0111  +   7 +     +  X  +  X  +  X  +
-  +-------+-----+-----+-----+-----+-----+
-  + 1000  +   8 +  X  +     +     +     +
-  +-------+-----+-----+-----+-----+-----+
-  + 1001  +   9 +  X  +     +     +  X  +
-  +-------+-----+-----+-----+-----+-----+
-  + 1010  +  10 +  X  +     +  X  +     +
-  +-------+-----+-----+-----+-----+-----+
-  + 1011  +  11 +  X  +     +  X  +  X  +
-  +-------+-----+-----+-----+-----+-----+
-  + 1100  +  12 +  X  +  X  +     +     +
-  +-------+-----+-----+-----+-----+-----+
-  + 1101  +  13 +  X  +  X  +     +  X  +
-  +-------+-----+-----+-----+-----+-----+
-  + 1110  +  14 +  X  +  X  +  X  +     +
-  +-------+-----+-----+-----+-----+-----+
-  + 1111  +  15 +  X  +  X  +  X  +  X  +
-  +-------+-----+-----+-----+-----+-----+
-
-  In most cases, the appropriate displays must be plugged in for the above
-  combinations to work. TV-Out may need to be initialized at boot time.
-
-  Debugging:
-  1) Check whether the Fn+F8 key:
-     a) does not lock the laptop (try a boot with noapic / nolapic if it does)
-     b) generates events (0x6n, where n is the value corresponding to the
-        configuration above)
-     c) actually works
-     Record the disp value at every configuration.
-  2) Echo values from 0 to 15 to /sys/devices/platform/asus-laptop/display.
-     Record its value, note any change. If nothing changes, try a broader range,
-     up to 65535.
-  3) Send ANY output (both positive and negative reports are needed, unless your
-     machine is already listed above) to the acpi4asus-user mailing list.
-
-  Note: on some machines (e.g. L3C), after the module has been loaded, only 0x6n
-  events are generated and no actual switching occurs. In such a case, a line
-  like:
-
-    echo $((10#$arg-60)) > /sys/devices/platform/asus-laptop/display
-
-  will usually do the trick ($arg is the 0000006n-like event passed to acpid).
-
-  Note: there is currently no reliable way to read display status on xxN
-  (Centrino) models.
-
-LED display
------------
-
-  Some models like the W1N have a LED display that can be used to display
-  several items of information.
-
-  LED display works for the following models:
-    W1000N
-    W1J
-
-  To control the LED display, use the following :
-
-    echo 0x0T000DDD > /sys/devices/platform/asus-laptop/
-
-  where T control the 3 letters display, and DDD the 3 digits display,
-  according to the tables below.
-
-         DDD (digits)
-         000 to 999 = display digits
-         AAA        = ---
-         BBB to FFF = turn-off
-
-         T  (type)
-         0 = off
-         1 = dvd
-         2 = vcd
-         3 = mp3
-         4 = cd
-         5 = tv
-         6 = cpu
-         7 = vol
-
-  For example "echo 0x01000001 >/sys/devices/platform/asus-laptop/ledd"
-  would display "DVD001".
-
-Driver options:
----------------
-
- Options can be passed to the asus-laptop driver using the standard
- module argument syntax (<param>=<value> when passing the option to the
- module or asus-laptop.<param>=<value> on the kernel boot line when
- asus-laptop is statically linked into the kernel).
-
-	     wapf: WAPF defines the behavior of the Fn+Fx wlan key
-		   The significance of values is yet to be found, but
-		   most of the time:
-		   - 0x0 should do nothing
-		   - 0x1 should allow to control the device with Fn+Fx key.
-		   - 0x4 should send an ACPI event (0x88) while pressing the Fn+Fx key
-		   - 0x5 like 0x1 or 0x4
-
- The default value is 0x1.
-
-Unsupported models
-------------------
-
- These models will never be supported by this module, as they use a completely
- different mechanism to handle LEDs and extra stuff (meaning we have no clue
- how it works):
-
- - ASUS A1300 (A1B), A1370D
- - ASUS L7300G
- - ASUS L8400
-
-Patches, Errors, Questions:
---------------------------
-
- I appreciate any success or failure
- reports, especially if they add to or correct the compatibility table.
- Please include the following information in your report:
-
- - Asus model name
- - a copy of your ACPI tables, using the "acpidump" utility
- - a copy of /sys/devices/platform/asus-laptop/infos
- - which driver features work and which don't
- - the observed behavior of non-working features
-
- Any other comments or patches are also more than welcome.
-
- acpi4asus-user@lists.sourceforge.net
- http://sourceforge.net/projects/acpi4asus
-
diff --git a/Documentation/laptops/disk-shock-protection.txt b/Documentation/laptops/disk-shock-protection.txt
deleted file mode 100644
index 0e6ba2663834..000000000000
--- a/Documentation/laptops/disk-shock-protection.txt
+++ /dev/null
@@ -1,149 +0,0 @@
-Hard disk shock protection
-==========================
-
-Author: Elias Oltmanns <eo@nebensachen.de>
-Last modified: 2008-10-03
-
-
-0. Contents
------------
-
-1. Intro
-2. The interface
-3. References
-4. CREDITS
-
-
-1. Intro
---------
-
-ATA/ATAPI-7 specifies the IDLE IMMEDIATE command with unload feature.
-Issuing this command should cause the drive to switch to idle mode and
-unload disk heads. This feature is being used in modern laptops in
-conjunction with accelerometers and appropriate software to implement
-a shock protection facility. The idea is to stop all I/O operations on
-the internal hard drive and park its heads on the ramp when critical
-situations are anticipated. The desire to have such a feature
-available on GNU/Linux systems has been the original motivation to
-implement a generic disk head parking interface in the Linux kernel.
-Please note, however, that other components have to be set up on your
-system in order to get disk shock protection working (see
-section 3. References below for pointers to more information about
-that).
-
-
-2. The interface
-----------------
-
-For each ATA device, the kernel exports the file
-block/*/device/unload_heads in sysfs (here assumed to be mounted under
-/sys). Access to /sys/block/*/device/unload_heads is denied with
--EOPNOTSUPP if the device does not support the unload feature.
-Otherwise, writing an integer value to this file will take the heads
-of the respective drive off the platter and block all I/O operations
-for the specified number of milliseconds. When the timeout expires and
-no further disk head park request has been issued in the meantime,
-normal operation will be resumed. The maximal value accepted for a
-timeout is 30000 milliseconds. Exceeding this limit will return
--EOVERFLOW, but heads will be parked anyway and the timeout will be
-set to 30 seconds. However, you can always change a timeout to any
-value between 0 and 30000 by issuing a subsequent head park request
-before the timeout of the previous one has expired. In particular, the
-total timeout can exceed 30 seconds and, more importantly, you can
-cancel a previously set timeout and resume normal operation
-immediately by specifying a timeout of 0. Values below -2 are rejected
-with -EINVAL (see below for the special meaning of -1 and -2). If the
-timeout specified for a recent head park request has not yet expired,
-reading from /sys/block/*/device/unload_heads will report the number
-of milliseconds remaining until normal operation will be resumed;
-otherwise, reading the unload_heads attribute will return 0.
-
-For example, do the following in order to park the heads of drive
-/dev/sda and stop all I/O operations for five seconds:
-
-# echo 5000 > /sys/block/sda/device/unload_heads
-
-A simple
-
-# cat /sys/block/sda/device/unload_heads
-
-will show you how many milliseconds are left before normal operation
-will be resumed.
-
-A word of caution: The fact that the interface operates on a basis of
-milliseconds may raise expectations that cannot be satisfied in
-reality. In fact, the ATA specs clearly state that the time for an
-unload operation to complete is vendor specific. The hint in ATA-7
-that this will typically be within 500 milliseconds apparently has
-been dropped in ATA-8.
-
-There is a technical detail of this implementation that may cause some
-confusion and should be discussed here. When a head park request has
-been issued to a device successfully, all I/O operations on the
-controller port this device is attached to will be deferred. That is
-to say, any other device that may be connected to the same port will
-be affected too. The only exception is that a subsequent head unload
-request to that other device will be executed immediately. Further
-operations on that port will be deferred until the timeout specified
-for either device on the port has expired. As far as PATA (old style
-IDE) configurations are concerned, there can only be two devices
-attached to any single port. In SATA world we have port multipliers
-which means that a user-issued head parking request to one device may
-actually result in stopping I/O to a whole bunch of devices. However,
-since this feature is supposed to be used on laptops and does not seem
-to be very useful in any other environment, there will be mostly one
-device per port. Even if the CD/DVD writer happens to be connected to
-the same port as the hard drive, it generally *should* recover just
-fine from the occasional buffer under-run incurred by a head park
-request to the HD. Actually, when you are using an ide driver rather
-than its libata counterpart (i.e. your disk is called /dev/hda
-instead of /dev/sda), then parking the heads of one drive (drive X)
-will generally not affect the mode of operation of another drive
-(drive Y) on the same port as described above. It is only when a port
-reset is required to recover from an exception on drive Y that further
-I/O operations on that drive (and the reset itself) will be delayed
-until drive X is no longer in the parked state.
-
-Finally, there are some hard drives that only comply with an earlier
-version of the ATA standard than ATA-7, but do support the unload
-feature nonetheless. Unfortunately, there is no safe way Linux can
-detect these devices, so you won't be able to write to the
-unload_heads attribute. If you know that your device really does
-support the unload feature (for instance, because the vendor of your
-laptop or the hard drive itself told you so), then you can tell the
-kernel to enable the usage of this feature for that drive by writing
-the special value -1 to the unload_heads attribute:
-
-# echo -1 > /sys/block/sda/device/unload_heads
-
-will enable the feature for /dev/sda, and giving -2 instead of -1 will
-disable it again.
-
-
-3. References
--------------
-
-There are several laptops from different vendors featuring shock
-protection capabilities. As manufacturers have refused to support open
-source development of the required software components so far, Linux
-support for shock protection varies considerably between different
-hardware implementations. Ideally, this section should contain a list
-of pointers at different projects aiming at an implementation of shock
-protection on different systems. Unfortunately, I only know of a
-single project which, although still considered experimental, is fit
-for use. Please feel free to add projects that have been the victims
-of my ignorance.
-
-- http://www.thinkwiki.org/wiki/HDAPS
-  See this page for information about Linux support of the hard disk
-  active protection system as implemented in IBM/Lenovo Thinkpads.
-
-
-4. CREDITS
-----------
-
-This implementation of disk head parking has been inspired by a patch
-originally published by Jon Escombe <lists@dresco.co.uk>. My efforts
-to develop an implementation of this feature that is fit to be merged
-into mainline have been aided by various kernel developers, in
-particular by Tejun Heo and Bartlomiej Zolnierkiewicz.
diff --git a/Documentation/laptops/laptop-mode.txt b/Documentation/laptops/laptop-mode.txt
deleted file mode 100644
index 1c707fc9b141..000000000000
--- a/Documentation/laptops/laptop-mode.txt
+++ /dev/null
@@ -1,782 +0,0 @@
-How to conserve battery power using laptop-mode
------------------------------------------------
-
-Document Author: Bart Samwel (bart@samwel.tk)
-Date created: January 2, 2004
-Last modified: December 06, 2004
-
-Introduction
-------------
-
-Laptop mode is used to minimize the time that the hard disk needs to be spun up,
-to conserve battery power on laptops. It has been reported to cause significant
-power savings.
-
-Contents
---------
-
-* Introduction
-* Installation
-* Caveats
-* The Details
-* Tips & Tricks
-* Control script
-* ACPI integration
-* Monitoring tool
-
-
-Installation
-------------
-
-To use laptop mode, you don't need to set any kernel configuration options
-or anything. Simply install all the files included in this document, and
-laptop mode will automatically be started when you're on battery. For
-your convenience, a tarball containing an installer can be downloaded at:
-
-http://www.samwel.tk/laptop_mode/laptop_mode/
-
-To configure laptop mode, you need to edit the configuration file, which is
-located in /etc/default/laptop-mode on Debian-based systems, or in
-/etc/sysconfig/laptop-mode on other systems.
-
-Unfortunately, automatic enabling of laptop mode does not work for
-laptops that don't have ACPI. On those laptops, you need to start laptop
-mode manually. To start laptop mode, run "laptop_mode start", and to
-stop it, run "laptop_mode stop". (Note: The laptop mode tools package now
-has experimental support for APM, you might want to try that first.)
-
-
-Caveats
--------
-
-* The downside of laptop mode is that you have a chance of losing up to 10
-  minutes of work. If you cannot afford this, don't use it! The supplied ACPI
-  scripts automatically turn off laptop mode when the battery almost runs out,
-  so that you won't lose any data at the end of your battery life.
-
-* Most desktop hard drives have a very limited lifetime measured in spindown
-  cycles, typically about 50.000 times (it's usually listed on the spec sheet).
-  Check your drive's rating, and don't wear down your drive's lifetime if you
-  don't need to.
-
-* If you mount some of your ext3/reiserfs filesystems with the -n option, then
-  the control script will not be able to remount them correctly. You must set
-  DO_REMOUNTS=0 in the control script, otherwise it will remount them with the
-  wrong options -- or it will fail because it cannot write to /etc/mtab.
-
-* If you have your filesystems listed as type "auto" in fstab, like I did, then
-  the control script will not recognize them as filesystems that need remounting.
-  You must list the filesystems with their true type instead.
-
-* It has been reported that some versions of the mutt mail client use file access
-  times to determine whether a folder contains new mail. If you use mutt and
-  experience this, you must disable the noatime remounting by setting the option
-  DO_REMOUNT_NOATIME to 0 in the configuration file.
-
-
-The Details
------------
-
-Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is
-present for all kernels that have the laptop mode patch, regardless of any
-configuration options. When the knob is set, any physical disk I/O (that might
-have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The
-result of this is that after a disk has spun down, it will not be spun up
-anymore to write dirty blocks, because those blocks had already been written
-immediately after the most recent read operation. The value of the laptop_mode
-knob determines the time between the occurrence of disk I/O and when the flush
-is triggered. A sensible value for the knob is 5 seconds. Setting the knob to
-0 disables laptop mode.
-
-To increase the effectiveness of the laptop_mode strategy, the laptop_mode
-control script increases dirty_expire_centisecs and dirty_writeback_centisecs in
-/proc/sys/vm to about 10 minutes (by default), which means that pages that are
-dirtied are not forced to be written to disk as often. The control script also
-changes the dirty background ratio, so that background writeback of dirty pages
-is not done anymore. Combined with a higher commit value (also 10 minutes) for
-ext3 or ReiserFS filesystems (also done automatically by the control script),
-this results in concentration of disk activity in a small time interval which
-occurs only once every 10 minutes, or whenever the disk is forced to spin up by
-a cache miss. The disk can then be spun down in the periods of inactivity.
-
-If you want to find out which process caused the disk to spin up, you can
-gather information by setting the flag /proc/sys/vm/block_dump. When this flag
-is set, Linux reports all disk read and write operations that take place, and
-all block dirtyings done to files. This makes it possible to debug why a disk
-needs to spin up, and to increase battery life even more. The output of
-block_dump is written to the kernel output, and it can be retrieved using
-"dmesg". When you use block_dump and your kernel logging level also includes
-kernel debugging messages, you probably want to turn off klogd, otherwise
-the output of block_dump will be logged, causing disk activity that is not
-normally there.
-
-
-Configuration
--------------
-
-The laptop mode configuration file is located in /etc/default/laptop-mode on
-Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. It
-contains the following options:
-
-MAX_AGE:
-
-Maximum time, in seconds, of hard drive spindown time that you are
-comfortable with. Worst case, it's possible that you could lose this
-amount of work if your battery fails while you're in laptop mode.
-
-MINIMUM_BATTERY_MINUTES:
-
-Automatically disable laptop mode if the remaining number of minutes of
-battery power is less than this value. Default is 10 minutes.
-
-AC_HD/BATT_HD:
-
-The idle timeout that should be set on your hard drive when laptop mode
-is active (BATT_HD) and when it is not active (AC_HD). The defaults are
-20 seconds (value 4) for BATT_HD  and 2 hours (value 244) for AC_HD. The
-possible values are those listed in the manual page for "hdparm" for the
-"-S" option.
-
-HD:
-
-The devices for which the spindown timeout should be adjusted by laptop mode.
-Default is /dev/hda. If you specify multiple devices, separate them by a space.
-
-READAHEAD:
-
-Disk readahead, in 512-byte sectors, while laptop mode is active. A large
-readahead can prevent disk accesses for things like executable pages (which are
-loaded on demand while the application executes) and sequentially accessed data
-(MP3s).
-
-DO_REMOUNTS:
-
-The control script automatically remounts any mounted journaled filesystems
-with appropriate commit interval options. When this option is set to 0, this
-feature is disabled.
-
-DO_REMOUNT_NOATIME:
-
-When remounting, should the filesystems be remounted with the noatime option?
-Normally, this is set to "1" (enabled), but there may be programs that require
-access time recording.
-
-DIRTY_RATIO:
-
-The percentage of memory that is allowed to contain "dirty" or unsaved data
-before a writeback is forced, while laptop mode is active. Corresponds to
-the /proc/sys/vm/dirty_ratio sysctl.
-
-DIRTY_BACKGROUND_RATIO:
-
-The percentage of memory that is allowed to contain "dirty" or unsaved data
-after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set
-this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio
-sysctl.
-
-Note that the behaviour of dirty_background_ratio is quite different
-when laptop mode is active and when it isn't. When laptop mode is inactive,
-dirty_background_ratio is the threshold percentage at which background writeouts
-start taking place. When laptop mode is active, however, background writeouts
-are disabled, and the dirty_background_ratio only determines how much writeback
-is done when dirty_ratio is reached.
-
-DO_CPU:
-
-Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup.
-See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.)
-
-CPU_MAXFREQ:
-
-When on battery, what is the maximum CPU speed that the system should use? Legal
-values are "slowest" for the slowest speed that your CPU is able to operate at,
-or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies.
-
-
-Tips & Tricks
--------------
-
-* Bartek Kania reports getting up to 50 minutes of extra battery life (on top
-  of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1).
-
-* You can spin down the disk while playing MP3, by setting disk readahead
-  to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at
-  once, and will then spin down while the MP3 is playing. (Thanks to Bartek
-  Kania.)
-
-* Drew Scott Daniels observed: "I don't know why, but when I decrease the number
-  of colours that my display uses it consumes less battery power. I've seen
-  this on powerbooks too. I hope that this is a piece of information that
-  might be useful to the Laptop Mode patch or its users."
-
-* In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the
-  file after every logging. When you're using laptop-mode and your disk doesn't
-  spin down, this is a likely culprit.
-
-* Richard Atterer observed that laptop mode does not work well with noflushd
-  (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode
-  from doing its thing.
-
-* If you're worried about your data, you might want to consider using a USB
-  memory stick or something like that as a "working area". (Be aware though
-  that flash memory can only handle a limited number of writes, and overuse
-  may wear out your memory stick pretty quickly. Do _not_ use journalling
-  filesystems on flash memory sticks.)
-
-
-Configuration file for control and ACPI battery scripts
--------------------------------------------------------
-
-This allows the tunables to be changed for the scripts via an external
-configuration file
-
-It should be installed as /etc/default/laptop-mode on Debian, and as
-/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes.
-
---------------------CONFIG FILE BEGIN-------------------------------------------
-# Maximum time, in seconds, of hard drive spindown time that you are
-# comfortable with. Worst case, it's possible that you could lose this
-# amount of work if your battery fails you while in laptop mode.
-#MAX_AGE=600
-
-# Automatically disable laptop mode when the number of minutes of battery
-# that you have left goes below this threshold.
-MINIMUM_BATTERY_MINUTES=10
-
-# Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG
-# by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk
-# will read a complete MP3 at once, and will then spin down while the MP3/OGG is
-# playing.
-#READAHEAD=4096
-
-# Shall we remount journaled fs. with appropriate commit interval? (1=yes)
-#DO_REMOUNTS=1
-
-# And shall we add the "noatime" option to that as well? (1=yes)
-#DO_REMOUNT_NOATIME=1
-
-# Dirty synchronous ratio.  At this percentage of dirty pages the process
-# which
-# calls write() does its own writeback
-#DIRTY_RATIO=40
-
-#
-# Allowed dirty background ratio, in percent.  Once DIRTY_RATIO has been
-# exceeded, the kernel will wake flusher threads which will then reduce the
-# amount of dirty memory to dirty_background_ratio.  Set this nice and low,
-# so once some writeout has commenced, we do a lot of it.
-#
-#DIRTY_BACKGROUND_RATIO=5
-
-# kernel default dirty buffer age
-#DEF_AGE=30
-#DEF_UPDATE=5
-#DEF_DIRTY_BACKGROUND_RATIO=10
-#DEF_DIRTY_RATIO=40
-#DEF_XFS_AGE_BUFFER=15
-#DEF_XFS_SYNC_INTERVAL=30
-#DEF_XFS_BUFD_INTERVAL=1
-
-# This must be adjusted manually to the value of HZ in the running kernel
-# on 2.4, until the XFS people change their 2.4 external interfaces to work in
-# centisecs. This can be automated, but it's a work in progress that still
-# needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for
-# external interfaces, and that is currently always set to 100. So you don't
-# need to change this on 2.6.
-#XFS_HZ=100
-
-# Should the maximum CPU frequency be adjusted down while on battery?
-# Requires CPUFreq to be setup.
-# See Documentation/admin-guide/pm/cpufreq.rst for more info
-#DO_CPU=0
-
-# When on battery what is the maximum CPU speed that the system should
-# use? Legal values are "slowest" for the slowest speed that your
-# CPU is able to operate at, or a value listed in:
-# /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
-# Only applicable if DO_CPU=1.
-#CPU_MAXFREQ=slowest
-
-# Idle timeout for your hard drive (man hdparm for valid values, -S option)
-# Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4).
-#AC_HD=244
-#BATT_HD=4
-
-# The drives for which to adjust the idle timeout. Separate them by a space,
-# e.g. HD="/dev/hda /dev/hdb".
-#HD="/dev/hda"
-
-# Set the spindown timeout on a hard drive?
-#DO_HD=1
-
---------------------CONFIG FILE END---------------------------------------------
-
-
-Control script
---------------
-
-Please note that this control script works for the Linux 2.4 and 2.6 series (thanks
-to Kiko Piris).
-
---------------------CONTROL SCRIPT BEGIN----------------------------------------
-#!/bin/bash
-
-# start or stop laptop_mode, best run by a power management daemon when
-# ac gets connected/disconnected from a laptop
-#
-# install as /sbin/laptop_mode
-#
-# Contributors to this script:   Kiko Piris
-#				 Bart Samwel
-#				 Micha Feigin
-#				 Andrew Morton
-#				 Herve Eychenne
-#				 Dax Kelson
-#
-# Original Linux 2.4 version by: Jens Axboe
-
-#############################################################################
-
-# Source config
-if [ -f /etc/default/laptop-mode ] ; then
-	# Debian
-	. /etc/default/laptop-mode
-elif [ -f /etc/sysconfig/laptop-mode ] ; then
-	# Others
-        . /etc/sysconfig/laptop-mode
-fi
-
-# Don't raise an error if the config file is incomplete
-# set defaults instead:
-
-# Maximum time, in seconds, of hard drive spindown time that you are
-# comfortable with. Worst case, it's possible that you could lose this
-# amount of work if your battery fails you while in laptop mode.
-MAX_AGE=${MAX_AGE:-'600'}
-
-# Read-ahead, in kilobytes
-READAHEAD=${READAHEAD:-'4096'}
-
-# Shall we remount journaled fs. with appropriate commit interval? (1=yes)
-DO_REMOUNTS=${DO_REMOUNTS:-'1'}
-
-# And shall we add the "noatime" option to that as well? (1=yes)
-DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'}
-
-# Shall we adjust the idle timeout on a hard drive?
-DO_HD=${DO_HD:-'1'}
-
-# Adjust idle timeout on which hard drive?
-HD="${HD:-'/dev/hda'}"
-
-# spindown time for HD (hdparm -S values)
-AC_HD=${AC_HD:-'244'}
-BATT_HD=${BATT_HD:-'4'}
-
-# Dirty synchronous ratio.  At this percentage of dirty pages the process which
-# calls write() does its own writeback
-DIRTY_RATIO=${DIRTY_RATIO:-'40'}
-
-# cpu frequency scaling
-# See Documentation/admin-guide/pm/cpufreq.rst for more info
-DO_CPU=${CPU_MANAGE:-'0'}
-CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'}
-
-#
-# Allowed dirty background ratio, in percent.  Once DIRTY_RATIO has been
-# exceeded, the kernel will wake flusher threads which will then reduce the
-# amount of dirty memory to dirty_background_ratio.  Set this nice and low,
-# so once some writeout has commenced, we do a lot of it.
-#
-DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'}
-
-# kernel default dirty buffer age
-DEF_AGE=${DEF_AGE:-'30'}
-DEF_UPDATE=${DEF_UPDATE:-'5'}
-DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'}
-DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'}
-DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'}
-DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'}
-DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'}
-
-# This must be adjusted manually to the value of HZ in the running kernel
-# on 2.4, until the XFS people change their 2.4 external interfaces to work in
-# centisecs. This can be automated, but it's a work in progress that still needs
-# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external
-# interfaces, and that is currently always set to 100. So you don't need to
-# change this on 2.6.
-XFS_HZ=${XFS_HZ:-'100'}
-
-#############################################################################
-
-KLEVEL="$(uname -r |
-             {
-	       IFS='.' read a b c
-	       echo $a.$b
-	     }
-)"
-case "$KLEVEL" in
-	"2.4"|"2.6")
-		;;
-	*)
-		echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2
-		exit 1
-		;;
-esac
-
-if [ ! -e /proc/sys/vm/laptop_mode ] ; then
-	echo "Kernel is not patched with laptop_mode patch." >&2
-	exit 1
-fi
-
-if [ ! -w /proc/sys/vm/laptop_mode ] ; then
-	echo "You do not have enough privileges to enable laptop_mode." >&2
-	exit 1
-fi
-
-# Remove an option (the first parameter) of the form option=<number> from
-# a mount options string (the rest of the parameters).
-parse_mount_opts () {
-	OPT="$1"
-	shift
-	echo ",$*," | sed		\
-	 -e 's/,'"$OPT"'=[0-9]*,/,/g'	\
-	 -e 's/,,*/,/g'			\
-	 -e 's/^,//'			\
-	 -e 's/,$//'
-}
-
-# Remove an option (the first parameter) without any arguments from
-# a mount option string (the rest of the parameters).
-parse_nonumber_mount_opts () {
-	OPT="$1"
-	shift
-	echo ",$*," | sed		\
-	 -e 's/,'"$OPT"',/,/g'		\
-	 -e 's/,,*/,/g'			\
-	 -e 's/^,//'			\
-	 -e 's/,$//'
-}
-
-# Find out the state of a yes/no option (e.g. "atime"/"noatime") in
-# fstab for a given filesystem, and use this state to replace the
-# value of the option in another mount options string. The device
-# is the first argument, the option name the second, and the default
-# value the third. The remainder is the mount options string.
-#
-# Example:
-# parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime
-#
-# If fstab contains, say, "rw" for this filesystem, then the result
-# will be "defaults,atime".
-parse_yesno_opts_wfstab () {
-	L_DEV="$1"
-	OPT="$2"
-	DEF_OPT="$3"
-	shift 3
-	L_OPTS="$*"
-	PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)"
-	PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)"
-	# Watch for a default atime in fstab
-	FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
-	if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then
-		# option specified in fstab: extract the value and use it
-		if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then
-			echo "$PARSEDOPTS1,no$OPT"
-		else
-			# no$OPT not found -- so we must have $OPT.
-			echo "$PARSEDOPTS1,$OPT"
-		fi
-	else
-		# option not specified in fstab -- choose the default.
-		echo "$PARSEDOPTS1,$DEF_OPT"
-	fi
-}
-
-# Find out the state of a numbered option (e.g. "commit=NNN") in
-# fstab for a given filesystem, and use this state to replace the
-# value of the option in another mount options string. The device
-# is the first argument, and the option name the second. The
-# remainder is the mount options string in which the replacement
-# must be done.
-#
-# Example:
-# parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7
-#
-# If fstab contains, say, "commit=3,rw" for this filesystem, then the
-# result will be "rw,commit=3".
-parse_mount_opts_wfstab () {
-	L_DEV="$1"
-	OPT="$2"
-	shift 2
-	L_OPTS="$*"
-	PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)"
-	# Watch for a default commit in fstab
-	FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
-	if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then
-		# option specified in fstab: extract the value, and use it
-		echo -n "$PARSEDOPTS1,$OPT="
-		echo ",$FSTAB_OPTS," | sed \
-		 -e 's/.*,'"$OPT"'=//'	\
-		 -e 's/,.*//'
-	else
-		# option not specified in fstab: set it to 0
-		echo "$PARSEDOPTS1,$OPT=0"
-	fi
-}
-
-deduce_fstype () {
-	MP="$1"
-	# My root filesystem unfortunately has
-	# type "unknown" in /etc/mtab. If we encounter
-	# "unknown", we try to get the type from fstab.
-	cat /etc/fstab |
-	grep -v '^#' |
-	while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do
-		if [ "$FSTAB_MP" = "$MP" ]; then
-			echo $FSTAB_FST
-			exit 0
-		fi
-	done
-}
-
-if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then
-	NOATIME_OPT=",noatime"
-fi
-
-case "$1" in
-	start)
-		AGE=$((100*$MAX_AGE))
-		XFS_AGE=$(($XFS_HZ*$MAX_AGE))
-		echo -n "Starting laptop_mode"
-
-		if [ -d /proc/sys/vm/pagebuf ] ; then
-			# (For 2.4 and early 2.6.)
-			# This only needs to be set, not reset -- it is only used when
-			# laptop mode is enabled.
-			echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age
-			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
-		elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
-			# (A couple of early 2.6 laptop mode patches had these.)
-			# The same goes for these.
-			echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer
-			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
-		elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then
-			# (2.6.6)
-			# But not for these -- they are also used in normal
-			# operation.
-			echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer
-			echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval
-		elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
-			# (2.6.7 upwards)
-			# And not for these either. These are in centisecs,
-			# not USER_HZ, so we have to use $AGE, not $XFS_AGE.
-			echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs
-			echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs
-			echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs
-		fi
-
-		case "$KLEVEL" in
-			"2.4")
-				echo 1					> /proc/sys/vm/laptop_mode
-				echo "30 500 0 0 $AGE $AGE 60 20 0"	> /proc/sys/vm/bdflush
-				;;
-			"2.6")
-				echo 5					> /proc/sys/vm/laptop_mode
-				echo "$AGE"				> /proc/sys/vm/dirty_writeback_centisecs
-				echo "$AGE"				> /proc/sys/vm/dirty_expire_centisecs
-				echo "$DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
-				echo "$DIRTY_BACKGROUND_RATIO"		> /proc/sys/vm/dirty_background_ratio
-				;;
-		esac
-		if [ $DO_REMOUNTS -eq 1 ]; then
-			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
-				PARSEDOPTS="$(parse_mount_opts "$OPTS")"
-				if [ "$FST" = 'unknown' ]; then
-					FST=$(deduce_fstype $MP)
-				fi
-				case "$FST" in
-					"ext3"|"reiserfs")
-						PARSEDOPTS="$(parse_mount_opts commit "$OPTS")"
-						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT
-						;;
-					"xfs")
-						mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT
-						;;
-				esac
-				if [ -b $DEV ] ; then
-					blockdev --setra $(($READAHEAD * 2)) $DEV
-				fi
-			done
-		fi
-		if [ $DO_HD -eq 1 ] ; then
-			for THISHD in $HD ; do
-				/sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1
-				/sbin/hdparm -B 1 $THISHD > /dev/null 2>&1
-			done
-		fi
-		if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
-			if [ $CPU_MAXFREQ = 'slowest' ]; then
-				CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq`
-			fi
-			echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
-		fi
-		echo "."
-		;;
-	stop)
-		U_AGE=$((100*$DEF_UPDATE))
-		B_AGE=$((100*$DEF_AGE))
-		echo -n "Stopping laptop_mode"
-		echo 0 > /proc/sys/vm/laptop_mode
-		if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
-			# These need to be restored, if there are no lm_*.
-			echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER))	 	> /proc/sys/fs/xfs/age_buffer
-			echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) 	> /proc/sys/fs/xfs/sync_interval
-		elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
-			# These need to be restored as well.
-			echo $((100*$DEF_XFS_AGE_BUFFER))	> /proc/sys/fs/xfs/age_buffer_centisecs
-			echo $((100*$DEF_XFS_SYNC_INTERVAL))	> /proc/sys/fs/xfs/xfssyncd_centisecs
-			echo $((100*$DEF_XFS_BUFD_INTERVAL))	> /proc/sys/fs/xfs/xfsbufd_centisecs
-		fi
-		case "$KLEVEL" in
-			"2.4")
-				echo "30 500 0 0 $U_AGE $B_AGE 60 20 0"	> /proc/sys/vm/bdflush
-				;;
-			"2.6")
-				echo "$U_AGE"				> /proc/sys/vm/dirty_writeback_centisecs
-				echo "$B_AGE"				> /proc/sys/vm/dirty_expire_centisecs
-				echo "$DEF_DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
-				echo "$DEF_DIRTY_BACKGROUND_RATIO"	> /proc/sys/vm/dirty_background_ratio
-				;;
-		esac
-		if [ $DO_REMOUNTS -eq 1 ] ; then
-			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
-				# Reset commit and atime options to defaults.
-				if [ "$FST" = 'unknown' ]; then
-					FST=$(deduce_fstype $MP)
-				fi
-				case "$FST" in
-					"ext3"|"reiserfs")
-						PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)"
-						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)"
-						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
-						;;
-					"xfs")
-						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)"
-						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
-						;;
-				esac
-				if [ -b $DEV ] ; then
-					blockdev --setra 256 $DEV
-				fi
-			done
-		fi
-		if [ $DO_HD -eq 1 ] ; then
-			for THISHD in $HD ; do
-				/sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1
-				/sbin/hdparm -B 255 $THISHD > /dev/null 2>&1
-			done
-		fi
-		if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
-			echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
-		fi
-		echo "."
-		;;
-	*)
-		echo "Usage: $0 {start|stop}" 2>&1
-		exit 1
-		;;
-
-esac
-
-exit 0
---------------------CONTROL SCRIPT END------------------------------------------
-
-
-ACPI integration
-----------------
-
-Dax Kelson submitted this so that the ACPI acpid daemon will
-kick off the laptop_mode script and run hdparm. The part that
-automatically disables laptop mode when the battery is low was
-written by Jan Topinski.
-
------------------/etc/acpi/events/ac_adapter BEGIN------------------------------
-event=ac_adapter
-action=/etc/acpi/actions/ac.sh %e
-----------------/etc/acpi/events/ac_adapter END---------------------------------
-
-
------------------/etc/acpi/events/battery BEGIN---------------------------------
-event=battery.*
-action=/etc/acpi/actions/battery.sh %e
-----------------/etc/acpi/events/battery END------------------------------------
-
-
-----------------/etc/acpi/actions/ac.sh BEGIN-----------------------------------
-#!/bin/bash
-
-# ac on/offline event handler
-
-status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state`
-
-case $status in
-        "on-line")
-                /sbin/laptop_mode stop
-                exit 0
-        ;;
-        "off-line")
-                /sbin/laptop_mode start
-                exit 0
-        ;;
-esac
----------------------------/etc/acpi/actions/ac.sh END--------------------------
-
-
----------------------------/etc/acpi/actions/battery.sh BEGIN-------------------
-#! /bin/bash
-
-# Automatically disable laptop mode when the battery almost runs out.
-
-BATT_INFO=/proc/acpi/battery/$2/state
-
-if [[ -f /proc/sys/vm/laptop_mode ]]
-then
-   LM=`cat /proc/sys/vm/laptop_mode`
-   if [[ $LM -gt 0 ]]
-   then
-     if [[ -f $BATT_INFO ]]
-     then
-        # Source the config file only now that we know we need
-        if [ -f /etc/default/laptop-mode ] ; then
-                # Debian
-                . /etc/default/laptop-mode
-        elif [ -f /etc/sysconfig/laptop-mode ] ; then
-                # Others
-                . /etc/sysconfig/laptop-mode
-        fi
-        MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'}
-
-        ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`"
-        if [[ ACTION -eq "discharging" ]]
-        then
-           PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed  "s/.* \([0-9][0-9]* \).*/\1/" `
-           REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed  "s/.* \([0-9][0-9]* \).*/\1/" `
-        fi
-        if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES))
-        then
-           /sbin/laptop_mode stop
-        fi
-     else
-       logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path."
-     fi
-   fi
-fi
----------------------------/etc/acpi/actions/battery.sh END--------------------
-
-
-Monitoring tool
----------------
-
-Bartek Kania submitted this, it can be used to measure how much time your disk
-spends spun up/down.  See tools/laptop/dslm/dslm.c
diff --git a/Documentation/laptops/sony-laptop.txt b/Documentation/laptops/sony-laptop.txt
deleted file mode 100644
index 978b1e615155..000000000000
--- a/Documentation/laptops/sony-laptop.txt
+++ /dev/null
@@ -1,144 +0,0 @@
-Sony Notebook Control Driver (SNC) Readme
------------------------------------------
-	Copyright (C) 2004- 2005 Stelian Pop <stelian@popies.net>
-	Copyright (C) 2007 Mattia Dongili <malattia@linux.it>
-
-This mini-driver drives the SNC and SPIC device present in the ACPI BIOS of the
-Sony Vaio laptops. This driver mixes both devices functions under the same
-(hopefully consistent) interface. This also means that the sonypi driver is
-obsoleted by sony-laptop now.
-
-Fn keys (hotkeys):
-------------------
-Some models report hotkeys through the SNC or SPIC devices, such events are
-reported both through the ACPI subsystem as acpi events and through the INPUT
-subsystem. See the logs of /proc/bus/input/devices to find out what those
-events are and which input devices are created by the driver.
-Additionally, loading the driver with the debug option will report all events
-in the kernel log.
-
-The "scancodes" passed to the input system (that can be remapped with udev)
-are indexes to the table "sony_laptop_input_keycode_map" in the sony-laptop.c
-module.  For example the "FN/E" key combination (EJECTCD on some models)
-generates the scancode 20 (0x14).
-
-Backlight control:
-------------------
-If your laptop model supports it, you will find sysfs files in the
-/sys/class/backlight/sony/
-directory. You will be able to query and set the current screen
-brightness:
-	brightness		get/set screen brightness (an integer
-				between 0 and 7)
-	actual_brightness	reading from this file will query the HW
-				to get real brightness value
-	max_brightness		the maximum brightness value
-
-
-Platform specific:
-------------------
-Loading the sony-laptop module will create a
-/sys/devices/platform/sony-laptop/
-directory populated with some files.
-
-You then read/write integer values from/to those files by using
-standard UNIX tools.
-
-The files are:
-	brightness_default	screen brightness which will be set
-				when the laptop will be rebooted
-	cdpower			power on/off the internal CD drive
-	audiopower		power on/off the internal sound card
-	lanpower		power on/off the internal ethernet card
-				(only in debug mode)
-	bluetoothpower		power on/off the internal bluetooth device
-	fanspeed		get/set the fan speed
-
-Note that some files may be missing if they are not supported
-by your particular laptop model.
-
-Example usage:
-	# echo "1" > /sys/devices/platform/sony-laptop/brightness_default
-sets the lowest screen brightness for the next and later reboots,
-	# echo "8" > /sys/devices/platform/sony-laptop/brightness_default
-sets the highest screen brightness for the next and later reboots,
-	# cat /sys/devices/platform/sony-laptop/brightness_default
-retrieves the value.
-
-	# echo "0" > /sys/devices/platform/sony-laptop/audiopower
-powers off the sound card,
-	# echo "1" > /sys/devices/platform/sony-laptop/audiopower
-powers on the sound card.
-
-
-RFkill control:
----------------
-More recent Vaio models expose a consistent set of ACPI methods to
-control radio frequency emitting devices. If you are a lucky owner of
-such a laptop you will find the necessary rfkill devices under
-/sys/class/rfkill. Check those starting with sony-* in
-	# grep . /sys/class/rfkill/*/{state,name}
-
-
-Development:
-------------
-
-If you want to help with the development of this driver (and
-you are not afraid of any side effects doing strange things with
-your ACPI BIOS could have on your laptop), load the driver and
-pass the option 'debug=1'.
-
-REPEAT: DON'T DO THIS IF YOU DON'T LIKE RISKY BUSINESS.
-
-In your kernel logs you will find the list of all ACPI methods
-the SNC device has on your laptop.
-
-* For new models you will see a long list of meaningless method names,
-reading the DSDT table source should reveal that:
-(1) the SNC device uses an internal capability lookup table
-(2) SN00 is used to find values in the lookup table
-(3) SN06 and SN07 are used to call into the real methods based on
-    offsets you can obtain iterating the table using SN00
-(4) SN02 used to enable events.
-Some values in the capability lookup table are more or less known, see
-the code for all sony_call_snc_handle calls, others are more obscure.
-
-* For old models you can see the GCDP/GCDP methods used to pwer on/off
-the CD drive, but there are others and they are usually different from
-model to model.
-
-I HAVE NO IDEA WHAT THOSE METHODS DO.
-
-The sony-laptop driver creates, for some of those methods (the most
-current ones found on several Vaio models), an entry under
-/sys/devices/platform/sony-laptop, just like the 'cdpower' one.
-You can create other entries corresponding to your own laptop methods by
-further editing the source (see the 'sony_nc_values' table, and add a new
-entry to this table with your get/set method names using the
-SNC_HANDLE_NAMES macro).
-
-Your mission, should you accept it, is to try finding out what
-those entries are for, by reading/writing random values from/to those
-files and find out what is the impact on your laptop.
-
-Should you find anything interesting, please report it back to me,
-I will not disavow all knowledge of your actions :)
-
-See also http://www.linux.it/~malattia/wiki/index.php/Sony_drivers for other
-useful info.
-
-Bugs/Limitations:
------------------
-
-* This driver is not based on official documentation from Sony
-  (because there is none), so there is no guarantee this driver
-  will work at all, or do the right thing. Although this hasn't
-  happened to me, this driver could do very bad things to your
-  laptop, including permanent damage.
-
-* The sony-laptop and sonypi drivers do not interact at all. In the
-  future, sonypi will be removed and replaced by sony-laptop.
-
-* spicctrl, which is the userspace tool used to communicate with the
-  sonypi driver (through /dev/sonypi) is deprecated as well since all
-  its features are now available under the sysfs tree via sony-laptop.
diff --git a/Documentation/laptops/sonypi.txt b/Documentation/laptops/sonypi.txt
deleted file mode 100644
index 606bdb9ce036..000000000000
--- a/Documentation/laptops/sonypi.txt
+++ /dev/null
@@ -1,152 +0,0 @@
-Sony Programmable I/O Control Device Driver Readme
---------------------------------------------------
-	Copyright (C) 2001-2004 Stelian Pop <stelian@popies.net>
-	Copyright (C) 2001-2002 Alcôve <www.alcove.com>
-	Copyright (C) 2001 Michael Ashley <m.ashley@unsw.edu.au>
-	Copyright (C) 2001 Junichi Morita <jun1m@mars.dti.ne.jp>
-	Copyright (C) 2000 Takaya Kinjo <t-kinjo@tc4.so-net.ne.jp>
-	Copyright (C) 2000 Andrew Tridgell <tridge@samba.org>
-
-This driver enables access to the Sony Programmable I/O Control Device which
-can be found in many Sony Vaio laptops. Some newer Sony laptops (seems to be
-limited to new FX series laptops, at least the FX501 and the FX702) lack a
-sonypi device and are not supported at all by this driver.
-
-It will give access (through a user space utility) to some events those laptops
-generate, like:
-	- jogdial events (the small wheel on the side of Vaios)
-	- capture button events (only on Vaio Picturebook series)
-	- Fn keys
-	- bluetooth button (only on C1VR model)
-	- programmable keys, back, help, zoom, thumbphrase buttons, etc.
-	  (when available)
-
-Those events (see linux/sonypi.h) can be polled using the character device node
-/dev/sonypi (major 10, minor auto allocated or specified as a option).
-A simple daemon which translates the jogdial movements into mouse wheel events
-can be downloaded at: <http://popies.net/sonypi/>
-
-Another option to intercept the events is to get them directly through the
-input layer.
-
-This driver supports also some ioctl commands for setting the LCD screen
-brightness and querying the batteries charge information (some more
-commands may be added in the future).
-
-This driver can also be used to set the camera controls on Picturebook series
-(brightness, contrast etc), and is used by the video4linux driver for the
-Motion Eye camera.
-
-Please note that this driver was created by reverse engineering the Windows
-driver and the ACPI BIOS, because Sony doesn't agree to release any programming
-specs for its laptops. If someone convinces them to do so, drop me a note.
-
-Driver options:
----------------
-
-Several options can be passed to the sonypi driver using the standard
-module argument syntax (<param>=<value> when passing the option to the
-module or sonypi.<param>=<value> on the kernel boot line when sonypi is
-statically linked into the kernel). Those options are:
-
-	minor: 		minor number of the misc device /dev/sonypi,
-			default is -1 (automatic allocation, see /proc/misc
-			or kernel logs)
-
-	camera:		if you have a PictureBook series Vaio (with the
-			integrated MotionEye camera), set this parameter to 1
-			in order to let the driver access to the camera
-
-	fnkeyinit:	on some Vaios (C1VE, C1VR etc), the Fn key events don't
-			get enabled unless you set this parameter to 1.
-			Do not use this option unless it's actually necessary,
-			some Vaio models don't deal well with this option.
-			This option is available only if the kernel is
-			compiled without ACPI support (since it conflicts
-			with it and it shouldn't be required anyway if
-			ACPI is already enabled).
-
-	verbose:	set to 1 to print unknown events received from the
-			sonypi device.
-			set to 2 to print all events received from the
-			sonypi device.
-
-	compat:		uses some compatibility code for enabling the sonypi
-			events. If the driver worked for you in the past
-			(prior to version 1.5) and does not work anymore,
-			add this option and report to the author.
-
-	mask:		event mask telling the driver what events will be
-			reported to the user. This parameter is required for
-			some Vaio models where the hardware reuses values
-			used in other Vaio models (like the FX series who does
-			not have a jogdial but reuses the jogdial events for
-			programmable keys events). The default event mask is
-			set to 0xffffffff, meaning that all possible events
-			will be tried. You can use the following bits to
-			construct your own event mask (from
-			drivers/char/sonypi.h):
-				SONYPI_JOGGER_MASK 		0x0001
-				SONYPI_CAPTURE_MASK 		0x0002
-				SONYPI_FNKEY_MASK 		0x0004
-				SONYPI_BLUETOOTH_MASK 		0x0008
-				SONYPI_PKEY_MASK 		0x0010
-				SONYPI_BACK_MASK 		0x0020
-				SONYPI_HELP_MASK 		0x0040
-				SONYPI_LID_MASK 		0x0080
-				SONYPI_ZOOM_MASK 		0x0100
-				SONYPI_THUMBPHRASE_MASK 	0x0200
-				SONYPI_MEYE_MASK		0x0400
-				SONYPI_MEMORYSTICK_MASK		0x0800
-				SONYPI_BATTERY_MASK		0x1000
-				SONYPI_WIRELESS_MASK		0x2000
-
-	useinput:	if set (which is the default) two input devices are
-			created, one which interprets the jogdial events as
-			mouse events, the other one which acts like a
-			keyboard reporting the pressing of the special keys.
-
-Module use:
------------
-
-In order to automatically load the sonypi module on use, you can put those
-lines a configuration file in /etc/modprobe.d/:
-
-	alias char-major-10-250 sonypi
-	options sonypi minor=250
-
-This supposes the use of minor 250 for the sonypi device:
-
-	# mknod /dev/sonypi c 10 250
-
-Bugs:
------
-
-	- several users reported that this driver disables the BIOS-managed
-	  Fn-keys which put the laptop in sleeping state, or switch the
-	  external monitor on/off. There is no workaround yet, since this
-	  driver disables all APM management for those keys, by enabling the
-	  ACPI management (and the ACPI core stuff is not complete yet). If
-	  you have one of those laptops with working Fn keys and want to
-	  continue to use them, don't use this driver.
-
-	- some users reported that the laptop speed is lower (dhrystone
-	  tested) when using the driver with the fnkeyinit parameter. I cannot
-	  reproduce it on my laptop and not all users have this problem.
-	  This happens because the fnkeyinit parameter enables the ACPI
-	  mode (but without additional ACPI control, like processor
-	  speed handling etc). Use ACPI instead of APM if it works on your
-	  laptop.
-
-	- sonypi lacks the ability to distinguish between certain key
-	  events on some models.
-
-	- some models with the nvidia card (geforce go 6200 tc) uses a
-	  different way to adjust the backlighting of the screen. There
-	  is a userspace utility to adjust the brightness on those models,
-	  which can be downloaded from
-	  http://www.acc.umu.se/~erikw/program/smartdimmer-0.1.tar.bz2
-
-	- since all development was done by reverse engineering, there is
-	  _absolutely no guarantee_ that this driver will not crash your
-	  laptop. Permanently.
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
deleted file mode 100644
index 6cced88de6da..000000000000
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ /dev/null
@@ -1,1487 +0,0 @@
-		     ThinkPad ACPI Extras Driver
-
-                            Version 0.25
-                        October 16th,  2013
-
-               Borislav Deianov <borislav@users.sf.net>
-             Henrique de Moraes Holschuh <hmh@hmh.eng.br>
-                      http://ibm-acpi.sf.net/
-
-
-This is a Linux driver for the IBM and Lenovo ThinkPad laptops. It
-supports various features of these laptops which are accessible
-through the ACPI and ACPI EC framework, but not otherwise fully
-supported by the generic Linux ACPI drivers.
-
-This driver used to be named ibm-acpi until kernel 2.6.21 and release
-0.13-20070314.  It used to be in the drivers/acpi tree, but it was
-moved to the drivers/misc tree and renamed to thinkpad-acpi for kernel
-2.6.22, and release 0.14.  It was moved to drivers/platform/x86 for
-kernel 2.6.29 and release 0.22.
-
-The driver is named "thinkpad-acpi".  In some places, like module
-names and log messages, "thinkpad_acpi" is used because of userspace
-issues.
-
-"tpacpi" is used as a shorthand where "thinkpad-acpi" would be too
-long due to length limitations on some Linux kernel versions.
-
-Status
-------
-
-The features currently supported are the following (see below for
-detailed description):
-
-	- Fn key combinations
-	- Bluetooth enable and disable
-	- video output switching, expansion control
-	- ThinkLight on and off
-	- CMOS/UCMS control
-	- LED control
-	- ACPI sounds
-	- temperature sensors
-	- Experimental: embedded controller register dump
-	- LCD brightness control
-	- Volume control
-	- Fan control and monitoring: fan speed, fan enable/disable
-	- WAN enable and disable
-	- UWB enable and disable
-
-A compatibility table by model and feature is maintained on the web
-site, http://ibm-acpi.sf.net/. I appreciate any success or failure
-reports, especially if they add to or correct the compatibility table.
-Please include the following information in your report:
-
-	- ThinkPad model name
-	- a copy of your ACPI tables, using the "acpidump" utility
-	- a copy of the output of dmidecode, with serial numbers
-	  and UUIDs masked off
-	- which driver features work and which don't
-	- the observed behavior of non-working features
-
-Any other comments or patches are also more than welcome.
-
-
-Installation
-------------
-
-If you are compiling this driver as included in the Linux kernel
-sources, look for the CONFIG_THINKPAD_ACPI Kconfig option.
-It is located on the menu path: "Device Drivers" -> "X86 Platform
-Specific Device Drivers" -> "ThinkPad ACPI Laptop Extras".
-
-
-Features
---------
-
-The driver exports two different interfaces to userspace, which can be
-used to access the features it provides.  One is a legacy procfs-based
-interface, which will be removed at some time in the future.  The other
-is a new sysfs-based interface which is not complete yet.
-
-The procfs interface creates the /proc/acpi/ibm directory.  There is a
-file under that directory for each feature it supports.  The procfs
-interface is mostly frozen, and will change very little if at all: it
-will not be extended to add any new functionality in the driver, instead
-all new functionality will be implemented on the sysfs interface.
-
-The sysfs interface tries to blend in the generic Linux sysfs subsystems
-and classes as much as possible.  Since some of these subsystems are not
-yet ready or stabilized, it is expected that this interface will change,
-and any and all userspace programs must deal with it.
-
-
-Notes about the sysfs interface:
-
-Unlike what was done with the procfs interface, correctness when talking
-to the sysfs interfaces will be enforced, as will correctness in the
-thinkpad-acpi's implementation of sysfs interfaces.
-
-Also, any bugs in the thinkpad-acpi sysfs driver code or in the
-thinkpad-acpi's implementation of the sysfs interfaces will be fixed for
-maximum correctness, even if that means changing an interface in
-non-compatible ways.  As these interfaces mature both in the kernel and
-in thinkpad-acpi, such changes should become quite rare.
-
-Applications interfacing to the thinkpad-acpi sysfs interfaces must
-follow all sysfs guidelines and correctly process all errors (the sysfs
-interface makes extensive use of errors).  File descriptors and open /
-close operations to the sysfs inodes must also be properly implemented.
-
-The version of thinkpad-acpi's sysfs interface is exported by the driver
-as a driver attribute (see below).
-
-Sysfs driver attributes are on the driver's sysfs attribute space,
-for 2.6.23+ this is /sys/bus/platform/drivers/thinkpad_acpi/ and
-/sys/bus/platform/drivers/thinkpad_hwmon/
-
-Sysfs device attributes are on the thinkpad_acpi device sysfs attribute
-space, for 2.6.23+ this is /sys/devices/platform/thinkpad_acpi/.
-
-Sysfs device attributes for the sensors and fan are on the
-thinkpad_hwmon device's sysfs attribute space, but you should locate it
-looking for a hwmon device with the name attribute of "thinkpad", or
-better yet, through libsensors. For 4.14+ sysfs attributes were moved to the
-hwmon device (/sys/bus/platform/devices/thinkpad_hwmon/hwmon/hwmon? or
-/sys/class/hwmon/hwmon?).
-
-Driver version
---------------
-
-procfs: /proc/acpi/ibm/driver
-sysfs driver attribute: version
-
-The driver name and version. No commands can be written to this file.
-
-
-Sysfs interface version
------------------------
-
-sysfs driver attribute: interface_version
-
-Version of the thinkpad-acpi sysfs interface, as an unsigned long
-(output in hex format: 0xAAAABBCC), where:
-	AAAA - major revision
-	BB - minor revision
-	CC - bugfix revision
-
-The sysfs interface version changelog for the driver can be found at the
-end of this document.  Changes to the sysfs interface done by the kernel
-subsystems are not documented here, nor are they tracked by this
-attribute.
-
-Changes to the thinkpad-acpi sysfs interface are only considered
-non-experimental when they are submitted to Linux mainline, at which
-point the changes in this interface are documented and interface_version
-may be updated.  If you are using any thinkpad-acpi features not yet
-sent to mainline for merging, you do so on your own risk: these features
-may disappear, or be implemented in a different and incompatible way by
-the time they are merged in Linux mainline.
-
-Changes that are backwards-compatible by nature (e.g. the addition of
-attributes that do not change the way the other attributes work) do not
-always warrant an update of interface_version.  Therefore, one must
-expect that an attribute might not be there, and deal with it properly
-(an attribute not being there *is* a valid way to make it clear that a
-feature is not available in sysfs).
-
-
-Hot keys
---------
-
-procfs: /proc/acpi/ibm/hotkey
-sysfs device attribute: hotkey_*
-
-In a ThinkPad, the ACPI HKEY handler is responsible for communicating
-some important events and also keyboard hot key presses to the operating
-system.  Enabling the hotkey functionality of thinkpad-acpi signals the
-firmware that such a driver is present, and modifies how the ThinkPad
-firmware will behave in many situations.
-
-The driver enables the HKEY ("hot key") event reporting automatically
-when loaded, and disables it when it is removed.
-
-The driver will report HKEY events in the following format:
-
-	ibm/hotkey HKEY 00000080 0000xxxx
-
-Some of these events refer to hot key presses, but not all of them.
-
-The driver will generate events over the input layer for hot keys and
-radio switches, and over the ACPI netlink layer for other events.  The
-input layer support accepts the standard IOCTLs to remap the keycodes
-assigned to each hot key.
-
-The hot key bit mask allows some control over which hot keys generate
-events.  If a key is "masked" (bit set to 0 in the mask), the firmware
-will handle it.  If it is "unmasked", it signals the firmware that
-thinkpad-acpi would prefer to handle it, if the firmware would be so
-kind to allow it (and it often doesn't!).
-
-Not all bits in the mask can be modified.  Not all bits that can be
-modified do anything.  Not all hot keys can be individually controlled
-by the mask.  Some models do not support the mask at all.  The behaviour
-of the mask is, therefore, highly dependent on the ThinkPad model.
-
-The driver will filter out any unmasked hotkeys, so even if the firmware
-doesn't allow disabling an specific hotkey, the driver will not report
-events for unmasked hotkeys.
-
-Note that unmasking some keys prevents their default behavior.  For
-example, if Fn+F5 is unmasked, that key will no longer enable/disable
-Bluetooth by itself in firmware.
-
-Note also that not all Fn key combinations are supported through ACPI
-depending on the ThinkPad model and firmware version.  On those
-ThinkPads, it is still possible to support some extra hotkeys by
-polling the "CMOS NVRAM" at least 10 times per second.  The driver
-attempts to enables this functionality automatically when required.
-
-procfs notes:
-
-The following commands can be written to the /proc/acpi/ibm/hotkey file:
-
-	echo 0xffffffff > /proc/acpi/ibm/hotkey -- enable all hot keys
-	echo 0 > /proc/acpi/ibm/hotkey -- disable all possible hot keys
-	... any other 8-hex-digit mask ...
-	echo reset > /proc/acpi/ibm/hotkey -- restore the recommended mask
-
-The following commands have been deprecated and will cause the kernel
-to log a warning:
-
-	echo enable > /proc/acpi/ibm/hotkey -- does nothing
-	echo disable > /proc/acpi/ibm/hotkey -- returns an error
-
-The procfs interface does not support NVRAM polling control.  So as to
-maintain maximum bug-to-bug compatibility, it does not report any masks,
-nor does it allow one to manipulate the hot key mask when the firmware
-does not support masks at all, even if NVRAM polling is in use.
-
-sysfs notes:
-
-	hotkey_bios_enabled:
-		DEPRECATED, WILL BE REMOVED SOON.
-
-		Returns 0.
-
-	hotkey_bios_mask:
-		DEPRECATED, DON'T USE, WILL BE REMOVED IN THE FUTURE.
-
-		Returns the hot keys mask when thinkpad-acpi was loaded.
-		Upon module unload, the hot keys mask will be restored
-		to this value.   This is always 0x80c, because those are
-		the hotkeys that were supported by ancient firmware
-		without mask support.
-
-	hotkey_enable:
-		DEPRECATED, WILL BE REMOVED SOON.
-
-		0: returns -EPERM
-		1: does nothing
-
-	hotkey_mask:
-		bit mask to enable reporting (and depending on
-		the firmware, ACPI event generation) for each hot key
-		(see above).  Returns the current status of the hot keys
-		mask, and allows one to modify it.
-
-	hotkey_all_mask:
-		bit mask that should enable event reporting for all
-		supported hot keys, when echoed to hotkey_mask above.
-		Unless you know which events need to be handled
-		passively (because the firmware *will* handle them
-		anyway), do *not* use hotkey_all_mask.  Use
-		hotkey_recommended_mask, instead. You have been warned.
-
-	hotkey_recommended_mask:
-		bit mask that should enable event reporting for all
-		supported hot keys, except those which are always
-		handled by the firmware anyway.  Echo it to
-		hotkey_mask above, to use.  This is the default mask
-		used by the driver.
-
-	hotkey_source_mask:
-		bit mask that selects which hot keys will the driver
-		poll the NVRAM for.  This is auto-detected by the driver
-		based on the capabilities reported by the ACPI firmware,
-		but it can be overridden at runtime.
-
-		Hot keys whose bits are set in hotkey_source_mask are
-		polled for in NVRAM, and reported as hotkey events if
-		enabled in hotkey_mask.  Only a few hot keys are
-		available through CMOS NVRAM polling.
-
-		Warning: when in NVRAM mode, the volume up/down/mute
-		keys are synthesized according to changes in the mixer,
-		which uses a single volume up or volume down hotkey
-		press to unmute, as per the ThinkPad volume mixer user
-		interface.  When in ACPI event mode, volume up/down/mute
-		events are reported by the firmware and can behave
-		differently (and that behaviour changes with firmware
-		version -- not just with firmware models -- as well as
-		OSI(Linux) state).
-
-	hotkey_poll_freq:
-		frequency in Hz for hot key polling. It must be between
-		0 and 25 Hz.  Polling is only carried out when strictly
-		needed.
-
-		Setting hotkey_poll_freq to zero disables polling, and
-		will cause hot key presses that require NVRAM polling
-		to never be reported.
-
-		Setting hotkey_poll_freq too low may cause repeated
-		pressings of the same hot key to be misreported as a
-		single key press, or to not even be detected at all.
-		The recommended polling frequency is 10Hz.
-
-	hotkey_radio_sw:
-		If the ThinkPad has a hardware radio switch, this
-		attribute will read 0 if the switch is in the "radios
-		disabled" position, and 1 if the switch is in the
-		"radios enabled" position.
-
-		This attribute has poll()/select() support.
-
-	hotkey_tablet_mode:
-		If the ThinkPad has tablet capabilities, this attribute
-		will read 0 if the ThinkPad is in normal mode, and
-		1 if the ThinkPad is in tablet mode.
-
-		This attribute has poll()/select() support.
-
-	wakeup_reason:
-		Set to 1 if the system is waking up because the user
-		requested a bay ejection.  Set to 2 if the system is
-		waking up because the user requested the system to
-		undock.  Set to zero for normal wake-ups or wake-ups
-		due to unknown reasons.
-
-		This attribute has poll()/select() support.
-
-	wakeup_hotunplug_complete:
-		Set to 1 if the system was waken up because of an
-		undock or bay ejection request, and that request
-		was successfully completed.  At this point, it might
-		be useful to send the system back to sleep, at the
-		user's choice.  Refer to HKEY events 0x4003 and
-		0x3003, below.
-
-		This attribute has poll()/select() support.
-
-input layer notes:
-
-A Hot key is mapped to a single input layer EV_KEY event, possibly
-followed by an EV_MSC MSC_SCAN event that shall contain that key's scan
-code.  An EV_SYN event will always be generated to mark the end of the
-event block.
-
-Do not use the EV_MSC MSC_SCAN events to process keys.  They are to be
-used as a helper to remap keys, only.  They are particularly useful when
-remapping KEY_UNKNOWN keys.
-
-The events are available in an input device, with the following id:
-
-	Bus:		BUS_HOST
-	vendor:		0x1014 (PCI_VENDOR_ID_IBM)  or
-			0x17aa (PCI_VENDOR_ID_LENOVO)
-	product:	0x5054 ("TP")
-	version:	0x4101
-
-The version will have its LSB incremented if the keymap changes in a
-backwards-compatible way.  The MSB shall always be 0x41 for this input
-device.  If the MSB is not 0x41, do not use the device as described in
-this section, as it is either something else (e.g. another input device
-exported by a thinkpad driver, such as HDAPS) or its functionality has
-been changed in a non-backwards compatible way.
-
-Adding other event types for other functionalities shall be considered a
-backwards-compatible change for this input device.
-
-Thinkpad-acpi Hot Key event map (version 0x4101):
-
-ACPI	Scan
-event	code	Key		Notes
-
-0x1001	0x00	FN+F1		-
-
-0x1002	0x01	FN+F2		IBM: battery (rare)
-				Lenovo: Screen lock
-
-0x1003	0x02	FN+F3		Many IBM models always report
-				this hot key, even with hot keys
-				disabled or with Fn+F3 masked
-				off
-				IBM: screen lock, often turns
-				off the ThinkLight as side-effect
-				Lenovo: battery
-
-0x1004	0x03	FN+F4		Sleep button (ACPI sleep button
-				semantics, i.e. sleep-to-RAM).
-				It always generates some kind
-				of event, either the hot key
-				event or an ACPI sleep button
-				event. The firmware may
-				refuse to generate further FN+F4
-				key presses until a S3 or S4 ACPI
-				sleep cycle is performed or some
-				time passes.
-
-0x1005	0x04	FN+F5		Radio.  Enables/disables
-				the internal Bluetooth hardware
-				and W-WAN card if left in control
-				of the firmware.  Does not affect
-				the WLAN card.
-				Should be used to turn on/off all
-				radios (Bluetooth+W-WAN+WLAN),
-				really.
-
-0x1006	0x05	FN+F6		-
-
-0x1007	0x06	FN+F7		Video output cycle.
-				Do you feel lucky today?
-
-0x1008	0x07	FN+F8		IBM: toggle screen expand
-				Lenovo: configure UltraNav,
-				or toggle screen expand
-
-0x1009	0x08	FN+F9		-
-	..	..		..
-0x100B	0x0A	FN+F11		-
-
-0x100C	0x0B	FN+F12		Sleep to disk.  You are always
-				supposed to handle it yourself,
-				either through the ACPI event,
-				or through a hotkey event.
-				The firmware may refuse to
-				generate further FN+F12 key
-				press events until a S3 or S4
-				ACPI sleep cycle is performed,
-				or some time passes.
-
-0x100D	0x0C	FN+BACKSPACE	-
-0x100E	0x0D	FN+INSERT	-
-0x100F	0x0E	FN+DELETE	-
-
-0x1010	0x0F	FN+HOME		Brightness up.  This key is
-				always handled by the firmware
-				in IBM ThinkPads, even when
-				unmasked.  Just leave it alone.
-				For Lenovo ThinkPads with a new
-				BIOS, it has to be handled either
-				by the ACPI OSI, or by userspace.
-				The driver does the right thing,
-				never mess with this.
-0x1011	0x10	FN+END		Brightness down.  See brightness
-				up for details.
-
-0x1012	0x11	FN+PGUP		ThinkLight toggle.  This key is
-				always handled by the firmware,
-				even when unmasked.
-
-0x1013	0x12	FN+PGDOWN	-
-
-0x1014	0x13	FN+SPACE	Zoom key
-
-0x1015	0x14	VOLUME UP	Internal mixer volume up. This
-				key is always handled by the
-				firmware, even when unmasked.
-				NOTE: Lenovo seems to be changing
-				this.
-0x1016	0x15	VOLUME DOWN	Internal mixer volume up. This
-				key is always handled by the
-				firmware, even when unmasked.
-				NOTE: Lenovo seems to be changing
-				this.
-0x1017	0x16	MUTE		Mute internal mixer. This
-				key is always handled by the
-				firmware, even when unmasked.
-
-0x1018	0x17	THINKPAD	ThinkPad/Access IBM/Lenovo key
-
-0x1019	0x18	unknown
-..	..	..
-0x1020	0x1F	unknown
-
-The ThinkPad firmware does not allow one to differentiate when most hot
-keys are pressed or released (either that, or we don't know how to, yet).
-For these keys, the driver generates a set of events for a key press and
-immediately issues the same set of events for a key release.  It is
-unknown by the driver if the ThinkPad firmware triggered these events on
-hot key press or release, but the firmware will do it for either one, not
-both.
-
-If a key is mapped to KEY_RESERVED, it generates no input events at all.
-If a key is mapped to KEY_UNKNOWN, it generates an input event that
-includes an scan code.  If a key is mapped to anything else, it will
-generate input device EV_KEY events.
-
-In addition to the EV_KEY events, thinkpad-acpi may also issue EV_SW
-events for switches:
-
-SW_RFKILL_ALL	T60 and later hardware rfkill rocker switch
-SW_TABLET_MODE	Tablet ThinkPads HKEY events 0x5009 and 0x500A
-
-Non hotkey ACPI HKEY event map:
--------------------------------
-
-Events that are never propagated by the driver:
-
-0x2304		System is waking up from suspend to undock
-0x2305		System is waking up from suspend to eject bay
-0x2404		System is waking up from hibernation to undock
-0x2405		System is waking up from hibernation to eject bay
-0x5001		Lid closed
-0x5002		Lid opened
-0x5009		Tablet swivel: switched to tablet mode
-0x500A		Tablet swivel: switched to normal mode
-0x5010		Brightness level changed/control event
-0x6000		KEYBOARD: Numlock key pressed
-0x6005		KEYBOARD: Fn key pressed (TO BE VERIFIED)
-0x7000		Radio Switch may have changed state
-
-
-Events that are propagated by the driver to userspace:
-
-0x2313		ALARM: System is waking up from suspend because
-		the battery is nearly empty
-0x2413		ALARM: System is waking up from hibernation because
-		the battery is nearly empty
-0x3003		Bay ejection (see 0x2x05) complete, can sleep again
-0x3006		Bay hotplug request (hint to power up SATA link when
-		the optical drive tray is ejected)
-0x4003		Undocked (see 0x2x04), can sleep again
-0x4010		Docked into hotplug port replicator (non-ACPI dock)
-0x4011		Undocked from hotplug port replicator (non-ACPI dock)
-0x500B		Tablet pen inserted into its storage bay
-0x500C		Tablet pen removed from its storage bay
-0x6011		ALARM: battery is too hot
-0x6012		ALARM: battery is extremely hot
-0x6021		ALARM: a sensor is too hot
-0x6022		ALARM: a sensor is extremely hot
-0x6030		System thermal table changed
-0x6032		Thermal Control command set completion  (DYTC, Windows)
-0x6040		Nvidia Optimus/AC adapter related (TO BE VERIFIED)
-0x60C0		X1 Yoga 2016, Tablet mode status changed
-0x60F0		Thermal Transformation changed (GMTS, Windows)
-
-Battery nearly empty alarms are a last resort attempt to get the
-operating system to hibernate or shutdown cleanly (0x2313), or shutdown
-cleanly (0x2413) before power is lost.  They must be acted upon, as the
-wake up caused by the firmware will have negated most safety nets...
-
-When any of the "too hot" alarms happen, according to Lenovo the user
-should suspend or hibernate the laptop (and in the case of battery
-alarms, unplug the AC adapter) to let it cool down.  These alarms do
-signal that something is wrong, they should never happen on normal
-operating conditions.
-
-The "extremely hot" alarms are emergencies.  According to Lenovo, the
-operating system is to force either an immediate suspend or hibernate
-cycle, or a system shutdown.  Obviously, something is very wrong if this
-happens.
-
-
-Brightness hotkey notes:
-
-Don't mess with the brightness hotkeys in a Thinkpad.  If you want
-notifications for OSD, use the sysfs backlight class event support.
-
-The driver will issue KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN events
-automatically for the cases were userspace has to do something to
-implement brightness changes.  When you override these events, you will
-either fail to handle properly the ThinkPads that require explicit
-action to change backlight brightness, or the ThinkPads that require
-that no action be taken to work properly.
-
-
-Bluetooth
----------
-
-procfs: /proc/acpi/ibm/bluetooth
-sysfs device attribute: bluetooth_enable (deprecated)
-sysfs rfkill class: switch "tpacpi_bluetooth_sw"
-
-This feature shows the presence and current state of a ThinkPad
-Bluetooth device in the internal ThinkPad CDC slot.
-
-If the ThinkPad supports it, the Bluetooth state is stored in NVRAM,
-so it is kept across reboots and power-off.
-
-Procfs notes:
-
-If Bluetooth is installed, the following commands can be used:
-
-	echo enable > /proc/acpi/ibm/bluetooth
-	echo disable > /proc/acpi/ibm/bluetooth
-
-Sysfs notes:
-
-	If the Bluetooth CDC card is installed, it can be enabled /
-	disabled through the "bluetooth_enable" thinkpad-acpi device
-	attribute, and its current status can also be queried.
-
-	enable:
-		0: disables Bluetooth / Bluetooth is disabled
-		1: enables Bluetooth / Bluetooth is enabled.
-
-	Note: this interface has been superseded by the	generic rfkill
-	class.  It has been deprecated, and it will be removed in year
-	2010.
-
-	rfkill controller switch "tpacpi_bluetooth_sw": refer to
-	Documentation/rfkill.txt for details.
-
-
-Video output control -- /proc/acpi/ibm/video
---------------------------------------------
-
-This feature allows control over the devices used for video output -
-LCD, CRT or DVI (if available). The following commands are available:
-
-	echo lcd_enable > /proc/acpi/ibm/video
-	echo lcd_disable > /proc/acpi/ibm/video
-	echo crt_enable > /proc/acpi/ibm/video
-	echo crt_disable > /proc/acpi/ibm/video
-	echo dvi_enable > /proc/acpi/ibm/video
-	echo dvi_disable > /proc/acpi/ibm/video
-	echo auto_enable > /proc/acpi/ibm/video
-	echo auto_disable > /proc/acpi/ibm/video
-	echo expand_toggle > /proc/acpi/ibm/video
-	echo video_switch > /proc/acpi/ibm/video
-
-NOTE: Access to this feature is restricted to processes owning the
-CAP_SYS_ADMIN capability for safety reasons, as it can interact badly
-enough with some versions of X.org to crash it.
-
-Each video output device can be enabled or disabled individually.
-Reading /proc/acpi/ibm/video shows the status of each device.
-
-Automatic video switching can be enabled or disabled.  When automatic
-video switching is enabled, certain events (e.g. opening the lid,
-docking or undocking) cause the video output device to change
-automatically. While this can be useful, it also causes flickering
-and, on the X40, video corruption. By disabling automatic switching,
-the flickering or video corruption can be avoided.
-
-The video_switch command cycles through the available video outputs
-(it simulates the behavior of Fn-F7).
-
-Video expansion can be toggled through this feature. This controls
-whether the display is expanded to fill the entire LCD screen when a
-mode with less than full resolution is used. Note that the current
-video expansion status cannot be determined through this feature.
-
-Note that on many models (particularly those using Radeon graphics
-chips) the X driver configures the video card in a way which prevents
-Fn-F7 from working. This also disables the video output switching
-features of this driver, as it uses the same ACPI methods as
-Fn-F7. Video switching on the console should still work.
-
-UPDATE: refer to https://bugs.freedesktop.org/show_bug.cgi?id=2000
-
-
-ThinkLight control
-------------------
-
-procfs: /proc/acpi/ibm/light
-sysfs attributes: as per LED class, for the "tpacpi::thinklight" LED
-
-procfs notes:
-
-The ThinkLight status can be read and set through the procfs interface.  A
-few models which do not make the status available will show the ThinkLight
-status as "unknown". The available commands are:
-
-	echo on  > /proc/acpi/ibm/light
-	echo off > /proc/acpi/ibm/light
-
-sysfs notes:
-
-The ThinkLight sysfs interface is documented by the LED class
-documentation, in Documentation/leds/leds-class.txt.  The ThinkLight LED name
-is "tpacpi::thinklight".
-
-Due to limitations in the sysfs LED class, if the status of the ThinkLight
-cannot be read or if it is unknown, thinkpad-acpi will report it as "off".
-It is impossible to know if the status returned through sysfs is valid.
-
-
-CMOS/UCMS control
------------------
-
-procfs: /proc/acpi/ibm/cmos
-sysfs device attribute: cmos_command
-
-This feature is mostly used internally by the ACPI firmware to keep the legacy
-CMOS NVRAM bits in sync with the current machine state, and to record this
-state so that the ThinkPad will retain such settings across reboots.
-
-Some of these commands actually perform actions in some ThinkPad models, but
-this is expected to disappear more and more in newer models.  As an example, in
-a T43 and in a X40, commands 12 and 13 still control the ThinkLight state for
-real, but commands 0 to 2 don't control the mixer anymore (they have been
-phased out) and just update the NVRAM.
-
-The range of valid cmos command numbers is 0 to 21, but not all have an
-effect and the behavior varies from model to model.  Here is the behavior
-on the X40 (tpb is the ThinkPad Buttons utility):
-
-	0 - Related to "Volume down" key press
-	1 - Related to "Volume up" key press
-	2 - Related to "Mute on" key press
-	3 - Related to "Access IBM" key press
-	4 - Related to "LCD brightness up" key press
-	5 - Related to "LCD brightness down" key press
-	11 - Related to "toggle screen expansion" key press/function
-	12 - Related to "ThinkLight on"
-	13 - Related to "ThinkLight off"
-	14 - Related to "ThinkLight" key press (toggle ThinkLight)
-
-The cmos command interface is prone to firmware split-brain problems, as
-in newer ThinkPads it is just a compatibility layer.  Do not use it, it is
-exported just as a debug tool.
-
-
-LED control
------------
-
-procfs: /proc/acpi/ibm/led
-sysfs attributes: as per LED class, see below for names
-
-Some of the LED indicators can be controlled through this feature.  On
-some older ThinkPad models, it is possible to query the status of the
-LED indicators as well.  Newer ThinkPads cannot query the real status
-of the LED indicators.
-
-Because misuse of the LEDs could induce an unaware user to perform
-dangerous actions (like undocking or ejecting a bay device while the
-buses are still active), or mask an important alarm (such as a nearly
-empty battery, or a broken battery), access to most LEDs is
-restricted.
-
-Unrestricted access to all LEDs requires that thinkpad-acpi be
-compiled with the CONFIG_THINKPAD_ACPI_UNSAFE_LEDS option enabled.
-Distributions must never enable this option.  Individual users that
-are aware of the consequences are welcome to enabling it.
-
-Audio mute and microphone mute LEDs are supported, but currently not
-visible to userspace. They are used by the snd-hda-intel audio driver.
-
-procfs notes:
-
-The available commands are:
-
-	echo '<LED number> on' >/proc/acpi/ibm/led
-	echo '<LED number> off' >/proc/acpi/ibm/led
-	echo '<LED number> blink' >/proc/acpi/ibm/led
-
-The <LED number> range is 0 to 15. The set of LEDs that can be
-controlled varies from model to model. Here is the common ThinkPad
-mapping:
-
-	0 - power
-	1 - battery (orange)
-	2 - battery (green)
-	3 - UltraBase/dock
-	4 - UltraBay
-	5 - UltraBase battery slot
-	6 - (unknown)
-	7 - standby
-	8 - dock status 1
-	9 - dock status 2
-	10, 11 - (unknown)
-	12 - thinkvantage
-	13, 14, 15 - (unknown)
-
-All of the above can be turned on and off and can be made to blink.
-
-sysfs notes:
-
-The ThinkPad LED sysfs interface is described in detail by the LED class
-documentation, in Documentation/leds/leds-class.txt.
-
-The LEDs are named (in LED ID order, from 0 to 12):
-"tpacpi::power", "tpacpi:orange:batt", "tpacpi:green:batt",
-"tpacpi::dock_active", "tpacpi::bay_active", "tpacpi::dock_batt",
-"tpacpi::unknown_led", "tpacpi::standby", "tpacpi::dock_status1",
-"tpacpi::dock_status2", "tpacpi::unknown_led2", "tpacpi::unknown_led3",
-"tpacpi::thinkvantage".
-
-Due to limitations in the sysfs LED class, if the status of the LED
-indicators cannot be read due to an error, thinkpad-acpi will report it as
-a brightness of zero (same as LED off).
-
-If the thinkpad firmware doesn't support reading the current status,
-trying to read the current LED brightness will just return whatever
-brightness was last written to that attribute.
-
-These LEDs can blink using hardware acceleration.  To request that a
-ThinkPad indicator LED should blink in hardware accelerated mode, use the
-"timer" trigger, and leave the delay_on and delay_off parameters set to
-zero (to request hardware acceleration autodetection).
-
-LEDs that are known not to exist in a given ThinkPad model are not
-made available through the sysfs interface.  If you have a dock and you
-notice there are LEDs listed for your ThinkPad that do not exist (and
-are not in the dock), or if you notice that there are missing LEDs,
-a report to ibm-acpi-devel@lists.sourceforge.net is appreciated.
-
-
-ACPI sounds -- /proc/acpi/ibm/beep
-----------------------------------
-
-The BEEP method is used internally by the ACPI firmware to provide
-audible alerts in various situations. This feature allows the same
-sounds to be triggered manually.
-
-The commands are non-negative integer numbers:
-
-	echo <number> >/proc/acpi/ibm/beep
-
-The valid <number> range is 0 to 17. Not all numbers trigger sounds
-and the sounds vary from model to model. Here is the behavior on the
-X40:
-
-	0 - stop a sound in progress (but use 17 to stop 16)
-	2 - two beeps, pause, third beep ("low battery")
-	3 - single beep
-	4 - high, followed by low-pitched beep ("unable")
-	5 - single beep
-	6 - very high, followed by high-pitched beep ("AC/DC")
-	7 - high-pitched beep
-	9 - three short beeps
-	10 - very long beep
-	12 - low-pitched beep
-	15 - three high-pitched beeps repeating constantly, stop with 0
-	16 - one medium-pitched beep repeating constantly, stop with 17
-	17 - stop 16
-
-
-Temperature sensors
--------------------
-
-procfs: /proc/acpi/ibm/thermal
-sysfs device attributes: (hwmon "thinkpad") temp*_input
-
-Most ThinkPads include six or more separate temperature sensors but only
-expose the CPU temperature through the standard ACPI methods.  This
-feature shows readings from up to eight different sensors on older
-ThinkPads, and up to sixteen different sensors on newer ThinkPads.
-
-For example, on the X40, a typical output may be:
-temperatures:   42 42 45 41 36 -128 33 -128
-
-On the T43/p, a typical output may be:
-temperatures:   48 48 36 52 38 -128 31 -128 48 52 48 -128 -128 -128 -128 -128
-
-The mapping of thermal sensors to physical locations varies depending on
-system-board model (and thus, on ThinkPad model).
-
-http://thinkwiki.org/wiki/Thermal_Sensors is a public wiki page that
-tries to track down these locations for various models.
-
-Most (newer?) models seem to follow this pattern:
-
-1:  CPU
-2:  (depends on model)
-3:  (depends on model)
-4:  GPU
-5:  Main battery: main sensor
-6:  Bay battery: main sensor
-7:  Main battery: secondary sensor
-8:  Bay battery: secondary sensor
-9-15: (depends on model)
-
-For the R51 (source: Thomas Gruber):
-2:  Mini-PCI
-3:  Internal HDD
-
-For the T43, T43/p (source: Shmidoax/Thinkwiki.org)
-http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_T43.2C_T43p
-2:  System board, left side (near PCMCIA slot), reported as HDAPS temp
-3:  PCMCIA slot
-9:  MCH (northbridge) to DRAM Bus
-10: Clock-generator, mini-pci card and ICH (southbridge), under Mini-PCI
-    card, under touchpad
-11: Power regulator, underside of system board, below F2 key
-
-The A31 has a very atypical layout for the thermal sensors
-(source: Milos Popovic, http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_A31)
-1:  CPU
-2:  Main Battery: main sensor
-3:  Power Converter
-4:  Bay Battery: main sensor
-5:  MCH (northbridge)
-6:  PCMCIA/ambient
-7:  Main Battery: secondary sensor
-8:  Bay Battery: secondary sensor
-
-
-Procfs notes:
-	Readings from sensors that are not available return -128.
-	No commands can be written to this file.
-
-Sysfs notes:
-	Sensors that are not available return the ENXIO error.  This
-	status may change at runtime, as there are hotplug thermal
-	sensors, like those inside the batteries and docks.
-
-	thinkpad-acpi thermal sensors are reported through the hwmon
-	subsystem, and follow all of the hwmon guidelines at
-	Documentation/hwmon.
-
-EXPERIMENTAL: Embedded controller register dump
------------------------------------------------
-
-This feature is not included in the thinkpad driver anymore.
-Instead the EC can be accessed through /sys/kernel/debug/ec with
-a userspace tool which can be found here:
-ftp://ftp.suse.com/pub/people/trenn/sources/ec
-
-Use it to determine the register holding the fan
-speed on some models. To do that, do the following:
-	- make sure the battery is fully charged
-	- make sure the fan is running
-	- use above mentioned tool to read out the EC
-
-Often fan and temperature values vary between
-readings. Since temperatures don't change vary fast, you can take
-several quick dumps to eliminate them.
-
-You can use a similar method to figure out the meaning of other
-embedded controller registers - e.g. make sure nothing else changes
-except the charging or discharging battery to determine which
-registers contain the current battery capacity, etc. If you experiment
-with this, do send me your results (including some complete dumps with
-a description of the conditions when they were taken.)
-
-
-LCD brightness control
-----------------------
-
-procfs: /proc/acpi/ibm/brightness
-sysfs backlight device "thinkpad_screen"
-
-This feature allows software control of the LCD brightness on ThinkPad
-models which don't have a hardware brightness slider.
-
-It has some limitations: the LCD backlight cannot be actually turned
-on or off by this interface, it just controls the backlight brightness
-level.
-
-On IBM (and some of the earlier Lenovo) ThinkPads, the backlight control
-has eight brightness levels, ranging from 0 to 7.  Some of the levels
-may not be distinct.  Later Lenovo models that implement the ACPI
-display backlight brightness control methods have 16 levels, ranging
-from 0 to 15.
-
-For IBM ThinkPads, there are two interfaces to the firmware for direct
-brightness control, EC and UCMS (or CMOS).  To select which one should be
-used, use the brightness_mode module parameter: brightness_mode=1 selects
-EC mode, brightness_mode=2 selects UCMS mode, brightness_mode=3 selects EC
-mode with NVRAM backing (so that brightness changes are remembered across
-shutdown/reboot).
-
-The driver tries to select which interface to use from a table of
-defaults for each ThinkPad model.  If it makes a wrong choice, please
-report this as a bug, so that we can fix it.
-
-Lenovo ThinkPads only support brightness_mode=2 (UCMS).
-
-When display backlight brightness controls are available through the
-standard ACPI interface, it is best to use it instead of this direct
-ThinkPad-specific interface.  The driver will disable its native
-backlight brightness control interface if it detects that the standard
-ACPI interface is available in the ThinkPad.
-
-If you want to use the thinkpad-acpi backlight brightness control
-instead of the generic ACPI video backlight brightness control for some
-reason, you should use the acpi_backlight=vendor kernel parameter.
-
-The brightness_enable module parameter can be used to control whether
-the LCD brightness control feature will be enabled when available.
-brightness_enable=0 forces it to be disabled.  brightness_enable=1
-forces it to be enabled when available, even if the standard ACPI
-interface is also available.
-
-Procfs notes:
-
-	The available commands are:
-
-	echo up   >/proc/acpi/ibm/brightness
-	echo down >/proc/acpi/ibm/brightness
-	echo 'level <level>' >/proc/acpi/ibm/brightness
-
-Sysfs notes:
-
-The interface is implemented through the backlight sysfs class, which is
-poorly documented at this time.
-
-Locate the thinkpad_screen device under /sys/class/backlight, and inside
-it there will be the following attributes:
-
-	max_brightness:
-		Reads the maximum brightness the hardware can be set to.
-		The minimum is always zero.
-
-	actual_brightness:
-		Reads what brightness the screen is set to at this instant.
-
-	brightness:
-		Writes request the driver to change brightness to the
-		given value.  Reads will tell you what brightness the
-		driver is trying to set the display to when "power" is set
-		to zero and the display has not been dimmed by a kernel
-		power management event.
-
-	power:
-		power management mode, where 0 is "display on", and 1 to 3
-		will dim the display backlight to brightness level 0
-		because thinkpad-acpi cannot really turn the backlight
-		off.  Kernel power management events can temporarily
-		increase the current power management level, i.e. they can
-		dim the display.
-
-
-WARNING:
-
-    Whatever you do, do NOT ever call thinkpad-acpi backlight-level change
-    interface and the ACPI-based backlight level change interface
-    (available on newer BIOSes, and driven by the Linux ACPI video driver)
-    at the same time.  The two will interact in bad ways, do funny things,
-    and maybe reduce the life of the backlight lamps by needlessly kicking
-    its level up and down at every change.
-
-
-Volume control (Console Audio control)
---------------------------------------
-
-procfs: /proc/acpi/ibm/volume
-ALSA: "ThinkPad Console Audio Control", default ID: "ThinkPadEC"
-
-NOTE: by default, the volume control interface operates in read-only
-mode, as it is supposed to be used for on-screen-display purposes.
-The read/write mode can be enabled through the use of the
-"volume_control=1" module parameter.
-
-NOTE: distros are urged to not enable volume_control by default, this
-should be done by the local admin only.  The ThinkPad UI is for the
-console audio control to be done through the volume keys only, and for
-the desktop environment to just provide on-screen-display feedback.
-Software volume control should be done only in the main AC97/HDA
-mixer.
-
-
-About the ThinkPad Console Audio control:
-
-ThinkPads have a built-in amplifier and muting circuit that drives the
-console headphone and speakers.  This circuit is after the main AC97
-or HDA mixer in the audio path, and under exclusive control of the
-firmware.
-
-ThinkPads have three special hotkeys to interact with the console
-audio control: volume up, volume down and mute.
-
-It is worth noting that the normal way the mute function works (on
-ThinkPads that do not have a "mute LED") is:
-
-1. Press mute to mute.  It will *always* mute, you can press it as
-   many times as you want, and the sound will remain mute.
-
-2. Press either volume key to unmute the ThinkPad (it will _not_
-   change the volume, it will just unmute).
-
-This is a very superior design when compared to the cheap software-only
-mute-toggle solution found on normal consumer laptops:  you can be
-absolutely sure the ThinkPad will not make noise if you press the mute
-button, no matter the previous state.
-
-The IBM ThinkPads, and the earlier Lenovo ThinkPads have variable-gain
-amplifiers driving the speakers and headphone output, and the firmware
-also handles volume control for the headphone and speakers on these
-ThinkPads without any help from the operating system (this volume
-control stage exists after the main AC97 or HDA mixer in the audio
-path).
-
-The newer Lenovo models only have firmware mute control, and depend on
-the main HDA mixer to do volume control (which is done by the operating
-system).  In this case, the volume keys are filtered out for unmute
-key press (there are some firmware bugs in this area) and delivered as
-normal key presses to the operating system (thinkpad-acpi is not
-involved).
-
-
-The ThinkPad-ACPI volume control:
-
-The preferred way to interact with the Console Audio control is the
-ALSA interface.
-
-The legacy procfs interface allows one to read the current state,
-and if volume control is enabled, accepts the following commands:
-
-	echo up   >/proc/acpi/ibm/volume
-	echo down >/proc/acpi/ibm/volume
-	echo mute >/proc/acpi/ibm/volume
-	echo unmute >/proc/acpi/ibm/volume
-	echo 'level <level>' >/proc/acpi/ibm/volume
-
-The <level> number range is 0 to 14 although not all of them may be
-distinct. To unmute the volume after the mute command, use either the
-up or down command (the level command will not unmute the volume), or
-the unmute command.
-
-You can use the volume_capabilities parameter to tell the driver
-whether your thinkpad has volume control or mute-only control:
-volume_capabilities=1 for mixers with mute and volume control,
-volume_capabilities=2 for mixers with only mute control.
-
-If the driver misdetects the capabilities for your ThinkPad model,
-please report this to ibm-acpi-devel@lists.sourceforge.net, so that we
-can update the driver.
-
-There are two strategies for volume control.  To select which one
-should be used, use the volume_mode module parameter: volume_mode=1
-selects EC mode, and volume_mode=3 selects EC mode with NVRAM backing
-(so that volume/mute changes are remembered across shutdown/reboot).
-
-The driver will operate in volume_mode=3 by default. If that does not
-work well on your ThinkPad model, please report this to
-ibm-acpi-devel@lists.sourceforge.net.
-
-The driver supports the standard ALSA module parameters.  If the ALSA
-mixer is disabled, the driver will disable all volume functionality.
-
-
-Fan control and monitoring: fan speed, fan enable/disable
----------------------------------------------------------
-
-procfs: /proc/acpi/ibm/fan
-sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1,
-			  pwm1_enable, fan2_input
-sysfs hwmon driver attributes: fan_watchdog
-
-NOTE NOTE NOTE: fan control operations are disabled by default for
-safety reasons.  To enable them, the module parameter "fan_control=1"
-must be given to thinkpad-acpi.
-
-This feature attempts to show the current fan speed, control mode and
-other fan data that might be available.  The speed is read directly
-from the hardware registers of the embedded controller.  This is known
-to work on later R, T, X and Z series ThinkPads but may show a bogus
-value on other models.
-
-Some Lenovo ThinkPads support a secondary fan.  This fan cannot be
-controlled separately, it shares the main fan control.
-
-Fan levels:
-
-Most ThinkPad fans work in "levels" at the firmware interface.  Level 0
-stops the fan.  The higher the level, the higher the fan speed, although
-adjacent levels often map to the same fan speed.  7 is the highest
-level, where the fan reaches the maximum recommended speed.
-
-Level "auto" means the EC changes the fan level according to some
-internal algorithm, usually based on readings from the thermal sensors.
-
-There is also a "full-speed" level, also known as "disengaged" level.
-In this level, the EC disables the speed-locked closed-loop fan control,
-and drives the fan as fast as it can go, which might exceed hardware
-limits, so use this level with caution.
-
-The fan usually ramps up or down slowly from one speed to another, and
-it is normal for the EC to take several seconds to react to fan
-commands.  The full-speed level may take up to two minutes to ramp up to
-maximum speed, and in some ThinkPads, the tachometer readings go stale
-while the EC is transitioning to the full-speed level.
-
-WARNING WARNING WARNING: do not leave the fan disabled unless you are
-monitoring all of the temperature sensor readings and you are ready to
-enable it if necessary to avoid overheating.
-
-An enabled fan in level "auto" may stop spinning if the EC decides the
-ThinkPad is cool enough and doesn't need the extra airflow.  This is
-normal, and the EC will spin the fan up if the various thermal readings
-rise too much.
-
-On the X40, this seems to depend on the CPU and HDD temperatures.
-Specifically, the fan is turned on when either the CPU temperature
-climbs to 56 degrees or the HDD temperature climbs to 46 degrees.  The
-fan is turned off when the CPU temperature drops to 49 degrees and the
-HDD temperature drops to 41 degrees.  These thresholds cannot
-currently be controlled.
-
-The ThinkPad's ACPI DSDT code will reprogram the fan on its own when
-certain conditions are met.  It will override any fan programming done
-through thinkpad-acpi.
-
-The thinkpad-acpi kernel driver can be programmed to revert the fan
-level to a safe setting if userspace does not issue one of the procfs
-fan commands: "enable", "disable", "level" or "watchdog", or if there
-are no writes to pwm1_enable (or to pwm1 *if and only if* pwm1_enable is
-set to 1, manual mode) within a configurable amount of time of up to
-120 seconds.  This functionality is called fan safety watchdog.
-
-Note that the watchdog timer stops after it enables the fan.  It will be
-rearmed again automatically (using the same interval) when one of the
-above mentioned fan commands is received.  The fan watchdog is,
-therefore, not suitable to protect against fan mode changes made through
-means other than the "enable", "disable", and "level" procfs fan
-commands, or the hwmon fan control sysfs interface.
-
-Procfs notes:
-
-The fan may be enabled or disabled with the following commands:
-
-	echo enable  >/proc/acpi/ibm/fan
-	echo disable >/proc/acpi/ibm/fan
-
-Placing a fan on level 0 is the same as disabling it.  Enabling a fan
-will try to place it in a safe level if it is too slow or disabled.
-
-The fan level can be controlled with the command:
-
-	echo 'level <level>' > /proc/acpi/ibm/fan
-
-Where <level> is an integer from 0 to 7, or one of the words "auto" or
-"full-speed" (without the quotes).  Not all ThinkPads support the "auto"
-and "full-speed" levels.  The driver accepts "disengaged" as an alias for
-"full-speed", and reports it as "disengaged" for backwards
-compatibility.
-
-On the X31 and X40 (and ONLY on those models), the fan speed can be
-controlled to a certain degree.  Once the fan is running, it can be
-forced to run faster or slower with the following command:
-
-	echo 'speed <speed>' > /proc/acpi/ibm/fan
-
-The sustainable range of fan speeds on the X40 appears to be from about
-3700 to about 7350. Values outside this range either do not have any
-effect or the fan speed eventually settles somewhere in that range.  The
-fan cannot be stopped or started with this command.  This functionality
-is incomplete, and not available through the sysfs interface.
-
-To program the safety watchdog, use the "watchdog" command.
-
-	echo 'watchdog <interval in seconds>' > /proc/acpi/ibm/fan
-
-If you want to disable the watchdog, use 0 as the interval.
-
-Sysfs notes:
-
-The sysfs interface follows the hwmon subsystem guidelines for the most
-part, and the exception is the fan safety watchdog.
-
-Writes to any of the sysfs attributes may return the EINVAL error if
-that operation is not supported in a given ThinkPad or if the parameter
-is out-of-bounds, and EPERM if it is forbidden.  They may also return
-EINTR (interrupted system call), and EIO (I/O error while trying to talk
-to the firmware).
-
-Features not yet implemented by the driver return ENOSYS.
-
-hwmon device attribute pwm1_enable:
-	0: PWM offline (fan is set to full-speed mode)
-	1: Manual PWM control (use pwm1 to set fan level)
-	2: Hardware PWM control (EC "auto" mode)
-	3: reserved (Software PWM control, not implemented yet)
-
-	Modes 0 and 2 are not supported by all ThinkPads, and the
-	driver is not always able to detect this.  If it does know a
-	mode is unsupported, it will return -EINVAL.
-
-hwmon device attribute pwm1:
-	Fan level, scaled from the firmware values of 0-7 to the hwmon
-	scale of 0-255.  0 means fan stopped, 255 means highest normal
-	speed (level 7).
-
-	This attribute only commands the fan if pmw1_enable is set to 1
-	(manual PWM control).
-
-hwmon device attribute fan1_input:
-	Fan tachometer reading, in RPM.  May go stale on certain
-	ThinkPads while the EC transitions the PWM to offline mode,
-	which can take up to two minutes.  May return rubbish on older
-	ThinkPads.
-
-hwmon device attribute fan2_input:
-	Fan tachometer reading, in RPM, for the secondary fan.
-	Available only on some ThinkPads.  If the secondary fan is
-	not installed, will always read 0.
-
-hwmon driver attribute fan_watchdog:
-	Fan safety watchdog timer interval, in seconds.  Minimum is
-	1 second, maximum is 120 seconds.  0 disables the watchdog.
-
-To stop the fan: set pwm1 to zero, and pwm1_enable to 1.
-
-To start the fan in a safe mode: set pwm1_enable to 2.  If that fails
-with EINVAL, try to set pwm1_enable to 1 and pwm1 to at least 128 (255
-would be the safest choice, though).
-
-
-WAN
----
-
-procfs: /proc/acpi/ibm/wan
-sysfs device attribute: wwan_enable (deprecated)
-sysfs rfkill class: switch "tpacpi_wwan_sw"
-
-This feature shows the presence and current state of the built-in
-Wireless WAN device.
-
-If the ThinkPad supports it, the WWAN state is stored in NVRAM,
-so it is kept across reboots and power-off.
-
-It was tested on a Lenovo ThinkPad X60. It should probably work on other
-ThinkPad models which come with this module installed.
-
-Procfs notes:
-
-If the W-WAN card is installed, the following commands can be used:
-
-	echo enable > /proc/acpi/ibm/wan
-	echo disable > /proc/acpi/ibm/wan
-
-Sysfs notes:
-
-	If the W-WAN card is installed, it can be enabled /
-	disabled through the "wwan_enable" thinkpad-acpi device
-	attribute, and its current status can also be queried.
-
-	enable:
-		0: disables WWAN card / WWAN card is disabled
-		1: enables WWAN card / WWAN card is enabled.
-
-	Note: this interface has been superseded by the	generic rfkill
-	class.  It has been deprecated, and it will be removed in year
-	2010.
-
-	rfkill controller switch "tpacpi_wwan_sw": refer to
-	Documentation/rfkill.txt for details.
-
-
-EXPERIMENTAL: UWB
------------------
-
-This feature is considered EXPERIMENTAL because it has not been extensively
-tested and validated in various ThinkPad models yet.  The feature may not
-work as expected. USE WITH CAUTION! To use this feature, you need to supply
-the experimental=1 parameter when loading the module.
-
-sysfs rfkill class: switch "tpacpi_uwb_sw"
-
-This feature exports an rfkill controller for the UWB device, if one is
-present and enabled in the BIOS.
-
-Sysfs notes:
-
-	rfkill controller switch "tpacpi_uwb_sw": refer to
-	Documentation/rfkill.txt for details.
-
-Adaptive keyboard
------------------
-
-sysfs device attribute: adaptive_kbd_mode
-
-This sysfs attribute controls the keyboard "face" that will be shown on the
-Lenovo X1 Carbon 2nd gen (2014)'s adaptive keyboard. The value can be read
-and set.
-
-1 = Home mode
-2 = Web-browser mode
-3 = Web-conference mode
-4 = Function mode
-5 = Layflat mode
-
-For more details about which buttons will appear depending on the mode, please
-review the laptop's user guide:
-http://www.lenovo.com/shop/americas/content/user_guides/x1carbon_2_ug_en.pdf
-
-Multiple Commands, Module Parameters
-------------------------------------
-
-Multiple commands can be written to the proc files in one shot by
-separating them with commas, for example:
-
-	echo enable,0xffff > /proc/acpi/ibm/hotkey
-	echo lcd_disable,crt_enable > /proc/acpi/ibm/video
-
-Commands can also be specified when loading the thinkpad-acpi module,
-for example:
-
-	modprobe thinkpad_acpi hotkey=enable,0xffff video=auto_disable
-
-
-Enabling debugging output
--------------------------
-
-The module takes a debug parameter which can be used to selectively
-enable various classes of debugging output, for example:
-
-	 modprobe thinkpad_acpi debug=0xffff
-
-will enable all debugging output classes.  It takes a bitmask, so
-to enable more than one output class, just add their values.
-
-	Debug bitmask		Description
-	0x8000			Disclose PID of userspace programs
-				accessing some functions of the driver
-	0x0001			Initialization and probing
-	0x0002			Removal
-	0x0004			RF Transmitter control (RFKILL)
-				(bluetooth, WWAN, UWB...)
-	0x0008			HKEY event interface, hotkeys
-	0x0010			Fan control
-	0x0020			Backlight brightness
-	0x0040			Audio mixer/volume control
-
-There is also a kernel build option to enable more debugging
-information, which may be necessary to debug driver problems.
-
-The level of debugging information output by the driver can be changed
-at runtime through sysfs, using the driver attribute debug_level.  The
-attribute takes the same bitmask as the debug module parameter above.
-
-
-Force loading of module
------------------------
-
-If thinkpad-acpi refuses to detect your ThinkPad, you can try to specify
-the module parameter force_load=1.  Regardless of whether this works or
-not, please contact ibm-acpi-devel@lists.sourceforge.net with a report.
-
-
-Sysfs interface changelog:
-
-0x000100:	Initial sysfs support, as a single platform driver and
-		device.
-0x000200:	Hot key support for 32 hot keys, and radio slider switch
-		support.
-0x010000:	Hot keys are now handled by default over the input
-		layer, the radio switch generates input event EV_RADIO,
-		and the driver enables hot key handling by default in
-		the firmware.
-
-0x020000:	ABI fix: added a separate hwmon platform device and
-		driver, which must be located by name (thinkpad)
-		and the hwmon class for libsensors4 (lm-sensors 3)
-		compatibility.  Moved all hwmon attributes to this
-		new platform device.
-
-0x020100:	Marker for thinkpad-acpi with hot key NVRAM polling
-		support.  If you must, use it to know you should not
-		start a userspace NVRAM poller (allows to detect when
-		NVRAM is compiled out by the user because it is
-		unneeded/undesired in the first place).
-0x020101:	Marker for thinkpad-acpi with hot key NVRAM polling
-		and proper hotkey_mask semantics (version 8 of the
-		NVRAM polling patch).  Some development snapshots of
-		0.18 had an earlier version that did strange things
-		to hotkey_mask.
-
-0x020200:	Add poll()/select() support to the following attributes:
-		hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason
-
-0x020300:	hotkey enable/disable support removed, attributes
-		hotkey_bios_enabled and hotkey_enable deprecated and
-		marked for removal.
-
-0x020400:	Marker for 16 LEDs support.  Also, LEDs that are known
-		to not exist in a given model are not registered with
-		the LED sysfs class anymore.
-
-0x020500:	Updated hotkey driver, hotkey_mask is always available
-		and it is always able to disable hot keys.  Very old
-		thinkpads are properly supported.  hotkey_bios_mask
-		is deprecated and marked for removal.
-
-0x020600:	Marker for backlight change event support.
-
-0x020700:	Support for mute-only mixers.
-		Volume control in read-only mode by default.
-		Marker for ALSA mixer support.
-
-0x030000:	Thermal and fan sysfs attributes were moved to the hwmon
-		device instead of being attached to the backing platform
-		device.
diff --git a/Documentation/laptops/toshiba_haps.txt b/Documentation/laptops/toshiba_haps.txt
deleted file mode 100644
index 0c1d88dedbde..000000000000
--- a/Documentation/laptops/toshiba_haps.txt
+++ /dev/null
@@ -1,76 +0,0 @@
-Kernel driver toshiba_haps
-Toshiba HDD Active Protection Sensor
-====================================
-
-Author: Azael Avalos <coproscefalo@gmail.com>
-
-
-0. Contents
------------
-
-1. Description
-2. Interface
-3. Accelerometer axes
-4. Supported devices
-5. Usage
-
-
-1. Description
---------------
-
-This driver provides support for the accelerometer found in various Toshiba
-laptops, being called "Toshiba HDD Protection - Shock Sensor" officially,
-and detects laptops automatically with this device.
-On Windows, Toshiba provided software monitors this device and provides
-automatic HDD protection (head unload) on sudden moves or harsh vibrations,
-however, this driver only provides a notification via a sysfs file to let
-userspace tools or daemons act accordingly, as well as providing a sysfs
-file to set the desired protection level or sensor sensibility.
-
-
-2. Interface
-------------
-
-This device comes with 3 methods:
-_STA -  Checks existence of the device, returning Zero if the device does not
-	exists or is not supported.
-PTLV -  Sets the desired protection level.
-RSSS -  Shuts down the HDD protection interface for a few seconds,
-	then restores normal operation.
-
-Note:
-The presence of Solid State Drives (SSD) can make this driver to fail loading,
-given the fact that such drives have no movable parts, and thus, not requiring
-any "protection" as well as failing during the evaluation of the _STA method
-found under this device.
-
-
-3. Accelerometer axes
----------------------
-
-This device does not report any axes, however, to query the sensor position
-a couple HCI (Hardware Configuration Interface) calls (0x6D and 0xA6) are
-provided to query such information, handled by the kernel module toshiba_acpi
-since kernel version 3.15.
-
-
-4. Supported devices
---------------------
-
-This driver binds itself to the ACPI device TOS620A, and any Toshiba laptop
-with this device is supported, given the fact that they have the presence of
-conventional HDD and not only SSD, or a combination of both HDD and SSD.
-
-
-5. Usage
---------
-
-The sysfs files under /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS620A:00/ are:
-protection_level - The protection_level is readable and writeable, and
-		   provides a way to let userspace query the current protection
-		   level, as well as set the desired protection level, the
-		   available protection levels are:
-		   0 - Disabled | 1 - Low | 2 - Medium | 3 - High
-reset_protection - The reset_protection entry is writeable only, being "1"
-		   the only parameter it accepts, it is used to trigger
-		   a reset of the protection interface.
diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst
new file mode 100644
index 000000000000..060f4e485897
--- /dev/null
+++ b/Documentation/leds/index.rst
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+LEDs
+====
+
+.. toctree::
+   :maxdepth: 1
+
+   leds-class
+   leds-class-flash
+   ledtrig-oneshot
+   ledtrig-transient
+   ledtrig-usbport
+
+   uleds
+
+   leds-blinkm
+   leds-lm3556
+   leds-lp3944
+   leds-lp5521
+   leds-lp5523
+   leds-lp5562
+   leds-lp55xx
+   leds-mlxcpld
diff --git a/Documentation/leds/leds-blinkm.rst b/Documentation/leds/leds-blinkm.rst
new file mode 100644
index 000000000000..c74b5bc877b1
--- /dev/null
+++ b/Documentation/leds/leds-blinkm.rst
@@ -0,0 +1,84 @@
+==================
+Leds BlinkM driver
+==================
+
+The leds-blinkm driver supports the devices of the BlinkM family.
+
+They are RGB-LED modules driven by a (AT)tiny microcontroller and
+communicate through I2C. The default address of these modules is
+0x09 but this can be changed through a command. By this you could
+dasy-chain up to 127 BlinkMs on an I2C bus.
+
+The device accepts RGB and HSB color values through separate commands.
+Also you can store blinking sequences as "scripts" in
+the controller and run them. Also fading is an option.
+
+The interface this driver provides is 2-fold:
+
+a) LED class interface for use with triggers
+############################################
+
+The registration follows the scheme::
+
+  blinkm-<i2c-bus-nr>-<i2c-device-nr>-<color>
+
+  $ ls -h /sys/class/leds/blinkm-6-*
+  /sys/class/leds/blinkm-6-9-blue:
+  brightness  device  max_brightness  power  subsystem  trigger  uevent
+
+  /sys/class/leds/blinkm-6-9-green:
+  brightness  device  max_brightness  power  subsystem  trigger  uevent
+
+  /sys/class/leds/blinkm-6-9-red:
+  brightness  device  max_brightness  power  subsystem  trigger  uevent
+
+(same is /sys/bus/i2c/devices/6-0009/leds)
+
+We can control the colors separated into red, green and blue and
+assign triggers on each color.
+
+E.g.::
+
+  $ cat blinkm-6-9-blue/brightness
+  05
+
+  $ echo 200 > blinkm-6-9-blue/brightness
+  $
+
+  $ modprobe ledtrig-heartbeat
+  $ echo heartbeat > blinkm-6-9-green/trigger
+  $
+
+
+b) Sysfs group to control rgb, fade, hsb, scripts ...
+#####################################################
+
+This extended interface is available as folder blinkm
+in the sysfs folder of the I2C device.
+E.g. below /sys/bus/i2c/devices/6-0009/blinkm
+
+  $ ls -h /sys/bus/i2c/devices/6-0009/blinkm/
+  blue  green  red  test
+
+Currently supported is just setting red, green, blue
+and a test sequence.
+
+E.g.::
+
+  $ cat *
+  00
+  00
+  00
+  #Write into test to start test sequence!#
+
+  $ echo 1 > test
+  $
+
+  $ echo 255 > red
+  $
+
+
+
+as of 6/2012
+
+dl9pf <at> gmx <dot> de
diff --git a/Documentation/leds/leds-blinkm.txt b/Documentation/leds/leds-blinkm.txt
deleted file mode 100644
index 9dd92f4cf4e1..000000000000
--- a/Documentation/leds/leds-blinkm.txt
+++ /dev/null
@@ -1,80 +0,0 @@
-The leds-blinkm driver supports the devices of the BlinkM family.
-
-They are RGB-LED modules driven by a (AT)tiny microcontroller and
-communicate through I2C. The default address of these modules is
-0x09 but this can be changed through a command. By this you could
-dasy-chain up to 127 BlinkMs on an I2C bus.
-
-The device accepts RGB and HSB color values through separate commands.
-Also you can store blinking sequences as "scripts" in
-the controller and run them. Also fading is an option.
-
-The interface this driver provides is 2-fold:
-
-a) LED class interface for use with triggers
-############################################
-
-The registration follows the scheme:
-blinkm-<i2c-bus-nr>-<i2c-device-nr>-<color>
-
-$ ls -h /sys/class/leds/blinkm-6-*
-/sys/class/leds/blinkm-6-9-blue:
-brightness  device  max_brightness  power  subsystem  trigger  uevent
-
-/sys/class/leds/blinkm-6-9-green:
-brightness  device  max_brightness  power  subsystem  trigger  uevent
-
-/sys/class/leds/blinkm-6-9-red:
-brightness  device  max_brightness  power  subsystem  trigger  uevent
-
-(same is /sys/bus/i2c/devices/6-0009/leds)
-
-We can control the colors separated into red, green and blue and
-assign triggers on each color.
-
-E.g.:
-
-$ cat blinkm-6-9-blue/brightness
-05
-
-$ echo 200 > blinkm-6-9-blue/brightness
-$
-
-$ modprobe ledtrig-heartbeat
-$ echo heartbeat > blinkm-6-9-green/trigger
-$
-
-
-b) Sysfs group to control rgb, fade, hsb, scripts ...
-#####################################################
-
-This extended interface is available as folder blinkm
-in the sysfs folder of the I2C device.
-E.g. below /sys/bus/i2c/devices/6-0009/blinkm
-
-$ ls -h /sys/bus/i2c/devices/6-0009/blinkm/
-blue  green  red  test
-
-Currently supported is just setting red, green, blue
-and a test sequence.
-
-E.g.:
-
-$ cat *
-00
-00
-00
-#Write into test to start test sequence!#
-
-$ echo 1 > test
-$
-
-$ echo 255 > red
-$
-
-
-
-as of 6/2012
-
-dl9pf <at> gmx <dot> de
-
diff --git a/Documentation/leds/leds-class-flash.rst b/Documentation/leds/leds-class-flash.rst
new file mode 100644
index 000000000000..6ec12c5a1a0e
--- /dev/null
+++ b/Documentation/leds/leds-class-flash.rst
@@ -0,0 +1,90 @@
+==============================
+Flash LED handling under Linux
+==============================
+
+Some LED devices provide two modes - torch and flash. In the LED subsystem
+those modes are supported by LED class (see Documentation/leds/leds-class.rst)
+and LED Flash class respectively. The torch mode related features are enabled
+by default and the flash ones only if a driver declares it by setting
+LED_DEV_CAP_FLASH flag.
+
+In order to enable the support for flash LEDs CONFIG_LEDS_CLASS_FLASH symbol
+must be defined in the kernel config. A LED Flash class driver must be
+registered in the LED subsystem with led_classdev_flash_register function.
+
+Following sysfs attributes are exposed for controlling flash LED devices:
+(see Documentation/ABI/testing/sysfs-class-led-flash)
+
+	- flash_brightness
+	- max_flash_brightness
+	- flash_timeout
+	- max_flash_timeout
+	- flash_strobe
+	- flash_fault
+
+
+V4L2 flash wrapper for flash LEDs
+=================================
+
+A LED subsystem driver can be controlled also from the level of VideoForLinux2
+subsystem. In order to enable this CONFIG_V4L2_FLASH_LED_CLASS symbol has to
+be defined in the kernel config.
+
+The driver must call the v4l2_flash_init function to get registered in the
+V4L2 subsystem. The function takes six arguments:
+
+- dev:
+	flash device, e.g. an I2C device
+- of_node:
+	of_node of the LED, may be NULL if the same as device's
+- fled_cdev:
+	LED flash class device to wrap
+- iled_cdev:
+	LED flash class device representing indicator LED associated with
+	fled_cdev, may be NULL
+- ops:
+	V4L2 specific ops
+
+	* external_strobe_set
+		defines the source of the flash LED strobe -
+		V4L2_CID_FLASH_STROBE control or external source, typically
+		a sensor, which makes it possible to synchronise the flash
+		strobe start with exposure start,
+	* intensity_to_led_brightness and led_brightness_to_intensity
+		perform
+		enum led_brightness <-> V4L2 intensity conversion in a device
+		specific manner - they can be used for devices with non-linear
+		LED current scale.
+- config:
+	configuration for V4L2 Flash sub-device
+
+	* dev_name
+		the name of the media entity, unique in the system,
+	* flash_faults
+		bitmask of flash faults that the LED flash class
+		device can report; corresponding LED_FAULT* bit definitions are
+		available in <linux/led-class-flash.h>,
+	* torch_intensity
+		constraints for the LED in TORCH mode
+		in microamperes,
+	* indicator_intensity
+		constraints for the indicator LED
+		in microamperes,
+	* has_external_strobe
+		determines whether the flash strobe source
+		can be switched to external,
+
+On remove the v4l2_flash_release function has to be called, which takes one
+argument - struct v4l2_flash pointer returned previously by v4l2_flash_init.
+This function can be safely called with NULL or error pointer argument.
+
+Please refer to drivers/leds/leds-max77693.c for an exemplary usage of the
+v4l2 flash wrapper.
+
+Once the V4L2 sub-device is registered by the driver which created the Media
+controller device, the sub-device node acts just as a node of a native V4L2
+flash API device would. The calls are simply routed to the LED flash API.
+
+Opening the V4L2 flash sub-device makes the LED subsystem sysfs interface
+unavailable. The interface is re-enabled after the V4L2 flash sub-device
+is closed.
diff --git a/Documentation/leds/leds-class-flash.txt b/Documentation/leds/leds-class-flash.txt
deleted file mode 100644
index 8da3c6f4b60b..000000000000
--- a/Documentation/leds/leds-class-flash.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-
-Flash LED handling under Linux
-==============================
-
-Some LED devices provide two modes - torch and flash. In the LED subsystem
-those modes are supported by LED class (see Documentation/leds/leds-class.txt)
-and LED Flash class respectively. The torch mode related features are enabled
-by default and the flash ones only if a driver declares it by setting
-LED_DEV_CAP_FLASH flag.
-
-In order to enable the support for flash LEDs CONFIG_LEDS_CLASS_FLASH symbol
-must be defined in the kernel config. A LED Flash class driver must be
-registered in the LED subsystem with led_classdev_flash_register function.
-
-Following sysfs attributes are exposed for controlling flash LED devices:
-(see Documentation/ABI/testing/sysfs-class-led-flash)
-	- flash_brightness
-	- max_flash_brightness
-	- flash_timeout
-	- max_flash_timeout
-	- flash_strobe
-	- flash_fault
-
-
-V4L2 flash wrapper for flash LEDs
-=================================
-
-A LED subsystem driver can be controlled also from the level of VideoForLinux2
-subsystem. In order to enable this CONFIG_V4L2_FLASH_LED_CLASS symbol has to
-be defined in the kernel config.
-
-The driver must call the v4l2_flash_init function to get registered in the
-V4L2 subsystem. The function takes six arguments:
-- dev       : flash device, e.g. an I2C device
-- of_node   : of_node of the LED, may be NULL if the same as device's
-- fled_cdev : LED flash class device to wrap
-- iled_cdev : LED flash class device representing indicator LED associated with
-	      fled_cdev, may be NULL
-- ops : V4L2 specific ops
-	* external_strobe_set - defines the source of the flash LED strobe -
-		V4L2_CID_FLASH_STROBE control or external source, typically
-		a sensor, which makes it possible to synchronise the flash
-		strobe start with exposure start,
-	* intensity_to_led_brightness and led_brightness_to_intensity - perform
-		enum led_brightness <-> V4L2 intensity conversion in a device
-		specific manner - they can be used for devices with non-linear
-		LED current scale.
-- config : configuration for V4L2 Flash sub-device
-	* dev_name - the name of the media entity, unique in the system,
-	* flash_faults - bitmask of flash faults that the LED flash class
-		device can report; corresponding LED_FAULT* bit definitions are
-		available in <linux/led-class-flash.h>,
-	* torch_intensity - constraints for the LED in TORCH mode
-		in microamperes,
-	* indicator_intensity - constraints for the indicator LED
-		in microamperes,
-	* has_external_strobe - determines whether the flash strobe source
-		can be switched to external,
-
-On remove the v4l2_flash_release function has to be called, which takes one
-argument - struct v4l2_flash pointer returned previously by v4l2_flash_init.
-This function can be safely called with NULL or error pointer argument.
-
-Please refer to drivers/leds/leds-max77693.c for an exemplary usage of the
-v4l2 flash wrapper.
-
-Once the V4L2 sub-device is registered by the driver which created the Media
-controller device, the sub-device node acts just as a node of a native V4L2
-flash API device would. The calls are simply routed to the LED flash API.
-
-Opening the V4L2 flash sub-device makes the LED subsystem sysfs interface
-unavailable. The interface is re-enabled after the V4L2 flash sub-device
-is closed.
diff --git a/Documentation/leds/leds-class.rst b/Documentation/leds/leds-class.rst
new file mode 100644
index 000000000000..df0120a1ee3c
--- /dev/null
+++ b/Documentation/leds/leds-class.rst
@@ -0,0 +1,125 @@
+========================
+LED handling under Linux
+========================
+
+In its simplest form, the LED class just allows control of LEDs from
+userspace. LEDs appear in /sys/class/leds/. The maximum brightness of the
+LED is defined in max_brightness file. The brightness file will set the brightness
+of the LED (taking a value 0-max_brightness). Most LEDs don't have hardware
+brightness support so will just be turned on for non-zero brightness settings.
+
+The class also introduces the optional concept of an LED trigger. A trigger
+is a kernel based source of led events. Triggers can either be simple or
+complex. A simple trigger isn't configurable and is designed to slot into
+existing subsystems with minimal additional code. Examples are the disk-activity,
+nand-disk and sharpsl-charge triggers. With led triggers disabled, the code
+optimises away.
+
+Complex triggers while available to all LEDs have LED specific
+parameters and work on a per LED basis. The timer trigger is an example.
+The timer trigger will periodically change the LED brightness between
+LED_OFF and the current brightness setting. The "on" and "off" time can
+be specified via /sys/class/leds/<device>/delay_{on,off} in milliseconds.
+You can change the brightness value of a LED independently of the timer
+trigger. However, if you set the brightness value to LED_OFF it will
+also disable the timer trigger.
+
+You can change triggers in a similar manner to the way an IO scheduler
+is chosen (via /sys/class/leds/<device>/trigger). Trigger specific
+parameters can appear in /sys/class/leds/<device> once a given trigger is
+selected.
+
+
+Design Philosophy
+=================
+
+The underlying design philosophy is simplicity. LEDs are simple devices
+and the aim is to keep a small amount of code giving as much functionality
+as possible.  Please keep this in mind when suggesting enhancements.
+
+
+LED Device Naming
+=================
+
+Is currently of the form:
+
+	"devicename:colour:function"
+
+There have been calls for LED properties such as colour to be exported as
+individual led class attributes. As a solution which doesn't incur as much
+overhead, I suggest these become part of the device name. The naming scheme
+above leaves scope for further attributes should they be needed. If sections
+of the name don't apply, just leave that section blank.
+
+
+Brightness setting API
+======================
+
+LED subsystem core exposes following API for setting brightness:
+
+    - led_set_brightness:
+		it is guaranteed not to sleep, passing LED_OFF stops
+		blinking,
+
+    - led_set_brightness_sync:
+		for use cases when immediate effect is desired -
+		it can block the caller for the time required for accessing
+		device registers and can sleep, passing LED_OFF stops hardware
+		blinking, returns -EBUSY if software blink fallback is enabled.
+
+
+LED registration API
+====================
+
+A driver wanting to register a LED classdev for use by other drivers /
+userspace needs to allocate and fill a led_classdev struct and then call
+`[devm_]led_classdev_register`. If the non devm version is used the driver
+must call led_classdev_unregister from its remove function before
+free-ing the led_classdev struct.
+
+If the driver can detect hardware initiated brightness changes and thus
+wants to have a brightness_hw_changed attribute then the LED_BRIGHT_HW_CHANGED
+flag must be set in flags before registering. Calling
+led_classdev_notify_brightness_hw_changed on a classdev not registered with
+the LED_BRIGHT_HW_CHANGED flag is a bug and will trigger a WARN_ON.
+
+Hardware accelerated blink of LEDs
+==================================
+
+Some LEDs can be programmed to blink without any CPU interaction. To
+support this feature, a LED driver can optionally implement the
+blink_set() function (see <linux/leds.h>). To set an LED to blinking,
+however, it is better to use the API function led_blink_set(), as it
+will check and implement software fallback if necessary.
+
+To turn off blinking, use the API function led_brightness_set()
+with brightness value LED_OFF, which should stop any software
+timers that may have been required for blinking.
+
+The blink_set() function should choose a user friendly blinking value
+if it is called with `*delay_on==0` && `*delay_off==0` parameters. In this
+case the driver should give back the chosen value through delay_on and
+delay_off parameters to the leds subsystem.
+
+Setting the brightness to zero with brightness_set() callback function
+should completely turn off the LED and cancel the previously programmed
+hardware blinking function, if any.
+
+
+Known Issues
+============
+
+The LED Trigger core cannot be a module as the simple trigger functions
+would cause nightmare dependency issues. I see this as a minor issue
+compared to the benefits the simple trigger functionality brings. The
+rest of the LED subsystem can be modular.
+
+
+Future Development
+==================
+
+At the moment, a trigger can't be created specifically for a single LED.
+There are a number of cases where a trigger might only be mappable to a
+particular LED (ACPI?). The addition of triggers provided by the LED driver
+should cover this option and be possible to add without breaking the
+current interface.
diff --git a/Documentation/leds/leds-class.txt b/Documentation/leds/leds-class.txt
deleted file mode 100644
index 8b39cc6b03ee..000000000000
--- a/Documentation/leds/leds-class.txt
+++ /dev/null
@@ -1,122 +0,0 @@
-
-LED handling under Linux
-========================
-
-In its simplest form, the LED class just allows control of LEDs from
-userspace. LEDs appear in /sys/class/leds/. The maximum brightness of the
-LED is defined in max_brightness file. The brightness file will set the brightness
-of the LED (taking a value 0-max_brightness). Most LEDs don't have hardware
-brightness support so will just be turned on for non-zero brightness settings.
-
-The class also introduces the optional concept of an LED trigger. A trigger
-is a kernel based source of led events. Triggers can either be simple or
-complex. A simple trigger isn't configurable and is designed to slot into
-existing subsystems with minimal additional code. Examples are the disk-activity,
-nand-disk and sharpsl-charge triggers. With led triggers disabled, the code
-optimises away.
-
-Complex triggers while available to all LEDs have LED specific
-parameters and work on a per LED basis. The timer trigger is an example.
-The timer trigger will periodically change the LED brightness between
-LED_OFF and the current brightness setting. The "on" and "off" time can
-be specified via /sys/class/leds/<device>/delay_{on,off} in milliseconds.
-You can change the brightness value of a LED independently of the timer
-trigger. However, if you set the brightness value to LED_OFF it will
-also disable the timer trigger.
-
-You can change triggers in a similar manner to the way an IO scheduler
-is chosen (via /sys/class/leds/<device>/trigger). Trigger specific
-parameters can appear in /sys/class/leds/<device> once a given trigger is
-selected.
-
-
-Design Philosophy
-=================
-
-The underlying design philosophy is simplicity. LEDs are simple devices
-and the aim is to keep a small amount of code giving as much functionality
-as possible.  Please keep this in mind when suggesting enhancements.
-
-
-LED Device Naming
-=================
-
-Is currently of the form:
-
-"devicename:colour:function"
-
-There have been calls for LED properties such as colour to be exported as
-individual led class attributes. As a solution which doesn't incur as much
-overhead, I suggest these become part of the device name. The naming scheme
-above leaves scope for further attributes should they be needed. If sections
-of the name don't apply, just leave that section blank.
-
-
-Brightness setting API
-======================
-
-LED subsystem core exposes following API for setting brightness:
-
-    - led_set_brightness : it is guaranteed not to sleep, passing LED_OFF stops
-		blinking,
-    - led_set_brightness_sync : for use cases when immediate effect is desired -
-		it can block the caller for the time required for accessing
-		device registers and can sleep, passing LED_OFF stops hardware
-		blinking, returns -EBUSY if software blink fallback is enabled.
-
-
-LED registration API
-====================
-
-A driver wanting to register a LED classdev for use by other drivers /
-userspace needs to allocate and fill a led_classdev struct and then call
-[devm_]led_classdev_register. If the non devm version is used the driver
-must call led_classdev_unregister from its remove function before
-free-ing the led_classdev struct.
-
-If the driver can detect hardware initiated brightness changes and thus
-wants to have a brightness_hw_changed attribute then the LED_BRIGHT_HW_CHANGED
-flag must be set in flags before registering. Calling
-led_classdev_notify_brightness_hw_changed on a classdev not registered with
-the LED_BRIGHT_HW_CHANGED flag is a bug and will trigger a WARN_ON.
-
-Hardware accelerated blink of LEDs
-==================================
-
-Some LEDs can be programmed to blink without any CPU interaction. To
-support this feature, a LED driver can optionally implement the
-blink_set() function (see <linux/leds.h>). To set an LED to blinking,
-however, it is better to use the API function led_blink_set(), as it
-will check and implement software fallback if necessary.
-
-To turn off blinking, use the API function led_brightness_set()
-with brightness value LED_OFF, which should stop any software
-timers that may have been required for blinking.
-
-The blink_set() function should choose a user friendly blinking value
-if it is called with *delay_on==0 && *delay_off==0 parameters. In this
-case the driver should give back the chosen value through delay_on and
-delay_off parameters to the leds subsystem.
-
-Setting the brightness to zero with brightness_set() callback function
-should completely turn off the LED and cancel the previously programmed
-hardware blinking function, if any.
-
-
-Known Issues
-============
-
-The LED Trigger core cannot be a module as the simple trigger functions
-would cause nightmare dependency issues. I see this as a minor issue
-compared to the benefits the simple trigger functionality brings. The
-rest of the LED subsystem can be modular.
-
-
-Future Development
-==================
-
-At the moment, a trigger can't be created specifically for a single LED.
-There are a number of cases where a trigger might only be mappable to a
-particular LED (ACPI?). The addition of triggers provided by the LED driver
-should cover this option and be possible to add without breaking the
-current interface.
diff --git a/Documentation/leds/leds-lm3556.rst b/Documentation/leds/leds-lm3556.rst
new file mode 100644
index 000000000000..1ef17d7d800e
--- /dev/null
+++ b/Documentation/leds/leds-lm3556.rst
@@ -0,0 +1,137 @@
+========================
+Kernel driver for lm3556
+========================
+
+* Texas Instrument:
+  1.5 A Synchronous Boost LED Flash Driver w/ High-Side Current Source
+* Datasheet: http://www.national.com/ds/LM/LM3556.pdf
+
+Authors:
+      - Daniel Jeong
+
+	Contact:Daniel Jeong(daniel.jeong-at-ti.com, gshark.jeong-at-gmail.com)
+
+Description
+-----------
+There are 3 functions in LM3556, Flash, Torch and Indicator.
+
+Flash Mode
+^^^^^^^^^^
+
+In Flash Mode, the LED current source(LED) provides 16 target current levels
+from 93.75 mA to 1500 mA.The Flash currents are adjusted via the CURRENT
+CONTROL REGISTER(0x09).Flash mode is activated by the ENABLE REGISTER(0x0A),
+or by pulling the STROBE pin HIGH.
+
+LM3556 Flash can be controlled through sys/class/leds/flash/brightness file
+
+* if STROBE pin is enabled, below example control brightness only, and
+  ON / OFF will be controlled by STROBE pin.
+
+Flash Example:
+
+OFF::
+
+	#echo 0 > sys/class/leds/flash/brightness
+
+93.75 mA::
+
+	#echo 1 > sys/class/leds/flash/brightness
+
+...
+
+1500  mA::
+
+	#echo 16 > sys/class/leds/flash/brightness
+
+Torch Mode
+^^^^^^^^^^
+
+In Torch Mode, the current source(LED) is programmed via the CURRENT CONTROL
+REGISTER(0x09).Torch Mode is activated by the ENABLE REGISTER(0x0A) or by the
+hardware TORCH input.
+
+LM3556 torch can be controlled through sys/class/leds/torch/brightness file.
+* if TORCH pin is enabled, below example control brightness only,
+and ON / OFF will be controlled by TORCH pin.
+
+Torch Example:
+
+OFF::
+
+	#echo 0 > sys/class/leds/torch/brightness
+
+46.88 mA::
+
+	#echo 1 > sys/class/leds/torch/brightness
+
+...
+
+375 mA::
+
+	#echo 8 > sys/class/leds/torch/brightness
+
+Indicator Mode
+^^^^^^^^^^^^^^
+
+Indicator pattern can be set through sys/class/leds/indicator/pattern file,
+and 4 patterns are pre-defined in indicator_pattern array.
+
+According to N-lank, Pulse time and N Period values, different pattern wiill
+be generated.If you want new patterns for your own device, change
+indicator_pattern array with your own values and INDIC_PATTERN_SIZE.
+
+Please refer datasheet for more detail about N-Blank, Pulse time and N Period.
+
+Indicator pattern example:
+
+pattern 0::
+
+	#echo 0 > sys/class/leds/indicator/pattern
+
+...
+
+pattern 3::
+
+	#echo 3 > sys/class/leds/indicator/pattern
+
+Indicator brightness can be controlled through
+sys/class/leds/indicator/brightness file.
+
+Example:
+
+OFF::
+
+	#echo 0 > sys/class/leds/indicator/brightness
+
+5.86 mA::
+
+	#echo 1 > sys/class/leds/indicator/brightness
+
+...
+
+46.875mA::
+
+	#echo 8 > sys/class/leds/indicator/brightness
+
+Notes
+-----
+Driver expects it is registered using the i2c_board_info mechanism.
+To register the chip at address 0x63 on specific adapter, set the platform data
+according to include/linux/platform_data/leds-lm3556.h, set the i2c board info
+
+Example::
+
+	static struct i2c_board_info board_i2c_ch4[] __initdata = {
+		{
+			 I2C_BOARD_INFO(LM3556_NAME, 0x63),
+			 .platform_data = &lm3556_pdata,
+		 },
+	};
+
+and register it in the platform init function
+
+Example::
+
+	board_register_i2c_bus(4, 400,
+				board_i2c_ch4, ARRAY_SIZE(board_i2c_ch4));
diff --git a/Documentation/leds/leds-lm3556.txt b/Documentation/leds/leds-lm3556.txt
deleted file mode 100644
index 62278e871b50..000000000000
--- a/Documentation/leds/leds-lm3556.txt
+++ /dev/null
@@ -1,85 +0,0 @@
-Kernel driver for lm3556
-========================
-
-*Texas Instrument:
- 1.5 A Synchronous Boost LED Flash Driver w/ High-Side Current Source
-* Datasheet: http://www.national.com/ds/LM/LM3556.pdf
-
-Authors:
-	Daniel Jeong
-	Contact:Daniel Jeong(daniel.jeong-at-ti.com, gshark.jeong-at-gmail.com)
-
-Description
------------
-There are 3 functions in LM3556, Flash, Torch and Indicator.
-
-FLASH MODE
-In Flash Mode, the LED current source(LED) provides 16 target current levels
-from 93.75 mA to 1500 mA.The Flash currents are adjusted via the CURRENT
-CONTROL REGISTER(0x09).Flash mode is activated by the ENABLE REGISTER(0x0A),
-or by pulling the STROBE pin HIGH.
-LM3556 Flash can be controlled through sys/class/leds/flash/brightness file
-* if STROBE pin is enabled, below example control brightness only, and
-ON / OFF will be controlled by STROBE pin.
-
-Flash Example:
-OFF     : #echo 0 > sys/class/leds/flash/brightness
-93.75 mA: #echo 1 > sys/class/leds/flash/brightness
-... .....
-1500  mA: #echo 16 > sys/class/leds/flash/brightness
-
-TORCH MODE
-In Torch Mode, the current source(LED) is programmed via the CURRENT CONTROL
-REGISTER(0x09).Torch Mode is activated by the ENABLE REGISTER(0x0A) or by the
-hardware TORCH input.
-LM3556 torch can be controlled through sys/class/leds/torch/brightness file.
-* if TORCH pin is enabled, below example control brightness only,
-and ON / OFF will be controlled by TORCH pin.
-
-Torch Example:
-OFF     : #echo 0 > sys/class/leds/torch/brightness
-46.88 mA: #echo 1 > sys/class/leds/torch/brightness
-... .....
-375 mA  : #echo 8 > sys/class/leds/torch/brightness
-
-INDICATOR MODE
-Indicator pattern can be set through sys/class/leds/indicator/pattern file,
-and 4 patterns are pre-defined in indicator_pattern array.
-According to N-lank, Pulse time and N Period values, different pattern wiill
-be generated.If you want new patterns for your own device, change
-indicator_pattern array with your own values and INDIC_PATTERN_SIZE.
-Please refer datasheet for more detail about N-Blank, Pulse time and N Period.
-
-Indicator pattern example:
-pattern 0: #echo 0 > sys/class/leds/indicator/pattern
-....
-pattern 3: #echo 3 > sys/class/leds/indicator/pattern
-
-Indicator brightness can be controlled through
-sys/class/leds/indicator/brightness file.
-
-Example:
-OFF      : #echo 0 > sys/class/leds/indicator/brightness
-5.86 mA  : #echo 1 > sys/class/leds/indicator/brightness
-........
-46.875mA : #echo 8 > sys/class/leds/indicator/brightness
-
-Notes
------
-Driver expects it is registered using the i2c_board_info mechanism.
-To register the chip at address 0x63 on specific adapter, set the platform data
-according to include/linux/platform_data/leds-lm3556.h, set the i2c board info
-
-Example:
-	static struct i2c_board_info board_i2c_ch4[] __initdata = {
-		{
-			 I2C_BOARD_INFO(LM3556_NAME, 0x63),
-			 .platform_data = &lm3556_pdata,
-		 },
-	};
-
-and register it in the platform init function
-
-Example:
-	board_register_i2c_bus(4, 400,
-				board_i2c_ch4, ARRAY_SIZE(board_i2c_ch4));
diff --git a/Documentation/leds/leds-lp3944.rst b/Documentation/leds/leds-lp3944.rst
new file mode 100644
index 000000000000..c2f87dc1a3a9
--- /dev/null
+++ b/Documentation/leds/leds-lp3944.rst
@@ -0,0 +1,59 @@
+====================
+Kernel driver lp3944
+====================
+
+  * National Semiconductor LP3944 Fun-light Chip
+
+    Prefix: 'lp3944'
+
+    Addresses scanned: None (see the Notes section below)
+
+    Datasheet:
+
+	Publicly available at the National Semiconductor website
+	http://www.national.com/pf/LP/LP3944.html
+
+Authors:
+	Antonio Ospite <ospite@studenti.unina.it>
+
+
+Description
+-----------
+The LP3944 is a helper chip that can drive up to 8 leds, with two programmable
+DIM modes; it could even be used as a gpio expander but this driver assumes it
+is used as a led controller.
+
+The DIM modes are used to set _blink_ patterns for leds, the pattern is
+specified supplying two parameters:
+
+  - period:
+	from 0s to 1.6s
+  - duty cycle:
+	percentage of the period the led is on, from 0 to 100
+
+Setting a led in DIM0 or DIM1 mode makes it blink according to the pattern.
+See the datasheet for details.
+
+LP3944 can be found on Motorola A910 smartphone, where it drives the rgb
+leds, the camera flash light and the lcds power.
+
+
+Notes
+-----
+The chip is used mainly in embedded contexts, so this driver expects it is
+registered using the i2c_board_info mechanism.
+
+To register the chip at address 0x60 on adapter 0, set the platform data
+according to include/linux/leds-lp3944.h, set the i2c board info::
+
+	static struct i2c_board_info a910_i2c_board_info[] __initdata = {
+		{
+			I2C_BOARD_INFO("lp3944", 0x60),
+			.platform_data = &a910_lp3944_leds,
+		},
+	};
+
+and register it in the platform init function::
+
+	i2c_register_board_info(0, a910_i2c_board_info,
+			ARRAY_SIZE(a910_i2c_board_info));
diff --git a/Documentation/leds/leds-lp3944.txt b/Documentation/leds/leds-lp3944.txt
deleted file mode 100644
index e88ac3b60c08..000000000000
--- a/Documentation/leds/leds-lp3944.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-Kernel driver lp3944
-====================
-
-  * National Semiconductor LP3944 Fun-light Chip
-    Prefix: 'lp3944'
-    Addresses scanned: None (see the Notes section below)
-    Datasheet: Publicly available at the National Semiconductor website
-               http://www.national.com/pf/LP/LP3944.html
-
-Authors:
-        Antonio Ospite <ospite@studenti.unina.it>
-
-
-Description
------------
-The LP3944 is a helper chip that can drive up to 8 leds, with two programmable
-DIM modes; it could even be used as a gpio expander but this driver assumes it
-is used as a led controller.
-
-The DIM modes are used to set _blink_ patterns for leds, the pattern is
-specified supplying two parameters:
-  - period: from 0s to 1.6s
-  - duty cycle: percentage of the period the led is on, from 0 to 100
-
-Setting a led in DIM0 or DIM1 mode makes it blink according to the pattern.
-See the datasheet for details.
-
-LP3944 can be found on Motorola A910 smartphone, where it drives the rgb
-leds, the camera flash light and the lcds power.
-
-
-Notes
------
-The chip is used mainly in embedded contexts, so this driver expects it is
-registered using the i2c_board_info mechanism.
-
-To register the chip at address 0x60 on adapter 0, set the platform data
-according to include/linux/leds-lp3944.h, set the i2c board info:
-
-	static struct i2c_board_info a910_i2c_board_info[] __initdata = {
-		{
-			I2C_BOARD_INFO("lp3944", 0x60),
-			.platform_data = &a910_lp3944_leds,
-		},
-	};
-
-and register it in the platform init function
-
-	i2c_register_board_info(0, a910_i2c_board_info,
-			ARRAY_SIZE(a910_i2c_board_info));
diff --git a/Documentation/leds/leds-lp5521.rst b/Documentation/leds/leds-lp5521.rst
new file mode 100644
index 000000000000..0432615b083d
--- /dev/null
+++ b/Documentation/leds/leds-lp5521.rst
@@ -0,0 +1,115 @@
+========================
+Kernel driver for lp5521
+========================
+
+* National Semiconductor LP5521 led driver chip
+* Datasheet: http://www.national.com/pf/LP/LP5521.html
+
+Authors: Mathias Nyman, Yuri Zaporozhets, Samu Onkalo
+
+Contact: Samu Onkalo (samu.p.onkalo-at-nokia.com)
+
+Description
+-----------
+
+LP5521 can drive up to 3 channels. Leds can be controlled directly via
+the led class control interface. Channels have generic names:
+lp5521:channelx, where x is 0 .. 2
+
+All three channels can be also controlled using the engine micro programs.
+More details of the instructions can be found from the public data sheet.
+
+LP5521 has the internal program memory for running various LED patterns.
+There are two ways to run LED patterns.
+
+1) Legacy interface - enginex_mode and enginex_load
+   Control interface for the engines:
+
+   x is 1 .. 3
+
+   enginex_mode:
+	disabled, load, run
+   enginex_load:
+	store program (visible only in engine load mode)
+
+  Example (start to blink the channel 2 led)::
+
+	cd   /sys/class/leds/lp5521:channel2/device
+	echo "load" > engine3_mode
+	echo "037f4d0003ff6000" > engine3_load
+	echo "run" > engine3_mode
+
+  To stop the engine::
+
+	echo "disabled" > engine3_mode
+
+2) Firmware interface - LP55xx common interface
+
+For the details, please refer to 'firmware' section in leds-lp55xx.txt
+
+sysfs contains a selftest entry.
+
+The test communicates with the chip and checks that
+the clock mode is automatically set to the requested one.
+
+Each channel has its own led current settings.
+
+- /sys/class/leds/lp5521:channel0/led_current - RW
+- /sys/class/leds/lp5521:channel0/max_current - RO
+
+Format: 10x mA i.e 10 means 1.0 mA
+
+example platform data::
+
+  static struct lp55xx_led_config lp5521_led_config[] = {
+	  {
+		.name = "red",
+		  .chan_nr        = 0,
+		  .led_current    = 50,
+		.max_current    = 130,
+	  }, {
+		.name = "green",
+		  .chan_nr        = 1,
+		  .led_current    = 0,
+		.max_current    = 130,
+	  }, {
+		.name = "blue",
+		  .chan_nr        = 2,
+		  .led_current    = 0,
+		.max_current    = 130,
+	  }
+  };
+
+  static int lp5521_setup(void)
+  {
+	/* setup HW resources */
+  }
+
+  static void lp5521_release(void)
+  {
+	/* Release HW resources */
+  }
+
+  static void lp5521_enable(bool state)
+  {
+	/* Control of chip enable signal */
+  }
+
+  static struct lp55xx_platform_data lp5521_platform_data = {
+	  .led_config     = lp5521_led_config,
+	  .num_channels   = ARRAY_SIZE(lp5521_led_config),
+	  .clock_mode     = LP55XX_CLOCK_EXT,
+	  .setup_resources   = lp5521_setup,
+	  .release_resources = lp5521_release,
+	  .enable            = lp5521_enable,
+  };
+
+Note:
+  chan_nr can have values between 0 and 2.
+  The name of each channel can be configurable.
+  If the name field is not defined, the default name will be set to 'xxxx:channelN'
+  (XXXX : pdata->label or i2c client name, N : channel number)
+
+
+If the current is set to 0 in the platform data, that channel is
+disabled and it is not visible in the sysfs.
diff --git a/Documentation/leds/leds-lp5521.txt b/Documentation/leds/leds-lp5521.txt
deleted file mode 100644
index d08d8c179f85..000000000000
--- a/Documentation/leds/leds-lp5521.txt
+++ /dev/null
@@ -1,101 +0,0 @@
-Kernel driver for lp5521
-========================
-
-* National Semiconductor LP5521 led driver chip
-* Datasheet: http://www.national.com/pf/LP/LP5521.html
-
-Authors: Mathias Nyman, Yuri Zaporozhets, Samu Onkalo
-Contact: Samu Onkalo (samu.p.onkalo-at-nokia.com)
-
-Description
------------
-
-LP5521 can drive up to 3 channels. Leds can be controlled directly via
-the led class control interface. Channels have generic names:
-lp5521:channelx, where x is 0 .. 2
-
-All three channels can be also controlled using the engine micro programs.
-More details of the instructions can be found from the public data sheet.
-
-LP5521 has the internal program memory for running various LED patterns.
-There are two ways to run LED patterns.
-
-1) Legacy interface - enginex_mode and enginex_load
-  Control interface for the engines:
-  x is 1 .. 3
-  enginex_mode : disabled, load, run
-  enginex_load : store program (visible only in engine load mode)
-
-  Example (start to blink the channel 2 led):
-  cd   /sys/class/leds/lp5521:channel2/device
-  echo "load" > engine3_mode
-  echo "037f4d0003ff6000" > engine3_load
-  echo "run" > engine3_mode
-
-  To stop the engine:
-  echo "disabled" > engine3_mode
-
-2) Firmware interface - LP55xx common interface
-  For the details, please refer to 'firmware' section in leds-lp55xx.txt
-
-sysfs contains a selftest entry.
-The test communicates with the chip and checks that
-the clock mode is automatically set to the requested one.
-
-Each channel has its own led current settings.
-/sys/class/leds/lp5521:channel0/led_current - RW
-/sys/class/leds/lp5521:channel0/max_current - RO
-Format: 10x mA i.e 10 means 1.0 mA
-
-example platform data:
-
-Note: chan_nr can have values between 0 and 2.
-The name of each channel can be configurable.
-If the name field is not defined, the default name will be set to 'xxxx:channelN'
-(XXXX : pdata->label or i2c client name, N : channel number)
-
-static struct lp55xx_led_config lp5521_led_config[] = {
-        {
-		.name = "red",
-                .chan_nr        = 0,
-                .led_current    = 50,
-		.max_current    = 130,
-        }, {
-		.name = "green",
-                .chan_nr        = 1,
-                .led_current    = 0,
-		.max_current    = 130,
-        }, {
-		.name = "blue",
-                .chan_nr        = 2,
-                .led_current    = 0,
-		.max_current    = 130,
-        }
-};
-
-static int lp5521_setup(void)
-{
-	/* setup HW resources */
-}
-
-static void lp5521_release(void)
-{
-	/* Release HW resources */
-}
-
-static void lp5521_enable(bool state)
-{
-	/* Control of chip enable signal */
-}
-
-static struct lp55xx_platform_data lp5521_platform_data = {
-        .led_config     = lp5521_led_config,
-        .num_channels   = ARRAY_SIZE(lp5521_led_config),
-        .clock_mode     = LP55XX_CLOCK_EXT,
-        .setup_resources   = lp5521_setup,
-        .release_resources = lp5521_release,
-        .enable            = lp5521_enable,
-};
-
-If the current is set to 0 in the platform data, that channel is
-disabled and it is not visible in the sysfs.
diff --git a/Documentation/leds/leds-lp5523.rst b/Documentation/leds/leds-lp5523.rst
new file mode 100644
index 000000000000..7d7362a1dd57
--- /dev/null
+++ b/Documentation/leds/leds-lp5523.rst
@@ -0,0 +1,147 @@
+========================
+Kernel driver for lp5523
+========================
+
+* National Semiconductor LP5523 led driver chip
+* Datasheet: http://www.national.com/pf/LP/LP5523.html
+
+Authors: Mathias Nyman, Yuri Zaporozhets, Samu Onkalo
+Contact: Samu Onkalo (samu.p.onkalo-at-nokia.com)
+
+Description
+-----------
+LP5523 can drive up to 9 channels. Leds can be controlled directly via
+the led class control interface.
+The name of each channel is configurable in the platform data - name and label.
+There are three options to make the channel name.
+
+a) Define the 'name' in the platform data
+
+To make specific channel name, then use 'name' platform data.
+
+- /sys/class/leds/R1               (name: 'R1')
+- /sys/class/leds/B1               (name: 'B1')
+
+b) Use the 'label' with no 'name' field
+
+For one device name with channel number, then use 'label'.
+- /sys/class/leds/RGB:channelN     (label: 'RGB', N: 0 ~ 8)
+
+c) Default
+
+If both fields are NULL, 'lp5523' is used by default.
+- /sys/class/leds/lp5523:channelN  (N: 0 ~ 8)
+
+LP5523 has the internal program memory for running various LED patterns.
+There are two ways to run LED patterns.
+
+1) Legacy interface - enginex_mode, enginex_load and enginex_leds
+
+  Control interface for the engines:
+
+  x is 1 .. 3
+
+  enginex_mode:
+	disabled, load, run
+  enginex_load:
+	microcode load
+  enginex_leds:
+	led mux control
+
+  ::
+
+	cd /sys/class/leds/lp5523:channel2/device
+	echo "load" > engine3_mode
+	echo "9d80400004ff05ff437f0000" > engine3_load
+	echo "111111111" > engine3_leds
+	echo "run" > engine3_mode
+
+  To stop the engine::
+
+	echo "disabled" > engine3_mode
+
+2) Firmware interface - LP55xx common interface
+
+For the details, please refer to 'firmware' section in leds-lp55xx.txt
+
+LP5523 has three master faders. If a channel is mapped to one of
+the master faders, its output is dimmed based on the value of the master
+fader.
+
+For example::
+
+  echo "123000123" > master_fader_leds
+
+creates the following channel-fader mappings::
+
+  channel 0,6 to master_fader1
+  channel 1,7 to master_fader2
+  channel 2,8 to master_fader3
+
+Then, to have 25% of the original output on channel 0,6::
+
+  echo 64 > master_fader1
+
+To have 0% of the original output (i.e. no output) channel 1,7::
+
+  echo 0 > master_fader2
+
+To have 100% of the original output (i.e. no dimming) on channel 2,8::
+
+  echo 255 > master_fader3
+
+To clear all master fader controls::
+
+  echo "000000000" > master_fader_leds
+
+Selftest uses always the current from the platform data.
+
+Each channel contains led current settings.
+- /sys/class/leds/lp5523:channel2/led_current - RW
+- /sys/class/leds/lp5523:channel2/max_current - RO
+
+Format: 10x mA i.e 10 means 1.0 mA
+
+Example platform data::
+
+	static struct lp55xx_led_config lp5523_led_config[] = {
+		{
+			.name		= "D1",
+			.chan_nr        = 0,
+			.led_current    = 50,
+			.max_current    = 130,
+		},
+	...
+		{
+			.chan_nr        = 8,
+			.led_current    = 50,
+			.max_current    = 130,
+		}
+	};
+
+	static int lp5523_setup(void)
+	{
+		/* Setup HW resources */
+	}
+
+	static void lp5523_release(void)
+	{
+		/* Release HW resources */
+	}
+
+	static void lp5523_enable(bool state)
+	{
+		/* Control chip enable signal */
+	}
+
+	static struct lp55xx_platform_data lp5523_platform_data = {
+		.led_config     = lp5523_led_config,
+		.num_channels   = ARRAY_SIZE(lp5523_led_config),
+		.clock_mode     = LP55XX_CLOCK_EXT,
+		.setup_resources   = lp5523_setup,
+		.release_resources = lp5523_release,
+		.enable            = lp5523_enable,
+	};
+
+Note
+  chan_nr can have values between 0 and 8.
diff --git a/Documentation/leds/leds-lp5523.txt b/Documentation/leds/leds-lp5523.txt
deleted file mode 100644
index 0961a060fc4d..000000000000
--- a/Documentation/leds/leds-lp5523.txt
+++ /dev/null
@@ -1,130 +0,0 @@
-Kernel driver for lp5523
-========================
-
-* National Semiconductor LP5523 led driver chip
-* Datasheet: http://www.national.com/pf/LP/LP5523.html
-
-Authors: Mathias Nyman, Yuri Zaporozhets, Samu Onkalo
-Contact: Samu Onkalo (samu.p.onkalo-at-nokia.com)
-
-Description
------------
-LP5523 can drive up to 9 channels. Leds can be controlled directly via
-the led class control interface.
-The name of each channel is configurable in the platform data - name and label.
-There are three options to make the channel name.
-
-a) Define the 'name' in the platform data
-To make specific channel name, then use 'name' platform data.
-/sys/class/leds/R1               (name: 'R1')
-/sys/class/leds/B1               (name: 'B1')
-
-b) Use the 'label' with no 'name' field
-For one device name with channel number, then use 'label'.
-/sys/class/leds/RGB:channelN     (label: 'RGB', N: 0 ~ 8)
-
-c) Default
-If both fields are NULL, 'lp5523' is used by default.
-/sys/class/leds/lp5523:channelN  (N: 0 ~ 8)
-
-LP5523 has the internal program memory for running various LED patterns.
-There are two ways to run LED patterns.
-
-1) Legacy interface - enginex_mode, enginex_load and enginex_leds
-  Control interface for the engines:
-  x is 1 .. 3
-  enginex_mode : disabled, load, run
-  enginex_load : microcode load
-  enginex_leds : led mux control
-
-  cd /sys/class/leds/lp5523:channel2/device
-  echo "load" > engine3_mode
-  echo "9d80400004ff05ff437f0000" > engine3_load
-  echo "111111111" > engine3_leds
-  echo "run" > engine3_mode
-
-  To stop the engine:
-  echo "disabled" > engine3_mode
-
-2) Firmware interface - LP55xx common interface
-  For the details, please refer to 'firmware' section in leds-lp55xx.txt
-
-LP5523 has three master faders. If a channel is mapped to one of
-the master faders, its output is dimmed based on the value of the master
-fader.
-
-For example,
-
-  echo "123000123" > master_fader_leds
-
-creates the following channel-fader mappings:
-
-  channel 0,6 to master_fader1
-  channel 1,7 to master_fader2
-  channel 2,8 to master_fader3
-
-Then, to have 25% of the original output on channel 0,6:
-
-  echo 64 > master_fader1
-
-To have 0% of the original output (i.e. no output) channel 1,7:
-
-  echo 0 > master_fader2
-
-To have 100% of the original output (i.e. no dimming) on channel 2,8:
-
-  echo 255 > master_fader3
-
-To clear all master fader controls:
-
-  echo "000000000" > master_fader_leds
-
-Selftest uses always the current from the platform data.
-
-Each channel contains led current settings.
-/sys/class/leds/lp5523:channel2/led_current - RW
-/sys/class/leds/lp5523:channel2/max_current - RO
-Format: 10x mA i.e 10 means 1.0 mA
-
-Example platform data:
-
-Note - chan_nr can have values between 0 and 8.
-
-static struct lp55xx_led_config lp5523_led_config[] = {
-        {
-		.name		= "D1",
-                .chan_nr        = 0,
-                .led_current    = 50,
-		.max_current    = 130,
-        },
-...
-        {
-                .chan_nr        = 8,
-                .led_current    = 50,
-		.max_current    = 130,
-        }
-};
-
-static int lp5523_setup(void)
-{
-	/* Setup HW resources */
-}
-
-static void lp5523_release(void)
-{
-	/* Release HW resources */
-}
-
-static void lp5523_enable(bool state)
-{
-	/* Control chip enable signal */
-}
-
-static struct lp55xx_platform_data lp5523_platform_data = {
-        .led_config     = lp5523_led_config,
-        .num_channels   = ARRAY_SIZE(lp5523_led_config),
-        .clock_mode     = LP55XX_CLOCK_EXT,
-        .setup_resources   = lp5523_setup,
-        .release_resources = lp5523_release,
-        .enable            = lp5523_enable,
-};
diff --git a/Documentation/leds/leds-lp5562.rst b/Documentation/leds/leds-lp5562.rst
new file mode 100644
index 000000000000..79bbb2487ff6
--- /dev/null
+++ b/Documentation/leds/leds-lp5562.rst
@@ -0,0 +1,137 @@
+========================
+Kernel driver for lp5562
+========================
+
+* TI LP5562 LED Driver
+
+Author: Milo(Woogyom) Kim <milo.kim@ti.com>
+
+Description
+===========
+
+  LP5562 can drive up to 4 channels. R/G/B and White.
+  LEDs can be controlled directly via the led class control interface.
+
+  All four channels can be also controlled using the engine micro programs.
+  LP5562 has the internal program memory for running various LED patterns.
+  For the details, please refer to 'firmware' section in leds-lp55xx.txt
+
+Device attribute
+================
+
+engine_mux
+  3 Engines are allocated in LP5562, but the number of channel is 4.
+  Therefore each channel should be mapped to the engine number.
+
+  Value: RGB or W
+
+  This attribute is used for programming LED data with the firmware interface.
+  Unlike the LP5521/LP5523/55231, LP5562 has unique feature for the engine mux,
+  so additional sysfs is required
+
+  LED Map
+
+  ===== === ===============================
+  Red   ... Engine 1 (fixed)
+  Green ... Engine 2 (fixed)
+  Blue  ... Engine 3 (fixed)
+  White ... Engine 1 or 2 or 3 (selective)
+  ===== === ===============================
+
+How to load the program data using engine_mux
+=============================================
+
+  Before loading the LP5562 program data, engine_mux should be written between
+  the engine selection and loading the firmware.
+  Engine mux has two different mode, RGB and W.
+  RGB is used for loading RGB program data, W is used for W program data.
+
+  For example, run blinking green channel pattern::
+
+    echo 2 > /sys/bus/i2c/devices/xxxx/select_engine     # 2 is for green channel
+    echo "RGB" > /sys/bus/i2c/devices/xxxx/engine_mux    # engine mux for RGB
+    echo 1 > /sys/class/firmware/lp5562/loading
+    echo "4000600040FF6000" > /sys/class/firmware/lp5562/data
+    echo 0 > /sys/class/firmware/lp5562/loading
+    echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
+
+  To run a blinking white pattern::
+
+    echo 1 or 2 or 3 > /sys/bus/i2c/devices/xxxx/select_engine
+    echo "W" > /sys/bus/i2c/devices/xxxx/engine_mux
+    echo 1 > /sys/class/firmware/lp5562/loading
+    echo "4000600040FF6000" > /sys/class/firmware/lp5562/data
+    echo 0 > /sys/class/firmware/lp5562/loading
+    echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
+
+How to load the predefined patterns
+===================================
+
+  Please refer to 'leds-lp55xx.txt"
+
+Setting Current of Each Channel
+===============================
+
+  Like LP5521 and LP5523/55231, LP5562 provides LED current settings.
+  The 'led_current' and 'max_current' are used.
+
+Example of Platform data
+========================
+
+::
+
+	static struct lp55xx_led_config lp5562_led_config[] = {
+		{
+			.name 		= "R",
+			.chan_nr	= 0,
+			.led_current	= 20,
+			.max_current	= 40,
+		},
+		{
+			.name 		= "G",
+			.chan_nr	= 1,
+			.led_current	= 20,
+			.max_current	= 40,
+		},
+		{
+			.name 		= "B",
+			.chan_nr	= 2,
+			.led_current	= 20,
+			.max_current	= 40,
+		},
+		{
+			.name 		= "W",
+			.chan_nr	= 3,
+			.led_current	= 20,
+			.max_current	= 40,
+		},
+	};
+
+	static int lp5562_setup(void)
+	{
+		/* setup HW resources */
+	}
+
+	static void lp5562_release(void)
+	{
+		/* Release HW resources */
+	}
+
+	static void lp5562_enable(bool state)
+	{
+		/* Control of chip enable signal */
+	}
+
+	static struct lp55xx_platform_data lp5562_platform_data = {
+		.led_config     = lp5562_led_config,
+		.num_channels   = ARRAY_SIZE(lp5562_led_config),
+		.setup_resources   = lp5562_setup,
+		.release_resources = lp5562_release,
+		.enable            = lp5562_enable,
+	};
+
+To configure the platform specific data, lp55xx_platform_data structure is used
+
+
+If the current is set to 0 in the platform data, that channel is
+disabled and it is not visible in the sysfs.
diff --git a/Documentation/leds/leds-lp5562.txt b/Documentation/leds/leds-lp5562.txt
deleted file mode 100644
index 5a823ff6b393..000000000000
--- a/Documentation/leds/leds-lp5562.txt
+++ /dev/null
@@ -1,120 +0,0 @@
-Kernel driver for LP5562
-========================
-
-* TI LP5562 LED Driver
-
-Author: Milo(Woogyom) Kim <milo.kim@ti.com>
-
-Description
-
-  LP5562 can drive up to 4 channels. R/G/B and White.
-  LEDs can be controlled directly via the led class control interface.
-
-  All four channels can be also controlled using the engine micro programs.
-  LP5562 has the internal program memory for running various LED patterns.
-  For the details, please refer to 'firmware' section in leds-lp55xx.txt
-
-Device attribute: engine_mux
-
-  3 Engines are allocated in LP5562, but the number of channel is 4.
-  Therefore each channel should be mapped to the engine number.
-  Value : RGB or W
-
-  This attribute is used for programming LED data with the firmware interface.
-  Unlike the LP5521/LP5523/55231, LP5562 has unique feature for the engine mux,
-  so additional sysfs is required.
-
-  LED Map
-  Red   ... Engine 1 (fixed)
-  Green ... Engine 2 (fixed)
-  Blue  ... Engine 3 (fixed)
-  White ... Engine 1 or 2 or 3 (selective)
-
-How to load the program data using engine_mux
-
-  Before loading the LP5562 program data, engine_mux should be written between
-  the engine selection and loading the firmware.
-  Engine mux has two different mode, RGB and W.
-  RGB is used for loading RGB program data, W is used for W program data.
-
-  For example, run blinking green channel pattern,
-  echo 2 > /sys/bus/i2c/devices/xxxx/select_engine     # 2 is for green channel
-  echo "RGB" > /sys/bus/i2c/devices/xxxx/engine_mux    # engine mux for RGB
-  echo 1 > /sys/class/firmware/lp5562/loading
-  echo "4000600040FF6000" > /sys/class/firmware/lp5562/data
-  echo 0 > /sys/class/firmware/lp5562/loading
-  echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
-
-  To run a blinking white pattern,
-  echo 1 or 2 or 3 > /sys/bus/i2c/devices/xxxx/select_engine
-  echo "W" > /sys/bus/i2c/devices/xxxx/engine_mux
-  echo 1 > /sys/class/firmware/lp5562/loading
-  echo "4000600040FF6000" > /sys/class/firmware/lp5562/data
-  echo 0 > /sys/class/firmware/lp5562/loading
-  echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
-
-How to load the predefined patterns
-
-  Please refer to 'leds-lp55xx.txt"
-
-Setting Current of Each Channel
-
-  Like LP5521 and LP5523/55231, LP5562 provides LED current settings.
-  The 'led_current' and 'max_current' are used.
-
-(Example of Platform data)
-
-To configure the platform specific data, lp55xx_platform_data structure is used.
-
-static struct lp55xx_led_config lp5562_led_config[] = {
-	{
-		.name 		= "R",
-		.chan_nr	= 0,
-		.led_current	= 20,
-		.max_current	= 40,
-	},
-	{
-		.name 		= "G",
-		.chan_nr	= 1,
-		.led_current	= 20,
-		.max_current	= 40,
-	},
-	{
-		.name 		= "B",
-		.chan_nr	= 2,
-		.led_current	= 20,
-		.max_current	= 40,
-	},
-	{
-		.name 		= "W",
-		.chan_nr	= 3,
-		.led_current	= 20,
-		.max_current	= 40,
-	},
-};
-
-static int lp5562_setup(void)
-{
-	/* setup HW resources */
-}
-
-static void lp5562_release(void)
-{
-	/* Release HW resources */
-}
-
-static void lp5562_enable(bool state)
-{
-	/* Control of chip enable signal */
-}
-
-static struct lp55xx_platform_data lp5562_platform_data = {
-        .led_config     = lp5562_led_config,
-        .num_channels   = ARRAY_SIZE(lp5562_led_config),
-        .setup_resources   = lp5562_setup,
-        .release_resources = lp5562_release,
-        .enable            = lp5562_enable,
-};
-
-If the current is set to 0 in the platform data, that channel is
-disabled and it is not visible in the sysfs.
diff --git a/Documentation/leds/leds-lp55xx.rst b/Documentation/leds/leds-lp55xx.rst
new file mode 100644
index 000000000000..632e41cec0b5
--- /dev/null
+++ b/Documentation/leds/leds-lp55xx.rst
@@ -0,0 +1,224 @@
+=================================================
+LP5521/LP5523/LP55231/LP5562/LP8501 Common Driver
+=================================================
+
+Authors: Milo(Woogyom) Kim <milo.kim@ti.com>
+
+Description
+-----------
+LP5521, LP5523/55231, LP5562 and LP8501 have common features as below.
+
+  Register access via the I2C
+  Device initialization/deinitialization
+  Create LED class devices for multiple output channels
+  Device attributes for user-space interface
+  Program memory for running LED patterns
+
+The LP55xx common driver provides these features using exported functions.
+
+  lp55xx_init_device() / lp55xx_deinit_device()
+  lp55xx_register_leds() / lp55xx_unregister_leds()
+  lp55xx_regsister_sysfs() / lp55xx_unregister_sysfs()
+
+( Driver Structure Data )
+
+In lp55xx common driver, two different data structure is used.
+
+* lp55xx_led
+    control multi output LED channels such as led current, channel index.
+* lp55xx_chip
+    general chip control such like the I2C and platform data.
+
+For example, LP5521 has maximum 3 LED channels.
+LP5523/55231 has 9 output channels::
+
+  lp55xx_chip for LP5521 ... lp55xx_led #1
+			     lp55xx_led #2
+			     lp55xx_led #3
+
+  lp55xx_chip for LP5523 ... lp55xx_led #1
+			     lp55xx_led #2
+				   .
+				   .
+			     lp55xx_led #9
+
+( Chip Dependent Code )
+
+To support device specific configurations, special structure
+'lpxx_device_config' is used.
+
+  - Maximum number of channels
+  - Reset command, chip enable command
+  - Chip specific initialization
+  - Brightness control register access
+  - Setting LED output current
+  - Program memory address access for running patterns
+  - Additional device specific attributes
+
+( Firmware Interface )
+
+LP55xx family devices have the internal program memory for running
+various LED patterns.
+
+This pattern data is saved as a file in the user-land or
+hex byte string is written into the memory through the I2C.
+
+LP55xx common driver supports the firmware interface.
+
+LP55xx chips have three program engines.
+
+To load and run the pattern, the programming sequence is following.
+
+  (1) Select an engine number (1/2/3)
+  (2) Mode change to load
+  (3) Write pattern data into selected area
+  (4) Mode change to run
+
+The LP55xx common driver provides simple interfaces as below.
+
+select_engine:
+	Select which engine is used for running program
+run_engine:
+	Start program which is loaded via the firmware interface
+firmware:
+	Load program data
+
+In case of LP5523, one more command is required, 'enginex_leds'.
+It is used for selecting LED output(s) at each engine number.
+In more details, please refer to 'leds-lp5523.txt'.
+
+For example, run blinking pattern in engine #1 of LP5521::
+
+	echo 1 > /sys/bus/i2c/devices/xxxx/select_engine
+	echo 1 > /sys/class/firmware/lp5521/loading
+	echo "4000600040FF6000" > /sys/class/firmware/lp5521/data
+	echo 0 > /sys/class/firmware/lp5521/loading
+	echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
+
+For example, run blinking pattern in engine #3 of LP55231
+
+Two LEDs are configured as pattern output channels::
+
+	echo 3 > /sys/bus/i2c/devices/xxxx/select_engine
+	echo 1 > /sys/class/firmware/lp55231/loading
+	echo "9d0740ff7e0040007e00a0010000" > /sys/class/firmware/lp55231/data
+	echo 0 > /sys/class/firmware/lp55231/loading
+	echo "000001100" > /sys/bus/i2c/devices/xxxx/engine3_leds
+	echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
+
+To start blinking patterns in engine #2 and #3 simultaneously::
+
+	for idx in 2 3
+	do
+	echo $idx > /sys/class/leds/red/device/select_engine
+	sleep 0.1
+	echo 1 > /sys/class/firmware/lp5521/loading
+	echo "4000600040FF6000" > /sys/class/firmware/lp5521/data
+	echo 0 > /sys/class/firmware/lp5521/loading
+	done
+	echo 1 > /sys/class/leds/red/device/run_engine
+
+Here is another example for LP5523.
+
+Full LED strings are selected by 'engine2_leds'::
+
+	echo 2 > /sys/bus/i2c/devices/xxxx/select_engine
+	echo 1 > /sys/class/firmware/lp5523/loading
+	echo "9d80400004ff05ff437f0000" > /sys/class/firmware/lp5523/data
+	echo 0 > /sys/class/firmware/lp5523/loading
+	echo "111111111" > /sys/bus/i2c/devices/xxxx/engine2_leds
+	echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
+
+As soon as 'loading' is set to 0, registered callback is called.
+Inside the callback, the selected engine is loaded and memory is updated.
+To run programmed pattern, 'run_engine' attribute should be enabled.
+
+The pattern sequence of LP8501 is similar to LP5523.
+
+However pattern data is specific.
+
+Ex 1) Engine 1 is used::
+
+	echo 1 > /sys/bus/i2c/devices/xxxx/select_engine
+	echo 1 > /sys/class/firmware/lp8501/loading
+	echo "9d0140ff7e0040007e00a001c000" > /sys/class/firmware/lp8501/data
+	echo 0 > /sys/class/firmware/lp8501/loading
+	echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
+
+Ex 2) Engine 2 and 3 are used at the same time::
+
+	echo 2 > /sys/bus/i2c/devices/xxxx/select_engine
+	sleep 1
+	echo 1 > /sys/class/firmware/lp8501/loading
+	echo "9d0140ff7e0040007e00a001c000" > /sys/class/firmware/lp8501/data
+	echo 0 > /sys/class/firmware/lp8501/loading
+	sleep 1
+	echo 3 > /sys/bus/i2c/devices/xxxx/select_engine
+	sleep 1
+	echo 1 > /sys/class/firmware/lp8501/loading
+	echo "9d0340ff7e0040007e00a001c000" > /sys/class/firmware/lp8501/data
+	echo 0 > /sys/class/firmware/lp8501/loading
+	sleep 1
+	echo 1 > /sys/class/leds/d1/device/run_engine
+
+( 'run_engine' and 'firmware_cb' )
+
+The sequence of running the program data is common.
+
+But each device has own specific register addresses for commands.
+
+To support this, 'run_engine' and 'firmware_cb' are configurable in each driver.
+
+run_engine:
+	Control the selected engine
+firmware_cb:
+	The callback function after loading the firmware is done.
+
+	Chip specific commands for loading and updating program memory.
+
+( Predefined pattern data )
+
+Without the firmware interface, LP55xx driver provides another method for
+loading a LED pattern. That is 'predefined' pattern.
+
+A predefined pattern is defined in the platform data and load it(or them)
+via the sysfs if needed.
+
+To use the predefined pattern concept, 'patterns' and 'num_patterns' should be
+configured.
+
+Example of predefined pattern data::
+
+  /* mode_1: blinking data */
+  static const u8 mode_1[] = {
+		0x40, 0x00, 0x60, 0x00, 0x40, 0xFF, 0x60, 0x00,
+		};
+
+  /* mode_2: always on */
+  static const u8 mode_2[] = { 0x40, 0xFF, };
+
+  struct lp55xx_predef_pattern board_led_patterns[] = {
+	{
+		.r = mode_1,
+		.size_r = ARRAY_SIZE(mode_1),
+	},
+	{
+		.b = mode_2,
+		.size_b = ARRAY_SIZE(mode_2),
+	},
+  }
+
+  struct lp55xx_platform_data lp5562_pdata = {
+  ...
+	.patterns      = board_led_patterns,
+	.num_patterns  = ARRAY_SIZE(board_led_patterns),
+  };
+
+Then, mode_1 and mode_2 can be run via through the sysfs::
+
+  echo 1 > /sys/bus/i2c/devices/xxxx/led_pattern    # red blinking LED pattern
+  echo 2 > /sys/bus/i2c/devices/xxxx/led_pattern    # blue LED always on
+
+To stop running pattern::
+
+  echo 0 > /sys/bus/i2c/devices/xxxx/led_pattern
diff --git a/Documentation/leds/leds-lp55xx.txt b/Documentation/leds/leds-lp55xx.txt
deleted file mode 100644
index e23fa91ea722..000000000000
--- a/Documentation/leds/leds-lp55xx.txt
+++ /dev/null
@@ -1,194 +0,0 @@
-LP5521/LP5523/LP55231/LP5562/LP8501 Common Driver
-=================================================
-
-Authors: Milo(Woogyom) Kim <milo.kim@ti.com>
-
-Description
------------
-LP5521, LP5523/55231, LP5562 and LP8501 have common features as below.
-
-  Register access via the I2C
-  Device initialization/deinitialization
-  Create LED class devices for multiple output channels
-  Device attributes for user-space interface
-  Program memory for running LED patterns
-
-The LP55xx common driver provides these features using exported functions.
-  lp55xx_init_device() / lp55xx_deinit_device()
-  lp55xx_register_leds() / lp55xx_unregister_leds()
-  lp55xx_regsister_sysfs() / lp55xx_unregister_sysfs()
-
-( Driver Structure Data )
-
-In lp55xx common driver, two different data structure is used.
-
-o lp55xx_led
-  control multi output LED channels such as led current, channel index.
-o lp55xx_chip
-  general chip control such like the I2C and platform data.
-
-For example, LP5521 has maximum 3 LED channels.
-LP5523/55231 has 9 output channels.
-
-lp55xx_chip for LP5521 ... lp55xx_led #1
-                           lp55xx_led #2
-                           lp55xx_led #3
-
-lp55xx_chip for LP5523 ... lp55xx_led #1
-                           lp55xx_led #2
-                                 .
-                                 .
-                           lp55xx_led #9
-
-( Chip Dependent Code )
-
-To support device specific configurations, special structure
-'lpxx_device_config' is used.
-
-  Maximum number of channels
-  Reset command, chip enable command
-  Chip specific initialization
-  Brightness control register access
-  Setting LED output current
-  Program memory address access for running patterns
-  Additional device specific attributes
-
-( Firmware Interface )
-
-LP55xx family devices have the internal program memory for running
-various LED patterns.
-This pattern data is saved as a file in the user-land or
-hex byte string is written into the memory through the I2C.
-LP55xx common driver supports the firmware interface.
-
-LP55xx chips have three program engines.
-To load and run the pattern, the programming sequence is following.
-  (1) Select an engine number (1/2/3)
-  (2) Mode change to load
-  (3) Write pattern data into selected area
-  (4) Mode change to run
-
-The LP55xx common driver provides simple interfaces as below.
-select_engine : Select which engine is used for running program
-run_engine    : Start program which is loaded via the firmware interface
-firmware      : Load program data
-
-In case of LP5523, one more command is required, 'enginex_leds'.
-It is used for selecting LED output(s) at each engine number.
-In more details, please refer to 'leds-lp5523.txt'.
-
-For example, run blinking pattern in engine #1 of LP5521
-echo 1 > /sys/bus/i2c/devices/xxxx/select_engine
-echo 1 > /sys/class/firmware/lp5521/loading
-echo "4000600040FF6000" > /sys/class/firmware/lp5521/data
-echo 0 > /sys/class/firmware/lp5521/loading
-echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
-
-For example, run blinking pattern in engine #3 of LP55231
-Two LEDs are configured as pattern output channels.
-echo 3 > /sys/bus/i2c/devices/xxxx/select_engine
-echo 1 > /sys/class/firmware/lp55231/loading
-echo "9d0740ff7e0040007e00a0010000" > /sys/class/firmware/lp55231/data
-echo 0 > /sys/class/firmware/lp55231/loading
-echo "000001100" > /sys/bus/i2c/devices/xxxx/engine3_leds
-echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
-
-To start blinking patterns in engine #2 and #3 simultaneously,
-for idx in 2 3
-do
-  echo $idx > /sys/class/leds/red/device/select_engine
-  sleep 0.1
-  echo 1 > /sys/class/firmware/lp5521/loading
-  echo "4000600040FF6000" > /sys/class/firmware/lp5521/data
-  echo 0 > /sys/class/firmware/lp5521/loading
-done
-echo 1 > /sys/class/leds/red/device/run_engine
-
-Here is another example for LP5523.
-Full LED strings are selected by 'engine2_leds'.
-echo 2 > /sys/bus/i2c/devices/xxxx/select_engine
-echo 1 > /sys/class/firmware/lp5523/loading
-echo "9d80400004ff05ff437f0000" > /sys/class/firmware/lp5523/data
-echo 0 > /sys/class/firmware/lp5523/loading
-echo "111111111" > /sys/bus/i2c/devices/xxxx/engine2_leds
-echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
-
-As soon as 'loading' is set to 0, registered callback is called.
-Inside the callback, the selected engine is loaded and memory is updated.
-To run programmed pattern, 'run_engine' attribute should be enabled.
-
-The pattern sequence of LP8501 is similar to LP5523.
-However pattern data is specific.
-Ex 1) Engine 1 is used
-echo 1 > /sys/bus/i2c/devices/xxxx/select_engine
-echo 1 > /sys/class/firmware/lp8501/loading
-echo "9d0140ff7e0040007e00a001c000" > /sys/class/firmware/lp8501/data
-echo 0 > /sys/class/firmware/lp8501/loading
-echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
-
-Ex 2) Engine 2 and 3 are used at the same time
-echo 2 > /sys/bus/i2c/devices/xxxx/select_engine
-sleep 1
-echo 1 > /sys/class/firmware/lp8501/loading
-echo "9d0140ff7e0040007e00a001c000" > /sys/class/firmware/lp8501/data
-echo 0 > /sys/class/firmware/lp8501/loading
-sleep 1
-echo 3 > /sys/bus/i2c/devices/xxxx/select_engine
-sleep 1
-echo 1 > /sys/class/firmware/lp8501/loading
-echo "9d0340ff7e0040007e00a001c000" > /sys/class/firmware/lp8501/data
-echo 0 > /sys/class/firmware/lp8501/loading
-sleep 1
-echo 1 > /sys/class/leds/d1/device/run_engine
-
-( 'run_engine' and 'firmware_cb' )
-The sequence of running the program data is common.
-But each device has own specific register addresses for commands.
-To support this, 'run_engine' and 'firmware_cb' are configurable in each driver.
-run_engine  : Control the selected engine
-firmware_cb : The callback function after loading the firmware is done.
-              Chip specific commands for loading and updating program memory.
-
-( Predefined pattern data )
-
-Without the firmware interface, LP55xx driver provides another method for
-loading a LED pattern. That is 'predefined' pattern.
-A predefined pattern is defined in the platform data and load it(or them)
-via the sysfs if needed.
-To use the predefined pattern concept, 'patterns' and 'num_patterns' should be
-configured.
-
-  Example of predefined pattern data:
-
-  /* mode_1: blinking data */
-  static const u8 mode_1[] = {
-		0x40, 0x00, 0x60, 0x00, 0x40, 0xFF, 0x60, 0x00,
-		};
-
-  /* mode_2: always on */
-  static const u8 mode_2[] = { 0x40, 0xFF, };
-
-  struct lp55xx_predef_pattern board_led_patterns[] = {
-	{
-		.r = mode_1,
-		.size_r = ARRAY_SIZE(mode_1),
-	},
-	{
-		.b = mode_2,
-		.size_b = ARRAY_SIZE(mode_2),
-	},
-  }
-
-  struct lp55xx_platform_data lp5562_pdata = {
-  ...
-	.patterns      = board_led_patterns,
-	.num_patterns  = ARRAY_SIZE(board_led_patterns),
-  };
-
-Then, mode_1 and mode_2 can be run via through the sysfs.
-
-  echo 1 > /sys/bus/i2c/devices/xxxx/led_pattern    # red blinking LED pattern
-  echo 2 > /sys/bus/i2c/devices/xxxx/led_pattern    # blue LED always on
-
-To stop running pattern,
-  echo 0 > /sys/bus/i2c/devices/xxxx/led_pattern
diff --git a/Documentation/leds/leds-mlxcpld.rst b/Documentation/leds/leds-mlxcpld.rst
new file mode 100644
index 000000000000..528582429e0b
--- /dev/null
+++ b/Documentation/leds/leds-mlxcpld.rst
@@ -0,0 +1,118 @@
+=======================================
+Kernel driver for Mellanox systems LEDs
+=======================================
+
+Provide system LED support for the nex Mellanox systems:
+"msx6710", "msx6720", "msb7700", "msn2700", "msx1410",
+"msn2410", "msb7800", "msn2740", "msn2100".
+
+Description
+-----------
+Driver provides the following LEDs for the systems "msx6710", "msx6720",
+"msb7700", "msn2700", "msx1410", "msn2410", "msb7800", "msn2740":
+
+  - mlxcpld:fan1:green
+  - mlxcpld:fan1:red
+  - mlxcpld:fan2:green
+  - mlxcpld:fan2:red
+  - mlxcpld:fan3:green
+  - mlxcpld:fan3:red
+  - mlxcpld:fan4:green
+  - mlxcpld:fan4:red
+  - mlxcpld:psu:green
+  - mlxcpld:psu:red
+  - mlxcpld:status:green
+  - mlxcpld:status:red
+
+ "status"
+  - CPLD reg offset: 0x20
+  - Bits [3:0]
+
+ "psu"
+  - CPLD reg offset: 0x20
+  - Bits [7:4]
+
+ "fan1"
+  - CPLD reg offset: 0x21
+  - Bits [3:0]
+
+ "fan2"
+  - CPLD reg offset: 0x21
+  - Bits [7:4]
+
+ "fan3"
+  - CPLD reg offset: 0x22
+  - Bits [3:0]
+
+ "fan4"
+  - CPLD reg offset: 0x22
+  - Bits [7:4]
+
+ Color mask for all the above LEDs:
+
+  [bit3,bit2,bit1,bit0] or
+  [bit7,bit6,bit5,bit4]:
+
+	- [0,0,0,0] = LED OFF
+	- [0,1,0,1] = Red static ON
+	- [1,1,0,1] = Green static ON
+	- [0,1,1,0] = Red blink 3Hz
+	- [1,1,1,0] = Green blink 3Hz
+	- [0,1,1,1] = Red blink 6Hz
+	- [1,1,1,1] = Green blink 6Hz
+
+Driver provides the following LEDs for the system "msn2100":
+
+  - mlxcpld:fan:green
+  - mlxcpld:fan:red
+  - mlxcpld:psu1:green
+  - mlxcpld:psu1:red
+  - mlxcpld:psu2:green
+  - mlxcpld:psu2:red
+  - mlxcpld:status:green
+  - mlxcpld:status:red
+  - mlxcpld:uid:blue
+
+ "status"
+  - CPLD reg offset: 0x20
+  - Bits [3:0]
+
+ "fan"
+  - CPLD reg offset: 0x21
+  - Bits [3:0]
+
+ "psu1"
+  - CPLD reg offset: 0x23
+  - Bits [3:0]
+
+ "psu2"
+  - CPLD reg offset: 0x23
+  - Bits [7:4]
+
+ "uid"
+  - CPLD reg offset: 0x24
+  - Bits [3:0]
+
+ Color mask for all the above LEDs, excepted uid:
+
+  [bit3,bit2,bit1,bit0] or
+  [bit7,bit6,bit5,bit4]:
+
+	- [0,0,0,0] = LED OFF
+	- [0,1,0,1] = Red static ON
+	- [1,1,0,1] = Green static ON
+	- [0,1,1,0] = Red blink 3Hz
+	- [1,1,1,0] = Green blink 3Hz
+	- [0,1,1,1] = Red blink 6Hz
+	- [1,1,1,1] = Green blink 6Hz
+
+ Color mask for uid LED:
+  [bit3,bit2,bit1,bit0]:
+
+	- [0,0,0,0] = LED OFF
+	- [1,1,0,1] = Blue static ON
+	- [1,1,1,0] = Blue blink 3Hz
+	- [1,1,1,1] = Blue blink 6Hz
+
+Driver supports HW blinking at 3Hz and 6Hz frequency (50% duty cycle).
+For 3Hz duty cylce is about 167 msec, for 6Hz is about 83 msec.
diff --git a/Documentation/leds/leds-mlxcpld.txt b/Documentation/leds/leds-mlxcpld.txt
deleted file mode 100644
index a0e8fd457117..000000000000
--- a/Documentation/leds/leds-mlxcpld.txt
+++ /dev/null
@@ -1,110 +0,0 @@
-Kernel driver for Mellanox systems LEDs
-=======================================
-
-Provide system LED support for the nex Mellanox systems:
-"msx6710", "msx6720", "msb7700", "msn2700", "msx1410",
-"msn2410", "msb7800", "msn2740", "msn2100".
-
-Description
------------
-Driver provides the following LEDs for the systems "msx6710", "msx6720",
-"msb7700", "msn2700", "msx1410", "msn2410", "msb7800", "msn2740":
-  mlxcpld:fan1:green
-  mlxcpld:fan1:red
-  mlxcpld:fan2:green
-  mlxcpld:fan2:red
-  mlxcpld:fan3:green
-  mlxcpld:fan3:red
-  mlxcpld:fan4:green
-  mlxcpld:fan4:red
-  mlxcpld:psu:green
-  mlxcpld:psu:red
-  mlxcpld:status:green
-  mlxcpld:status:red
-
- "status"
-  CPLD reg offset: 0x20
-  Bits [3:0]
-
- "psu"
-  CPLD reg offset: 0x20
-  Bits [7:4]
-
- "fan1"
-  CPLD reg offset: 0x21
-  Bits [3:0]
-
- "fan2"
-  CPLD reg offset: 0x21
-  Bits [7:4]
-
- "fan3"
-  CPLD reg offset: 0x22
-  Bits [3:0]
-
- "fan4"
-  CPLD reg offset: 0x22
-  Bits [7:4]
-
- Color mask for all the above LEDs:
-  [bit3,bit2,bit1,bit0] or
-  [bit7,bit6,bit5,bit4]:
-	[0,0,0,0] = LED OFF
-	[0,1,0,1] = Red static ON
-	[1,1,0,1] = Green static ON
-	[0,1,1,0] = Red blink 3Hz
-	[1,1,1,0] = Green blink 3Hz
-	[0,1,1,1] = Red blink 6Hz
-	[1,1,1,1] = Green blink 6Hz
-
-Driver provides the following LEDs for the system "msn2100":
-  mlxcpld:fan:green
-  mlxcpld:fan:red
-  mlxcpld:psu1:green
-  mlxcpld:psu1:red
-  mlxcpld:psu2:green
-  mlxcpld:psu2:red
-  mlxcpld:status:green
-  mlxcpld:status:red
-  mlxcpld:uid:blue
-
- "status"
-  CPLD reg offset: 0x20
-  Bits [3:0]
-
- "fan"
-  CPLD reg offset: 0x21
-  Bits [3:0]
-
- "psu1"
-  CPLD reg offset: 0x23
-  Bits [3:0]
-
- "psu2"
-  CPLD reg offset: 0x23
-  Bits [7:4]
-
- "uid"
-  CPLD reg offset: 0x24
-  Bits [3:0]
-
- Color mask for all the above LEDs, excepted uid:
-  [bit3,bit2,bit1,bit0] or
-  [bit7,bit6,bit5,bit4]:
-	[0,0,0,0] = LED OFF
-	[0,1,0,1] = Red static ON
-	[1,1,0,1] = Green static ON
-	[0,1,1,0] = Red blink 3Hz
-	[1,1,1,0] = Green blink 3Hz
-	[0,1,1,1] = Red blink 6Hz
-	[1,1,1,1] = Green blink 6Hz
-
- Color mask for uid LED:
-  [bit3,bit2,bit1,bit0]:
-	[0,0,0,0] = LED OFF
-	[1,1,0,1] = Blue static ON
-	[1,1,1,0] = Blue blink 3Hz
-	[1,1,1,1] = Blue blink 6Hz
-
-Driver supports HW blinking at 3Hz and 6Hz frequency (50% duty cycle).
-For 3Hz duty cylce is about 167 msec, for 6Hz is about 83 msec.
diff --git a/Documentation/leds/ledtrig-oneshot.rst b/Documentation/leds/ledtrig-oneshot.rst
new file mode 100644
index 000000000000..69fa3ea1d554
--- /dev/null
+++ b/Documentation/leds/ledtrig-oneshot.rst
@@ -0,0 +1,44 @@
+====================
+One-shot LED Trigger
+====================
+
+This is a LED trigger useful for signaling the user of an event where there are
+no clear trap points to put standard led-on and led-off settings.  Using this
+trigger, the application needs only to signal the trigger when an event has
+happened, than the trigger turns the LED on and than keeps it off for a
+specified amount of time.
+
+This trigger is meant to be usable both for sporadic and dense events.  In the
+first case, the trigger produces a clear single controlled blink for each
+event, while in the latter it keeps blinking at constant rate, as to signal
+that the events are arriving continuously.
+
+A one-shot LED only stays in a constant state when there are no events.  An
+additional "invert" property specifies if the LED has to stay off (normal) or
+on (inverted) when not rearmed.
+
+The trigger can be activated from user space on led class devices as shown
+below::
+
+  echo oneshot > trigger
+
+This adds sysfs attributes to the LED that are documented in:
+Documentation/ABI/testing/sysfs-class-led-trigger-oneshot
+
+Example use-case: network devices, initialization::
+
+  echo oneshot > trigger # set trigger for this led
+  echo 33 > delay_on     # blink at 1 / (33 + 33) Hz on continuous traffic
+  echo 33 > delay_off
+
+interface goes up::
+
+  echo 1 > invert # set led as normally-on, turn the led on
+
+packet received/transmitted::
+
+  echo 1 > shot # led starts blinking, ignored if already blinking
+
+interface goes down::
+
+  echo 0 > invert # set led as normally-off, turn the led off
diff --git a/Documentation/leds/ledtrig-oneshot.txt b/Documentation/leds/ledtrig-oneshot.txt
deleted file mode 100644
index fe57474a12e2..000000000000
--- a/Documentation/leds/ledtrig-oneshot.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-One-shot LED Trigger
-====================
-
-This is a LED trigger useful for signaling the user of an event where there are
-no clear trap points to put standard led-on and led-off settings.  Using this
-trigger, the application needs only to signal the trigger when an event has
-happened, than the trigger turns the LED on and than keeps it off for a
-specified amount of time.
-
-This trigger is meant to be usable both for sporadic and dense events.  In the
-first case, the trigger produces a clear single controlled blink for each
-event, while in the latter it keeps blinking at constant rate, as to signal
-that the events are arriving continuously.
-
-A one-shot LED only stays in a constant state when there are no events.  An
-additional "invert" property specifies if the LED has to stay off (normal) or
-on (inverted) when not rearmed.
-
-The trigger can be activated from user space on led class devices as shown
-below:
-
-  echo oneshot > trigger
-
-This adds sysfs attributes to the LED that are documented in:
-Documentation/ABI/testing/sysfs-class-led-trigger-oneshot
-
-Example use-case: network devices, initialization:
-
-  echo oneshot > trigger # set trigger for this led
-  echo 33 > delay_on     # blink at 1 / (33 + 33) Hz on continuous traffic
-  echo 33 > delay_off
-
-interface goes up:
-
-  echo 1 > invert # set led as normally-on, turn the led on
-
-packet received/transmitted:
-
-  echo 1 > shot # led starts blinking, ignored if already blinking
-
-interface goes down
-
-  echo 0 > invert # set led as normally-off, turn the led off
diff --git a/Documentation/leds/ledtrig-transient.rst b/Documentation/leds/ledtrig-transient.rst
new file mode 100644
index 000000000000..d921dc830cd0
--- /dev/null
+++ b/Documentation/leds/ledtrig-transient.rst
@@ -0,0 +1,167 @@
+=====================
+LED Transient Trigger
+=====================
+
+The leds timer trigger does not currently have an interface to activate
+a one shot timer. The current support allows for setting two timers, one for
+specifying how long a state to be on, and the second for how long the state
+to be off. The delay_on value specifies the time period an LED should stay
+in on state, followed by a delay_off value that specifies how long the LED
+should stay in off state. The on and off cycle repeats until the trigger
+gets deactivated. There is no provision for one time activation to implement
+features that require an on or off state to be held just once and then stay in
+the original state forever.
+
+Without one shot timer interface, user space can still use timer trigger to
+set a timer to hold a state, however when user space application crashes or
+goes away without deactivating the timer, the hardware will be left in that
+state permanently.
+
+As a specific example of this use-case, let's look at vibrate feature on
+phones. Vibrate function on phones is implemented using PWM pins on SoC or
+PMIC. There is a need to activate one shot timer to control the vibrate
+feature, to prevent user space crashes leaving the phone in vibrate mode
+permanently causing the battery to drain.
+
+Transient trigger addresses the need for one shot timer activation. The
+transient trigger can be enabled and disabled just like the other leds
+triggers.
+
+When an led class device driver registers itself, it can specify all leds
+triggers it supports and a default trigger. During registration, activation
+routine for the default trigger gets called. During registration of an led
+class device, the LED state does not change.
+
+When the driver unregisters, deactivation routine for the currently active
+trigger will be called, and LED state is changed to LED_OFF.
+
+Driver suspend changes the LED state to LED_OFF and resume doesn't change
+the state. Please note that there is no explicit interaction between the
+suspend and resume actions and the currently enabled trigger. LED state
+changes are suspended while the driver is in suspend state. Any timers
+that are active at the time driver gets suspended, continue to run, without
+being able to actually change the LED state. Once driver is resumed, triggers
+start functioning again.
+
+LED state changes are controlled using brightness which is a common led
+class device property. When brightness is set to 0 from user space via
+echo 0 > brightness, it will result in deactivating the current trigger.
+
+Transient trigger uses standard register and unregister interfaces. During
+trigger registration, for each led class device that specifies this trigger
+as its default trigger, trigger activation routine will get called. During
+registration, the LED state does not change, unless there is another trigger
+active, in which case LED state changes to LED_OFF.
+
+During trigger unregistration, LED state gets changed to LED_OFF.
+
+Transient trigger activation routine doesn't change the LED state. It
+creates its properties and does its initialization. Transient trigger
+deactivation routine, will cancel any timer that is active before it cleans
+up and removes the properties it created. It will restore the LED state to
+non-transient state. When driver gets suspended, irrespective of the transient
+state, the LED state changes to LED_OFF.
+
+Transient trigger can be enabled and disabled from user space on led class
+devices, that support this trigger as shown below::
+
+	echo transient > trigger
+	echo none > trigger
+
+NOTE:
+	Add a new property trigger state to control the state.
+
+This trigger exports three properties, activate, state, and duration. When
+transient trigger is activated these properties are set to default values.
+
+- duration allows setting timer value in msecs. The initial value is 0.
+- activate allows activating and deactivating the timer specified by
+  duration as needed. The initial and default value is 0.  This will allow
+  duration to be set after trigger activation.
+- state allows user to specify a transient state to be held for the specified
+  duration.
+
+	activate
+	      - one shot timer activate mechanism.
+		1 when activated, 0 when deactivated.
+		default value is zero when transient trigger is enabled,
+		to allow duration to be set.
+
+		activate state indicates a timer with a value of specified
+		duration running.
+		deactivated state indicates that there is no active timer
+		running.
+
+	duration
+	      - one shot timer value. When activate is set, duration value
+		is used to start a timer that runs once. This value doesn't
+		get changed by the trigger unless user does a set via
+		echo new_value > duration
+
+	state
+	      - transient state to be held. It has two values 0 or 1. 0 maps
+		to LED_OFF and 1 maps to LED_FULL. The specified state is
+		held for the duration of the one shot timer and then the
+		state gets changed to the non-transient state which is the
+		inverse of transient state.
+		If state = LED_FULL, when the timer runs out the state will
+		go back to LED_OFF.
+		If state = LED_OFF, when the timer runs out the state will
+		go back to LED_FULL.
+		Please note that current LED state is not checked prior to
+		changing the state to the specified state.
+		Driver could map these values to inverted depending on the
+		default states it defines for the LED in its brightness_set()
+		interface which is called from the led brightness_set()
+		interfaces to control the LED state.
+
+When timer expires activate goes back to deactivated state, duration is left
+at the set value to be used when activate is set at a future time. This will
+allow user app to set the time once and activate it to run it once for the
+specified value as needed. When timer expires, state is restored to the
+non-transient state which is the inverse of the transient state:
+
+	=================   ===============================================
+	echo 1 > activate   starts timer = duration when duration is not 0.
+	echo 0 > activate   cancels currently running timer.
+	echo n > duration   stores timer value to be used upon next
+			    activate. Currently active timer if
+			    any, continues to run for the specified time.
+	echo 0 > duration   stores timer value to be used upon next
+			    activate. Currently active timer if any,
+			    continues to run for the specified time.
+	echo 1 > state      stores desired transient state LED_FULL to be
+			    held for the specified duration.
+	echo 0 > state      stores desired transient state LED_OFF to be
+			    held for the specified duration.
+	=================   ===============================================
+
+What is not supported
+=====================
+
+- Timer activation is one shot and extending and/or shortening the timer
+  is not supported.
+
+Examples
+========
+
+use-case 1::
+
+	echo transient > trigger
+	echo n > duration
+	echo 1 > state
+
+repeat the following step as needed::
+
+	echo 1 > activate - start timer = duration to run once
+	echo 1 > activate - start timer = duration to run once
+	echo none > trigger
+
+This trigger is intended to be used for for the following example use cases:
+
+ - Control of vibrate (phones, tablets etc.) hardware by user space app.
+ - Use of LED by user space app as activity indicator.
+ - Use of LED by user space app as a kind of watchdog indicator -- as
+   long as the app is alive, it can keep the LED illuminated, if it dies
+   the LED will be extinguished automatically.
+ - Use by any user space app that needs a transient GPIO output.
diff --git a/Documentation/leds/ledtrig-transient.txt b/Documentation/leds/ledtrig-transient.txt
deleted file mode 100644
index 3bd38b487df1..000000000000
--- a/Documentation/leds/ledtrig-transient.txt
+++ /dev/null
@@ -1,152 +0,0 @@
-LED Transient Trigger
-=====================
-
-The leds timer trigger does not currently have an interface to activate
-a one shot timer. The current support allows for setting two timers, one for
-specifying how long a state to be on, and the second for how long the state
-to be off. The delay_on value specifies the time period an LED should stay
-in on state, followed by a delay_off value that specifies how long the LED
-should stay in off state. The on and off cycle repeats until the trigger
-gets deactivated. There is no provision for one time activation to implement
-features that require an on or off state to be held just once and then stay in
-the original state forever.
-
-Without one shot timer interface, user space can still use timer trigger to
-set a timer to hold a state, however when user space application crashes or
-goes away without deactivating the timer, the hardware will be left in that
-state permanently.
-
-As a specific example of this use-case, let's look at vibrate feature on
-phones. Vibrate function on phones is implemented using PWM pins on SoC or
-PMIC. There is a need to activate one shot timer to control the vibrate
-feature, to prevent user space crashes leaving the phone in vibrate mode
-permanently causing the battery to drain.
-
-Transient trigger addresses the need for one shot timer activation. The
-transient trigger can be enabled and disabled just like the other leds
-triggers.
-
-When an led class device driver registers itself, it can specify all leds
-triggers it supports and a default trigger. During registration, activation
-routine for the default trigger gets called. During registration of an led
-class device, the LED state does not change.
-
-When the driver unregisters, deactivation routine for the currently active
-trigger will be called, and LED state is changed to LED_OFF.
-
-Driver suspend changes the LED state to LED_OFF and resume doesn't change
-the state. Please note that there is no explicit interaction between the
-suspend and resume actions and the currently enabled trigger. LED state
-changes are suspended while the driver is in suspend state. Any timers
-that are active at the time driver gets suspended, continue to run, without
-being able to actually change the LED state. Once driver is resumed, triggers
-start functioning again.
-
-LED state changes are controlled using brightness which is a common led
-class device property. When brightness is set to 0 from user space via
-echo 0 > brightness, it will result in deactivating the current trigger.
-
-Transient trigger uses standard register and unregister interfaces. During
-trigger registration, for each led class device that specifies this trigger
-as its default trigger, trigger activation routine will get called. During
-registration, the LED state does not change, unless there is another trigger
-active, in which case LED state changes to LED_OFF.
-
-During trigger unregistration, LED state gets changed to LED_OFF.
-
-Transient trigger activation routine doesn't change the LED state. It
-creates its properties and does its initialization. Transient trigger
-deactivation routine, will cancel any timer that is active before it cleans
-up and removes the properties it created. It will restore the LED state to
-non-transient state. When driver gets suspended, irrespective of the transient
-state, the LED state changes to LED_OFF.
-
-Transient trigger can be enabled and disabled from user space on led class
-devices, that support this trigger as shown below:
-
-echo transient > trigger
-echo none > trigger
-
-NOTE: Add a new property trigger state to control the state.
-
-This trigger exports three properties, activate, state, and duration. When
-transient trigger is activated these properties are set to default values.
-
-- duration allows setting timer value in msecs. The initial value is 0.
-- activate allows activating and deactivating the timer specified by
-  duration as needed. The initial and default value is 0.  This will allow
-  duration to be set after trigger activation.
-- state allows user to specify a transient state to be held for the specified
-  duration.
-
-	activate - one shot timer activate mechanism.
-		1 when activated, 0 when deactivated.
-		default value is zero when transient trigger is enabled,
-		to allow duration to be set.
-
-		activate state indicates a timer with a value of specified
-		duration running.
-		deactivated state indicates that there is no active timer
-		running.
-
-	duration - one shot timer value. When activate is set, duration value
-		is used to start a timer that runs once. This value doesn't
-		get changed by the trigger unless user does a set via
-		echo new_value > duration
-
-	state - transient state to be held. It has two values 0 or 1. 0 maps
-		to LED_OFF and 1 maps to LED_FULL. The specified state is
-		held for the duration of the one shot timer and then the
-		state gets changed to the non-transient state which is the
-		inverse of transient state.
-		If state = LED_FULL, when the timer runs out the state will
-		go back to LED_OFF.
-		If state = LED_OFF, when the timer runs out the state will
-		go back to LED_FULL.
-		Please note that current LED state is not checked prior to
-		changing the state to the specified state.
-		Driver could map these values to inverted depending on the
-		default states it defines for the LED in its brightness_set()
-		interface which is called from the led brightness_set()
-		interfaces to control the LED state.
-
-When timer expires activate goes back to deactivated state, duration is left
-at the set value to be used when activate is set at a future time. This will
-allow user app to set the time once and activate it to run it once for the
-specified value as needed. When timer expires, state is restored to the
-non-transient state which is the inverse of the transient state.
-
-	echo 1 > activate - starts timer = duration when duration is not 0.
-	echo 0 > activate - cancels currently running timer.
-	echo n > duration - stores timer value to be used upon next
-                            activate. Currently active timer if
-                            any, continues to run for the specified time.
-	echo 0 > duration - stores timer value to be used upon next
-                            activate. Currently active timer if any,
-                            continues to run for the specified time.
-	echo 1 > state    - stores desired transient state LED_FULL to be
-			    held for the specified duration.
-	echo 0 > state    - stores desired transient state LED_OFF to be
-			    held for the specified duration.
-
-What is not supported:
-======================
-- Timer activation is one shot and extending and/or shortening the timer
-  is not supported.
-
-Example use-case 1:
-	echo transient > trigger
-	echo n > duration
-	echo 1 > state
-repeat the following step as needed:
-	echo 1 > activate - start timer = duration to run once
-	echo 1 > activate - start timer = duration to run once
-	echo none > trigger
-
-This trigger is intended to be used for for the following example use cases:
- - Control of vibrate (phones, tablets etc.) hardware by user space app.
- - Use of LED by user space app as activity indicator.
- - Use of LED by user space app as a kind of watchdog indicator -- as
-       long as the app is alive, it can keep the LED illuminated, if it dies
-       the LED will be extinguished automatically.
- - Use by any user space app that needs a transient GPIO output.
diff --git a/Documentation/leds/ledtrig-usbport.rst b/Documentation/leds/ledtrig-usbport.rst
new file mode 100644
index 000000000000..37c2505bfd57
--- /dev/null
+++ b/Documentation/leds/ledtrig-usbport.rst
@@ -0,0 +1,46 @@
+====================
+USB port LED trigger
+====================
+
+This LED trigger can be used for signalling to the user a presence of USB device
+in a given port. It simply turns on LED when device appears and turns it off
+when it disappears.
+
+It requires selecting USB ports that should be observed. All available ones are
+listed as separated entries in a "ports" subdirectory. Selecting is handled by
+echoing "1" to a chosen port.
+
+Please note that this trigger allows selecting multiple USB ports for a single
+LED.
+
+This can be useful in two cases:
+
+1) Device with single USB LED and few physical ports
+====================================================
+
+In such a case LED will be turned on as long as there is at least one connected
+USB device.
+
+2) Device with a physical port handled by few controllers
+=========================================================
+
+Some devices may have one controller per PHY standard. E.g. USB 3.0 physical
+port may be handled by ohci-platform, ehci-platform and xhci-hcd. If there is
+only one LED user will most likely want to assign ports from all 3 hubs.
+
+
+This trigger can be activated from user space on led class devices as shown
+below::
+
+  echo usbport > trigger
+
+This adds sysfs attributes to the LED that are documented in:
+Documentation/ABI/testing/sysfs-class-led-trigger-usbport
+
+Example use-case::
+
+  echo usbport > trigger
+  echo 1 > ports/usb1-port1
+  echo 1 > ports/usb2-port1
+  cat ports/usb1-port1
+  echo 0 > ports/usb1-port1
diff --git a/Documentation/leds/ledtrig-usbport.txt b/Documentation/leds/ledtrig-usbport.txt
deleted file mode 100644
index 69f54bfb4789..000000000000
--- a/Documentation/leds/ledtrig-usbport.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-USB port LED trigger
-====================
-
-This LED trigger can be used for signalling to the user a presence of USB device
-in a given port. It simply turns on LED when device appears and turns it off
-when it disappears.
-
-It requires selecting USB ports that should be observed. All available ones are
-listed as separated entries in a "ports" subdirectory. Selecting is handled by
-echoing "1" to a chosen port.
-
-Please note that this trigger allows selecting multiple USB ports for a single
-LED. This can be useful in two cases:
-
-1) Device with single USB LED and few physical ports
-
-In such a case LED will be turned on as long as there is at least one connected
-USB device.
-
-2) Device with a physical port handled by few controllers
-
-Some devices may have one controller per PHY standard. E.g. USB 3.0 physical
-port may be handled by ohci-platform, ehci-platform and xhci-hcd. If there is
-only one LED user will most likely want to assign ports from all 3 hubs.
-
-
-This trigger can be activated from user space on led class devices as shown
-below:
-
-  echo usbport > trigger
-
-This adds sysfs attributes to the LED that are documented in:
-Documentation/ABI/testing/sysfs-class-led-trigger-usbport
-
-Example use-case:
-
-  echo usbport > trigger
-  echo 1 > ports/usb1-port1
-  echo 1 > ports/usb2-port1
-  cat ports/usb1-port1
-  echo 0 > ports/usb1-port1
diff --git a/Documentation/leds/uleds.rst b/Documentation/leds/uleds.rst
new file mode 100644
index 000000000000..83221098009c
--- /dev/null
+++ b/Documentation/leds/uleds.rst
@@ -0,0 +1,37 @@
+==============
+Userspace LEDs
+==============
+
+The uleds driver supports userspace LEDs. This can be useful for testing
+triggers and can also be used to implement virtual LEDs.
+
+
+Usage
+=====
+
+When the driver is loaded, a character device is created at /dev/uleds. To
+create a new LED class device, open /dev/uleds and write a uleds_user_dev
+structure to it (found in kernel public header file linux/uleds.h)::
+
+    #define LED_MAX_NAME_SIZE 64
+
+    struct uleds_user_dev {
+	char name[LED_MAX_NAME_SIZE];
+    };
+
+A new LED class device will be created with the name given. The name can be
+any valid sysfs device node name, but consider using the LED class naming
+convention of "devicename:color:function".
+
+The current brightness is found by reading a single byte from the character
+device. Values are unsigned: 0 to 255. Reading will block until the brightness
+changes. The device node can also be polled to notify when the brightness value
+changes.
+
+The LED class device will be removed when the open file handle to /dev/uleds
+is closed.
+
+Multiple LED class devices are created by opening additional file handles to
+/dev/uleds.
+
+See tools/leds/uledmon.c for an example userspace program.
diff --git a/Documentation/leds/uleds.txt b/Documentation/leds/uleds.txt
deleted file mode 100644
index 13e375a580f9..000000000000
--- a/Documentation/leds/uleds.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-Userspace LEDs
-==============
-
-The uleds driver supports userspace LEDs. This can be useful for testing
-triggers and can also be used to implement virtual LEDs.
-
-
-Usage
-=====
-
-When the driver is loaded, a character device is created at /dev/uleds. To
-create a new LED class device, open /dev/uleds and write a uleds_user_dev
-structure to it (found in kernel public header file linux/uleds.h).
-
-    #define LED_MAX_NAME_SIZE 64
-
-    struct uleds_user_dev {
-        char name[LED_MAX_NAME_SIZE];
-    };
-
-A new LED class device will be created with the name given. The name can be
-any valid sysfs device node name, but consider using the LED class naming
-convention of "devicename:color:function".
-
-The current brightness is found by reading a single byte from the character
-device. Values are unsigned: 0 to 255. Reading will block until the brightness
-changes. The device node can also be polled to notify when the brightness value
-changes.
-
-The LED class device will be removed when the open file handle to /dev/uleds
-is closed.
-
-Multiple LED class devices are created by opening additional file handles to
-/dev/uleds.
-
-See tools/leds/uledmon.c for an example userspace program.
diff --git a/Documentation/livepatch/index.rst b/Documentation/livepatch/index.rst
index edd291d51847..17674a9e21b2 100644
--- a/Documentation/livepatch/index.rst
+++ b/Documentation/livepatch/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
 
 ===================
 Kernel Livepatching
diff --git a/Documentation/locking/index.rst b/Documentation/locking/index.rst
new file mode 100644
index 000000000000..626a463f7e42
--- /dev/null
+++ b/Documentation/locking/index.rst
@@ -0,0 +1,24 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======
+locking
+=======
+
+.. toctree::
+    :maxdepth: 1
+
+    lockdep-design
+    lockstat
+    locktorture
+    mutex-design
+    rt-mutex-design
+    rt-mutex
+    spinlocks
+    ww-mutex-design
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/locking/lockdep-design.rst b/Documentation/locking/lockdep-design.rst
new file mode 100644
index 000000000000..23fcbc4d3fc0
--- /dev/null
+++ b/Documentation/locking/lockdep-design.rst
@@ -0,0 +1,394 @@
+Runtime locking correctness validator
+=====================================
+
+started by Ingo Molnar <mingo@redhat.com>
+
+additions by Arjan van de Ven <arjan@linux.intel.com>
+
+Lock-class
+----------
+
+The basic object the validator operates upon is a 'class' of locks.
+
+A class of locks is a group of locks that are logically the same with
+respect to locking rules, even if the locks may have multiple (possibly
+tens of thousands of) instantiations. For example a lock in the inode
+struct is one class, while each inode has its own instantiation of that
+lock class.
+
+The validator tracks the 'usage state' of lock-classes, and it tracks
+the dependencies between different lock-classes. Lock usage indicates
+how a lock is used with regard to its IRQ contexts, while lock
+dependency can be understood as lock order, where L1 -> L2 suggests that
+a task is attempting to acquire L2 while holding L1. From lockdep's
+perspective, the two locks (L1 and L2) are not necessarily related; that
+dependency just means the order ever happened. The validator maintains a
+continuing effort to prove lock usages and dependencies are correct or
+the validator will shoot a splat if incorrect.
+
+A lock-class's behavior is constructed by its instances collectively:
+when the first instance of a lock-class is used after bootup the class
+gets registered, then all (subsequent) instances will be mapped to the
+class and hence their usages and dependecies will contribute to those of
+the class. A lock-class does not go away when a lock instance does, but
+it can be removed if the memory space of the lock class (static or
+dynamic) is reclaimed, this happens for example when a module is
+unloaded or a workqueue is destroyed.
+
+State
+-----
+
+The validator tracks lock-class usage history and divides the usage into
+(4 usages * n STATEs + 1) categories:
+
+where the 4 usages can be:
+- 'ever held in STATE context'
+- 'ever held as readlock in STATE context'
+- 'ever held with STATE enabled'
+- 'ever held as readlock with STATE enabled'
+
+where the n STATEs are coded in kernel/locking/lockdep_states.h and as of
+now they include:
+- hardirq
+- softirq
+
+where the last 1 category is:
+- 'ever used'                                       [ == !unused        ]
+
+When locking rules are violated, these usage bits are presented in the
+locking error messages, inside curlies, with a total of 2 * n STATEs bits.
+A contrived example::
+
+   modprobe/2287 is trying to acquire lock:
+    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
+
+   but task is already holding lock:
+    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
+
+
+For a given lock, the bit positions from left to right indicate the usage
+of the lock and readlock (if exists), for each of the n STATEs listed
+above respectively, and the character displayed at each bit position
+indicates:
+
+   ===  ===================================================
+   '.'  acquired while irqs disabled and not in irq context
+   '-'  acquired in irq context
+   '+'  acquired with irqs enabled
+   '?'  acquired in irq context with irqs enabled.
+   ===  ===================================================
+
+The bits are illustrated with an example::
+
+    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
+                         ||||
+                         ||| \-> softirq disabled and not in softirq context
+                         || \--> acquired in softirq context
+                         | \---> hardirq disabled and not in hardirq context
+                          \----> acquired in hardirq context
+
+
+For a given STATE, whether the lock is ever acquired in that STATE
+context and whether that STATE is enabled yields four possible cases as
+shown in the table below. The bit character is able to indicate which
+exact case is for the lock as of the reporting time.
+
+  +--------------+-------------+--------------+
+  |              | irq enabled | irq disabled |
+  +--------------+-------------+--------------+
+  | ever in irq  |      ?      |       -      |
+  +--------------+-------------+--------------+
+  | never in irq |      +      |       .      |
+  +--------------+-------------+--------------+
+
+The character '-' suggests irq is disabled because if otherwise the
+charactor '?' would have been shown instead. Similar deduction can be
+applied for '+' too.
+
+Unused locks (e.g., mutexes) cannot be part of the cause of an error.
+
+
+Single-lock state rules:
+------------------------
+
+A lock is irq-safe means it was ever used in an irq context, while a lock
+is irq-unsafe means it was ever acquired with irq enabled.
+
+A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The
+following states must be exclusive: only one of them is allowed to be set
+for any lock-class based on its usage::
+
+ <hardirq-safe> or <hardirq-unsafe>
+ <softirq-safe> or <softirq-unsafe>
+
+This is because if a lock can be used in irq context (irq-safe) then it
+cannot be ever acquired with irq enabled (irq-unsafe). Otherwise, a
+deadlock may happen. For example, in the scenario that after this lock
+was acquired but before released, if the context is interrupted this
+lock will be attempted to acquire twice, which creates a deadlock,
+referred to as lock recursion deadlock.
+
+The validator detects and reports lock usage that violates these
+single-lock state rules.
+
+Multi-lock dependency rules:
+----------------------------
+
+The same lock-class must not be acquired twice, because this could lead
+to lock recursion deadlocks.
+
+Furthermore, two locks can not be taken in inverse order::
+
+ <L1> -> <L2>
+ <L2> -> <L1>
+
+because this could lead to a deadlock - referred to as lock inversion
+deadlock - as attempts to acquire the two locks form a circle which
+could lead to the two contexts waiting for each other permanently. The
+validator will find such dependency circle in arbitrary complexity,
+i.e., there can be any other locking sequence between the acquire-lock
+operations; the validator will still find whether these locks can be
+acquired in a circular fashion.
+
+Furthermore, the following usage based lock dependencies are not allowed
+between any two lock-classes::
+
+   <hardirq-safe>   ->  <hardirq-unsafe>
+   <softirq-safe>   ->  <softirq-unsafe>
+
+The first rule comes from the fact that a hardirq-safe lock could be
+taken by a hardirq context, interrupting a hardirq-unsafe lock - and
+thus could result in a lock inversion deadlock. Likewise, a softirq-safe
+lock could be taken by an softirq context, interrupting a softirq-unsafe
+lock.
+
+The above rules are enforced for any locking sequence that occurs in the
+kernel: when acquiring a new lock, the validator checks whether there is
+any rule violation between the new lock and any of the held locks.
+
+When a lock-class changes its state, the following aspects of the above
+dependency rules are enforced:
+
+- if a new hardirq-safe lock is discovered, we check whether it
+  took any hardirq-unsafe lock in the past.
+
+- if a new softirq-safe lock is discovered, we check whether it took
+  any softirq-unsafe lock in the past.
+
+- if a new hardirq-unsafe lock is discovered, we check whether any
+  hardirq-safe lock took it in the past.
+
+- if a new softirq-unsafe lock is discovered, we check whether any
+  softirq-safe lock took it in the past.
+
+(Again, we do these checks too on the basis that an interrupt context
+could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which
+could lead to a lock inversion deadlock - even if that lock scenario did
+not trigger in practice yet.)
+
+Exception: Nested data dependencies leading to nested locking
+-------------------------------------------------------------
+
+There are a few cases where the Linux kernel acquires more than one
+instance of the same lock-class. Such cases typically happen when there
+is some sort of hierarchy within objects of the same type. In these
+cases there is an inherent "natural" ordering between the two objects
+(defined by the properties of the hierarchy), and the kernel grabs the
+locks in this fixed order on each of the objects.
+
+An example of such an object hierarchy that results in "nested locking"
+is that of a "whole disk" block-dev object and a "partition" block-dev
+object; the partition is "part of" the whole device and as long as one
+always takes the whole disk lock as a higher lock than the partition
+lock, the lock ordering is fully correct. The validator does not
+automatically detect this natural ordering, as the locking rule behind
+the ordering is not static.
+
+In order to teach the validator about this correct usage model, new
+versions of the various locking primitives were added that allow you to
+specify a "nesting level". An example call, for the block device mutex,
+looks like this::
+
+  enum bdev_bd_mutex_lock_class
+  {
+       BD_MUTEX_NORMAL,
+       BD_MUTEX_WHOLE,
+       BD_MUTEX_PARTITION
+  };
+
+mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION);
+
+In this case the locking is done on a bdev object that is known to be a
+partition.
+
+The validator treats a lock that is taken in such a nested fashion as a
+separate (sub)class for the purposes of validation.
+
+Note: When changing code to use the _nested() primitives, be careful and
+check really thoroughly that the hierarchy is correctly mapped; otherwise
+you can get false positives or false negatives.
+
+Annotations
+-----------
+
+Two constructs can be used to annotate and check where and if certain locks
+must be held: lockdep_assert_held*(&lock) and lockdep_*pin_lock(&lock).
+
+As the name suggests, lockdep_assert_held* family of macros assert that a
+particular lock is held at a certain time (and generate a WARN() otherwise).
+This annotation is largely used all over the kernel, e.g. kernel/sched/
+core.c::
+
+  void update_rq_clock(struct rq *rq)
+  {
+	s64 delta;
+
+	lockdep_assert_held(&rq->lock);
+	[...]
+  }
+
+where holding rq->lock is required to safely update a rq's clock.
+
+The other family of macros is lockdep_*pin_lock(), which is admittedly only
+used for rq->lock ATM. Despite their limited adoption these annotations
+generate a WARN() if the lock of interest is "accidentally" unlocked. This turns
+out to be especially helpful to debug code with callbacks, where an upper
+layer assumes a lock remains taken, but a lower layer thinks it can maybe drop
+and reacquire the lock ("unwittingly" introducing races). lockdep_pin_lock()
+returns a 'struct pin_cookie' that is then used by lockdep_unpin_lock() to check
+that nobody tampered with the lock, e.g. kernel/sched/sched.h::
+
+  static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
+  {
+	rf->cookie = lockdep_pin_lock(&rq->lock);
+	[...]
+  }
+
+  static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
+  {
+	[...]
+	lockdep_unpin_lock(&rq->lock, rf->cookie);
+  }
+
+While comments about locking requirements might provide useful information,
+the runtime checks performed by annotations are invaluable when debugging
+locking problems and they carry the same level of details when inspecting
+code.  Always prefer annotations when in doubt!
+
+Proof of 100% correctness:
+--------------------------
+
+The validator achieves perfect, mathematical 'closure' (proof of locking
+correctness) in the sense that for every simple, standalone single-task
+locking sequence that occurred at least once during the lifetime of the
+kernel, the validator proves it with a 100% certainty that no
+combination and timing of these locking sequences can cause any class of
+lock related deadlock. [1]_
+
+I.e. complex multi-CPU and multi-task locking scenarios do not have to
+occur in practice to prove a deadlock: only the simple 'component'
+locking chains have to occur at least once (anytime, in any
+task/context) for the validator to be able to prove correctness. (For
+example, complex deadlocks that would normally need more than 3 CPUs and
+a very unlikely constellation of tasks, irq-contexts and timings to
+occur, can be detected on a plain, lightly loaded single-CPU system as
+well!)
+
+This radically decreases the complexity of locking related QA of the
+kernel: what has to be done during QA is to trigger as many "simple"
+single-task locking dependencies in the kernel as possible, at least
+once, to prove locking correctness - instead of having to trigger every
+possible combination of locking interaction between CPUs, combined with
+every possible hardirq and softirq nesting scenario (which is impossible
+to do in practice).
+
+.. [1]
+
+    assuming that the validator itself is 100% correct, and no other
+    part of the system corrupts the state of the validator in any way.
+    We also assume that all NMI/SMM paths [which could interrupt
+    even hardirq-disabled codepaths] are correct and do not interfere
+    with the validator. We also assume that the 64-bit 'chain hash'
+    value is unique for every lock-chain in the system. Also, lock
+    recursion must not be higher than 20.
+
+Performance:
+------------
+
+The above rules require **massive** amounts of runtime checking. If we did
+that for every lock taken and for every irqs-enable event, it would
+render the system practically unusably slow. The complexity of checking
+is O(N^2), so even with just a few hundred lock-classes we'd have to do
+tens of thousands of checks for every event.
+
+This problem is solved by checking any given 'locking scenario' (unique
+sequence of locks taken after each other) only once. A simple stack of
+held locks is maintained, and a lightweight 64-bit hash value is
+calculated, which hash is unique for every lock chain. The hash value,
+when the chain is validated for the first time, is then put into a hash
+table, which hash-table can be checked in a lockfree manner. If the
+locking chain occurs again later on, the hash table tells us that we
+don't have to validate the chain again.
+
+Troubleshooting:
+----------------
+
+The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes.
+Exceeding this number will trigger the following lockdep warning:
+
+	(DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+
+By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical
+desktop systems have less than 1,000 lock classes, so this warning
+normally results from lock-class leakage or failure to properly
+initialize locks.  These two problems are illustrated below:
+
+1.	Repeated module loading and unloading while running the validator
+	will result in lock-class leakage.  The issue here is that each
+	load of the module will create a new set of lock classes for
+	that module's locks, but module unloading does not remove old
+	classes (see below discussion of reuse of lock classes for why).
+	Therefore, if that module is loaded and unloaded repeatedly,
+	the number of lock classes will eventually reach the maximum.
+
+2.	Using structures such as arrays that have large numbers of
+	locks that are not explicitly initialized.  For example,
+	a hash table with 8192 buckets where each bucket has its own
+	spinlock_t will consume 8192 lock classes -unless- each spinlock
+	is explicitly initialized at runtime, for example, using the
+	run-time spin_lock_init() as opposed to compile-time initializers
+	such as __SPIN_LOCK_UNLOCKED().  Failure to properly initialize
+	the per-bucket spinlocks would guarantee lock-class overflow.
+	In contrast, a loop that called spin_lock_init() on each lock
+	would place all 8192 locks into a single lock class.
+
+	The moral of this story is that you should always explicitly
+	initialize your locks.
+
+One might argue that the validator should be modified to allow
+lock classes to be reused.  However, if you are tempted to make this
+argument, first review the code and think through the changes that would
+be required, keeping in mind that the lock classes to be removed are
+likely to be linked into the lock-dependency graph.  This turns out to
+be harder to do than to say.
+
+Of course, if you do run out of lock classes, the next thing to do is
+to find the offending lock classes.  First, the following command gives
+you the number of lock classes currently in use along with the maximum::
+
+	grep "lock-classes" /proc/lockdep_stats
+
+This command produces the following output on a modest system::
+
+	lock-classes:                          748 [max: 8191]
+
+If the number allocated (748 above) increases continually over time,
+then there is likely a leak.  The following command can be used to
+identify the leaking lock classes::
+
+	grep "BD" /proc/lockdep
+
+Run the command and save the output, then compare against the output from
+a later run of this command to identify the leakers.  This same output
+can also help you find situations where runtime lock initialization has
+been omitted.
diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt
deleted file mode 100644
index 39fae143c9cb..000000000000
--- a/Documentation/locking/lockdep-design.txt
+++ /dev/null
@@ -1,333 +0,0 @@
-Runtime locking correctness validator
-=====================================
-
-started by Ingo Molnar <mingo@redhat.com>
-additions by Arjan van de Ven <arjan@linux.intel.com>
-
-Lock-class
-----------
-
-The basic object the validator operates upon is a 'class' of locks.
-
-A class of locks is a group of locks that are logically the same with
-respect to locking rules, even if the locks may have multiple (possibly
-tens of thousands of) instantiations. For example a lock in the inode
-struct is one class, while each inode has its own instantiation of that
-lock class.
-
-The validator tracks the 'state' of lock-classes, and it tracks
-dependencies between different lock-classes. The validator maintains a
-rolling proof that the state and the dependencies are correct.
-
-Unlike an lock instantiation, the lock-class itself never goes away: when
-a lock-class is used for the first time after bootup it gets registered,
-and all subsequent uses of that lock-class will be attached to this
-lock-class.
-
-State
------
-
-The validator tracks lock-class usage history into 4 * nSTATEs + 1 separate
-state bits:
-
-- 'ever held in STATE context'
-- 'ever held as readlock in STATE context'
-- 'ever held with STATE enabled'
-- 'ever held as readlock with STATE enabled'
-
-Where STATE can be either one of (kernel/locking/lockdep_states.h)
- - hardirq
- - softirq
-
-- 'ever used'                                       [ == !unused        ]
-
-When locking rules are violated, these state bits are presented in the
-locking error messages, inside curlies. A contrived example:
-
-   modprobe/2287 is trying to acquire lock:
-    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
-
-   but task is already holding lock:
-    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
-
-
-The bit position indicates STATE, STATE-read, for each of the states listed
-above, and the character displayed in each indicates:
-
-   '.'  acquired while irqs disabled and not in irq context
-   '-'  acquired in irq context
-   '+'  acquired with irqs enabled
-   '?'  acquired in irq context with irqs enabled.
-
-Unused mutexes cannot be part of the cause of an error.
-
-
-Single-lock state rules:
-------------------------
-
-A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The
-following states are exclusive, and only one of them is allowed to be
-set for any lock-class:
-
- <hardirq-safe> and <hardirq-unsafe>
- <softirq-safe> and <softirq-unsafe>
-
-The validator detects and reports lock usage that violate these
-single-lock state rules.
-
-Multi-lock dependency rules:
-----------------------------
-
-The same lock-class must not be acquired twice, because this could lead
-to lock recursion deadlocks.
-
-Furthermore, two locks may not be taken in different order:
-
- <L1> -> <L2>
- <L2> -> <L1>
-
-because this could lead to lock inversion deadlocks. (The validator
-finds such dependencies in arbitrary complexity, i.e. there can be any
-other locking sequence between the acquire-lock operations, the
-validator will still track all dependencies between locks.)
-
-Furthermore, the following usage based lock dependencies are not allowed
-between any two lock-classes:
-
-   <hardirq-safe>   ->  <hardirq-unsafe>
-   <softirq-safe>   ->  <softirq-unsafe>
-
-The first rule comes from the fact that a hardirq-safe lock could be
-taken by a hardirq context, interrupting a hardirq-unsafe lock - and
-thus could result in a lock inversion deadlock. Likewise, a softirq-safe
-lock could be taken by an softirq context, interrupting a softirq-unsafe
-lock.
-
-The above rules are enforced for any locking sequence that occurs in the
-kernel: when acquiring a new lock, the validator checks whether there is
-any rule violation between the new lock and any of the held locks.
-
-When a lock-class changes its state, the following aspects of the above
-dependency rules are enforced:
-
-- if a new hardirq-safe lock is discovered, we check whether it
-  took any hardirq-unsafe lock in the past.
-
-- if a new softirq-safe lock is discovered, we check whether it took
-  any softirq-unsafe lock in the past.
-
-- if a new hardirq-unsafe lock is discovered, we check whether any
-  hardirq-safe lock took it in the past.
-
-- if a new softirq-unsafe lock is discovered, we check whether any
-  softirq-safe lock took it in the past.
-
-(Again, we do these checks too on the basis that an interrupt context
-could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which
-could lead to a lock inversion deadlock - even if that lock scenario did
-not trigger in practice yet.)
-
-Exception: Nested data dependencies leading to nested locking
--------------------------------------------------------------
-
-There are a few cases where the Linux kernel acquires more than one
-instance of the same lock-class. Such cases typically happen when there
-is some sort of hierarchy within objects of the same type. In these
-cases there is an inherent "natural" ordering between the two objects
-(defined by the properties of the hierarchy), and the kernel grabs the
-locks in this fixed order on each of the objects.
-
-An example of such an object hierarchy that results in "nested locking"
-is that of a "whole disk" block-dev object and a "partition" block-dev
-object; the partition is "part of" the whole device and as long as one
-always takes the whole disk lock as a higher lock than the partition
-lock, the lock ordering is fully correct. The validator does not
-automatically detect this natural ordering, as the locking rule behind
-the ordering is not static.
-
-In order to teach the validator about this correct usage model, new
-versions of the various locking primitives were added that allow you to
-specify a "nesting level". An example call, for the block device mutex,
-looks like this:
-
-enum bdev_bd_mutex_lock_class
-{
-       BD_MUTEX_NORMAL,
-       BD_MUTEX_WHOLE,
-       BD_MUTEX_PARTITION
-};
-
- mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION);
-
-In this case the locking is done on a bdev object that is known to be a
-partition.
-
-The validator treats a lock that is taken in such a nested fashion as a
-separate (sub)class for the purposes of validation.
-
-Note: When changing code to use the _nested() primitives, be careful and
-check really thoroughly that the hierarchy is correctly mapped; otherwise
-you can get false positives or false negatives.
-
-Annotations
------------
-
-Two constructs can be used to annotate and check where and if certain locks
-must be held: lockdep_assert_held*(&lock) and lockdep_*pin_lock(&lock).
-
-As the name suggests, lockdep_assert_held* family of macros assert that a
-particular lock is held at a certain time (and generate a WARN() otherwise).
-This annotation is largely used all over the kernel, e.g. kernel/sched/
-core.c
-
-  void update_rq_clock(struct rq *rq)
-  {
-	s64 delta;
-
-	lockdep_assert_held(&rq->lock);
-	[...]
-  }
-
-where holding rq->lock is required to safely update a rq's clock.
-
-The other family of macros is lockdep_*pin_lock(), which is admittedly only
-used for rq->lock ATM. Despite their limited adoption these annotations
-generate a WARN() if the lock of interest is "accidentally" unlocked. This turns
-out to be especially helpful to debug code with callbacks, where an upper
-layer assumes a lock remains taken, but a lower layer thinks it can maybe drop
-and reacquire the lock ("unwittingly" introducing races). lockdep_pin_lock()
-returns a 'struct pin_cookie' that is then used by lockdep_unpin_lock() to check
-that nobody tampered with the lock, e.g. kernel/sched/sched.h
-
-  static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
-  {
-	rf->cookie = lockdep_pin_lock(&rq->lock);
-	[...]
-  }
-
-  static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
-  {
-	[...]
-	lockdep_unpin_lock(&rq->lock, rf->cookie);
-  }
-
-While comments about locking requirements might provide useful information,
-the runtime checks performed by annotations are invaluable when debugging
-locking problems and they carry the same level of details when inspecting
-code.  Always prefer annotations when in doubt!
-
-Proof of 100% correctness:
---------------------------
-
-The validator achieves perfect, mathematical 'closure' (proof of locking
-correctness) in the sense that for every simple, standalone single-task
-locking sequence that occurred at least once during the lifetime of the
-kernel, the validator proves it with a 100% certainty that no
-combination and timing of these locking sequences can cause any class of
-lock related deadlock. [*]
-
-I.e. complex multi-CPU and multi-task locking scenarios do not have to
-occur in practice to prove a deadlock: only the simple 'component'
-locking chains have to occur at least once (anytime, in any
-task/context) for the validator to be able to prove correctness. (For
-example, complex deadlocks that would normally need more than 3 CPUs and
-a very unlikely constellation of tasks, irq-contexts and timings to
-occur, can be detected on a plain, lightly loaded single-CPU system as
-well!)
-
-This radically decreases the complexity of locking related QA of the
-kernel: what has to be done during QA is to trigger as many "simple"
-single-task locking dependencies in the kernel as possible, at least
-once, to prove locking correctness - instead of having to trigger every
-possible combination of locking interaction between CPUs, combined with
-every possible hardirq and softirq nesting scenario (which is impossible
-to do in practice).
-
-[*] assuming that the validator itself is 100% correct, and no other
-    part of the system corrupts the state of the validator in any way.
-    We also assume that all NMI/SMM paths [which could interrupt
-    even hardirq-disabled codepaths] are correct and do not interfere
-    with the validator. We also assume that the 64-bit 'chain hash'
-    value is unique for every lock-chain in the system. Also, lock
-    recursion must not be higher than 20.
-
-Performance:
-------------
-
-The above rules require _massive_ amounts of runtime checking. If we did
-that for every lock taken and for every irqs-enable event, it would
-render the system practically unusably slow. The complexity of checking
-is O(N^2), so even with just a few hundred lock-classes we'd have to do
-tens of thousands of checks for every event.
-
-This problem is solved by checking any given 'locking scenario' (unique
-sequence of locks taken after each other) only once. A simple stack of
-held locks is maintained, and a lightweight 64-bit hash value is
-calculated, which hash is unique for every lock chain. The hash value,
-when the chain is validated for the first time, is then put into a hash
-table, which hash-table can be checked in a lockfree manner. If the
-locking chain occurs again later on, the hash table tells us that we
-don't have to validate the chain again.
-
-Troubleshooting:
-----------------
-
-The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes.
-Exceeding this number will trigger the following lockdep warning:
-
-	(DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
-
-By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical
-desktop systems have less than 1,000 lock classes, so this warning
-normally results from lock-class leakage or failure to properly
-initialize locks.  These two problems are illustrated below:
-
-1.	Repeated module loading and unloading while running the validator
-	will result in lock-class leakage.  The issue here is that each
-	load of the module will create a new set of lock classes for
-	that module's locks, but module unloading does not remove old
-	classes (see below discussion of reuse of lock classes for why).
-	Therefore, if that module is loaded and unloaded repeatedly,
-	the number of lock classes will eventually reach the maximum.
-
-2.	Using structures such as arrays that have large numbers of
-	locks that are not explicitly initialized.  For example,
-	a hash table with 8192 buckets where each bucket has its own
-	spinlock_t will consume 8192 lock classes -unless- each spinlock
-	is explicitly initialized at runtime, for example, using the
-	run-time spin_lock_init() as opposed to compile-time initializers
-	such as __SPIN_LOCK_UNLOCKED().  Failure to properly initialize
-	the per-bucket spinlocks would guarantee lock-class overflow.
-	In contrast, a loop that called spin_lock_init() on each lock
-	would place all 8192 locks into a single lock class.
-
-	The moral of this story is that you should always explicitly
-	initialize your locks.
-
-One might argue that the validator should be modified to allow
-lock classes to be reused.  However, if you are tempted to make this
-argument, first review the code and think through the changes that would
-be required, keeping in mind that the lock classes to be removed are
-likely to be linked into the lock-dependency graph.  This turns out to
-be harder to do than to say.
-
-Of course, if you do run out of lock classes, the next thing to do is
-to find the offending lock classes.  First, the following command gives
-you the number of lock classes currently in use along with the maximum:
-
-	grep "lock-classes" /proc/lockdep_stats
-
-This command produces the following output on a modest system:
-
-	 lock-classes:                          748 [max: 8191]
-
-If the number allocated (748 above) increases continually over time,
-then there is likely a leak.  The following command can be used to
-identify the leaking lock classes:
-
-	grep "BD" /proc/lockdep
-
-Run the command and save the output, then compare against the output from
-a later run of this command to identify the leakers.  This same output
-can also help you find situations where runtime lock initialization has
-been omitted.
diff --git a/Documentation/locking/lockstat.rst b/Documentation/locking/lockstat.rst
new file mode 100644
index 000000000000..536eab8dbd99
--- /dev/null
+++ b/Documentation/locking/lockstat.rst
@@ -0,0 +1,204 @@
+===============
+Lock Statistics
+===============
+
+What
+====
+
+As the name suggests, it provides statistics on locks.
+
+
+Why
+===
+
+Because things like lock contention can severely impact performance.
+
+How
+===
+
+Lockdep already has hooks in the lock functions and maps lock instances to
+lock classes. We build on that (see Documentation/locking/lockdep-design.rst).
+The graph below shows the relation between the lock functions and the various
+hooks therein::
+
+        __acquire
+            |
+           lock _____
+            |        \
+            |    __contended
+            |         |
+            |       <wait>
+            | _______/
+            |/
+            |
+       __acquired
+            |
+            .
+          <hold>
+            .
+            |
+       __release
+            |
+         unlock
+
+  lock, unlock	- the regular lock functions
+  __*		- the hooks
+  <> 		- states
+
+With these hooks we provide the following statistics:
+
+ con-bounces
+	- number of lock contention that involved x-cpu data
+ contentions
+	- number of lock acquisitions that had to wait
+ wait time
+     min
+	- shortest (non-0) time we ever had to wait for a lock
+     max
+	- longest time we ever had to wait for a lock
+     total
+	- total time we spend waiting on this lock
+     avg
+	- average time spent waiting on this lock
+ acq-bounces
+	- number of lock acquisitions that involved x-cpu data
+ acquisitions
+	- number of times we took the lock
+ hold time
+     min
+	- shortest (non-0) time we ever held the lock
+     max
+	- longest time we ever held the lock
+     total
+	- total time this lock was held
+     avg
+	- average time this lock was held
+
+These numbers are gathered per lock class, per read/write state (when
+applicable).
+
+It also tracks 4 contention points per class. A contention point is a call site
+that had to wait on lock acquisition.
+
+Configuration
+-------------
+
+Lock statistics are enabled via CONFIG_LOCK_STAT.
+
+Usage
+-----
+
+Enable collection of statistics::
+
+	# echo 1 >/proc/sys/kernel/lock_stat
+
+Disable collection of statistics::
+
+	# echo 0 >/proc/sys/kernel/lock_stat
+
+Look at the current lock statistics::
+
+  ( line numbers not part of actual output, done for clarity in the explanation
+    below )
+
+  # less /proc/lock_stat
+
+  01 lock_stat version 0.4
+  02-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+  03                              class name    con-bounces    contentions   waittime-min   waittime-max waittime-total   waittime-avg    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total   holdtime-avg
+  04-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+  05
+  06                         &mm->mmap_sem-W:            46             84           0.26         939.10       16371.53         194.90          47291        2922365           0.16     2220301.69 17464026916.32        5975.99
+  07                         &mm->mmap_sem-R:            37            100           1.31      299502.61      325629.52        3256.30         212344       34316685           0.10        7744.91    95016910.20           2.77
+  08                         ---------------
+  09                           &mm->mmap_sem              1          [<ffffffff811502a7>] khugepaged_scan_mm_slot+0x57/0x280
+  10                           &mm->mmap_sem             96          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
+  11                           &mm->mmap_sem             34          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
+  12                           &mm->mmap_sem             17          [<ffffffff81127e71>] vm_munmap+0x41/0x80
+  13                         ---------------
+  14                           &mm->mmap_sem              1          [<ffffffff81046fda>] dup_mmap+0x2a/0x3f0
+  15                           &mm->mmap_sem             60          [<ffffffff81129e29>] SyS_mprotect+0xe9/0x250
+  16                           &mm->mmap_sem             41          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
+  17                           &mm->mmap_sem             68          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
+  18
+  19.............................................................................................................................................................................................................................
+  20
+  21                         unix_table_lock:           110            112           0.21          49.24         163.91           1.46          21094          66312           0.12         624.42       31589.81           0.48
+  22                         ---------------
+  23                         unix_table_lock             45          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
+  24                         unix_table_lock             47          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
+  25                         unix_table_lock             15          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
+  26                         unix_table_lock              5          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
+  27                         ---------------
+  28                         unix_table_lock             39          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
+  29                         unix_table_lock             49          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
+  30                         unix_table_lock             20          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
+  31                         unix_table_lock              4          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
+
+
+This excerpt shows the first two lock class statistics. Line 01 shows the
+output version - each time the format changes this will be updated. Line 02-04
+show the header with column descriptions. Lines 05-18 and 20-31 show the actual
+statistics. These statistics come in two parts; the actual stats separated by a
+short separator (line 08, 13) from the contention points.
+
+Lines 09-12 show the first 4 recorded contention points (the code
+which tries to get the lock) and lines 14-17 show the first 4 recorded
+contended points (the lock holder). It is possible that the max
+con-bounces point is missing in the statistics.
+
+The first lock (05-18) is a read/write lock, and shows two lines above the
+short separator. The contention points don't match the column descriptors,
+they have two: contentions and [<IP>] symbol. The second set of contention
+points are the points we're contending with.
+
+The integer part of the time values is in us.
+
+Dealing with nested locks, subclasses may appear::
+
+  32...........................................................................................................................................................................................................................
+  33
+  34                               &rq->lock:       13128          13128           0.43         190.53      103881.26           7.91          97454        3453404           0.00         401.11    13224683.11           3.82
+  35                               ---------
+  36                               &rq->lock          645          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
+  37                               &rq->lock          297          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+  38                               &rq->lock          360          [<ffffffff8103c4c5>] select_task_rq_fair+0x1f0/0x74a
+  39                               &rq->lock          428          [<ffffffff81045f98>] scheduler_tick+0x46/0x1fb
+  40                               ---------
+  41                               &rq->lock           77          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
+  42                               &rq->lock          174          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+  43                               &rq->lock         4715          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
+  44                               &rq->lock          893          [<ffffffff81340524>] schedule+0x157/0x7b8
+  45
+  46...........................................................................................................................................................................................................................
+  47
+  48                             &rq->lock/1:        1526          11488           0.33         388.73      136294.31          11.86          21461          38404           0.00          37.93      109388.53           2.84
+  49                             -----------
+  50                             &rq->lock/1        11526          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
+  51                             -----------
+  52                             &rq->lock/1         5645          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
+  53                             &rq->lock/1         1224          [<ffffffff81340524>] schedule+0x157/0x7b8
+  54                             &rq->lock/1         4336          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
+  55                             &rq->lock/1          181          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+
+Line 48 shows statistics for the second subclass (/1) of &rq->lock class
+(subclass starts from 0), since in this case, as line 50 suggests,
+double_rq_lock actually acquires a nested lock of two spinlocks.
+
+View the top contending locks::
+
+  # grep : /proc/lock_stat | head
+			clockevents_lock:       2926159        2947636           0.15       46882.81  1784540466.34         605.41        3381345        3879161           0.00        2260.97    53178395.68          13.71
+		     tick_broadcast_lock:        346460         346717           0.18        2257.43    39364622.71         113.54        3642919        4242696           0.00        2263.79    49173646.60          11.59
+		  &mapping->i_mmap_mutex:        203896         203899           3.36      645530.05 31767507988.39      155800.21        3361776        8893984           0.17        2254.15    14110121.02           1.59
+			       &rq->lock:        135014         136909           0.18         606.09      842160.68           6.15        1540728       10436146           0.00         728.72    17606683.41           1.69
+	       &(&zone->lru_lock)->rlock:         93000          94934           0.16          59.18      188253.78           1.98        1199912        3809894           0.15         391.40     3559518.81           0.93
+			 tasklist_lock-W:         40667          41130           0.23        1189.42      428980.51          10.43         270278         510106           0.16         653.51     3939674.91           7.72
+			 tasklist_lock-R:         21298          21305           0.20        1310.05      215511.12          10.12         186204         241258           0.14        1162.33     1179779.23           4.89
+			      rcu_node_1:         47656          49022           0.16         635.41      193616.41           3.95         844888        1865423           0.00         764.26     1656226.96           0.89
+       &(&dentry->d_lockref.lock)->rlock:         39791          40179           0.15        1302.08       88851.96           2.21        2790851       12527025           0.10        1910.75     3379714.27           0.27
+			      rcu_node_0:         29203          30064           0.16         786.55     1555573.00          51.74          88963         244254           0.00         398.87      428872.51           1.76
+
+Clear the statistics::
+
+  # echo 0 > /proc/lock_stat
diff --git a/Documentation/locking/lockstat.txt b/Documentation/locking/lockstat.txt
deleted file mode 100644
index fdbeb0c45ef3..000000000000
--- a/Documentation/locking/lockstat.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-
-LOCK STATISTICS
-
-- WHAT
-
-As the name suggests, it provides statistics on locks.
-
-- WHY
-
-Because things like lock contention can severely impact performance.
-
-- HOW
-
-Lockdep already has hooks in the lock functions and maps lock instances to
-lock classes. We build on that (see Documentation/locking/lockdep-design.txt).
-The graph below shows the relation between the lock functions and the various
-hooks therein.
-
-        __acquire
-            |
-           lock _____
-            |        \
-            |    __contended
-            |         |
-            |       <wait>
-            | _______/
-            |/
-            |
-       __acquired
-            |
-            .
-          <hold>
-            .
-            |
-       __release
-            |
-         unlock
-
-lock, unlock	- the regular lock functions
-__*		- the hooks
-<> 		- states
-
-With these hooks we provide the following statistics:
-
- con-bounces       - number of lock contention that involved x-cpu data
- contentions       - number of lock acquisitions that had to wait
- wait time min     - shortest (non-0) time we ever had to wait for a lock
-           max     - longest time we ever had to wait for a lock
-	   total   - total time we spend waiting on this lock
-	   avg     - average time spent waiting on this lock
- acq-bounces       - number of lock acquisitions that involved x-cpu data
- acquisitions      - number of times we took the lock
- hold time min     - shortest (non-0) time we ever held the lock
-	   max     - longest time we ever held the lock
-	   total   - total time this lock was held
-	   avg     - average time this lock was held
-
-These numbers are gathered per lock class, per read/write state (when
-applicable).
-
-It also tracks 4 contention points per class. A contention point is a call site
-that had to wait on lock acquisition.
-
- - CONFIGURATION
-
-Lock statistics are enabled via CONFIG_LOCK_STAT.
-
- - USAGE
-
-Enable collection of statistics:
-
-# echo 1 >/proc/sys/kernel/lock_stat
-
-Disable collection of statistics:
-
-# echo 0 >/proc/sys/kernel/lock_stat
-
-Look at the current lock statistics:
-
-( line numbers not part of actual output, done for clarity in the explanation
-  below )
-
-# less /proc/lock_stat
-
-01 lock_stat version 0.4
-02-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-03                              class name    con-bounces    contentions   waittime-min   waittime-max waittime-total   waittime-avg    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total   holdtime-avg
-04-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-05
-06                         &mm->mmap_sem-W:            46             84           0.26         939.10       16371.53         194.90          47291        2922365           0.16     2220301.69 17464026916.32        5975.99
-07                         &mm->mmap_sem-R:            37            100           1.31      299502.61      325629.52        3256.30         212344       34316685           0.10        7744.91    95016910.20           2.77
-08                         ---------------
-09                           &mm->mmap_sem              1          [<ffffffff811502a7>] khugepaged_scan_mm_slot+0x57/0x280
-10                           &mm->mmap_sem             96          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
-11                           &mm->mmap_sem             34          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
-12                           &mm->mmap_sem             17          [<ffffffff81127e71>] vm_munmap+0x41/0x80
-13                         ---------------
-14                           &mm->mmap_sem              1          [<ffffffff81046fda>] dup_mmap+0x2a/0x3f0
-15                           &mm->mmap_sem             60          [<ffffffff81129e29>] SyS_mprotect+0xe9/0x250
-16                           &mm->mmap_sem             41          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
-17                           &mm->mmap_sem             68          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
-18
-19.............................................................................................................................................................................................................................
-20
-21                         unix_table_lock:           110            112           0.21          49.24         163.91           1.46          21094          66312           0.12         624.42       31589.81           0.48
-22                         ---------------
-23                         unix_table_lock             45          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
-24                         unix_table_lock             47          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
-25                         unix_table_lock             15          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
-26                         unix_table_lock              5          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
-27                         ---------------
-28                         unix_table_lock             39          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
-29                         unix_table_lock             49          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
-30                         unix_table_lock             20          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
-31                         unix_table_lock              4          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
-
-
-This excerpt shows the first two lock class statistics. Line 01 shows the
-output version - each time the format changes this will be updated. Line 02-04
-show the header with column descriptions. Lines 05-18 and 20-31 show the actual
-statistics. These statistics come in two parts; the actual stats separated by a
-short separator (line 08, 13) from the contention points.
-
-Lines 09-12 show the first 4 recorded contention points (the code
-which tries to get the lock) and lines 14-17 show the first 4 recorded
-contended points (the lock holder). It is possible that the max
-con-bounces point is missing in the statistics.
-
-The first lock (05-18) is a read/write lock, and shows two lines above the
-short separator. The contention points don't match the column descriptors,
-they have two: contentions and [<IP>] symbol. The second set of contention
-points are the points we're contending with.
-
-The integer part of the time values is in us.
-
-Dealing with nested locks, subclasses may appear:
-
-32...........................................................................................................................................................................................................................
-33
-34                               &rq->lock:       13128          13128           0.43         190.53      103881.26           7.91          97454        3453404           0.00         401.11    13224683.11           3.82
-35                               ---------
-36                               &rq->lock          645          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
-37                               &rq->lock          297          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-38                               &rq->lock          360          [<ffffffff8103c4c5>] select_task_rq_fair+0x1f0/0x74a
-39                               &rq->lock          428          [<ffffffff81045f98>] scheduler_tick+0x46/0x1fb
-40                               ---------
-41                               &rq->lock           77          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
-42                               &rq->lock          174          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-43                               &rq->lock         4715          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
-44                               &rq->lock          893          [<ffffffff81340524>] schedule+0x157/0x7b8
-45
-46...........................................................................................................................................................................................................................
-47
-48                             &rq->lock/1:        1526          11488           0.33         388.73      136294.31          11.86          21461          38404           0.00          37.93      109388.53           2.84
-49                             -----------
-50                             &rq->lock/1        11526          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
-51                             -----------
-52                             &rq->lock/1         5645          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
-53                             &rq->lock/1         1224          [<ffffffff81340524>] schedule+0x157/0x7b8
-54                             &rq->lock/1         4336          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
-55                             &rq->lock/1          181          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-
-Line 48 shows statistics for the second subclass (/1) of &rq->lock class
-(subclass starts from 0), since in this case, as line 50 suggests,
-double_rq_lock actually acquires a nested lock of two spinlocks.
-
-View the top contending locks:
-
-# grep : /proc/lock_stat | head
-			clockevents_lock:       2926159        2947636           0.15       46882.81  1784540466.34         605.41        3381345        3879161           0.00        2260.97    53178395.68          13.71
-		     tick_broadcast_lock:        346460         346717           0.18        2257.43    39364622.71         113.54        3642919        4242696           0.00        2263.79    49173646.60          11.59
-		  &mapping->i_mmap_mutex:        203896         203899           3.36      645530.05 31767507988.39      155800.21        3361776        8893984           0.17        2254.15    14110121.02           1.59
-			       &rq->lock:        135014         136909           0.18         606.09      842160.68           6.15        1540728       10436146           0.00         728.72    17606683.41           1.69
-	       &(&zone->lru_lock)->rlock:         93000          94934           0.16          59.18      188253.78           1.98        1199912        3809894           0.15         391.40     3559518.81           0.93
-			 tasklist_lock-W:         40667          41130           0.23        1189.42      428980.51          10.43         270278         510106           0.16         653.51     3939674.91           7.72
-			 tasklist_lock-R:         21298          21305           0.20        1310.05      215511.12          10.12         186204         241258           0.14        1162.33     1179779.23           4.89
-			      rcu_node_1:         47656          49022           0.16         635.41      193616.41           3.95         844888        1865423           0.00         764.26     1656226.96           0.89
-       &(&dentry->d_lockref.lock)->rlock:         39791          40179           0.15        1302.08       88851.96           2.21        2790851       12527025           0.10        1910.75     3379714.27           0.27
-			      rcu_node_0:         29203          30064           0.16         786.55     1555573.00          51.74          88963         244254           0.00         398.87      428872.51           1.76
-
-Clear the statistics:
-
-# echo 0 > /proc/lock_stat
diff --git a/Documentation/locking/locktorture.rst b/Documentation/locking/locktorture.rst
new file mode 100644
index 000000000000..e79eeeca3ac6
--- /dev/null
+++ b/Documentation/locking/locktorture.rst
@@ -0,0 +1,170 @@
+==================================
+Kernel Lock Torture Test Operation
+==================================
+
+CONFIG_LOCK_TORTURE_TEST
+========================
+
+The CONFIG LOCK_TORTURE_TEST config option provides a kernel module
+that runs torture tests on core kernel locking primitives. The kernel
+module, 'locktorture', may be built after the fact on the running
+kernel to be tested, if desired. The tests periodically output status
+messages via printk(), which can be examined via the dmesg (perhaps
+grepping for "torture").  The test is started when the module is loaded,
+and stops when the module is unloaded. This program is based on how RCU
+is tortured, via rcutorture.
+
+This torture test consists of creating a number of kernel threads which
+acquire the lock and hold it for specific amount of time, thus simulating
+different critical region behaviors. The amount of contention on the lock
+can be simulated by either enlarging this critical region hold time and/or
+creating more kthreads.
+
+
+Module Parameters
+=================
+
+This module has the following parameters:
+
+
+Locktorture-specific
+--------------------
+
+nwriters_stress
+		  Number of kernel threads that will stress exclusive lock
+		  ownership (writers). The default value is twice the number
+		  of online CPUs.
+
+nreaders_stress
+		  Number of kernel threads that will stress shared lock
+		  ownership (readers). The default is the same amount of writer
+		  locks. If the user did not specify nwriters_stress, then
+		  both readers and writers be the amount of online CPUs.
+
+torture_type
+		  Type of lock to torture. By default, only spinlocks will
+		  be tortured. This module can torture the following locks,
+		  with string values as follows:
+
+		     - "lock_busted":
+				Simulates a buggy lock implementation.
+
+		     - "spin_lock":
+				spin_lock() and spin_unlock() pairs.
+
+		     - "spin_lock_irq":
+				spin_lock_irq() and spin_unlock_irq() pairs.
+
+		     - "rw_lock":
+				read/write lock() and unlock() rwlock pairs.
+
+		     - "rw_lock_irq":
+				read/write lock_irq() and unlock_irq()
+				rwlock pairs.
+
+		     - "mutex_lock":
+				mutex_lock() and mutex_unlock() pairs.
+
+		     - "rtmutex_lock":
+				rtmutex_lock() and rtmutex_unlock() pairs.
+				Kernel must have CONFIG_RT_MUTEX=y.
+
+		     - "rwsem_lock":
+				read/write down() and up() semaphore pairs.
+
+
+Torture-framework (RCU + locking)
+---------------------------------
+
+shutdown_secs
+		  The number of seconds to run the test before terminating
+		  the test and powering off the system.  The default is
+		  zero, which disables test termination and system shutdown.
+		  This capability is useful for automated testing.
+
+onoff_interval
+		  The number of seconds between each attempt to execute a
+		  randomly selected CPU-hotplug operation.  Defaults
+		  to zero, which disables CPU hotplugging.  In
+		  CONFIG_HOTPLUG_CPU=n kernels, locktorture will silently
+		  refuse to do any CPU-hotplug operations regardless of
+		  what value is specified for onoff_interval.
+
+onoff_holdoff
+		  The number of seconds to wait until starting CPU-hotplug
+		  operations.  This would normally only be used when
+		  locktorture was built into the kernel and started
+		  automatically at boot time, in which case it is useful
+		  in order to avoid confusing boot-time code with CPUs
+		  coming and going. This parameter is only useful if
+		  CONFIG_HOTPLUG_CPU is enabled.
+
+stat_interval
+		  Number of seconds between statistics-related printk()s.
+		  By default, locktorture will report stats every 60 seconds.
+		  Setting the interval to zero causes the statistics to
+		  be printed -only- when the module is unloaded, and this
+		  is the default.
+
+stutter
+		  The length of time to run the test before pausing for this
+		  same period of time.  Defaults to "stutter=5", so as
+		  to run and pause for (roughly) five-second intervals.
+		  Specifying "stutter=0" causes the test to run continuously
+		  without pausing, which is the old default behavior.
+
+shuffle_interval
+		  The number of seconds to keep the test threads affinitied
+		  to a particular subset of the CPUs, defaults to 3 seconds.
+		  Used in conjunction with test_no_idle_hz.
+
+verbose
+		  Enable verbose debugging printing, via printk(). Enabled
+		  by default. This extra information is mostly related to
+		  high-level errors and reports from the main 'torture'
+		  framework.
+
+
+Statistics
+==========
+
+Statistics are printed in the following format::
+
+  spin_lock-torture: Writes:  Total: 93746064  Max/Min: 0/0   Fail: 0
+     (A)		    (B)		   (C)		  (D)	       (E)
+
+  (A): Lock type that is being tortured -- torture_type parameter.
+
+  (B): Number of writer lock acquisitions. If dealing with a read/write
+       primitive a second "Reads" statistics line is printed.
+
+  (C): Number of times the lock was acquired.
+
+  (D): Min and max number of times threads failed to acquire the lock.
+
+  (E): true/false values if there were errors acquiring the lock. This should
+       -only- be positive if there is a bug in the locking primitive's
+       implementation. Otherwise a lock should never fail (i.e., spin_lock()).
+       Of course, the same applies for (C), above. A dummy example of this is
+       the "lock_busted" type.
+
+Usage
+=====
+
+The following script may be used to torture locks::
+
+	#!/bin/sh
+
+	modprobe locktorture
+	sleep 3600
+	rmmod locktorture
+	dmesg | grep torture:
+
+The output can be manually inspected for the error flag of "!!!".
+One could of course create a more elaborate script that automatically
+checked for such errors.  The "rmmod" command forces a "SUCCESS",
+"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed.  The first
+two are self-explanatory, while the last indicates that while there
+were no locking failures, CPU-hotplug problems were detected.
+
+Also see: Documentation/RCU/torture.txt
diff --git a/Documentation/locking/locktorture.txt b/Documentation/locking/locktorture.txt
deleted file mode 100644
index 6a8df4cd19bf..000000000000
--- a/Documentation/locking/locktorture.txt
+++ /dev/null
@@ -1,145 +0,0 @@
-Kernel Lock Torture Test Operation
-
-CONFIG_LOCK_TORTURE_TEST
-
-The CONFIG LOCK_TORTURE_TEST config option provides a kernel module
-that runs torture tests on core kernel locking primitives. The kernel
-module, 'locktorture', may be built after the fact on the running
-kernel to be tested, if desired. The tests periodically output status
-messages via printk(), which can be examined via the dmesg (perhaps
-grepping for "torture").  The test is started when the module is loaded,
-and stops when the module is unloaded. This program is based on how RCU
-is tortured, via rcutorture.
-
-This torture test consists of creating a number of kernel threads which
-acquire the lock and hold it for specific amount of time, thus simulating
-different critical region behaviors. The amount of contention on the lock
-can be simulated by either enlarging this critical region hold time and/or
-creating more kthreads.
-
-
-MODULE PARAMETERS
-
-This module has the following parameters:
-
-
-	    ** Locktorture-specific **
-
-nwriters_stress   Number of kernel threads that will stress exclusive lock
-		  ownership (writers). The default value is twice the number
-		  of online CPUs.
-
-nreaders_stress   Number of kernel threads that will stress shared lock
-		  ownership (readers). The default is the same amount of writer
-		  locks. If the user did not specify nwriters_stress, then
-		  both readers and writers be the amount of online CPUs.
-
-torture_type	  Type of lock to torture. By default, only spinlocks will
-		  be tortured. This module can torture the following locks,
-		  with string values as follows:
-
-		     o "lock_busted": Simulates a buggy lock implementation.
-
-		     o "spin_lock": spin_lock() and spin_unlock() pairs.
-
-		     o "spin_lock_irq": spin_lock_irq() and spin_unlock_irq()
-					pairs.
-
-		     o "rw_lock": read/write lock() and unlock() rwlock pairs.
-
-		     o "rw_lock_irq": read/write lock_irq() and unlock_irq()
-				      rwlock pairs.
-
-		     o "mutex_lock": mutex_lock() and mutex_unlock() pairs.
-
-		     o "rtmutex_lock": rtmutex_lock() and rtmutex_unlock()
-				       pairs. Kernel must have CONFIG_RT_MUTEX=y.
-
-		     o "rwsem_lock": read/write down() and up() semaphore pairs.
-
-
-	    ** Torture-framework (RCU + locking) **
-
-shutdown_secs	  The number of seconds to run the test before terminating
-		  the test and powering off the system.  The default is
-		  zero, which disables test termination and system shutdown.
-		  This capability is useful for automated testing.
-
-onoff_interval	  The number of seconds between each attempt to execute a
-		  randomly selected CPU-hotplug operation.  Defaults
-		  to zero, which disables CPU hotplugging.  In
-		  CONFIG_HOTPLUG_CPU=n kernels, locktorture will silently
-		  refuse to do any CPU-hotplug operations regardless of
-		  what value is specified for onoff_interval.
-
-onoff_holdoff	  The number of seconds to wait until starting CPU-hotplug
-		  operations.  This would normally only be used when
-		  locktorture was built into the kernel and started
-		  automatically at boot time, in which case it is useful
-		  in order to avoid confusing boot-time code with CPUs
-		  coming and going. This parameter is only useful if
-		  CONFIG_HOTPLUG_CPU is enabled.
-
-stat_interval	  Number of seconds between statistics-related printk()s.
-		  By default, locktorture will report stats every 60 seconds.
-		  Setting the interval to zero causes the statistics to
-		  be printed -only- when the module is unloaded, and this
-		  is the default.
-
-stutter		  The length of time to run the test before pausing for this
-		  same period of time.  Defaults to "stutter=5", so as
-		  to run and pause for (roughly) five-second intervals.
-		  Specifying "stutter=0" causes the test to run continuously
-		  without pausing, which is the old default behavior.
-
-shuffle_interval  The number of seconds to keep the test threads affinitied
-		  to a particular subset of the CPUs, defaults to 3 seconds.
-		  Used in conjunction with test_no_idle_hz.
-
-verbose		  Enable verbose debugging printing, via printk(). Enabled
-		  by default. This extra information is mostly related to
-		  high-level errors and reports from the main 'torture'
-		  framework.
-
-
-STATISTICS
-
-Statistics are printed in the following format:
-
-spin_lock-torture: Writes:  Total: 93746064  Max/Min: 0/0   Fail: 0
-   (A)		    (B)		   (C)		  (D)	       (E)
-
-(A): Lock type that is being tortured -- torture_type parameter.
-
-(B): Number of writer lock acquisitions. If dealing with a read/write primitive
-     a second "Reads" statistics line is printed.
-
-(C): Number of times the lock was acquired.
-
-(D): Min and max number of times threads failed to acquire the lock.
-
-(E): true/false values if there were errors acquiring the lock. This should
-     -only- be positive if there is a bug in the locking primitive's
-     implementation. Otherwise a lock should never fail (i.e., spin_lock()).
-     Of course, the same applies for (C), above. A dummy example of this is
-     the "lock_busted" type.
-
-USAGE
-
-The following script may be used to torture locks:
-
-	#!/bin/sh
-
-	modprobe locktorture
-	sleep 3600
-	rmmod locktorture
-	dmesg | grep torture:
-
-The output can be manually inspected for the error flag of "!!!".
-One could of course create a more elaborate script that automatically
-checked for such errors.  The "rmmod" command forces a "SUCCESS",
-"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed.  The first
-two are self-explanatory, while the last indicates that while there
-were no locking failures, CPU-hotplug problems were detected.
-
-Also see: Documentation/RCU/torture.txt
diff --git a/Documentation/locking/mutex-design.rst b/Documentation/locking/mutex-design.rst
new file mode 100644
index 000000000000..4d8236b81fa5
--- /dev/null
+++ b/Documentation/locking/mutex-design.rst
@@ -0,0 +1,152 @@
+=======================
+Generic Mutex Subsystem
+=======================
+
+started by Ingo Molnar <mingo@redhat.com>
+
+updated by Davidlohr Bueso <davidlohr@hp.com>
+
+What are mutexes?
+-----------------
+
+In the Linux kernel, mutexes refer to a particular locking primitive
+that enforces serialization on shared memory systems, and not only to
+the generic term referring to 'mutual exclusion' found in academia
+or similar theoretical text books. Mutexes are sleeping locks which
+behave similarly to binary semaphores, and were introduced in 2006[1]
+as an alternative to these. This new data structure provided a number
+of advantages, including simpler interfaces, and at that time smaller
+code (see Disadvantages).
+
+[1] http://lwn.net/Articles/164802/
+
+Implementation
+--------------
+
+Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h
+and implemented in kernel/locking/mutex.c. These locks use an atomic variable
+(->owner) to keep track of the lock state during its lifetime.  Field owner
+actually contains `struct task_struct *` to the current lock owner and it is
+therefore NULL if not currently owned. Since task_struct pointers are aligned
+at at least L1_CACHE_BYTES, low bits (3) are used to store extra state (e.g.,
+if waiter list is non-empty).  In its most basic form it also includes a
+wait-queue and a spinlock that serializes access to it. Furthermore,
+CONFIG_MUTEX_SPIN_ON_OWNER=y systems use a spinner MCS lock (->osq), described
+below in (ii).
+
+When acquiring a mutex, there are three possible paths that can be
+taken, depending on the state of the lock:
+
+(i) fastpath: tries to atomically acquire the lock by cmpxchg()ing the owner with
+    the current task. This only works in the uncontended case (cmpxchg() checks
+    against 0UL, so all 3 state bits above have to be 0). If the lock is
+    contended it goes to the next possible path.
+
+(ii) midpath: aka optimistic spinning, tries to spin for acquisition
+     while the lock owner is running and there are no other tasks ready
+     to run that have higher priority (need_resched). The rationale is
+     that if the lock owner is running, it is likely to release the lock
+     soon. The mutex spinners are queued up using MCS lock so that only
+     one spinner can compete for the mutex.
+
+     The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock
+     with the desirable properties of being fair and with each cpu trying
+     to acquire the lock spinning on a local variable. It avoids expensive
+     cacheline bouncing that common test-and-set spinlock implementations
+     incur. An MCS-like lock is specially tailored for optimistic spinning
+     for sleeping lock implementation. An important feature of the customized
+     MCS lock is that it has the extra property that spinners are able to exit
+     the MCS spinlock queue when they need to reschedule. This further helps
+     avoid situations where MCS spinners that need to reschedule would continue
+     waiting to spin on mutex owner, only to go directly to slowpath upon
+     obtaining the MCS lock.
+
+
+(iii) slowpath: last resort, if the lock is still unable to be acquired,
+      the task is added to the wait-queue and sleeps until woken up by the
+      unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE.
+
+While formally kernel mutexes are sleepable locks, it is path (ii) that
+makes them more practically a hybrid type. By simply not interrupting a
+task and busy-waiting for a few cycles instead of immediately sleeping,
+the performance of this lock has been seen to significantly improve a
+number of workloads. Note that this technique is also used for rw-semaphores.
+
+Semantics
+---------
+
+The mutex subsystem checks and enforces the following rules:
+
+    - Only one task can hold the mutex at a time.
+    - Only the owner can unlock the mutex.
+    - Multiple unlocks are not permitted.
+    - Recursive locking/unlocking is not permitted.
+    - A mutex must only be initialized via the API (see below).
+    - A task may not exit with a mutex held.
+    - Memory areas where held locks reside must not be freed.
+    - Held mutexes must not be reinitialized.
+    - Mutexes may not be used in hardware or software interrupt
+      contexts such as tasklets and timers.
+
+These semantics are fully enforced when CONFIG DEBUG_MUTEXES is enabled.
+In addition, the mutex debugging code also implements a number of other
+features that make lock debugging easier and faster:
+
+    - Uses symbolic names of mutexes, whenever they are printed
+      in debug output.
+    - Point-of-acquire tracking, symbolic lookup of function names,
+      list of all locks held in the system, printout of them.
+    - Owner tracking.
+    - Detects self-recursing locks and prints out all relevant info.
+    - Detects multi-task circular deadlocks and prints out all affected
+      locks and tasks (and only those tasks).
+
+
+Interfaces
+----------
+Statically define the mutex::
+
+   DEFINE_MUTEX(name);
+
+Dynamically initialize the mutex::
+
+   mutex_init(mutex);
+
+Acquire the mutex, uninterruptible::
+
+   void mutex_lock(struct mutex *lock);
+   void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
+   int  mutex_trylock(struct mutex *lock);
+
+Acquire the mutex, interruptible::
+
+   int mutex_lock_interruptible_nested(struct mutex *lock,
+				       unsigned int subclass);
+   int mutex_lock_interruptible(struct mutex *lock);
+
+Acquire the mutex, interruptible, if dec to 0::
+
+   int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
+
+Unlock the mutex::
+
+   void mutex_unlock(struct mutex *lock);
+
+Test if the mutex is taken::
+
+   int mutex_is_locked(struct mutex *lock);
+
+Disadvantages
+-------------
+
+Unlike its original design and purpose, 'struct mutex' is among the largest
+locks in the kernel. E.g: on x86-64 it is 32 bytes, where 'struct semaphore'
+is 24 bytes and rw_semaphore is 40 bytes. Larger structure sizes mean more CPU
+cache and memory footprint.
+
+When to use mutexes
+-------------------
+
+Unless the strict semantics of mutexes are unsuitable and/or the critical
+region prevents the lock from being shared, always prefer them to any other
+locking primitive.
diff --git a/Documentation/locking/mutex-design.txt b/Documentation/locking/mutex-design.txt
deleted file mode 100644
index 818aca19612f..000000000000
--- a/Documentation/locking/mutex-design.txt
+++ /dev/null
@@ -1,142 +0,0 @@
-Generic Mutex Subsystem
-
-started by Ingo Molnar <mingo@redhat.com>
-updated by Davidlohr Bueso <davidlohr@hp.com>
-
-What are mutexes?
------------------
-
-In the Linux kernel, mutexes refer to a particular locking primitive
-that enforces serialization on shared memory systems, and not only to
-the generic term referring to 'mutual exclusion' found in academia
-or similar theoretical text books. Mutexes are sleeping locks which
-behave similarly to binary semaphores, and were introduced in 2006[1]
-as an alternative to these. This new data structure provided a number
-of advantages, including simpler interfaces, and at that time smaller
-code (see Disadvantages).
-
-[1] http://lwn.net/Articles/164802/
-
-Implementation
---------------
-
-Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h
-and implemented in kernel/locking/mutex.c. These locks use an atomic variable
-(->owner) to keep track of the lock state during its lifetime.  Field owner
-actually contains 'struct task_struct *' to the current lock owner and it is
-therefore NULL if not currently owned. Since task_struct pointers are aligned
-at at least L1_CACHE_BYTES, low bits (3) are used to store extra state (e.g.,
-if waiter list is non-empty).  In its most basic form it also includes a
-wait-queue and a spinlock that serializes access to it. Furthermore,
-CONFIG_MUTEX_SPIN_ON_OWNER=y systems use a spinner MCS lock (->osq), described
-below in (ii).
-
-When acquiring a mutex, there are three possible paths that can be
-taken, depending on the state of the lock:
-
-(i) fastpath: tries to atomically acquire the lock by cmpxchg()ing the owner with
-    the current task. This only works in the uncontended case (cmpxchg() checks
-    against 0UL, so all 3 state bits above have to be 0). If the lock is
-    contended it goes to the next possible path.
-
-(ii) midpath: aka optimistic spinning, tries to spin for acquisition
-     while the lock owner is running and there are no other tasks ready
-     to run that have higher priority (need_resched). The rationale is
-     that if the lock owner is running, it is likely to release the lock
-     soon. The mutex spinners are queued up using MCS lock so that only
-     one spinner can compete for the mutex.
-
-     The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock
-     with the desirable properties of being fair and with each cpu trying
-     to acquire the lock spinning on a local variable. It avoids expensive
-     cacheline bouncing that common test-and-set spinlock implementations
-     incur. An MCS-like lock is specially tailored for optimistic spinning
-     for sleeping lock implementation. An important feature of the customized
-     MCS lock is that it has the extra property that spinners are able to exit
-     the MCS spinlock queue when they need to reschedule. This further helps
-     avoid situations where MCS spinners that need to reschedule would continue
-     waiting to spin on mutex owner, only to go directly to slowpath upon
-     obtaining the MCS lock.
-
-
-(iii) slowpath: last resort, if the lock is still unable to be acquired,
-      the task is added to the wait-queue and sleeps until woken up by the
-      unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE.
-
-While formally kernel mutexes are sleepable locks, it is path (ii) that
-makes them more practically a hybrid type. By simply not interrupting a
-task and busy-waiting for a few cycles instead of immediately sleeping,
-the performance of this lock has been seen to significantly improve a
-number of workloads. Note that this technique is also used for rw-semaphores.
-
-Semantics
----------
-
-The mutex subsystem checks and enforces the following rules:
-
-    - Only one task can hold the mutex at a time.
-    - Only the owner can unlock the mutex.
-    - Multiple unlocks are not permitted.
-    - Recursive locking/unlocking is not permitted.
-    - A mutex must only be initialized via the API (see below).
-    - A task may not exit with a mutex held.
-    - Memory areas where held locks reside must not be freed.
-    - Held mutexes must not be reinitialized.
-    - Mutexes may not be used in hardware or software interrupt
-      contexts such as tasklets and timers.
-
-These semantics are fully enforced when CONFIG DEBUG_MUTEXES is enabled.
-In addition, the mutex debugging code also implements a number of other
-features that make lock debugging easier and faster:
-
-    - Uses symbolic names of mutexes, whenever they are printed
-      in debug output.
-    - Point-of-acquire tracking, symbolic lookup of function names,
-      list of all locks held in the system, printout of them.
-    - Owner tracking.
-    - Detects self-recursing locks and prints out all relevant info.
-    - Detects multi-task circular deadlocks and prints out all affected
-      locks and tasks (and only those tasks).
-
-
-Interfaces
-----------
-Statically define the mutex:
-   DEFINE_MUTEX(name);
-
-Dynamically initialize the mutex:
-   mutex_init(mutex);
-
-Acquire the mutex, uninterruptible:
-   void mutex_lock(struct mutex *lock);
-   void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
-   int  mutex_trylock(struct mutex *lock);
-
-Acquire the mutex, interruptible:
-   int mutex_lock_interruptible_nested(struct mutex *lock,
-				       unsigned int subclass);
-   int mutex_lock_interruptible(struct mutex *lock);
-
-Acquire the mutex, interruptible, if dec to 0:
-   int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
-
-Unlock the mutex:
-   void mutex_unlock(struct mutex *lock);
-
-Test if the mutex is taken:
-   int mutex_is_locked(struct mutex *lock);
-
-Disadvantages
--------------
-
-Unlike its original design and purpose, 'struct mutex' is among the largest
-locks in the kernel. E.g: on x86-64 it is 32 bytes, where 'struct semaphore'
-is 24 bytes and rw_semaphore is 40 bytes. Larger structure sizes mean more CPU
-cache and memory footprint.
-
-When to use mutexes
--------------------
-
-Unless the strict semantics of mutexes are unsuitable and/or the critical
-region prevents the lock from being shared, always prefer them to any other
-locking primitive.
diff --git a/Documentation/locking/rt-mutex-design.rst b/Documentation/locking/rt-mutex-design.rst
new file mode 100644
index 000000000000..59c2a64efb21
--- /dev/null
+++ b/Documentation/locking/rt-mutex-design.rst
@@ -0,0 +1,574 @@
+==============================
+RT-mutex implementation design
+==============================
+
+Copyright (c) 2006 Steven Rostedt
+
+Licensed under the GNU Free Documentation License, Version 1.2
+
+
+This document tries to describe the design of the rtmutex.c implementation.
+It doesn't describe the reasons why rtmutex.c exists. For that please see
+Documentation/locking/rt-mutex.rst.  Although this document does explain problems
+that happen without this code, but that is in the concept to understand
+what the code actually is doing.
+
+The goal of this document is to help others understand the priority
+inheritance (PI) algorithm that is used, as well as reasons for the
+decisions that were made to implement PI in the manner that was done.
+
+
+Unbounded Priority Inversion
+----------------------------
+
+Priority inversion is when a lower priority process executes while a higher
+priority process wants to run.  This happens for several reasons, and
+most of the time it can't be helped.  Anytime a high priority process wants
+to use a resource that a lower priority process has (a mutex for example),
+the high priority process must wait until the lower priority process is done
+with the resource.  This is a priority inversion.  What we want to prevent
+is something called unbounded priority inversion.  That is when the high
+priority process is prevented from running by a lower priority process for
+an undetermined amount of time.
+
+The classic example of unbounded priority inversion is where you have three
+processes, let's call them processes A, B, and C, where A is the highest
+priority process, C is the lowest, and B is in between. A tries to grab a lock
+that C owns and must wait and lets C run to release the lock. But in the
+meantime, B executes, and since B is of a higher priority than C, it preempts C,
+but by doing so, it is in fact preempting A which is a higher priority process.
+Now there's no way of knowing how long A will be sleeping waiting for C
+to release the lock, because for all we know, B is a CPU hog and will
+never give C a chance to release the lock.  This is called unbounded priority
+inversion.
+
+Here's a little ASCII art to show the problem::
+
+     grab lock L1 (owned by C)
+       |
+  A ---+
+          C preempted by B
+            |
+  C    +----+
+
+  B         +-------->
+                  B now keeps A from running.
+
+
+Priority Inheritance (PI)
+-------------------------
+
+There are several ways to solve this issue, but other ways are out of scope
+for this document.  Here we only discuss PI.
+
+PI is where a process inherits the priority of another process if the other
+process blocks on a lock owned by the current process.  To make this easier
+to understand, let's use the previous example, with processes A, B, and C again.
+
+This time, when A blocks on the lock owned by C, C would inherit the priority
+of A.  So now if B becomes runnable, it would not preempt C, since C now has
+the high priority of A.  As soon as C releases the lock, it loses its
+inherited priority, and A then can continue with the resource that C had.
+
+Terminology
+-----------
+
+Here I explain some terminology that is used in this document to help describe
+the design that is used to implement PI.
+
+PI chain
+         - The PI chain is an ordered series of locks and processes that cause
+           processes to inherit priorities from a previous process that is
+           blocked on one of its locks.  This is described in more detail
+           later in this document.
+
+mutex
+         - In this document, to differentiate from locks that implement
+           PI and spin locks that are used in the PI code, from now on
+           the PI locks will be called a mutex.
+
+lock
+         - In this document from now on, I will use the term lock when
+           referring to spin locks that are used to protect parts of the PI
+           algorithm.  These locks disable preemption for UP (when
+           CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from
+           entering critical sections simultaneously.
+
+spin lock
+         - Same as lock above.
+
+waiter
+         - A waiter is a struct that is stored on the stack of a blocked
+           process.  Since the scope of the waiter is within the code for
+           a process being blocked on the mutex, it is fine to allocate
+           the waiter on the process's stack (local variable).  This
+           structure holds a pointer to the task, as well as the mutex that
+           the task is blocked on.  It also has rbtree node structures to
+           place the task in the waiters rbtree of a mutex as well as the
+           pi_waiters rbtree of a mutex owner task (described below).
+
+           waiter is sometimes used in reference to the task that is waiting
+           on a mutex. This is the same as waiter->task.
+
+waiters
+         - A list of processes that are blocked on a mutex.
+
+top waiter
+         - The highest priority process waiting on a specific mutex.
+
+top pi waiter
+              - The highest priority process waiting on one of the mutexes
+                that a specific process owns.
+
+Note:
+       task and process are used interchangeably in this document, mostly to
+       differentiate between two processes that are being described together.
+
+
+PI chain
+--------
+
+The PI chain is a list of processes and mutexes that may cause priority
+inheritance to take place.  Multiple chains may converge, but a chain
+would never diverge, since a process can't be blocked on more than one
+mutex at a time.
+
+Example::
+
+   Process:  A, B, C, D, E
+   Mutexes:  L1, L2, L3, L4
+
+   A owns: L1
+           B blocked on L1
+           B owns L2
+                  C blocked on L2
+                  C owns L3
+                         D blocked on L3
+                         D owns L4
+                                E blocked on L4
+
+The chain would be::
+
+   E->L4->D->L3->C->L2->B->L1->A
+
+To show where two chains merge, we could add another process F and
+another mutex L5 where B owns L5 and F is blocked on mutex L5.
+
+The chain for F would be::
+
+   F->L5->B->L1->A
+
+Since a process may own more than one mutex, but never be blocked on more than
+one, the chains merge.
+
+Here we show both chains::
+
+   E->L4->D->L3->C->L2-+
+                       |
+                       +->B->L1->A
+                       |
+                 F->L5-+
+
+For PI to work, the processes at the right end of these chains (or we may
+also call it the Top of the chain) must be equal to or higher in priority
+than the processes to the left or below in the chain.
+
+Also since a mutex may have more than one process blocked on it, we can
+have multiple chains merge at mutexes.  If we add another process G that is
+blocked on mutex L2::
+
+  G->L2->B->L1->A
+
+And once again, to show how this can grow I will show the merging chains
+again::
+
+   E->L4->D->L3->C-+
+                   +->L2-+
+                   |     |
+                 G-+     +->B->L1->A
+                         |
+                   F->L5-+
+
+If process G has the highest priority in the chain, then all the tasks up
+the chain (A and B in this example), must have their priorities increased
+to that of G.
+
+Mutex Waiters Tree
+------------------
+
+Every mutex keeps track of all the waiters that are blocked on itself. The
+mutex has a rbtree to store these waiters by priority.  This tree is protected
+by a spin lock that is located in the struct of the mutex. This lock is called
+wait_lock.
+
+
+Task PI Tree
+------------
+
+To keep track of the PI chains, each process has its own PI rbtree.  This is
+a tree of all top waiters of the mutexes that are owned by the process.
+Note that this tree only holds the top waiters and not all waiters that are
+blocked on mutexes owned by the process.
+
+The top of the task's PI tree is always the highest priority task that
+is waiting on a mutex that is owned by the task.  So if the task has
+inherited a priority, it will always be the priority of the task that is
+at the top of this tree.
+
+This tree is stored in the task structure of a process as a rbtree called
+pi_waiters.  It is protected by a spin lock also in the task structure,
+called pi_lock.  This lock may also be taken in interrupt context, so when
+locking the pi_lock, interrupts must be disabled.
+
+
+Depth of the PI Chain
+---------------------
+
+The maximum depth of the PI chain is not dynamic, and could actually be
+defined.  But is very complex to figure it out, since it depends on all
+the nesting of mutexes.  Let's look at the example where we have 3 mutexes,
+L1, L2, and L3, and four separate functions func1, func2, func3 and func4.
+The following shows a locking order of L1->L2->L3, but may not actually
+be directly nested that way::
+
+  void func1(void)
+  {
+	mutex_lock(L1);
+
+	/* do anything */
+
+	mutex_unlock(L1);
+  }
+
+  void func2(void)
+  {
+	mutex_lock(L1);
+	mutex_lock(L2);
+
+	/* do something */
+
+	mutex_unlock(L2);
+	mutex_unlock(L1);
+  }
+
+  void func3(void)
+  {
+	mutex_lock(L2);
+	mutex_lock(L3);
+
+	/* do something else */
+
+	mutex_unlock(L3);
+	mutex_unlock(L2);
+  }
+
+  void func4(void)
+  {
+	mutex_lock(L3);
+
+	/* do something again */
+
+	mutex_unlock(L3);
+  }
+
+Now we add 4 processes that run each of these functions separately.
+Processes A, B, C, and D which run functions func1, func2, func3 and func4
+respectively, and such that D runs first and A last.  With D being preempted
+in func4 in the "do something again" area, we have a locking that follows::
+
+  D owns L3
+         C blocked on L3
+         C owns L2
+                B blocked on L2
+                B owns L1
+                       A blocked on L1
+
+  And thus we have the chain A->L1->B->L2->C->L3->D.
+
+This gives us a PI depth of 4 (four processes), but looking at any of the
+functions individually, it seems as though they only have at most a locking
+depth of two.  So, although the locking depth is defined at compile time,
+it still is very difficult to find the possibilities of that depth.
+
+Now since mutexes can be defined by user-land applications, we don't want a DOS
+type of application that nests large amounts of mutexes to create a large
+PI chain, and have the code holding spin locks while looking at a large
+amount of data.  So to prevent this, the implementation not only implements
+a maximum lock depth, but also only holds at most two different locks at a
+time, as it walks the PI chain.  More about this below.
+
+
+Mutex owner and flags
+---------------------
+
+The mutex structure contains a pointer to the owner of the mutex.  If the
+mutex is not owned, this owner is set to NULL.  Since all architectures
+have the task structure on at least a two byte alignment (and if this is
+not true, the rtmutex.c code will be broken!), this allows for the least
+significant bit to be used as a flag.  Bit 0 is used as the "Has Waiters"
+flag. It's set whenever there are waiters on a mutex.
+
+See Documentation/locking/rt-mutex.rst for further details.
+
+cmpxchg Tricks
+--------------
+
+Some architectures implement an atomic cmpxchg (Compare and Exchange).  This
+is used (when applicable) to keep the fast path of grabbing and releasing
+mutexes short.
+
+cmpxchg is basically the following function performed atomically::
+
+  unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
+  {
+	unsigned long T = *A;
+	if (*A == *B) {
+		*A = *C;
+	}
+	return T;
+  }
+  #define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
+
+This is really nice to have, since it allows you to only update a variable
+if the variable is what you expect it to be.  You know if it succeeded if
+the return value (the old value of A) is equal to B.
+
+The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If
+the architecture does not support CMPXCHG, then this macro is simply set
+to fail every time.  But if CMPXCHG is supported, then this will
+help out extremely to keep the fast path short.
+
+The use of rt_mutex_cmpxchg with the flags in the owner field help optimize
+the system for architectures that support it.  This will also be explained
+later in this document.
+
+
+Priority adjustments
+--------------------
+
+The implementation of the PI code in rtmutex.c has several places that a
+process must adjust its priority.  With the help of the pi_waiters of a
+process this is rather easy to know what needs to be adjusted.
+
+The functions implementing the task adjustments are rt_mutex_adjust_prio
+and rt_mutex_setprio. rt_mutex_setprio is only used in rt_mutex_adjust_prio.
+
+rt_mutex_adjust_prio examines the priority of the task, and the highest
+priority process that is waiting any of mutexes owned by the task. Since
+the pi_waiters of a task holds an order by priority of all the top waiters
+of all the mutexes that the task owns, we simply need to compare the top
+pi waiter to its own normal/deadline priority and take the higher one.
+Then rt_mutex_setprio is called to adjust the priority of the task to the
+new priority. Note that rt_mutex_setprio is defined in kernel/sched/core.c
+to implement the actual change in priority.
+
+Note:
+	For the "prio" field in task_struct, the lower the number, the
+	higher the priority. A "prio" of 5 is of higher priority than a
+	"prio" of 10.
+
+It is interesting to note that rt_mutex_adjust_prio can either increase
+or decrease the priority of the task.  In the case that a higher priority
+process has just blocked on a mutex owned by the task, rt_mutex_adjust_prio
+would increase/boost the task's priority.  But if a higher priority task
+were for some reason to leave the mutex (timeout or signal), this same function
+would decrease/unboost the priority of the task.  That is because the pi_waiters
+always contains the highest priority task that is waiting on a mutex owned
+by the task, so we only need to compare the priority of that top pi waiter
+to the normal priority of the given task.
+
+
+High level overview of the PI chain walk
+----------------------------------------
+
+The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain.
+
+The implementation has gone through several iterations, and has ended up
+with what we believe is the best.  It walks the PI chain by only grabbing
+at most two locks at a time, and is very efficient.
+
+The rt_mutex_adjust_prio_chain can be used either to boost or lower process
+priorities.
+
+rt_mutex_adjust_prio_chain is called with a task to be checked for PI
+(de)boosting (the owner of a mutex that a process is blocking on), a flag to
+check for deadlocking, the mutex that the task owns, a pointer to a waiter
+that is the process's waiter struct that is blocked on the mutex (although this
+parameter may be NULL for deboosting), a pointer to the mutex on which the task
+is blocked, and a top_task as the top waiter of the mutex.
+
+For this explanation, I will not mention deadlock detection. This explanation
+will try to stay at a high level.
+
+When this function is called, there are no locks held.  That also means
+that the state of the owner and lock can change when entered into this function.
+
+Before this function is called, the task has already had rt_mutex_adjust_prio
+performed on it.  This means that the task is set to the priority that it
+should be at, but the rbtree nodes of the task's waiter have not been updated
+with the new priorities, and this task may not be in the proper locations
+in the pi_waiters and waiters trees that the task is blocked on. This function
+solves all that.
+
+The main operation of this function is summarized by Thomas Gleixner in
+rtmutex.c. See the 'Chain walk basics and protection scope' comment for further
+details.
+
+Taking of a mutex (The walk through)
+------------------------------------
+
+OK, now let's take a look at the detailed walk through of what happens when
+taking a mutex.
+
+The first thing that is tried is the fast taking of the mutex.  This is
+done when we have CMPXCHG enabled (otherwise the fast taking automatically
+fails).  Only when the owner field of the mutex is NULL can the lock be
+taken with the CMPXCHG and nothing else needs to be done.
+
+If there is contention on the lock, we go about the slow path
+(rt_mutex_slowlock).
+
+The slow path function is where the task's waiter structure is created on
+the stack.  This is because the waiter structure is only needed for the
+scope of this function.  The waiter structure holds the nodes to store
+the task on the waiters tree of the mutex, and if need be, the pi_waiters
+tree of the owner.
+
+The wait_lock of the mutex is taken since the slow path of unlocking the
+mutex also takes this lock.
+
+We then call try_to_take_rt_mutex.  This is where the architecture that
+does not implement CMPXCHG would always grab the lock (if there's no
+contention).
+
+try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
+slow path.  The first thing that is done here is an atomic setting of
+the "Has Waiters" flag of the mutex's owner field. By setting this flag
+now, the current owner of the mutex being contended for can't release the mutex
+without going into the slow unlock path, and it would then need to grab the
+wait_lock, which this code currently holds. So setting the "Has Waiters" flag
+forces the current owner to synchronize with this code.
+
+The lock is taken if the following are true:
+
+   1) The lock has no owner
+   2) The current task is the highest priority against all other
+      waiters of the lock
+
+If the task succeeds to acquire the lock, then the task is set as the
+owner of the lock, and if the lock still has waiters, the top_waiter
+(highest priority task waiting on the lock) is added to this task's
+pi_waiters tree.
+
+If the lock is not taken by try_to_take_rt_mutex(), then the
+task_blocks_on_rt_mutex() function is called. This will add the task to
+the lock's waiter tree and propagate the pi chain of the lock as well
+as the lock's owner's pi_waiters tree. This is described in the next
+section.
+
+Task blocks on mutex
+--------------------
+
+The accounting of a mutex and process is done with the waiter structure of
+the process.  The "task" field is set to the process, and the "lock" field
+to the mutex.  The rbtree node of waiter are initialized to the processes
+current priority.
+
+Since the wait_lock was taken at the entry of the slow lock, we can safely
+add the waiter to the task waiter tree.  If the current process is the
+highest priority process currently waiting on this mutex, then we remove the
+previous top waiter process (if it exists) from the pi_waiters of the owner,
+and add the current process to that tree.  Since the pi_waiter of the owner
+has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
+should adjust its priority accordingly.
+
+If the owner is also blocked on a lock, and had its pi_waiters changed
+(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
+and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
+
+Now all locks are released, and if the current process is still blocked on a
+mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
+
+Waking up in the loop
+---------------------
+
+The task can then wake up for a couple of reasons:
+  1) The previous lock owner released the lock, and the task now is top_waiter
+  2) we received a signal or timeout
+
+In both cases, the task will try again to acquire the lock. If it
+does, then it will take itself off the waiters tree and set itself back
+to the TASK_RUNNING state.
+
+In first case, if the lock was acquired by another task before this task
+could get the lock, then it will go back to sleep and wait to be woken again.
+
+The second case is only applicable for tasks that are grabbing a mutex
+that can wake up before getting the lock, either due to a signal or
+a timeout (i.e. rt_mutex_timed_futex_lock()). When woken, it will try to
+take the lock again, if it succeeds, then the task will return with the
+lock held, otherwise it will return with -EINTR if the task was woken
+by a signal, or -ETIMEDOUT if it timed out.
+
+
+Unlocking the Mutex
+-------------------
+
+The unlocking of a mutex also has a fast path for those architectures with
+CMPXCHG.  Since the taking of a mutex on contention always sets the
+"Has Waiters" flag of the mutex's owner, we use this to know if we need to
+take the slow path when unlocking the mutex.  If the mutex doesn't have any
+waiters, the owner field of the mutex would equal the current process and
+the mutex can be unlocked by just replacing the owner field with NULL.
+
+If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available),
+the slow unlock path is taken.
+
+The first thing done in the slow unlock path is to take the wait_lock of the
+mutex.  This synchronizes the locking and unlocking of the mutex.
+
+A check is made to see if the mutex has waiters or not.  On architectures that
+do not have CMPXCHG, this is the location that the owner of the mutex will
+determine if a waiter needs to be awoken or not.  On architectures that
+do have CMPXCHG, that check is done in the fast path, but it is still needed
+in the slow path too.  If a waiter of a mutex woke up because of a signal
+or timeout between the time the owner failed the fast path CMPXCHG check and
+the grabbing of the wait_lock, the mutex may not have any waiters, thus the
+owner still needs to make this check. If there are no waiters then the mutex
+owner field is set to NULL, the wait_lock is released and nothing more is
+needed.
+
+If there are waiters, then we need to wake one up.
+
+On the wake up code, the pi_lock of the current owner is taken.  The top
+waiter of the lock is found and removed from the waiters tree of the mutex
+as well as the pi_waiters tree of the current owner. The "Has Waiters" bit is
+marked to prevent lower priority tasks from stealing the lock.
+
+Finally we unlock the pi_lock of the pending owner and wake it up.
+
+
+Contact
+-------
+
+For updates on this document, please email Steven Rostedt <rostedt@goodmis.org>
+
+
+Credits
+-------
+
+Author:  Steven Rostedt <rostedt@goodmis.org>
+
+Updated: Alex Shi <alex.shi@linaro.org>	- 7/6/2017
+
+Original Reviewers:
+		     Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and
+		     Randy Dunlap
+
+Update (7/6/2017) Reviewers: Steven Rostedt and Sebastian Siewior
+
+Updates
+-------
+
+This document was originally written for 2.6.17-rc3-mm1
+was updated on 4.12
diff --git a/Documentation/locking/rt-mutex-design.txt b/Documentation/locking/rt-mutex-design.txt
deleted file mode 100644
index 3d7b865539cc..000000000000
--- a/Documentation/locking/rt-mutex-design.txt
+++ /dev/null
@@ -1,559 +0,0 @@
-#
-# Copyright (c) 2006 Steven Rostedt
-# Licensed under the GNU Free Documentation License, Version 1.2
-#
-
-RT-mutex implementation design
-------------------------------
-
-This document tries to describe the design of the rtmutex.c implementation.
-It doesn't describe the reasons why rtmutex.c exists. For that please see
-Documentation/locking/rt-mutex.txt.  Although this document does explain problems
-that happen without this code, but that is in the concept to understand
-what the code actually is doing.
-
-The goal of this document is to help others understand the priority
-inheritance (PI) algorithm that is used, as well as reasons for the
-decisions that were made to implement PI in the manner that was done.
-
-
-Unbounded Priority Inversion
-----------------------------
-
-Priority inversion is when a lower priority process executes while a higher
-priority process wants to run.  This happens for several reasons, and
-most of the time it can't be helped.  Anytime a high priority process wants
-to use a resource that a lower priority process has (a mutex for example),
-the high priority process must wait until the lower priority process is done
-with the resource.  This is a priority inversion.  What we want to prevent
-is something called unbounded priority inversion.  That is when the high
-priority process is prevented from running by a lower priority process for
-an undetermined amount of time.
-
-The classic example of unbounded priority inversion is where you have three
-processes, let's call them processes A, B, and C, where A is the highest
-priority process, C is the lowest, and B is in between. A tries to grab a lock
-that C owns and must wait and lets C run to release the lock. But in the
-meantime, B executes, and since B is of a higher priority than C, it preempts C,
-but by doing so, it is in fact preempting A which is a higher priority process.
-Now there's no way of knowing how long A will be sleeping waiting for C
-to release the lock, because for all we know, B is a CPU hog and will
-never give C a chance to release the lock.  This is called unbounded priority
-inversion.
-
-Here's a little ASCII art to show the problem.
-
-   grab lock L1 (owned by C)
-     |
-A ---+
-        C preempted by B
-          |
-C    +----+
-
-B         +-------->
-                B now keeps A from running.
-
-
-Priority Inheritance (PI)
--------------------------
-
-There are several ways to solve this issue, but other ways are out of scope
-for this document.  Here we only discuss PI.
-
-PI is where a process inherits the priority of another process if the other
-process blocks on a lock owned by the current process.  To make this easier
-to understand, let's use the previous example, with processes A, B, and C again.
-
-This time, when A blocks on the lock owned by C, C would inherit the priority
-of A.  So now if B becomes runnable, it would not preempt C, since C now has
-the high priority of A.  As soon as C releases the lock, it loses its
-inherited priority, and A then can continue with the resource that C had.
-
-Terminology
------------
-
-Here I explain some terminology that is used in this document to help describe
-the design that is used to implement PI.
-
-PI chain - The PI chain is an ordered series of locks and processes that cause
-           processes to inherit priorities from a previous process that is
-           blocked on one of its locks.  This is described in more detail
-           later in this document.
-
-mutex    - In this document, to differentiate from locks that implement
-           PI and spin locks that are used in the PI code, from now on
-           the PI locks will be called a mutex.
-
-lock     - In this document from now on, I will use the term lock when
-           referring to spin locks that are used to protect parts of the PI
-           algorithm.  These locks disable preemption for UP (when
-           CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from
-           entering critical sections simultaneously.
-
-spin lock - Same as lock above.
-
-waiter   - A waiter is a struct that is stored on the stack of a blocked
-           process.  Since the scope of the waiter is within the code for
-           a process being blocked on the mutex, it is fine to allocate
-           the waiter on the process's stack (local variable).  This
-           structure holds a pointer to the task, as well as the mutex that
-           the task is blocked on.  It also has rbtree node structures to
-           place the task in the waiters rbtree of a mutex as well as the
-           pi_waiters rbtree of a mutex owner task (described below).
-
-           waiter is sometimes used in reference to the task that is waiting
-           on a mutex. This is the same as waiter->task.
-
-waiters  - A list of processes that are blocked on a mutex.
-
-top waiter - The highest priority process waiting on a specific mutex.
-
-top pi waiter - The highest priority process waiting on one of the mutexes
-                that a specific process owns.
-
-Note:  task and process are used interchangeably in this document, mostly to
-       differentiate between two processes that are being described together.
-
-
-PI chain
---------
-
-The PI chain is a list of processes and mutexes that may cause priority
-inheritance to take place.  Multiple chains may converge, but a chain
-would never diverge, since a process can't be blocked on more than one
-mutex at a time.
-
-Example:
-
-   Process:  A, B, C, D, E
-   Mutexes:  L1, L2, L3, L4
-
-   A owns: L1
-           B blocked on L1
-           B owns L2
-                  C blocked on L2
-                  C owns L3
-                         D blocked on L3
-                         D owns L4
-                                E blocked on L4
-
-The chain would be:
-
-   E->L4->D->L3->C->L2->B->L1->A
-
-To show where two chains merge, we could add another process F and
-another mutex L5 where B owns L5 and F is blocked on mutex L5.
-
-The chain for F would be:
-
-   F->L5->B->L1->A
-
-Since a process may own more than one mutex, but never be blocked on more than
-one, the chains merge.
-
-Here we show both chains:
-
-   E->L4->D->L3->C->L2-+
-                       |
-                       +->B->L1->A
-                       |
-                 F->L5-+
-
-For PI to work, the processes at the right end of these chains (or we may
-also call it the Top of the chain) must be equal to or higher in priority
-than the processes to the left or below in the chain.
-
-Also since a mutex may have more than one process blocked on it, we can
-have multiple chains merge at mutexes.  If we add another process G that is
-blocked on mutex L2:
-
-  G->L2->B->L1->A
-
-And once again, to show how this can grow I will show the merging chains
-again.
-
-   E->L4->D->L3->C-+
-                   +->L2-+
-                   |     |
-                 G-+     +->B->L1->A
-                         |
-                   F->L5-+
-
-If process G has the highest priority in the chain, then all the tasks up
-the chain (A and B in this example), must have their priorities increased
-to that of G.
-
-Mutex Waiters Tree
------------------
-
-Every mutex keeps track of all the waiters that are blocked on itself. The
-mutex has a rbtree to store these waiters by priority.  This tree is protected
-by a spin lock that is located in the struct of the mutex. This lock is called
-wait_lock.
-
-
-Task PI Tree
-------------
-
-To keep track of the PI chains, each process has its own PI rbtree.  This is
-a tree of all top waiters of the mutexes that are owned by the process.
-Note that this tree only holds the top waiters and not all waiters that are
-blocked on mutexes owned by the process.
-
-The top of the task's PI tree is always the highest priority task that
-is waiting on a mutex that is owned by the task.  So if the task has
-inherited a priority, it will always be the priority of the task that is
-at the top of this tree.
-
-This tree is stored in the task structure of a process as a rbtree called
-pi_waiters.  It is protected by a spin lock also in the task structure,
-called pi_lock.  This lock may also be taken in interrupt context, so when
-locking the pi_lock, interrupts must be disabled.
-
-
-Depth of the PI Chain
----------------------
-
-The maximum depth of the PI chain is not dynamic, and could actually be
-defined.  But is very complex to figure it out, since it depends on all
-the nesting of mutexes.  Let's look at the example where we have 3 mutexes,
-L1, L2, and L3, and four separate functions func1, func2, func3 and func4.
-The following shows a locking order of L1->L2->L3, but may not actually
-be directly nested that way.
-
-void func1(void)
-{
-	mutex_lock(L1);
-
-	/* do anything */
-
-	mutex_unlock(L1);
-}
-
-void func2(void)
-{
-	mutex_lock(L1);
-	mutex_lock(L2);
-
-	/* do something */
-
-	mutex_unlock(L2);
-	mutex_unlock(L1);
-}
-
-void func3(void)
-{
-	mutex_lock(L2);
-	mutex_lock(L3);
-
-	/* do something else */
-
-	mutex_unlock(L3);
-	mutex_unlock(L2);
-}
-
-void func4(void)
-{
-	mutex_lock(L3);
-
-	/* do something again */
-
-	mutex_unlock(L3);
-}
-
-Now we add 4 processes that run each of these functions separately.
-Processes A, B, C, and D which run functions func1, func2, func3 and func4
-respectively, and such that D runs first and A last.  With D being preempted
-in func4 in the "do something again" area, we have a locking that follows:
-
-D owns L3
-       C blocked on L3
-       C owns L2
-              B blocked on L2
-              B owns L1
-                     A blocked on L1
-
-And thus we have the chain A->L1->B->L2->C->L3->D.
-
-This gives us a PI depth of 4 (four processes), but looking at any of the
-functions individually, it seems as though they only have at most a locking
-depth of two.  So, although the locking depth is defined at compile time,
-it still is very difficult to find the possibilities of that depth.
-
-Now since mutexes can be defined by user-land applications, we don't want a DOS
-type of application that nests large amounts of mutexes to create a large
-PI chain, and have the code holding spin locks while looking at a large
-amount of data.  So to prevent this, the implementation not only implements
-a maximum lock depth, but also only holds at most two different locks at a
-time, as it walks the PI chain.  More about this below.
-
-
-Mutex owner and flags
----------------------
-
-The mutex structure contains a pointer to the owner of the mutex.  If the
-mutex is not owned, this owner is set to NULL.  Since all architectures
-have the task structure on at least a two byte alignment (and if this is
-not true, the rtmutex.c code will be broken!), this allows for the least
-significant bit to be used as a flag.  Bit 0 is used as the "Has Waiters"
-flag. It's set whenever there are waiters on a mutex.
-
-See Documentation/locking/rt-mutex.txt for further details.
-
-cmpxchg Tricks
---------------
-
-Some architectures implement an atomic cmpxchg (Compare and Exchange).  This
-is used (when applicable) to keep the fast path of grabbing and releasing
-mutexes short.
-
-cmpxchg is basically the following function performed atomically:
-
-unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
-{
-	unsigned long T = *A;
-	if (*A == *B) {
-		*A = *C;
-	}
-	return T;
-}
-#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
-
-This is really nice to have, since it allows you to only update a variable
-if the variable is what you expect it to be.  You know if it succeeded if
-the return value (the old value of A) is equal to B.
-
-The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If
-the architecture does not support CMPXCHG, then this macro is simply set
-to fail every time.  But if CMPXCHG is supported, then this will
-help out extremely to keep the fast path short.
-
-The use of rt_mutex_cmpxchg with the flags in the owner field help optimize
-the system for architectures that support it.  This will also be explained
-later in this document.
-
-
-Priority adjustments
---------------------
-
-The implementation of the PI code in rtmutex.c has several places that a
-process must adjust its priority.  With the help of the pi_waiters of a
-process this is rather easy to know what needs to be adjusted.
-
-The functions implementing the task adjustments are rt_mutex_adjust_prio
-and rt_mutex_setprio. rt_mutex_setprio is only used in rt_mutex_adjust_prio.
-
-rt_mutex_adjust_prio examines the priority of the task, and the highest
-priority process that is waiting any of mutexes owned by the task. Since
-the pi_waiters of a task holds an order by priority of all the top waiters
-of all the mutexes that the task owns, we simply need to compare the top
-pi waiter to its own normal/deadline priority and take the higher one.
-Then rt_mutex_setprio is called to adjust the priority of the task to the
-new priority. Note that rt_mutex_setprio is defined in kernel/sched/core.c
-to implement the actual change in priority.
-
-(Note:  For the "prio" field in task_struct, the lower the number, the
-	higher the priority. A "prio" of 5 is of higher priority than a
-	"prio" of 10.)
-
-It is interesting to note that rt_mutex_adjust_prio can either increase
-or decrease the priority of the task.  In the case that a higher priority
-process has just blocked on a mutex owned by the task, rt_mutex_adjust_prio
-would increase/boost the task's priority.  But if a higher priority task
-were for some reason to leave the mutex (timeout or signal), this same function
-would decrease/unboost the priority of the task.  That is because the pi_waiters
-always contains the highest priority task that is waiting on a mutex owned
-by the task, so we only need to compare the priority of that top pi waiter
-to the normal priority of the given task.
-
-
-High level overview of the PI chain walk
-----------------------------------------
-
-The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain.
-
-The implementation has gone through several iterations, and has ended up
-with what we believe is the best.  It walks the PI chain by only grabbing
-at most two locks at a time, and is very efficient.
-
-The rt_mutex_adjust_prio_chain can be used either to boost or lower process
-priorities.
-
-rt_mutex_adjust_prio_chain is called with a task to be checked for PI
-(de)boosting (the owner of a mutex that a process is blocking on), a flag to
-check for deadlocking, the mutex that the task owns, a pointer to a waiter
-that is the process's waiter struct that is blocked on the mutex (although this
-parameter may be NULL for deboosting), a pointer to the mutex on which the task
-is blocked, and a top_task as the top waiter of the mutex.
-
-For this explanation, I will not mention deadlock detection. This explanation
-will try to stay at a high level.
-
-When this function is called, there are no locks held.  That also means
-that the state of the owner and lock can change when entered into this function.
-
-Before this function is called, the task has already had rt_mutex_adjust_prio
-performed on it.  This means that the task is set to the priority that it
-should be at, but the rbtree nodes of the task's waiter have not been updated
-with the new priorities, and this task may not be in the proper locations
-in the pi_waiters and waiters trees that the task is blocked on. This function
-solves all that.
-
-The main operation of this function is summarized by Thomas Gleixner in
-rtmutex.c. See the 'Chain walk basics and protection scope' comment for further
-details.
-
-Taking of a mutex (The walk through)
-------------------------------------
-
-OK, now let's take a look at the detailed walk through of what happens when
-taking a mutex.
-
-The first thing that is tried is the fast taking of the mutex.  This is
-done when we have CMPXCHG enabled (otherwise the fast taking automatically
-fails).  Only when the owner field of the mutex is NULL can the lock be
-taken with the CMPXCHG and nothing else needs to be done.
-
-If there is contention on the lock, we go about the slow path
-(rt_mutex_slowlock).
-
-The slow path function is where the task's waiter structure is created on
-the stack.  This is because the waiter structure is only needed for the
-scope of this function.  The waiter structure holds the nodes to store
-the task on the waiters tree of the mutex, and if need be, the pi_waiters
-tree of the owner.
-
-The wait_lock of the mutex is taken since the slow path of unlocking the
-mutex also takes this lock.
-
-We then call try_to_take_rt_mutex.  This is where the architecture that
-does not implement CMPXCHG would always grab the lock (if there's no
-contention).
-
-try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
-slow path.  The first thing that is done here is an atomic setting of
-the "Has Waiters" flag of the mutex's owner field. By setting this flag
-now, the current owner of the mutex being contended for can't release the mutex
-without going into the slow unlock path, and it would then need to grab the
-wait_lock, which this code currently holds. So setting the "Has Waiters" flag
-forces the current owner to synchronize with this code.
-
-The lock is taken if the following are true:
-   1) The lock has no owner
-   2) The current task is the highest priority against all other
-      waiters of the lock
-
-If the task succeeds to acquire the lock, then the task is set as the
-owner of the lock, and if the lock still has waiters, the top_waiter
-(highest priority task waiting on the lock) is added to this task's
-pi_waiters tree.
-
-If the lock is not taken by try_to_take_rt_mutex(), then the
-task_blocks_on_rt_mutex() function is called. This will add the task to
-the lock's waiter tree and propagate the pi chain of the lock as well
-as the lock's owner's pi_waiters tree. This is described in the next
-section.
-
-Task blocks on mutex
---------------------
-
-The accounting of a mutex and process is done with the waiter structure of
-the process.  The "task" field is set to the process, and the "lock" field
-to the mutex.  The rbtree node of waiter are initialized to the processes
-current priority.
-
-Since the wait_lock was taken at the entry of the slow lock, we can safely
-add the waiter to the task waiter tree.  If the current process is the
-highest priority process currently waiting on this mutex, then we remove the
-previous top waiter process (if it exists) from the pi_waiters of the owner,
-and add the current process to that tree.  Since the pi_waiter of the owner
-has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
-should adjust its priority accordingly.
-
-If the owner is also blocked on a lock, and had its pi_waiters changed
-(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
-and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
-
-Now all locks are released, and if the current process is still blocked on a
-mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
-
-Waking up in the loop
----------------------
-
-The task can then wake up for a couple of reasons:
-  1) The previous lock owner released the lock, and the task now is top_waiter
-  2) we received a signal or timeout
-
-In both cases, the task will try again to acquire the lock. If it
-does, then it will take itself off the waiters tree and set itself back
-to the TASK_RUNNING state.
-
-In first case, if the lock was acquired by another task before this task
-could get the lock, then it will go back to sleep and wait to be woken again.
-
-The second case is only applicable for tasks that are grabbing a mutex
-that can wake up before getting the lock, either due to a signal or
-a timeout (i.e. rt_mutex_timed_futex_lock()). When woken, it will try to
-take the lock again, if it succeeds, then the task will return with the
-lock held, otherwise it will return with -EINTR if the task was woken
-by a signal, or -ETIMEDOUT if it timed out.
-
-
-Unlocking the Mutex
--------------------
-
-The unlocking of a mutex also has a fast path for those architectures with
-CMPXCHG.  Since the taking of a mutex on contention always sets the
-"Has Waiters" flag of the mutex's owner, we use this to know if we need to
-take the slow path when unlocking the mutex.  If the mutex doesn't have any
-waiters, the owner field of the mutex would equal the current process and
-the mutex can be unlocked by just replacing the owner field with NULL.
-
-If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available),
-the slow unlock path is taken.
-
-The first thing done in the slow unlock path is to take the wait_lock of the
-mutex.  This synchronizes the locking and unlocking of the mutex.
-
-A check is made to see if the mutex has waiters or not.  On architectures that
-do not have CMPXCHG, this is the location that the owner of the mutex will
-determine if a waiter needs to be awoken or not.  On architectures that
-do have CMPXCHG, that check is done in the fast path, but it is still needed
-in the slow path too.  If a waiter of a mutex woke up because of a signal
-or timeout between the time the owner failed the fast path CMPXCHG check and
-the grabbing of the wait_lock, the mutex may not have any waiters, thus the
-owner still needs to make this check. If there are no waiters then the mutex
-owner field is set to NULL, the wait_lock is released and nothing more is
-needed.
-
-If there are waiters, then we need to wake one up.
-
-On the wake up code, the pi_lock of the current owner is taken.  The top
-waiter of the lock is found and removed from the waiters tree of the mutex
-as well as the pi_waiters tree of the current owner. The "Has Waiters" bit is
-marked to prevent lower priority tasks from stealing the lock.
-
-Finally we unlock the pi_lock of the pending owner and wake it up.
-
-
-Contact
--------
-
-For updates on this document, please email Steven Rostedt <rostedt@goodmis.org>
-
-
-Credits
--------
-
-Author:  Steven Rostedt <rostedt@goodmis.org>
-Updated: Alex Shi <alex.shi@linaro.org>	- 7/6/2017
-
-Original Reviewers:  Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and
-		     Randy Dunlap
-Update (7/6/2017) Reviewers: Steven Rostedt and Sebastian Siewior
-
-Updates
--------
-
-This document was originally written for 2.6.17-rc3-mm1
-was updated on 4.12
diff --git a/Documentation/locking/rt-mutex.rst b/Documentation/locking/rt-mutex.rst
new file mode 100644
index 000000000000..c365dc302081
--- /dev/null
+++ b/Documentation/locking/rt-mutex.rst
@@ -0,0 +1,77 @@
+==================================
+RT-mutex subsystem with PI support
+==================================
+
+RT-mutexes with priority inheritance are used to support PI-futexes,
+which enable pthread_mutex_t priority inheritance attributes
+(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
+about PI-futexes.]
+
+This technology was developed in the -rt tree and streamlined for
+pthread_mutex support.
+
+Basic principles:
+-----------------
+
+RT-mutexes extend the semantics of simple mutexes by the priority
+inheritance protocol.
+
+A low priority owner of a rt-mutex inherits the priority of a higher
+priority waiter until the rt-mutex is released. If the temporarily
+boosted owner blocks on a rt-mutex itself it propagates the priority
+boosting to the owner of the other rt_mutex it gets blocked on. The
+priority boosting is immediately removed once the rt_mutex has been
+unlocked.
+
+This approach allows us to shorten the block of high-prio tasks on
+mutexes which protect shared resources. Priority inheritance is not a
+magic bullet for poorly designed applications, but it allows
+well-designed applications to use userspace locks in critical parts of
+an high priority thread, without losing determinism.
+
+The enqueueing of the waiters into the rtmutex waiter tree is done in
+priority order. For same priorities FIFO order is chosen. For each
+rtmutex, only the top priority waiter is enqueued into the owner's
+priority waiters tree. This tree too queues in priority order. Whenever
+the top priority waiter of a task changes (for example it timed out or
+got a signal), the priority of the owner task is readjusted. The
+priority enqueueing is handled by "pi_waiters".
+
+RT-mutexes are optimized for fastpath operations and have no internal
+locking overhead when locking an uncontended mutex or unlocking a mutex
+without waiters. The optimized fastpath operations require cmpxchg
+support. [If that is not available then the rt-mutex internal spinlock
+is used]
+
+The state of the rt-mutex is tracked via the owner field of the rt-mutex
+structure:
+
+lock->owner holds the task_struct pointer of the owner. Bit 0 is used to
+keep track of the "lock has waiters" state:
+
+ ============ ======= ================================================
+ owner        bit0    Notes
+ ============ ======= ================================================
+ NULL         0       lock is free (fast acquire possible)
+ NULL         1       lock is free and has waiters and the top waiter
+		      is going to take the lock [1]_
+ taskpointer  0       lock is held (fast release possible)
+ taskpointer  1       lock is held and has waiters [2]_
+ ============ ======= ================================================
+
+The fast atomic compare exchange based acquire and release is only
+possible when bit 0 of lock->owner is 0.
+
+.. [1] It also can be a transitional state when grabbing the lock
+       with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
+       we need to set the bit0 before looking at the lock, and the owner may
+       be NULL in this small time, hence this can be a transitional state.
+
+.. [2] There is a small time when bit 0 is set but there are no
+       waiters. This can happen when grabbing the lock in the slow path.
+       To prevent a cmpxchg of the owner releasing the lock, we need to
+       set this bit before looking at the lock.
+
+BTW, there is still technically a "Pending Owner", it's just not called
+that anymore. The pending owner happens to be the top_waiter of a lock
+that has no owner and has been woken up to grab the lock.
diff --git a/Documentation/locking/rt-mutex.txt b/Documentation/locking/rt-mutex.txt
deleted file mode 100644
index 35793e003041..000000000000
--- a/Documentation/locking/rt-mutex.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-RT-mutex subsystem with PI support
-----------------------------------
-
-RT-mutexes with priority inheritance are used to support PI-futexes,
-which enable pthread_mutex_t priority inheritance attributes
-(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
-about PI-futexes.]
-
-This technology was developed in the -rt tree and streamlined for
-pthread_mutex support.
-
-Basic principles:
------------------
-
-RT-mutexes extend the semantics of simple mutexes by the priority
-inheritance protocol.
-
-A low priority owner of a rt-mutex inherits the priority of a higher
-priority waiter until the rt-mutex is released. If the temporarily
-boosted owner blocks on a rt-mutex itself it propagates the priority
-boosting to the owner of the other rt_mutex it gets blocked on. The
-priority boosting is immediately removed once the rt_mutex has been
-unlocked.
-
-This approach allows us to shorten the block of high-prio tasks on
-mutexes which protect shared resources. Priority inheritance is not a
-magic bullet for poorly designed applications, but it allows
-well-designed applications to use userspace locks in critical parts of
-an high priority thread, without losing determinism.
-
-The enqueueing of the waiters into the rtmutex waiter tree is done in
-priority order. For same priorities FIFO order is chosen. For each
-rtmutex, only the top priority waiter is enqueued into the owner's
-priority waiters tree. This tree too queues in priority order. Whenever
-the top priority waiter of a task changes (for example it timed out or
-got a signal), the priority of the owner task is readjusted. The
-priority enqueueing is handled by "pi_waiters".
-
-RT-mutexes are optimized for fastpath operations and have no internal
-locking overhead when locking an uncontended mutex or unlocking a mutex
-without waiters. The optimized fastpath operations require cmpxchg
-support. [If that is not available then the rt-mutex internal spinlock
-is used]
-
-The state of the rt-mutex is tracked via the owner field of the rt-mutex
-structure:
-
-lock->owner holds the task_struct pointer of the owner. Bit 0 is used to
-keep track of the "lock has waiters" state.
-
- owner        bit0
- NULL         0       lock is free (fast acquire possible)
- NULL         1       lock is free and has waiters and the top waiter
-			is going to take the lock*
- taskpointer  0       lock is held (fast release possible)
- taskpointer  1       lock is held and has waiters**
-
-The fast atomic compare exchange based acquire and release is only
-possible when bit 0 of lock->owner is 0.
-
-(*) It also can be a transitional state when grabbing the lock
-with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
-we need to set the bit0 before looking at the lock, and the owner may be
-NULL in this small time, hence this can be a transitional state.
-
-(**) There is a small time when bit 0 is set but there are no
-waiters. This can happen when grabbing the lock in the slow path.
-To prevent a cmpxchg of the owner releasing the lock, we need to
-set this bit before looking at the lock.
-
-BTW, there is still technically a "Pending Owner", it's just not called
-that anymore. The pending owner happens to be the top_waiter of a lock
-that has no owner and has been woken up to grab the lock.
diff --git a/Documentation/locking/spinlocks.rst b/Documentation/locking/spinlocks.rst
new file mode 100644
index 000000000000..e93ec6645238
--- /dev/null
+++ b/Documentation/locking/spinlocks.rst
@@ -0,0 +1,177 @@
+===============
+Locking lessons
+===============
+
+Lesson 1: Spin locks
+====================
+
+The most basic primitive for locking is spinlock::
+
+  static DEFINE_SPINLOCK(xxx_lock);
+
+	unsigned long flags;
+
+	spin_lock_irqsave(&xxx_lock, flags);
+	... critical section here ..
+	spin_unlock_irqrestore(&xxx_lock, flags);
+
+The above is always safe. It will disable interrupts _locally_, but the
+spinlock itself will guarantee the global lock, so it will guarantee that
+there is only one thread-of-control within the region(s) protected by that
+lock. This works well even under UP also, so the code does _not_ need to
+worry about UP vs SMP issues: the spinlocks work correctly under both.
+
+   NOTE! Implications of spin_locks for memory are further described in:
+
+     Documentation/memory-barriers.txt
+
+       (5) LOCK operations.
+
+       (6) UNLOCK operations.
+
+The above is usually pretty simple (you usually need and want only one
+spinlock for most things - using more than one spinlock can make things a
+lot more complex and even slower and is usually worth it only for
+sequences that you **know** need to be split up: avoid it at all cost if you
+aren't sure).
+
+This is really the only really hard part about spinlocks: once you start
+using spinlocks they tend to expand to areas you might not have noticed
+before, because you have to make sure the spinlocks correctly protect the
+shared data structures **everywhere** they are used. The spinlocks are most
+easily added to places that are completely independent of other code (for
+example, internal driver data structures that nobody else ever touches).
+
+   NOTE! The spin-lock is safe only when you **also** use the lock itself
+   to do locking across CPU's, which implies that EVERYTHING that
+   touches a shared variable has to agree about the spinlock they want
+   to use.
+
+----
+
+Lesson 2: reader-writer spinlocks.
+==================================
+
+If your data accesses have a very natural pattern where you usually tend
+to mostly read from the shared variables, the reader-writer locks
+(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple
+readers to be in the same critical region at once, but if somebody wants
+to change the variables it has to get an exclusive write lock.
+
+   NOTE! reader-writer locks require more atomic memory operations than
+   simple spinlocks.  Unless the reader critical section is long, you
+   are better off just using spinlocks.
+
+The routines look the same as above::
+
+   rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
+
+	unsigned long flags;
+
+	read_lock_irqsave(&xxx_lock, flags);
+	.. critical section that only reads the info ...
+	read_unlock_irqrestore(&xxx_lock, flags);
+
+	write_lock_irqsave(&xxx_lock, flags);
+	.. read and write exclusive access to the info ...
+	write_unlock_irqrestore(&xxx_lock, flags);
+
+The above kind of lock may be useful for complex data structures like
+linked lists, especially searching for entries without changing the list
+itself.  The read lock allows many concurrent readers.  Anything that
+**changes** the list will have to get the write lock.
+
+   NOTE! RCU is better for list traversal, but requires careful
+   attention to design detail (see Documentation/RCU/listRCU.rst).
+
+Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_
+time need to do any changes (even if you don't do it every time), you have
+to get the write-lock at the very beginning.
+
+   NOTE! We are working hard to remove reader-writer spinlocks in most
+   cases, so please don't add a new one without consensus.  (Instead, see
+   Documentation/RCU/rcu.rst for complete information.)
+
+----
+
+Lesson 3: spinlocks revisited.
+==============================
+
+The single spin-lock primitives above are by no means the only ones. They
+are the most safe ones, and the ones that work under all circumstances,
+but partly **because** they are safe they are also fairly slow. They are slower
+than they'd need to be, because they do have to disable interrupts
+(which is just a single instruction on a x86, but it's an expensive one -
+and on other architectures it can be worse).
+
+If you have a case where you have to protect a data structure across
+several CPU's and you want to use spinlocks you can potentially use
+cheaper versions of the spinlocks. IFF you know that the spinlocks are
+never used in interrupt handlers, you can use the non-irq versions::
+
+	spin_lock(&lock);
+	...
+	spin_unlock(&lock);
+
+(and the equivalent read-write versions too, of course). The spinlock will
+guarantee the same kind of exclusive access, and it will be much faster.
+This is useful if you know that the data in question is only ever
+manipulated from a "process context", ie no interrupts involved.
+
+The reasons you mustn't use these versions if you have interrupts that
+play with the spinlock is that you can get deadlocks::
+
+	spin_lock(&lock);
+	...
+		<- interrupt comes in:
+			spin_lock(&lock);
+
+where an interrupt tries to lock an already locked variable. This is ok if
+the other interrupt happens on another CPU, but it is _not_ ok if the
+interrupt happens on the same CPU that already holds the lock, because the
+lock will obviously never be released (because the interrupt is waiting
+for the lock, and the lock-holder is interrupted by the interrupt and will
+not continue until the interrupt has been processed).
+
+(This is also the reason why the irq-versions of the spinlocks only need
+to disable the _local_ interrupts - it's ok to use spinlocks in interrupts
+on other CPU's, because an interrupt on another CPU doesn't interrupt the
+CPU that holds the lock, so the lock-holder can continue and eventually
+releases the lock).
+
+Note that you can be clever with read-write locks and interrupts. For
+example, if you know that the interrupt only ever gets a read-lock, then
+you can use a non-irq version of read locks everywhere - because they
+don't block on each other (and thus there is no dead-lock wrt interrupts.
+But when you do the write-lock, you have to use the irq-safe version.
+
+For an example of being clever with rw-locks, see the "waitqueue_lock"
+handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from
+within an interrupt, they only read the queue in order to know whom to
+wake up. So read-locks are safe (which is good: they are very common
+indeed), while write-locks need to protect themselves against interrupts.
+
+		Linus
+
+----
+
+Reference information:
+======================
+
+For dynamic initialization, use spin_lock_init() or rwlock_init() as
+appropriate::
+
+   spinlock_t xxx_lock;
+   rwlock_t xxx_rw_lock;
+
+   static int __init xxx_init(void)
+   {
+	spin_lock_init(&xxx_lock);
+	rwlock_init(&xxx_rw_lock);
+	...
+   }
+
+   module_init(xxx_init);
+
+For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
+__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
diff --git a/Documentation/locking/spinlocks.txt b/Documentation/locking/spinlocks.txt
deleted file mode 100644
index ff35e40bdf5b..000000000000
--- a/Documentation/locking/spinlocks.txt
+++ /dev/null
@@ -1,167 +0,0 @@
-Lesson 1: Spin locks
-
-The most basic primitive for locking is spinlock.
-
-static DEFINE_SPINLOCK(xxx_lock);
-
-	unsigned long flags;
-
-	spin_lock_irqsave(&xxx_lock, flags);
-	... critical section here ..
-	spin_unlock_irqrestore(&xxx_lock, flags);
-
-The above is always safe. It will disable interrupts _locally_, but the
-spinlock itself will guarantee the global lock, so it will guarantee that
-there is only one thread-of-control within the region(s) protected by that
-lock. This works well even under UP also, so the code does _not_ need to
-worry about UP vs SMP issues: the spinlocks work correctly under both.
-
-   NOTE! Implications of spin_locks for memory are further described in:
-
-     Documentation/memory-barriers.txt
-       (5) LOCK operations.
-       (6) UNLOCK operations.
-
-The above is usually pretty simple (you usually need and want only one
-spinlock for most things - using more than one spinlock can make things a
-lot more complex and even slower and is usually worth it only for
-sequences that you _know_ need to be split up: avoid it at all cost if you
-aren't sure).
-
-This is really the only really hard part about spinlocks: once you start
-using spinlocks they tend to expand to areas you might not have noticed
-before, because you have to make sure the spinlocks correctly protect the
-shared data structures _everywhere_ they are used. The spinlocks are most
-easily added to places that are completely independent of other code (for
-example, internal driver data structures that nobody else ever touches).
-
-   NOTE! The spin-lock is safe only when you _also_ use the lock itself
-   to do locking across CPU's, which implies that EVERYTHING that
-   touches a shared variable has to agree about the spinlock they want
-   to use.
-
-----
-
-Lesson 2: reader-writer spinlocks.
-
-If your data accesses have a very natural pattern where you usually tend
-to mostly read from the shared variables, the reader-writer locks
-(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple
-readers to be in the same critical region at once, but if somebody wants
-to change the variables it has to get an exclusive write lock.
-
-   NOTE! reader-writer locks require more atomic memory operations than
-   simple spinlocks.  Unless the reader critical section is long, you
-   are better off just using spinlocks.
-
-The routines look the same as above:
-
-   rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
-
-	unsigned long flags;
-
-	read_lock_irqsave(&xxx_lock, flags);
-	.. critical section that only reads the info ...
-	read_unlock_irqrestore(&xxx_lock, flags);
-
-	write_lock_irqsave(&xxx_lock, flags);
-	.. read and write exclusive access to the info ...
-	write_unlock_irqrestore(&xxx_lock, flags);
-
-The above kind of lock may be useful for complex data structures like
-linked lists, especially searching for entries without changing the list
-itself.  The read lock allows many concurrent readers.  Anything that
-_changes_ the list will have to get the write lock.
-
-   NOTE! RCU is better for list traversal, but requires careful
-   attention to design detail (see Documentation/RCU/listRCU.txt).
-
-Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_
-time need to do any changes (even if you don't do it every time), you have
-to get the write-lock at the very beginning.
-
-   NOTE! We are working hard to remove reader-writer spinlocks in most
-   cases, so please don't add a new one without consensus.  (Instead, see
-   Documentation/RCU/rcu.txt for complete information.)
-
-----
-
-Lesson 3: spinlocks revisited.
-
-The single spin-lock primitives above are by no means the only ones. They
-are the most safe ones, and the ones that work under all circumstances,
-but partly _because_ they are safe they are also fairly slow. They are slower
-than they'd need to be, because they do have to disable interrupts
-(which is just a single instruction on a x86, but it's an expensive one -
-and on other architectures it can be worse).
-
-If you have a case where you have to protect a data structure across
-several CPU's and you want to use spinlocks you can potentially use
-cheaper versions of the spinlocks. IFF you know that the spinlocks are
-never used in interrupt handlers, you can use the non-irq versions:
-
-	spin_lock(&lock);
-	...
-	spin_unlock(&lock);
-
-(and the equivalent read-write versions too, of course). The spinlock will
-guarantee the same kind of exclusive access, and it will be much faster.
-This is useful if you know that the data in question is only ever
-manipulated from a "process context", ie no interrupts involved.
-
-The reasons you mustn't use these versions if you have interrupts that
-play with the spinlock is that you can get deadlocks:
-
-	spin_lock(&lock);
-	...
-		<- interrupt comes in:
-			spin_lock(&lock);
-
-where an interrupt tries to lock an already locked variable. This is ok if
-the other interrupt happens on another CPU, but it is _not_ ok if the
-interrupt happens on the same CPU that already holds the lock, because the
-lock will obviously never be released (because the interrupt is waiting
-for the lock, and the lock-holder is interrupted by the interrupt and will
-not continue until the interrupt has been processed).
-
-(This is also the reason why the irq-versions of the spinlocks only need
-to disable the _local_ interrupts - it's ok to use spinlocks in interrupts
-on other CPU's, because an interrupt on another CPU doesn't interrupt the
-CPU that holds the lock, so the lock-holder can continue and eventually
-releases the lock).
-
-Note that you can be clever with read-write locks and interrupts. For
-example, if you know that the interrupt only ever gets a read-lock, then
-you can use a non-irq version of read locks everywhere - because they
-don't block on each other (and thus there is no dead-lock wrt interrupts.
-But when you do the write-lock, you have to use the irq-safe version.
-
-For an example of being clever with rw-locks, see the "waitqueue_lock"
-handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from
-within an interrupt, they only read the queue in order to know whom to
-wake up. So read-locks are safe (which is good: they are very common
-indeed), while write-locks need to protect themselves against interrupts.
-
-		Linus
-
-----
-
-Reference information:
-
-For dynamic initialization, use spin_lock_init() or rwlock_init() as
-appropriate:
-
-   spinlock_t xxx_lock;
-   rwlock_t xxx_rw_lock;
-
-   static int __init xxx_init(void)
-   {
-	spin_lock_init(&xxx_lock);
-	rwlock_init(&xxx_rw_lock);
-	...
-   }
-
-   module_init(xxx_init);
-
-For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
-__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
diff --git a/Documentation/locking/ww-mutex-design.rst b/Documentation/locking/ww-mutex-design.rst
new file mode 100644
index 000000000000..1846c199da23
--- /dev/null
+++ b/Documentation/locking/ww-mutex-design.rst
@@ -0,0 +1,393 @@
+======================================
+Wound/Wait Deadlock-Proof Mutex Design
+======================================
+
+Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
+
+Motivation for WW-Mutexes
+-------------------------
+
+GPU's do operations that commonly involve many buffers.  Those buffers
+can be shared across contexts/processes, exist in different memory
+domains (for example VRAM vs system memory), and so on.  And with
+PRIME / dmabuf, they can even be shared across devices.  So there are
+a handful of situations where the driver needs to wait for buffers to
+become ready.  If you think about this in terms of waiting on a buffer
+mutex for it to become available, this presents a problem because
+there is no way to guarantee that buffers appear in a execbuf/batch in
+the same order in all contexts.  That is directly under control of
+userspace, and a result of the sequence of GL calls that an application
+makes.	Which results in the potential for deadlock.  The problem gets
+more complex when you consider that the kernel may need to migrate the
+buffer(s) into VRAM before the GPU operates on the buffer(s), which
+may in turn require evicting some other buffers (and you don't want to
+evict other buffers which are already queued up to the GPU), but for a
+simplified understanding of the problem you can ignore this.
+
+The algorithm that the TTM graphics subsystem came up with for dealing with
+this problem is quite simple.  For each group of buffers (execbuf) that need
+to be locked, the caller would be assigned a unique reservation id/ticket,
+from a global counter.  In case of deadlock while locking all the buffers
+associated with a execbuf, the one with the lowest reservation ticket (i.e.
+the oldest task) wins, and the one with the higher reservation id (i.e. the
+younger task) unlocks all of the buffers that it has already locked, and then
+tries again.
+
+In the RDBMS literature, a reservation ticket is associated with a transaction.
+and the deadlock handling approach is called Wait-Die. The name is based on
+the actions of a locking thread when it encounters an already locked mutex.
+If the transaction holding the lock is younger, the locking transaction waits.
+If the transaction holding the lock is older, the locking transaction backs off
+and dies. Hence Wait-Die.
+There is also another algorithm called Wound-Wait:
+If the transaction holding the lock is younger, the locking transaction
+wounds the transaction holding the lock, requesting it to die.
+If the transaction holding the lock is older, it waits for the other
+transaction. Hence Wound-Wait.
+The two algorithms are both fair in that a transaction will eventually succeed.
+However, the Wound-Wait algorithm is typically stated to generate fewer backoffs
+compared to Wait-Die, but is, on the other hand, associated with more work than
+Wait-Die when recovering from a backoff. Wound-Wait is also a preemptive
+algorithm in that transactions are wounded by other transactions, and that
+requires a reliable way to pick up up the wounded condition and preempt the
+running transaction. Note that this is not the same as process preemption. A
+Wound-Wait transaction is considered preempted when it dies (returning
+-EDEADLK) following a wound.
+
+Concepts
+--------
+
+Compared to normal mutexes two additional concepts/objects show up in the lock
+interface for w/w mutexes:
+
+Acquire context: To ensure eventual forward progress it is important the a task
+trying to acquire locks doesn't grab a new reservation id, but keeps the one it
+acquired when starting the lock acquisition. This ticket is stored in the
+acquire context. Furthermore the acquire context keeps track of debugging state
+to catch w/w mutex interface abuse. An acquire context is representing a
+transaction.
+
+W/w class: In contrast to normal mutexes the lock class needs to be explicit for
+w/w mutexes, since it is required to initialize the acquire context. The lock
+class also specifies what algorithm to use, Wound-Wait or Wait-Die.
+
+Furthermore there are three different class of w/w lock acquire functions:
+
+* Normal lock acquisition with a context, using ww_mutex_lock.
+
+* Slowpath lock acquisition on the contending lock, used by the task that just
+  killed its transaction after having dropped all already acquired locks.
+  These functions have the _slow postfix.
+
+  From a simple semantics point-of-view the _slow functions are not strictly
+  required, since simply calling the normal ww_mutex_lock functions on the
+  contending lock (after having dropped all other already acquired locks) will
+  work correctly. After all if no other ww mutex has been acquired yet there's
+  no deadlock potential and hence the ww_mutex_lock call will block and not
+  prematurely return -EDEADLK. The advantage of the _slow functions is in
+  interface safety:
+
+  - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow
+    has a void return type. Note that since ww mutex code needs loops/retries
+    anyway the __must_check doesn't result in spurious warnings, even though the
+    very first lock operation can never fail.
+  - When full debugging is enabled ww_mutex_lock_slow checks that all acquired
+    ww mutex have been released (preventing deadlocks) and makes sure that we
+    block on the contending lock (preventing spinning through the -EDEADLK
+    slowpath until the contended lock can be acquired).
+
+* Functions to only acquire a single w/w mutex, which results in the exact same
+  semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL
+  context.
+
+  Again this is not strictly required. But often you only want to acquire a
+  single lock in which case it's pointless to set up an acquire context (and so
+  better to avoid grabbing a deadlock avoidance ticket).
+
+Of course, all the usual variants for handling wake-ups due to signals are also
+provided.
+
+Usage
+-----
+
+The algorithm (Wait-Die vs Wound-Wait) is chosen by using either
+DEFINE_WW_CLASS() (Wound-Wait) or DEFINE_WD_CLASS() (Wait-Die)
+As a rough rule of thumb, use Wound-Wait iff you
+expect the number of simultaneous competing transactions to be typically small,
+and you want to reduce the number of rollbacks.
+
+Three different ways to acquire locks within the same w/w class. Common
+definitions for methods #1 and #2::
+
+  static DEFINE_WW_CLASS(ww_class);
+
+  struct obj {
+	struct ww_mutex lock;
+	/* obj data */
+  };
+
+  struct obj_entry {
+	struct list_head head;
+	struct obj *obj;
+  };
+
+Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
+This is useful if a list of required objects is already tracked somewhere.
+Furthermore the lock helper can use propagate the -EALREADY return code back to
+the caller as a signal that an object is twice on the list. This is useful if
+the list is constructed from userspace input and the ABI requires userspace to
+not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl)::
+
+  int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	struct obj *res_obj = NULL;
+	struct obj_entry *contended_entry = NULL;
+	struct obj_entry *entry;
+
+	ww_acquire_init(ctx, &ww_class);
+
+  retry:
+	list_for_each_entry (entry, list, head) {
+		if (entry->obj == res_obj) {
+			res_obj = NULL;
+			continue;
+		}
+		ret = ww_mutex_lock(&entry->obj->lock, ctx);
+		if (ret < 0) {
+			contended_entry = entry;
+			goto err;
+		}
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+
+  err:
+	list_for_each_entry_continue_reverse (entry, list, head)
+		ww_mutex_unlock(&entry->obj->lock);
+
+	if (res_obj)
+		ww_mutex_unlock(&res_obj->lock);
+
+	if (ret == -EDEADLK) {
+		/* we lost out in a seqno race, lock and retry.. */
+		ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
+		res_obj = contended_entry->obj;
+		goto retry;
+	}
+	ww_acquire_fini(ctx);
+
+	return ret;
+  }
+
+Method 2, using a list in execbuf->buffers that can be reordered. Same semantics
+of duplicate entry detection using -EALREADY as method 1 above. But the
+list-reordering allows for a bit more idiomatic code::
+
+  int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	struct obj_entry *entry, *entry2;
+
+	ww_acquire_init(ctx, &ww_class);
+
+	list_for_each_entry (entry, list, head) {
+		ret = ww_mutex_lock(&entry->obj->lock, ctx);
+		if (ret < 0) {
+			entry2 = entry;
+
+			list_for_each_entry_continue_reverse (entry2, list, head)
+				ww_mutex_unlock(&entry2->obj->lock);
+
+			if (ret != -EDEADLK) {
+				ww_acquire_fini(ctx);
+				return ret;
+			}
+
+			/* we lost out in a seqno race, lock and retry.. */
+			ww_mutex_lock_slow(&entry->obj->lock, ctx);
+
+			/*
+			 * Move buf to head of the list, this will point
+			 * buf->next to the first unlocked entry,
+			 * restarting the for loop.
+			 */
+			list_del(&entry->head);
+			list_add(&entry->head, list);
+		}
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+  }
+
+Unlocking works the same way for both methods #1 and #2::
+
+  void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	struct obj_entry *entry;
+
+	list_for_each_entry (entry, list, head)
+		ww_mutex_unlock(&entry->obj->lock);
+
+	ww_acquire_fini(ctx);
+  }
+
+Method 3 is useful if the list of objects is constructed ad-hoc and not upfront,
+e.g. when adjusting edges in a graph where each node has its own ww_mutex lock,
+and edges can only be changed when holding the locks of all involved nodes. w/w
+mutexes are a natural fit for such a case for two reasons:
+
+- They can handle lock-acquisition in any order which allows us to start walking
+  a graph from a starting point and then iteratively discovering new edges and
+  locking down the nodes those edges connect to.
+- Due to the -EALREADY return code signalling that a given objects is already
+  held there's no need for additional book-keeping to break cycles in the graph
+  or keep track off which looks are already held (when using more than one node
+  as a starting point).
+
+Note that this approach differs in two important ways from the above methods:
+
+- Since the list of objects is dynamically constructed (and might very well be
+  different when retrying due to hitting the -EDEADLK die condition) there's
+  no need to keep any object on a persistent list when it's not locked. We can
+  therefore move the list_head into the object itself.
+- On the other hand the dynamic object list construction also means that the -EALREADY return
+  code can't be propagated.
+
+Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a
+list of starting nodes (passed in from userspace) using one of the above
+methods. And then lock any additional objects affected by the operations using
+method #3 below. The backoff/retry procedure will be a bit more involved, since
+when the dynamic locking step hits -EDEADLK we also need to unlock all the
+objects acquired with the fixed list. But the w/w mutex debug checks will catch
+any interface misuse for these cases.
+
+Also, method 3 can't fail the lock acquisition step since it doesn't return
+-EALREADY. Of course this would be different when using the _interruptible
+variants, but that's outside of the scope of these examples here::
+
+  struct obj {
+	struct ww_mutex ww_mutex;
+	struct list_head locked_list;
+  };
+
+  static DEFINE_WW_CLASS(ww_class);
+
+  void __unlock_objs(struct list_head *list)
+  {
+	struct obj *entry, *temp;
+
+	list_for_each_entry_safe (entry, temp, list, locked_list) {
+		/* need to do that before unlocking, since only the current lock holder is
+		allowed to use object */
+		list_del(&entry->locked_list);
+		ww_mutex_unlock(entry->ww_mutex)
+	}
+  }
+
+  void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	struct obj *obj;
+
+	ww_acquire_init(ctx, &ww_class);
+
+  retry:
+	/* re-init loop start state */
+	loop {
+		/* magic code which walks over a graph and decides which objects
+		 * to lock */
+
+		ret = ww_mutex_lock(obj->ww_mutex, ctx);
+		if (ret == -EALREADY) {
+			/* we have that one already, get to the next object */
+			continue;
+		}
+		if (ret == -EDEADLK) {
+			__unlock_objs(list);
+
+			ww_mutex_lock_slow(obj, ctx);
+			list_add(&entry->locked_list, list);
+			goto retry;
+		}
+
+		/* locked a new object, add it to the list */
+		list_add_tail(&entry->locked_list, list);
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+  }
+
+  void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	__unlock_objs(list);
+	ww_acquire_fini(ctx);
+  }
+
+Method 4: Only lock one single objects. In that case deadlock detection and
+prevention is obviously overkill, since with grabbing just one lock you can't
+produce a deadlock within just one class. To simplify this case the w/w mutex
+api can be used with a NULL context.
+
+Implementation Details
+----------------------
+
+Design:
+^^^^^^^
+
+  ww_mutex currently encapsulates a struct mutex, this means no extra overhead for
+  normal mutex locks, which are far more common. As such there is only a small
+  increase in code size if wait/wound mutexes are not used.
+
+  We maintain the following invariants for the wait list:
+
+  (1) Waiters with an acquire context are sorted by stamp order; waiters
+      without an acquire context are interspersed in FIFO order.
+  (2) For Wait-Die, among waiters with contexts, only the first one can have
+      other locks acquired already (ctx->acquired > 0). Note that this waiter
+      may come after other waiters without contexts in the list.
+
+  The Wound-Wait preemption is implemented with a lazy-preemption scheme:
+  The wounded status of the transaction is checked only when there is
+  contention for a new lock and hence a true chance of deadlock. In that
+  situation, if the transaction is wounded, it backs off, clears the
+  wounded status and retries. A great benefit of implementing preemption in
+  this way is that the wounded transaction can identify a contending lock to
+  wait for before restarting the transaction. Just blindly restarting the
+  transaction would likely make the transaction end up in a situation where
+  it would have to back off again.
+
+  In general, not much contention is expected. The locks are typically used to
+  serialize access to resources for devices, and optimization focus should
+  therefore be directed towards the uncontended cases.
+
+Lockdep:
+^^^^^^^^
+
+  Special care has been taken to warn for as many cases of api abuse
+  as possible. Some common api abuses will be caught with
+  CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended.
+
+  Some of the errors which will be warned about:
+   - Forgetting to call ww_acquire_fini or ww_acquire_init.
+   - Attempting to lock more mutexes after ww_acquire_done.
+   - Attempting to lock the wrong mutex after -EDEADLK and
+     unlocking all mutexes.
+   - Attempting to lock the right mutex after -EDEADLK,
+     before unlocking all mutexes.
+
+   - Calling ww_mutex_lock_slow before -EDEADLK was returned.
+
+   - Unlocking mutexes with the wrong unlock function.
+   - Calling one of the ww_acquire_* twice on the same context.
+   - Using a different ww_class for the mutex than for the ww_acquire_ctx.
+   - Normal lockdep errors that can result in deadlocks.
+
+  Some of the lockdep errors that can result in deadlocks:
+   - Calling ww_acquire_init to initialize a second ww_acquire_ctx before
+     having called ww_acquire_fini on the first.
+   - 'normal' deadlocks that can occur.
+
+FIXME:
+  Update this section once we have the TASK_DEADLOCK task state flag magic
+  implemented.
diff --git a/Documentation/locking/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt
deleted file mode 100644
index f0ed7c30e695..000000000000
--- a/Documentation/locking/ww-mutex-design.txt
+++ /dev/null
@@ -1,383 +0,0 @@
-Wound/Wait Deadlock-Proof Mutex Design
-======================================
-
-Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
-
-Motivation for WW-Mutexes
--------------------------
-
-GPU's do operations that commonly involve many buffers.  Those buffers
-can be shared across contexts/processes, exist in different memory
-domains (for example VRAM vs system memory), and so on.  And with
-PRIME / dmabuf, they can even be shared across devices.  So there are
-a handful of situations where the driver needs to wait for buffers to
-become ready.  If you think about this in terms of waiting on a buffer
-mutex for it to become available, this presents a problem because
-there is no way to guarantee that buffers appear in a execbuf/batch in
-the same order in all contexts.  That is directly under control of
-userspace, and a result of the sequence of GL calls that an application
-makes.	Which results in the potential for deadlock.  The problem gets
-more complex when you consider that the kernel may need to migrate the
-buffer(s) into VRAM before the GPU operates on the buffer(s), which
-may in turn require evicting some other buffers (and you don't want to
-evict other buffers which are already queued up to the GPU), but for a
-simplified understanding of the problem you can ignore this.
-
-The algorithm that the TTM graphics subsystem came up with for dealing with
-this problem is quite simple.  For each group of buffers (execbuf) that need
-to be locked, the caller would be assigned a unique reservation id/ticket,
-from a global counter.  In case of deadlock while locking all the buffers
-associated with a execbuf, the one with the lowest reservation ticket (i.e.
-the oldest task) wins, and the one with the higher reservation id (i.e. the
-younger task) unlocks all of the buffers that it has already locked, and then
-tries again.
-
-In the RDBMS literature, a reservation ticket is associated with a transaction.
-and the deadlock handling approach is called Wait-Die. The name is based on
-the actions of a locking thread when it encounters an already locked mutex.
-If the transaction holding the lock is younger, the locking transaction waits.
-If the transaction holding the lock is older, the locking transaction backs off
-and dies. Hence Wait-Die.
-There is also another algorithm called Wound-Wait:
-If the transaction holding the lock is younger, the locking transaction
-wounds the transaction holding the lock, requesting it to die.
-If the transaction holding the lock is older, it waits for the other
-transaction. Hence Wound-Wait.
-The two algorithms are both fair in that a transaction will eventually succeed.
-However, the Wound-Wait algorithm is typically stated to generate fewer backoffs
-compared to Wait-Die, but is, on the other hand, associated with more work than
-Wait-Die when recovering from a backoff. Wound-Wait is also a preemptive
-algorithm in that transactions are wounded by other transactions, and that
-requires a reliable way to pick up up the wounded condition and preempt the
-running transaction. Note that this is not the same as process preemption. A
-Wound-Wait transaction is considered preempted when it dies (returning
--EDEADLK) following a wound.
-
-Concepts
---------
-
-Compared to normal mutexes two additional concepts/objects show up in the lock
-interface for w/w mutexes:
-
-Acquire context: To ensure eventual forward progress it is important the a task
-trying to acquire locks doesn't grab a new reservation id, but keeps the one it
-acquired when starting the lock acquisition. This ticket is stored in the
-acquire context. Furthermore the acquire context keeps track of debugging state
-to catch w/w mutex interface abuse. An acquire context is representing a
-transaction.
-
-W/w class: In contrast to normal mutexes the lock class needs to be explicit for
-w/w mutexes, since it is required to initialize the acquire context. The lock
-class also specifies what algorithm to use, Wound-Wait or Wait-Die.
-
-Furthermore there are three different class of w/w lock acquire functions:
-
-* Normal lock acquisition with a context, using ww_mutex_lock.
-
-* Slowpath lock acquisition on the contending lock, used by the task that just
-  killed its transaction after having dropped all already acquired locks.
-  These functions have the _slow postfix.
-
-  From a simple semantics point-of-view the _slow functions are not strictly
-  required, since simply calling the normal ww_mutex_lock functions on the
-  contending lock (after having dropped all other already acquired locks) will
-  work correctly. After all if no other ww mutex has been acquired yet there's
-  no deadlock potential and hence the ww_mutex_lock call will block and not
-  prematurely return -EDEADLK. The advantage of the _slow functions is in
-  interface safety:
-  - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow
-    has a void return type. Note that since ww mutex code needs loops/retries
-    anyway the __must_check doesn't result in spurious warnings, even though the
-    very first lock operation can never fail.
-  - When full debugging is enabled ww_mutex_lock_slow checks that all acquired
-    ww mutex have been released (preventing deadlocks) and makes sure that we
-    block on the contending lock (preventing spinning through the -EDEADLK
-    slowpath until the contended lock can be acquired).
-
-* Functions to only acquire a single w/w mutex, which results in the exact same
-  semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL
-  context.
-
-  Again this is not strictly required. But often you only want to acquire a
-  single lock in which case it's pointless to set up an acquire context (and so
-  better to avoid grabbing a deadlock avoidance ticket).
-
-Of course, all the usual variants for handling wake-ups due to signals are also
-provided.
-
-Usage
------
-
-The algorithm (Wait-Die vs Wound-Wait) is chosen by using either
-DEFINE_WW_CLASS() (Wound-Wait) or DEFINE_WD_CLASS() (Wait-Die)
-As a rough rule of thumb, use Wound-Wait iff you
-expect the number of simultaneous competing transactions to be typically small,
-and you want to reduce the number of rollbacks.
-
-Three different ways to acquire locks within the same w/w class. Common
-definitions for methods #1 and #2:
-
-static DEFINE_WW_CLASS(ww_class);
-
-struct obj {
-	struct ww_mutex lock;
-	/* obj data */
-};
-
-struct obj_entry {
-	struct list_head head;
-	struct obj *obj;
-};
-
-Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
-This is useful if a list of required objects is already tracked somewhere.
-Furthermore the lock helper can use propagate the -EALREADY return code back to
-the caller as a signal that an object is twice on the list. This is useful if
-the list is constructed from userspace input and the ABI requires userspace to
-not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl).
-
-int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	struct obj *res_obj = NULL;
-	struct obj_entry *contended_entry = NULL;
-	struct obj_entry *entry;
-
-	ww_acquire_init(ctx, &ww_class);
-
-retry:
-	list_for_each_entry (entry, list, head) {
-		if (entry->obj == res_obj) {
-			res_obj = NULL;
-			continue;
-		}
-		ret = ww_mutex_lock(&entry->obj->lock, ctx);
-		if (ret < 0) {
-			contended_entry = entry;
-			goto err;
-		}
-	}
-
-	ww_acquire_done(ctx);
-	return 0;
-
-err:
-	list_for_each_entry_continue_reverse (entry, list, head)
-		ww_mutex_unlock(&entry->obj->lock);
-
-	if (res_obj)
-		ww_mutex_unlock(&res_obj->lock);
-
-	if (ret == -EDEADLK) {
-		/* we lost out in a seqno race, lock and retry.. */
-		ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
-		res_obj = contended_entry->obj;
-		goto retry;
-	}
-	ww_acquire_fini(ctx);
-
-	return ret;
-}
-
-Method 2, using a list in execbuf->buffers that can be reordered. Same semantics
-of duplicate entry detection using -EALREADY as method 1 above. But the
-list-reordering allows for a bit more idiomatic code.
-
-int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	struct obj_entry *entry, *entry2;
-
-	ww_acquire_init(ctx, &ww_class);
-
-	list_for_each_entry (entry, list, head) {
-		ret = ww_mutex_lock(&entry->obj->lock, ctx);
-		if (ret < 0) {
-			entry2 = entry;
-
-			list_for_each_entry_continue_reverse (entry2, list, head)
-				ww_mutex_unlock(&entry2->obj->lock);
-
-			if (ret != -EDEADLK) {
-				ww_acquire_fini(ctx);
-				return ret;
-			}
-
-			/* we lost out in a seqno race, lock and retry.. */
-			ww_mutex_lock_slow(&entry->obj->lock, ctx);
-
-			/*
-			 * Move buf to head of the list, this will point
-			 * buf->next to the first unlocked entry,
-			 * restarting the for loop.
-			 */
-			list_del(&entry->head);
-			list_add(&entry->head, list);
-		}
-	}
-
-	ww_acquire_done(ctx);
-	return 0;
-}
-
-Unlocking works the same way for both methods #1 and #2:
-
-void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	struct obj_entry *entry;
-
-	list_for_each_entry (entry, list, head)
-		ww_mutex_unlock(&entry->obj->lock);
-
-	ww_acquire_fini(ctx);
-}
-
-Method 3 is useful if the list of objects is constructed ad-hoc and not upfront,
-e.g. when adjusting edges in a graph where each node has its own ww_mutex lock,
-and edges can only be changed when holding the locks of all involved nodes. w/w
-mutexes are a natural fit for such a case for two reasons:
-- They can handle lock-acquisition in any order which allows us to start walking
-  a graph from a starting point and then iteratively discovering new edges and
-  locking down the nodes those edges connect to.
-- Due to the -EALREADY return code signalling that a given objects is already
-  held there's no need for additional book-keeping to break cycles in the graph
-  or keep track off which looks are already held (when using more than one node
-  as a starting point).
-
-Note that this approach differs in two important ways from the above methods:
-- Since the list of objects is dynamically constructed (and might very well be
-  different when retrying due to hitting the -EDEADLK die condition) there's
-  no need to keep any object on a persistent list when it's not locked. We can
-  therefore move the list_head into the object itself.
-- On the other hand the dynamic object list construction also means that the -EALREADY return
-  code can't be propagated.
-
-Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a
-list of starting nodes (passed in from userspace) using one of the above
-methods. And then lock any additional objects affected by the operations using
-method #3 below. The backoff/retry procedure will be a bit more involved, since
-when the dynamic locking step hits -EDEADLK we also need to unlock all the
-objects acquired with the fixed list. But the w/w mutex debug checks will catch
-any interface misuse for these cases.
-
-Also, method 3 can't fail the lock acquisition step since it doesn't return
--EALREADY. Of course this would be different when using the _interruptible
-variants, but that's outside of the scope of these examples here.
-
-struct obj {
-	struct ww_mutex ww_mutex;
-	struct list_head locked_list;
-};
-
-static DEFINE_WW_CLASS(ww_class);
-
-void __unlock_objs(struct list_head *list)
-{
-	struct obj *entry, *temp;
-
-	list_for_each_entry_safe (entry, temp, list, locked_list) {
-		/* need to do that before unlocking, since only the current lock holder is
-		allowed to use object */
-		list_del(&entry->locked_list);
-		ww_mutex_unlock(entry->ww_mutex)
-	}
-}
-
-void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	struct obj *obj;
-
-	ww_acquire_init(ctx, &ww_class);
-
-retry:
-	/* re-init loop start state */
-	loop {
-		/* magic code which walks over a graph and decides which objects
-		 * to lock */
-
-		ret = ww_mutex_lock(obj->ww_mutex, ctx);
-		if (ret == -EALREADY) {
-			/* we have that one already, get to the next object */
-			continue;
-		}
-		if (ret == -EDEADLK) {
-			__unlock_objs(list);
-
-			ww_mutex_lock_slow(obj, ctx);
-			list_add(&entry->locked_list, list);
-			goto retry;
-		}
-
-		/* locked a new object, add it to the list */
-		list_add_tail(&entry->locked_list, list);
-	}
-
-	ww_acquire_done(ctx);
-	return 0;
-}
-
-void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	__unlock_objs(list);
-	ww_acquire_fini(ctx);
-}
-
-Method 4: Only lock one single objects. In that case deadlock detection and
-prevention is obviously overkill, since with grabbing just one lock you can't
-produce a deadlock within just one class. To simplify this case the w/w mutex
-api can be used with a NULL context.
-
-Implementation Details
-----------------------
-
-Design:
-  ww_mutex currently encapsulates a struct mutex, this means no extra overhead for
-  normal mutex locks, which are far more common. As such there is only a small
-  increase in code size if wait/wound mutexes are not used.
-
-  We maintain the following invariants for the wait list:
-  (1) Waiters with an acquire context are sorted by stamp order; waiters
-      without an acquire context are interspersed in FIFO order.
-  (2) For Wait-Die, among waiters with contexts, only the first one can have
-      other locks acquired already (ctx->acquired > 0). Note that this waiter
-      may come after other waiters without contexts in the list.
-
-  The Wound-Wait preemption is implemented with a lazy-preemption scheme:
-  The wounded status of the transaction is checked only when there is
-  contention for a new lock and hence a true chance of deadlock. In that
-  situation, if the transaction is wounded, it backs off, clears the
-  wounded status and retries. A great benefit of implementing preemption in
-  this way is that the wounded transaction can identify a contending lock to
-  wait for before restarting the transaction. Just blindly restarting the
-  transaction would likely make the transaction end up in a situation where
-  it would have to back off again.
-
-  In general, not much contention is expected. The locks are typically used to
-  serialize access to resources for devices, and optimization focus should
-  therefore be directed towards the uncontended cases.
-
-Lockdep:
-  Special care has been taken to warn for as many cases of api abuse
-  as possible. Some common api abuses will be caught with
-  CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended.
-
-  Some of the errors which will be warned about:
-   - Forgetting to call ww_acquire_fini or ww_acquire_init.
-   - Attempting to lock more mutexes after ww_acquire_done.
-   - Attempting to lock the wrong mutex after -EDEADLK and
-     unlocking all mutexes.
-   - Attempting to lock the right mutex after -EDEADLK,
-     before unlocking all mutexes.
-
-   - Calling ww_mutex_lock_slow before -EDEADLK was returned.
-
-   - Unlocking mutexes with the wrong unlock function.
-   - Calling one of the ww_acquire_* twice on the same context.
-   - Using a different ww_class for the mutex than for the ww_acquire_ctx.
-   - Normal lockdep errors that can result in deadlocks.
-
-  Some of the lockdep errors that can result in deadlocks:
-   - Calling ww_acquire_init to initialize a second ww_acquire_ctx before
-     having called ww_acquire_fini on the first.
-   - 'normal' deadlocks that can occur.
-
-FIXME: Update this section once we have the TASK_DEADLOCK task state flag magic
-implemented.
diff --git a/Documentation/m68k/index.rst b/Documentation/m68k/index.rst
new file mode 100644
index 000000000000..3a5ba7fe1703
--- /dev/null
+++ b/Documentation/m68k/index.rst
@@ -0,0 +1,17 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+m68k Architecture
+=================
+
+.. toctree::
+   :maxdepth: 2
+
+   kernel-options
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/m68k/kernel-options.rst b/Documentation/m68k/kernel-options.rst
new file mode 100644
index 000000000000..cabd9419740d
--- /dev/null
+++ b/Documentation/m68k/kernel-options.rst
@@ -0,0 +1,911 @@
+===================================
+Command Line Options for Linux/m68k
+===================================
+
+Last Update: 2 May 1999
+
+Linux/m68k version: 2.2.6
+
+Author: Roman.Hodek@informatik.uni-erlangen.de (Roman Hodek)
+
+Update: jds@kom.auc.dk (Jes Sorensen) and faq@linux-m68k.org (Chris Lawrence)
+
+0) Introduction
+===============
+
+Often I've been asked which command line options the Linux/m68k
+kernel understands, or how the exact syntax for the ... option is, or
+... about the option ... . I hope, this document supplies all the
+answers...
+
+Note that some options might be outdated, their descriptions being
+incomplete or missing. Please update the information and send in the
+patches.
+
+
+1) Overview of the Kernel's Option Processing
+=============================================
+
+The kernel knows three kinds of options on its command line:
+
+  1) kernel options
+  2) environment settings
+  3) arguments for init
+
+To which of these classes an argument belongs is determined as
+follows: If the option is known to the kernel itself, i.e. if the name
+(the part before the '=') or, in some cases, the whole argument string
+is known to the kernel, it belongs to class 1. Otherwise, if the
+argument contains an '=', it is of class 2, and the definition is put
+into init's environment. All other arguments are passed to init as
+command line options.
+
+This document describes the valid kernel options for Linux/m68k in
+the version mentioned at the start of this file. Later revisions may
+add new such options, and some may be missing in older versions.
+
+In general, the value (the part after the '=') of an option is a
+list of values separated by commas. The interpretation of these values
+is up to the driver that "owns" the option. This association of
+options with drivers is also the reason that some are further
+subdivided.
+
+
+2) General Kernel Options
+=========================
+
+2.1) root=
+----------
+
+:Syntax: root=/dev/<device>
+:or:     root=<hex_number>
+
+This tells the kernel which device it should mount as the root
+filesystem. The device must be a block device with a valid filesystem
+on it.
+
+The first syntax gives the device by name. These names are converted
+into a major/minor number internally in the kernel in an unusual way.
+Normally, this "conversion" is done by the device files in /dev, but
+this isn't possible here, because the root filesystem (with /dev)
+isn't mounted yet... So the kernel parses the name itself, with some
+hardcoded name to number mappings. The name must always be a
+combination of two or three letters, followed by a decimal number.
+Valid names are::
+
+  /dev/ram: -> 0x0100 (initial ramdisk)
+  /dev/hda: -> 0x0300 (first IDE disk)
+  /dev/hdb: -> 0x0340 (second IDE disk)
+  /dev/sda: -> 0x0800 (first SCSI disk)
+  /dev/sdb: -> 0x0810 (second SCSI disk)
+  /dev/sdc: -> 0x0820 (third SCSI disk)
+  /dev/sdd: -> 0x0830 (forth SCSI disk)
+  /dev/sde: -> 0x0840 (fifth SCSI disk)
+  /dev/fd : -> 0x0200 (floppy disk)
+
+The name must be followed by a decimal number, that stands for the
+partition number. Internally, the value of the number is just
+added to the device number mentioned in the table above. The
+exceptions are /dev/ram and /dev/fd, where /dev/ram refers to an
+initial ramdisk loaded by your bootstrap program (please consult the
+instructions for your bootstrap program to find out how to load an
+initial ramdisk). As of kernel version 2.0.18 you must specify
+/dev/ram as the root device if you want to boot from an initial
+ramdisk. For the floppy devices, /dev/fd, the number stands for the
+floppy drive number (there are no partitions on floppy disks). I.e.,
+/dev/fd0 stands for the first drive, /dev/fd1 for the second, and so
+on. Since the number is just added, you can also force the disk format
+by adding a number greater than 3. If you look into your /dev
+directory, use can see the /dev/fd0D720 has major 2 and minor 16. You
+can specify this device for the root FS by writing "root=/dev/fd16" on
+the kernel command line.
+
+[Strange and maybe uninteresting stuff ON]
+
+This unusual translation of device names has some strange
+consequences: If, for example, you have a symbolic link from /dev/fd
+to /dev/fd0D720 as an abbreviation for floppy driver #0 in DD format,
+you cannot use this name for specifying the root device, because the
+kernel cannot see this symlink before mounting the root FS and it
+isn't in the table above. If you use it, the root device will not be
+set at all, without an error message. Another example: You cannot use a
+partition on e.g. the sixth SCSI disk as the root filesystem, if you
+want to specify it by name. This is, because only the devices up to
+/dev/sde are in the table above, but not /dev/sdf. Although, you can
+use the sixth SCSI disk for the root FS, but you have to specify the
+device by number... (see below). Or, even more strange, you can use the
+fact that there is no range checking of the partition number, and your
+knowledge that each disk uses 16 minors, and write "root=/dev/sde17"
+(for /dev/sdf1).
+
+[Strange and maybe uninteresting stuff OFF]
+
+If the device containing your root partition isn't in the table
+above, you can also specify it by major and minor numbers. These are
+written in hex, with no prefix and no separator between. E.g., if you
+have a CD with contents appropriate as a root filesystem in the first
+SCSI CD-ROM drive, you boot from it by "root=0b00". Here, hex "0b" =
+decimal 11 is the major of SCSI CD-ROMs, and the minor 0 stands for
+the first of these. You can find out all valid major numbers by
+looking into include/linux/major.h.
+
+In addition to major and minor numbers, if the device containing your
+root partition uses a partition table format with unique partition
+identifiers, then you may use them.  For instance,
+"root=PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF".  It is also
+possible to reference another partition on the same device using a
+known partition UUID as the starting point.  For example,
+if partition 5 of the device has the UUID of
+00112233-4455-6677-8899-AABBCCDDEEFF then partition 3 may be found as
+follows:
+
+  PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF/PARTNROFF=-2
+
+Authoritative information can be found in
+"Documentation/admin-guide/kernel-parameters.rst".
+
+
+2.2) ro, rw
+-----------
+
+:Syntax: ro
+:or:     rw
+
+These two options tell the kernel whether it should mount the root
+filesystem read-only or read-write. The default is read-only, except
+for ramdisks, which default to read-write.
+
+
+2.3) debug
+----------
+
+:Syntax: debug
+
+This raises the kernel log level to 10 (the default is 7). This is the
+same level as set by the "dmesg" command, just that the maximum level
+selectable by dmesg is 8.
+
+
+2.4) debug=
+-----------
+
+:Syntax: debug=<device>
+
+This option causes certain kernel messages be printed to the selected
+debugging device. This can aid debugging the kernel, since the
+messages can be captured and analyzed on some other machine. Which
+devices are possible depends on the machine type. There are no checks
+for the validity of the device name. If the device isn't implemented,
+nothing happens.
+
+Messages logged this way are in general stack dumps after kernel
+memory faults or bad kernel traps, and kernel panics. To be exact: all
+messages of level 0 (panic messages) and all messages printed while
+the log level is 8 or more (their level doesn't matter). Before stack
+dumps, the kernel sets the log level to 10 automatically. A level of
+at least 8 can also be set by the "debug" command line option (see
+2.3) and at run time with "dmesg -n 8".
+
+Devices possible for Amiga:
+
+ - "ser":
+	  built-in serial port; parameters: 9600bps, 8N1
+ - "mem":
+	  Save the messages to a reserved area in chip mem. After
+          rebooting, they can be read under AmigaOS with the tool
+          'dmesg'.
+
+Devices possible for Atari:
+
+ - "ser1":
+	   ST-MFP serial port ("Modem1"); parameters: 9600bps, 8N1
+ - "ser2":
+	   SCC channel B serial port ("Modem2"); parameters: 9600bps, 8N1
+ - "ser" :
+	   default serial port
+           This is "ser2" for a Falcon, and "ser1" for any other machine
+ - "midi":
+	   The MIDI port; parameters: 31250bps, 8N1
+ - "par" :
+	   parallel port
+
+           The printing routine for this implements a timeout for the
+           case there's no printer connected (else the kernel would
+           lock up). The timeout is not exact, but usually a few
+           seconds.
+
+
+2.6) ramdisk_size=
+------------------
+
+:Syntax: ramdisk_size=<size>
+
+This option instructs the kernel to set up a ramdisk of the given
+size in KBytes. Do not use this option if the ramdisk contents are
+passed by bootstrap! In this case, the size is selected automatically
+and should not be overwritten.
+
+The only application is for root filesystems on floppy disks, that
+should be loaded into memory. To do that, select the corresponding
+size of the disk as ramdisk size, and set the root device to the disk
+drive (with "root=").
+
+
+2.7) swap=
+
+  I can't find any sign of this option in 2.2.6.
+
+2.8) buff=
+-----------
+
+  I can't find any sign of this option in 2.2.6.
+
+
+3) General Device Options (Amiga and Atari)
+===========================================
+
+3.1) ether=
+-----------
+
+:Syntax: ether=[<irq>[,<base_addr>[,<mem_start>[,<mem_end>]]]],<dev-name>
+
+<dev-name> is the name of a net driver, as specified in
+drivers/net/Space.c in the Linux source. Most prominent are eth0, ...
+eth3, sl0, ... sl3, ppp0, ..., ppp3, dummy, and lo.
+
+The non-ethernet drivers (sl, ppp, dummy, lo) obviously ignore the
+settings by this options. Also, the existing ethernet drivers for
+Linux/m68k (ariadne, a2065, hydra) don't use them because Zorro boards
+are really Plug-'n-Play, so the "ether=" option is useless altogether
+for Linux/m68k.
+
+
+3.2) hd=
+--------
+
+:Syntax: hd=<cylinders>,<heads>,<sectors>
+
+This option sets the disk geometry of an IDE disk. The first hd=
+option is for the first IDE disk, the second for the second one.
+(I.e., you can give this option twice.) In most cases, you won't have
+to use this option, since the kernel can obtain the geometry data
+itself. It exists just for the case that this fails for one of your
+disks.
+
+
+3.3) max_scsi_luns=
+-------------------
+
+:Syntax: max_scsi_luns=<n>
+
+Sets the maximum number of LUNs (logical units) of SCSI devices to
+be scanned. Valid values for <n> are between 1 and 8. Default is 8 if
+"Probe all LUNs on each SCSI device" was selected during the kernel
+configuration, else 1.
+
+
+3.4) st=
+--------
+
+:Syntax: st=<buffer_size>,[<write_thres>,[<max_buffers>]]
+
+Sets several parameters of the SCSI tape driver. <buffer_size> is
+the number of 512-byte buffers reserved for tape operations for each
+device. <write_thres> sets the number of blocks which must be filled
+to start an actual write operation to the tape. Maximum value is the
+total number of buffers. <max_buffer> limits the total number of
+buffers allocated for all tape devices.
+
+
+3.5) dmasound=
+--------------
+
+:Syntax: dmasound=[<buffers>,<buffer-size>[,<catch-radius>]]
+
+This option controls some configurations of the Linux/m68k DMA sound
+driver (Amiga and Atari): <buffers> is the number of buffers you want
+to use (minimum 4, default 4), <buffer-size> is the size of each
+buffer in kilobytes (minimum 4, default 32) and <catch-radius> says
+how much percent of error will be tolerated when setting a frequency
+(maximum 10, default 0). For example with 3% you can play 8000Hz
+AU-Files on the Falcon with its hardware frequency of 8195Hz and thus
+don't need to expand the sound.
+
+
+
+4) Options for Atari Only
+=========================
+
+4.1) video=
+-----------
+
+:Syntax: video=<fbname>:<sub-options...>
+
+The <fbname> parameter specifies the name of the frame buffer,
+eg. most atari users will want to specify `atafb` here. The
+<sub-options> is a comma-separated list of the sub-options listed
+below.
+
+NB:
+    Please notice that this option was renamed from `atavideo` to
+    `video` during the development of the 1.3.x kernels, thus you
+    might need to update your boot-scripts if upgrading to 2.x from
+    an 1.2.x kernel.
+
+NBB:
+    The behavior of video= was changed in 2.1.57 so the recommended
+    option is to specify the name of the frame buffer.
+
+4.1.1) Video Mode
+-----------------
+
+This sub-option may be any of the predefined video modes, as listed
+in atari/atafb.c in the Linux/m68k source tree. The kernel will
+activate the given video mode at boot time and make it the default
+mode, if the hardware allows. Currently defined names are:
+
+ - stlow           : 320x200x4
+ - stmid, default5 : 640x200x2
+ - sthigh, default4: 640x400x1
+ - ttlow           : 320x480x8, TT only
+ - ttmid, default1 : 640x480x4, TT only
+ - tthigh, default2: 1280x960x1, TT only
+ - vga2            : 640x480x1, Falcon only
+ - vga4            : 640x480x2, Falcon only
+ - vga16, default3 : 640x480x4, Falcon only
+ - vga256          : 640x480x8, Falcon only
+ - falh2           : 896x608x1, Falcon only
+ - falh16          : 896x608x4, Falcon only
+
+If no video mode is given on the command line, the kernel tries the
+modes names "default<n>" in turn, until one is possible with the
+hardware in use.
+
+A video mode setting doesn't make sense, if the external driver is
+activated by a "external:" sub-option.
+
+4.1.2) inverse
+--------------
+
+Invert the display. This affects both, text (consoles) and graphics
+(X) display. Usually, the background is chosen to be black. With this
+option, you can make the background white.
+
+4.1.3) font
+-----------
+
+:Syntax: font:<fontname>
+
+Specify the font to use in text modes. Currently you can choose only
+between `VGA8x8`, `VGA8x16` and `PEARL8x8`. `VGA8x8` is default, if the
+vertical size of the display is less than 400 pixel rows. Otherwise, the
+`VGA8x16` font is the default.
+
+4.1.4) `hwscroll_`
+------------------
+
+:Syntax: `hwscroll_<n>`
+
+The number of additional lines of video memory to reserve for
+speeding up the scrolling ("hardware scrolling"). Hardware scrolling
+is possible only if the kernel can set the video base address in steps
+fine enough. This is true for STE, MegaSTE, TT, and Falcon. It is not
+possible with plain STs and graphics cards (The former because the
+base address must be on a 256 byte boundary there, the latter because
+the kernel doesn't know how to set the base address at all.)
+
+By default, <n> is set to the number of visible text lines on the
+display. Thus, the amount of video memory is doubled, compared to no
+hardware scrolling. You can turn off the hardware scrolling altogether
+by setting <n> to 0.
+
+4.1.5) internal:
+----------------
+
+:Syntax: internal:<xres>;<yres>[;<xres_max>;<yres_max>;<offset>]
+
+This option specifies the capabilities of some extended internal video
+hardware, like e.g. OverScan. <xres> and <yres> give the (extended)
+dimensions of the screen.
+
+If your OverScan needs a black border, you have to write the last
+three arguments of the "internal:". <xres_max> is the maximum line
+length the hardware allows, <yres_max> the maximum number of lines.
+<offset> is the offset of the visible part of the screen memory to its
+physical start, in bytes.
+
+Often, extended interval video hardware has to be activated somehow.
+For this, see the "sw_*" options below.
+
+4.1.6) external:
+----------------
+
+:Syntax:
+  external:<xres>;<yres>;<depth>;<org>;<scrmem>[;<scrlen>[;<vgabase>
+  [;<colw>[;<coltype>[;<xres_virtual>]]]]]
+
+.. I had to break this line...
+
+This is probably the most complicated parameter... It specifies that
+you have some external video hardware (a graphics board), and how to
+use it under Linux/m68k. The kernel cannot know more about the hardware
+than you tell it here! The kernel also is unable to set or change any
+video modes, since it doesn't know about any board internal. So, you
+have to switch to that video mode before you start Linux, and cannot
+switch to another mode once Linux has started.
+
+The first 3 parameters of this sub-option should be obvious: <xres>,
+<yres> and <depth> give the dimensions of the screen and the number of
+planes (depth). The depth is the logarithm to base 2 of the number
+of colors possible. (Or, the other way round: The number of colors is
+2^depth).
+
+You have to tell the kernel furthermore how the video memory is
+organized. This is done by a letter as <org> parameter:
+
+ 'n':
+      "normal planes", i.e. one whole plane after another
+ 'i':
+      "interleaved planes", i.e. 16 bit of the first plane, than 16 bit
+      of the next, and so on... This mode is used only with the
+      built-in Atari video modes, I think there is no card that
+      supports this mode.
+ 'p':
+      "packed pixels", i.e. <depth> consecutive bits stand for all
+      planes of one pixel; this is the most common mode for 8 planes
+      (256 colors) on graphic cards
+ 't':
+      "true color" (more or less packed pixels, but without a color
+      lookup table); usually depth is 24
+
+For monochrome modes (i.e., <depth> is 1), the <org> letter has a
+different meaning:
+
+ 'n':
+      normal colors, i.e. 0=white, 1=black
+ 'i':
+      inverted colors, i.e. 0=black, 1=white
+
+The next important information about the video hardware is the base
+address of the video memory. That is given in the <scrmem> parameter,
+as a hexadecimal number with a "0x" prefix. You have to find out this
+address in the documentation of your hardware.
+
+The next parameter, <scrlen>, tells the kernel about the size of the
+video memory. If it's missing, the size is calculated from <xres>,
+<yres>, and <depth>. For now, it is not useful to write a value here.
+It would be used only for hardware scrolling (which isn't possible
+with the external driver, because the kernel cannot set the video base
+address), or for virtual resolutions under X (which the X server
+doesn't support yet). So, it's currently best to leave this field
+empty, either by ending the "external:" after the video address or by
+writing two consecutive semicolons, if you want to give a <vgabase>
+(it is allowed to leave this parameter empty).
+
+The <vgabase> parameter is optional. If it is not given, the kernel
+cannot read or write any color registers of the video hardware, and
+thus you have to set appropriate colors before you start Linux. But if
+your card is somehow VGA compatible, you can tell the kernel the base
+address of the VGA register set, so it can change the color lookup
+table. You have to look up this address in your board's documentation.
+To avoid misunderstandings: <vgabase> is the _base_ address, i.e. a 4k
+aligned address. For read/writing the color registers, the kernel
+uses the addresses vgabase+0x3c7...vgabase+0x3c9. The <vgabase>
+parameter is written in hexadecimal with a "0x" prefix, just as
+<scrmem>.
+
+<colw> is meaningful only if <vgabase> is specified. It tells the
+kernel how wide each of the color register is, i.e. the number of bits
+per single color (red/green/blue). Default is 6, another quite usual
+value is 8.
+
+Also <coltype> is used together with <vgabase>. It tells the kernel
+about the color register model of your gfx board. Currently, the types
+"vga" (which is also the default) and "mv300" (SANG MV300) are
+implemented.
+
+Parameter <xres_virtual> is required for ProMST or ET4000 cards where
+the physical linelength differs from the visible length. With ProMST,
+xres_virtual must be set to 2048. For ET4000, xres_virtual depends on the
+initialisation of the video-card.
+If you're missing a corresponding yres_virtual: the external part is legacy,
+therefore we don't support hardware-dependent functions like hardware-scroll,
+panning or blanking.
+
+4.1.7) eclock:
+--------------
+
+The external pixel clock attached to the Falcon VIDEL shifter. This
+currently works only with the ScreenWonder!
+
+4.1.8) monitorcap:
+-------------------
+
+:Syntax: monitorcap:<vmin>;<vmax>;<hmin>;<hmax>
+
+This describes the capabilities of a multisync monitor. Don't use it
+with a fixed-frequency monitor! For now, only the Falcon frame buffer
+uses the settings of "monitorcap:".
+
+<vmin> and <vmax> are the minimum and maximum, resp., vertical frequencies
+your monitor can work with, in Hz. <hmin> and <hmax> are the same for
+the horizontal frequency, in kHz.
+
+  The defaults are 58;62;31;32 (VGA compatible).
+
+  The defaults for TV/SC1224/SC1435 cover both PAL and NTSC standards.
+
+4.1.9) keep
+------------
+
+If this option is given, the framebuffer device doesn't do any video
+mode calculations and settings on its own. The only Atari fb device
+that does this currently is the Falcon.
+
+What you reach with this: Settings for unknown video extensions
+aren't overridden by the driver, so you can still use the mode found
+when booting, when the driver doesn't know to set this mode itself.
+But this also means, that you can't switch video modes anymore...
+
+An example where you may want to use "keep" is the ScreenBlaster for
+the Falcon.
+
+
+4.2) atamouse=
+--------------
+
+:Syntax: atamouse=<x-threshold>,[<y-threshold>]
+
+With this option, you can set the mouse movement reporting threshold.
+This is the number of pixels of mouse movement that have to accumulate
+before the IKBD sends a new mouse packet to the kernel. Higher values
+reduce the mouse interrupt load and thus reduce the chance of keyboard
+overruns. Lower values give a slightly faster mouse responses and
+slightly better mouse tracking.
+
+You can set the threshold in x and y separately, but usually this is
+of little practical use. If there's just one number in the option, it
+is used for both dimensions. The default value is 2 for both
+thresholds.
+
+
+4.3) ataflop=
+-------------
+
+:Syntax: ataflop=<drive type>[,<trackbuffering>[,<steprateA>[,<steprateB>]]]
+
+   The drive type may be 0, 1, or 2, for DD, HD, and ED, resp. This
+   setting affects how many buffers are reserved and which formats are
+   probed (see also below). The default is 1 (HD). Only one drive type
+   can be selected. If you have two disk drives, select the "better"
+   type.
+
+   The second parameter <trackbuffer> tells the kernel whether to use
+   track buffering (1) or not (0). The default is machine-dependent:
+   no for the Medusa and yes for all others.
+
+   With the two following parameters, you can change the default
+   steprate used for drive A and B, resp.
+
+
+4.4) atascsi=
+-------------
+
+:Syntax: atascsi=<can_queue>[,<cmd_per_lun>[,<scat-gat>[,<host-id>[,<tagged>]]]]
+
+This option sets some parameters for the Atari native SCSI driver.
+Generally, any number of arguments can be omitted from the end. And
+for each of the numbers, a negative value means "use default". The
+defaults depend on whether TT-style or Falcon-style SCSI is used.
+Below, defaults are noted as n/m, where the first value refers to
+TT-SCSI and the latter to Falcon-SCSI. If an illegal value is given
+for one parameter, an error message is printed and that one setting is
+ignored (others aren't affected).
+
+  <can_queue>:
+    This is the maximum number of SCSI commands queued internally to the
+    Atari SCSI driver. A value of 1 effectively turns off the driver
+    internal multitasking (if it causes problems). Legal values are >=
+    1. <can_queue> can be as high as you like, but values greater than
+    <cmd_per_lun> times the number of SCSI targets (LUNs) you have
+    don't make sense. Default: 16/8.
+
+  <cmd_per_lun>:
+    Maximum number of SCSI commands issued to the driver for one
+    logical unit (LUN, usually one SCSI target). Legal values start
+    from 1. If tagged queuing (see below) is not used, values greater
+    than 2 don't make sense, but waste memory. Otherwise, the maximum
+    is the number of command tags available to the driver (currently
+    32). Default: 8/1. (Note: Values > 1 seem to cause problems on a
+    Falcon, cause not yet known.)
+
+    The <cmd_per_lun> value at a great part determines the amount of
+    memory SCSI reserves for itself. The formula is rather
+    complicated, but I can give you some hints:
+
+      no scatter-gather:
+	cmd_per_lun * 232 bytes
+      full scatter-gather:
+	cmd_per_lun * approx. 17 Kbytes
+
+  <scat-gat>:
+    Size of the scatter-gather table, i.e. the number of requests
+    consecutive on the disk that can be merged into one SCSI command.
+    Legal values are between 0 and 255. Default: 255/0. Note: This
+    value is forced to 0 on a Falcon, since scatter-gather isn't
+    possible with the ST-DMA. Not using scatter-gather hurts
+    performance significantly.
+
+  <host-id>:
+    The SCSI ID to be used by the initiator (your Atari). This is
+    usually 7, the highest possible ID. Every ID on the SCSI bus must
+    be unique. Default: determined at run time: If the NV-RAM checksum
+    is valid, and bit 7 in byte 30 of the NV-RAM is set, the lower 3
+    bits of this byte are used as the host ID. (This method is defined
+    by Atari and also used by some TOS HD drivers.) If the above
+    isn't given, the default ID is 7. (both, TT and Falcon).
+
+  <tagged>:
+    0 means turn off tagged queuing support, all other values > 0 mean
+    use tagged queuing for targets that support it. Default: currently
+    off, but this may change when tagged queuing handling has been
+    proved to be reliable.
+
+    Tagged queuing means that more than one command can be issued to
+    one LUN, and the SCSI device itself orders the requests so they
+    can be performed in optimal order. Not all SCSI devices support
+    tagged queuing (:-().
+
+4.5 switches=
+-------------
+
+:Syntax: switches=<list of switches>
+
+With this option you can switch some hardware lines that are often
+used to enable/disable certain hardware extensions. Examples are
+OverScan, overclocking, ...
+
+The <list of switches> is a comma-separated list of the following
+items:
+
+  ikbd:
+	set RTS of the keyboard ACIA high
+  midi:
+	set RTS of the MIDI ACIA high
+  snd6:
+	set bit 6 of the PSG port A
+  snd7:
+	set bit 6 of the PSG port A
+
+It doesn't make sense to mention a switch more than once (no
+difference to only once), but you can give as many switches as you
+want to enable different features. The switch lines are set as early
+as possible during kernel initialization (even before determining the
+present hardware.)
+
+All of the items can also be prefixed with `ov_`, i.e. `ov_ikbd`,
+`ov_midi`, ... These options are meant for switching on an OverScan
+video extension. The difference to the bare option is that the
+switch-on is done after video initialization, and somehow synchronized
+to the HBLANK. A speciality is that ov_ikbd and ov_midi are switched
+off before rebooting, so that OverScan is disabled and TOS boots
+correctly.
+
+If you give an option both, with and without the `ov_` prefix, the
+earlier initialization (`ov_`-less) takes precedence. But the
+switching-off on reset still happens in this case.
+
+5) Options for Amiga Only:
+==========================
+
+5.1) video=
+-----------
+
+:Syntax: video=<fbname>:<sub-options...>
+
+The <fbname> parameter specifies the name of the frame buffer, valid
+options are `amifb`, `cyber`, 'virge', `retz3` and `clgen`, provided
+that the respective frame buffer devices have been compiled into the
+kernel (or compiled as loadable modules). The behavior of the <fbname>
+option was changed in 2.1.57 so it is now recommended to specify this
+option.
+
+The <sub-options> is a comma-separated list of the sub-options listed
+below. This option is organized similar to the Atari version of the
+"video"-option (4.1), but knows fewer sub-options.
+
+5.1.1) video mode
+-----------------
+
+Again, similar to the video mode for the Atari (see 4.1.1). Predefined
+modes depend on the used frame buffer device.
+
+OCS, ECS and AGA machines all use the color frame buffer. The following
+predefined video modes are available:
+
+NTSC modes:
+ - ntsc            : 640x200, 15 kHz, 60 Hz
+ - ntsc-lace       : 640x400, 15 kHz, 60 Hz interlaced
+
+PAL modes:
+ - pal             : 640x256, 15 kHz, 50 Hz
+ - pal-lace        : 640x512, 15 kHz, 50 Hz interlaced
+
+ECS modes:
+ - multiscan       : 640x480, 29 kHz, 57 Hz
+ - multiscan-lace  : 640x960, 29 kHz, 57 Hz interlaced
+ - euro36          : 640x200, 15 kHz, 72 Hz
+ - euro36-lace     : 640x400, 15 kHz, 72 Hz interlaced
+ - euro72          : 640x400, 29 kHz, 68 Hz
+ - euro72-lace     : 640x800, 29 kHz, 68 Hz interlaced
+ - super72         : 800x300, 23 kHz, 70 Hz
+ - super72-lace    : 800x600, 23 kHz, 70 Hz interlaced
+ - dblntsc-ff      : 640x400, 27 kHz, 57 Hz
+ - dblntsc-lace    : 640x800, 27 kHz, 57 Hz interlaced
+ - dblpal-ff       : 640x512, 27 kHz, 47 Hz
+ - dblpal-lace     : 640x1024, 27 kHz, 47 Hz interlaced
+ - dblntsc         : 640x200, 27 kHz, 57 Hz doublescan
+ - dblpal          : 640x256, 27 kHz, 47 Hz doublescan
+
+VGA modes:
+ - vga             : 640x480, 31 kHz, 60 Hz
+ - vga70           : 640x400, 31 kHz, 70 Hz
+
+Please notice that the ECS and VGA modes require either an ECS or AGA
+chipset, and that these modes are limited to 2-bit color for the ECS
+chipset and 8-bit color for the AGA chipset.
+
+5.1.2) depth
+------------
+
+:Syntax: depth:<nr. of bit-planes>
+
+Specify the number of bit-planes for the selected video-mode.
+
+5.1.3) inverse
+--------------
+
+Use inverted display (black on white). Functionally the same as the
+"inverse" sub-option for the Atari.
+
+5.1.4) font
+-----------
+
+:Syntax: font:<fontname>
+
+Specify the font to use in text modes. Functionally the same as the
+"font" sub-option for the Atari, except that `PEARL8x8` is used instead
+of `VGA8x8` if the vertical size of the display is less than 400 pixel
+rows.
+
+5.1.5) monitorcap:
+-------------------
+
+:Syntax: monitorcap:<vmin>;<vmax>;<hmin>;<hmax>
+
+This describes the capabilities of a multisync monitor. For now, only
+the color frame buffer uses the settings of "monitorcap:".
+
+<vmin> and <vmax> are the minimum and maximum, resp., vertical frequencies
+your monitor can work with, in Hz. <hmin> and <hmax> are the same for
+the horizontal frequency, in kHz.
+
+The defaults are 50;90;15;38 (Generic Amiga multisync monitor).
+
+
+5.2) fd_def_df0=
+----------------
+
+:Syntax: fd_def_df0=<value>
+
+Sets the df0 value for "silent" floppy drives. The value should be in
+hexadecimal with "0x" prefix.
+
+
+5.3) wd33c93=
+-------------
+
+:Syntax: wd33c93=<sub-options...>
+
+These options affect the A590/A2091, A3000 and GVP Series II SCSI
+controllers.
+
+The <sub-options> is a comma-separated list of the sub-options listed
+below.
+
+5.3.1) nosync
+-------------
+
+:Syntax: nosync:bitmask
+
+bitmask is a byte where the 1st 7 bits correspond with the 7
+possible SCSI devices. Set a bit to prevent sync negotiation on that
+device. To maintain backwards compatibility, a command-line such as
+"wd33c93=255" will be automatically translated to
+"wd33c93=nosync:0xff". The default is to disable sync negotiation for
+all devices, eg. nosync:0xff.
+
+5.3.2) period
+-------------
+
+:Syntax: period:ns
+
+`ns` is the minimum # of nanoseconds in a SCSI data transfer
+period. Default is 500; acceptable values are 250 - 1000.
+
+5.3.3) disconnect
+-----------------
+
+:Syntax: disconnect:x
+
+Specify x = 0 to never allow disconnects, 2 to always allow them.
+x = 1 does 'adaptive' disconnects, which is the default and generally
+the best choice.
+
+5.3.4) debug
+------------
+
+:Syntax: debug:x
+
+If `DEBUGGING_ON` is defined, x is a bit mask that causes various
+types of debug output to printed - see the DB_xxx defines in
+wd33c93.h.
+
+5.3.5) clock
+------------
+
+:Syntax: clock:x
+
+x = clock input in MHz for WD33c93 chip. Normal values would be from
+8 through 20. The default value depends on your hostadapter(s),
+default for the A3000 internal controller is 14, for the A2091 it's 8
+and for the GVP hostadapters it's either 8 or 14, depending on the
+hostadapter and the SCSI-clock jumper present on some GVP
+hostadapters.
+
+5.3.6) next
+-----------
+
+No argument. Used to separate blocks of keywords when there's more
+than one wd33c93-based host adapter in the system.
+
+5.3.7) nodma
+------------
+
+:Syntax: nodma:x
+
+If x is 1 (or if the option is just written as "nodma"), the WD33c93
+controller will not use DMA (= direct memory access) to access the
+Amiga's memory.  This is useful for some systems (like A3000's and
+A4000's with the A3640 accelerator, revision 3.0) that have problems
+using DMA to chip memory.  The default is 0, i.e. to use DMA if
+possible.
+
+
+5.4) gvp11=
+-----------
+
+:Syntax: gvp11=<addr-mask>
+
+The earlier versions of the GVP driver did not handle DMA
+address-mask settings correctly which made it necessary for some
+people to use this option, in order to get their GVP controller
+running under Linux. These problems have hopefully been solved and the
+use of this option is now highly unrecommended!
+
+Incorrect use can lead to unpredictable behavior, so please only use
+this option if you *know* what you are doing and have a reason to do
+so. In any case if you experience problems and need to use this
+option, please inform us about it by mailing to the Linux/68k kernel
+mailing list.
+
+The address mask set by this option specifies which addresses are
+valid for DMA with the GVP Series II SCSI controller. An address is
+valid, if no bits are set except the bits that are set in the mask,
+too.
+
+Some versions of the GVP can only DMA into a 24 bit address range,
+some can address a 25 bit address range while others can use the whole
+32 bit address range for DMA. The correct setting depends on your
+controller and should be autodetected by the driver. An example is the
+24 bit region which is specified by a mask of 0x00fffffe.
diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt
deleted file mode 100644
index 79d21246c75a..000000000000
--- a/Documentation/m68k/kernel-options.txt
+++ /dev/null
@@ -1,884 +0,0 @@
-
-
-				  Command Line Options for Linux/m68k
-				  ===================================
-
-Last Update: 2 May 1999
-Linux/m68k version: 2.2.6
-Author: Roman.Hodek@informatik.uni-erlangen.de (Roman Hodek)
-Update: jds@kom.auc.dk (Jes Sorensen) and faq@linux-m68k.org (Chris Lawrence)
-
-0) Introduction
-===============
-
-  Often I've been asked which command line options the Linux/m68k
-kernel understands, or how the exact syntax for the ... option is, or
-... about the option ... . I hope, this document supplies all the
-answers...
-
-  Note that some options might be outdated, their descriptions being
-incomplete or missing. Please update the information and send in the
-patches.
-
-
-1) Overview of the Kernel's Option Processing
-=============================================
-
-The kernel knows three kinds of options on its command line:
-
-  1) kernel options
-  2) environment settings
-  3) arguments for init
-
-To which of these classes an argument belongs is determined as
-follows: If the option is known to the kernel itself, i.e. if the name
-(the part before the '=') or, in some cases, the whole argument string
-is known to the kernel, it belongs to class 1. Otherwise, if the
-argument contains an '=', it is of class 2, and the definition is put
-into init's environment. All other arguments are passed to init as
-command line options.
-
-  This document describes the valid kernel options for Linux/m68k in
-the version mentioned at the start of this file. Later revisions may
-add new such options, and some may be missing in older versions.
-
-  In general, the value (the part after the '=') of an option is a
-list of values separated by commas. The interpretation of these values
-is up to the driver that "owns" the option. This association of
-options with drivers is also the reason that some are further
-subdivided.
-
-
-2) General Kernel Options
-=========================
-
-2.1) root=
-----------
-
-Syntax: root=/dev/<device>
-    or: root=<hex_number>
-
-This tells the kernel which device it should mount as the root
-filesystem. The device must be a block device with a valid filesystem
-on it.
-
-  The first syntax gives the device by name. These names are converted
-into a major/minor number internally in the kernel in an unusual way.
-Normally, this "conversion" is done by the device files in /dev, but
-this isn't possible here, because the root filesystem (with /dev)
-isn't mounted yet... So the kernel parses the name itself, with some
-hardcoded name to number mappings. The name must always be a
-combination of two or three letters, followed by a decimal number.
-Valid names are:
-
-  /dev/ram: -> 0x0100 (initial ramdisk)
-  /dev/hda: -> 0x0300 (first IDE disk)
-  /dev/hdb: -> 0x0340 (second IDE disk)
-  /dev/sda: -> 0x0800 (first SCSI disk)
-  /dev/sdb: -> 0x0810 (second SCSI disk)
-  /dev/sdc: -> 0x0820 (third SCSI disk)
-  /dev/sdd: -> 0x0830 (forth SCSI disk)
-  /dev/sde: -> 0x0840 (fifth SCSI disk)
-  /dev/fd : -> 0x0200 (floppy disk)
-
-  The name must be followed by a decimal number, that stands for the
-partition number. Internally, the value of the number is just
-added to the device number mentioned in the table above. The
-exceptions are /dev/ram and /dev/fd, where /dev/ram refers to an
-initial ramdisk loaded by your bootstrap program (please consult the
-instructions for your bootstrap program to find out how to load an
-initial ramdisk). As of kernel version 2.0.18 you must specify
-/dev/ram as the root device if you want to boot from an initial
-ramdisk. For the floppy devices, /dev/fd, the number stands for the
-floppy drive number (there are no partitions on floppy disks). I.e.,
-/dev/fd0 stands for the first drive, /dev/fd1 for the second, and so
-on. Since the number is just added, you can also force the disk format
-by adding a number greater than 3. If you look into your /dev
-directory, use can see the /dev/fd0D720 has major 2 and minor 16. You
-can specify this device for the root FS by writing "root=/dev/fd16" on
-the kernel command line.
-
-[Strange and maybe uninteresting stuff ON]
-
-  This unusual translation of device names has some strange
-consequences: If, for example, you have a symbolic link from /dev/fd
-to /dev/fd0D720 as an abbreviation for floppy driver #0 in DD format,
-you cannot use this name for specifying the root device, because the
-kernel cannot see this symlink before mounting the root FS and it
-isn't in the table above. If you use it, the root device will not be 
-set at all, without an error message. Another example: You cannot use a
-partition on e.g. the sixth SCSI disk as the root filesystem, if you
-want to specify it by name. This is, because only the devices up to
-/dev/sde are in the table above, but not /dev/sdf. Although, you can
-use the sixth SCSI disk for the root FS, but you have to specify the
-device by number... (see below). Or, even more strange, you can use the
-fact that there is no range checking of the partition number, and your
-knowledge that each disk uses 16 minors, and write "root=/dev/sde17"
-(for /dev/sdf1).
-
-[Strange and maybe uninteresting stuff OFF]
-
-  If the device containing your root partition isn't in the table
-above, you can also specify it by major and minor numbers. These are
-written in hex, with no prefix and no separator between. E.g., if you
-have a CD with contents appropriate as a root filesystem in the first
-SCSI CD-ROM drive, you boot from it by "root=0b00". Here, hex "0b" =
-decimal 11 is the major of SCSI CD-ROMs, and the minor 0 stands for
-the first of these. You can find out all valid major numbers by
-looking into include/linux/major.h.
-
-In addition to major and minor numbers, if the device containing your
-root partition uses a partition table format with unique partition
-identifiers, then you may use them.  For instance,
-"root=PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF".  It is also
-possible to reference another partition on the same device using a
-known partition UUID as the starting point.  For example,
-if partition 5 of the device has the UUID of
-00112233-4455-6677-8899-AABBCCDDEEFF then partition 3 may be found as
-follows:
-  PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF/PARTNROFF=-2
-
-Authoritative information can be found in
-"Documentation/admin-guide/kernel-parameters.rst".
-
-
-2.2) ro, rw
------------
-
-Syntax: ro
-    or: rw
-
-These two options tell the kernel whether it should mount the root
-filesystem read-only or read-write. The default is read-only, except
-for ramdisks, which default to read-write.
-
-
-2.3) debug
-----------
-
-Syntax: debug
-
-This raises the kernel log level to 10 (the default is 7). This is the
-same level as set by the "dmesg" command, just that the maximum level
-selectable by dmesg is 8.
-
-
-2.4) debug=
------------
-
-Syntax: debug=<device>
-
-This option causes certain kernel messages be printed to the selected
-debugging device. This can aid debugging the kernel, since the
-messages can be captured and analyzed on some other machine. Which
-devices are possible depends on the machine type. There are no checks
-for the validity of the device name. If the device isn't implemented,
-nothing happens.
-
-  Messages logged this way are in general stack dumps after kernel
-memory faults or bad kernel traps, and kernel panics. To be exact: all
-messages of level 0 (panic messages) and all messages printed while
-the log level is 8 or more (their level doesn't matter). Before stack
-dumps, the kernel sets the log level to 10 automatically. A level of
-at least 8 can also be set by the "debug" command line option (see
-2.3) and at run time with "dmesg -n 8".
-
-Devices possible for Amiga:
-
- - "ser": built-in serial port; parameters: 9600bps, 8N1
- - "mem": Save the messages to a reserved area in chip mem. After
-          rebooting, they can be read under AmigaOS with the tool
-          'dmesg'.
-
-Devices possible for Atari:
-
- - "ser1": ST-MFP serial port ("Modem1"); parameters: 9600bps, 8N1
- - "ser2": SCC channel B serial port ("Modem2"); parameters: 9600bps, 8N1
- - "ser" : default serial port
-           This is "ser2" for a Falcon, and "ser1" for any other machine
- - "midi": The MIDI port; parameters: 31250bps, 8N1
- - "par" : parallel port
-           The printing routine for this implements a timeout for the
-           case there's no printer connected (else the kernel would
-           lock up). The timeout is not exact, but usually a few
-           seconds.
-
-
-2.6) ramdisk_size=
--------------
-
-Syntax: ramdisk_size=<size>
-
-  This option instructs the kernel to set up a ramdisk of the given
-size in KBytes. Do not use this option if the ramdisk contents are
-passed by bootstrap! In this case, the size is selected automatically
-and should not be overwritten.
-
-  The only application is for root filesystems on floppy disks, that
-should be loaded into memory. To do that, select the corresponding
-size of the disk as ramdisk size, and set the root device to the disk
-drive (with "root=").
-
-
-2.7) swap=
-2.8) buff=
------------
-
-  I can't find any sign of these options in 2.2.6.
-
-
-3) General Device Options (Amiga and Atari)
-===========================================
-
-3.1) ether=
------------
-
-Syntax: ether=[<irq>[,<base_addr>[,<mem_start>[,<mem_end>]]]],<dev-name>
-
-  <dev-name> is the name of a net driver, as specified in
-drivers/net/Space.c in the Linux source. Most prominent are eth0, ...
-eth3, sl0, ... sl3, ppp0, ..., ppp3, dummy, and lo.
-
-  The non-ethernet drivers (sl, ppp, dummy, lo) obviously ignore the
-settings by this options. Also, the existing ethernet drivers for
-Linux/m68k (ariadne, a2065, hydra) don't use them because Zorro boards
-are really Plug-'n-Play, so the "ether=" option is useless altogether
-for Linux/m68k.
-
-
-3.2) hd=
---------
-
-Syntax: hd=<cylinders>,<heads>,<sectors>
-
-  This option sets the disk geometry of an IDE disk. The first hd=
-option is for the first IDE disk, the second for the second one.
-(I.e., you can give this option twice.) In most cases, you won't have
-to use this option, since the kernel can obtain the geometry data
-itself. It exists just for the case that this fails for one of your
-disks.
-
-
-3.3) max_scsi_luns=
--------------------
-
-Syntax: max_scsi_luns=<n>
-
-  Sets the maximum number of LUNs (logical units) of SCSI devices to
-be scanned. Valid values for <n> are between 1 and 8. Default is 8 if
-"Probe all LUNs on each SCSI device" was selected during the kernel
-configuration, else 1.
-
-
-3.4) st=
---------
-
-Syntax: st=<buffer_size>,[<write_thres>,[<max_buffers>]]
-
-  Sets several parameters of the SCSI tape driver. <buffer_size> is
-the number of 512-byte buffers reserved for tape operations for each
-device. <write_thres> sets the number of blocks which must be filled
-to start an actual write operation to the tape. Maximum value is the
-total number of buffers. <max_buffer> limits the total number of
-buffers allocated for all tape devices.
-
-
-3.5) dmasound=
---------------
-
-Syntax: dmasound=[<buffers>,<buffer-size>[,<catch-radius>]]
-
-  This option controls some configurations of the Linux/m68k DMA sound
-driver (Amiga and Atari): <buffers> is the number of buffers you want
-to use (minimum 4, default 4), <buffer-size> is the size of each
-buffer in kilobytes (minimum 4, default 32) and <catch-radius> says
-how much percent of error will be tolerated when setting a frequency
-(maximum 10, default 0). For example with 3% you can play 8000Hz
-AU-Files on the Falcon with its hardware frequency of 8195Hz and thus
-don't need to expand the sound.
-
-
-
-4) Options for Atari Only
-=========================
-
-4.1) video=
------------
-
-Syntax: video=<fbname>:<sub-options...>
-
-The <fbname> parameter specifies the name of the frame buffer,
-eg. most atari users will want to specify `atafb' here. The
-<sub-options> is a comma-separated list of the sub-options listed
-below.
-
-NB: Please notice that this option was renamed from `atavideo' to
-    `video' during the development of the 1.3.x kernels, thus you
-    might need to update your boot-scripts if upgrading to 2.x from
-    an 1.2.x kernel.
-
-NBB: The behavior of video= was changed in 2.1.57 so the recommended
-option is to specify the name of the frame buffer.
-
-4.1.1) Video Mode
------------------
-
-This sub-option may be any of the predefined video modes, as listed
-in atari/atafb.c in the Linux/m68k source tree. The kernel will
-activate the given video mode at boot time and make it the default
-mode, if the hardware allows. Currently defined names are:
-
- - stlow           : 320x200x4
- - stmid, default5 : 640x200x2
- - sthigh, default4: 640x400x1
- - ttlow           : 320x480x8, TT only
- - ttmid, default1 : 640x480x4, TT only
- - tthigh, default2: 1280x960x1, TT only
- - vga2            : 640x480x1, Falcon only
- - vga4            : 640x480x2, Falcon only
- - vga16, default3 : 640x480x4, Falcon only
- - vga256          : 640x480x8, Falcon only
- - falh2           : 896x608x1, Falcon only
- - falh16          : 896x608x4, Falcon only
-
-  If no video mode is given on the command line, the kernel tries the
-modes names "default<n>" in turn, until one is possible with the
-hardware in use.
-
-  A video mode setting doesn't make sense, if the external driver is
-activated by a "external:" sub-option.
-
-4.1.2) inverse
---------------
-
-Invert the display. This affects both, text (consoles) and graphics
-(X) display. Usually, the background is chosen to be black. With this
-option, you can make the background white.
-
-4.1.3) font
------------
-
-Syntax: font:<fontname>
-
-Specify the font to use in text modes. Currently you can choose only
-between `VGA8x8', `VGA8x16' and `PEARL8x8'. `VGA8x8' is default, if the
-vertical size of the display is less than 400 pixel rows. Otherwise, the
-`VGA8x16' font is the default.
-
-4.1.4) hwscroll_
-----------------
-
-Syntax: hwscroll_<n>
-
-The number of additional lines of video memory to reserve for
-speeding up the scrolling ("hardware scrolling"). Hardware scrolling
-is possible only if the kernel can set the video base address in steps
-fine enough. This is true for STE, MegaSTE, TT, and Falcon. It is not
-possible with plain STs and graphics cards (The former because the
-base address must be on a 256 byte boundary there, the latter because
-the kernel doesn't know how to set the base address at all.)
-
-  By default, <n> is set to the number of visible text lines on the
-display. Thus, the amount of video memory is doubled, compared to no
-hardware scrolling. You can turn off the hardware scrolling altogether
-by setting <n> to 0.
-
-4.1.5) internal:
-----------------
-
-Syntax: internal:<xres>;<yres>[;<xres_max>;<yres_max>;<offset>]
-
-This option specifies the capabilities of some extended internal video
-hardware, like e.g. OverScan. <xres> and <yres> give the (extended)
-dimensions of the screen.
-
-  If your OverScan needs a black border, you have to write the last
-three arguments of the "internal:". <xres_max> is the maximum line
-length the hardware allows, <yres_max> the maximum number of lines.
-<offset> is the offset of the visible part of the screen memory to its
-physical start, in bytes.
-
-  Often, extended interval video hardware has to be activated somehow.
-For this, see the "sw_*" options below.
-
-4.1.6) external:
-----------------
-
-Syntax:
-  external:<xres>;<yres>;<depth>;<org>;<scrmem>[;<scrlen>[;<vgabase>\
-           [;<colw>[;<coltype>[;<xres_virtual>]]]]]
-
-[I had to break this line...]
-
-  This is probably the most complicated parameter... It specifies that
-you have some external video hardware (a graphics board), and how to
-use it under Linux/m68k. The kernel cannot know more about the hardware
-than you tell it here! The kernel also is unable to set or change any
-video modes, since it doesn't know about any board internal. So, you
-have to switch to that video mode before you start Linux, and cannot
-switch to another mode once Linux has started.
-
-  The first 3 parameters of this sub-option should be obvious: <xres>,
-<yres> and <depth> give the dimensions of the screen and the number of
-planes (depth). The depth is the logarithm to base 2 of the number
-of colors possible. (Or, the other way round: The number of colors is
-2^depth).
-
-  You have to tell the kernel furthermore how the video memory is
-organized. This is done by a letter as <org> parameter:
-
- 'n': "normal planes", i.e. one whole plane after another
- 'i': "interleaved planes", i.e. 16 bit of the first plane, than 16 bit
-      of the next, and so on... This mode is used only with the
-	  built-in Atari video modes, I think there is no card that
-	  supports this mode.
- 'p': "packed pixels", i.e. <depth> consecutive bits stand for all
-	  planes of one pixel; this is the most common mode for 8 planes
-	  (256 colors) on graphic cards
- 't': "true color" (more or less packed pixels, but without a color
-	  lookup table); usually depth is 24
-
-For monochrome modes (i.e., <depth> is 1), the <org> letter has a
-different meaning:
-
- 'n': normal colors, i.e. 0=white, 1=black
- 'i': inverted colors, i.e. 0=black, 1=white
-
-  The next important information about the video hardware is the base
-address of the video memory. That is given in the <scrmem> parameter,
-as a hexadecimal number with a "0x" prefix. You have to find out this
-address in the documentation of your hardware.
-
-  The next parameter, <scrlen>, tells the kernel about the size of the
-video memory. If it's missing, the size is calculated from <xres>,
-<yres>, and <depth>. For now, it is not useful to write a value here.
-It would be used only for hardware scrolling (which isn't possible
-with the external driver, because the kernel cannot set the video base
-address), or for virtual resolutions under X (which the X server
-doesn't support yet). So, it's currently best to leave this field
-empty, either by ending the "external:" after the video address or by
-writing two consecutive semicolons, if you want to give a <vgabase>
-(it is allowed to leave this parameter empty).
-
-  The <vgabase> parameter is optional. If it is not given, the kernel
-cannot read or write any color registers of the video hardware, and
-thus you have to set appropriate colors before you start Linux. But if
-your card is somehow VGA compatible, you can tell the kernel the base
-address of the VGA register set, so it can change the color lookup
-table. You have to look up this address in your board's documentation.
-To avoid misunderstandings: <vgabase> is the _base_ address, i.e. a 4k
-aligned address. For read/writing the color registers, the kernel
-uses the addresses vgabase+0x3c7...vgabase+0x3c9. The <vgabase>
-parameter is written in hexadecimal with a "0x" prefix, just as
-<scrmem>.
-
-  <colw> is meaningful only if <vgabase> is specified. It tells the
-kernel how wide each of the color register is, i.e. the number of bits
-per single color (red/green/blue). Default is 6, another quite usual
-value is 8.
-
-  Also <coltype> is used together with <vgabase>. It tells the kernel
-about the color register model of your gfx board. Currently, the types
-"vga" (which is also the default) and "mv300" (SANG MV300) are
-implemented.
-
-  Parameter <xres_virtual> is required for ProMST or ET4000 cards where
-the physical linelength differs from the visible length. With ProMST, 
-xres_virtual must be set to 2048. For ET4000, xres_virtual depends on the
-initialisation of the video-card.
-If you're missing a corresponding yres_virtual: the external part is legacy,
-therefore we don't support hardware-dependent functions like hardware-scroll,
-panning or blanking.
-
-4.1.7) eclock:
---------------
-
-The external pixel clock attached to the Falcon VIDEL shifter. This
-currently works only with the ScreenWonder!
-
-4.1.8) monitorcap:
--------------------
-
-Syntax: monitorcap:<vmin>;<vmax>;<hmin>;<hmax>
-
-This describes the capabilities of a multisync monitor. Don't use it
-with a fixed-frequency monitor! For now, only the Falcon frame buffer
-uses the settings of "monitorcap:".
-
-  <vmin> and <vmax> are the minimum and maximum, resp., vertical frequencies
-your monitor can work with, in Hz. <hmin> and <hmax> are the same for
-the horizontal frequency, in kHz.
-
-  The defaults are 58;62;31;32 (VGA compatible).
-
-  The defaults for TV/SC1224/SC1435 cover both PAL and NTSC standards.
-
-4.1.9) keep
-------------
-
-If this option is given, the framebuffer device doesn't do any video
-mode calculations and settings on its own. The only Atari fb device
-that does this currently is the Falcon.
-
-  What you reach with this: Settings for unknown video extensions
-aren't overridden by the driver, so you can still use the mode found
-when booting, when the driver doesn't know to set this mode itself.
-But this also means, that you can't switch video modes anymore...
-
-  An example where you may want to use "keep" is the ScreenBlaster for
-the Falcon.
-
-
-4.2) atamouse=
---------------
-
-Syntax: atamouse=<x-threshold>,[<y-threshold>]
-
-  With this option, you can set the mouse movement reporting threshold.
-This is the number of pixels of mouse movement that have to accumulate
-before the IKBD sends a new mouse packet to the kernel. Higher values
-reduce the mouse interrupt load and thus reduce the chance of keyboard
-overruns. Lower values give a slightly faster mouse responses and
-slightly better mouse tracking.
-
-  You can set the threshold in x and y separately, but usually this is
-of little practical use. If there's just one number in the option, it
-is used for both dimensions. The default value is 2 for both
-thresholds.
-
-
-4.3) ataflop=
--------------
-
-Syntax: ataflop=<drive type>[,<trackbuffering>[,<steprateA>[,<steprateB>]]]
-
-   The drive type may be 0, 1, or 2, for DD, HD, and ED, resp. This
-   setting affects how many buffers are reserved and which formats are
-   probed (see also below). The default is 1 (HD). Only one drive type
-   can be selected. If you have two disk drives, select the "better"
-   type.
-
-   The second parameter <trackbuffer> tells the kernel whether to use
-   track buffering (1) or not (0). The default is machine-dependent:
-   no for the Medusa and yes for all others.
-
-   With the two following parameters, you can change the default
-   steprate used for drive A and B, resp. 
-
-
-4.4) atascsi=
--------------
-
-Syntax: atascsi=<can_queue>[,<cmd_per_lun>[,<scat-gat>[,<host-id>[,<tagged>]]]]
-
-  This option sets some parameters for the Atari native SCSI driver.
-Generally, any number of arguments can be omitted from the end. And
-for each of the numbers, a negative value means "use default". The
-defaults depend on whether TT-style or Falcon-style SCSI is used.
-Below, defaults are noted as n/m, where the first value refers to
-TT-SCSI and the latter to Falcon-SCSI. If an illegal value is given
-for one parameter, an error message is printed and that one setting is
-ignored (others aren't affected).
-
-  <can_queue>:
-    This is the maximum number of SCSI commands queued internally to the
-    Atari SCSI driver. A value of 1 effectively turns off the driver
-    internal multitasking (if it causes problems). Legal values are >=
-    1. <can_queue> can be as high as you like, but values greater than
-    <cmd_per_lun> times the number of SCSI targets (LUNs) you have
-    don't make sense. Default: 16/8.
-
-  <cmd_per_lun>:
-    Maximum number of SCSI commands issued to the driver for one
-    logical unit (LUN, usually one SCSI target). Legal values start
-    from 1. If tagged queuing (see below) is not used, values greater
-    than 2 don't make sense, but waste memory. Otherwise, the maximum
-    is the number of command tags available to the driver (currently
-    32). Default: 8/1. (Note: Values > 1 seem to cause problems on a
-    Falcon, cause not yet known.)
-
-      The <cmd_per_lun> value at a great part determines the amount of
-    memory SCSI reserves for itself. The formula is rather
-    complicated, but I can give you some hints:
-      no scatter-gather  : cmd_per_lun * 232 bytes
-      full scatter-gather: cmd_per_lun * approx. 17 Kbytes
-
-  <scat-gat>:
-    Size of the scatter-gather table, i.e. the number of requests
-    consecutive on the disk that can be merged into one SCSI command.
-    Legal values are between 0 and 255. Default: 255/0. Note: This
-    value is forced to 0 on a Falcon, since scatter-gather isn't
-    possible with the ST-DMA. Not using scatter-gather hurts
-    performance significantly.
-
-  <host-id>:
-    The SCSI ID to be used by the initiator (your Atari). This is
-    usually 7, the highest possible ID. Every ID on the SCSI bus must
-    be unique. Default: determined at run time: If the NV-RAM checksum
-    is valid, and bit 7 in byte 30 of the NV-RAM is set, the lower 3
-    bits of this byte are used as the host ID. (This method is defined
-    by Atari and also used by some TOS HD drivers.) If the above
-    isn't given, the default ID is 7. (both, TT and Falcon).
-
-  <tagged>:
-    0 means turn off tagged queuing support, all other values > 0 mean
-    use tagged queuing for targets that support it. Default: currently
-    off, but this may change when tagged queuing handling has been
-    proved to be reliable.
-
-    Tagged queuing means that more than one command can be issued to
-    one LUN, and the SCSI device itself orders the requests so they
-    can be performed in optimal order. Not all SCSI devices support
-    tagged queuing (:-().
-
-4.5 switches=
--------------
-
-Syntax: switches=<list of switches>
-
-  With this option you can switch some hardware lines that are often
-used to enable/disable certain hardware extensions. Examples are
-OverScan, overclocking, ...
-
-  The <list of switches> is a comma-separated list of the following
-items:
-
-  ikbd: set RTS of the keyboard ACIA high
-  midi: set RTS of the MIDI ACIA high
-  snd6: set bit 6 of the PSG port A
-  snd7: set bit 6 of the PSG port A
-
-It doesn't make sense to mention a switch more than once (no
-difference to only once), but you can give as many switches as you
-want to enable different features. The switch lines are set as early
-as possible during kernel initialization (even before determining the
-present hardware.)
-
-  All of the items can also be prefixed with "ov_", i.e. "ov_ikbd",
-"ov_midi", ... These options are meant for switching on an OverScan
-video extension. The difference to the bare option is that the
-switch-on is done after video initialization, and somehow synchronized
-to the HBLANK. A speciality is that ov_ikbd and ov_midi are switched
-off before rebooting, so that OverScan is disabled and TOS boots
-correctly.
-
-  If you give an option both, with and without the "ov_" prefix, the
-earlier initialization ("ov_"-less) takes precedence. But the
-switching-off on reset still happens in this case.
-
-5) Options for Amiga Only:
-==========================
-
-5.1) video=
------------
-
-Syntax: video=<fbname>:<sub-options...>
-
-The <fbname> parameter specifies the name of the frame buffer, valid
-options are `amifb', `cyber', 'virge', `retz3' and `clgen', provided
-that the respective frame buffer devices have been compiled into the
-kernel (or compiled as loadable modules). The behavior of the <fbname>
-option was changed in 2.1.57 so it is now recommended to specify this
-option.
-
-The <sub-options> is a comma-separated list of the sub-options listed
-below. This option is organized similar to the Atari version of the
-"video"-option (4.1), but knows fewer sub-options.
-
-5.1.1) video mode
------------------
-
-Again, similar to the video mode for the Atari (see 4.1.1). Predefined
-modes depend on the used frame buffer device.
-
-OCS, ECS and AGA machines all use the color frame buffer. The following
-predefined video modes are available:
-
-NTSC modes:
- - ntsc            : 640x200, 15 kHz, 60 Hz
- - ntsc-lace       : 640x400, 15 kHz, 60 Hz interlaced
-PAL modes:
- - pal             : 640x256, 15 kHz, 50 Hz
- - pal-lace        : 640x512, 15 kHz, 50 Hz interlaced
-ECS modes:
- - multiscan       : 640x480, 29 kHz, 57 Hz
- - multiscan-lace  : 640x960, 29 kHz, 57 Hz interlaced
- - euro36          : 640x200, 15 kHz, 72 Hz
- - euro36-lace     : 640x400, 15 kHz, 72 Hz interlaced
- - euro72          : 640x400, 29 kHz, 68 Hz
- - euro72-lace     : 640x800, 29 kHz, 68 Hz interlaced
- - super72         : 800x300, 23 kHz, 70 Hz
- - super72-lace    : 800x600, 23 kHz, 70 Hz interlaced
- - dblntsc-ff      : 640x400, 27 kHz, 57 Hz
- - dblntsc-lace    : 640x800, 27 kHz, 57 Hz interlaced
- - dblpal-ff       : 640x512, 27 kHz, 47 Hz
- - dblpal-lace     : 640x1024, 27 kHz, 47 Hz interlaced
- - dblntsc         : 640x200, 27 kHz, 57 Hz doublescan
- - dblpal          : 640x256, 27 kHz, 47 Hz doublescan
-VGA modes:
- - vga             : 640x480, 31 kHz, 60 Hz
- - vga70           : 640x400, 31 kHz, 70 Hz
-
-Please notice that the ECS and VGA modes require either an ECS or AGA
-chipset, and that these modes are limited to 2-bit color for the ECS
-chipset and 8-bit color for the AGA chipset.
-
-5.1.2) depth
-------------
-
-Syntax: depth:<nr. of bit-planes>
-
-Specify the number of bit-planes for the selected video-mode.
-
-5.1.3) inverse
---------------
-
-Use inverted display (black on white). Functionally the same as the
-"inverse" sub-option for the Atari.
-
-5.1.4) font
------------
-
-Syntax: font:<fontname>
-
-Specify the font to use in text modes. Functionally the same as the
-"font" sub-option for the Atari, except that `PEARL8x8' is used instead
-of `VGA8x8' if the vertical size of the display is less than 400 pixel
-rows.
-
-5.1.5) monitorcap:
--------------------
-
-Syntax: monitorcap:<vmin>;<vmax>;<hmin>;<hmax>
-
-This describes the capabilities of a multisync monitor. For now, only
-the color frame buffer uses the settings of "monitorcap:".
-
-  <vmin> and <vmax> are the minimum and maximum, resp., vertical frequencies
-your monitor can work with, in Hz. <hmin> and <hmax> are the same for
-the horizontal frequency, in kHz.
-
-  The defaults are 50;90;15;38 (Generic Amiga multisync monitor).
-
-
-5.2) fd_def_df0=
-----------------
-
-Syntax: fd_def_df0=<value>
-
-Sets the df0 value for "silent" floppy drives. The value should be in
-hexadecimal with "0x" prefix.
-
-
-5.3) wd33c93=
--------------
-
-Syntax: wd33c93=<sub-options...>
-
-These options affect the A590/A2091, A3000 and GVP Series II SCSI
-controllers.
-
-The <sub-options> is a comma-separated list of the sub-options listed
-below.
-
-5.3.1) nosync
--------------
-
-Syntax: nosync:bitmask
-
-  bitmask is a byte where the 1st 7 bits correspond with the 7
-possible SCSI devices. Set a bit to prevent sync negotiation on that
-device. To maintain backwards compatibility, a command-line such as
-"wd33c93=255" will be automatically translated to
-"wd33c93=nosync:0xff". The default is to disable sync negotiation for
-all devices, eg. nosync:0xff.
-
-5.3.2) period
--------------
-
-Syntax: period:ns
-
-  `ns' is the minimum # of nanoseconds in a SCSI data transfer
-period. Default is 500; acceptable values are 250 - 1000.
-
-5.3.3) disconnect
------------------
-
-Syntax: disconnect:x
-
-  Specify x = 0 to never allow disconnects, 2 to always allow them.
-x = 1 does 'adaptive' disconnects, which is the default and generally
-the best choice.
-
-5.3.4) debug
-------------
-
-Syntax: debug:x
-
-  If `DEBUGGING_ON' is defined, x is a bit mask that causes various
-types of debug output to printed - see the DB_xxx defines in
-wd33c93.h.
-
-5.3.5) clock
-------------
-
-Syntax: clock:x
-
-  x = clock input in MHz for WD33c93 chip. Normal values would be from
-8 through 20. The default value depends on your hostadapter(s),
-default for the A3000 internal controller is 14, for the A2091 it's 8
-and for the GVP hostadapters it's either 8 or 14, depending on the
-hostadapter and the SCSI-clock jumper present on some GVP
-hostadapters.
-
-5.3.6) next
------------
-
-  No argument. Used to separate blocks of keywords when there's more
-than one wd33c93-based host adapter in the system.
-
-5.3.7) nodma
-------------
-
-Syntax: nodma:x
-
-  If x is 1 (or if the option is just written as "nodma"), the WD33c93
-controller will not use DMA (= direct memory access) to access the
-Amiga's memory.  This is useful for some systems (like A3000's and
-A4000's with the A3640 accelerator, revision 3.0) that have problems
-using DMA to chip memory.  The default is 0, i.e. to use DMA if
-possible.
-
-
-5.4) gvp11=
------------
-
-Syntax: gvp11=<addr-mask>
-
-  The earlier versions of the GVP driver did not handle DMA
-address-mask settings correctly which made it necessary for some
-people to use this option, in order to get their GVP controller
-running under Linux. These problems have hopefully been solved and the
-use of this option is now highly unrecommended!
-
-  Incorrect use can lead to unpredictable behavior, so please only use
-this option if you *know* what you are doing and have a reason to do
-so. In any case if you experience problems and need to use this
-option, please inform us about it by mailing to the Linux/68k kernel
-mailing list.
-
-  The address mask set by this option specifies which addresses are
-valid for DMA with the GVP Series II SCSI controller. An address is
-valid, if no bits are set except the bits that are set in the mask,
-too.
-
-  Some versions of the GVP can only DMA into a 24 bit address range,
-some can address a 25 bit address range while others can use the whole
-32 bit address range for DMA. The correct setting depends on your
-controller and should be autodetected by the driver. An example is the
-24 bit region which is specified by a mask of 0x00fffffe.
-
-
-/* Local Variables: */
-/* mode: text       */
-/* End:             */
diff --git a/Documentation/maintainer/conf.py b/Documentation/maintainer/conf.py
deleted file mode 100644
index 81e9eb7a7884..000000000000
--- a/Documentation/maintainer/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = 'Linux Kernel Development Documentation'
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'maintainer.tex', 'Linux Kernel Development Documentation',
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/maintainer/index.rst b/Documentation/maintainer/index.rst
index 2a14916930cb..56e2c09dfa39 100644
--- a/Documentation/maintainer/index.rst
+++ b/Documentation/maintainer/index.rst
@@ -10,5 +10,6 @@ additions to this manual.
    :maxdepth: 2
 
    configure-git
+   rebasing-and-merging
    pull-requests
 
diff --git a/Documentation/maintainer/rebasing-and-merging.rst b/Documentation/maintainer/rebasing-and-merging.rst
new file mode 100644
index 000000000000..09f988e7fa71
--- /dev/null
+++ b/Documentation/maintainer/rebasing-and-merging.rst
@@ -0,0 +1,226 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+Rebasing and merging
+====================
+
+Maintaining a subsystem, as a general rule, requires a familiarity with the
+Git source-code management system.  Git is a powerful tool with a lot of
+features; as is often the case with such tools, there are right and wrong
+ways to use those features.  This document looks in particular at the use
+of rebasing and merging.  Maintainers often get in trouble when they use
+those tools incorrectly, but avoiding problems is not actually all that
+hard.
+
+One thing to be aware of in general is that, unlike many other projects,
+the kernel community is not scared by seeing merge commits in its
+development history.  Indeed, given the scale of the project, avoiding
+merges would be nearly impossible.  Some problems encountered by
+maintainers result from a desire to avoid merges, while others come from
+merging a little too often.
+
+Rebasing
+========
+
+"Rebasing" is the process of changing the history of a series of commits
+within a repository.  There are two different types of operations that are
+referred to as rebasing since both are done with the ``git rebase``
+command, but there are significant differences between them:
+
+ - Changing the parent (starting) commit upon which a series of patches is
+   built.  For example, a rebase operation could take a patch set built on
+   the previous kernel release and base it, instead, on the current
+   release.  We'll call this operation "reparenting" in the discussion
+   below.
+
+ - Changing the history of a set of patches by fixing (or deleting) broken
+   commits, adding patches, adding tags to commit changelogs, or changing
+   the order in which commits are applied.  In the following text, this
+   type of operation will be referred to as "history modification"
+
+The term "rebasing" will be used to refer to both of the above operations.
+Used properly, rebasing can yield a cleaner and clearer development
+history; used improperly, it can obscure that history and introduce bugs.
+
+There are a few rules of thumb that can help developers to avoid the worst
+perils of rebasing:
+
+ - History that has been exposed to the world beyond your private system
+   should usually not be changed.  Others may have pulled a copy of your
+   tree and built on it; modifying your tree will create pain for them.  If
+   work is in need of rebasing, that is usually a sign that it is not yet
+   ready to be committed to a public repository.
+
+   That said, there are always exceptions.  Some trees (linux-next being
+   a significant example) are frequently rebased by their nature, and
+   developers know not to base work on them.  Developers will sometimes
+   expose an unstable branch for others to test with or for automated
+   testing services.  If you do expose a branch that may be unstable in
+   this way, be sure that prospective users know not to base work on it.
+
+ - Do not rebase a branch that contains history created by others.  If you
+   have pulled changes from another developer's repository, you are now a
+   custodian of their history.  You should not change it.  With few
+   exceptions, for example, a broken commit in a tree like this should be
+   explicitly reverted rather than disappeared via history modification.
+
+ - Do not reparent a tree without a good reason to do so.  Just being on a
+   newer base or avoiding a merge with an upstream repository is not
+   generally a good reason.
+
+ - If you must reparent a repository, do not pick some random kernel commit
+   as the new base.  The kernel is often in a relatively unstable state
+   between release points; basing development on one of those points
+   increases the chances of running into surprising bugs.  When a patch
+   series must move to a new base, pick a stable point (such as one of
+   the -rc releases) to move to.
+
+ - Realize that reparenting a patch series (or making significant history
+   modifications) changes the environment in which it was developed and,
+   likely, invalidates much of the testing that was done.  A reparented
+   patch series should, as a general rule, be treated like new code and
+   retested from the beginning.
+
+A frequent cause of merge-window trouble is when Linus is presented with a
+patch series that has clearly been reparented, often to a random commit,
+shortly before the pull request was sent.  The chances of such a series
+having been adequately tested are relatively low - as are the chances of
+the pull request being acted upon.
+
+If, instead, rebasing is limited to private trees, commits are based on a
+well-known starting point, and they are well tested, the potential for
+trouble is low.
+
+Merging
+=======
+
+Merging is a common operation in the kernel development process; the 5.1
+development cycle included 1,126 merge commits - nearly 9% of the total.
+Kernel work is accumulated in over 100 different subsystem trees, each of
+which may contain multiple topic branches; each branch is usually developed
+independently of the others.  So naturally, at least one merge will be
+required before any given branch finds its way into an upstream repository.
+
+Many projects require that branches in pull requests be based on the
+current trunk so that no merge commits appear in the history.  The kernel
+is not such a project; any rebasing of branches to avoid merges will, most
+likely, lead to trouble.
+
+Subsystem maintainers find themselves having to do two types of merges:
+from lower-level subsystem trees and from others, either sibling trees or
+the mainline.  The best practices to follow differ in those two situations.
+
+Merging from lower-level trees
+------------------------------
+
+Larger subsystems tend to have multiple levels of maintainers, with the
+lower-level maintainers sending pull requests to the higher levels.  Acting
+on such a pull request will almost certainly generate a merge commit; that
+is as it should be.  In fact, subsystem maintainers may want to use
+the --no-ff flag to force the addition of a merge commit in the rare cases
+where one would not normally be created so that the reasons for the merge
+can be recorded.  The changelog for the merge should, for any kind of
+merge, say *why* the merge is being done.  For a lower-level tree, "why" is
+usually a summary of the changes that will come with that pull.
+
+Maintainers at all levels should be using signed tags on their pull
+requests, and upstream maintainers should verify the tags when pulling
+branches.  Failure to do so threatens the security of the development
+process as a whole.
+
+As per the rules outlined above, once you have merged somebody else's
+history into your tree, you cannot rebase that branch, even if you
+otherwise would be able to.
+
+Merging from sibling or upstream trees
+--------------------------------------
+
+While merges from downstream are common and unremarkable, merges from other
+trees tend to be a red flag when it comes time to push a branch upstream.
+Such merges need to be carefully thought about and well justified, or
+there's a good chance that a subsequent pull request will be rejected.
+
+It is natural to want to merge the master branch into a repository; this
+type of merge is often called a "back merge".  Back merges can help to make
+sure that there are no conflicts with parallel development and generally
+gives a warm, fuzzy feeling of being up-to-date.  But this temptation
+should be avoided almost all of the time.
+
+Why is that?  Back merges will muddy the development history of your own
+branch.  They will significantly increase your chances of encountering bugs
+from elsewhere in the community and make it hard to ensure that the work
+you are managing is stable and ready for upstream.  Frequent merges can
+also obscure problems with the development process in your tree; they can
+hide interactions with other trees that should not be happening (often) in
+a well-managed branch.
+
+That said, back merges are occasionally required; when that happens, be
+sure to document *why* it was required in the commit message.  As always,
+merge to a well-known stable point, rather than to some random commit.
+Even then, you should not back merge a tree above your immediate upstream
+tree; if a higher-level back merge is really required, the upstream tree
+should do it first.
+
+One of the most frequent causes of merge-related trouble is when a
+maintainer merges with the upstream in order to resolve merge conflicts
+before sending a pull request.  Again, this temptation is easy enough to
+understand, but it should absolutely be avoided.  This is especially true
+for the final pull request: Linus is adamant that he would much rather see
+merge conflicts than unnecessary back merges.  Seeing the conflicts lets
+him know where potential problem areas are.  He does a lot of merges (382
+in the 5.1 development cycle) and has gotten quite good at conflict
+resolution - often better than the developers involved.
+
+So what should a maintainer do when there is a conflict between their
+subsystem branch and the mainline?  The most important step is to warn
+Linus in the pull request that the conflict will happen; if nothing else,
+that demonstrates an awareness of how your branch fits into the whole.  For
+especially difficult conflicts, create and push a *separate* branch to show
+how you would resolve things.  Mention that branch in your pull request,
+but the pull request itself should be for the unmerged branch.
+
+Even in the absence of known conflicts, doing a test merge before sending a
+pull request is a good idea.  It may alert you to problems that you somehow
+didn't see from linux-next and helps to understand exactly what you are
+asking upstream to do.
+
+Another reason for doing merges of upstream or another subsystem tree is to
+resolve dependencies.  These dependency issues do happen at times, and
+sometimes a cross-merge with another tree is the best way to resolve them;
+as always, in such situations, the merge commit should explain why the
+merge has been done.  Take a moment to do it right; people will read those
+changelogs.
+
+Often, though, dependency issues indicate that a change of approach is
+needed.  Merging another subsystem tree to resolve a dependency risks
+bringing in other bugs and should almost never be done.  If that subsystem
+tree fails to be pulled upstream, whatever problems it had will block the
+merging of your tree as well.  Preferable alternatives include agreeing
+with the maintainer to carry both sets of changes in one of the trees or
+creating a topic branch dedicated to the prerequisite commits that can be
+merged into both trees.  If the dependency is related to major
+infrastructural changes, the right solution might be to hold the dependent
+commits for one development cycle so that those changes have time to
+stabilize in the mainline.
+
+Finally
+=======
+
+It is relatively common to merge with the mainline toward the beginning of
+the development cycle in order to pick up changes and fixes done elsewhere
+in the tree.  As always, such a merge should pick a well-known release
+point rather than some random spot.  If your upstream-bound branch has
+emptied entirely into the mainline during the merge window, you can pull it
+forward with a command like::
+
+  git merge v5.2-rc1^0
+
+The "^0" will cause Git to do a fast-forward merge (which should be
+possible in this situation), thus avoiding the addition of a spurious merge
+commit.
+
+The guidelines laid out above are just that: guidelines.  There will always
+be situations that call out for a different solution, and these guidelines
+should not prevent developers from doing the right thing when the need
+arises.  But one should always think about whether the need has truly
+arisen and be prepared to explain why something abnormal needs to be done. 
diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt
deleted file mode 100644
index e1055f105cf5..000000000000
--- a/Documentation/md/md-cluster.txt
+++ /dev/null
@@ -1,325 +0,0 @@
-The cluster MD is a shared-device RAID for a cluster, it supports
-two levels: raid1 and raid10 (limited support).
-
-
-1. On-disk format
-
-Separate write-intent-bitmaps are used for each cluster node.
-The bitmaps record all writes that may have been started on that node,
-and may not yet have finished. The on-disk layout is:
-
-0                    4k                     8k                    12k
--------------------------------------------------------------------
-| idle                | md super            | bm super [0] + bits |
-| bm bits[0, contd]   | bm super[1] + bits  | bm bits[1, contd]   |
-| bm super[2] + bits  | bm bits [2, contd]  | bm super[3] + bits  |
-| bm bits [3, contd]  |                     |                     |
-
-During "normal" functioning we assume the filesystem ensures that only
-one node writes to any given block at a time, so a write request will
-
- - set the appropriate bit (if not already set)
- - commit the write to all mirrors
- - schedule the bit to be cleared after a timeout.
-
-Reads are just handled normally. It is up to the filesystem to ensure
-one node doesn't read from a location where another node (or the same
-node) is writing.
-
-
-2. DLM Locks for management
-
-There are three groups of locks for managing the device:
-
-2.1 Bitmap lock resource (bm_lockres)
-
- The bm_lockres protects individual node bitmaps. They are named in
- the form bitmap000 for node 1, bitmap001 for node 2 and so on. When a
- node joins the cluster, it acquires the lock in PW mode and it stays
- so during the lifetime the node is part of the cluster. The lock
- resource number is based on the slot number returned by the DLM
- subsystem. Since DLM starts node count from one and bitmap slots
- start from zero, one is subtracted from the DLM slot number to arrive
- at the bitmap slot number.
-
- The LVB of the bitmap lock for a particular node records the range
- of sectors that are being re-synced by that node.  No other
- node may write to those sectors.  This is used when a new nodes
- joins the cluster.
-
-2.2 Message passing locks
-
- Each node has to communicate with other nodes when starting or ending
- resync, and for metadata superblock updates.  This communication is
- managed through three locks: "token", "message", and "ack", together
- with the Lock Value Block (LVB) of one of the "message" lock.
-
-2.3 new-device management
-
- A single lock: "no-new-dev" is used to co-ordinate the addition of
- new devices - this must be synchronized across the array.
- Normally all nodes hold a concurrent-read lock on this device.
-
-3. Communication
-
- Messages can be broadcast to all nodes, and the sender waits for all
- other nodes to acknowledge the message before proceeding.  Only one
- message can be processed at a time.
-
-3.1 Message Types
-
- There are six types of messages which are passed:
-
- 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has
-   been updated, and the node must re-read the md superblock. This is
-   performed synchronously. It is primarily used to signal device
-   failure.
-
- 3.1.2 RESYNCING: informs other nodes that a resync is initiated or
-   ended so that each node may suspend or resume the region.  Each
-   RESYNCING message identifies a range of the devices that the
-   sending node is about to resync. This overrides any previous
-   notification from that node: only one ranged can be resynced at a
-   time per-node.
-
- 3.1.3 NEWDISK: informs other nodes that a device is being added to
-   the array. Message contains an identifier for that device.  See
-   below for further details.
-
- 3.1.4 REMOVE: A failed or spare device is being removed from the
-   array. The slot-number of the device is included in the message.
-
- 3.1.5 RE_ADD: A failed device is being re-activated - the assumption
-   is that it has been determined to be working again.
-
- 3.1.6 BITMAP_NEEDS_SYNC: if a node is stopped locally but the bitmap
-   isn't clean, then another node is informed to take the ownership of
-   resync.
-
-3.2 Communication mechanism
-
- The DLM LVB is used to communicate within nodes of the cluster. There
- are three resources used for the purpose:
-
-  3.2.1 token: The resource which protects the entire communication
-   system. The node having the token resource is allowed to
-   communicate.
-
-  3.2.2 message: The lock resource which carries the data to
-   communicate.
-
-  3.2.3 ack: The resource, acquiring which means the message has been
-   acknowledged by all nodes in the cluster. The BAST of the resource
-   is used to inform the receiving node that a node wants to
-   communicate.
-
-The algorithm is:
-
- 1. receive status - all nodes have concurrent-reader lock on "ack".
-
-   sender                         receiver                 receiver
-   "ack":CR                       "ack":CR                 "ack":CR
-
- 2. sender get EX on "token"
-    sender get EX on "message"
-    sender                        receiver                 receiver
-    "token":EX                    "ack":CR                 "ack":CR
-    "message":EX
-    "ack":CR
-
-    Sender checks that it still needs to send a message. Messages
-    received or other events that happened while waiting for the
-    "token" may have made this message inappropriate or redundant.
-
- 3. sender writes LVB.
-    sender down-convert "message" from EX to CW
-    sender try to get EX of "ack"
-    [ wait until all receivers have *processed* the "message" ]
-
-                                     [ triggered by bast of "ack" ]
-                                     receiver get CR on "message"
-                                     receiver read LVB
-                                     receiver processes the message
-                                     [ wait finish ]
-                                     receiver releases "ack"
-                                     receiver tries to get PR on "message"
-
-   sender                         receiver                  receiver
-   "token":EX                     "message":CR              "message":CR
-   "message":CW
-   "ack":EX
-
- 4. triggered by grant of EX on "ack" (indicating all receivers
-    have processed message)
-    sender down-converts "ack" from EX to CR
-    sender releases "message"
-    sender releases "token"
-                               receiver upconvert to PR on "message"
-                               receiver get CR of "ack"
-                               receiver release "message"
-
-   sender                      receiver                   receiver
-   "ack":CR                    "ack":CR                   "ack":CR
-
-
-4. Handling Failures
-
-4.1 Node Failure
-
- When a node fails, the DLM informs the cluster with the slot
- number. The node starts a cluster recovery thread. The cluster
- recovery thread:
-
-	- acquires the bitmap<number> lock of the failed node
-	- opens the bitmap
-	- reads the bitmap of the failed node
-	- copies the set bitmap to local node
-	- cleans the bitmap of the failed node
-	- releases bitmap<number> lock of the failed node
-	- initiates resync of the bitmap on the current node
-		md_check_recovery is invoked within recover_bitmaps,
-		then md_check_recovery -> metadata_update_start/finish,
-		it will lock the communication by lock_comm.
-		Which means when one node is resyncing it blocks all
-		other nodes from writing anywhere on the array.
-
- The resync process is the regular md resync. However, in a clustered
- environment when a resync is performed, it needs to tell other nodes
- of the areas which are suspended. Before a resync starts, the node
- send out RESYNCING with the (lo,hi) range of the area which needs to
- be suspended. Each node maintains a suspend_list, which contains the
- list of ranges which are currently suspended. On receiving RESYNCING,
- the node adds the range to the suspend_list. Similarly, when the node
- performing resync finishes, it sends RESYNCING with an empty range to
- other nodes and other nodes remove the corresponding entry from the
- suspend_list.
-
- A helper function, ->area_resyncing() can be used to check if a
- particular I/O range should be suspended or not.
-
-4.2 Device Failure
-
- Device failures are handled and communicated with the metadata update
- routine.  When a node detects a device failure it does not allow
- any further writes to that device until the failure has been
- acknowledged by all other nodes.
-
-5. Adding a new Device
-
- For adding a new device, it is necessary that all nodes "see" the new
- device to be added. For this, the following algorithm is used:
-
-    1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues
-       ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD)
-    2. Node 1 sends a NEWDISK message with uuid and slot number
-    3. Other nodes issue kobject_uevent_env with uuid and slot number
-       (Steps 4,5 could be a udev rule)
-    4. In userspace, the node searches for the disk, perhaps
-       using blkid -t SUB_UUID=""
-    5. Other nodes issue either of the following depending on whether
-       the disk was found:
-       ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and
-             disc.number set to slot number)
-       ioctl(CLUSTERED_DISK_NACK)
-    6. Other nodes drop lock on "no-new-devs" (CR) if device is found
-    7. Node 1 attempts EX lock on "no-new-dev"
-    8. If node 1 gets the lock, it sends METADATA_UPDATED after
-       unmarking the disk as SpareLocal
-    9. If not (get "no-new-dev" lock), it fails the operation and sends
-       METADATA_UPDATED.
-   10. Other nodes get the information whether a disk is added or not
-       by the following METADATA_UPDATED.
-
-6. Module interface.
-
- There are 17 call-backs which the md core can make to the cluster
- module.  Understanding these can give a good overview of the whole
- process.
-
-6.1 join(nodes) and leave()
-
- These are called when an array is started with a clustered bitmap,
- and when the array is stopped.  join() ensures the cluster is
- available and initializes the various resources.
- Only the first 'nodes' nodes in the cluster can use the array.
-
-6.2 slot_number()
-
- Reports the slot number advised by the cluster infrastructure.
- Range is from 0 to nodes-1.
-
-6.3 resync_info_update()
-
- This updates the resync range that is stored in the bitmap lock.
- The starting point is updated as the resync progresses.  The
- end point is always the end of the array.
- It does *not* send a RESYNCING message.
-
-6.4 resync_start(), resync_finish()
-
- These are called when resync/recovery/reshape starts or stops.
- They update the resyncing range in the bitmap lock and also
- send a RESYNCING message.  resync_start reports the whole
- array as resyncing, resync_finish reports none of it.
-
- resync_finish() also sends a BITMAP_NEEDS_SYNC message which
- allows some other node to take over.
-
-6.5 metadata_update_start(), metadata_update_finish(),
-    metadata_update_cancel().
-
- metadata_update_start is used to get exclusive access to
- the metadata.  If a change is still needed once that access is
- gained, metadata_update_finish() will send a METADATA_UPDATE
- message to all other nodes, otherwise metadata_update_cancel()
- can be used to release the lock.
-
-6.6 area_resyncing()
-
- This combines two elements of functionality.
-
- Firstly, it will check if any node is currently resyncing
- anything in a given range of sectors.  If any resync is found,
- then the caller will avoid writing or read-balancing in that
- range.
-
- Secondly, while node recovery is happening it reports that
- all areas are resyncing for READ requests.  This avoids races
- between the cluster-filesystem and the cluster-RAID handling
- a node failure.
-
-6.7 add_new_disk_start(), add_new_disk_finish(), new_disk_ack()
-
- These are used to manage the new-disk protocol described above.
- When a new device is added, add_new_disk_start() is called before
- it is bound to the array and, if that succeeds, add_new_disk_finish()
- is called the device is fully added.
-
- When a device is added in acknowledgement to a previous
- request, or when the device is declared "unavailable",
- new_disk_ack() is called.
-
-6.8 remove_disk()
-
- This is called when a spare or failed device is removed from
- the array.  It causes a REMOVE message to be send to other nodes.
-
-6.9 gather_bitmaps()
-
- This sends a RE_ADD message to all other nodes and then
- gathers bitmap information from all bitmaps.  This combined
- bitmap is then used to recovery the re-added device.
-
-6.10 lock_all_bitmaps() and unlock_all_bitmaps()
-
- These are called when change bitmap to none. If a node plans
- to clear the cluster raid's bitmap, it need to make sure no other
- nodes are using the raid which is achieved by lock all bitmap
- locks within the cluster, and also those locks are unlocked
- accordingly.
-
-7. Unsupported features
-
-There are somethings which are not supported by cluster MD yet.
-
-- change array_sectors.
diff --git a/Documentation/md/raid5-cache.txt b/Documentation/md/raid5-cache.txt
deleted file mode 100644
index 2b210f295786..000000000000
--- a/Documentation/md/raid5-cache.txt
+++ /dev/null
@@ -1,109 +0,0 @@
-RAID5 cache
-
-Raid 4/5/6 could include an extra disk for data cache besides normal RAID
-disks. The role of RAID disks isn't changed with the cache disk. The cache disk
-caches data to the RAID disks. The cache can be in write-through (supported
-since 4.4) or write-back mode (supported since 4.10). mdadm (supported since
-3.4) has a new option '--write-journal' to create array with cache. Please
-refer to mdadm manual for details. By default (RAID array starts), the cache is
-in write-through mode. A user can switch it to write-back mode by:
-
-echo "write-back" > /sys/block/md0/md/journal_mode
-
-And switch it back to write-through mode by:
-
-echo "write-through" > /sys/block/md0/md/journal_mode
-
-In both modes, all writes to the array will hit cache disk first. This means
-the cache disk must be fast and sustainable.
-
--------------------------------------
-write-through mode:
-
-This mode mainly fixes the 'write hole' issue. For RAID 4/5/6 array, an unclean
-shutdown can cause data in some stripes to not be in consistent state, eg, data
-and parity don't match. The reason is that a stripe write involves several RAID
-disks and it's possible the writes don't hit all RAID disks yet before the
-unclean shutdown. We call an array degraded if it has inconsistent data. MD
-tries to resync the array to bring it back to normal state. But before the
-resync completes, any system crash will expose the chance of real data
-corruption in the RAID array. This problem is called 'write hole'.
-
-The write-through cache will cache all data on cache disk first. After the data
-is safe on the cache disk, the data will be flushed onto RAID disks. The
-two-step write will guarantee MD can recover correct data after unclean
-shutdown even the array is degraded. Thus the cache can close the 'write hole'.
-
-In write-through mode, MD reports IO completion to upper layer (usually
-filesystems) after the data is safe on RAID disks, so cache disk failure
-doesn't cause data loss. Of course cache disk failure means the array is
-exposed to 'write hole' again.
-
-In write-through mode, the cache disk isn't required to be big. Several
-hundreds megabytes are enough.
-
---------------------------------------
-write-back mode:
-
-write-back mode fixes the 'write hole' issue too, since all write data is
-cached on cache disk. But the main goal of 'write-back' cache is to speed up
-write. If a write crosses all RAID disks of a stripe, we call it full-stripe
-write. For non-full-stripe writes, MD must read old data before the new parity
-can be calculated. These synchronous reads hurt write throughput. Some writes
-which are sequential but not dispatched in the same time will suffer from this
-overhead too. Write-back cache will aggregate the data and flush the data to
-RAID disks only after the data becomes a full stripe write. This will
-completely avoid the overhead, so it's very helpful for some workloads. A
-typical workload which does sequential write followed by fsync is an example.
-
-In write-back mode, MD reports IO completion to upper layer (usually
-filesystems) right after the data hits cache disk. The data is flushed to raid
-disks later after specific conditions met. So cache disk failure will cause
-data loss.
-
-In write-back mode, MD also caches data in memory. The memory cache includes
-the same data stored on cache disk, so a power loss doesn't cause data loss.
-The memory cache size has performance impact for the array. It's recommended
-the size is big. A user can configure the size by:
-
-echo "2048" > /sys/block/md0/md/stripe_cache_size
-
-Too small cache disk will make the write aggregation less efficient in this
-mode depending on the workloads. It's recommended to use a cache disk with at
-least several gigabytes size in write-back mode.
-
---------------------------------------
-The implementation:
-
-The write-through and write-back cache use the same disk format. The cache disk
-is organized as a simple write log. The log consists of 'meta data' and 'data'
-pairs. The meta data describes the data. It also includes checksum and sequence
-ID for recovery identification. Data can be IO data and parity data. Data is
-checksumed too. The checksum is stored in the meta data ahead of the data. The
-checksum is an optimization because MD can write meta and data freely without
-worry about the order. MD superblock has a field pointed to the valid meta data
-of log head.
-
-The log implementation is pretty straightforward. The difficult part is the
-order in which MD writes data to cache disk and RAID disks. Specifically, in
-write-through mode, MD calculates parity for IO data, writes both IO data and
-parity to the log, writes the data and parity to RAID disks after the data and
-parity is settled down in log and finally the IO is finished. Read just reads
-from raid disks as usual.
-
-In write-back mode, MD writes IO data to the log and reports IO completion. The
-data is also fully cached in memory at that time, which means read must query
-memory cache. If some conditions are met, MD will flush the data to RAID disks.
-MD will calculate parity for the data and write parity into the log. After this
-is finished, MD will write both data and parity into RAID disks, then MD can
-release the memory cache. The flush conditions could be stripe becomes a full
-stripe write, free cache disk space is low or free in-kernel memory cache space
-is low.
-
-After an unclean shutdown, MD does recovery. MD reads all meta data and data
-from the log. The sequence ID and checksum will help us detect corrupted meta
-data and data. If MD finds a stripe with data and valid parities (1 parity for
-raid4/5 and 2 for raid6), MD will write the data and parities to RAID disks. If
-parities are incompleted, they are discarded. If part of data is corrupted,
-they are discarded too. MD then loads valid data and writes them to RAID disks
-in normal way.
diff --git a/Documentation/md/raid5-ppl.txt b/Documentation/md/raid5-ppl.txt
deleted file mode 100644
index bfa092589e00..000000000000
--- a/Documentation/md/raid5-ppl.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-Partial Parity Log
-
-Partial Parity Log (PPL) is a feature available for RAID5 arrays. The issue
-addressed by PPL is that after a dirty shutdown, parity of a particular stripe
-may become inconsistent with data on other member disks. If the array is also
-in degraded state, there is no way to recalculate parity, because one of the
-disks is missing. This can lead to silent data corruption when rebuilding the
-array or using it is as degraded - data calculated from parity for array blocks
-that have not been touched by a write request during the unclean shutdown can
-be incorrect. Such condition is known as the RAID5 Write Hole. Because of
-this, md by default does not allow starting a dirty degraded array.
-
-Partial parity for a write operation is the XOR of stripe data chunks not
-modified by this write. It is just enough data needed for recovering from the
-write hole. XORing partial parity with the modified chunks produces parity for
-the stripe, consistent with its state before the write operation, regardless of
-which chunk writes have completed. If one of the not modified data disks of
-this stripe is missing, this updated parity can be used to recover its
-contents. PPL recovery is also performed when starting an array after an
-unclean shutdown and all disks are available, eliminating the need to resync
-the array. Because of this, using write-intent bitmap and PPL together is not
-supported.
-
-When handling a write request PPL writes partial parity before new data and
-parity are dispatched to disks. PPL is a distributed log - it is stored on
-array member drives in the metadata area, on the parity drive of a particular
-stripe.  It does not require a dedicated journaling drive. Write performance is
-reduced by up to 30%-40% but it scales with the number of drives in the array
-and the journaling drive does not become a bottleneck or a single point of
-failure.
-
-Unlike raid5-cache, the other solution in md for closing the write hole, PPL is
-not a true journal. It does not protect from losing in-flight data, only from
-silent data corruption. If a dirty disk of a stripe is lost, no PPL recovery is
-performed for this stripe (parity is not updated). So it is possible to have
-arbitrary data in the written part of a stripe if that disk is lost. In such
-case the behavior is the same as in plain raid5.
-
-PPL is available for md version-1 metadata and external (specifically IMSM)
-metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl.
-
-There is a limitation of maximum 64 disks in the array for PPL. It allows to
-keep data structures and implementation simple. RAID5 arrays with so many disks
-are not likely due to high risk of multiple disks failure. Such restriction
-should not be a real life limitation.
diff --git a/Documentation/media/conf.py b/Documentation/media/conf.py
deleted file mode 100644
index 1f194fcd2cae..000000000000
--- a/Documentation/media/conf.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-# SPDX-License-Identifier: GPL-2.0
-
-project = 'Linux Media Subsystem Documentation'
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'media.tex', 'Linux Media Subsystem Documentation',
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/media/kapi/dtv-core.rst b/Documentation/media/kapi/dtv-core.rst
index ac005b46f23e..82c5b85ed9b1 100644
--- a/Documentation/media/kapi/dtv-core.rst
+++ b/Documentation/media/kapi/dtv-core.rst
@@ -11,12 +11,12 @@ Digital TV devices are implemented by several different drivers:
 
 - Frontend drivers that are usually implemented as two separate drivers:
 
-  - A tuner driver that implements the logic with commands the part of the
-    hardware with is responsible to tune into a digital TV transponder or
+  - A tuner driver that implements the logic which commands the part of
+    the hardware responsible for tuning into a digital TV transponder or
     physical channel. The output of a tuner is usually a baseband or
     Intermediate Frequency (IF) signal;
 
-  - A demodulator driver (a.k.a "demod") that implements the logic with
+  - A demodulator driver (a.k.a "demod") that implements the logic which
     commands the digital TV decoding hardware. The output of a demod is
     a digital stream, with multiple audio, video and data channels typically
     multiplexed using MPEG Transport Stream [#f1]_.
diff --git a/Documentation/media/kapi/v4l2-controls.rst b/Documentation/media/kapi/v4l2-controls.rst
index 64ab99abf0b6..ebe2a55908be 100644
--- a/Documentation/media/kapi/v4l2-controls.rst
+++ b/Documentation/media/kapi/v4l2-controls.rst
@@ -26,8 +26,9 @@ The control framework was created in order to implement all the rules of the
 V4L2 specification with respect to controls in a central place. And to make
 life as easy as possible for the driver developer.
 
-Note that the control framework relies on the presence of a struct v4l2_device
-for V4L2 drivers and struct v4l2_subdev for sub-device drivers.
+Note that the control framework relies on the presence of a struct
+:c:type:`v4l2_device` for V4L2 drivers and struct :c:type:`v4l2_subdev` for
+sub-device drivers.
 
 
 Objects in the framework
@@ -35,12 +36,13 @@ Objects in the framework
 
 There are two main objects:
 
-The v4l2_ctrl object describes the control properties and keeps track of the
-control's value (both the current value and the proposed new value).
+The :c:type:`v4l2_ctrl` object describes the control properties and keeps
+track of the control's value (both the current value and the proposed new
+value).
 
-v4l2_ctrl_handler is the object that keeps track of controls. It maintains a
-list of v4l2_ctrl objects that it owns and another list of references to
-controls, possibly to controls owned by other handlers.
+:c:type:`v4l2_ctrl_handler` is the object that keeps track of controls. It
+maintains a list of v4l2_ctrl objects that it owns and another list of
+references to controls, possibly to controls owned by other handlers.
 
 
 Basic usage for V4L2 and sub-device drivers
@@ -48,21 +50,39 @@ Basic usage for V4L2 and sub-device drivers
 
 1) Prepare the driver:
 
+.. code-block:: c
+
+	#include <media/v4l2-ctrls.h>
+
 1.1) Add the handler to your driver's top-level struct:
 
-.. code-block:: none
+For V4L2 drivers:
+
+.. code-block:: c
 
 	struct foo_dev {
 		...
+		struct v4l2_device v4l2_dev;
+		...
 		struct v4l2_ctrl_handler ctrl_handler;
 		...
 	};
 
-	struct foo_dev *foo;
+For sub-device drivers:
+
+.. code-block:: c
+
+	struct foo_dev {
+		...
+		struct v4l2_subdev sd;
+		...
+		struct v4l2_ctrl_handler ctrl_handler;
+		...
+	};
 
 1.2) Initialize the handler:
 
-.. code-block:: none
+.. code-block:: c
 
 	v4l2_ctrl_handler_init(&foo->ctrl_handler, nr_of_controls);
 
@@ -72,72 +92,48 @@ information. It is a hint only.
 
 1.3) Hook the control handler into the driver:
 
-1.3.1) For V4L2 drivers do this:
+For V4L2 drivers:
 
-.. code-block:: none
-
-	struct foo_dev {
-		...
-		struct v4l2_device v4l2_dev;
-		...
-		struct v4l2_ctrl_handler ctrl_handler;
-		...
-	};
+.. code-block:: c
 
 	foo->v4l2_dev.ctrl_handler = &foo->ctrl_handler;
 
-Where foo->v4l2_dev is of type struct v4l2_device.
-
-Finally, remove all control functions from your v4l2_ioctl_ops (if any):
-vidioc_queryctrl, vidioc_query_ext_ctrl, vidioc_querymenu, vidioc_g_ctrl,
-vidioc_s_ctrl, vidioc_g_ext_ctrls, vidioc_try_ext_ctrls and vidioc_s_ext_ctrls.
-Those are now no longer needed.
-
-1.3.2) For sub-device drivers do this:
-
-.. code-block:: none
+For sub-device drivers:
 
-	struct foo_dev {
-		...
-		struct v4l2_subdev sd;
-		...
-		struct v4l2_ctrl_handler ctrl_handler;
-		...
-	};
+.. code-block:: c
 
 	foo->sd.ctrl_handler = &foo->ctrl_handler;
 
-Where foo->sd is of type struct v4l2_subdev.
-
 1.4) Clean up the handler at the end:
 
-.. code-block:: none
+.. code-block:: c
 
 	v4l2_ctrl_handler_free(&foo->ctrl_handler);
 
 
 2) Add controls:
 
-You add non-menu controls by calling v4l2_ctrl_new_std:
+You add non-menu controls by calling :c:func:`v4l2_ctrl_new_std`:
 
-.. code-block:: none
+.. code-block:: c
 
 	struct v4l2_ctrl *v4l2_ctrl_new_std(struct v4l2_ctrl_handler *hdl,
 			const struct v4l2_ctrl_ops *ops,
 			u32 id, s32 min, s32 max, u32 step, s32 def);
 
-Menu and integer menu controls are added by calling v4l2_ctrl_new_std_menu:
+Menu and integer menu controls are added by calling
+:c:func:`v4l2_ctrl_new_std_menu`:
 
-.. code-block:: none
+.. code-block:: c
 
 	struct v4l2_ctrl *v4l2_ctrl_new_std_menu(struct v4l2_ctrl_handler *hdl,
 			const struct v4l2_ctrl_ops *ops,
 			u32 id, s32 max, s32 skip_mask, s32 def);
 
 Menu controls with a driver specific menu are added by calling
-v4l2_ctrl_new_std_menu_items:
+:c:func:`v4l2_ctrl_new_std_menu_items`:
 
-.. code-block:: none
+.. code-block:: c
 
        struct v4l2_ctrl *v4l2_ctrl_new_std_menu_items(
                        struct v4l2_ctrl_handler *hdl,
@@ -145,17 +141,18 @@ v4l2_ctrl_new_std_menu_items:
                        s32 skip_mask, s32 def, const char * const *qmenu);
 
 Integer menu controls with a driver specific menu can be added by calling
-v4l2_ctrl_new_int_menu:
+:c:func:`v4l2_ctrl_new_int_menu`:
 
-.. code-block:: none
+.. code-block:: c
 
 	struct v4l2_ctrl *v4l2_ctrl_new_int_menu(struct v4l2_ctrl_handler *hdl,
 			const struct v4l2_ctrl_ops *ops,
 			u32 id, s32 max, s32 def, const s64 *qmenu_int);
 
-These functions are typically called right after the v4l2_ctrl_handler_init:
+These functions are typically called right after the
+:c:func:`v4l2_ctrl_handler_init`:
 
-.. code-block:: none
+.. code-block:: c
 
 	static const s64 exp_bias_qmenu[] = {
 	       -2, -1, 0, 1, 2
@@ -192,33 +189,34 @@ These functions are typically called right after the v4l2_ctrl_handler_init:
 		return err;
 	}
 
-The v4l2_ctrl_new_std function returns the v4l2_ctrl pointer to the new
-control, but if you do not need to access the pointer outside the control ops,
-then there is no need to store it.
-
-The v4l2_ctrl_new_std function will fill in most fields based on the control
-ID except for the min, max, step and default values. These are passed in the
-last four arguments. These values are driver specific while control attributes
-like type, name, flags are all global. The control's current value will be set
-to the default value.
-
-The v4l2_ctrl_new_std_menu function is very similar but it is used for menu
-controls. There is no min argument since that is always 0 for menu controls,
-and instead of a step there is a skip_mask argument: if bit X is 1, then menu
-item X is skipped.
-
-The v4l2_ctrl_new_int_menu function creates a new standard integer menu
-control with driver-specific items in the menu. It differs from
-v4l2_ctrl_new_std_menu in that it doesn't have the mask argument and takes
-as the last argument an array of signed 64-bit integers that form an exact
-menu item list.
-
-The v4l2_ctrl_new_std_menu_items function is very similar to
-v4l2_ctrl_new_std_menu but takes an extra parameter qmenu, which is the driver
-specific menu for an otherwise standard menu control. A good example for this
-control is the test pattern control for capture/display/sensors devices that
-have the capability to generate test patterns. These test patterns are hardware
-specific, so the contents of the menu will vary from device to device.
+The :c:func:`v4l2_ctrl_new_std` function returns the v4l2_ctrl pointer to
+the new control, but if you do not need to access the pointer outside the
+control ops, then there is no need to store it.
+
+The :c:func:`v4l2_ctrl_new_std` function will fill in most fields based on
+the control ID except for the min, max, step and default values. These are
+passed in the last four arguments. These values are driver specific while
+control attributes like type, name, flags are all global. The control's
+current value will be set to the default value.
+
+The :c:func:`v4l2_ctrl_new_std_menu` function is very similar but it is
+used for menu controls. There is no min argument since that is always 0 for
+menu controls, and instead of a step there is a skip_mask argument: if bit
+X is 1, then menu item X is skipped.
+
+The :c:func:`v4l2_ctrl_new_int_menu` function creates a new standard
+integer menu control with driver-specific items in the menu. It differs
+from v4l2_ctrl_new_std_menu in that it doesn't have the mask argument and
+takes as the last argument an array of signed 64-bit integers that form an
+exact menu item list.
+
+The :c:func:`v4l2_ctrl_new_std_menu_items` function is very similar to
+v4l2_ctrl_new_std_menu but takes an extra parameter qmenu, which is the
+driver specific menu for an otherwise standard menu control. A good example
+for this control is the test pattern control for capture/display/sensors
+devices that have the capability to generate test patterns. These test
+patterns are hardware specific, so the contents of the menu will vary from
+device to device.
 
 Note that if something fails, the function will return NULL or an error and
 set ctrl_handler->error to the error code. If ctrl_handler->error was already
@@ -233,7 +231,7 @@ a bit faster that way.
 
 3) Optionally force initial control setup:
 
-.. code-block:: none
+.. code-block:: c
 
 	v4l2_ctrl_handler_setup(&foo->ctrl_handler);
 
@@ -242,9 +240,9 @@ initializes the hardware to the default control values. It is recommended
 that you do this as this ensures that both the internal data structures and
 the hardware are in sync.
 
-4) Finally: implement the v4l2_ctrl_ops
+4) Finally: implement the :c:type:`v4l2_ctrl_ops`
 
-.. code-block:: none
+.. code-block:: c
 
 	static const struct v4l2_ctrl_ops foo_ctrl_ops = {
 		.s_ctrl = foo_s_ctrl,
@@ -252,7 +250,7 @@ the hardware are in sync.
 
 Usually all you need is s_ctrl:
 
-.. code-block:: none
+.. code-block:: c
 
 	static int foo_s_ctrl(struct v4l2_ctrl *ctrl)
 	{
@@ -305,7 +303,7 @@ Accessing Control Values
 The following union is used inside the control framework to access control
 values:
 
-.. code-block:: none
+.. code-block:: c
 
 	union v4l2_ctrl_ptr {
 		s32 *p_s32;
@@ -317,7 +315,7 @@ values:
 The v4l2_ctrl struct contains these fields that can be used to access both
 current and new values:
 
-.. code-block:: none
+.. code-block:: c
 
 	s32 val;
 	struct {
@@ -330,7 +328,7 @@ current and new values:
 
 If the control has a simple s32 type type, then:
 
-.. code-block:: none
+.. code-block:: c
 
 	&ctrl->val == ctrl->p_new.p_s32
 	&ctrl->cur.val == ctrl->p_cur.p_s32
@@ -354,7 +352,7 @@ exception is for controls that return a volatile register such as a signal
 strength read-out that changes continuously. In that case you will need to
 implement g_volatile_ctrl like this:
 
-.. code-block:: none
+.. code-block:: c
 
 	static int foo_g_volatile_ctrl(struct v4l2_ctrl *ctrl)
 	{
@@ -372,7 +370,7 @@ changes.
 
 To mark a control as volatile you have to set V4L2_CTRL_FLAG_VOLATILE:
 
-.. code-block:: none
+.. code-block:: c
 
 	ctrl = v4l2_ctrl_new_std(&sd->ctrl_handler, ...);
 	if (ctrl)
@@ -393,7 +391,7 @@ not to introduce deadlocks.
 Outside of the control ops you have to go through to helper functions to get
 or set a single control value safely in your driver:
 
-.. code-block:: none
+.. code-block:: c
 
 	s32 v4l2_ctrl_g_ctrl(struct v4l2_ctrl *ctrl);
 	int v4l2_ctrl_s_ctrl(struct v4l2_ctrl *ctrl, s32 val);
@@ -404,7 +402,7 @@ will result in a deadlock since these helpers lock the handler as well.
 
 You can also take the handler lock yourself:
 
-.. code-block:: none
+.. code-block:: c
 
 	mutex_lock(&state->ctrl_handler.lock);
 	pr_info("String value is '%s'\n", ctrl1->p_cur.p_char);
@@ -417,7 +415,7 @@ Menu Controls
 
 The v4l2_ctrl struct contains this union:
 
-.. code-block:: none
+.. code-block:: c
 
 	union {
 		u32 step;
@@ -445,7 +443,7 @@ Custom Controls
 
 Driver specific controls can be created using v4l2_ctrl_new_custom():
 
-.. code-block:: none
+.. code-block:: c
 
 	static const struct v4l2_ctrl_config ctrl_filter = {
 		.ops = &ctrl_custom_ops,
@@ -499,7 +497,7 @@ By default all controls are independent from the others. But in more
 complex scenarios you can get dependencies from one control to another.
 In that case you need to 'cluster' them:
 
-.. code-block:: none
+.. code-block:: c
 
 	struct foo {
 		struct v4l2_ctrl_handler ctrl_handler;
@@ -523,7 +521,7 @@ composite control. Similar to how a 'struct' works in C.
 So when s_ctrl is called with V4L2_CID_AUDIO_VOLUME as argument, you should set
 all two controls belonging to the audio_cluster:
 
-.. code-block:: none
+.. code-block:: c
 
 	static int foo_s_ctrl(struct v4l2_ctrl *ctrl)
 	{
@@ -545,7 +543,7 @@ all two controls belonging to the audio_cluster:
 
 In the example above the following are equivalent for the VOLUME case:
 
-.. code-block:: none
+.. code-block:: c
 
 	ctrl == ctrl->cluster[AUDIO_CL_VOLUME] == state->audio_cluster[AUDIO_CL_VOLUME]
 	ctrl->cluster[AUDIO_CL_MUTE] == state->audio_cluster[AUDIO_CL_MUTE]
@@ -553,7 +551,7 @@ In the example above the following are equivalent for the VOLUME case:
 In practice using cluster arrays like this becomes very tiresome. So instead
 the following equivalent method is used:
 
-.. code-block:: none
+.. code-block:: c
 
 	struct {
 		/* audio cluster */
@@ -565,7 +563,7 @@ The anonymous struct is used to clearly 'cluster' these two control pointers,
 but it serves no other purpose. The effect is the same as creating an
 array with two control pointers. So you can just do:
 
-.. code-block:: none
+.. code-block:: c
 
 	state->volume = v4l2_ctrl_new_std(&state->ctrl_handler, ...);
 	state->mute = v4l2_ctrl_new_std(&state->ctrl_handler, ...);
@@ -621,7 +619,7 @@ changing that control affects the control flags of the manual controls.
 In order to simplify this a special variation of v4l2_ctrl_cluster was
 introduced:
 
-.. code-block:: none
+.. code-block:: c
 
 	void v4l2_ctrl_auto_cluster(unsigned ncontrols, struct v4l2_ctrl **controls,
 				    u8 manual_val, bool set_volatile);
@@ -676,7 +674,7 @@ of another handler (e.g. for a video device node), then you should first add
 the controls to the first handler, add the other controls to the second
 handler and finally add the first handler to the second. For example:
 
-.. code-block:: none
+.. code-block:: c
 
 	v4l2_ctrl_new_std(&radio_ctrl_handler, &radio_ops, V4L2_CID_AUDIO_VOLUME, ...);
 	v4l2_ctrl_new_std(&radio_ctrl_handler, &radio_ops, V4L2_CID_AUDIO_MUTE, ...);
@@ -690,7 +688,7 @@ all controls.
 
 Or you can add specific controls to a handler:
 
-.. code-block:: none
+.. code-block:: c
 
 	volume = v4l2_ctrl_new_std(&video_ctrl_handler, &ops, V4L2_CID_AUDIO_VOLUME, ...);
 	v4l2_ctrl_new_std(&video_ctrl_handler, &ops, V4L2_CID_BRIGHTNESS, ...);
@@ -699,7 +697,7 @@ Or you can add specific controls to a handler:
 What you should not do is make two identical controls for two handlers.
 For example:
 
-.. code-block:: none
+.. code-block:: c
 
 	v4l2_ctrl_new_std(&radio_ctrl_handler, &radio_ops, V4L2_CID_AUDIO_MUTE, ...);
 	v4l2_ctrl_new_std(&video_ctrl_handler, &video_ops, V4L2_CID_AUDIO_MUTE, ...);
@@ -720,7 +718,7 @@ not own. For example, if you have to find a volume control from a subdev.
 
 You can do that by calling v4l2_ctrl_find:
 
-.. code-block:: none
+.. code-block:: c
 
 	struct v4l2_ctrl *volume;
 
@@ -729,7 +727,7 @@ You can do that by calling v4l2_ctrl_find:
 Since v4l2_ctrl_find will lock the handler you have to be careful where you
 use it. For example, this is not a good idea:
 
-.. code-block:: none
+.. code-block:: c
 
 	struct v4l2_ctrl_handler ctrl_handler;
 
@@ -738,7 +736,7 @@ use it. For example, this is not a good idea:
 
 ...and in video_ops.s_ctrl:
 
-.. code-block:: none
+.. code-block:: c
 
 	case V4L2_CID_BRIGHTNESS:
 		contrast = v4l2_find_ctrl(&ctrl_handler, V4L2_CID_CONTRAST);
@@ -760,7 +758,7 @@ not when it is used in consumer-level hardware. In that case you want to keep
 those low-level controls local to the subdev. You can do this by simply
 setting the 'is_private' flag of the control to 1:
 
-.. code-block:: none
+.. code-block:: c
 
 	static const struct v4l2_ctrl_config ctrl_private = {
 		.ops = &ctrl_custom_ops,
@@ -797,7 +795,7 @@ Sometimes the platform or bridge driver needs to be notified when a control
 from a sub-device driver changes. You can set a notify callback by calling
 this function:
 
-.. code-block:: none
+.. code-block:: c
 
 	void v4l2_ctrl_notify(struct v4l2_ctrl *ctrl,
 		void (*notify)(struct v4l2_ctrl *ctrl, void *priv), void *priv);
diff --git a/Documentation/media/uapi/cec/cec-api.rst b/Documentation/media/uapi/cec/cec-api.rst
index b614bf81aa20..0780ba07995a 100644
--- a/Documentation/media/uapi/cec/cec-api.rst
+++ b/Documentation/media/uapi/cec/cec-api.rst
@@ -39,7 +39,7 @@ Revision and Copyright
 **********************
 Authors:
 
-- Verkuil, Hans <hans.verkuil@cisco.com>
+- Verkuil, Hans <hverkuil-cisco@xs4all.nl>
 
  - Initial version.
 
diff --git a/Documentation/media/uapi/cec/cec-ioc-g-mode.rst b/Documentation/media/uapi/cec/cec-ioc-g-mode.rst
index c53bb5f73f0d..d0902f356d65 100644
--- a/Documentation/media/uapi/cec/cec-ioc-g-mode.rst
+++ b/Documentation/media/uapi/cec/cec-ioc-g-mode.rst
@@ -294,7 +294,8 @@ EINVAL
     The requested mode is invalid.
 
 EPERM
-    Monitor mode is requested without having root permissions
+    Monitor mode is requested, but the process does have the ``CAP_NET_ADMIN``
+    capability.
 
 EBUSY
     Someone else is already an exclusive follower or initiator.
diff --git a/Documentation/media/uapi/cec/cec-ioc-receive.rst b/Documentation/media/uapi/cec/cec-ioc-receive.rst
index c3a685ff05cb..4137903d672e 100644
--- a/Documentation/media/uapi/cec/cec-ioc-receive.rst
+++ b/Documentation/media/uapi/cec/cec-ioc-receive.rst
@@ -223,6 +223,18 @@ View On' messages from initiator 0xf ('Unregistered') to destination 0 ('TV').
 	result of the :ref:`ioctl CEC_TRANSMIT <CEC_TRANSMIT>`, and once via
 	:ref:`ioctl CEC_RECEIVE <CEC_RECEIVE>`.
 
+    * .. _`CEC-MSG-FL-RAW`:
+
+      - ``CEC_MSG_FL_RAW``
+      - 2
+      - Normally CEC messages are validated before transmitting them. If this
+        flag is set when :ref:`ioctl CEC_TRANSMIT <CEC_TRANSMIT>` is called,
+	then no validation takes place and the message is transmitted as-is.
+	This is useful when debugging CEC issues.
+	This flag is only allowed if the process has the ``CAP_SYS_RAWIO``
+	capability. If that is not set, then the ``EPERM`` error code is
+	returned.
+
 
 .. tabularcolumns:: |p{5.6cm}|p{0.9cm}|p{11.0cm}|
 
@@ -358,7 +370,8 @@ ENOTTY
 
 EPERM
     The CEC adapter is not configured, i.e. :ref:`ioctl CEC_ADAP_S_LOG_ADDRS <CEC_ADAP_S_LOG_ADDRS>`
-    has never been called.
+    has never been called, or ``CEC_MSG_FL_RAW`` was used from a process that
+    did not have the ``CAP_SYS_RAWIO`` capability.
 
 ENONET
     The CEC adapter is not configured, i.e. :ref:`ioctl CEC_ADAP_S_LOG_ADDRS <CEC_ADAP_S_LOG_ADDRS>`
diff --git a/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst b/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst
index a982f16e55a4..b827ebc398f8 100644
--- a/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst
+++ b/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst
@@ -84,6 +84,11 @@ returned during the enumeration process.
        -  Pointer to a links array allocated by the application. Ignored if
 	  NULL.
 
+    *  -  __u32
+       -  ``reserved[4]``
+       -  Reserved for future extensions. Drivers and applications must set
+          the array to zero.
+
 
 .. c:type:: media_pad_desc
 
@@ -135,7 +140,7 @@ returned during the enumeration process.
        -  Link flags, see :ref:`media-link-flag` for more details.
 
     *  -  __u32
-       -  ``reserved[4]``
+       -  ``reserved[2]``
        -  Reserved for future extensions. Drivers and applications must set
           the array to zero.
 
diff --git a/Documentation/media/uapi/rc/rc-tables.rst b/Documentation/media/uapi/rc/rc-tables.rst
index 177ac44fa0fa..20d7c686922b 100644
--- a/Documentation/media/uapi/rc/rc-tables.rst
+++ b/Documentation/media/uapi/rc/rc-tables.rst
@@ -54,7 +54,7 @@ the remote via /dev/input/event devices.
 
     -  .. row 3
 
-       -  ``KEY_0``
+       -  ``KEY_NUMERIC_0``
 
        -  Keyboard digit 0
 
@@ -62,7 +62,7 @@ the remote via /dev/input/event devices.
 
     -  .. row 4
 
-       -  ``KEY_1``
+       -  ``KEY_NUMERIC_1``
 
        -  Keyboard digit 1
 
@@ -70,7 +70,7 @@ the remote via /dev/input/event devices.
 
     -  .. row 5
 
-       -  ``KEY_2``
+       -  ``KEY_NUMERIC_2``
 
        -  Keyboard digit 2
 
@@ -78,7 +78,7 @@ the remote via /dev/input/event devices.
 
     -  .. row 6
 
-       -  ``KEY_3``
+       -  ``KEY_NUMERIC_3``
 
        -  Keyboard digit 3
 
@@ -86,7 +86,7 @@ the remote via /dev/input/event devices.
 
     -  .. row 7
 
-       -  ``KEY_4``
+       -  ``KEY_NUMERIC_4``
 
        -  Keyboard digit 4
 
@@ -94,7 +94,7 @@ the remote via /dev/input/event devices.
 
     -  .. row 8
 
-       -  ``KEY_5``
+       -  ``KEY_NUMERIC_5``
 
        -  Keyboard digit 5
 
@@ -102,7 +102,7 @@ the remote via /dev/input/event devices.
 
     -  .. row 9
 
-       -  ``KEY_6``
+       -  ``KEY_NUMERIC_6``
 
        -  Keyboard digit 6
 
@@ -110,7 +110,7 @@ the remote via /dev/input/event devices.
 
     -  .. row 10
 
-       -  ``KEY_7``
+       -  ``KEY_NUMERIC_7``
 
        -  Keyboard digit 7
 
@@ -118,7 +118,7 @@ the remote via /dev/input/event devices.
 
     -  .. row 11
 
-       -  ``KEY_8``
+       -  ``KEY_NUMERIC_8``
 
        -  Keyboard digit 8
 
@@ -126,7 +126,7 @@ the remote via /dev/input/event devices.
 
     -  .. row 12
 
-       -  ``KEY_9``
+       -  ``KEY_NUMERIC_9``
 
        -  Keyboard digit 9
 
@@ -196,7 +196,7 @@ the remote via /dev/input/event devices.
 
        -  ``KEY_PAUSE``
 
-       -  Pause sroweam
+       -  Pause stream
 
        -  PAUSE / FREEZE
 
@@ -220,7 +220,7 @@ the remote via /dev/input/event devices.
 
        -  ``KEY_STOP``
 
-       -  Stop sroweam
+       -  Stop stream
 
        -  STOP
 
@@ -228,7 +228,7 @@ the remote via /dev/input/event devices.
 
        -  ``KEY_RECORD``
 
-       -  Start/stop recording sroweam
+       -  Start/stop recording stream
 
        -  CAPTURE / REC / RECORD/PAUSE
 
@@ -577,7 +577,7 @@ the remote via /dev/input/event devices.
 
        -  ``KEY_CLEAR``
 
-       -  Stop sroweam and return to default input video/audio
+       -  Stop stream and return to default input video/audio
 
        -  CLEAR / RESET / BOSS KEY
 
@@ -593,7 +593,7 @@ the remote via /dev/input/event devices.
 
        -  ``KEY_FAVORITES``
 
-       -  Open the favorites sroweam window
+       -  Open the favorites stream window
 
        -  TV WALL / Favorites
 
diff --git a/Documentation/media/uapi/v4l/biblio.rst b/Documentation/media/uapi/v4l/biblio.rst
index ec33768c055e..8f4eb8823d82 100644
--- a/Documentation/media/uapi/v4l/biblio.rst
+++ b/Documentation/media/uapi/v4l/biblio.rst
@@ -122,6 +122,15 @@ ITU BT.1119
 
 :author:    International Telecommunication Union (http://www.itu.ch)
 
+.. _h264:
+
+ITU-T Rec. H.264 Specification (04/2017 Edition)
+================================================
+
+:title:     ITU-T Recommendation H.264 "Advanced Video Coding for Generic Audiovisual Services"
+
+:author:    International Telecommunication Union (http://www.itu.ch)
+
 .. _jfif:
 
 JFIF
diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
index 4a8446203085..d6ea2ffd65c5 100644
--- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
+++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
@@ -759,6 +759,32 @@ enum v4l2_mpeg_video_h264_level -
 
 
 
+.. _v4l2-mpeg-video-mpeg2-level:
+
+``V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL``
+    (enum)
+
+enum v4l2_mpeg_video_mpeg2_level -
+    The level information for the MPEG2 elementary stream. Applicable to
+    MPEG2 codecs. Possible values are:
+
+
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+
+    * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_LOW``
+      - Low Level (LL)
+    * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_MAIN``
+      - Main Level (ML)
+    * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_HIGH_1440``
+      - High-1440 Level (H-14)
+    * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_HIGH``
+      - High Level (HL)
+
+
+
 .. _v4l2-mpeg-video-mpeg4-level:
 
 ``V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL``
@@ -845,6 +871,36 @@ enum v4l2_mpeg_video_h264_profile -
 
 
 
+.. _v4l2-mpeg-video-mpeg2-profile:
+
+``V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE``
+    (enum)
+
+enum v4l2_mpeg_video_mpeg2_profile -
+    The profile information for MPEG2. Applicable to MPEG2 codecs.
+    Possible values are:
+
+
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+
+    * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_SIMPLE``
+      - Simple profile (SP)
+    * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_MAIN``
+      - Main profile (MP)
+    * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_SNR_SCALABLE``
+      - SNR Scalable profile (SNR)
+    * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_SPATIALLY_SCALABLE``
+      - Spatially Scalable profile (Spt)
+    * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_HIGH``
+      - High profile (HP)
+    * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_MULTIVIEW``
+      - Multi-view profile (MVP)
+
+
+
 .. _v4l2-mpeg-video-mpeg4-profile:
 
 ``V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE``
@@ -1395,6 +1451,575 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
       - Layer number
 
 
+.. _v4l2-mpeg-h264:
+
+``V4L2_CID_MPEG_VIDEO_H264_SPS (struct)``
+    Specifies the sequence parameter set (as extracted from the
+    bitstream) for the associated H264 slice data. This includes the
+    necessary parameters for configuring a stateless hardware decoding
+    pipeline for H264. The bitstream parameters are defined according
+    to :ref:`h264`, section 7.4.2.1.1 "Sequence Parameter Set Data
+    Semantics". For further documentation, refer to the above
+    specification, unless there is an explicit comment stating
+    otherwise.
+
+    .. note::
+
+       This compound control is not yet part of the public kernel API and
+       it is expected to change.
+
+.. c:type:: v4l2_ctrl_h264_sps
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_h264_sps
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u8
+      - ``profile_idc``
+      -
+    * - __u8
+      - ``constraint_set_flags``
+      - See :ref:`Sequence Parameter Set Constraints Set Flags <h264_sps_constraints_set_flags>`
+    * - __u8
+      - ``level_idc``
+      -
+    * - __u8
+      - ``seq_parameter_set_id``
+      -
+    * - __u8
+      - ``chroma_format_idc``
+      -
+    * - __u8
+      - ``bit_depth_luma_minus8``
+      -
+    * - __u8
+      - ``bit_depth_chroma_minus8``
+      -
+    * - __u8
+      - ``log2_max_frame_num_minus4``
+      -
+    * - __u8
+      - ``pic_order_cnt_type``
+      -
+    * - __u8
+      - ``log2_max_pic_order_cnt_lsb_minus4``
+      -
+    * - __u8
+      - ``max_num_ref_frames``
+      -
+    * - __u8
+      - ``num_ref_frames_in_pic_order_cnt_cycle``
+      -
+    * - __s32
+      - ``offset_for_ref_frame[255]``
+      -
+    * - __s32
+      - ``offset_for_non_ref_pic``
+      -
+    * - __s32
+      - ``offset_for_top_to_bottom_field``
+      -
+    * - __u16
+      - ``pic_width_in_mbs_minus1``
+      -
+    * - __u16
+      - ``pic_height_in_map_units_minus1``
+      -
+    * - __u32
+      - ``flags``
+      - See :ref:`Sequence Parameter Set Flags <h264_sps_flags>`
+
+.. _h264_sps_constraints_set_flags:
+
+``Sequence Parameter Set Constraints Set Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_H264_SPS_CONSTRAINT_SET0_FLAG``
+      - 0x00000001
+      -
+    * - ``V4L2_H264_SPS_CONSTRAINT_SET1_FLAG``
+      - 0x00000002
+      -
+    * - ``V4L2_H264_SPS_CONSTRAINT_SET2_FLAG``
+      - 0x00000004
+      -
+    * - ``V4L2_H264_SPS_CONSTRAINT_SET3_FLAG``
+      - 0x00000008
+      -
+    * - ``V4L2_H264_SPS_CONSTRAINT_SET4_FLAG``
+      - 0x00000010
+      -
+    * - ``V4L2_H264_SPS_CONSTRAINT_SET5_FLAG``
+      - 0x00000020
+      -
+
+.. _h264_sps_flags:
+
+``Sequence Parameter Set Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE``
+      - 0x00000001
+      -
+    * - ``V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS``
+      - 0x00000002
+      -
+    * - ``V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO``
+      - 0x00000004
+      -
+    * - ``V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED``
+      - 0x00000008
+      -
+    * - ``V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY``
+      - 0x00000010
+      -
+    * - ``V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD``
+      - 0x00000020
+      -
+    * - ``V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE``
+      - 0x00000040
+      -
+
+``V4L2_CID_MPEG_VIDEO_H264_PPS (struct)``
+    Specifies the picture parameter set (as extracted from the
+    bitstream) for the associated H264 slice data. This includes the
+    necessary parameters for configuring a stateless hardware decoding
+    pipeline for H264.  The bitstream parameters are defined according
+    to :ref:`h264`, section 7.4.2.2 "Picture Parameter Set RBSP
+    Semantics". For further documentation, refer to the above
+    specification, unless there is an explicit comment stating
+    otherwise.
+
+    .. note::
+
+       This compound control is not yet part of the public kernel API and
+       it is expected to change.
+
+.. c:type:: v4l2_ctrl_h264_pps
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_h264_pps
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u8
+      - ``pic_parameter_set_id``
+      -
+    * - __u8
+      - ``seq_parameter_set_id``
+      -
+    * - __u8
+      - ``num_slice_groups_minus1``
+      -
+    * - __u8
+      - ``num_ref_idx_l0_default_active_minus1``
+      -
+    * - __u8
+      - ``num_ref_idx_l1_default_active_minus1``
+      -
+    * - __u8
+      - ``weighted_bipred_idc``
+      -
+    * - __s8
+      - ``pic_init_qp_minus26``
+      -
+    * - __s8
+      - ``pic_init_qs_minus26``
+      -
+    * - __s8
+      - ``chroma_qp_index_offset``
+      -
+    * - __s8
+      - ``second_chroma_qp_index_offset``
+      -
+    * - __u16
+      - ``flags``
+      - See :ref:`Picture Parameter Set Flags <h264_pps_flags>`
+
+.. _h264_pps_flags:
+
+``Picture Parameter Set Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE``
+      - 0x00000001
+      -
+    * - ``V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT``
+      - 0x00000002
+      -
+    * - ``V4L2_H264_PPS_FLAG_WEIGHTED_PRED``
+      - 0x00000004
+      -
+    * - ``V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT``
+      - 0x00000008
+      -
+    * - ``V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED``
+      - 0x00000010
+      -
+    * - ``V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT``
+      - 0x00000020
+      -
+    * - ``V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE``
+      - 0x00000040
+      -
+    * - ``V4L2_H264_PPS_FLAG_PIC_SCALING_MATRIX_PRESENT``
+      - 0x00000080
+      -
+
+``V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX (struct)``
+    Specifies the scaling matrix (as extracted from the bitstream) for
+    the associated H264 slice data. The bitstream parameters are
+    defined according to :ref:`h264`, section 7.4.2.1.1.1 "Scaling
+    List Semantics". For further documentation, refer to the above
+    specification, unless there is an explicit comment stating
+    otherwise.
+
+    .. note::
+
+       This compound control is not yet part of the public kernel API and
+       it is expected to change.
+
+.. c:type:: v4l2_ctrl_h264_scaling_matrix
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_h264_scaling_matrix
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u8
+      - ``scaling_list_4x4[6][16]``
+      -
+    * - __u8
+      - ``scaling_list_8x8[6][64]``
+      -
+
+``V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS (struct)``
+    Specifies the slice parameters (as extracted from the bitstream)
+    for the associated H264 slice data. This includes the necessary
+    parameters for configuring a stateless hardware decoding pipeline
+    for H264.  The bitstream parameters are defined according to
+    :ref:`h264`, section 7.4.3 "Slice Header Semantics". For further
+    documentation, refer to the above specification, unless there is
+    an explicit comment stating otherwise.
+
+    .. note::
+
+       This compound control is not yet part of the public kernel API
+       and it is expected to change.
+
+       This structure is expected to be passed as an array, with one
+       entry for each slice included in the bitstream buffer.
+
+.. c:type:: v4l2_ctrl_h264_slice_params
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_h264_slice_params
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u32
+      - ``size``
+      -
+    * - __u32
+      - ``header_bit_size``
+      -
+    * - __u16
+      - ``first_mb_in_slice``
+      -
+    * - __u8
+      - ``slice_type``
+      -
+    * - __u8
+      - ``pic_parameter_set_id``
+      -
+    * - __u8
+      - ``colour_plane_id``
+      -
+    * - __u8
+      - ``redundant_pic_cnt``
+      -
+    * - __u16
+      - ``frame_num``
+      -
+    * - __u16
+      - ``idr_pic_id``
+      -
+    * - __u16
+      - ``pic_order_cnt_lsb``
+      -
+    * - __s32
+      - ``delta_pic_order_cnt_bottom``
+      -
+    * - __s32
+      - ``delta_pic_order_cnt0``
+      -
+    * - __s32
+      - ``delta_pic_order_cnt1``
+      -
+    * - struct :c:type:`v4l2_h264_pred_weight_table`
+      - ``pred_weight_table``
+      -
+    * - __u32
+      - ``dec_ref_pic_marking_bit_size``
+      -
+    * - __u32
+      - ``pic_order_cnt_bit_size``
+      -
+    * - __u8
+      - ``cabac_init_idc``
+      -
+    * - __s8
+      - ``slice_qp_delta``
+      -
+    * - __s8
+      - ``slice_qs_delta``
+      -
+    * - __u8
+      - ``disable_deblocking_filter_idc``
+      -
+    * - __s8
+      - ``slice_alpha_c0_offset_div2``
+      -
+    * - __s8
+      - ``slice_beta_offset_div2``
+      -
+    * - __u8
+      - ``num_ref_idx_l0_active_minus1``
+      -
+    * - __u8
+      - ``num_ref_idx_l1_active_minus1``
+      -
+    * - __u32
+      - ``slice_group_change_cycle``
+      -
+    * - __u8
+      - ``ref_pic_list0[32]``
+      - Reference picture list after applying the per-slice modifications
+    * - __u8
+      - ``ref_pic_list1[32]``
+      - Reference picture list after applying the per-slice modifications
+    * - __u32
+      - ``flags``
+      - See :ref:`Slice Parameter Flags <h264_slice_flags>`
+
+.. _h264_slice_flags:
+
+``Slice Parameter Set Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_H264_SLICE_FLAG_FIELD_PIC``
+      - 0x00000001
+      -
+    * - ``V4L2_H264_SLICE_FLAG_BOTTOM_FIELD``
+      - 0x00000002
+      -
+    * - ``V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED``
+      - 0x00000004
+      -
+    * - ``V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH``
+      - 0x00000008
+      -
+
+``Prediction Weight Table``
+
+    The bitstream parameters are defined according to :ref:`h264`,
+    section 7.4.3.2 "Prediction Weight Table Semantics". For further
+    documentation, refer to the above specification, unless there is
+    an explicit comment stating otherwise.
+
+.. c:type:: v4l2_h264_pred_weight_table
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_h264_pred_weight_table
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u16
+      - ``luma_log2_weight_denom``
+      -
+    * - __u16
+      - ``chroma_log2_weight_denom``
+      -
+    * - struct :c:type:`v4l2_h264_weight_factors`
+      - ``weight_factors[2]``
+      - The weight factors at index 0 are the weight factors for the reference
+        list 0, the one at index 1 for the reference list 1.
+
+.. c:type:: v4l2_h264_weight_factors
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_h264_weight_factors
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __s16
+      - ``luma_weight[32]``
+      -
+    * - __s16
+      - ``luma_offset[32]``
+      -
+    * - __s16
+      - ``chroma_weight[32][2]``
+      -
+    * - __s16
+      - ``chroma_offset[32][2]``
+      -
+
+``V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS (struct)``
+    Specifies the decode parameters (as extracted from the bitstream)
+    for the associated H264 slice data. This includes the necessary
+    parameters for configuring a stateless hardware decoding pipeline
+    for H264. The bitstream parameters are defined according to
+    :ref:`h264`. For further documentation, refer to the above
+    specification, unless there is an explicit comment stating
+    otherwise.
+
+    .. note::
+
+       This compound control is not yet part of the public kernel API and
+       it is expected to change.
+
+.. c:type:: v4l2_ctrl_h264_decode_params
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_h264_decode_params
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - struct :c:type:`v4l2_h264_dpb_entry`
+      - ``dpb[16]``
+      -
+    * - __u16
+      - ``num_slices``
+      - Number of slices needed to decode the current frame
+    * - __u16
+      - ``nal_ref_idc``
+      - NAL reference ID value coming from the NAL Unit header
+    * - __u8
+      - ``ref_pic_list_p0[32]``
+      - Backward reference list used by P-frames in the original bitstream order
+    * - __u8
+      - ``ref_pic_list_b0[32]``
+      - Backward reference list used by B-frames in the original bitstream order
+    * - __u8
+      - ``ref_pic_list_b1[32]``
+      - Forward reference list used by B-frames in the original bitstream order
+    * - __s32
+      - ``top_field_order_cnt``
+      - Picture Order Count for the coded top field
+    * - __s32
+      - ``bottom_field_order_cnt``
+      - Picture Order Count for the coded bottom field
+    * - __u32
+      - ``flags``
+      - See :ref:`Decode Parameters Flags <h264_decode_params_flags>`
+
+.. _h264_decode_params_flags:
+
+``Decode Parameters Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC``
+      - 0x00000001
+      - That picture is an IDR picture
+
+.. c:type:: v4l2_h264_dpb_entry
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_h264_dpb_entry
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u64
+      - ``reference_ts``
+      - Timestamp of the V4L2 capture buffer to use as reference, used
+        with B-coded and P-coded frames. The timestamp refers to the
+	``timestamp`` field in struct :c:type:`v4l2_buffer`. Use the
+	:c:func:`v4l2_timeval_to_ns()` function to convert the struct
+	:c:type:`timeval` in struct :c:type:`v4l2_buffer` to a __u64.
+    * - __u16
+      - ``frame_num``
+      -
+    * - __u16
+      - ``pic_num``
+      -
+    * - __s32
+      - ``top_field_order_cnt``
+      -
+    * - __s32
+      - ``bottom_field_order_cnt``
+      -
+    * - __u32
+      - ``flags``
+      - See :ref:`DPB Entry Flags <h264_dpb_flags>`
+
+.. _h264_dpb_flags:
+
+``DPB Entries Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_H264_DPB_ENTRY_FLAG_VALID``
+      - 0x00000001
+      - The DPB entry is valid and should be considered
+    * - ``V4L2_H264_DPB_ENTRY_FLAG_ACTIVE``
+      - 0x00000002
+      - The DPB entry is currently being used as a reference frame
+    * - ``V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM``
+      - 0x00000004
+      - The DPB entry is a long term reference frame
 
 .. _v4l2-mpeg-mpeg2:
 
diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst
index 24274b398e63..655362483730 100644
--- a/Documentation/media/uapi/v4l/extended-controls.rst
+++ b/Documentation/media/uapi/v4l/extended-controls.rst
@@ -85,20 +85,17 @@ be able to see such compound controls. In other words, these controls
 with compound types should only be used programmatically.
 
 Since such compound controls need to expose more information about
-themselves than is possible with
-:ref:`VIDIOC_QUERYCTRL` the
-:ref:`VIDIOC_QUERY_EXT_CTRL <VIDIOC_QUERYCTRL>` ioctl was added. In
-particular, this ioctl gives the dimensions of the N-dimensional array
-if this control consists of more than one element.
+themselves than is possible with :ref:`VIDIOC_QUERYCTRL <VIDIOC_QUERYCTRL>`
+the :ref:`VIDIOC_QUERY_EXT_CTRL <VIDIOC_QUERYCTRL>` ioctl was added. In
+particular, this ioctl gives the dimensions of the N-dimensional array if
+this control consists of more than one element.
 
 .. note::
 
    #. It is important to realize that due to the flexibility of controls it is
       necessary to check whether the control you want to set actually is
       supported in the driver and what the valid range of values is. So use
-      the :ref:`VIDIOC_QUERYCTRL` (or :ref:`VIDIOC_QUERY_EXT_CTRL
-      <VIDIOC_QUERYCTRL>`) and :ref:`VIDIOC_QUERYMENU <VIDIOC_QUERYCTRL>`
-      ioctls to check this.
+      :ref:`VIDIOC_QUERYCTRL` to check this.
 
    #. It is possible that some of the menu indices in a control of
       type ``V4L2_CTRL_TYPE_MENU`` may not be supported (``VIDIOC_QUERYMENU``
@@ -144,7 +141,7 @@ control class is found:
     while (0 == ioctl(fd, VIDIOC_QUERYCTRL, &qctrl)) {
 	if (V4L2_CTRL_ID2CLASS(qctrl.id) != V4L2_CTRL_CLASS_MPEG)
 	    break;
-	    /* ... */
+	/* ... */
 	qctrl.id |= V4L2_CTRL_FLAG_NEXT_CTRL;
     }
 
diff --git a/Documentation/media/uapi/v4l/field-order.rst b/Documentation/media/uapi/v4l/field-order.rst
index d640e922a974..c422bebe4314 100644
--- a/Documentation/media/uapi/v4l/field-order.rst
+++ b/Documentation/media/uapi/v4l/field-order.rst
@@ -51,6 +51,11 @@ determined by the video standard. Hence the distinction between temporal
 and spatial order of fields. The diagrams below should make this
 clearer.
 
+In V4L it is assumed that all video cameras transmit fields on the media
+bus in the same order they were captured, so if the top field was
+captured first (is the older field), the top field is also transmitted
+first on the bus.
+
 All video capture and output devices must report the current field
 order. Some drivers may permit the selection of a different order, to
 this end applications initialize the ``field`` field of struct
@@ -101,10 +106,10 @@ enum v4l2_field
     * - ``V4L2_FIELD_INTERLACED``
       - 4
       - Images contain both fields, interleaved line by line. The temporal
-	order of the fields (whether the top or bottom field is first
-	transmitted) depends on the current video standard. M/NTSC
-	transmits the bottom field first, all other standards the top
-	field first.
+	order of the fields (whether the top or bottom field is older)
+	depends on the current video standard. In M/NTSC the bottom
+	field is the older field. In all other standards the top field
+	is the older field.
     * - ``V4L2_FIELD_SEQ_TB``
       - 5
       - Images contain both fields, the top field lines are stored first
@@ -135,11 +140,11 @@ enum v4l2_field
     * - ``V4L2_FIELD_INTERLACED_TB``
       - 8
       - Images contain both fields, interleaved line by line, top field
-	first. The top field is transmitted first.
+	first. The top field is the older field.
     * - ``V4L2_FIELD_INTERLACED_BT``
       - 9
       - Images contain both fields, interleaved line by line, top field
-	first. The bottom field is transmitted first.
+	first. The bottom field is the older field.
 
 
 
diff --git a/Documentation/media/uapi/v4l/pixfmt-compressed.rst b/Documentation/media/uapi/v4l/pixfmt-compressed.rst
index 6c961cfb74da..4b701fc7653e 100644
--- a/Documentation/media/uapi/v4l/pixfmt-compressed.rst
+++ b/Documentation/media/uapi/v4l/pixfmt-compressed.rst
@@ -52,6 +52,31 @@ Compressed Formats
       - ``V4L2_PIX_FMT_H264_MVC``
       - 'M264'
       - H264 MVC video elementary stream.
+    * .. _V4L2-PIX-FMT-H264-SLICE-RAW:
+
+      - ``V4L2_PIX_FMT_H264_SLICE_RAW``
+      - 'S264'
+      - H264 parsed slice data, without the start code and as
+	extracted from the H264 bitstream.  This format is adapted for
+	stateless video decoders that implement an H264 pipeline
+	(using the :ref:`mem2mem` and :ref:`media-request-api`).
+	Metadata associated with the frame to decode are required to
+	be passed through the ``V4L2_CID_MPEG_VIDEO_H264_SPS``,
+	``V4L2_CID_MPEG_VIDEO_H264_PPS``,
+	``V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX``,
+	``V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS`` and
+	``V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS`` controls.  See the
+	:ref:`associated Codec Control IDs <v4l2-mpeg-h264>`.  Exactly
+	one output and one capture buffer must be provided for use
+	with this pixel format. The output buffer must contain the
+	appropriate number of macroblocks to decode a full
+	corresponding frame to the matching capture buffer.
+
+	.. note::
+
+	   This format is not yet part of the public kernel API and it
+	   is expected to change.
+
     * .. _V4L2-PIX-FMT-H263:
 
       - ``V4L2_PIX_FMT_H263``
diff --git a/Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst b/Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst
index 5688c816e334..db43dda5aafb 100644
--- a/Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst
+++ b/Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst
@@ -31,7 +31,20 @@ describing all planes of that format.
 
     * - __u32
       - ``sizeimage``
-      - Maximum size in bytes required for image data in this plane.
+      - Maximum size in bytes required for image data in this plane,
+	set by the driver. When the image consists of variable length
+	compressed data this is the number of bytes required by the
+	codec to support the worst-case compression scenario.
+
+	The driver will set the value for uncompressed images.
+
+	Clients are allowed to set the sizeimage field for variable length
+	compressed data flagged with ``V4L2_FMT_FLAG_COMPRESSED`` at
+	:ref:`VIDIOC_ENUM_FMT`, but the driver may ignore it and set the
+	value itself, or it may modify the provided value based on
+	alignment requirements or minimum/maximum size requirements.
+	If the client wants to leave this to the driver, then it should
+	set sizeimage to 0.
     * - __u32
       - ``bytesperline``
       - Distance in bytes between the leftmost pixels in two adjacent
diff --git a/Documentation/media/uapi/v4l/pixfmt-v4l2.rst b/Documentation/media/uapi/v4l/pixfmt-v4l2.rst
index 71eebfc6d853..da6da2ef139a 100644
--- a/Documentation/media/uapi/v4l/pixfmt-v4l2.rst
+++ b/Documentation/media/uapi/v4l/pixfmt-v4l2.rst
@@ -89,7 +89,18 @@ Single-planar format structure
       - Size in bytes of the buffer to hold a complete image, set by the
 	driver. Usually this is ``bytesperline`` times ``height``. When
 	the image consists of variable length compressed data this is the
-	maximum number of bytes required to hold an image.
+	number of bytes required by the codec to support the worst-case
+	compression scenario.
+
+	The driver will set the value for uncompressed images.
+
+	Clients are allowed to set the sizeimage field for variable length
+	compressed data flagged with ``V4L2_FMT_FLAG_COMPRESSED`` at
+	:ref:`VIDIOC_ENUM_FMT`, but the driver may ignore it and set the
+	value itself, or it may modify the provided value based on
+	alignment requirements or minimum/maximum size requirements.
+	If the client wants to leave this to the driver, then it should
+	set sizeimage to 0.
     * - __u32
       - ``colorspace``
       - Image colorspace, from enum :c:type:`v4l2_colorspace`.
diff --git a/Documentation/media/uapi/v4l/vidioc-qbuf.rst b/Documentation/media/uapi/v4l/vidioc-qbuf.rst
index dbf7b445a27b..407302d80684 100644
--- a/Documentation/media/uapi/v4l/vidioc-qbuf.rst
+++ b/Documentation/media/uapi/v4l/vidioc-qbuf.rst
@@ -139,6 +139,14 @@ may continue as normal, but should be aware that data in the dequeued
 buffer might be corrupted. When using the multi-planar API, the planes
 array must be passed in as well.
 
+If the application sets the ``memory`` field to ``V4L2_MEMORY_DMABUF`` to
+dequeue a :ref:`DMABUF <dmabuf>` buffer, the driver fills the ``m.fd`` field
+with a file descriptor numerically the same as the one given to ``VIDIOC_QBUF``
+when the buffer was enqueued. No new file descriptor is created at dequeue time
+and the value is only for the application convenience. When the multi-planar
+API is used the ``m.fd`` fields of the passed array of struct
+:c:type:`v4l2_plane` are filled instead.
+
 By default ``VIDIOC_DQBUF`` blocks when no buffer is in the outgoing
 queue. When the ``O_NONBLOCK`` flag was given to the
 :ref:`open() <func-open>` function, ``VIDIOC_DQBUF`` returns
diff --git a/Documentation/media/uapi/v4l/vidioc-queryctrl.rst b/Documentation/media/uapi/v4l/vidioc-queryctrl.rst
index f824162d0ea9..dc500632095d 100644
--- a/Documentation/media/uapi/v4l/vidioc-queryctrl.rst
+++ b/Documentation/media/uapi/v4l/vidioc-queryctrl.rst
@@ -443,6 +443,36 @@ See also the examples in :ref:`control`.
       - n/a
       - A struct :c:type:`v4l2_ctrl_mpeg2_quantization`, containing MPEG-2
 	quantization matrices for stateless video decoders.
+    * - ``V4L2_CTRL_TYPE_H264_SPS``
+      - n/a
+      - n/a
+      - n/a
+      - A struct :c:type:`v4l2_ctrl_h264_sps`, containing H264
+	sequence parameters for stateless video decoders.
+    * - ``V4L2_CTRL_TYPE_H264_PPS``
+      - n/a
+      - n/a
+      - n/a
+      - A struct :c:type:`v4l2_ctrl_h264_pps`, containing H264
+	picture parameters for stateless video decoders.
+    * - ``V4L2_CTRL_TYPE_H264_SCALING_MATRIX``
+      - n/a
+      - n/a
+      - n/a
+      - A struct :c:type:`v4l2_ctrl_h264_scaling_matrix`, containing H264
+	scaling matrices for stateless video decoders.
+    * - ``V4L2_CTRL_TYPE_H264_SLICE_PARAMS``
+      - n/a
+      - n/a
+      - n/a
+      - A struct :c:type:`v4l2_ctrl_h264_slice_params`, containing H264
+	slice parameters for stateless video decoders.
+    * - ``V4L2_CTRL_TYPE_H264_DECODE_PARAMS``
+      - n/a
+      - n/a
+      - n/a
+      - A struct :c:type:`v4l2_ctrl_h264_decode_params`, containing H264
+	decode parameters for stateless video decoders.
 
 .. tabularcolumns:: |p{6.6cm}|p{2.2cm}|p{8.7cm}|
 
diff --git a/Documentation/media/v4l-drivers/index.rst b/Documentation/media/v4l-drivers/index.rst
index 33a055907258..c4c78a28654c 100644
--- a/Documentation/media/v4l-drivers/index.rst
+++ b/Documentation/media/v4l-drivers/index.rst
@@ -64,5 +64,6 @@ For more details see the file COPYING in the source distribution of Linux.
 	si476x
 	soc-camera
 	uvcvideo
+	vimc
 	vivid
 	zr364xx
diff --git a/Documentation/media/v4l-drivers/vimc.dot b/Documentation/media/v4l-drivers/vimc.dot
new file mode 100644
index 000000000000..57863a13fa39
--- /dev/null
+++ b/Documentation/media/v4l-drivers/vimc.dot
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0
+
+digraph board {
+	rankdir=TB
+	n00000001 [label="{{} | Sensor A\n/dev/v4l-subdev0 | {<port0> 0}}", shape=Mrecord, style=filled, fillcolor=green]
+	n00000001:port0 -> n00000005:port0 [style=bold]
+	n00000001:port0 -> n0000000b [style=bold]
+	n00000003 [label="{{} | Sensor B\n/dev/v4l-subdev1 | {<port0> 0}}", shape=Mrecord, style=filled, fillcolor=green]
+	n00000003:port0 -> n00000008:port0 [style=bold]
+	n00000003:port0 -> n0000000f [style=bold]
+	n00000005 [label="{{<port0> 0} | Debayer A\n/dev/v4l-subdev2 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
+	n00000005:port1 -> n00000017:port0
+	n00000008 [label="{{<port0> 0} | Debayer B\n/dev/v4l-subdev3 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
+	n00000008:port1 -> n00000017:port0 [style=dashed]
+	n0000000b [label="Raw Capture 0\n/dev/video0", shape=box, style=filled, fillcolor=yellow]
+	n0000000f [label="Raw Capture 1\n/dev/video1", shape=box, style=filled, fillcolor=yellow]
+	n00000013 [label="RGB/YUV Input\n/dev/video2", shape=box, style=filled, fillcolor=yellow]
+	n00000013 -> n00000017:port0 [style=dashed]
+	n00000017 [label="{{<port0> 0} | Scaler\n/dev/v4l-subdev4 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
+	n00000017:port1 -> n0000001a [style=bold]
+	n0000001a [label="RGB/YUV Capture\n/dev/video3", shape=box, style=filled, fillcolor=yellow]
+}
diff --git a/Documentation/media/v4l-drivers/vimc.rst b/Documentation/media/v4l-drivers/vimc.rst
new file mode 100644
index 000000000000..4628b12d417f
--- /dev/null
+++ b/Documentation/media/v4l-drivers/vimc.rst
@@ -0,0 +1,98 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+The Virtual Media Controller Driver (vimc)
+==========================================
+
+The vimc driver emulates complex video hardware using the V4L2 API and the Media
+API. It has a capture device and three subdevices: sensor, debayer and scaler.
+
+Topology
+--------
+
+The topology is hardcoded, although you could modify it in vimc-core and
+recompile the driver to achieve your own topology. This is the default topology:
+
+.. _vimc_topology_graph:
+
+.. kernel-figure:: vimc.dot
+    :alt:   vimc.dot
+    :align: center
+
+    Media pipeline graph on vimc
+
+Configuring the topology
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Each subdevice will come with its default configuration (pixelformat, height,
+width, ...). One needs to configure the topology in order to match the
+configuration on each linked subdevice to stream frames through the pipeline.
+If the configuration doesn't match, the stream will fail. The ``v4l-utils``
+package is a bundle of user-space applications, that comes with ``media-ctl`` and
+``v4l2-ctl`` that can be used to configure the vimc configuration. This sequence
+of commands fits for the default topology:
+
+.. code-block:: bash
+
+        media-ctl -d platform:vimc -V '"Sensor A":0[fmt:SBGGR8_1X8/640x480]'
+        media-ctl -d platform:vimc -V '"Debayer A":0[fmt:SBGGR8_1X8/640x480]'
+        media-ctl -d platform:vimc -V '"Sensor B":0[fmt:SBGGR8_1X8/640x480]'
+        media-ctl -d platform:vimc -V '"Debayer B":0[fmt:SBGGR8_1X8/640x480]'
+        v4l2-ctl -z platform:vimc -d "RGB/YUV Capture" -v width=1920,height=1440
+        v4l2-ctl -z platform:vimc -d "Raw Capture 0" -v pixelformat=BA81
+        v4l2-ctl -z platform:vimc -d "Raw Capture 1" -v pixelformat=BA81
+
+Subdevices
+----------
+
+Subdevices define the behavior of an entity in the topology. Depending on the
+subdevice, the entity can have multiple pads of type source or sink.
+
+vimc-sensor:
+	Generates images in several formats using video test pattern generator.
+	Exposes:
+
+	* 1 Pad source
+
+vimc-debayer:
+	Transforms images in bayer format into a non-bayer format.
+	Exposes:
+
+	* 1 Pad sink
+	* 1 Pad source
+
+vimc-scaler:
+	Scale up the image by a factor of 3. E.g.: a 640x480 image becomes a
+        1920x1440 image. (this value can be configured, see at
+        `Module options`_).
+	Exposes:
+
+	* 1 Pad sink
+	* 1 Pad source
+
+vimc-capture:
+	Exposes node /dev/videoX to allow userspace to capture the stream.
+	Exposes:
+
+	* 1 Pad sink
+	* 1 Pad source
+
+Module options
+---------------
+
+Vimc has a few module parameters to configure the driver. You should pass
+those arguments to each subdevice, not to the vimc module. For example::
+
+        vimc_subdevice.param=value
+
+* ``vimc_scaler.sca_mult=<unsigned int>``
+
+        Image size multiplier factor to be used to multiply both width and
+        height, so the image size will be ``sca_mult^2`` bigger than the
+        original one. Currently, only supports scaling up (the default value
+        is 3).
+
+* ``vimc_debayer.deb_mean_win_size=<unsigned int>``
+
+        Window size to calculate the mean. Note: the window size needs to be an
+        odd number, as the main pixel stays in the center of the window,
+        otherwise the next odd number is considered (the default value is 3).
diff --git a/Documentation/media/v4l-drivers/vivid.rst b/Documentation/media/v4l-drivers/vivid.rst
index edb6f33e029c..7082fec4075d 100644
--- a/Documentation/media/v4l-drivers/vivid.rst
+++ b/Documentation/media/v4l-drivers/vivid.rst
@@ -941,6 +941,11 @@ Digital Video Controls
 	affects the reported colorspace since DVI_D outputs will always use
 	sRGB.
 
+- Display Present:
+
+	sets the presence of a "display" on the HDMI output. This affects
+	the tx_edid_present, tx_hotplug and tx_rxsense controls.
+
 
 FM Radio Receiver Controls
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/media/videodev2.h.rst.exceptions b/Documentation/media/videodev2.h.rst.exceptions
index 64d348e67df9..55cbe324b9fc 100644
--- a/Documentation/media/videodev2.h.rst.exceptions
+++ b/Documentation/media/videodev2.h.rst.exceptions
@@ -136,6 +136,11 @@ replace symbol V4L2_CTRL_TYPE_U32 :c:type:`v4l2_ctrl_type`
 replace symbol V4L2_CTRL_TYPE_U8 :c:type:`v4l2_ctrl_type`
 replace symbol V4L2_CTRL_TYPE_MPEG2_SLICE_PARAMS :c:type:`v4l2_ctrl_type`
 replace symbol V4L2_CTRL_TYPE_MPEG2_QUANTIZATION :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_H264_SPS :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_H264_PPS :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_H264_SCALING_MATRIX :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_H264_SLICE_PARAMS :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_H264_DECODE_PARAMS :c:type:`v4l2_ctrl_type`
 
 # V4L2 capability defines
 replace define V4L2_CAP_VIDEO_CAPTURE device-capabilities
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f70ebcdfe592..1adbb8a371c7 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -3,7 +3,7 @@
 			 ============================
 
 By: David Howells <dhowells@redhat.com>
-    Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+    Paul E. McKenney <paulmck@linux.ibm.com>
     Will Deacon <will.deacon@arm.com>
     Peter Zijlstra <peterz@infradead.org>
 
@@ -548,7 +548,7 @@ There are certain things that the Linux kernel memory barriers do not guarantee:
 
 	[*] For information on bus mastering DMA and coherency please read:
 
-	    Documentation/PCI/pci.txt
+	    Documentation/driver-api/pci/pci.rst
 	    Documentation/DMA-API-HOWTO.txt
 	    Documentation/DMA-API.txt
 
diff --git a/Documentation/memory-devices/ti-emif.txt b/Documentation/memory-devices/ti-emif.txt
deleted file mode 100644
index f4ad9a7d0f4b..000000000000
--- a/Documentation/memory-devices/ti-emif.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-TI EMIF SDRAM Controller Driver:
-
-Author
-========
-Aneesh V <aneesh@ti.com>
-
-Location
-============
-driver/memory/emif.c
-
-Supported SoCs:
-===================
-TI OMAP44xx
-TI OMAP54xx
-
-Menuconfig option:
-==========================
-Device Drivers
-	Memory devices
-		Texas Instruments EMIF driver
-
-Description
-===========
-This driver is for the EMIF module available in Texas Instruments
-SoCs. EMIF is an SDRAM controller that, based on its revision,
-supports one or more of DDR2, DDR3, and LPDDR2 SDRAM protocols.
-This driver takes care of only LPDDR2 memories presently. The
-functions of the driver includes re-configuring AC timing
-parameters and other settings during frequency, voltage and
-temperature changes
-
-Platform Data (see include/linux/platform_data/emif_plat.h):
-=====================================================================
-DDR device details and other board dependent and SoC dependent
-information can be passed through platform data (struct emif_platform_data)
-- DDR device details: 'struct ddr_device_info'
-- Device AC timings: 'struct lpddr2_timings' and 'struct lpddr2_min_tck'
-- Custom configurations: customizable policy options through
-  'struct emif_custom_configs'
-- IP revision
-- PHY type
-
-Interface to the external world:
-================================
-EMIF driver registers notifiers for voltage and frequency changes
-affecting EMIF and takes appropriate actions when these are invoked.
-- freq_pre_notify_handling()
-- freq_post_notify_handling()
-- volt_notify_handling()
-
-Debugfs
-========
-The driver creates two debugfs entries per device.
-- regcache_dump : dump of register values calculated and saved for all
-  frequencies used so far.
-- mr4 : last polled value of MR4 register in the LPDDR2 device. MR4
-  indicates the current temperature level of the device.
diff --git a/Documentation/mic/index.rst b/Documentation/mic/index.rst
new file mode 100644
index 000000000000..3a8d06367ef1
--- /dev/null
+++ b/Documentation/mic/index.rst
@@ -0,0 +1,16 @@
+=============================================
+Intel Many Integrated Core (MIC) architecture
+=============================================
+
+.. toctree::
+    :maxdepth: 1
+
+    mic_overview
+    scif_overview
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/mic/mic_overview.rst b/Documentation/mic/mic_overview.rst
new file mode 100644
index 000000000000..17d956bdaf7c
--- /dev/null
+++ b/Documentation/mic/mic_overview.rst
@@ -0,0 +1,85 @@
+======================================================
+Intel Many Integrated Core (MIC) architecture overview
+======================================================
+
+An Intel MIC X100 device is a PCIe form factor add-in coprocessor
+card based on the Intel Many Integrated Core (MIC) architecture
+that runs a Linux OS. It is a PCIe endpoint in a platform and therefore
+implements the three required standard address spaces i.e. configuration,
+memory and I/O. The host OS loads a device driver as is typical for
+PCIe devices. The card itself runs a bootstrap after reset that
+transfers control to the card OS downloaded from the host driver. The
+host driver supports OSPM suspend and resume operations. It shuts down
+the card during suspend and reboots the card OS during resume.
+The card OS as shipped by Intel is a Linux kernel with modifications
+for the X100 devices.
+
+Since it is a PCIe card, it does not have the ability to host hardware
+devices for networking, storage and console. We provide these devices
+on X100 coprocessors thus enabling a self-bootable equivalent
+environment for applications. A key benefit of our solution is that it
+leverages the standard virtio framework for network, disk and console
+devices, though in our case the virtio framework is used across a PCIe
+bus. A Virtio Over PCIe (VOP) driver allows creating user space
+backends or devices on the host which are used to probe virtio drivers
+for these devices on the MIC card. The existing VRINGH infrastructure
+in the kernel is used to access virtio rings from the host. The card
+VOP driver allows card virtio drivers to communicate with their user
+space backends on the host via a device page. Ring 3 apps on the host
+can add, remove and configure virtio devices. A thin MIC specific
+virtio_config_ops is implemented which is borrowed heavily from
+previous similar implementations in lguest and s390.
+
+MIC PCIe card has a dma controller with 8 channels. These channels are
+shared between the host s/w and the card s/w. 0 to 3 are used by host
+and 4 to 7 by card. As the dma device doesn't show up as PCIe device,
+a virtual bus called mic bus is created and virtual dma devices are
+created on it by the host/card drivers. On host the channels are private
+and used only by the host driver to transfer data for the virtio devices.
+
+The Symmetric Communication Interface (SCIF (pronounced as skiff)) is a
+low level communications API across PCIe currently implemented for MIC.
+More details are available at scif_overview.txt.
+
+The Coprocessor State Management (COSM) driver on the host allows for
+boot, shutdown and reset of Intel MIC devices. It communicates with a COSM
+"client" driver on the MIC cards over SCIF to perform these functions.
+
+Here is a block diagram of the various components described above. The
+virtio backends are situated on the host rather than the card given better
+single threaded performance for the host compared to MIC, the ability of
+the host to initiate DMA's to/from the card using the MIC DMA engine and
+the fact that the virtio block storage backend can only be on the host::
+
+               +----------+           |             +----------+
+               | Card OS  |           |             | Host OS  |
+               +----------+           |             +----------+
+                                      |
+        +-------+ +--------+ +------+ | +---------+  +--------+ +--------+
+        | Virtio| |Virtio  | |Virtio| | |Virtio   |  |Virtio  | |Virtio  |
+        | Net   | |Console | |Block | | |Net      |  |Console | |Block   |
+        | Driver| |Driver  | |Driver| | |backend  |  |backend | |backend |
+        +---+---+ +---+----+ +--+---+ | +---------+  +----+---+ +--------+
+            |         |         |     |      |            |         |
+            |         |         |     |User  |            |         |
+            |         |         |     |------|------------|--+------|-------
+            +---------+---------+     |Kernel                |
+                      |               |                      |
+  +---------+     +---+----+ +------+ | +------+ +------+ +--+---+  +-------+
+  |MIC DMA  |     |  VOP   | | SCIF | | | SCIF | | COSM | | VOP  |  |MIC DMA|
+  +---+-----+     +---+----+ +--+---+ | +--+---+ +--+---+ +------+  +----+--+
+      |               |         |     |    |        |                    |
+  +---+-----+     +---+----+ +--+---+ | +--+---+ +--+---+ +------+  +----+--+
+  |MIC      |     |  VOP   | |SCIF  | | |SCIF  | | COSM | | VOP  |  | MIC   |
+  |HW Bus   |     |  HW Bus| |HW Bus| | |HW Bus| | Bus  | |HW Bus|  |HW Bus |
+  +---------+     +--------+ +--+---+ | +--+---+ +------+ +------+  +-------+
+      |               |         |     |       |     |                    |
+      |   +-----------+--+      |     |       |    +---------------+     |
+      |   |Intel MIC     |      |     |       |    |Intel MIC      |     |
+      |   |Card Driver   |      |     |       |    |Host Driver    |     |
+      +---+--------------+------+     |       +----+---------------+-----+
+                 |                    |                   |
+             +-------------------------------------------------------------+
+             |                                                             |
+             |                    PCIe Bus                                 |
+             +-------------------------------------------------------------+
diff --git a/Documentation/mic/mic_overview.txt b/Documentation/mic/mic_overview.txt
deleted file mode 100644
index 074adbdf83a4..000000000000
--- a/Documentation/mic/mic_overview.txt
+++ /dev/null
@@ -1,81 +0,0 @@
-An Intel MIC X100 device is a PCIe form factor add-in coprocessor
-card based on the Intel Many Integrated Core (MIC) architecture
-that runs a Linux OS. It is a PCIe endpoint in a platform and therefore
-implements the three required standard address spaces i.e. configuration,
-memory and I/O. The host OS loads a device driver as is typical for
-PCIe devices. The card itself runs a bootstrap after reset that
-transfers control to the card OS downloaded from the host driver. The
-host driver supports OSPM suspend and resume operations. It shuts down
-the card during suspend and reboots the card OS during resume.
-The card OS as shipped by Intel is a Linux kernel with modifications
-for the X100 devices.
-
-Since it is a PCIe card, it does not have the ability to host hardware
-devices for networking, storage and console. We provide these devices
-on X100 coprocessors thus enabling a self-bootable equivalent
-environment for applications. A key benefit of our solution is that it
-leverages the standard virtio framework for network, disk and console
-devices, though in our case the virtio framework is used across a PCIe
-bus. A Virtio Over PCIe (VOP) driver allows creating user space
-backends or devices on the host which are used to probe virtio drivers
-for these devices on the MIC card. The existing VRINGH infrastructure
-in the kernel is used to access virtio rings from the host. The card
-VOP driver allows card virtio drivers to communicate with their user
-space backends on the host via a device page. Ring 3 apps on the host
-can add, remove and configure virtio devices. A thin MIC specific
-virtio_config_ops is implemented which is borrowed heavily from
-previous similar implementations in lguest and s390.
-
-MIC PCIe card has a dma controller with 8 channels. These channels are
-shared between the host s/w and the card s/w. 0 to 3 are used by host
-and 4 to 7 by card. As the dma device doesn't show up as PCIe device,
-a virtual bus called mic bus is created and virtual dma devices are
-created on it by the host/card drivers. On host the channels are private
-and used only by the host driver to transfer data for the virtio devices.
-
-The Symmetric Communication Interface (SCIF (pronounced as skiff)) is a
-low level communications API across PCIe currently implemented for MIC.
-More details are available at scif_overview.txt.
-
-The Coprocessor State Management (COSM) driver on the host allows for
-boot, shutdown and reset of Intel MIC devices. It communicates with a COSM
-"client" driver on the MIC cards over SCIF to perform these functions.
-
-Here is a block diagram of the various components described above. The
-virtio backends are situated on the host rather than the card given better
-single threaded performance for the host compared to MIC, the ability of
-the host to initiate DMA's to/from the card using the MIC DMA engine and
-the fact that the virtio block storage backend can only be on the host.
-
-               +----------+           |             +----------+
-               | Card OS  |           |             | Host OS  |
-               +----------+           |             +----------+
-                                      |
-        +-------+ +--------+ +------+ | +---------+  +--------+ +--------+
-        | Virtio| |Virtio  | |Virtio| | |Virtio   |  |Virtio  | |Virtio  |
-        | Net   | |Console | |Block | | |Net      |  |Console | |Block   |
-        | Driver| |Driver  | |Driver| | |backend  |  |backend | |backend |
-        +---+---+ +---+----+ +--+---+ | +---------+  +----+---+ +--------+
-            |         |         |     |      |            |         |
-            |         |         |     |User  |            |         |
-            |         |         |     |------|------------|--+------|-------
-            +---------+---------+     |Kernel                |
-                      |               |                      |
-  +---------+     +---+----+ +------+ | +------+ +------+ +--+---+  +-------+
-  |MIC DMA  |     |  VOP   | | SCIF | | | SCIF | | COSM | | VOP  |  |MIC DMA|
-  +---+-----+     +---+----+ +--+---+ | +--+---+ +--+---+ +------+  +----+--+
-      |               |         |     |    |        |                    |
-  +---+-----+     +---+----+ +--+---+ | +--+---+ +--+---+ +------+  +----+--+
-  |MIC      |     |  VOP   | |SCIF  | | |SCIF  | | COSM | | VOP  |  | MIC   |
-  |HW Bus   |     |  HW Bus| |HW Bus| | |HW Bus| | Bus  | |HW Bus|  |HW Bus |
-  +---------+     +--------+ +--+---+ | +--+---+ +------+ +------+  +-------+
-      |               |         |     |       |     |                    |
-      |   +-----------+--+      |     |       |    +---------------+     |
-      |   |Intel MIC     |      |     |       |    |Intel MIC      |     |
-      |   |Card Driver   |      |     |       |    |Host Driver    |     |
-      +---+--------------+------+     |       +----+---------------+-----+
-                 |                    |                   |
-             +-------------------------------------------------------------+
-             |                                                             |
-             |                    PCIe Bus                                 |
-             +-------------------------------------------------------------+
diff --git a/Documentation/mic/scif_overview.rst b/Documentation/mic/scif_overview.rst
new file mode 100644
index 000000000000..4c8ad9e43706
--- /dev/null
+++ b/Documentation/mic/scif_overview.rst
@@ -0,0 +1,108 @@
+========================================
+Symmetric Communication Interface (SCIF)
+========================================
+
+The Symmetric Communication Interface (SCIF (pronounced as skiff)) is a low
+level communications API across PCIe currently implemented for MIC. Currently
+SCIF provides inter-node communication within a single host platform, where a
+node is a MIC Coprocessor or Xeon based host. SCIF abstracts the details of
+communicating over the PCIe bus while providing an API that is symmetric
+across all the nodes in the PCIe network. An important design objective for SCIF
+is to deliver the maximum possible performance given the communication
+abilities of the hardware. SCIF has been used to implement an offload compiler
+runtime and OFED support for MPI implementations for MIC coprocessors.
+
+SCIF API Components
+===================
+
+The SCIF API has the following parts:
+
+1. Connection establishment using a client server model
+2. Byte stream messaging intended for short messages
+3. Node enumeration to determine online nodes
+4. Poll semantics for detection of incoming connections and messages
+5. Memory registration to pin down pages
+6. Remote memory mapping for low latency CPU accesses via mmap
+7. Remote DMA (RDMA) for high bandwidth DMA transfers
+8. Fence APIs for RDMA synchronization
+
+SCIF exposes the notion of a connection which can be used by peer processes on
+nodes in a SCIF PCIe "network" to share memory "windows" and to communicate. A
+process in a SCIF node initiates a SCIF connection to a peer process on a
+different node via a SCIF "endpoint". SCIF endpoints support messaging APIs
+which are similar to connection oriented socket APIs. Connected SCIF endpoints
+can also register local memory which is followed by data transfer using either
+DMA, CPU copies or remote memory mapping via mmap. SCIF supports both user and
+kernel mode clients which are functionally equivalent.
+
+SCIF Performance for MIC
+========================
+
+DMA bandwidth comparison between the TCP (over ethernet over PCIe) stack versus
+SCIF shows the performance advantages of SCIF for HPC applications and
+runtimes::
+
+             Comparison of TCP and SCIF based BW
+
+  Throughput (GB/sec)
+    8 +                                             PCIe Bandwidth ******
+      +                                                        TCP ######
+    7 +    **************************************             SCIF %%%%%%
+      |                       %%%%%%%%%%%%%%%%%%%
+    6 +                   %%%%
+      |                 %%
+      |               %%%
+    5 +              %%
+      |            %%
+    4 +           %%
+      |          %%
+    3 +         %%
+      |        %
+    2 +      %%
+      |     %%
+      |    %
+    1 +
+      +    ######################################
+    0 +++---+++--+--+-+--+--+-++-+--+-++-+--+-++-+-
+      1       10     100      1000   10000   100000
+                   Transfer Size (KBytes)
+
+SCIF allows memory sharing via mmap(..) between processes on different PCIe
+nodes and thus provides bare-metal PCIe latency. The round trip SCIF mmap
+latency from the host to an x100 MIC for an 8 byte message is 0.44 usecs.
+
+SCIF has a user space library which is a thin IOCTL wrapper providing a user
+space API similar to the kernel API in scif.h. The SCIF user space library
+is distributed @ https://software.intel.com/en-us/mic-developer
+
+Here is some pseudo code for an example of how two applications on two PCIe
+nodes would typically use the SCIF API::
+
+  Process A (on node A)			Process B (on node B)
+
+  /* get online node information */
+  scif_get_node_ids(..)			scif_get_node_ids(..)
+  scif_open(..)				scif_open(..)
+  scif_bind(..)				scif_bind(..)
+  scif_listen(..)
+  scif_accept(..)				scif_connect(..)
+  /* SCIF connection established */
+
+  /* Send and receive short messages */
+  scif_send(..)/scif_recv(..)		scif_send(..)/scif_recv(..)
+
+  /* Register memory */
+  scif_register(..)			scif_register(..)
+
+  /* RDMA */
+  scif_readfrom(..)/scif_writeto(..)	scif_readfrom(..)/scif_writeto(..)
+
+  /* Fence DMAs */
+  scif_fence_signal(..)			scif_fence_signal(..)
+
+  mmap(..)				mmap(..)
+
+  /* Access remote registered memory */
+
+  /* Close the endpoints */
+  scif_close(..)				scif_close(..)
diff --git a/Documentation/mic/scif_overview.txt b/Documentation/mic/scif_overview.txt
deleted file mode 100644
index 0a280d986731..000000000000
--- a/Documentation/mic/scif_overview.txt
+++ /dev/null
@@ -1,98 +0,0 @@
-The Symmetric Communication Interface (SCIF (pronounced as skiff)) is a low
-level communications API across PCIe currently implemented for MIC. Currently
-SCIF provides inter-node communication within a single host platform, where a
-node is a MIC Coprocessor or Xeon based host. SCIF abstracts the details of
-communicating over the PCIe bus while providing an API that is symmetric
-across all the nodes in the PCIe network. An important design objective for SCIF
-is to deliver the maximum possible performance given the communication
-abilities of the hardware. SCIF has been used to implement an offload compiler
-runtime and OFED support for MPI implementations for MIC coprocessors.
-
-==== SCIF API Components ====
-The SCIF API has the following parts:
-1. Connection establishment using a client server model
-2. Byte stream messaging intended for short messages
-3. Node enumeration to determine online nodes
-4. Poll semantics for detection of incoming connections and messages
-5. Memory registration to pin down pages
-6. Remote memory mapping for low latency CPU accesses via mmap
-7. Remote DMA (RDMA) for high bandwidth DMA transfers
-8. Fence APIs for RDMA synchronization
-
-SCIF exposes the notion of a connection which can be used by peer processes on
-nodes in a SCIF PCIe "network" to share memory "windows" and to communicate. A
-process in a SCIF node initiates a SCIF connection to a peer process on a
-different node via a SCIF "endpoint". SCIF endpoints support messaging APIs
-which are similar to connection oriented socket APIs. Connected SCIF endpoints
-can also register local memory which is followed by data transfer using either
-DMA, CPU copies or remote memory mapping via mmap. SCIF supports both user and
-kernel mode clients which are functionally equivalent.
-
-==== SCIF Performance for MIC ====
-DMA bandwidth comparison between the TCP (over ethernet over PCIe) stack versus
-SCIF shows the performance advantages of SCIF for HPC applications and runtimes.
-
-             Comparison of TCP and SCIF based BW
-
-  Throughput (GB/sec)
-    8 +                                             PCIe Bandwidth ******
-      +                                                        TCP ######
-    7 +    **************************************             SCIF %%%%%%
-      |                       %%%%%%%%%%%%%%%%%%%
-    6 +                   %%%%
-      |                 %%
-      |               %%%
-    5 +              %%
-      |            %%
-    4 +           %%
-      |          %%
-    3 +         %%
-      |        %
-    2 +      %%
-      |     %%
-      |    %
-    1 +
-      +    ######################################
-    0 +++---+++--+--+-+--+--+-++-+--+-++-+--+-++-+-
-      1       10     100      1000   10000   100000
-                   Transfer Size (KBytes)
-
-SCIF allows memory sharing via mmap(..) between processes on different PCIe
-nodes and thus provides bare-metal PCIe latency. The round trip SCIF mmap
-latency from the host to an x100 MIC for an 8 byte message is 0.44 usecs.
-
-SCIF has a user space library which is a thin IOCTL wrapper providing a user
-space API similar to the kernel API in scif.h. The SCIF user space library
-is distributed @ https://software.intel.com/en-us/mic-developer
-
-Here is some pseudo code for an example of how two applications on two PCIe
-nodes would typically use the SCIF API:
-
-Process A (on node A)			Process B (on node B)
-
-/* get online node information */
-scif_get_node_ids(..)			scif_get_node_ids(..)
-scif_open(..)				scif_open(..)
-scif_bind(..)				scif_bind(..)
-scif_listen(..)
-scif_accept(..)				scif_connect(..)
-/* SCIF connection established */
-
-/* Send and receive short messages */
-scif_send(..)/scif_recv(..)		scif_send(..)/scif_recv(..)
-
-/* Register memory */
-scif_register(..)			scif_register(..)
-
-/* RDMA */
-scif_readfrom(..)/scif_writeto(..)	scif_readfrom(..)/scif_writeto(..)
-
-/* Fence DMAs */
-scif_fence_signal(..)			scif_fence_signal(..)
-
-mmap(..)				mmap(..)
-
-/* Access remote registered memory */
-
-/* Close the endpoints */
-scif_close(..)				scif_close(..)
diff --git a/Documentation/misc-devices/eeprom b/Documentation/misc-devices/eeprom
deleted file mode 100644
index ba692011f221..000000000000
--- a/Documentation/misc-devices/eeprom
+++ /dev/null
@@ -1,96 +0,0 @@
-Kernel driver eeprom
-====================
-
-Supported chips:
-  * Any EEPROM chip in the designated address range
-    Prefix: 'eeprom'
-    Addresses scanned: I2C 0x50 - 0x57
-    Datasheets: Publicly available from:
-                Atmel (www.atmel.com),
-                Catalyst (www.catsemi.com),
-                Fairchild (www.fairchildsemi.com),
-                Microchip (www.microchip.com),
-                Philips (www.semiconductor.philips.com),
-                Rohm (www.rohm.com),
-                ST (www.st.com),
-                Xicor (www.xicor.com),
-                and others.
-
-        Chip     Size (bits)    Address
-        24C01     1K            0x50 (shadows at 0x51 - 0x57)
-        24C01A    1K            0x50 - 0x57 (Typical device on DIMMs)
-        24C02     2K            0x50 - 0x57
-        24C04     4K            0x50, 0x52, 0x54, 0x56
-                                (additional data at 0x51, 0x53, 0x55, 0x57)
-        24C08     8K            0x50, 0x54 (additional data at 0x51, 0x52,
-                                0x53, 0x55, 0x56, 0x57)
-        24C16    16K            0x50 (additional data at 0x51 - 0x57)
-        Sony      2K            0x57
-
-        Atmel     34C02B  2K    0x50 - 0x57, SW write protect at 0x30-37
-        Catalyst  34FC02  2K    0x50 - 0x57, SW write protect at 0x30-37
-        Catalyst  34RC02  2K    0x50 - 0x57, SW write protect at 0x30-37
-        Fairchild 34W02   2K    0x50 - 0x57, SW write protect at 0x30-37
-        Microchip 24AA52  2K    0x50 - 0x57, SW write protect at 0x30-37
-        ST        M34C02  2K    0x50 - 0x57, SW write protect at 0x30-37
-
-
-Authors:
-        Frodo Looijaard <frodol@dds.nl>,
-        Philip Edelbrock <phil@netroedge.com>,
-        Jean Delvare <jdelvare@suse.de>,
-        Greg Kroah-Hartman <greg@kroah.com>,
-        IBM Corp.
-
-Description
------------
-
-This is a simple EEPROM module meant to enable reading the first 256 bytes
-of an EEPROM (on a SDRAM DIMM for example). However, it will access serial
-EEPROMs on any I2C adapter. The supported devices are generically called
-24Cxx, and are listed above; however the numbering for these
-industry-standard devices may vary by manufacturer.
-
-This module was a programming exercise to get used to the new project
-organization laid out by Frodo, but it should be at least completely
-effective for decoding the contents of EEPROMs on DIMMs.
-
-DIMMS will typically contain a 24C01A or 24C02, or the 34C02 variants.
-The other devices will not be found on a DIMM because they respond to more
-than one address.
-
-DDC Monitors may contain any device. Often a 24C01, which responds to all 8
-addresses, is found.
-
-Recent Sony Vaio laptops have an EEPROM at 0x57. We couldn't get the
-specification, so it is guess work and far from being complete.
-
-The Microchip 24AA52/24LCS52, ST M34C02, and others support an additional
-software write protect register at 0x30 - 0x37 (0x20 less than the memory
-location). The chip responds to "write quick" detection at this address but
-does not respond to byte reads. If this register is present, the lower 128
-bytes of the memory array are not write protected. Any byte data write to
-this address will write protect the memory array permanently, and the
-device will no longer respond at the 0x30-37 address. The eeprom driver
-does not support this register.
-
-Lacking functionality:
-
-* Full support for larger devices (24C04, 24C08, 24C16). These are not
-typically found on a PC. These devices will appear as separate devices at
-multiple addresses.
-
-* Support for really large devices (24C32, 24C64, 24C128, 24C256, 24C512).
-These devices require two-byte address fields and are not supported.
-
-* Enable Writing. Again, no technical reason why not, but making it easy
-to change the contents of the EEPROMs (on DIMMs anyway) also makes it easy
-to disable the DIMMs (potentially preventing the computer from booting)
-until the values are restored somehow.
-
-Use:
-
-After inserting the module (and any other required SMBus/i2c modules), you
-should have some EEPROM directories in /sys/bus/i2c/devices/* of names such
-as "0-0050". Inside each of these is a series of files, the eeprom file
-contains the binary data from EEPROM.
diff --git a/Documentation/misc-devices/eeprom.rst b/Documentation/misc-devices/eeprom.rst
new file mode 100644
index 000000000000..008249675ccc
--- /dev/null
+++ b/Documentation/misc-devices/eeprom.rst
@@ -0,0 +1,107 @@
+====================
+Kernel driver eeprom
+====================
+
+Supported chips:
+
+  * Any EEPROM chip in the designated address range
+
+    Prefix: 'eeprom'
+
+    Addresses scanned: I2C 0x50 - 0x57
+
+    Datasheets: Publicly available from:
+
+                Atmel (www.atmel.com),
+                Catalyst (www.catsemi.com),
+                Fairchild (www.fairchildsemi.com),
+                Microchip (www.microchip.com),
+                Philips (www.semiconductor.philips.com),
+                Rohm (www.rohm.com),
+                ST (www.st.com),
+                Xicor (www.xicor.com),
+                and others.
+
+        ========= ============= ============================================
+        Chip      Size (bits)   Address
+        ========= ============= ============================================
+        24C01     1K            0x50 (shadows at 0x51 - 0x57)
+        24C01A    1K            0x50 - 0x57 (Typical device on DIMMs)
+        24C02     2K            0x50 - 0x57
+        24C04     4K            0x50, 0x52, 0x54, 0x56
+                                (additional data at 0x51, 0x53, 0x55, 0x57)
+        24C08     8K            0x50, 0x54 (additional data at 0x51, 0x52,
+                                0x53, 0x55, 0x56, 0x57)
+        24C16     16K           0x50 (additional data at 0x51 - 0x57)
+        Sony      2K            0x57
+
+        Atmel     34C02B  2K    0x50 - 0x57, SW write protect at 0x30-37
+        Catalyst  34FC02  2K    0x50 - 0x57, SW write protect at 0x30-37
+        Catalyst  34RC02  2K    0x50 - 0x57, SW write protect at 0x30-37
+        Fairchild 34W02   2K    0x50 - 0x57, SW write protect at 0x30-37
+        Microchip 24AA52  2K    0x50 - 0x57, SW write protect at 0x30-37
+        ST        M34C02  2K    0x50 - 0x57, SW write protect at 0x30-37
+        ========= ============= ============================================
+
+
+Authors:
+        - Frodo Looijaard <frodol@dds.nl>,
+        - Philip Edelbrock <phil@netroedge.com>,
+        - Jean Delvare <jdelvare@suse.de>,
+        - Greg Kroah-Hartman <greg@kroah.com>,
+        - IBM Corp.
+
+Description
+-----------
+
+This is a simple EEPROM module meant to enable reading the first 256 bytes
+of an EEPROM (on a SDRAM DIMM for example). However, it will access serial
+EEPROMs on any I2C adapter. The supported devices are generically called
+24Cxx, and are listed above; however the numbering for these
+industry-standard devices may vary by manufacturer.
+
+This module was a programming exercise to get used to the new project
+organization laid out by Frodo, but it should be at least completely
+effective for decoding the contents of EEPROMs on DIMMs.
+
+DIMMS will typically contain a 24C01A or 24C02, or the 34C02 variants.
+The other devices will not be found on a DIMM because they respond to more
+than one address.
+
+DDC Monitors may contain any device. Often a 24C01, which responds to all 8
+addresses, is found.
+
+Recent Sony Vaio laptops have an EEPROM at 0x57. We couldn't get the
+specification, so it is guess work and far from being complete.
+
+The Microchip 24AA52/24LCS52, ST M34C02, and others support an additional
+software write protect register at 0x30 - 0x37 (0x20 less than the memory
+location). The chip responds to "write quick" detection at this address but
+does not respond to byte reads. If this register is present, the lower 128
+bytes of the memory array are not write protected. Any byte data write to
+this address will write protect the memory array permanently, and the
+device will no longer respond at the 0x30-37 address. The eeprom driver
+does not support this register.
+
+Lacking functionality
+---------------------
+
+* Full support for larger devices (24C04, 24C08, 24C16). These are not
+  typically found on a PC. These devices will appear as separate devices at
+  multiple addresses.
+
+* Support for really large devices (24C32, 24C64, 24C128, 24C256, 24C512).
+  These devices require two-byte address fields and are not supported.
+
+* Enable Writing. Again, no technical reason why not, but making it easy
+  to change the contents of the EEPROMs (on DIMMs anyway) also makes it easy
+  to disable the DIMMs (potentially preventing the computer from booting)
+  until the values are restored somehow.
+
+Use
+---
+
+After inserting the module (and any other required SMBus/i2c modules), you
+should have some EEPROM directories in ``/sys/bus/i2c/devices/*`` of names such
+as "0-0050". Inside each of these is a series of files, the eeprom file
+contains the binary data from EEPROM.
diff --git a/Documentation/misc-devices/ics932s401 b/Documentation/misc-devices/ics932s401
deleted file mode 100644
index bdac67ff6e3f..000000000000
--- a/Documentation/misc-devices/ics932s401
+++ /dev/null
@@ -1,31 +0,0 @@
-Kernel driver ics932s401
-======================
-
-Supported chips:
-  * IDT ICS932S401
-    Prefix: 'ics932s401'
-    Addresses scanned: I2C 0x69
-    Datasheet: Publicly available at the IDT website
-
-Author: Darrick J. Wong
-
-Description
------------
-
-This driver implements support for the IDT ICS932S401 chip family.
-
-This chip has 4 clock outputs--a base clock for the CPU (which is likely
-multiplied to get the real CPU clock), a system clock, a PCI clock, a USB
-clock, and a reference clock.  The driver reports selected and actual
-frequency.  If spread spectrum mode is enabled, the driver also reports by what
-percent the clock signal is being spread, which should be between 0 and -0.5%.
-All frequencies are reported in KHz.
-
-The ICS932S401 monitors all inputs continuously. The driver will not read
-the registers more often than once every other second.
-
-Special Features
-----------------
-
-The clocks could be reprogrammed to increase system speed.  I will not help you
-do this, as you risk damaging your system!
diff --git a/Documentation/misc-devices/ics932s401.rst b/Documentation/misc-devices/ics932s401.rst
new file mode 100644
index 000000000000..613ee54a9c21
--- /dev/null
+++ b/Documentation/misc-devices/ics932s401.rst
@@ -0,0 +1,36 @@
+========================
+Kernel driver ics932s401
+========================
+
+Supported chips:
+
+  * IDT ICS932S401
+
+    Prefix: 'ics932s401'
+
+    Addresses scanned: I2C 0x69
+
+    Datasheet: Publicly available at the IDT website
+
+Author: Darrick J. Wong
+
+Description
+-----------
+
+This driver implements support for the IDT ICS932S401 chip family.
+
+This chip has 4 clock outputs--a base clock for the CPU (which is likely
+multiplied to get the real CPU clock), a system clock, a PCI clock, a USB
+clock, and a reference clock.  The driver reports selected and actual
+frequency.  If spread spectrum mode is enabled, the driver also reports by what
+percent the clock signal is being spread, which should be between 0 and -0.5%.
+All frequencies are reported in KHz.
+
+The ICS932S401 monitors all inputs continuously. The driver will not read
+the registers more often than once every other second.
+
+Special Features
+----------------
+
+The clocks could be reprogrammed to increase system speed.  I will not help you
+do this, as you risk damaging your system!
diff --git a/Documentation/misc-devices/index.rst b/Documentation/misc-devices/index.rst
index dfd1f45a3127..a57f92dfe49a 100644
--- a/Documentation/misc-devices/index.rst
+++ b/Documentation/misc-devices/index.rst
@@ -14,4 +14,9 @@ fit into other categories.
 .. toctree::
    :maxdepth: 2
 
+   eeprom
    ibmvmc
+   ics932s401
+   isl29003
+   lis3lv02d
+   max6875
diff --git a/Documentation/misc-devices/isl29003 b/Documentation/misc-devices/isl29003
deleted file mode 100644
index 80b952fd32ff..000000000000
--- a/Documentation/misc-devices/isl29003
+++ /dev/null
@@ -1,62 +0,0 @@
-Kernel driver isl29003
-=====================
-
-Supported chips:
-* Intersil ISL29003
-Prefix: 'isl29003'
-Addresses scanned: none
-Datasheet:
-http://www.intersil.com/data/fn/fn7464.pdf
-
-Author: Daniel Mack <daniel@caiaq.de>
-
-
-Description
------------
-The ISL29003 is an integrated light sensor with a 16-bit integrating type
-ADC, I2C user programmable lux range select for optimized counts/lux, and
-I2C multi-function control and monitoring capabilities. The internal ADC
-provides 16-bit resolution while rejecting 50Hz and 60Hz flicker caused by
-artificial light sources.
-
-The driver allows to set the lux range, the bit resolution, the operational
-mode (see below) and the power state of device and can read the current lux
-value, of course.
-
-
-Detection
----------
-
-The ISL29003 does not have an ID register which could be used to identify
-it, so the detection routine will just try to read from the configured I2C
-address and consider the device to be present as soon as it ACKs the
-transfer.
-
-
-Sysfs entries
--------------
-
-range:
-	0: 0 lux to 1000 lux (default)
-	1: 0 lux to 4000 lux
-	2: 0 lux to 16,000 lux
-	3: 0 lux to 64,000 lux
-
-resolution:
-	0: 2^16 cycles (default)
-	1: 2^12 cycles
-	2: 2^8 cycles
-	3: 2^4 cycles
-
-mode:
-	0: diode1's current (unsigned 16bit) (default)
-	1: diode1's current (unsigned 16bit)
-	2: difference between diodes (l1 - l2, signed 15bit)
-
-power_state:
-	0: device is disabled (default)
-	1: device is enabled
-
-lux (read only):
-	returns the value from the last sensor reading
-
diff --git a/Documentation/misc-devices/isl29003.rst b/Documentation/misc-devices/isl29003.rst
new file mode 100644
index 000000000000..0cc38aed6c00
--- /dev/null
+++ b/Documentation/misc-devices/isl29003.rst
@@ -0,0 +1,75 @@
+======================
+Kernel driver isl29003
+======================
+
+Supported chips:
+
+* Intersil ISL29003
+
+Prefix: 'isl29003'
+
+Addresses scanned: none
+
+Datasheet:
+http://www.intersil.com/data/fn/fn7464.pdf
+
+Author: Daniel Mack <daniel@caiaq.de>
+
+
+Description
+-----------
+The ISL29003 is an integrated light sensor with a 16-bit integrating type
+ADC, I2C user programmable lux range select for optimized counts/lux, and
+I2C multi-function control and monitoring capabilities. The internal ADC
+provides 16-bit resolution while rejecting 50Hz and 60Hz flicker caused by
+artificial light sources.
+
+The driver allows to set the lux range, the bit resolution, the operational
+mode (see below) and the power state of device and can read the current lux
+value, of course.
+
+
+Detection
+---------
+
+The ISL29003 does not have an ID register which could be used to identify
+it, so the detection routine will just try to read from the configured I2C
+address and consider the device to be present as soon as it ACKs the
+transfer.
+
+
+Sysfs entries
+-------------
+
+range:
+        == ===========================
+	0: 0 lux to 1000 lux (default)
+	1: 0 lux to 4000 lux
+	2: 0 lux to 16,000 lux
+	3: 0 lux to 64,000 lux
+        == ===========================
+
+resolution:
+        == =====================
+	0: 2^16 cycles (default)
+	1: 2^12 cycles
+	2: 2^8 cycles
+	3: 2^4 cycles
+        == =====================
+
+mode:
+        == =================================================
+	0: diode1's current (unsigned 16bit) (default)
+	1: diode1's current (unsigned 16bit)
+	2: difference between diodes (l1 - l2, signed 15bit)
+        == =================================================
+
+power_state:
+        == =================================================
+	0: device is disabled (default)
+	1: device is enabled
+        == =================================================
+
+lux (read only):
+	returns the value from the last sensor reading
+
diff --git a/Documentation/misc-devices/lis3lv02d b/Documentation/misc-devices/lis3lv02d
deleted file mode 100644
index f89960a0ff95..000000000000
--- a/Documentation/misc-devices/lis3lv02d
+++ /dev/null
@@ -1,93 +0,0 @@
-Kernel driver lis3lv02d
-=======================
-
-Supported chips:
-
-  * STMicroelectronics LIS3LV02DL, LIS3LV02DQ (12 bits precision)
-  * STMicroelectronics LIS302DL, LIS3L02DQ, LIS331DL (8 bits) and
-    LIS331DLH (16 bits)
-
-Authors:
-        Yan Burman <burman.yan@gmail.com>
-	Eric Piel <eric.piel@tremplin-utc.net>
-
-
-Description
------------
-
-This driver provides support for the accelerometer found in various HP laptops
-sporting the feature officially called "HP Mobile Data Protection System 3D" or
-"HP 3D DriveGuard". It detects automatically laptops with this sensor. Known
-models (full list can be found in drivers/platform/x86/hp_accel.c) will have
-their axis automatically oriented on standard way (eg: you can directly play
-neverball). The accelerometer data is readable via
-/sys/devices/platform/lis3lv02d. Reported values are scaled
-to mg values (1/1000th of earth gravity).
-
-Sysfs attributes under /sys/devices/platform/lis3lv02d/:
-position - 3D position that the accelerometer reports. Format: "(x,y,z)"
-rate - read reports the sampling rate of the accelerometer device in HZ.
-	write changes sampling rate of the accelerometer device.
-	Only values which are supported by HW are accepted.
-selftest - performs selftest for the chip as specified by chip manufacturer.
-
-This driver also provides an absolute input class device, allowing
-the laptop to act as a pinball machine-esque joystick. Joystick device can be
-calibrated. Joystick device can be in two different modes.
-By default output values are scaled between -32768 .. 32767. In joystick raw
-mode, joystick and sysfs position entry have the same scale. There can be
-small difference due to input system fuzziness feature.
-Events are also available as input event device.
-
-Selftest is meant only for hardware diagnostic purposes. It is not meant to be
-used during normal operations. Position data is not corrupted during selftest
-but interrupt behaviour is not guaranteed to work reliably. In test mode, the
-sensing element is internally moved little bit. Selftest measures difference
-between normal mode and test mode. Chip specifications tell the acceptance
-limit for each type of the chip. Limits are provided via platform data
-to allow adjustment of the limits without a change to the actual driver.
-Seltest returns either "OK x y z" or "FAIL x y z" where x, y and z are
-measured difference between modes. Axes are not remapped in selftest mode.
-Measurement values are provided to help HW diagnostic applications to make
-final decision.
-
-On HP laptops, if the led infrastructure is activated, support for a led
-indicating disk protection will be provided as /sys/class/leds/hp::hddprotect.
-
-Another feature of the driver is misc device called "freefall" that
-acts similar to /dev/rtc and reacts on free-fall interrupts received
-from the device. It supports blocking operations, poll/select and
-fasync operation modes. You must read 1 bytes from the device.  The
-result is number of free-fall interrupts since the last successful
-read (or 255 if number of interrupts would not fit). See the freefall.c
-file for an example on using the device.
-
-
-Axes orientation
-----------------
-
-For better compatibility between the various laptops. The values reported by
-the accelerometer are converted into a "standard" organisation of the axes
-(aka "can play neverball out of the box"):
- * When the laptop is horizontal the position reported is about 0 for X and Y
-	and a positive value for Z
- * If the left side is elevated, X increases (becomes positive)
- * If the front side (where the touchpad is) is elevated, Y decreases
-	(becomes negative)
- * If the laptop is put upside-down, Z becomes negative
-
-If your laptop model is not recognized (cf "dmesg"), you can send an
-email to the maintainer to add it to the database.  When reporting a new
-laptop, please include the output of "dmidecode" plus the value of
-/sys/devices/platform/lis3lv02d/position in these four cases.
-
-Q&A
----
-
-Q: How do I safely simulate freefall? I have an HP "portable
-workstation" which has about 3.5kg and a plastic case, so letting it
-fall to the ground is out of question...
-
-A: The sensor is pretty sensitive, so your hands can do it. Lift it
-into free space, follow the fall with your hands for like 10
-centimeters. That should be enough to trigger the detection.
diff --git a/Documentation/misc-devices/lis3lv02d.rst b/Documentation/misc-devices/lis3lv02d.rst
new file mode 100644
index 000000000000..959bd2b822cf
--- /dev/null
+++ b/Documentation/misc-devices/lis3lv02d.rst
@@ -0,0 +1,99 @@
+=======================
+Kernel driver lis3lv02d
+=======================
+
+Supported chips:
+
+  * STMicroelectronics LIS3LV02DL, LIS3LV02DQ (12 bits precision)
+  * STMicroelectronics LIS302DL, LIS3L02DQ, LIS331DL (8 bits) and
+    LIS331DLH (16 bits)
+
+Authors:
+        - Yan Burman <burman.yan@gmail.com>
+	- Eric Piel <eric.piel@tremplin-utc.net>
+
+
+Description
+-----------
+
+This driver provides support for the accelerometer found in various HP laptops
+sporting the feature officially called "HP Mobile Data Protection System 3D" or
+"HP 3D DriveGuard". It detects automatically laptops with this sensor. Known
+models (full list can be found in drivers/platform/x86/hp_accel.c) will have
+their axis automatically oriented on standard way (eg: you can directly play
+neverball). The accelerometer data is readable via
+/sys/devices/platform/lis3lv02d. Reported values are scaled
+to mg values (1/1000th of earth gravity).
+
+Sysfs attributes under /sys/devices/platform/lis3lv02d/:
+
+position
+      - 3D position that the accelerometer reports. Format: "(x,y,z)"
+rate
+      - read reports the sampling rate of the accelerometer device in HZ.
+	write changes sampling rate of the accelerometer device.
+	Only values which are supported by HW are accepted.
+selftest
+      - performs selftest for the chip as specified by chip manufacturer.
+
+This driver also provides an absolute input class device, allowing
+the laptop to act as a pinball machine-esque joystick. Joystick device can be
+calibrated. Joystick device can be in two different modes.
+By default output values are scaled between -32768 .. 32767. In joystick raw
+mode, joystick and sysfs position entry have the same scale. There can be
+small difference due to input system fuzziness feature.
+Events are also available as input event device.
+
+Selftest is meant only for hardware diagnostic purposes. It is not meant to be
+used during normal operations. Position data is not corrupted during selftest
+but interrupt behaviour is not guaranteed to work reliably. In test mode, the
+sensing element is internally moved little bit. Selftest measures difference
+between normal mode and test mode. Chip specifications tell the acceptance
+limit for each type of the chip. Limits are provided via platform data
+to allow adjustment of the limits without a change to the actual driver.
+Seltest returns either "OK x y z" or "FAIL x y z" where x, y and z are
+measured difference between modes. Axes are not remapped in selftest mode.
+Measurement values are provided to help HW diagnostic applications to make
+final decision.
+
+On HP laptops, if the led infrastructure is activated, support for a led
+indicating disk protection will be provided as /sys/class/leds/hp::hddprotect.
+
+Another feature of the driver is misc device called "freefall" that
+acts similar to /dev/rtc and reacts on free-fall interrupts received
+from the device. It supports blocking operations, poll/select and
+fasync operation modes. You must read 1 bytes from the device.  The
+result is number of free-fall interrupts since the last successful
+read (or 255 if number of interrupts would not fit). See the freefall.c
+file for an example on using the device.
+
+
+Axes orientation
+----------------
+
+For better compatibility between the various laptops. The values reported by
+the accelerometer are converted into a "standard" organisation of the axes
+(aka "can play neverball out of the box"):
+
+ * When the laptop is horizontal the position reported is about 0 for X and Y
+   and a positive value for Z
+ * If the left side is elevated, X increases (becomes positive)
+ * If the front side (where the touchpad is) is elevated, Y decreases
+   (becomes negative)
+ * If the laptop is put upside-down, Z becomes negative
+
+If your laptop model is not recognized (cf "dmesg"), you can send an
+email to the maintainer to add it to the database.  When reporting a new
+laptop, please include the output of "dmidecode" plus the value of
+/sys/devices/platform/lis3lv02d/position in these four cases.
+
+Q&A
+---
+
+Q: How do I safely simulate freefall? I have an HP "portable
+workstation" which has about 3.5kg and a plastic case, so letting it
+fall to the ground is out of question...
+
+A: The sensor is pretty sensitive, so your hands can do it. Lift it
+into free space, follow the fall with your hands for like 10
+centimeters. That should be enough to trigger the detection.
diff --git a/Documentation/misc-devices/max6875 b/Documentation/misc-devices/max6875
deleted file mode 100644
index 2f2bd0b17b5d..000000000000
--- a/Documentation/misc-devices/max6875
+++ /dev/null
@@ -1,110 +0,0 @@
-Kernel driver max6875
-=====================
-
-Supported chips:
-  * Maxim MAX6874, MAX6875
-    Prefix: 'max6875'
-    Addresses scanned: None (see below)
-    Datasheet:
-        http://pdfserv.maxim-ic.com/en/ds/MAX6874-MAX6875.pdf
-
-Author: Ben Gardner <bgardner@wabtec.com>
-
-
-Description
------------
-
-The Maxim MAX6875 is an EEPROM-programmable power-supply sequencer/supervisor.
-It provides timed outputs that can be used as a watchdog, if properly wired.
-It also provides 512 bytes of user EEPROM.
-
-At reset, the MAX6875 reads the configuration EEPROM into its configuration
-registers.  The chip then begins to operate according to the values in the
-registers.
-
-The Maxim MAX6874 is a similar, mostly compatible device, with more inputs
-and outputs:
-             vin     gpi    vout
-MAX6874        6       4       8
-MAX6875        4       3       5
-
-See the datasheet for more information.
-
-
-Sysfs entries
--------------
-
-eeprom        - 512 bytes of user-defined EEPROM space.
-
-
-General Remarks
----------------
-
-Valid addresses for the MAX6875 are 0x50 and 0x52.
-Valid addresses for the MAX6874 are 0x50, 0x52, 0x54 and 0x56.
-The driver does not probe any address, so you explicitly instantiate the
-devices.
-
-Example:
-$ modprobe max6875
-$ echo max6875 0x50 > /sys/bus/i2c/devices/i2c-0/new_device
-
-The MAX6874/MAX6875 ignores address bit 0, so this driver attaches to multiple
-addresses.  For example, for address 0x50, it also reserves 0x51.
-The even-address instance is called 'max6875', the odd one is 'dummy'.
-
-
-Programming the chip using i2c-dev
-----------------------------------
-
-Use the i2c-dev interface to access and program the chips.
-Reads and writes are performed differently depending on the address range.
-
-The configuration registers are at addresses 0x00 - 0x45.
-Use i2c_smbus_write_byte_data() to write a register and
-i2c_smbus_read_byte_data() to read a register.
-The command is the register number.
-
-Examples:
-To write a 1 to register 0x45:
-  i2c_smbus_write_byte_data(fd, 0x45, 1);
-
-To read register 0x45:
-  value = i2c_smbus_read_byte_data(fd, 0x45);
-
-
-The configuration EEPROM is at addresses 0x8000 - 0x8045.
-The user EEPROM is at addresses 0x8100 - 0x82ff.
-
-Use i2c_smbus_write_word_data() to write a byte to EEPROM.
-
-The command is the upper byte of the address: 0x80, 0x81, or 0x82.
-The data word is the lower part of the address or'd with data << 8.
-  cmd = address >> 8;
-  val = (address & 0xff) | (data << 8);
-
-Example:
-To write 0x5a to address 0x8003:
-  i2c_smbus_write_word_data(fd, 0x80, 0x5a03);
-
-
-Reading data from the EEPROM is a little more complicated.
-Use i2c_smbus_write_byte_data() to set the read address and then
-i2c_smbus_read_byte() or i2c_smbus_read_i2c_block_data() to read the data.
-
-Example:
-To read data starting at offset 0x8100, first set the address:
-  i2c_smbus_write_byte_data(fd, 0x81, 0x00);
-
-And then read the data
-  value = i2c_smbus_read_byte(fd);
-
-  or
-
-  count = i2c_smbus_read_i2c_block_data(fd, 0x84, 16, buffer);
-
-The block read should read 16 bytes.
-0x84 is the block read command.
-
-See the datasheet for more details.
-
diff --git a/Documentation/misc-devices/max6875.rst b/Documentation/misc-devices/max6875.rst
new file mode 100644
index 000000000000..ad419ac22a5b
--- /dev/null
+++ b/Documentation/misc-devices/max6875.rst
@@ -0,0 +1,136 @@
+=====================
+Kernel driver max6875
+=====================
+
+Supported chips:
+
+  * Maxim MAX6874, MAX6875
+
+    Prefix: 'max6875'
+
+    Addresses scanned: None (see below)
+
+    Datasheet: http://pdfserv.maxim-ic.com/en/ds/MAX6874-MAX6875.pdf
+
+Author: Ben Gardner <bgardner@wabtec.com>
+
+
+Description
+-----------
+
+The Maxim MAX6875 is an EEPROM-programmable power-supply sequencer/supervisor.
+It provides timed outputs that can be used as a watchdog, if properly wired.
+It also provides 512 bytes of user EEPROM.
+
+At reset, the MAX6875 reads the configuration EEPROM into its configuration
+registers.  The chip then begins to operate according to the values in the
+registers.
+
+The Maxim MAX6874 is a similar, mostly compatible device, with more inputs
+and outputs:
+
+===========  ===     ===    ====
+-            vin     gpi    vout
+===========  ===     ===    ====
+MAX6874        6       4       8
+MAX6875        4       3       5
+===========  ===     ===    ====
+
+See the datasheet for more information.
+
+
+Sysfs entries
+-------------
+
+eeprom        - 512 bytes of user-defined EEPROM space.
+
+
+General Remarks
+---------------
+
+Valid addresses for the MAX6875 are 0x50 and 0x52.
+
+Valid addresses for the MAX6874 are 0x50, 0x52, 0x54 and 0x56.
+
+The driver does not probe any address, so you explicitly instantiate the
+devices.
+
+Example::
+
+  $ modprobe max6875
+  $ echo max6875 0x50 > /sys/bus/i2c/devices/i2c-0/new_device
+
+The MAX6874/MAX6875 ignores address bit 0, so this driver attaches to multiple
+addresses.  For example, for address 0x50, it also reserves 0x51.
+The even-address instance is called 'max6875', the odd one is 'dummy'.
+
+
+Programming the chip using i2c-dev
+----------------------------------
+
+Use the i2c-dev interface to access and program the chips.
+
+Reads and writes are performed differently depending on the address range.
+
+The configuration registers are at addresses 0x00 - 0x45.
+
+Use i2c_smbus_write_byte_data() to write a register and
+i2c_smbus_read_byte_data() to read a register.
+
+The command is the register number.
+
+Examples:
+
+To write a 1 to register 0x45::
+
+  i2c_smbus_write_byte_data(fd, 0x45, 1);
+
+To read register 0x45::
+
+  value = i2c_smbus_read_byte_data(fd, 0x45);
+
+
+The configuration EEPROM is at addresses 0x8000 - 0x8045.
+
+The user EEPROM is at addresses 0x8100 - 0x82ff.
+
+Use i2c_smbus_write_word_data() to write a byte to EEPROM.
+
+The command is the upper byte of the address: 0x80, 0x81, or 0x82.
+The data word is the lower part of the address or'd with data << 8::
+
+  cmd = address >> 8;
+  val = (address & 0xff) | (data << 8);
+
+Example:
+
+To write 0x5a to address 0x8003::
+
+  i2c_smbus_write_word_data(fd, 0x80, 0x5a03);
+
+
+Reading data from the EEPROM is a little more complicated.
+
+Use i2c_smbus_write_byte_data() to set the read address and then
+i2c_smbus_read_byte() or i2c_smbus_read_i2c_block_data() to read the data.
+
+Example:
+
+To read data starting at offset 0x8100, first set the address::
+
+  i2c_smbus_write_byte_data(fd, 0x81, 0x00);
+
+And then read the data::
+
+  value = i2c_smbus_read_byte(fd);
+
+or::
+
+  count = i2c_smbus_read_i2c_block_data(fd, 0x84, 16, buffer);
+
+The block read should read 16 bytes.
+
+0x84 is the block read command.
+
+See the datasheet for more details.
+
diff --git a/Documentation/misc-devices/mei/mei-client-bus.txt b/Documentation/misc-devices/mei/mei-client-bus.txt
deleted file mode 100644
index 743be4ec8989..000000000000
--- a/Documentation/misc-devices/mei/mei-client-bus.txt
+++ /dev/null
@@ -1,141 +0,0 @@
-Intel(R) Management Engine (ME) Client bus API
-==============================================
-
-
-Rationale
-=========
-
-MEI misc character device is useful for dedicated applications to send and receive
-data to the many FW appliance found in Intel's ME from the user space.
-However for some of the ME functionalities it make sense to leverage existing software
-stack and expose them through existing kernel subsystems.
-
-In order to plug seamlessly into the kernel device driver model we add kernel virtual
-bus abstraction on top of the MEI driver. This allows implementing linux kernel drivers
-for the various MEI features as a stand alone entities found in their respective subsystem.
-Existing device drivers can even potentially be re-used by adding an MEI CL bus layer to
-the existing code.
-
-
-MEI CL bus API
-==============
-
-A driver implementation for an MEI Client is very similar to existing bus
-based device drivers. The driver registers itself as an MEI CL bus driver through
-the mei_cl_driver structure:
-
-struct mei_cl_driver {
-	struct device_driver driver;
-	const char *name;
-
-	const struct mei_cl_device_id *id_table;
-
-	int (*probe)(struct mei_cl_device *dev, const struct mei_cl_id *id);
-	int (*remove)(struct mei_cl_device *dev);
-};
-
-struct mei_cl_id {
-	char name[MEI_NAME_SIZE];
-	kernel_ulong_t driver_info;
-};
-
-The mei_cl_id structure allows the driver to bind itself against a device name.
-
-To actually register a driver on the ME Client bus one must call the mei_cl_add_driver()
-API. This is typically called at module init time.
-
-Once registered on the ME Client bus, a driver will typically try to do some I/O on
-this bus and this should be done through the mei_cl_send() and mei_cl_recv()
-routines. The latter is synchronous (blocks and sleeps until data shows up).
-In order for drivers to be notified of pending events waiting for them (e.g.
-an Rx event) they can register an event handler through the
-mei_cl_register_event_cb() routine. Currently only the MEI_EVENT_RX event
-will trigger an event handler call and the driver implementation is supposed
-to call mei_recv() from the event handler in order to fetch the pending
-received buffers.
-
-
-Example
-=======
-
-As a theoretical example let's pretend the ME comes with a "contact" NFC IP.
-The driver init and exit routines for this device would look like:
-
-#define CONTACT_DRIVER_NAME "contact"
-
-static struct mei_cl_device_id contact_mei_cl_tbl[] = {
-	{ CONTACT_DRIVER_NAME, },
-
-	/* required last entry */
-	{ }
-};
-MODULE_DEVICE_TABLE(mei_cl, contact_mei_cl_tbl);
-
-static struct mei_cl_driver contact_driver = {
-	.id_table = contact_mei_tbl,
-	.name = CONTACT_DRIVER_NAME,
-
-	.probe = contact_probe,
-	.remove = contact_remove,
-};
-
-static int contact_init(void)
-{
-	int r;
-
-	r = mei_cl_driver_register(&contact_driver);
-	if (r) {
-		pr_err(CONTACT_DRIVER_NAME ": driver registration failed\n");
-		return r;
-	}
-
-	return 0;
-}
-
-static void __exit contact_exit(void)
-{
-	mei_cl_driver_unregister(&contact_driver);
-}
-
-module_init(contact_init);
-module_exit(contact_exit);
-
-And the driver's simplified probe routine would look like that:
-
-int contact_probe(struct mei_cl_device *dev, struct mei_cl_device_id *id)
-{
-	struct contact_driver *contact;
-
-	[...]
-	mei_cl_enable_device(dev);
-
-	mei_cl_register_event_cb(dev, contact_event_cb, contact);
-
-	return 0;
-}
-
-In the probe routine the driver first enable the MEI device and then registers
-an ME bus event handler which is as close as it can get to registering a
-threaded IRQ handler.
-The handler implementation will typically call some I/O routine depending on
-the pending events:
-
-#define MAX_NFC_PAYLOAD 128
-
-static void contact_event_cb(struct mei_cl_device *dev, u32 events,
-			     void *context)
-{
-	struct contact_driver *contact = context;
-
-	if (events & BIT(MEI_EVENT_RX)) {
-		u8 payload[MAX_NFC_PAYLOAD];
-		int payload_size;
-
-		payload_size = mei_recv(dev, payload, MAX_NFC_PAYLOAD);
-		if (payload_size <= 0)
-			return;
-
-		/* Hook to the NFC subsystem */
-		nfc_hci_recv_frame(contact->hdev, payload, payload_size);
-	}
-}
diff --git a/Documentation/misc-devices/mei/mei.txt b/Documentation/misc-devices/mei/mei.txt
deleted file mode 100644
index 2b80a0cd621f..000000000000
--- a/Documentation/misc-devices/mei/mei.txt
+++ /dev/null
@@ -1,266 +0,0 @@
-Intel(R) Management Engine Interface (Intel(R) MEI)
-===================================================
-
-Introduction
-============
-
-The Intel Management Engine (Intel ME) is an isolated and protected computing
-resource (Co-processor) residing inside certain Intel chipsets. The Intel ME
-provides support for computer/IT management features. The feature set
-depends on the Intel chipset SKU.
-
-The Intel Management Engine Interface (Intel MEI, previously known as HECI)
-is the interface between the Host and Intel ME. This interface is exposed
-to the host as a PCI device. The Intel MEI Driver is in charge of the
-communication channel between a host application and the Intel ME feature.
-
-Each Intel ME feature (Intel ME Client) is addressed by a GUID/UUID and
-each client has its own protocol. The protocol is message-based with a
-header and payload up to 512 bytes.
-
-Prominent usage of the Intel ME Interface is to communicate with Intel(R)
-Active Management Technology (Intel AMT) implemented in firmware running on
-the Intel ME.
-
-Intel AMT provides the ability to manage a host remotely out-of-band (OOB)
-even when the operating system running on the host processor has crashed or
-is in a sleep state.
-
-Some examples of Intel AMT usage are:
-   - Monitoring hardware state and platform components
-   - Remote power off/on (useful for green computing or overnight IT
-     maintenance)
-   - OS updates
-   - Storage of useful platform information such as software assets
-   - Built-in hardware KVM
-   - Selective network isolation of Ethernet and IP protocol flows based
-     on policies set by a remote management console
-   - IDE device redirection from remote management console
-
-Intel AMT (OOB) communication is based on SOAP (deprecated
-starting with Release 6.0) over HTTP/S or WS-Management protocol over
-HTTP/S that are received from a remote management console application.
-
-For more information about Intel AMT:
-http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide
-
-
-Intel MEI Driver
-================
-
-The driver exposes a misc device called /dev/mei.
-
-An application maintains communication with an Intel ME feature while
-/dev/mei is open. The binding to a specific feature is performed by calling
-MEI_CONNECT_CLIENT_IOCTL, which passes the desired UUID.
-The number of instances of an Intel ME feature that can be opened
-at the same time depends on the Intel ME feature, but most of the
-features allow only a single instance.
-
-The Intel AMT Host Interface (Intel AMTHI) feature supports multiple
-simultaneous user connected applications. The Intel MEI driver
-handles this internally by maintaining request queues for the applications.
-
-The driver is transparent to data that are passed between firmware feature
-and host application.
-
-Because some of the Intel ME features can change the system
-configuration, the driver by default allows only a privileged
-user to access it.
-
-A code snippet for an application communicating with Intel AMTHI client:
-
-	struct mei_connect_client_data data;
-	fd = open(MEI_DEVICE);
-
-	data.d.in_client_uuid = AMTHI_UUID;
-
-	ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &data);
-
-	printf("Ver=%d, MaxLen=%ld\n",
-			data.d.in_client_uuid.protocol_version,
-			data.d.in_client_uuid.max_msg_length);
-
-	[...]
-
-	write(fd, amthi_req_data, amthi_req_data_len);
-
-	[...]
-
-	read(fd, &amthi_res_data, amthi_res_data_len);
-
-	[...]
-	close(fd);
-
-
-IOCTL
-=====
-
-The Intel MEI Driver supports the following IOCTL commands:
-	IOCTL_MEI_CONNECT_CLIENT	Connect to firmware Feature (client).
-
-	usage:
-		struct mei_connect_client_data clientData;
-		ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &clientData);
-
-	inputs:
-		mei_connect_client_data struct contain the following
-		input field:
-
-		in_client_uuid -	UUID of the FW Feature that needs
-					to connect to.
-	outputs:
-		out_client_properties - Client Properties: MTU and Protocol Version.
-
-	error returns:
-		EINVAL	Wrong IOCTL Number
-		ENODEV	Device or Connection is not initialized or ready.
-			(e.g. Wrong UUID)
-		ENOMEM	Unable to allocate memory to client internal data.
-		EFAULT	Fatal Error (e.g. Unable to access user input data)
-		EBUSY	Connection Already Open
-
-	Notes:
-        max_msg_length (MTU) in client properties describes the maximum
-        data that can be sent or received. (e.g. if MTU=2K, can send
-        requests up to bytes 2k and received responses up to 2k bytes).
-
-	IOCTL_MEI_NOTIFY_SET: enable or disable event notifications
-
-	Usage:
-		uint32_t enable;
-		ioctl(fd, IOCTL_MEI_NOTIFY_SET, &enable);
-
-	Inputs:
-		uint32_t enable = 1;
-		or
-		uint32_t enable[disable] = 0;
-
-	Error returns:
-		EINVAL	Wrong IOCTL Number
-		ENODEV	Device  is not initialized or the client not connected
-		ENOMEM	Unable to allocate memory to client internal data.
-		EFAULT	Fatal Error (e.g. Unable to access user input data)
-		EOPNOTSUPP if the device doesn't support the feature
-
-	Notes:
-	The client must be connected in order to enable notification events
-
-
-	IOCTL_MEI_NOTIFY_GET : retrieve event
-
-	Usage:
-		uint32_t event;
-		ioctl(fd, IOCTL_MEI_NOTIFY_GET, &event);
-
-	Outputs:
-		1 - if an event is pending
-		0 - if there is no even pending
-
-	Error returns:
-		EINVAL	Wrong IOCTL Number
-		ENODEV	Device is not initialized or the client not connected
-		ENOMEM	Unable to allocate memory to client internal data.
-		EFAULT	Fatal Error (e.g. Unable to access user input data)
-		EOPNOTSUPP if the device doesn't support the feature
-
-	Notes:
-	The client must be connected and event notification has to be enabled
-	in order to receive an event
-
-
-Intel ME Applications
-=====================
-
-	1) Intel Local Management Service (Intel LMS)
-
-	   Applications running locally on the platform communicate with Intel AMT Release
-	   2.0 and later releases in the same way that network applications do via SOAP
-	   over HTTP (deprecated starting with Release 6.0) or with WS-Management over
-	   SOAP over HTTP. This means that some Intel AMT features can be accessed from a
-	   local application using the same network interface as a remote application
-	   communicating with Intel AMT over the network.
-
-	   When a local application sends a message addressed to the local Intel AMT host
-	   name, the Intel LMS, which listens for traffic directed to the host name,
-	   intercepts the message and routes it to the Intel MEI.
-	   For more information:
-	   http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide
-	   Under "About Intel AMT" => "Local Access"
-
-	   For downloading Intel LMS:
-	   http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/
-
-	   The Intel LMS opens a connection using the Intel MEI driver to the Intel LMS
-	   firmware feature using a defined UUID and then communicates with the feature
-	   using a protocol called Intel AMT Port Forwarding Protocol (Intel APF protocol).
-	   The protocol is used to maintain multiple sessions with Intel AMT from a
-	   single application.
-
-	   See the protocol specification in the Intel AMT Software Development Kit (SDK)
-	   http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide
-	   Under "SDK Resources" => "Intel(R) vPro(TM) Gateway (MPS)"
-	   => "Information for Intel(R) vPro(TM) Gateway Developers"
-	   => "Description of the Intel AMT Port Forwarding (APF) Protocol"
-
-	2) Intel AMT Remote configuration using a Local Agent
-
-	   A Local Agent enables IT personnel to configure Intel AMT out-of-the-box
-	   without requiring installing additional data to enable setup. The remote
-	   configuration process may involve an ISV-developed remote configuration
-	   agent that runs on the host.
-	   For more information:
-	   http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide
-	   Under "Setup and Configuration of Intel AMT" =>
-	   "SDK Tools Supporting Setup and Configuration" =>
-	   "Using the Local Agent Sample"
-
-	   An open source Intel AMT configuration utility,	implementing a local agent
-	   that accesses the Intel MEI driver, can be found here:
-	   http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/
-
-
-Intel AMT OS Health Watchdog
-============================
-
-The Intel AMT Watchdog is an OS Health (Hang/Crash) watchdog.
-Whenever the OS hangs or crashes, Intel AMT will send an event
-to any subscriber to this event. This mechanism means that
-IT knows when a platform crashes even when there is a hard failure on the host.
-
-The Intel AMT Watchdog is composed of two parts:
-	1) Firmware feature - receives the heartbeats
-	   and sends an event when the heartbeats stop.
-	2) Intel MEI iAMT watchdog driver - connects to the watchdog feature,
-	   configures the watchdog and sends the heartbeats.
-
-The Intel iAMT watchdog MEI driver uses the kernel watchdog API to configure
-the Intel AMT Watchdog and to send heartbeats to it. The default timeout of the
-watchdog is 120 seconds.
-
-If the Intel AMT is not enabled in the firmware then the watchdog client won't enumerate
-on the me client bus and watchdog devices won't be exposed.
-
-
-Supported Chipsets
-==================
-
-7 Series Chipset Family
-6 Series Chipset Family
-5 Series Chipset Family
-4 Series Chipset Family
-Mobile 4 Series Chipset Family
-ICH9
-82946GZ/GL
-82G35 Express
-82Q963/Q965
-82P965/G965
-Mobile PM965/GM965
-Mobile GME965/GLE960
-82Q35 Express
-82G33/G31/P35/P31 Express
-82Q33 Express
-82X38/X48 Express
-
----
-linux-mei@linux.intel.com
diff --git a/Documentation/mmc/mmc-async-req.txt b/Documentation/mmc/mmc-async-req.txt
deleted file mode 100644
index ae1907b10e4a..000000000000
--- a/Documentation/mmc/mmc-async-req.txt
+++ /dev/null
@@ -1,87 +0,0 @@
-Rationale
-=========
-
-How significant is the cache maintenance overhead?
-It depends. Fast eMMC and multiple cache levels with speculative cache
-pre-fetch makes the cache overhead relatively significant. If the DMA
-preparations for the next request are done in parallel with the current
-transfer, the DMA preparation overhead would not affect the MMC performance.
-The intention of non-blocking (asynchronous) MMC requests is to minimize the
-time between when an MMC request ends and another MMC request begins.
-Using mmc_wait_for_req(), the MMC controller is idle while dma_map_sg and
-dma_unmap_sg are processing. Using non-blocking MMC requests makes it
-possible to prepare the caches for next job in parallel with an active
-MMC request.
-
-MMC block driver
-================
-
-The mmc_blk_issue_rw_rq() in the MMC block driver is made non-blocking.
-The increase in throughput is proportional to the time it takes to
-prepare (major part of preparations are dma_map_sg() and dma_unmap_sg())
-a request and how fast the memory is. The faster the MMC/SD is the
-more significant the prepare request time becomes. Roughly the expected
-performance gain is 5% for large writes and 10% on large reads on a L2 cache
-platform. In power save mode, when clocks run on a lower frequency, the DMA
-preparation may cost even more. As long as these slower preparations are run
-in parallel with the transfer performance won't be affected.
-
-Details on measurements from IOZone and mmc_test
-================================================
-
-https://wiki.linaro.org/WorkingGroups/Kernel/Specs/StoragePerfMMC-async-req
-
-MMC core API extension
-======================
-
-There is one new public function mmc_start_req().
-It starts a new MMC command request for a host. The function isn't
-truly non-blocking. If there is an ongoing async request it waits
-for completion of that request and starts the new one and returns. It
-doesn't wait for the new request to complete. If there is no ongoing
-request it starts the new request and returns immediately.
-
-MMC host extensions
-===================
-
-There are two optional members in the mmc_host_ops -- pre_req() and
-post_req() -- that the host driver may implement in order to move work
-to before and after the actual mmc_host_ops.request() function is called.
-In the DMA case pre_req() may do dma_map_sg() and prepare the DMA
-descriptor, and post_req() runs the dma_unmap_sg().
-
-Optimize for the first request
-==============================
-
-The first request in a series of requests can't be prepared in parallel
-with the previous transfer, since there is no previous request.
-The argument is_first_req in pre_req() indicates that there is no previous
-request. The host driver may optimize for this scenario to minimize
-the performance loss. A way to optimize for this is to split the current
-request in two chunks, prepare the first chunk and start the request,
-and finally prepare the second chunk and start the transfer.
-
-Pseudocode to handle is_first_req scenario with minimal prepare overhead:
-
-if (is_first_req && req->size > threshold)
-   /* start MMC transfer for the complete transfer size */
-   mmc_start_command(MMC_CMD_TRANSFER_FULL_SIZE);
-
-   /*
-    * Begin to prepare DMA while cmd is being processed by MMC.
-    * The first chunk of the request should take the same time
-    * to prepare as the "MMC process command time".
-    * If prepare time exceeds MMC cmd time
-    * the transfer is delayed, guesstimate max 4k as first chunk size.
-    */
-    prepare_1st_chunk_for_dma(req);
-    /* flush pending desc to the DMAC (dmaengine.h) */
-    dma_issue_pending(req->dma_desc);
-
-    prepare_2nd_chunk_for_dma(req);
-    /*
-     * The second issue_pending should be called before MMC runs out
-     * of the first chunk. If the MMC runs out of the first data chunk
-     * before this call, the transfer is delayed.
-     */
-    dma_issue_pending(req->dma_desc);
diff --git a/Documentation/mmc/mmc-dev-attrs.txt b/Documentation/mmc/mmc-dev-attrs.txt
deleted file mode 100644
index 4ad0bb17f343..000000000000
--- a/Documentation/mmc/mmc-dev-attrs.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-SD and MMC Block Device Attributes
-==================================
-
-These attributes are defined for the block devices associated with the
-SD or MMC device.
-
-The following attributes are read/write.
-
-	force_ro		Enforce read-only access even if write protect switch is off.
-
-SD and MMC Device Attributes
-============================
-
-All attributes are read-only.
-
-	cid			Card Identification Register
-	csd			Card Specific Data Register
-	scr			SD Card Configuration Register (SD only)
-	date			Manufacturing Date (from CID Register)
-	fwrev			Firmware/Product Revision (from CID Register) (SD and MMCv1 only)
-	hwrev			Hardware/Product Revision (from CID Register) (SD and MMCv1 only)
-	manfid			Manufacturer ID (from CID Register)
-	name			Product Name (from CID Register)
-	oemid			OEM/Application ID (from CID Register)
-	prv			Product Revision (from CID Register) (SD and MMCv4 only)
-	serial			Product Serial Number (from CID Register)
-	erase_size		Erase group size
-	preferred_erase_size	Preferred erase size
-	raw_rpmb_size_mult	RPMB partition size
-	rel_sectors		Reliable write sector count
-	ocr 			Operation Conditions Register
-	dsr			Driver Stage Register
-	cmdq_en			Command Queue enabled: 1 => enabled, 0 => not enabled
-
-Note on Erase Size and Preferred Erase Size:
-
-	"erase_size" is the  minimum size, in bytes, of an erase
-	operation.  For MMC, "erase_size" is the erase group size
-	reported by the card.  Note that "erase_size" does not apply
-	to trim or secure trim operations where the minimum size is
-	always one 512 byte sector.  For SD, "erase_size" is 512
-	if the card is block-addressed, 0 otherwise.
-
-	SD/MMC cards can erase an arbitrarily large area up to and
-	including the whole card.  When erasing a large area it may
-	be desirable to do it in smaller chunks for three reasons:
-		1. A single erase command will make all other I/O on
-		the card wait.  This is not a problem if the whole card
-		is being erased, but erasing one partition will make
-		I/O for another partition on the same card wait for the
-		duration of the erase - which could be a several
-		minutes.
-		2. To be able to inform the user of erase progress.
-		3. The erase timeout becomes too large to be very
-		useful.  Because the erase timeout contains a margin
-		which is multiplied by the size of the erase area,
-		the value can end up being several minutes for large
-		areas.
-
-	"erase_size" is not the most efficient unit to erase
-	(especially for SD where it is just one sector),
-	hence "preferred_erase_size" provides a good chunk
-	size for erasing large areas.
-
-	For MMC, "preferred_erase_size" is the high-capacity
-	erase size if a card specifies one, otherwise it is
-	based on the capacity of the card.
-
-	For SD, "preferred_erase_size" is the allocation unit
-	size specified by the card.
-
-	"preferred_erase_size" is in bytes.
-
-Note on raw_rpmb_size_mult:
-	"raw_rpmb_size_mult" is a multiple of 128kB block.
-	RPMB size in byte is calculated by using the following equation:
-	RPMB partition size = 128kB x raw_rpmb_size_mult
diff --git a/Documentation/mmc/mmc-dev-parts.txt b/Documentation/mmc/mmc-dev-parts.txt
deleted file mode 100644
index f08d078d43cf..000000000000
--- a/Documentation/mmc/mmc-dev-parts.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-SD and MMC Device Partitions
-============================
-
-Device partitions are additional logical block devices present on the
-SD/MMC device.
-
-As of this writing, MMC boot partitions as supported and exposed as
-/dev/mmcblkXboot0 and /dev/mmcblkXboot1, where X is the index of the
-parent /dev/mmcblkX.
-
-MMC Boot Partitions
-===================
-
-Read and write access is provided to the two MMC boot partitions. Due to
-the sensitive nature of the boot partition contents, which often store
-a bootloader or bootloader configuration tables crucial to booting the
-platform, write access is disabled by default to reduce the chance of
-accidental bricking.
-
-To enable write access to /dev/mmcblkXbootY, disable the forced read-only
-access with:
-
-echo 0 > /sys/block/mmcblkXbootY/force_ro
-
-To re-enable read-only access:
-
-echo 1 > /sys/block/mmcblkXbootY/force_ro
-
-The boot partitions can also be locked read only until the next power on,
-with:
-
-echo 1 > /sys/block/mmcblkXbootY/ro_lock_until_next_power_on
-
-This is a feature of the card and not of the kernel. If the card does
-not support boot partition locking, the file will not exist. If the
-feature has been disabled on the card, the file will be read-only.
-
-The boot partitions can also be locked permanently, but this feature is
-not accessible through sysfs in order to avoid accidental or malicious
-bricking.
diff --git a/Documentation/mmc/mmc-tools.txt b/Documentation/mmc/mmc-tools.txt
deleted file mode 100644
index 735509c165d5..000000000000
--- a/Documentation/mmc/mmc-tools.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-MMC tools introduction
-======================
-
-There is one MMC test tools called mmc-utils, which is maintained by Chris Ball,
-you can find it at the below public git repository:
-http://git.kernel.org/cgit/linux/kernel/git/cjb/mmc-utils.git/
-
-Functions
-=========
-
-The mmc-utils tools can do the following:
- - Print and parse extcsd data.
- - Determine the eMMC writeprotect status.
- - Set the eMMC writeprotect status.
- - Set the eMMC data sector size to 4KB by disabling emulation.
- - Create general purpose partition.
- - Enable the enhanced user area.
- - Enable write reliability per partition.
- - Print the response to STATUS_SEND (CMD13).
- - Enable the boot partition.
- - Set Boot Bus Conditions.
- - Enable the eMMC BKOPS feature.
- - Permanently enable the eMMC H/W Reset feature.
- - Permanently disable the eMMC H/W Reset feature.
- - Send Sanitize command.
- - Program authentication key for the device.
- - Counter value for the rpmb device will be read to stdout.
- - Read from rpmb device to output.
- - Write to rpmb device from data file.
- - Enable the eMMC cache feature.
- - Disable the eMMC cache feature.
- - Print and parse CID data.
- - Print and parse CSD data.
- - Print and parse SCR data.
diff --git a/Documentation/mtd/intel-spi.txt b/Documentation/mtd/intel-spi.txt
deleted file mode 100644
index bc357729c2cb..000000000000
--- a/Documentation/mtd/intel-spi.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-Upgrading BIOS using intel-spi
-------------------------------
-
-Many Intel CPUs like Baytrail and Braswell include SPI serial flash host
-controller which is used to hold BIOS and other platform specific data.
-Since contents of the SPI serial flash is crucial for machine to function,
-it is typically protected by different hardware protection mechanisms to
-avoid accidental (or on purpose) overwrite of the content.
-
-Not all manufacturers protect the SPI serial flash, mainly because it
-allows upgrading the BIOS image directly from an OS.
-
-The intel-spi driver makes it possible to read and write the SPI serial
-flash, if certain protection bits are not set and locked. If it finds
-any of them set, the whole MTD device is made read-only to prevent
-partial overwrites. By default the driver exposes SPI serial flash
-contents as read-only but it can be changed from kernel command line,
-passing "intel-spi.writeable=1".
-
-Please keep in mind that overwriting the BIOS image on SPI serial flash
-might render the machine unbootable and requires special equipment like
-Dediprog to revive. You have been warned!
-
-Below are the steps how to upgrade MinnowBoard MAX BIOS directly from
-Linux.
-
- 1) Download and extract the latest Minnowboard MAX BIOS SPI image
-    [1]. At the time writing this the latest image is v92.
-
- 2) Install mtd-utils package [2]. We need this in order to erase the SPI
-    serial flash. Distros like Debian and Fedora have this prepackaged with
-    name "mtd-utils".
-
- 3) Add "intel-spi.writeable=1" to the kernel command line and reboot
-    the board (you can also reload the driver passing "writeable=1" as
-    module parameter to modprobe).
-
- 4) Once the board is up and running again, find the right MTD partition
-    (it is named as "BIOS"):
-
-    # cat /proc/mtd
-    dev:    size   erasesize  name
-    mtd0: 00800000 00001000 "BIOS"
-
-    So here it will be /dev/mtd0 but it may vary.
-
- 5) Make backup of the existing image first:
-
-    # dd if=/dev/mtd0ro of=bios.bak
-    16384+0 records in
-    16384+0 records out
-    8388608 bytes (8.4 MB) copied, 10.0269 s, 837 kB/s
-
- 6) Verify the backup
-
-    # sha1sum /dev/mtd0ro bios.bak
-    fdbb011920572ca6c991377c4b418a0502668b73  /dev/mtd0ro
-    fdbb011920572ca6c991377c4b418a0502668b73  bios.bak
-
-    The SHA1 sums must match. Otherwise do not continue any further!
-
- 7) Erase the SPI serial flash. After this step, do not reboot the
-    board! Otherwise it will not start anymore.
-
-    # flash_erase /dev/mtd0 0 0
-    Erasing 4 Kibyte @ 7ff000 -- 100 % complete
-
- 8) Once completed without errors you can write the new BIOS image:
-
-    # dd if=MNW2MAX1.X64.0092.R01.1605221712.bin of=/dev/mtd0
-
- 9) Verify that the new content of the SPI serial flash matches the new
-    BIOS image:
-
-    # sha1sum /dev/mtd0ro MNW2MAX1.X64.0092.R01.1605221712.bin
-    9b4df9e4be2057fceec3a5529ec3d950836c87a2  /dev/mtd0ro
-    9b4df9e4be2057fceec3a5529ec3d950836c87a2 MNW2MAX1.X64.0092.R01.1605221712.bin
-
-    The SHA1 sums should match.
-
- 10) Now you can reboot your board and observe the new BIOS starting up
-     properly.
-
-References
-----------
-
-[1] https://firmware.intel.com/sites/default/files/MinnowBoard.MAX_.X64.92.R01.zip
-[2] http://www.linux-mtd.infradead.org/
diff --git a/Documentation/mtd/nand_ecc.txt b/Documentation/mtd/nand_ecc.txt
deleted file mode 100644
index f8c3284bf6a7..000000000000
--- a/Documentation/mtd/nand_ecc.txt
+++ /dev/null
@@ -1,714 +0,0 @@
-Introduction
-============
-
-Having looked at the linux mtd/nand driver and more specific at nand_ecc.c
-I felt there was room for optimisation. I bashed the code for a few hours
-performing tricks like table lookup removing superfluous code etc.
-After that the speed was increased by 35-40%.
-Still I was not too happy as I felt there was additional room for improvement.
-
-Bad! I was hooked.
-I decided to annotate my steps in this file. Perhaps it is useful to someone
-or someone learns something from it.
-
-
-The problem
-===========
-
-NAND flash (at least SLC one) typically has sectors of 256 bytes.
-However NAND flash is not extremely reliable so some error detection
-(and sometimes correction) is needed.
-
-This is done by means of a Hamming code. I'll try to explain it in
-laymans terms (and apologies to all the pro's in the field in case I do
-not use the right terminology, my coding theory class was almost 30
-years ago, and I must admit it was not one of my favourites).
-
-As I said before the ecc calculation is performed on sectors of 256
-bytes. This is done by calculating several parity bits over the rows and
-columns. The parity used is even parity which means that the parity bit = 1
-if the data over which the parity is calculated is 1 and the parity bit = 0
-if the data over which the parity is calculated is 0. So the total
-number of bits over the data over which the parity is calculated + the
-parity bit is even. (see wikipedia if you can't follow this).
-Parity is often calculated by means of an exclusive or operation,
-sometimes also referred to as xor. In C the operator for xor is ^
-
-Back to ecc.
-Let's give a small figure:
-
-byte   0:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp0 rp2 rp4 ... rp14
-byte   1:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp1 rp2 rp4 ... rp14
-byte   2:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp0 rp3 rp4 ... rp14
-byte   3:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp1 rp3 rp4 ... rp14
-byte   4:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp0 rp2 rp5 ... rp14
-....
-byte 254:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp0 rp3 rp5 ... rp15
-byte 255:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp1 rp3 rp5 ... rp15
-           cp1  cp0  cp1  cp0  cp1  cp0  cp1  cp0
-           cp3  cp3  cp2  cp2  cp3  cp3  cp2  cp2
-           cp5  cp5  cp5  cp5  cp4  cp4  cp4  cp4
-
-This figure represents a sector of 256 bytes.
-cp is my abbreviation for column parity, rp for row parity.
-
-Let's start to explain column parity.
-cp0 is the parity that belongs to all bit0, bit2, bit4, bit6.
-so the sum of all bit0, bit2, bit4 and bit6 values + cp0 itself is even.
-Similarly cp1 is the sum of all bit1, bit3, bit5 and bit7.
-cp2 is the parity over bit0, bit1, bit4 and bit5
-cp3 is the parity over bit2, bit3, bit6 and bit7.
-cp4 is the parity over bit0, bit1, bit2 and bit3.
-cp5 is the parity over bit4, bit5, bit6 and bit7.
-Note that each of cp0 .. cp5 is exactly one bit.
-
-Row parity actually works almost the same.
-rp0 is the parity of all even bytes (0, 2, 4, 6, ... 252, 254)
-rp1 is the parity of all odd bytes (1, 3, 5, 7, ..., 253, 255)
-rp2 is the parity of all bytes 0, 1, 4, 5, 8, 9, ...
-(so handle two bytes, then skip 2 bytes).
-rp3 is covers the half rp2 does not cover (bytes 2, 3, 6, 7, 10, 11, ...)
-for rp4 the rule is cover 4 bytes, skip 4 bytes, cover 4 bytes, skip 4 etc.
-so rp4 calculates parity over bytes 0, 1, 2, 3, 8, 9, 10, 11, 16, ...)
-and rp5 covers the other half, so bytes 4, 5, 6, 7, 12, 13, 14, 15, 20, ..
-The story now becomes quite boring. I guess you get the idea.
-rp6 covers 8 bytes then skips 8 etc
-rp7 skips 8 bytes then covers 8 etc
-rp8 covers 16 bytes then skips 16 etc
-rp9 skips 16 bytes then covers 16 etc
-rp10 covers 32 bytes then skips 32 etc
-rp11 skips 32 bytes then covers 32 etc
-rp12 covers 64 bytes then skips 64 etc
-rp13 skips 64 bytes then covers 64 etc
-rp14 covers 128 bytes then skips 128
-rp15 skips 128 bytes then covers 128
-
-In the end the parity bits are grouped together in three bytes as
-follows:
-ECC    Bit 7 Bit 6 Bit 5 Bit 4 Bit 3 Bit 2 Bit 1 Bit 0
-ECC 0   rp07  rp06  rp05  rp04  rp03  rp02  rp01  rp00
-ECC 1   rp15  rp14  rp13  rp12  rp11  rp10  rp09  rp08
-ECC 2   cp5   cp4   cp3   cp2   cp1   cp0      1     1
-
-I detected after writing this that ST application note AN1823
-(http://www.st.com/stonline/) gives a much
-nicer picture.(but they use line parity as term where I use row parity)
-Oh well, I'm graphically challenged, so suffer with me for a moment :-)
-And I could not reuse the ST picture anyway for copyright reasons.
-
-
-Attempt 0
-=========
-
-Implementing the parity calculation is pretty simple.
-In C pseudocode:
-for (i = 0; i < 256; i++)
-{
-    if (i & 0x01)
-       rp1 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1;
-    else
-       rp0 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp0;
-    if (i & 0x02)
-       rp3 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp3;
-    else
-       rp2 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp2;
-    if (i & 0x04)
-      rp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp5;
-    else
-      rp4 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp4;
-    if (i & 0x08)
-      rp7 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp7;
-    else
-      rp6 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp6;
-    if (i & 0x10)
-      rp9 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp9;
-    else
-      rp8 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp8;
-    if (i & 0x20)
-      rp11 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp11;
-    else
-      rp10 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp10;
-    if (i & 0x40)
-      rp13 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp13;
-    else
-      rp12 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp12;
-    if (i & 0x80)
-      rp15 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp15;
-    else
-      rp14 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp14;
-    cp0 = bit6 ^ bit4 ^ bit2 ^ bit0 ^ cp0;
-    cp1 = bit7 ^ bit5 ^ bit3 ^ bit1 ^ cp1;
-    cp2 = bit5 ^ bit4 ^ bit1 ^ bit0 ^ cp2;
-    cp3 = bit7 ^ bit6 ^ bit3 ^ bit2 ^ cp3
-    cp4 = bit3 ^ bit2 ^ bit1 ^ bit0 ^ cp4
-    cp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ cp5
-}
-
-
-Analysis 0
-==========
-
-C does have bitwise operators but not really operators to do the above
-efficiently (and most hardware has no such instructions either).
-Therefore without implementing this it was clear that the code above was
-not going to bring me a Nobel prize :-)
-
-Fortunately the exclusive or operation is commutative, so we can combine
-the values in any order. So instead of calculating all the bits
-individually, let us try to rearrange things.
-For the column parity this is easy. We can just xor the bytes and in the
-end filter out the relevant bits. This is pretty nice as it will bring
-all cp calculation out of the for loop.
-
-Similarly we can first xor the bytes for the various rows.
-This leads to:
-
-
-Attempt 1
-=========
-
-const char parity[256] = {
-    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0
-};
-
-void ecc1(const unsigned char *buf, unsigned char *code)
-{
-    int i;
-    const unsigned char *bp = buf;
-    unsigned char cur;
-    unsigned char rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
-    unsigned char rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15;
-    unsigned char par;
-
-    par = 0;
-    rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0;
-    rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0;
-    rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0;
-    rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0;
-
-    for (i = 0; i < 256; i++)
-    {
-        cur = *bp++;
-        par ^= cur;
-        if (i & 0x01) rp1 ^= cur; else rp0 ^= cur;
-        if (i & 0x02) rp3 ^= cur; else rp2 ^= cur;
-        if (i & 0x04) rp5 ^= cur; else rp4 ^= cur;
-        if (i & 0x08) rp7 ^= cur; else rp6 ^= cur;
-        if (i & 0x10) rp9 ^= cur; else rp8 ^= cur;
-        if (i & 0x20) rp11 ^= cur; else rp10 ^= cur;
-        if (i & 0x40) rp13 ^= cur; else rp12 ^= cur;
-        if (i & 0x80) rp15 ^= cur; else rp14 ^= cur;
-    }
-    code[0] =
-        (parity[rp7] << 7) |
-        (parity[rp6] << 6) |
-        (parity[rp5] << 5) |
-        (parity[rp4] << 4) |
-        (parity[rp3] << 3) |
-        (parity[rp2] << 2) |
-        (parity[rp1] << 1) |
-        (parity[rp0]);
-    code[1] =
-        (parity[rp15] << 7) |
-        (parity[rp14] << 6) |
-        (parity[rp13] << 5) |
-        (parity[rp12] << 4) |
-        (parity[rp11] << 3) |
-        (parity[rp10] << 2) |
-        (parity[rp9]  << 1) |
-        (parity[rp8]);
-    code[2] =
-        (parity[par & 0xf0] << 7) |
-        (parity[par & 0x0f] << 6) |
-        (parity[par & 0xcc] << 5) |
-        (parity[par & 0x33] << 4) |
-        (parity[par & 0xaa] << 3) |
-        (parity[par & 0x55] << 2);
-    code[0] = ~code[0];
-    code[1] = ~code[1];
-    code[2] = ~code[2];
-}
-
-Still pretty straightforward. The last three invert statements are there to
-give a checksum of 0xff 0xff 0xff for an empty flash. In an empty flash
-all data is 0xff, so the checksum then matches.
-
-I also introduced the parity lookup. I expected this to be the fastest
-way to calculate the parity, but I will investigate alternatives later
-on.
-
-
-Analysis 1
-==========
-
-The code works, but is not terribly efficient. On my system it took
-almost 4 times as much time as the linux driver code. But hey, if it was
-*that* easy this would have been done long before.
-No pain. no gain.
-
-Fortunately there is plenty of room for improvement.
-
-In step 1 we moved from bit-wise calculation to byte-wise calculation.
-However in C we can also use the unsigned long data type and virtually
-every modern microprocessor supports 32 bit operations, so why not try
-to write our code in such a way that we process data in 32 bit chunks.
-
-Of course this means some modification as the row parity is byte by
-byte. A quick analysis:
-for the column parity we use the par variable. When extending to 32 bits
-we can in the end easily calculate rp0 and rp1 from it.
-(because par now consists of 4 bytes, contributing to rp1, rp0, rp1, rp0
-respectively, from MSB to LSB)
-also rp2 and rp3 can be easily retrieved from par as rp3 covers the
-first two MSBs and rp2 covers the last two LSBs.
-
-Note that of course now the loop is executed only 64 times (256/4).
-And note that care must taken wrt byte ordering. The way bytes are
-ordered in a long is machine dependent, and might affect us.
-Anyway, if there is an issue: this code is developed on x86 (to be
-precise: a DELL PC with a D920 Intel CPU)
-
-And of course the performance might depend on alignment, but I expect
-that the I/O buffers in the nand driver are aligned properly (and
-otherwise that should be fixed to get maximum performance).
-
-Let's give it a try...
-
-
-Attempt 2
-=========
-
-extern const char parity[256];
-
-void ecc2(const unsigned char *buf, unsigned char *code)
-{
-    int i;
-    const unsigned long *bp = (unsigned long *)buf;
-    unsigned long cur;
-    unsigned long rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
-    unsigned long rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15;
-    unsigned long par;
-
-    par = 0;
-    rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0;
-    rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0;
-    rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0;
-    rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0;
-
-    for (i = 0; i < 64; i++)
-    {
-        cur = *bp++;
-        par ^= cur;
-        if (i & 0x01) rp5 ^= cur; else rp4 ^= cur;
-        if (i & 0x02) rp7 ^= cur; else rp6 ^= cur;
-        if (i & 0x04) rp9 ^= cur; else rp8 ^= cur;
-        if (i & 0x08) rp11 ^= cur; else rp10 ^= cur;
-        if (i & 0x10) rp13 ^= cur; else rp12 ^= cur;
-        if (i & 0x20) rp15 ^= cur; else rp14 ^= cur;
-    }
-    /*
-       we need to adapt the code generation for the fact that rp vars are now
-       long; also the column parity calculation needs to be changed.
-       we'll bring rp4 to 15 back to single byte entities by shifting and
-       xoring
-    */
-    rp4 ^= (rp4 >> 16); rp4 ^= (rp4 >> 8); rp4 &= 0xff;
-    rp5 ^= (rp5 >> 16); rp5 ^= (rp5 >> 8); rp5 &= 0xff;
-    rp6 ^= (rp6 >> 16); rp6 ^= (rp6 >> 8); rp6 &= 0xff;
-    rp7 ^= (rp7 >> 16); rp7 ^= (rp7 >> 8); rp7 &= 0xff;
-    rp8 ^= (rp8 >> 16); rp8 ^= (rp8 >> 8); rp8 &= 0xff;
-    rp9 ^= (rp9 >> 16); rp9 ^= (rp9 >> 8); rp9 &= 0xff;
-    rp10 ^= (rp10 >> 16); rp10 ^= (rp10 >> 8); rp10 &= 0xff;
-    rp11 ^= (rp11 >> 16); rp11 ^= (rp11 >> 8); rp11 &= 0xff;
-    rp12 ^= (rp12 >> 16); rp12 ^= (rp12 >> 8); rp12 &= 0xff;
-    rp13 ^= (rp13 >> 16); rp13 ^= (rp13 >> 8); rp13 &= 0xff;
-    rp14 ^= (rp14 >> 16); rp14 ^= (rp14 >> 8); rp14 &= 0xff;
-    rp15 ^= (rp15 >> 16); rp15 ^= (rp15 >> 8); rp15 &= 0xff;
-    rp3 = (par >> 16); rp3 ^= (rp3 >> 8); rp3 &= 0xff;
-    rp2 = par & 0xffff; rp2 ^= (rp2 >> 8); rp2 &= 0xff;
-    par ^= (par >> 16);
-    rp1 = (par >> 8); rp1 &= 0xff;
-    rp0 = (par & 0xff);
-    par ^= (par >> 8); par &= 0xff;
-
-    code[0] =
-        (parity[rp7] << 7) |
-        (parity[rp6] << 6) |
-        (parity[rp5] << 5) |
-        (parity[rp4] << 4) |
-        (parity[rp3] << 3) |
-        (parity[rp2] << 2) |
-        (parity[rp1] << 1) |
-        (parity[rp0]);
-    code[1] =
-        (parity[rp15] << 7) |
-        (parity[rp14] << 6) |
-        (parity[rp13] << 5) |
-        (parity[rp12] << 4) |
-        (parity[rp11] << 3) |
-        (parity[rp10] << 2) |
-        (parity[rp9]  << 1) |
-        (parity[rp8]);
-    code[2] =
-        (parity[par & 0xf0] << 7) |
-        (parity[par & 0x0f] << 6) |
-        (parity[par & 0xcc] << 5) |
-        (parity[par & 0x33] << 4) |
-        (parity[par & 0xaa] << 3) |
-        (parity[par & 0x55] << 2);
-    code[0] = ~code[0];
-    code[1] = ~code[1];
-    code[2] = ~code[2];
-}
-
-The parity array is not shown any more. Note also that for these
-examples I kinda deviated from my regular programming style by allowing
-multiple statements on a line, not using { } in then and else blocks
-with only a single statement and by using operators like ^=
-
-
-Analysis 2
-==========
-
-The code (of course) works, and hurray: we are a little bit faster than
-the linux driver code (about 15%). But wait, don't cheer too quickly.
-There is more to be gained.
-If we look at e.g. rp14 and rp15 we see that we either xor our data with
-rp14 or with rp15. However we also have par which goes over all data.
-This means there is no need to calculate rp14 as it can be calculated from
-rp15 through rp14 = par ^ rp15, because par = rp14 ^ rp15;
-(or if desired we can avoid calculating rp15 and calculate it from
-rp14).  That is why some places refer to inverse parity.
-Of course the same thing holds for rp4/5, rp6/7, rp8/9, rp10/11 and rp12/13.
-Effectively this means we can eliminate the else clause from the if
-statements. Also we can optimise the calculation in the end a little bit
-by going from long to byte first. Actually we can even avoid the table
-lookups
-
-Attempt 3
-=========
-
-Odd replaced:
-        if (i & 0x01) rp5 ^= cur; else rp4 ^= cur;
-        if (i & 0x02) rp7 ^= cur; else rp6 ^= cur;
-        if (i & 0x04) rp9 ^= cur; else rp8 ^= cur;
-        if (i & 0x08) rp11 ^= cur; else rp10 ^= cur;
-        if (i & 0x10) rp13 ^= cur; else rp12 ^= cur;
-        if (i & 0x20) rp15 ^= cur; else rp14 ^= cur;
-with
-        if (i & 0x01) rp5 ^= cur;
-        if (i & 0x02) rp7 ^= cur;
-        if (i & 0x04) rp9 ^= cur;
-        if (i & 0x08) rp11 ^= cur;
-        if (i & 0x10) rp13 ^= cur;
-        if (i & 0x20) rp15 ^= cur;
-
-        and outside the loop added:
-        rp4  = par ^ rp5;
-        rp6  = par ^ rp7;
-        rp8  = par ^ rp9;
-        rp10  = par ^ rp11;
-        rp12  = par ^ rp13;
-        rp14  = par ^ rp15;
-
-And after that the code takes about 30% more time, although the number of
-statements is reduced. This is also reflected in the assembly code.
-
-
-Analysis 3
-==========
-
-Very weird. Guess it has to do with caching or instruction parallellism
-or so. I also tried on an eeePC (Celeron, clocked at 900 Mhz). Interesting
-observation was that this one is only 30% slower (according to time)
-executing the code as my 3Ghz D920 processor.
-
-Well, it was expected not to be easy so maybe instead move to a
-different track: let's move back to the code from attempt2 and do some
-loop unrolling. This will eliminate a few if statements. I'll try
-different amounts of unrolling to see what works best.
-
-
-Attempt 4
-=========
-
-Unrolled the loop 1, 2, 3 and 4 times.
-For 4 the code starts with:
-
-    for (i = 0; i < 4; i++)
-    {
-        cur = *bp++;
-        par ^= cur;
-        rp4 ^= cur;
-        rp6 ^= cur;
-        rp8 ^= cur;
-        rp10 ^= cur;
-        if (i & 0x1) rp13 ^= cur; else rp12 ^= cur;
-        if (i & 0x2) rp15 ^= cur; else rp14 ^= cur;
-        cur = *bp++;
-        par ^= cur;
-        rp5 ^= cur;
-        rp6 ^= cur;
-        ...
-
-
-Analysis 4
-==========
-
-Unrolling once gains about 15%
-Unrolling twice keeps the gain at about 15%
-Unrolling three times gives a gain of 30% compared to attempt 2.
-Unrolling four times gives a marginal improvement compared to unrolling
-three times.
-
-I decided to proceed with a four time unrolled loop anyway. It was my gut
-feeling that in the next steps I would obtain additional gain from it.
-
-The next step was triggered by the fact that par contains the xor of all
-bytes and rp4 and rp5 each contain the xor of half of the bytes.
-So in effect par = rp4 ^ rp5. But as xor is commutative we can also say
-that rp5 = par ^ rp4. So no need to keep both rp4 and rp5 around. We can
-eliminate rp5 (or rp4, but I already foresaw another optimisation).
-The same holds for rp6/7, rp8/9, rp10/11 rp12/13 and rp14/15.
-
-
-Attempt 5
-=========
-
-Effectively so all odd digit rp assignments in the loop were removed.
-This included the else clause of the if statements.
-Of course after the loop we need to correct things by adding code like:
-    rp5 = par ^ rp4;
-Also the initial assignments (rp5 = 0; etc) could be removed.
-Along the line I also removed the initialisation of rp0/1/2/3.
-
-
-Analysis 5
-==========
-
-Measurements showed this was a good move. The run-time roughly halved
-compared with attempt 4 with 4 times unrolled, and we only require 1/3rd
-of the processor time compared to the current code in the linux kernel.
-
-However, still I thought there was more. I didn't like all the if
-statements. Why not keep a running parity and only keep the last if
-statement. Time for yet another version!
-
-
-Attempt 6
-=========
-
-THe code within the for loop was changed to:
-
-    for (i = 0; i < 4; i++)
-    {
-        cur = *bp++; tmppar  = cur; rp4 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp6 ^= tmppar;
-        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp8 ^= tmppar;
-
-        cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp10 ^= tmppar;
-
-        cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; rp8 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp6 ^= cur; rp8 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp8 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp8 ^= cur;
-
-        cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
-        cur = *bp++; tmppar ^= cur;
-
-        par ^= tmppar;
-        if ((i & 0x1) == 0) rp12 ^= tmppar;
-        if ((i & 0x2) == 0) rp14 ^= tmppar;
-    }
-
-As you can see tmppar is used to accumulate the parity within a for
-iteration. In the last 3 statements is added to par and, if needed,
-to rp12 and rp14.
-
-While making the changes I also found that I could exploit that tmppar
-contains the running parity for this iteration. So instead of having:
-rp4 ^= cur; rp6 ^= cur;
-I removed the rp6 ^= cur; statement and did rp6 ^= tmppar; on next
-statement. A similar change was done for rp8 and rp10
-
-
-Analysis 6
-==========
-
-Measuring this code again showed big gain. When executing the original
-linux code 1 million times, this took about 1 second on my system.
-(using time to measure the performance). After this iteration I was back
-to 0.075 sec. Actually I had to decide to start measuring over 10
-million iterations in order not to lose too much accuracy. This one
-definitely seemed to be the jackpot!
-
-There is a little bit more room for improvement though. There are three
-places with statements:
-rp4 ^= cur; rp6 ^= cur;
-It seems more efficient to also maintain a variable rp4_6 in the while
-loop; This eliminates 3 statements per loop. Of course after the loop we
-need to correct by adding:
-    rp4 ^= rp4_6;
-    rp6 ^= rp4_6
-Furthermore there are 4 sequential assignments to rp8. This can be
-encoded slightly more efficiently by saving tmppar before those 4 lines
-and later do rp8 = rp8 ^ tmppar ^ notrp8;
-(where notrp8 is the value of rp8 before those 4 lines).
-Again a use of the commutative property of xor.
-Time for a new test!
-
-
-Attempt 7
-=========
-
-The new code now looks like:
-
-    for (i = 0; i < 4; i++)
-    {
-        cur = *bp++; tmppar  = cur; rp4 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp6 ^= tmppar;
-        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp8 ^= tmppar;
-
-        cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp10 ^= tmppar;
-
-        notrp8 = tmppar;
-        cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
-        cur = *bp++; tmppar ^= cur;
-        rp8 = rp8 ^ tmppar ^ notrp8;
-
-        cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
-        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
-        cur = *bp++; tmppar ^= cur;
-
-        par ^= tmppar;
-        if ((i & 0x1) == 0) rp12 ^= tmppar;
-        if ((i & 0x2) == 0) rp14 ^= tmppar;
-    }
-    rp4 ^= rp4_6;
-    rp6 ^= rp4_6;
-
-
-Not a big change, but every penny counts :-)
-
-
-Analysis 7
-==========
-
-Actually this made things worse. Not very much, but I don't want to move
-into the wrong direction. Maybe something to investigate later. Could
-have to do with caching again.
-
-Guess that is what there is to win within the loop. Maybe unrolling one
-more time will help. I'll keep the optimisations from 7 for now.
-
-
-Attempt 8
-=========
-
-Unrolled the loop one more time.
-
-
-Analysis 8
-==========
-
-This makes things worse. Let's stick with attempt 6 and continue from there.
-Although it seems that the code within the loop cannot be optimised
-further there is still room to optimize the generation of the ecc codes.
-We can simply calculate the total parity. If this is 0 then rp4 = rp5
-etc. If the parity is 1, then rp4 = !rp5;
-But if rp4 = rp5 we do not need rp5 etc. We can just write the even bits
-in the result byte and then do something like
-    code[0] |= (code[0] << 1);
-Lets test this.
-
-
-Attempt 9
-=========
-
-Changed the code but again this slightly degrades performance. Tried all
-kind of other things, like having dedicated parity arrays to avoid the
-shift after parity[rp7] << 7; No gain.
-Change the lookup using the parity array by using shift operators (e.g.
-replace parity[rp7] << 7 with:
-rp7 ^= (rp7 << 4);
-rp7 ^= (rp7 << 2);
-rp7 ^= (rp7 << 1);
-rp7 &= 0x80;
-No gain.
-
-The only marginal change was inverting the parity bits, so we can remove
-the last three invert statements.
-
-Ah well, pity this does not deliver more. Then again 10 million
-iterations using the linux driver code takes between 13 and 13.5
-seconds, whereas my code now takes about 0.73 seconds for those 10
-million iterations. So basically I've improved the performance by a
-factor 18 on my system. Not that bad. Of course on different hardware
-you will get different results. No warranties!
-
-But of course there is no such thing as a free lunch. The codesize almost
-tripled (from 562 bytes to 1434 bytes). Then again, it is not that much.
-
-
-Correcting errors
-=================
-
-For correcting errors I again used the ST application note as a starter,
-but I also peeked at the existing code.
-The algorithm itself is pretty straightforward. Just xor the given and
-the calculated ecc. If all bytes are 0 there is no problem. If 11 bits
-are 1 we have one correctable bit error. If there is 1 bit 1, we have an
-error in the given ecc code.
-It proved to be fastest to do some table lookups. Performance gain
-introduced by this is about a factor 2 on my system when a repair had to
-be done, and 1% or so if no repair had to be done.
-Code size increased from 330 bytes to 686 bytes for this function.
-(gcc 4.2, -O3)
-
-
-Conclusion
-==========
-
-The gain when calculating the ecc is tremendous. Om my development hardware
-a speedup of a factor of 18 for ecc calculation was achieved. On a test on an
-embedded system with a MIPS core a factor 7 was obtained.
-On a test with a Linksys NSLU2 (ARMv5TE processor) the speedup was a factor
-5 (big endian mode, gcc 4.1.2, -O3)
-For correction not much gain could be obtained (as bitflips are rare). Then
-again there are also much less cycles spent there.
-
-It seems there is not much more gain possible in this, at least when
-programmed in C. Of course it might be possible to squeeze something more
-out of it with an assembler program, but due to pipeline behaviour etc
-this is very tricky (at least for intel hw).
-
-Author: Frans Meulenbroeks
-Copyright (C) 2008 Koninklijke Philips Electronics NV.
diff --git a/Documentation/mtd/spi-nor.txt b/Documentation/mtd/spi-nor.txt
deleted file mode 100644
index da1fbff5a24c..000000000000
--- a/Documentation/mtd/spi-nor.txt
+++ /dev/null
@@ -1,65 +0,0 @@
-                          SPI NOR framework
-               ============================================
-
-Part I - Why do we need this framework?
----------------------------------------
-
-SPI bus controllers (drivers/spi/) only deal with streams of bytes; the bus
-controller operates agnostic of the specific device attached. However, some
-controllers (such as Freescale's QuadSPI controller) cannot easily handle
-arbitrary streams of bytes, but rather are designed specifically for SPI NOR.
-
-In particular, Freescale's QuadSPI controller must know the NOR commands to
-find the right LUT sequence. Unfortunately, the SPI subsystem has no notion of
-opcodes, addresses, or data payloads; a SPI controller simply knows to send or
-receive bytes (Tx and Rx). Therefore, we must define a new layering scheme under
-which the controller driver is aware of the opcodes, addressing, and other
-details of the SPI NOR protocol.
-
-Part II - How does the framework work?
---------------------------------------
-
-This framework just adds a new layer between the MTD and the SPI bus driver.
-With this new layer, the SPI NOR controller driver does not depend on the
-m25p80 code anymore.
-
-   Before this framework, the layer is like:
-
-                   MTD
-         ------------------------
-                  m25p80
-         ------------------------
-	       SPI bus driver
-         ------------------------
-	        SPI NOR chip
-
-   After this framework, the layer is like:
-                   MTD
-         ------------------------
-              SPI NOR framework
-         ------------------------
-                  m25p80
-         ------------------------
-	       SPI bus driver
-         ------------------------
-	       SPI NOR chip
-
-  With the SPI NOR controller driver (Freescale QuadSPI), it looks like:
-                   MTD
-         ------------------------
-              SPI NOR framework
-         ------------------------
-                fsl-quadSPI
-         ------------------------
-	       SPI NOR chip
-
-Part III - How can drivers use the framework?
----------------------------------------------
-
-The main API is spi_nor_scan(). Before you call the hook, a driver should
-initialize the necessary fields for spi_nor{}. Please see
-drivers/mtd/spi-nor/spi-nor.c for detail. Please also refer to fsl-quadspi.c
-when you want to write a new driver for a SPI NOR controller.
-Another API is spi_nor_restore(), this is used to restore the status of SPI
-flash chip such as addressing mode. Call it whenever detach the driver from
-device or reboot the system.
diff --git a/Documentation/namespaces/compatibility-list.txt b/Documentation/namespaces/compatibility-list.txt
deleted file mode 100644
index defc5589bfcd..000000000000
--- a/Documentation/namespaces/compatibility-list.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-	Namespaces compatibility list
-
-This document contains the information about the problems user
-may have when creating tasks living in different namespaces.
-
-Here's the summary. This matrix shows the known problems, that
-occur when tasks share some namespace (the columns) while living
-in different other namespaces (the rows):
-
-	UTS	IPC	VFS	PID	User	Net
-UTS	 X
-IPC		 X	 1
-VFS			 X
-PID		 1	 1	 X
-User		 2	 2		 X
-Net						 X
-
-1. Both the IPC and the PID namespaces provide IDs to address
-   object inside the kernel. E.g. semaphore with IPCID or
-   process group with pid.
-
-   In both cases, tasks shouldn't try exposing this ID to some
-   other task living in a different namespace via a shared filesystem
-   or IPC shmem/message. The fact is that this ID is only valid
-   within the namespace it was obtained in and may refer to some
-   other object in another namespace.
-
-2. Intentionally, two equal user IDs in different user namespaces
-   should not be equal from the VFS point of view. In other
-   words, user 10 in one user namespace shouldn't have the same
-   access permissions to files, belonging to user 10 in another
-   namespace.
-
-   The same is true for the IPC namespaces being shared - two users
-   from different user namespaces should not access the same IPC objects
-   even having equal UIDs.
-
-   But currently this is not so.
-
diff --git a/Documentation/namespaces/resource-control.txt b/Documentation/namespaces/resource-control.txt
deleted file mode 100644
index abc13c394738..000000000000
--- a/Documentation/namespaces/resource-control.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-There are a lot of kinds of objects in the kernel that don't have
-individual limits or that have limits that are ineffective when a set
-of processes is allowed to switch user ids.  With user namespaces
-enabled in a kernel for people who don't trust their users or their
-users programs to play nice this problems becomes more acute.
-
-Therefore it is recommended that memory control groups be enabled in
-kernels that enable user namespaces, and it is further recommended
-that userspace configure memory control groups to limit how much
-memory user's they don't trust to play nice can use.
-
-Memory control groups can be configured by installing the libcgroup
-package present on most distros editing /etc/cgrules.conf,
-/etc/cgconfig.conf and setting up libpam-cgroup.
diff --git a/Documentation/netlabel/cipso_ipv4.rst b/Documentation/netlabel/cipso_ipv4.rst
new file mode 100644
index 000000000000..cbd3f3231221
--- /dev/null
+++ b/Documentation/netlabel/cipso_ipv4.rst
@@ -0,0 +1,56 @@
+===================================
+NetLabel CIPSO/IPv4 Protocol Engine
+===================================
+
+Paul Moore, paul.moore@hp.com
+
+May 17, 2006
+
+Overview
+========
+
+The NetLabel CIPSO/IPv4 protocol engine is based on the IETF Commercial
+IP Security Option (CIPSO) draft from July 16, 1992.  A copy of this
+draft can be found in this directory
+(draft-ietf-cipso-ipsecurity-01.txt).  While the IETF draft never made
+it to an RFC standard it has become a de-facto standard for labeled
+networking and is used in many trusted operating systems.
+
+Outbound Packet Processing
+==========================
+
+The CIPSO/IPv4 protocol engine applies the CIPSO IP option to packets by
+adding the CIPSO label to the socket.  This causes all packets leaving the
+system through the socket to have the CIPSO IP option applied.  The socket's
+CIPSO label can be changed at any point in time, however, it is recommended
+that it is set upon the socket's creation.  The LSM can set the socket's CIPSO
+label by using the NetLabel security module API; if the NetLabel "domain" is
+configured to use CIPSO for packet labeling then a CIPSO IP option will be
+generated and attached to the socket.
+
+Inbound Packet Processing
+=========================
+
+The CIPSO/IPv4 protocol engine validates every CIPSO IP option it finds at the
+IP layer without any special handling required by the LSM.  However, in order
+to decode and translate the CIPSO label on the packet the LSM must use the
+NetLabel security module API to extract the security attributes of the packet.
+This is typically done at the socket layer using the 'socket_sock_rcv_skb()'
+LSM hook.
+
+Label Translation
+=================
+
+The CIPSO/IPv4 protocol engine contains a mechanism to translate CIPSO security
+attributes such as sensitivity level and category to values which are
+appropriate for the host.  These mappings are defined as part of a CIPSO
+Domain Of Interpretation (DOI) definition and are configured through the
+NetLabel user space communication layer.  Each DOI definition can have a
+different security attribute mapping table.
+
+Label Translation Cache
+=======================
+
+The NetLabel system provides a framework for caching security attribute
+mappings from the network labels to the corresponding LSM identifiers.  The
+CIPSO/IPv4 protocol engine supports this caching mechanism.
diff --git a/Documentation/netlabel/cipso_ipv4.txt b/Documentation/netlabel/cipso_ipv4.txt
deleted file mode 100644
index a6075481fd60..000000000000
--- a/Documentation/netlabel/cipso_ipv4.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-NetLabel CIPSO/IPv4 Protocol Engine
-==============================================================================
-Paul Moore, paul.moore@hp.com
-
-May 17, 2006
-
- * Overview
-
-The NetLabel CIPSO/IPv4 protocol engine is based on the IETF Commercial
-IP Security Option (CIPSO) draft from July 16, 1992.  A copy of this
-draft can be found in this directory
-(draft-ietf-cipso-ipsecurity-01.txt).  While the IETF draft never made
-it to an RFC standard it has become a de-facto standard for labeled
-networking and is used in many trusted operating systems.
-
- * Outbound Packet Processing
-
-The CIPSO/IPv4 protocol engine applies the CIPSO IP option to packets by
-adding the CIPSO label to the socket.  This causes all packets leaving the
-system through the socket to have the CIPSO IP option applied.  The socket's
-CIPSO label can be changed at any point in time, however, it is recommended
-that it is set upon the socket's creation.  The LSM can set the socket's CIPSO
-label by using the NetLabel security module API; if the NetLabel "domain" is
-configured to use CIPSO for packet labeling then a CIPSO IP option will be
-generated and attached to the socket.
-
- * Inbound Packet Processing
-
-The CIPSO/IPv4 protocol engine validates every CIPSO IP option it finds at the
-IP layer without any special handling required by the LSM.  However, in order
-to decode and translate the CIPSO label on the packet the LSM must use the
-NetLabel security module API to extract the security attributes of the packet.
-This is typically done at the socket layer using the 'socket_sock_rcv_skb()'
-LSM hook.
-
- * Label Translation
-
-The CIPSO/IPv4 protocol engine contains a mechanism to translate CIPSO security
-attributes such as sensitivity level and category to values which are
-appropriate for the host.  These mappings are defined as part of a CIPSO
-Domain Of Interpretation (DOI) definition and are configured through the
-NetLabel user space communication layer.  Each DOI definition can have a
-different security attribute mapping table.
-
- * Label Translation Cache
-
-The NetLabel system provides a framework for caching security attribute
-mappings from the network labels to the corresponding LSM identifiers.  The
-CIPSO/IPv4 protocol engine supports this caching mechanism.
diff --git a/Documentation/netlabel/draft_ietf.rst b/Documentation/netlabel/draft_ietf.rst
new file mode 100644
index 000000000000..5ed39ab8234b
--- /dev/null
+++ b/Documentation/netlabel/draft_ietf.rst
@@ -0,0 +1,5 @@
+Draft IETF CIPSO IP Security
+----------------------------
+
+ .. include:: draft-ietf-cipso-ipsecurity-01.txt
+    :literal:
diff --git a/Documentation/netlabel/index.rst b/Documentation/netlabel/index.rst
new file mode 100644
index 000000000000..984e1b191b12
--- /dev/null
+++ b/Documentation/netlabel/index.rst
@@ -0,0 +1,21 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========
+NetLabel
+========
+
+.. toctree::
+    :maxdepth: 1
+
+    introduction
+    cipso_ipv4
+    lsm_interface
+
+    draft_ietf
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/netlabel/introduction.rst b/Documentation/netlabel/introduction.rst
new file mode 100644
index 000000000000..9333bbb0adc1
--- /dev/null
+++ b/Documentation/netlabel/introduction.rst
@@ -0,0 +1,52 @@
+=====================
+NetLabel Introduction
+=====================
+
+Paul Moore, paul.moore@hp.com
+
+August 2, 2006
+
+Overview
+========
+
+NetLabel is a mechanism which can be used by kernel security modules to attach
+security attributes to outgoing network packets generated from user space
+applications and read security attributes from incoming network packets.  It
+is composed of three main components, the protocol engines, the communication
+layer, and the kernel security module API.
+
+Protocol Engines
+================
+
+The protocol engines are responsible for both applying and retrieving the
+network packet's security attributes.  If any translation between the network
+security attributes and those on the host are required then the protocol
+engine will handle those tasks as well.  Other kernel subsystems should
+refrain from calling the protocol engines directly, instead they should use
+the NetLabel kernel security module API described below.
+
+Detailed information about each NetLabel protocol engine can be found in this
+directory.
+
+Communication Layer
+===================
+
+The communication layer exists to allow NetLabel configuration and monitoring
+from user space.  The NetLabel communication layer uses a message based
+protocol built on top of the Generic NETLINK transport mechanism.  The exact
+formatting of these NetLabel messages as well as the Generic NETLINK family
+names can be found in the 'net/netlabel/' directory as comments in the
+header files as well as in 'include/net/netlabel.h'.
+
+Security Module API
+===================
+
+The purpose of the NetLabel security module API is to provide a protocol
+independent interface to the underlying NetLabel protocol engines.  In addition
+to protocol independence, the security module API is designed to be completely
+LSM independent which should allow multiple LSMs to leverage the same code
+base.
+
+Detailed information about the NetLabel security module API can be found in the
+'include/net/netlabel.h' header file as well as the 'lsm_interface.txt' file
+found in this directory.
diff --git a/Documentation/netlabel/introduction.txt b/Documentation/netlabel/introduction.txt
deleted file mode 100644
index 3caf77bcff0f..000000000000
--- a/Documentation/netlabel/introduction.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-NetLabel Introduction
-==============================================================================
-Paul Moore, paul.moore@hp.com
-
-August 2, 2006
-
- * Overview
-
-NetLabel is a mechanism which can be used by kernel security modules to attach
-security attributes to outgoing network packets generated from user space
-applications and read security attributes from incoming network packets.  It
-is composed of three main components, the protocol engines, the communication
-layer, and the kernel security module API.
-
- * Protocol Engines
-
-The protocol engines are responsible for both applying and retrieving the
-network packet's security attributes.  If any translation between the network
-security attributes and those on the host are required then the protocol
-engine will handle those tasks as well.  Other kernel subsystems should
-refrain from calling the protocol engines directly, instead they should use
-the NetLabel kernel security module API described below.
-
-Detailed information about each NetLabel protocol engine can be found in this
-directory.
-
- * Communication Layer
-
-The communication layer exists to allow NetLabel configuration and monitoring
-from user space.  The NetLabel communication layer uses a message based
-protocol built on top of the Generic NETLINK transport mechanism.  The exact
-formatting of these NetLabel messages as well as the Generic NETLINK family
-names can be found in the 'net/netlabel/' directory as comments in the
-header files as well as in 'include/net/netlabel.h'.
-
- * Security Module API
-
-The purpose of the NetLabel security module API is to provide a protocol
-independent interface to the underlying NetLabel protocol engines.  In addition
-to protocol independence, the security module API is designed to be completely
-LSM independent which should allow multiple LSMs to leverage the same code
-base.
-
-Detailed information about the NetLabel security module API can be found in the
-'include/net/netlabel.h' header file as well as the 'lsm_interface.txt' file
-found in this directory.
diff --git a/Documentation/netlabel/lsm_interface.rst b/Documentation/netlabel/lsm_interface.rst
new file mode 100644
index 000000000000..026fc267f798
--- /dev/null
+++ b/Documentation/netlabel/lsm_interface.rst
@@ -0,0 +1,53 @@
+========================================
+NetLabel Linux Security Module Interface
+========================================
+
+Paul Moore, paul.moore@hp.com
+
+May 17, 2006
+
+Overview
+========
+
+NetLabel is a mechanism which can set and retrieve security attributes from
+network packets.  It is intended to be used by LSM developers who want to make
+use of a common code base for several different packet labeling protocols.
+The NetLabel security module API is defined in 'include/net/netlabel.h' but a
+brief overview is given below.
+
+NetLabel Security Attributes
+============================
+
+Since NetLabel supports multiple different packet labeling protocols and LSMs
+it uses the concept of security attributes to refer to the packet's security
+labels.  The NetLabel security attributes are defined by the
+'netlbl_lsm_secattr' structure in the NetLabel header file.  Internally the
+NetLabel subsystem converts the security attributes to and from the correct
+low-level packet label depending on the NetLabel build time and run time
+configuration.  It is up to the LSM developer to translate the NetLabel
+security attributes into whatever security identifiers are in use for their
+particular LSM.
+
+NetLabel LSM Protocol Operations
+================================
+
+These are the functions which allow the LSM developer to manipulate the labels
+on outgoing packets as well as read the labels on incoming packets.  Functions
+exist to operate both on sockets as well as the sk_buffs directly.  These high
+level functions are translated into low level protocol operations based on how
+the administrator has configured the NetLabel subsystem.
+
+NetLabel Label Mapping Cache Operations
+=======================================
+
+Depending on the exact configuration, translation between the network packet
+label and the internal LSM security identifier can be time consuming.  The
+NetLabel label mapping cache is a caching mechanism which can be used to
+sidestep much of this overhead once a mapping has been established.  Once the
+LSM has received a packet, used NetLabel to decode its security attributes,
+and translated the security attributes into a LSM internal identifier the LSM
+can use the NetLabel caching functions to associate the LSM internal
+identifier with the network packet's label.  This means that in the future
+when a incoming packet matches a cached value not only are the internal
+NetLabel translation mechanisms bypassed but the LSM translation mechanisms are
+bypassed as well which should result in a significant reduction in overhead.
diff --git a/Documentation/netlabel/lsm_interface.txt b/Documentation/netlabel/lsm_interface.txt
deleted file mode 100644
index 638c74f7de7f..000000000000
--- a/Documentation/netlabel/lsm_interface.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-NetLabel Linux Security Module Interface
-==============================================================================
-Paul Moore, paul.moore@hp.com
-
-May 17, 2006
-
- * Overview
-
-NetLabel is a mechanism which can set and retrieve security attributes from
-network packets.  It is intended to be used by LSM developers who want to make
-use of a common code base for several different packet labeling protocols.
-The NetLabel security module API is defined in 'include/net/netlabel.h' but a
-brief overview is given below.
-
- * NetLabel Security Attributes
-
-Since NetLabel supports multiple different packet labeling protocols and LSMs
-it uses the concept of security attributes to refer to the packet's security
-labels.  The NetLabel security attributes are defined by the
-'netlbl_lsm_secattr' structure in the NetLabel header file.  Internally the
-NetLabel subsystem converts the security attributes to and from the correct
-low-level packet label depending on the NetLabel build time and run time
-configuration.  It is up to the LSM developer to translate the NetLabel
-security attributes into whatever security identifiers are in use for their
-particular LSM.
-
- * NetLabel LSM Protocol Operations
-
-These are the functions which allow the LSM developer to manipulate the labels
-on outgoing packets as well as read the labels on incoming packets.  Functions
-exist to operate both on sockets as well as the sk_buffs directly.  These high
-level functions are translated into low level protocol operations based on how
-the administrator has configured the NetLabel subsystem.
-
- * NetLabel Label Mapping Cache Operations
-
-Depending on the exact configuration, translation between the network packet
-label and the internal LSM security identifier can be time consuming.  The
-NetLabel label mapping cache is a caching mechanism which can be used to
-sidestep much of this overhead once a mapping has been established.  Once the
-LSM has received a packet, used NetLabel to decode its security attributes,
-and translated the security attributes into a LSM internal identifier the LSM
-can use the NetLabel caching functions to associate the LSM internal
-identifier with the network packet's label.  This means that in the future
-when a incoming packet matches a cached value not only are the internal
-NetLabel translation mechanisms bypassed but the LSM translation mechanisms are
-bypassed as well which should result in a significant reduction in overhead.
diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index e14d7d40fc75..eeedc2e826aa 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -220,7 +220,21 @@ Usage
 In order to use AF_XDP sockets there are two parts needed. The
 user-space application and the XDP program. For a complete setup and
 usage example, please refer to the sample application. The user-space
-side is xdpsock_user.c and the XDP side xdpsock_kern.c.
+side is xdpsock_user.c and the XDP side is part of libbpf.
+
+The XDP code sample included in tools/lib/bpf/xsk.c is the following::
+
+   SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
+   {
+       int index = ctx->rx_queue_index;
+
+       // A set entry here means that the correspnding queue_id
+       // has an active AF_XDP socket bound to it.
+       if (bpf_map_lookup_elem(&xsks_map, &index))
+           return bpf_redirect_map(&xsks_map, index, 0);
+
+       return XDP_PASS;
+   }
 
 Naive ring dequeue and enqueue could look like this::
 
@@ -316,16 +330,16 @@ A: When a netdev of a physical NIC is initialized, Linux usually
    all the traffic, you can force the netdev to only have 1 queue, queue
    id 0, and then bind to queue 0. You can use ethtool to do this::
 
-   sudo ethtool -L <interface> combined 1
+     sudo ethtool -L <interface> combined 1
 
    If you want to only see part of the traffic, you can program the
    NIC through ethtool to filter out your traffic to a single queue id
    that you can bind your XDP socket to. Here is one example in which
    UDP traffic to and from port 4242 are sent to queue 2::
 
-   sudo ethtool -N <interface> rx-flow-hash udp4 fn
-   sudo ethtool -N <interface> flow-type udp4 src-port 4242 dst-port \
-   4242 action 2
+     sudo ethtool -N <interface> rx-flow-hash udp4 fn
+     sudo ethtool -N <interface> flow-type udp4 src-port 4242 dst-port \
+     4242 action 2
 
    A number of other ways are possible all up to the capabilitites of
    the NIC you have.
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index d3e5dd26db12..e3abfbd32f71 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -706,9 +706,9 @@ num_unsol_na
 	unsolicited IPv6 Neighbor Advertisements) to be issued after a
 	failover event.  As soon as the link is up on the new slave
 	(possibly immediately) a peer notification is sent on the
-	bonding device and each VLAN sub-device.  This is repeated at
-	each link monitor interval (arp_interval or miimon, whichever
-	is active) if the number is greater than 1.
+	bonding device and each VLAN sub-device. This is repeated at
+	the rate specified by peer_notif_delay if the number is
+	greater than 1.
 
 	The valid range is 0 - 255; the default value is 1.  These options
 	affect only the active-backup mode.  These options were added for
@@ -727,6 +727,16 @@ packets_per_slave
 	The valid range is 0 - 65535; the default value is 1. This option
 	has effect only in balance-rr mode.
 
+peer_notif_delay
+
+        Specify the delay, in milliseconds, between each peer
+        notification (gratuitous ARP and unsolicited IPv6 Neighbor
+        Advertisement) when they are issued after a failover event.
+        This delay should be a multiple of the link monitor interval
+        (arp_interval or miimon, whichever is active). The default
+        value is 0 which means to match the value of the link monitor
+        interval.
+
 primary
 
 	A string (eth0, eth2, etc) specifying which slave is the
diff --git a/Documentation/networking/conf.py b/Documentation/networking/conf.py
deleted file mode 100644
index 40f69e67a883..000000000000
--- a/Documentation/networking/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = "Linux Networking Documentation"
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'networking.tex', project,
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/networking/device_drivers/amazon/ena.txt b/Documentation/networking/device_drivers/amazon/ena.txt
index 2b4b6f57e549..1bb55c7b604c 100644
--- a/Documentation/networking/device_drivers/amazon/ena.txt
+++ b/Documentation/networking/device_drivers/amazon/ena.txt
@@ -73,7 +73,7 @@ operation.
 AQ is used for submitting management commands, and the
 results/responses are reported asynchronously through ACQ.
 
-ENA introduces a very small set of management commands with room for
+ENA introduces a small set of management commands with room for
 vendor-specific extensions. Most of the management operations are
 framed in a generic Get/Set feature command.
 
@@ -202,11 +202,14 @@ delay value to each level.
 The user can enable/disable adaptive moderation, modify the interrupt
 delay table and restore its default values through sysfs.
 
+RX copybreak:
+=============
 The rx_copybreak is initialized by default to ENA_DEFAULT_RX_COPYBREAK
 and can be configured by the ETHTOOL_STUNABLE command of the
 SIOCETHTOOL ioctl.
 
 SKB:
+====
 The driver-allocated SKB for frames received from Rx handling using
 NAPI context. The allocation method depends on the size of the packet.
 If the frame length is larger than rx_copybreak, napi_get_frags()
diff --git a/Documentation/networking/device_drivers/aquantia/atlantic.txt b/Documentation/networking/device_drivers/aquantia/atlantic.txt
new file mode 100644
index 000000000000..d235cbaeccc6
--- /dev/null
+++ b/Documentation/networking/device_drivers/aquantia/atlantic.txt
@@ -0,0 +1,439 @@
+aQuantia AQtion Driver for the aQuantia Multi-Gigabit PCI Express Family of
+Ethernet Adapters
+=============================================================================
+
+Contents
+========
+
+- Identifying Your Adapter
+- Configuration
+- Supported ethtool options
+- Command Line Parameters
+- Config file parameters
+- Support
+- License
+
+Identifying Your Adapter
+========================
+
+The driver in this release is compatible with AQC-100, AQC-107, AQC-108 based ethernet adapters.
+
+
+SFP+ Devices (for AQC-100 based adapters)
+----------------------------------
+
+This release tested with passive Direct Attach Cables (DAC) and SFP+/LC Optical Transceiver.
+
+Configuration
+=========================
+  Viewing Link Messages
+  ---------------------
+  Link messages will not be displayed to the console if the distribution is
+  restricting system messages. In order to see network driver link messages on
+  your console, set dmesg to eight by entering the following:
+
+       dmesg -n 8
+
+  NOTE: This setting is not saved across reboots.
+
+  Jumbo Frames
+  ------------
+  The driver supports Jumbo Frames for all adapters. Jumbo Frames support is
+  enabled by changing the MTU to a value larger than the default of 1500.
+  The maximum value for the MTU is 16000.  Use the `ip` command to
+  increase the MTU size.  For example:
+
+        ip link set mtu 16000 dev enp1s0
+
+  ethtool
+  -------
+  The driver utilizes the ethtool interface for driver configuration and
+  diagnostics, as well as displaying statistical information. The latest
+  ethtool version is required for this functionality.
+
+  NAPI
+  ----
+  NAPI (Rx polling mode) is supported in the atlantic driver.
+
+Supported ethtool options
+============================
+ Viewing adapter settings
+ ---------------------
+ ethtool <ethX>
+
+ Output example:
+
+  Settings for enp1s0:
+    Supported ports: [ TP ]
+    Supported link modes:   100baseT/Full
+                            1000baseT/Full
+                            10000baseT/Full
+                            2500baseT/Full
+                            5000baseT/Full
+    Supported pause frame use: Symmetric
+    Supports auto-negotiation: Yes
+    Supported FEC modes: Not reported
+    Advertised link modes:  100baseT/Full
+                            1000baseT/Full
+                            10000baseT/Full
+                            2500baseT/Full
+                            5000baseT/Full
+    Advertised pause frame use: Symmetric
+    Advertised auto-negotiation: Yes
+    Advertised FEC modes: Not reported
+    Speed: 10000Mb/s
+    Duplex: Full
+    Port: Twisted Pair
+    PHYAD: 0
+    Transceiver: internal
+    Auto-negotiation: on
+    MDI-X: Unknown
+    Supports Wake-on: g
+    Wake-on: d
+    Link detected: yes
+
+ ---
+ Note: AQrate speeds (2.5/5 Gb/s) will be displayed only with linux kernels > 4.10.
+    But you can still use these speeds:
+	ethtool -s eth0 autoneg off speed 2500
+
+ Viewing adapter information
+ ---------------------
+ ethtool -i <ethX>
+
+ Output example:
+
+  driver: atlantic
+  version: 5.2.0-050200rc5-generic-kern
+  firmware-version: 3.1.78
+  expansion-rom-version:
+  bus-info: 0000:01:00.0
+  supports-statistics: yes
+  supports-test: no
+  supports-eeprom-access: no
+  supports-register-dump: yes
+  supports-priv-flags: no
+
+
+ Viewing Ethernet adapter statistics:
+ ---------------------
+ ethtool -S <ethX>
+
+ Output example:
+ NIC statistics:
+     InPackets: 13238607
+     InUCast: 13293852
+     InMCast: 52
+     InBCast: 3
+     InErrors: 0
+     OutPackets: 23703019
+     OutUCast: 23704941
+     OutMCast: 67
+     OutBCast: 11
+     InUCastOctects: 213182760
+     OutUCastOctects: 22698443
+     InMCastOctects: 6600
+     OutMCastOctects: 8776
+     InBCastOctects: 192
+     OutBCastOctects: 704
+     InOctects: 2131839552
+     OutOctects: 226938073
+     InPacketsDma: 95532300
+     OutPacketsDma: 59503397
+     InOctetsDma: 1137102462
+     OutOctetsDma: 2394339518
+     InDroppedDma: 0
+     Queue[0] InPackets: 23567131
+     Queue[0] OutPackets: 20070028
+     Queue[0] InJumboPackets: 0
+     Queue[0] InLroPackets: 0
+     Queue[0] InErrors: 0
+     Queue[1] InPackets: 45428967
+     Queue[1] OutPackets: 11306178
+     Queue[1] InJumboPackets: 0
+     Queue[1] InLroPackets: 0
+     Queue[1] InErrors: 0
+     Queue[2] InPackets: 3187011
+     Queue[2] OutPackets: 13080381
+     Queue[2] InJumboPackets: 0
+     Queue[2] InLroPackets: 0
+     Queue[2] InErrors: 0
+     Queue[3] InPackets: 23349136
+     Queue[3] OutPackets: 15046810
+     Queue[3] InJumboPackets: 0
+     Queue[3] InLroPackets: 0
+     Queue[3] InErrors: 0
+
+ Interrupt coalescing support
+ ---------------------------------
+ ITR mode, TX/RX coalescing timings could be viewed with:
+
+ ethtool -c <ethX>
+
+ and changed with:
+
+ ethtool -C <ethX> tx-usecs <usecs> rx-usecs <usecs>
+
+ To disable coalescing:
+
+ ethtool -C <ethX> tx-usecs 0 rx-usecs 0 tx-max-frames 1 tx-max-frames 1
+
+ Wake on LAN support
+ ---------------------------------
+
+ WOL support by magic packet:
+
+ ethtool -s <ethX> wol g
+
+ To disable WOL:
+
+ ethtool -s <ethX> wol d
+
+ Set and check the driver message level
+ ---------------------------------
+
+ Set message level
+
+ ethtool -s <ethX> msglvl <level>
+
+ Level values:
+
+ 0x0001 - general driver status.
+ 0x0002 - hardware probing.
+ 0x0004 - link state.
+ 0x0008 - periodic status check.
+ 0x0010 - interface being brought down.
+ 0x0020 - interface being brought up.
+ 0x0040 - receive error.
+ 0x0080 - transmit error.
+ 0x0200 - interrupt handling.
+ 0x0400 - transmit completion.
+ 0x0800 - receive completion.
+ 0x1000 - packet contents.
+ 0x2000 - hardware status.
+ 0x4000 - Wake-on-LAN status.
+
+ By default, the level of debugging messages is set 0x0001(general driver status).
+
+ Check message level
+
+ ethtool <ethX> | grep "Current message level"
+
+ If you want to disable the output of messages
+
+ ethtool -s <ethX> msglvl 0
+
+ RX flow rules (ntuple filters)
+ ---------------------------------
+ There are separate rules supported, that applies in that order:
+ 1. 16 VLAN ID rules
+ 2. 16 L2 EtherType rules
+ 3. 8 L3/L4 5-Tuple rules
+
+
+ The driver utilizes the ethtool interface for configuring ntuple filters,
+ via "ethtool -N <device> <filter>".
+
+ To enable or disable the RX flow rules:
+
+ ethtool -K ethX ntuple <on|off>
+
+ When disabling ntuple filters, all the user programed filters are
+ flushed from the driver cache and hardware. All needed filters must
+ be re-added when ntuple is re-enabled.
+
+ Because of the fixed order of the rules, the location of filters is also fixed:
+ - Locations 0 - 15 for VLAN ID filters
+ - Locations 16 - 31 for L2 EtherType filters
+ - Locations 32 - 39 for L3/L4 5-tuple filters (locations 32, 36 for IPv6)
+
+ The L3/L4 5-tuple (protocol, source and destination IP address, source and
+ destination TCP/UDP/SCTP port) is compared against 8 filters. For IPv4, up to
+ 8 source and destination addresses can be matched. For IPv6, up to 2 pairs of
+ addresses can be supported. Source and destination ports are only compared for
+ TCP/UDP/SCTP packets.
+
+ To add a filter that directs packet to queue 5, use <-N|-U|--config-nfc|--config-ntuple> switch:
+
+ ethtool -N <ethX> flow-type udp4 src-ip 10.0.0.1 dst-ip 10.0.0.2 src-port 2000 dst-port 2001 action 5 <loc 32>
+
+ - action is the queue number.
+ - loc is the rule number.
+
+ For "flow-type ip4|udp4|tcp4|sctp4|ip6|udp6|tcp6|sctp6" you must set the loc
+ number within 32 - 39.
+ For "flow-type ip4|udp4|tcp4|sctp4|ip6|udp6|tcp6|sctp6" you can set 8 rules
+ for traffic IPv4 or you can set 2 rules for traffic IPv6. Loc number traffic
+ IPv6 is 32 and 36.
+ At the moment you can not use IPv4 and IPv6 filters at the same time.
+
+ Example filter for IPv6 filter traffic:
+
+ sudo ethtool -N <ethX> flow-type tcp6 src-ip 2001:db8:0:f101::1 dst-ip 2001:db8:0:f101::2 action 1 loc 32
+ sudo ethtool -N <ethX> flow-type ip6 src-ip 2001:db8:0:f101::2 dst-ip 2001:db8:0:f101::5 action -1 loc 36
+
+ Example filter for IPv4 filter traffic:
+
+ sudo ethtool -N <ethX> flow-type udp4 src-ip 10.0.0.4 dst-ip 10.0.0.7 src-port 2000 dst-port 2001 loc 32
+ sudo ethtool -N <ethX> flow-type tcp4 src-ip 10.0.0.3 dst-ip 10.0.0.9 src-port 2000 dst-port 2001 loc 33
+ sudo ethtool -N <ethX> flow-type ip4 src-ip 10.0.0.6 dst-ip 10.0.0.4 loc 34
+
+ If you set action -1, then all traffic corresponding to the filter will be discarded.
+ The maximum value action is 31.
+
+
+ The VLAN filter (VLAN id) is compared against 16 filters.
+ VLAN id must be accompanied by mask 0xF000. That is to distinguish VLAN filter
+ from L2 Ethertype filter with UserPriority since both User Priority and VLAN ID
+ are passed in the same 'vlan' parameter.
+
+ To add a filter that directs packets from VLAN 2001 to queue 5:
+ ethtool -N <ethX> flow-type ip4 vlan 2001 m 0xF000 action 1 loc 0
+
+
+ L2 EtherType filters allows filter packet by EtherType field or both EtherType
+ and User Priority (PCP) field of 802.1Q.
+ UserPriority (vlan) parameter must be accompanied by mask 0x1FFF. That is to
+ distinguish VLAN filter from L2 Ethertype filter with UserPriority since both
+ User Priority and VLAN ID are passed in the same 'vlan' parameter.
+
+ To add a filter that directs IP4 packess of priority 3 to queue 3:
+ ethtool -N <ethX> flow-type ether proto 0x800 vlan 0x600 m 0x1FFF action 3 loc 16
+
+
+ To see the list of filters currently present:
+
+ ethtool <-u|-n|--show-nfc|--show-ntuple> <ethX>
+
+ Rules may be deleted from the table itself. This is done using:
+
+ sudo ethtool <-N|-U|--config-nfc|--config-ntuple> <ethX> delete <loc>
+
+ - loc is the rule number to be deleted.
+
+ Rx filters is an interface to load the filter table that funnels all flow
+ into queue 0 unless an alternative queue is specified using "action". In that
+ case, any flow that matches the filter criteria will be directed to the
+ appropriate queue. RX filters is supported on all kernels 2.6.30 and later.
+
+ RSS for UDP
+ ---------------------------------
+ Currently, NIC does not support RSS for fragmented IP packets, which leads to
+ incorrect working of RSS for fragmented UDP traffic. To disable RSS for UDP the
+ RX Flow L3/L4 rule may be used.
+
+ Example:
+ ethtool -N eth0 flow-type udp4 action 0 loc 32
+
+Command Line Parameters
+=======================
+The following command line parameters are available on atlantic driver:
+
+aq_itr -Interrupt throttling mode
+----------------------------------------
+Accepted values: 0, 1, 0xFFFF
+Default value: 0xFFFF
+0      - Disable interrupt throttling.
+1      - Enable interrupt throttling and use specified tx and rx rates.
+0xFFFF - Auto throttling mode. Driver will choose the best RX and TX
+         interrupt throtting settings based on link speed.
+
+aq_itr_tx - TX interrupt throttle rate
+----------------------------------------
+Accepted values: 0 - 0x1FF
+Default value: 0
+TX side throttling in microseconds. Adapter will setup maximum interrupt delay
+to this value. Minimum interrupt delay will be a half of this value
+
+aq_itr_rx - RX interrupt throttle rate
+----------------------------------------
+Accepted values: 0 - 0x1FF
+Default value: 0
+RX side throttling in microseconds. Adapter will setup maximum interrupt delay
+to this value. Minimum interrupt delay will be a half of this value
+
+Note: ITR settings could be changed in runtime by ethtool -c means (see below)
+
+Config file parameters
+=======================
+For some fine tuning and performance optimizations,
+some parameters can be changed in the {source_dir}/aq_cfg.h file.
+
+AQ_CFG_RX_PAGEORDER
+----------------------------------------
+Default value: 0
+RX page order override. Thats a power of 2 number of RX pages allocated for
+each descriptor. Received descriptor size is still limited by AQ_CFG_RX_FRAME_MAX.
+Increasing pageorder makes page reuse better (actual on iommu enabled systems).
+
+AQ_CFG_RX_REFILL_THRES
+----------------------------------------
+Default value: 32
+RX refill threshold. RX path will not refill freed descriptors until the
+specified number of free descriptors is observed. Larger values may help
+better page reuse but may lead to packet drops as well.
+
+AQ_CFG_VECS_DEF
+------------------------------------------------------------
+Number of queues
+Valid Range: 0 - 8 (up to AQ_CFG_VECS_MAX)
+Default value: 8
+Notice this value will be capped by the number of cores available on the system.
+
+AQ_CFG_IS_RSS_DEF
+------------------------------------------------------------
+Enable/disable Receive Side Scaling
+
+This feature allows the adapter to distribute receive processing
+across multiple CPU-cores and to prevent from overloading a single CPU core.
+
+Valid values
+0 - disabled
+1 - enabled
+
+Default value: 1
+
+AQ_CFG_NUM_RSS_QUEUES_DEF
+------------------------------------------------------------
+Number of queues for Receive Side Scaling
+Valid Range: 0 - 8 (up to AQ_CFG_VECS_DEF)
+
+Default value: AQ_CFG_VECS_DEF
+
+AQ_CFG_IS_LRO_DEF
+------------------------------------------------------------
+Enable/disable Large Receive Offload
+
+This offload enables the adapter to coalesce multiple TCP segments and indicate
+them as a single coalesced unit to the OS networking subsystem.
+The system consumes less energy but it also introduces more latency in packets processing.
+
+Valid values
+0 - disabled
+1 - enabled
+
+Default value: 1
+
+AQ_CFG_TX_CLEAN_BUDGET
+----------------------------------------
+Maximum descriptors to cleanup on TX at once.
+Default value: 256
+
+After the aq_cfg.h file changed the driver must be rebuilt to take effect.
+
+Support
+=======
+
+If an issue is identified with the released source code on the supported
+kernel with a supported adapter, email the specific information related
+to the issue to support@aquantia.com
+
+License
+=======
+
+aQuantia Corporation Network Driver
+Copyright(c) 2014 - 2019 aQuantia Corporation.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms and conditions of the GNU General Public License,
+version 2, as published by the Free Software Foundation.
diff --git a/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst b/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst
index 5045df990a4c..17dbee1ac53e 100644
--- a/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst
+++ b/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst
@@ -39,8 +39,7 @@ The Linux DPIO driver consists of 3 primary components--
 
    DPIO service-- provides APIs to other Linux drivers for services
 
-   QBman portal interface-- sends portal commands, gets responses
-::
+   QBman portal interface-- sends portal commands, gets responses::
 
           fsl-mc          other
            bus           drivers
@@ -60,6 +59,7 @@ The Linux DPIO driver consists of 3 primary components--
 
 The diagram below shows how the DPIO driver components fit with the other
 DPAA2 Linux driver components::
+
                                                    +------------+
                                                    | OS Network |
                                                    |   Stack    |
diff --git a/Documentation/networking/device_drivers/google/gve.rst b/Documentation/networking/device_drivers/google/gve.rst
new file mode 100644
index 000000000000..793693cef6e3
--- /dev/null
+++ b/Documentation/networking/device_drivers/google/gve.rst
@@ -0,0 +1,123 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+==============================================================
+Linux kernel driver for Compute Engine Virtual Ethernet (gve):
+==============================================================
+
+Supported Hardware
+===================
+The GVE driver binds to a single PCI device id used by the virtual
+Ethernet device found in some Compute Engine VMs.
+
++--------------+----------+---------+
+|Field         | Value    | Comments|
++==============+==========+=========+
+|Vendor ID     | `0x1AE0` | Google  |
++--------------+----------+---------+
+|Device ID     | `0x0042` |         |
++--------------+----------+---------+
+|Sub-vendor ID | `0x1AE0` | Google  |
++--------------+----------+---------+
+|Sub-device ID | `0x0058` |         |
++--------------+----------+---------+
+|Revision ID   | `0x0`    |         |
++--------------+----------+---------+
+|Device Class  | `0x200`  | Ethernet|
++--------------+----------+---------+
+
+PCI Bars
+========
+The gVNIC PCI device exposes three 32-bit memory BARS:
+- Bar0 - Device configuration and status registers.
+- Bar1 - MSI-X vector table
+- Bar2 - IRQ, RX and TX doorbells
+
+Device Interactions
+===================
+The driver interacts with the device in the following ways:
+ - Registers
+    - A block of MMIO registers
+    - See gve_register.h for more detail
+ - Admin Queue
+    - See description below
+ - Reset
+    - At any time the device can be reset
+ - Interrupts
+    - See supported interrupts below
+ - Transmit and Receive Queues
+    - See description below
+
+Registers
+---------
+All registers are MMIO and big endian.
+
+The registers are used for initializing and configuring the device as well as
+querying device status in response to management interrupts.
+
+Admin Queue (AQ)
+----------------
+The Admin Queue is a PAGE_SIZE memory block, treated as an array of AQ
+commands, used by the driver to issue commands to the device and set up
+resources.The driver and the device maintain a count of how many commands
+have been submitted and executed. To issue AQ commands, the driver must do
+the following (with proper locking):
+
+1)  Copy new commands into next available slots in the AQ array
+2)  Increment its counter by he number of new commands
+3)  Write the counter into the GVE_ADMIN_QUEUE_DOORBELL register
+4)  Poll the ADMIN_QUEUE_EVENT_COUNTER register until it equals
+    the value written to the doorbell, or until a timeout.
+
+The device will update the status field in each AQ command reported as
+executed through the ADMIN_QUEUE_EVENT_COUNTER register.
+
+Device Resets
+-------------
+A device reset is triggered by writing 0x0 to the AQ PFN register.
+This causes the device to release all resources allocated by the
+driver, including the AQ itself.
+
+Interrupts
+----------
+The following interrupts are supported by the driver:
+
+Management Interrupt
+~~~~~~~~~~~~~~~~~~~~
+The management interrupt is used by the device to tell the driver to
+look at the GVE_DEVICE_STATUS register.
+
+The handler for the management irq simply queues the service task in
+the workqueue to check the register and acks the irq.
+
+Notification Block Interrupts
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The notification block interrupts are used to tell the driver to poll
+the queues associated with that interrupt.
+
+The handler for these irqs schedule the napi for that block to run
+and poll the queues.
+
+Traffic Queues
+--------------
+gVNIC's queues are composed of a descriptor ring and a buffer and are
+assigned to a notification block.
+
+The descriptor rings are power-of-two-sized ring buffers consisting of
+fixed-size descriptors. They advance their head pointer using a __be32
+doorbell located in Bar2. The tail pointers are advanced by consuming
+descriptors in-order and updating a __be32 counter. Both the doorbell
+and the counter overflow to zero.
+
+Each queue's buffers must be registered in advance with the device as a
+queue page list, and packet data can only be put in those pages.
+
+Transmit
+~~~~~~~~
+gve maps the buffers for transmit rings into a FIFO and copies the packets
+into the FIFO before sending them to the NIC.
+
+Receive
+~~~~~~~
+The buffers for receive rings are put into a data ring that is the same
+length as the descriptor ring and the head and tail pointers advance over
+the rings together.
diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst
new file mode 100644
index 000000000000..2b7fefe72351
--- /dev/null
+++ b/Documentation/networking/device_drivers/index.rst
@@ -0,0 +1,32 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+Vendor Device Drivers
+=====================
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+
+   freescale/dpaa2/index
+   intel/e100
+   intel/e1000
+   intel/e1000e
+   intel/fm10k
+   intel/igb
+   intel/igbvf
+   intel/ixgb
+   intel/ixgbe
+   intel/ixgbevf
+   intel/i40e
+   intel/iavf
+   intel/ice
+   google/gve
+   mellanox/mlx5
+
+.. only::  subproject
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/networking/device_drivers/mellanox/mlx5.rst b/Documentation/networking/device_drivers/mellanox/mlx5.rst
new file mode 100644
index 000000000000..214325897732
--- /dev/null
+++ b/Documentation/networking/device_drivers/mellanox/mlx5.rst
@@ -0,0 +1,192 @@
+.. SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+
+=================================================
+Mellanox ConnectX(R) mlx5 core VPI Network Driver
+=================================================
+
+Copyright (c) 2019, Mellanox Technologies LTD.
+
+Contents
+========
+
+- `Enabling the driver and kconfig options`_
+- `Devlink info`_
+- `Devlink health reporters`_
+
+Enabling the driver and kconfig options
+================================================
+
+| mlx5 core is modular and most of the major mlx5 core driver features can be selected (compiled in/out)
+| at build time via kernel Kconfig flags.
+| Basic features, ethernet net device rx/tx offloads and XDP, are available with the most basic flags
+| CONFIG_MLX5_CORE=y/m and CONFIG_MLX5_CORE_EN=y.
+| For the list of advanced features please see below.
+
+**CONFIG_MLX5_CORE=(y/m/n)** (module mlx5_core.ko)
+
+|    The driver can be enabled by choosing CONFIG_MLX5_CORE=y/m in kernel config.
+|    This will provide mlx5 core driver for mlx5 ulps to interface with (mlx5e, mlx5_ib).
+
+
+**CONFIG_MLX5_CORE_EN=(y/n)**
+
+|    Choosing this option will allow basic ethernet netdevice support with all of the standard rx/tx offloads.
+|    mlx5e is the mlx5 ulp driver which provides netdevice kernel interface, when chosen, mlx5e will be
+|    built-in into mlx5_core.ko.
+
+
+**CONFIG_MLX5_EN_ARFS=(y/n)**
+
+|     Enables Hardware-accelerated receive flow steering (arfs) support, and ntuple filtering.
+|     https://community.mellanox.com/s/article/howto-configure-arfs-on-connectx-4
+
+
+**CONFIG_MLX5_EN_RXNFC=(y/n)**
+
+|    Enables ethtool receive network flow classification, which allows user defined
+|    flow rules to direct traffic into arbitrary rx queue via ethtool set/get_rxnfc API.
+
+
+**CONFIG_MLX5_CORE_EN_DCB=(y/n)**:
+
+|    Enables `Data Center Bridging (DCB) Support <https://community.mellanox.com/s/article/howto-auto-config-pfc-and-ets-on-connectx-4-via-lldp-dcbx>`_.
+
+
+**CONFIG_MLX5_MPFS=(y/n)**
+
+|    Ethernet Multi-Physical Function Switch (MPFS) support in ConnectX NIC.
+|    MPFs is required for when `Multi-Host <http://www.mellanox.com/page/multihost>`_ configuration is enabled to allow passing
+|    user configured unicast MAC addresses to the requesting PF.
+
+
+**CONFIG_MLX5_ESWITCH=(y/n)**
+
+|    Ethernet SRIOV E-Switch support in ConnectX NIC. E-Switch provides internal SRIOV packet steering
+|    and switching for the enabled VFs and PF in two available modes:
+|           1) `Legacy SRIOV mode (L2 mac vlan steering based) <https://community.mellanox.com/s/article/howto-configure-sr-iov-for-connectx-4-connectx-5-with-kvm--ethernet-x>`_.
+|           2) `Switchdev mode (eswitch offloads) <https://www.mellanox.com/related-docs/prod_software/ASAP2_Hardware_Offloading_for_vSwitches_User_Manual_v4.4.pdf>`_.
+
+
+**CONFIG_MLX5_CORE_IPOIB=(y/n)**
+
+|    IPoIB offloads & acceleration support.
+|    Requires CONFIG_MLX5_CORE_EN to provide an accelerated interface for the rdma
+|    IPoIB ulp netdevice.
+
+
+**CONFIG_MLX5_FPGA=(y/n)**
+
+|    Build support for the Innova family of network cards by Mellanox Technologies.
+|    Innova network cards are comprised of a ConnectX chip and an FPGA chip on one board.
+|    If you select this option, the mlx5_core driver will include the Innova FPGA core and allow
+|    building sandbox-specific client drivers.
+
+
+**CONFIG_MLX5_EN_IPSEC=(y/n)**
+
+|    Enables `IPSec XFRM cryptography-offload accelaration <http://www.mellanox.com/related-docs/prod_software/Mellanox_Innova_IPsec_Ethernet_Adapter_Card_User_Manual.pdf>`_.
+
+**CONFIG_MLX5_EN_TLS=(y/n)**
+
+|   TLS cryptography-offload accelaration.
+
+
+**CONFIG_MLX5_INFINIBAND=(y/n/m)** (module mlx5_ib.ko)
+
+|   Provides low-level InfiniBand/RDMA and `RoCE <https://community.mellanox.com/s/article/recommended-network-configuration-examples-for-roce-deployment>`_ support.
+
+
+**External options** ( Choose if the corresponding mlx5 feature is required )
+
+- CONFIG_PTP_1588_CLOCK: When chosen, mlx5 ptp support will be enabled
+- CONFIG_VXLAN: When chosen, mlx5 vxaln support will be enabled.
+- CONFIG_MLXFW: When chosen, mlx5 firmware flashing support will be enabled (via devlink and ethtool).
+
+Devlink info
+============
+
+The devlink info reports the running and stored firmware versions on device.
+It also prints the device PSID which represents the HCA board type ID.
+
+User command example::
+
+   $ devlink dev info pci/0000:00:06.0
+      pci/0000:00:06.0:
+      driver mlx5_core
+      versions:
+         fixed:
+            fw.psid MT_0000000009
+         running:
+            fw.version 16.26.0100
+         stored:
+            fw.version 16.26.0100
+
+Devlink health reporters
+========================
+
+tx reporter
+-----------
+The tx reporter is responsible of two error scenarios:
+
+- TX timeout
+    Report on kernel tx timeout detection.
+    Recover by searching lost interrupts.
+- TX error completion
+    Report on error tx completion.
+    Recover by flushing the TX queue and reset it.
+
+TX reporter also support Diagnose callback, on which it provides
+real time information of its send queues status.
+
+User commands examples:
+
+- Diagnose send queues status::
+
+    $ devlink health diagnose pci/0000:82:00.0 reporter tx
+
+- Show number of tx errors indicated, number of recover flows ended successfully,
+  is autorecover enabled and graceful period from last recover::
+
+    $ devlink health show pci/0000:82:00.0 reporter tx
+
+fw reporter
+-----------
+The fw reporter implements diagnose and dump callbacks.
+It follows symptoms of fw error such as fw syndrome by triggering
+fw core dump and storing it into the dump buffer.
+The fw reporter diagnose command can be triggered any time by the user to check
+current fw status.
+
+User commands examples:
+
+- Check fw heath status::
+
+    $ devlink health diagnose pci/0000:82:00.0 reporter fw
+
+- Read FW core dump if already stored or trigger new one::
+
+    $ devlink health dump show pci/0000:82:00.0 reporter fw
+
+NOTE: This command can run only on the PF which has fw tracer ownership,
+running it on other PF or any VF will return "Operation not permitted".
+
+fw fatal reporter
+-----------------
+The fw fatal reporter implements dump and recover callbacks.
+It follows fatal errors indications by CR-space dump and recover flow.
+The CR-space dump uses vsc interface which is valid even if the FW command
+interface is not functional, which is the case in most FW fatal errors.
+The recover function runs recover flow which reloads the driver and triggers fw
+reset if needed.
+
+User commands examples:
+
+- Run fw recover flow manually::
+
+    $ devlink health recover pci/0000:82:00.0 reporter fw_fatal
+
+- Read FW CR-space dump if already strored or trigger new one::
+
+    $ devlink health dump show pci/0000:82:00.1 reporter fw_fatal
+
+NOTE: This command can run only on PF.
diff --git a/Documentation/networking/dsa/b53.rst b/Documentation/networking/dsa/b53.rst
new file mode 100644
index 000000000000..b41637cdb82b
--- /dev/null
+++ b/Documentation/networking/dsa/b53.rst
@@ -0,0 +1,183 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================================
+Broadcom RoboSwitch Ethernet switch driver
+==========================================
+
+The Broadcom RoboSwitch Ethernet switch family is used in quite a range of
+xDSL router, cable modems and other multimedia devices.
+
+The actual implementation supports the devices BCM5325E, BCM5365, BCM539x,
+BCM53115 and BCM53125 as well as BCM63XX.
+
+Implementation details
+======================
+
+The driver is located in ``drivers/net/dsa/b53/`` and is implemented as a
+DSA driver; see ``Documentation/networking/dsa/dsa.rst`` for details on the
+subsystem and what it provides.
+
+The switch is, if possible, configured to enable a Broadcom specific 4-bytes
+switch tag which gets inserted by the switch for every packet forwarded to the
+CPU interface, conversely, the CPU network interface should insert a similar
+tag for packets entering the CPU port. The tag format is described in
+``net/dsa/tag_brcm.c``.
+
+The configuration of the device depends on whether or not tagging is
+supported.
+
+The interface names and example network configuration are used according the
+configuration described in the :ref:`dsa-config-showcases`.
+
+Configuration with tagging support
+----------------------------------
+
+The tagging based configuration is desired. It is not specific to the b53
+DSA driver and will work like all DSA drivers which supports tagging.
+
+See :ref:`dsa-tagged-configuration`.
+
+Configuration without tagging support
+-------------------------------------
+
+Older models (5325, 5365) support a different tag format that is not supported
+yet. 539x and 531x5 require managed mode and some special handling, which is
+also not yet supported. The tagging support is disabled in these cases and the
+switch need a different configuration.
+
+The configuration slightly differ from the :ref:`dsa-vlan-configuration`.
+
+The b53 tags the CPU port in all VLANs, since otherwise any PVID untagged
+VLAN programming would basically change the CPU port's default PVID and make
+it untagged, undesirable.
+
+In difference to the configuration described in :ref:`dsa-vlan-configuration`
+the default VLAN 1 has to be removed from the slave interface configuration in
+single port and gateway configuration, while there is no need to add an extra
+VLAN configuration in the bridge showcase.
+
+single port
+~~~~~~~~~~~
+The configuration can only be set up via VLAN tagging and bridge setup.
+By default packages are tagged with vid 1:
+
+.. code-block:: sh
+
+  # tag traffic on CPU port
+  ip link add link eth0 name eth0.1 type vlan id 1
+  ip link add link eth0 name eth0.2 type vlan id 2
+  ip link add link eth0 name eth0.3 type vlan id 3
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+  ip link set eth0.1 up
+  ip link set eth0.2 up
+  ip link set eth0.3 up
+
+  # bring up the slave interfaces
+  ip link set wan up
+  ip link set lan1 up
+  ip link set lan2 up
+
+  # create bridge
+  ip link add name br0 type bridge
+
+  # activate VLAN filtering
+  ip link set dev br0 type bridge vlan_filtering 1
+
+  # add ports to bridges
+  ip link set dev wan master br0
+  ip link set dev lan1 master br0
+  ip link set dev lan2 master br0
+
+  # tag traffic on ports
+  bridge vlan add dev lan1 vid 2 pvid untagged
+  bridge vlan del dev lan1 vid 1
+  bridge vlan add dev lan2 vid 3 pvid untagged
+  bridge vlan del dev lan2 vid 1
+
+  # configure the VLANs
+  ip addr add 192.0.2.1/30 dev eth0.1
+  ip addr add 192.0.2.5/30 dev eth0.2
+  ip addr add 192.0.2.9/30 dev eth0.3
+
+  # bring up the bridge devices
+  ip link set br0 up
+
+
+bridge
+~~~~~~
+
+.. code-block:: sh
+
+  # tag traffic on CPU port
+  ip link add link eth0 name eth0.1 type vlan id 1
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+  ip link set eth0.1 up
+
+  # bring up the slave interfaces
+  ip link set wan up
+  ip link set lan1 up
+  ip link set lan2 up
+
+  # create bridge
+  ip link add name br0 type bridge
+
+  # activate VLAN filtering
+  ip link set dev br0 type bridge vlan_filtering 1
+
+  # add ports to bridge
+  ip link set dev wan master br0
+  ip link set dev lan1 master br0
+  ip link set dev lan2 master br0
+  ip link set eth0.1 master br0
+
+  # configure the bridge
+  ip addr add 192.0.2.129/25 dev br0
+
+  # bring up the bridge
+  ip link set dev br0 up
+
+gateway
+~~~~~~~
+
+.. code-block:: sh
+
+  # tag traffic on CPU port
+  ip link add link eth0 name eth0.1 type vlan id 1
+  ip link add link eth0 name eth0.2 type vlan id 2
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+  ip link set eth0.1 up
+  ip link set eth0.2 up
+
+  # bring up the slave interfaces
+  ip link set wan up
+  ip link set lan1 up
+  ip link set lan2 up
+
+  # create bridge
+  ip link add name br0 type bridge
+
+  # activate VLAN filtering
+  ip link set dev br0 type bridge vlan_filtering 1
+
+  # add ports to bridges
+  ip link set dev wan master br0
+  ip link set eth0.1 master br0
+  ip link set dev lan1 master br0
+  ip link set dev lan2 master br0
+
+  # tag traffic on ports
+  bridge vlan add dev wan vid 2 pvid untagged
+  bridge vlan del dev wan vid 1
+
+  # configure the VLANs
+  ip addr add 192.0.2.1/30 dev eth0.2
+  ip addr add 192.0.2.129/25 dev br0
+
+  # bring up the bridge devices
+  ip link set br0 up
diff --git a/Documentation/networking/dsa/configuration.rst b/Documentation/networking/dsa/configuration.rst
new file mode 100644
index 000000000000..af029b3ca2ab
--- /dev/null
+++ b/Documentation/networking/dsa/configuration.rst
@@ -0,0 +1,292 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================================
+DSA switch configuration from userspace
+=======================================
+
+The DSA switch configuration is not integrated into the main userspace
+network configuration suites by now and has to be performed manualy.
+
+.. _dsa-config-showcases:
+
+Configuration showcases
+-----------------------
+
+To configure a DSA switch a couple of commands need to be executed. In this
+documentation some common configuration scenarios are handled as showcases:
+
+*single port*
+  Every switch port acts as a different configurable Ethernet port
+
+*bridge*
+  Every switch port is part of one configurable Ethernet bridge
+
+*gateway*
+  Every switch port except one upstream port is part of a configurable
+  Ethernet bridge.
+  The upstream port acts as different configurable Ethernet port.
+
+All configurations are performed with tools from iproute2, which is available
+at https://www.kernel.org/pub/linux/utils/net/iproute2/
+
+Through DSA every port of a switch is handled like a normal linux Ethernet
+interface. The CPU port is the switch port connected to an Ethernet MAC chip.
+The corresponding linux Ethernet interface is called the master interface.
+All other corresponding linux interfaces are called slave interfaces.
+
+The slave interfaces depend on the master interface. They can only brought up,
+when the master interface is up.
+
+In this documentation the following Ethernet interfaces are used:
+
+*eth0*
+  the master interface
+
+*lan1*
+  a slave interface
+
+*lan2*
+  another slave interface
+
+*lan3*
+  a third slave interface
+
+*wan*
+  A slave interface dedicated for upstream traffic
+
+Further Ethernet interfaces can be configured similar.
+The configured IPs and networks are:
+
+*single port*
+  * lan1: 192.0.2.1/30 (192.0.2.0 - 192.0.2.3)
+  * lan2: 192.0.2.5/30 (192.0.2.4 - 192.0.2.7)
+  * lan3: 192.0.2.9/30 (192.0.2.8 - 192.0.2.11)
+
+*bridge*
+  * br0: 192.0.2.129/25 (192.0.2.128 - 192.0.2.255)
+
+*gateway*
+  * br0: 192.0.2.129/25 (192.0.2.128 - 192.0.2.255)
+  * wan: 192.0.2.1/30 (192.0.2.0 - 192.0.2.3)
+
+.. _dsa-tagged-configuration:
+
+Configuration with tagging support
+----------------------------------
+
+The tagging based configuration is desired and supported by the majority of
+DSA switches. These switches are capable to tag incoming and outgoing traffic
+without using a VLAN based configuration.
+
+single port
+~~~~~~~~~~~
+
+.. code-block:: sh
+
+  # configure each interface
+  ip addr add 192.0.2.1/30 dev lan1
+  ip addr add 192.0.2.5/30 dev lan2
+  ip addr add 192.0.2.9/30 dev lan3
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+
+  # bring up the slave interfaces
+  ip link set lan1 up
+  ip link set lan2 up
+  ip link set lan3 up
+
+bridge
+~~~~~~
+
+.. code-block:: sh
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+
+  # bring up the slave interfaces
+  ip link set lan1 up
+  ip link set lan2 up
+  ip link set lan3 up
+
+  # create bridge
+  ip link add name br0 type bridge
+
+  # add ports to bridge
+  ip link set dev lan1 master br0
+  ip link set dev lan2 master br0
+  ip link set dev lan3 master br0
+
+  # configure the bridge
+  ip addr add 192.0.2.129/25 dev br0
+
+  # bring up the bridge
+  ip link set dev br0 up
+
+gateway
+~~~~~~~
+
+.. code-block:: sh
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+
+  # bring up the slave interfaces
+  ip link set wan up
+  ip link set lan1 up
+  ip link set lan2 up
+
+  # configure the upstream port
+  ip addr add 192.0.2.1/30 dev wan
+
+  # create bridge
+  ip link add name br0 type bridge
+
+  # add ports to bridge
+  ip link set dev lan1 master br0
+  ip link set dev lan2 master br0
+
+  # configure the bridge
+  ip addr add 192.0.2.129/25 dev br0
+
+  # bring up the bridge
+  ip link set dev br0 up
+
+.. _dsa-vlan-configuration:
+
+Configuration without tagging support
+-------------------------------------
+
+A minority of switches are not capable to use a taging protocol
+(DSA_TAG_PROTO_NONE). These switches can be configured by a VLAN based
+configuration.
+
+single port
+~~~~~~~~~~~
+The configuration can only be set up via VLAN tagging and bridge setup.
+
+.. code-block:: sh
+
+  # tag traffic on CPU port
+  ip link add link eth0 name eth0.1 type vlan id 1
+  ip link add link eth0 name eth0.2 type vlan id 2
+  ip link add link eth0 name eth0.3 type vlan id 3
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+  ip link set eth0.1 up
+  ip link set eth0.2 up
+  ip link set eth0.3 up
+
+  # bring up the slave interfaces
+  ip link set lan1 up
+  ip link set lan1 up
+  ip link set lan3 up
+
+  # create bridge
+  ip link add name br0 type bridge
+
+  # activate VLAN filtering
+  ip link set dev br0 type bridge vlan_filtering 1
+
+  # add ports to bridges
+  ip link set dev lan1 master br0
+  ip link set dev lan2 master br0
+  ip link set dev lan3 master br0
+
+  # tag traffic on ports
+  bridge vlan add dev lan1 vid 1 pvid untagged
+  bridge vlan add dev lan2 vid 2 pvid untagged
+  bridge vlan add dev lan3 vid 3 pvid untagged
+
+  # configure the VLANs
+  ip addr add 192.0.2.1/30 dev eth0.1
+  ip addr add 192.0.2.5/30 dev eth0.2
+  ip addr add 192.0.2.9/30 dev eth0.3
+
+  # bring up the bridge devices
+  ip link set br0 up
+
+
+bridge
+~~~~~~
+
+.. code-block:: sh
+
+  # tag traffic on CPU port
+  ip link add link eth0 name eth0.1 type vlan id 1
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+  ip link set eth0.1 up
+
+  # bring up the slave interfaces
+  ip link set lan1 up
+  ip link set lan2 up
+  ip link set lan3 up
+
+  # create bridge
+  ip link add name br0 type bridge
+
+  # activate VLAN filtering
+  ip link set dev br0 type bridge vlan_filtering 1
+
+  # add ports to bridge
+  ip link set dev lan1 master br0
+  ip link set dev lan2 master br0
+  ip link set dev lan3 master br0
+  ip link set eth0.1 master br0
+
+  # tag traffic on ports
+  bridge vlan add dev lan1 vid 1 pvid untagged
+  bridge vlan add dev lan2 vid 1 pvid untagged
+  bridge vlan add dev lan3 vid 1 pvid untagged
+
+  # configure the bridge
+  ip addr add 192.0.2.129/25 dev br0
+
+  # bring up the bridge
+  ip link set dev br0 up
+
+gateway
+~~~~~~~
+
+.. code-block:: sh
+
+  # tag traffic on CPU port
+  ip link add link eth0 name eth0.1 type vlan id 1
+  ip link add link eth0 name eth0.2 type vlan id 2
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+  ip link set eth0.1 up
+  ip link set eth0.2 up
+
+  # bring up the slave interfaces
+  ip link set wan up
+  ip link set lan1 up
+  ip link set lan2 up
+
+  # create bridge
+  ip link add name br0 type bridge
+
+  # activate VLAN filtering
+  ip link set dev br0 type bridge vlan_filtering 1
+
+  # add ports to bridges
+  ip link set dev wan master br0
+  ip link set eth0.1 master br0
+  ip link set dev lan1 master br0
+  ip link set dev lan2 master br0
+
+  # tag traffic on ports
+  bridge vlan add dev lan1 vid 1 pvid untagged
+  bridge vlan add dev lan2 vid 1 pvid untagged
+  bridge vlan add dev wan vid 2 pvid untagged
+
+  # configure the VLANs
+  ip addr add 192.0.2.1/30 dev eth0.2
+  ip addr add 192.0.2.129/25 dev br0
+
+  # bring up the bridge devices
+  ip link set br0 up
diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst
index ca87068b9ab9..563d56c6a25c 100644
--- a/Documentation/networking/dsa/dsa.rst
+++ b/Documentation/networking/dsa/dsa.rst
@@ -531,7 +531,7 @@ Bridge VLAN filtering
   a software implementation.
 
 .. note:: VLAN ID 0 corresponds to the port private database, which, in the context
-        of DSA, would be the its port-based VLAN, used by the associated bridge device.
+        of DSA, would be its port-based VLAN, used by the associated bridge device.
 
 - ``port_fdb_del``: bridge layer function invoked when the bridge wants to remove a
   Forwarding Database entry, the switch hardware should be programmed to delete
@@ -554,7 +554,7 @@ Bridge VLAN filtering
   associated with this VLAN ID.
 
 .. note:: VLAN ID 0 corresponds to the port private database, which, in the context
-        of DSA, would be the its port-based VLAN, used by the associated bridge device.
+        of DSA, would be its port-based VLAN, used by the associated bridge device.
 
 - ``port_mdb_del``: bridge layer function invoked when the bridge wants to remove a
   multicast database entry, the switch hardware should be programmed to delete
diff --git a/Documentation/networking/dsa/index.rst b/Documentation/networking/dsa/index.rst
index 0e5b7a9be406..ee631e2d646f 100644
--- a/Documentation/networking/dsa/index.rst
+++ b/Documentation/networking/dsa/index.rst
@@ -6,6 +6,8 @@ Distributed Switch Architecture
    :maxdepth: 1
 
    dsa
+   b53
    bcm_sf2
    lan9303
    sja1105
+   configuration
diff --git a/Documentation/networking/dsa/sja1105.rst b/Documentation/networking/dsa/sja1105.rst
index ea7bac438cfd..cb2858dece93 100644
--- a/Documentation/networking/dsa/sja1105.rst
+++ b/Documentation/networking/dsa/sja1105.rst
@@ -86,13 +86,13 @@ functionality.
 The following traffic modes are supported over the switch netdevices:
 
 +--------------------+------------+------------------+------------------+
-|                    | Standalone |   Bridged with   |   Bridged with   |
-|                    |    ports   | vlan_filtering 0 | vlan_filtering 1 |
+|                    | Standalone | Bridged with     | Bridged with     |
+|                    | ports      | vlan_filtering 0 | vlan_filtering 1 |
 +====================+============+==================+==================+
 | Regular traffic    |     Yes    |       Yes        |  No (use master) |
 +--------------------+------------+------------------+------------------+
 | Management traffic |     Yes    |       Yes        |       Yes        |
-|    (BPDU, PTP)     |            |                  |                  |
+| (BPDU, PTP)        |            |                  |                  |
 +--------------------+------------+------------------+------------------+
 
 Switching features
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index f390fe3cfdfb..a46fca264bee 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -11,19 +11,7 @@ Contents:
    batman-adv
    can
    can_ucan_protocol
-   device_drivers/freescale/dpaa2/index
-   device_drivers/intel/e100
-   device_drivers/intel/e1000
-   device_drivers/intel/e1000e
-   device_drivers/intel/fm10k
-   device_drivers/intel/igb
-   device_drivers/intel/igbvf
-   device_drivers/intel/ixgb
-   device_drivers/intel/ixgbe
-   device_drivers/intel/ixgbevf
-   device_drivers/intel/i40e
-   device_drivers/intel/iavf
-   device_drivers/intel/ice
+   device_drivers/index
    dsa/index
    devlink-info-versions
    ieee802154
@@ -40,6 +28,8 @@ Contents:
    checksum-offloads
    segmentation-offloads
    scaling
+   tls
+   tls-offload
 
 .. only::  subproject
 
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 725b8bea58a7..df33674799b5 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -80,6 +80,7 @@ fib_multipath_hash_policy - INTEGER
 	Possible values:
 	0 - Layer 3
 	1 - Layer 4
+	2 - Layer 3 or inner Layer 3 if present
 
 fib_sync_mem - UNSIGNED INTEGER
 	Amount of dirty memory from fib entries that can be backlogged before
@@ -255,6 +256,14 @@ tcp_base_mss - INTEGER
 	Path MTU discovery (MTU probing).  If MTU probing is enabled,
 	this is the initial MSS used by the connection.
 
+tcp_min_snd_mss - INTEGER
+	TCP SYN and SYNACK messages usually advertise an ADVMSS option,
+	as described in RFC 1122 and RFC 6691.
+	If this ADVMSS option is smaller than tcp_min_snd_mss,
+	it is silently capped to tcp_min_snd_mss.
+
+	Default : 48 (at least 8 bytes of payload per segment)
+
 tcp_congestion_control - STRING
 	Set the congestion control algorithm to be used for new
 	connections. The algorithm "reno" is always available, but
@@ -560,10 +569,10 @@ tcp_comp_sack_delay_ns - LONG INTEGER
 	Default : 1,000,000 ns (1 ms)
 
 tcp_comp_sack_nr - INTEGER
-	Max numer of SACK that can be compressed.
+	Max number of SACK that can be compressed.
 	Using 0 disables SACK compression.
 
-	Detault : 44
+	Default : 44
 
 tcp_slow_start_after_idle - BOOLEAN
 	If set, provide RFC2861 behavior and time out the congestion
@@ -648,6 +657,26 @@ tcp_fastopen_blackhole_timeout_sec - INTEGER
 	0 to disable the blackhole detection.
 	By default, it is set to 1hr.
 
+tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs
+	The list consists of a primary key and an optional backup key. The
+	primary key is used for both creating and validating cookies, while the
+	optional backup key is only used for validating cookies. The purpose of
+	the backup key is to maximize TFO validation when keys are rotated.
+
+	A randomly chosen primary key may be configured by the kernel if
+	the tcp_fastopen sysctl is set to 0x400 (see above), or if the
+	TCP_FASTOPEN setsockopt() optname is set and a key has not been
+	previously configured via sysctl. If keys are configured via
+	setsockopt() by using the TCP_FASTOPEN_KEY optname, then those
+	per-socket keys will be used instead of any keys that are specified via
+	sysctl.
+
+	A key is specified as 4 8-digit hexadecimal integers which are separated
+	by a '-' as: xxxxxxxx-xxxxxxxx-xxxxxxxx-xxxxxxxx. Leading zeros may be
+	omitted. A primary and a backup key may be specified by separating them
+	by a comma. If only one key is specified, it becomes the primary key and
+	any previously configured backup keys are removed.
+
 tcp_syn_retries - INTEGER
 	Number of times initial SYNs for an active TCP connection attempt
 	will be retransmitted. Should not be higher than 127. Default value
@@ -772,6 +801,14 @@ tcp_challenge_ack_limit - INTEGER
 	in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
 	Default: 100
 
+tcp_rx_skb_cache - BOOLEAN
+	Controls a per TCP socket cache of one skb, that might help
+	performance of some workloads. This might be dangerous
+	on systems with a lot of TCP sockets, since it increases
+	memory usage.
+
+	Default: 0 (disabled)
+
 UDP variables:
 
 udp_l3mdev_accept - BOOLEAN
@@ -1409,14 +1446,26 @@ flowlabel_state_ranges - BOOLEAN
 	FALSE: disabled
 	Default: true
 
-flowlabel_reflect - BOOLEAN
-	Automatically reflect the flow label. Needed for Path MTU
+flowlabel_reflect - INTEGER
+	Control flow label reflection. Needed for Path MTU
 	Discovery to work with Equal Cost Multipath Routing in anycast
 	environments. See RFC 7690 and:
 	https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01
-	TRUE: enabled
-	FALSE: disabled
-	Default: FALSE
+
+	This is a bitmask.
+	1: enabled for established flows
+
+	Note that this prevents automatic flowlabel changes, as done
+	in "tcp: change IPv6 flow-label upon receiving spurious retransmission"
+	and "tcp: Change txhash on every SYN and RTO retransmit"
+
+	2: enabled for TCP RESET packets (no active listener)
+	If set, a RST packet sent in response to a SYN packet on a closed
+	port will reflect the incoming flow label.
+
+	4: enabled for ICMPv6 echo reply messages.
+
+	Default: 0
 
 fib_multipath_hash_policy - INTEGER
 	Controls which hash policy to use for multipath routes.
@@ -1424,6 +1473,7 @@ fib_multipath_hash_policy - INTEGER
 	Possible values:
 	0 - Layer 3 (source and destination addresses plus flow label)
 	1 - Layer 4 (standard 5-tuple)
+	2 - Layer 3 or inner Layer 3 if present
 
 anycast_src_echo_reply - BOOLEAN
 	Controls the use of anycast addresses as source addresses for ICMPv6
@@ -2237,7 +2287,7 @@ addr_scope_policy - INTEGER
 
 
 /proc/sys/net/core/*
-	Please see: Documentation/sysctl/net.txt for descriptions of these entries.
+	Please see: Documentation/admin-guide/sysctl/net.rst for descriptions of these entries.
 
 
 /proc/sys/net/unix/*
diff --git a/Documentation/networking/mpls-sysctl.txt b/Documentation/networking/mpls-sysctl.txt
index 2f24a1912a48..025cc9b96992 100644
--- a/Documentation/networking/mpls-sysctl.txt
+++ b/Documentation/networking/mpls-sysctl.txt
@@ -30,7 +30,7 @@ ip_ttl_propagate - BOOL
 	0 - disabled / RFC 3443 [Short] Pipe Model
 	1 - enabled / RFC 3443 Uniform Model (default)
 
-default_ttl - BOOL
+default_ttl - INTEGER
 	Default TTL value to use for MPLS packets where it cannot be
 	propagated from an IP header, either because one isn't present
 	or ip_ttl_propagate has been disabled.
diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst
index 0dd90d7df5ec..a689966bc4be 100644
--- a/Documentation/networking/phy.rst
+++ b/Documentation/networking/phy.rst
@@ -202,7 +202,8 @@ the PHY/controller, of which the PHY needs to be aware.
 
 *interface* is a u32 which specifies the connection type used
 between the controller and the PHY.  Examples are GMII, MII,
-RGMII, and SGMII.  For a full list, see include/linux/phy.h
+RGMII, and SGMII.  See "PHY interface mode" below.  For a full
+list, see include/linux/phy.h
 
 Now just make sure that phydev->supported and phydev->advertising have any
 values pruned from them which don't make sense for your controller (a 10/100
@@ -225,6 +226,48 @@ When you want to disconnect from the network (even if just briefly), you call
 phy_stop(phydev). This function also stops the phylib state machine and
 disables PHY interrupts.
 
+PHY interface modes
+===================
+
+The PHY interface mode supplied in the phy_connect() family of functions
+defines the initial operating mode of the PHY interface.  This is not
+guaranteed to remain constant; there are PHYs which dynamically change
+their interface mode without software interaction depending on the
+negotiation results.
+
+Some of the interface modes are described below:
+
+``PHY_INTERFACE_MODE_1000BASEX``
+    This defines the 1000BASE-X single-lane serdes link as defined by the
+    802.3 standard section 36.  The link operates at a fixed bit rate of
+    1.25Gbaud using a 10B/8B encoding scheme, resulting in an underlying
+    data rate of 1Gbps.  Embedded in the data stream is a 16-bit control
+    word which is used to negotiate the duplex and pause modes with the
+    remote end.  This does not include "up-clocked" variants such as 2.5Gbps
+    speeds (see below.)
+
+``PHY_INTERFACE_MODE_2500BASEX``
+    This defines a variant of 1000BASE-X which is clocked 2.5 times faster,
+    than the 802.3 standard giving a fixed bit rate of 3.125Gbaud.
+
+``PHY_INTERFACE_MODE_SGMII``
+    This is used for Cisco SGMII, which is a modification of 1000BASE-X
+    as defined by the 802.3 standard.  The SGMII link consists of a single
+    serdes lane running at a fixed bit rate of 1.25Gbaud with 10B/8B
+    encoding.  The underlying data rate is 1Gbps, with the slower speeds of
+    100Mbps and 10Mbps being achieved through replication of each data symbol.
+    The 802.3 control word is re-purposed to send the negotiated speed and
+    duplex information from to the MAC, and for the MAC to acknowledge
+    receipt.  This does not include "up-clocked" variants such as 2.5Gbps
+    speeds.
+
+    Note: mismatched SGMII vs 1000BASE-X configuration on a link can
+    successfully pass data in some circumstances, but the 16-bit control
+    word will not be correctly interpreted, which may cause mismatches in
+    duplex, pause or other settings.  This is dependent on the MAC and/or
+    PHY behaviour.
+
+
 Pause frames / flow control
 ===========================
 
diff --git a/Documentation/networking/rds.txt b/Documentation/networking/rds.txt
index 0235ae69af2a..f2a0147c933d 100644
--- a/Documentation/networking/rds.txt
+++ b/Documentation/networking/rds.txt
@@ -389,7 +389,7 @@ Multipath RDS (mprds)
   a common (to all paths) part, and a per-path struct rds_conn_path. All
   I/O workqs and reconnect threads are driven from the rds_conn_path.
   Transports such as TCP that are multipath capable may then set up a
-  TPC socket per rds_conn_path, and this is managed by the transport via
+  TCP socket per rds_conn_path, and this is managed by the transport via
   the transport privatee cp_transport_data pointer.
 
   Transports announce themselves as multipath capable by setting the
diff --git a/Documentation/networking/segmentation-offloads.rst b/Documentation/networking/segmentation-offloads.rst
index 89d1ee933e9f..085e8fab03fd 100644
--- a/Documentation/networking/segmentation-offloads.rst
+++ b/Documentation/networking/segmentation-offloads.rst
@@ -18,7 +18,7 @@ The following technologies are described:
  * Generic Segmentation Offload - GSO
  * Generic Receive Offload - GRO
  * Partial Generic Segmentation Offload - GSO_PARTIAL
- * SCTP accelleration with GSO - GSO_BY_FRAGS
+ * SCTP acceleration with GSO - GSO_BY_FRAGS
 
 
 TCP Segmentation Offload
@@ -148,7 +148,7 @@ that the IPv4 ID field is incremented in the case that a given header does
 not have the DF bit set.
 
 
-SCTP accelleration with GSO
+SCTP acceleration with GSO
 ===========================
 
 SCTP - despite the lack of hardware support - can still take advantage of
diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst
index 5bd26cb07244..91446b431b70 100644
--- a/Documentation/networking/sfp-phylink.rst
+++ b/Documentation/networking/sfp-phylink.rst
@@ -98,6 +98,7 @@ this documentation.
 4. Add::
 
 	struct phylink *phylink;
+	struct phylink_config phylink_config;
 
    to the driver's private data structure.  We shall refer to the
    driver's private data pointer as ``priv`` below, and the driver's
@@ -223,8 +224,10 @@ this documentation.
    .. code-block:: c
 
 	struct phylink *phylink;
+	priv->phylink_config.dev = &dev.dev;
+	priv->phylink_config.type = PHYLINK_NETDEV;
 
-	phylink = phylink_create(dev, node, phy_mode, &phylink_ops);
+	phylink = phylink_create(&priv->phylink_config, node, phy_mode, &phylink_ops);
 	if (IS_ERR(phylink)) {
 		err = PTR_ERR(phylink);
 		fail probe;
diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt
index bbdaf8990031..8dd6333c3270 100644
--- a/Documentation/networking/timestamping.txt
+++ b/Documentation/networking/timestamping.txt
@@ -368,7 +368,7 @@ ts[1] used to hold hardware timestamps converted to system time.
 Instead, expose the hardware clock device on the NIC directly as
 a HW PTP clock source, to allow time conversion in userspace and
 optionally synchronize system time with a userspace PTP stack such
-as linuxptp. For the PTP clock API, see Documentation/ptp/ptp.txt.
+as linuxptp. For the PTP clock API, see Documentation/driver-api/ptp.rst.
 
 Note that if the SO_TIMESTAMP or SO_TIMESTAMPNS option is enabled
 together with SO_TIMESTAMPING using SOF_TIMESTAMPING_SOFTWARE, a false
diff --git a/Documentation/networking/tls-offload-layers.svg b/Documentation/networking/tls-offload-layers.svg
new file mode 100644
index 000000000000..cf72f05dbb21
--- /dev/null
+++ b/Documentation/networking/tls-offload-layers.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 460.0 500.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l960.0 0l0 720.0l-960.0 0l0 -720.0z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l960.0 0l0 720.0l-960.0 0z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m117.02887 0l72.28346 0l0 40.25197l-72.28346 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m117.02887 0l72.28346 0l0 40.25197l-72.28346 0z" fill-rule="evenodd"/><path fill="#000000" d="m135.71944 27.045982l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm12.853302 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm15.453842 4.578125q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5625q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.1875q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm10.469467 4.859375l0 -1.21875q-0.90625 1.4375 -2.703125 1.4375q-1.15625 0 -2.125 -0.640625q-0.96875 -0.640625 -1.5 -1.78125q-0.53125 -1.140625 -0.53125 -2.625q0 -1.453125 0.484375 -2.625q0.484375 -1.1875 1.4375 -1.8125q0.96875 -0.625 2.171875 -0.625q0.875 0 1.546875 0.375q0.6875 0.359375 1.109375 0.953125l0 -4.796875l1.640625 0l0 13.359375l-1.53125 0zm-5.171875 -4.828125q0 1.859375 0.78125 2.78125q0.78125 0.921875 1.84375 0.921875q1.078125 0 1.828125 -0.875q0.75 -0.890625 0.75 -2.6875q0 -1.984375 -0.765625 -2.90625q-0.765625 -0.9375 -1.890625 -0.9375q-1.078125 0 -1.8125 0.890625q-0.734375 0.890625 -0.734375 2.8125z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m309.02887 0l72.28348 0l0 40.25197l-72.28348 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m309.02887 0l72.28348 0l0 40.25197l-72.28348 0z" fill-rule="evenodd"/><path fill="#000000" d="m328.4915 27.045982l-2.96875 -9.671875l1.703125 0l1.53125 5.578125l0.578125 2.078125q0.046875 -0.15625 0.5 -2.0l1.546875 -5.65625l1.6875 0l1.4375 5.609375l0.484375 1.84375l0.5625 -1.859375l1.65625 -5.59375l1.59375 0l-3.03125 9.671875l-1.703125 0l-1.53125 -5.796875l-0.375 -1.640625l-1.953125 7.4375l-1.71875 0zm11.676086 0l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm6.228302 -11.46875l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.46875l0 -9.671875l1.640625 0l0 9.671875l-1.640625 0zm7.722931 -1.46875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm8.230194 -1.640625l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m73.64304 101.46588l351.0551 0l0 53.70079l-351.0551 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m73.64304 101.46588l351.0551 0l0 53.70079l-351.0551 0z" fill-rule="evenodd"/><path fill="#000000" d="m215.67503 135.23627l0 -13.359367l1.640625 0l0 7.6249924l3.890625 -3.9374924l2.109375 0l-3.6875 3.5937424l4.0625 6.078125l-2.015625 0l-3.203125 -4.953125l-1.15625 1.125l0 3.828125l-1.640625 0zm12.90625 -1.46875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.5781174l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.6718674q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.5583038 1.46875l0 -13.359367l1.640625 0l0 13.359367l-1.640625 0zm3.5354462 -2.890625l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.6406174 0.296875 -1.1874924q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.4999924l-1.609375 0.21875q-0.109375 -0.7499924 -0.640625 -1.1718674q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.31249237 0.1875 0.5781174q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm24.136429 -10.468742l1.765625 0l0 7.7187424q0 2.015625 -0.453125 3.203125q-0.453125 1.1875 -1.640625 1.9375q-1.1875 0.734375 -3.125 0.734375q-1.875 0 -3.078125 -0.640625q-1.1875 -0.65625 -1.703125 -1.875q-0.5 -1.234375 -0.5 -3.359375l0 -7.7187424l1.765625 0l0 7.7187424q0 1.734375 0.3125 2.5625q0.328125 0.8125 1.109375 1.265625q0.796875 0.453125 1.9375 0.453125q1.953125 0 2.78125 -0.890625q0.828125 -0.890625 0.828125 -3.390625l0 -7.7187424zm4.629181 13.359367l0 -13.359367l1.78125 0l0 11.781242l6.5625 0l0 1.578125l-8.34375 0zm10.453857 0l0 -13.359367l5.046875 0q1.328125 0 2.03125 0.125q0.96875 0.171875 1.640625 0.640625q0.671875 0.453125 1.078125 1.28125q0.40625 0.828125 0.40625 1.828125q0 1.703125 -1.09375 2.8906174q-1.078125 1.171875 -3.921875 1.171875l-3.421875 0l0 5.421875l-1.765625 0zm1.765625 -7.0l3.453125 0q1.71875 0 2.4375 -0.6406174q0.71875 -0.640625 0.71875 -1.796875q0 -0.84375 -0.421875 -1.4375q-0.421875 -0.59375 -1.125 -0.78125q-0.4375 -0.125 -1.640625 -0.125l-3.421875 0l0 4.7812424z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m73.64304 216.38058l351.0551 0l0 53.700775l-351.0551 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m73.64304 216.38058l351.0551 0l0 53.700775l-351.0551 0z" fill-rule="evenodd"/><path fill="#000000" d="m211.16338 250.15097l0 -11.78125l-4.40625 0l0 -1.578125l10.578125 0l0 1.578125l-4.40625 0l0 11.78125l-1.765625 0zm17.52098 -4.6875l1.765625 0.453125q-0.5625 2.171875 -2.0 3.328125q-1.4375 1.140625 -3.53125 1.140625q-2.15625 0 -3.515625 -0.875q-1.34375 -0.890625 -2.0625 -2.546875q-0.703125 -1.671875 -0.703125 -3.59375q0 -2.078125 0.796875 -3.625q0.796875 -1.5625 2.265625 -2.359375q1.484375 -0.8125 3.25 -0.8125q2.0 0 3.359375 1.015625q1.375 1.015625 1.90625 2.875l-1.734375 0.40625q-0.46875 -1.453125 -1.359375 -2.109375q-0.875 -0.671875 -2.203125 -0.671875q-1.546875 0 -2.578125 0.734375q-1.03125 0.734375 -1.453125 1.984375q-0.421875 1.234375 -0.421875 2.5625q0 1.703125 0.5 2.96875q0.5 1.265625 1.546875 1.90625q1.046875 0.625 2.265625 0.625q1.484375 0 2.515625 -0.859375q1.03125 -0.859375 1.390625 -2.546875zm3.9416962 4.6875l0 -13.359375l5.046875 0q1.328125 0 2.03125 0.125q0.96875 0.171875 1.640625 0.640625q0.671875 0.453125 1.078125 1.28125q0.40625 0.828125 0.40625 1.828125q0 1.703125 -1.09375 2.890625q-1.078125 1.171875 -3.921875 1.171875l-3.421875 0l0 5.421875l-1.765625 0zm1.765625 -7.0l3.453125 0q1.71875 0 2.4375 -0.640625q0.71875 -0.640625 0.71875 -1.796875q0 -0.84375 -0.421875 -1.4375q-0.421875 -0.59375 -1.125 -0.78125q-0.4375 -0.125 -1.640625 -0.125l-3.421875 0l0 4.78125zm14.664642 4.109375l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.2656097 0.890625q0.421875 0.5625 0.578125 1.5l-1.6093597 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.68748474 0.296875 1.0781097 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.3749847 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm13.562485 1.421875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm7.917694 0.28125q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5625q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.1875q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm10.516327 1.3125l1.609375 0.21875q-0.265625 1.65625 -1.359375 2.609375q-1.078125 0.9375 -2.671875 0.9375q-1.984375 0 -3.1875 -1.296875q-1.203125 -1.296875 -1.203125 -3.71875q0 -1.578125 0.515625 -2.75q0.515625 -1.171875 1.578125 -1.75q1.0625 -0.59375 2.3125 -0.59375q1.578125 0 2.578125 0.796875q1.0 0.796875 1.28125 2.265625l-1.59375 0.234375q-0.234375 -0.96875 -0.8125 -1.453125q-0.578125 -0.5 -1.390625 -0.5q-1.234375 0 -2.015625 0.890625q-0.78125 0.890625 -0.78125 2.8125q0 1.953125 0.75 2.84375q0.75 0.875 1.953125 0.875q0.96875 0 1.609375 -0.59375q0.65625 -0.59375 0.828125 -1.828125zm3.015625 3.546875l0 -13.359375l1.640625 0l0 7.625l3.890625 -3.9375l2.109375 0l-3.6875 3.59375l4.0625 6.078125l-2.015625 0l-3.203125 -4.953125l-1.15625 1.125l0 3.828125l-1.640625 0z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m73.64304 331.2953l351.0551 0l0 53.700775l-351.0551 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m73.64304 331.2953l351.0551 0l0 53.700775l-351.0551 0z" fill-rule="evenodd"/><path fill="#000000" d="m225.73463 365.06567l0 -13.359375l4.609375 0q1.546875 0 2.375 0.203125q1.140625 0.25 1.953125 0.953125q1.0625 0.890625 1.578125 2.28125q0.53125 1.390625 0.53125 3.171875q0 1.515625 -0.359375 2.703125q-0.359375 1.171875 -0.921875 1.9375q-0.546875 0.765625 -1.203125 1.21875q-0.65625 0.4375 -1.59375 0.671875q-0.9375 0.21875 -2.140625 0.21875l-4.828125 0zm1.765625 -1.578125l2.859375 0q1.3125 0 2.0625 -0.234375q0.75 -0.25 1.203125 -0.703125q0.625 -0.625 0.96875 -1.6875q0.359375 -1.0625 0.359375 -2.578125q0 -2.09375 -0.6875 -3.21875q-0.6875 -1.125 -1.671875 -1.5q-0.703125 -0.28125 -2.28125 -0.28125l-2.8125 0l0 10.203125zm11.488571 1.578125l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm6.228302 -11.46875l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.46875l0 -9.671875l1.640625 0l0 9.671875l-1.640625 0zm6.832321 0l-3.6875 -9.671875l1.734375 0l2.078125 5.796875q0.328125 0.9375 0.625 1.9375q0.203125 -0.765625 0.609375 -1.828125l2.140625 -5.90625l1.6874847 0l-3.6562347 9.671875l-1.53125 0zm13.26561 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.125732 5.765625l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m73.64304 446.20996l351.0551 0l0 53.700806l-351.0551 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m73.64304 446.20996l351.0551 0l0 53.700806l-351.0551 0z" fill-rule="evenodd"/><path fill="#000000" d="m222.09538 479.98038l0 -13.359375l4.609375 0q1.546875 0 2.375 0.203125q1.140625 0.25 1.953125 0.953125q1.0625 0.890625 1.578125 2.28125q0.53125 1.390625 0.53125 3.171875q0 1.515625 -0.359375 2.703125q-0.359375 1.171875 -0.921875 1.9375q-0.546875 0.765625 -1.203125 1.21875q-0.65625 0.4375 -1.59375 0.671875q-0.9375 0.21875 -2.140625 0.21875l-4.828125 0zm1.765625 -1.578125l2.859375 0q1.3125 0 2.0625 -0.234375q0.75 -0.25 1.203125 -0.703125q0.625 -0.625 0.96875 -1.6875q0.359375 -1.0625 0.359375 -2.578125q0 -2.09375 -0.6875 -3.21875q-0.6875 -1.125 -1.671875 -1.5q-0.703125 -0.28125 -2.28125 -0.28125l-2.8125 0l0 10.203125zm18.129196 -1.53125l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm11.828842 5.765625l-3.6875 -9.671875l1.734375 0l2.078125 5.796875q0.328125 0.9375 0.625 1.9375q0.203125 -0.765625 0.609375 -1.828125l2.140625 -5.90625l1.6875 0l-3.65625 9.671875l-1.53125 0zm6.640625 -11.46875l0 -1.890625l1.6406097 0l0 1.890625l-1.6406097 0zm0 11.46875l0 -9.671875l1.6406097 0l0 9.671875l-1.6406097 0zm10.457321 -3.546875l1.609375 0.21875q-0.265625 1.65625 -1.359375 2.609375q-1.078125 0.9375 -2.671875 0.9375q-1.984375 0 -3.1875 -1.296875q-1.203125 -1.296875 -1.203125 -3.71875q0 -1.578125 0.515625 -2.75q0.515625 -1.171875 1.578125 -1.75q1.0625 -0.59375 2.3125 -0.59375q1.578125 0 2.578125 0.796875q1.0 0.796875 1.28125 2.265625l-1.59375 0.234375q-0.234375 -0.96875 -0.8125 -1.453125q-0.578125 -0.5 -1.390625 -0.5q-1.234375 0 -2.015625 0.890625q-0.78125 0.890625 -0.78125 2.8125q0 1.953125 0.75 2.84375q0.75 0.875 1.953125 0.875q0.96875 0 1.609375 -0.59375q0.65625 -0.59375 0.828125 -1.828125zm9.640625 0.4375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m153.17061 40.25197l0 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m153.17061 40.25197l0 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m48.435696 73.03937l402.67715 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="4.0,3.0" d="m48.435696 73.03937l402.67715 0" fill-rule="evenodd"/><path fill="#cfe2f3" d="m177.95801 71.49061l-12.393707 0l0 12.897636l-24.7874 0l0 -12.897636l-12.393692 0l24.7874 -12.89764z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m177.95801 71.49061l-12.393707 0l0 12.897636l-24.7874 0l0 -12.897636l-12.393692 0l24.7874 -12.89764z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m320.3832 71.49061l12.393707 0l0 -12.89764l24.787384 0l0 12.89764l12.393707 0l-24.787415 12.897636z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.3832 71.49061l12.393707 0l0 -12.89764l24.787384 0l0 12.89764l12.393707 0l-24.787415 12.897636z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m177.95801 188.3931l-12.393707 0l0 12.897629l-24.7874 0l0 -12.897629l-12.393692 0l24.7874 -12.897644z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m177.95801 188.3931l-12.393707 0l0 12.897629l-24.7874 0l0 -12.897629l-12.393692 0l24.7874 -12.897644z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m320.3832 188.3931l12.393707 0l0 -12.897644l24.787384 0l0 12.897644l12.393707 0l-24.787415 12.897629z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.3832 188.3931l12.393707 0l0 -12.897644l24.787384 0l0 12.897644l12.393707 0l-24.787415 12.897629z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m177.95801 301.4256l-12.393707 0l0 12.897644l-24.7874 0l0 -12.897644l-12.393692 0l24.7874 -12.897644z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m177.95801 301.4256l-12.393707 0l0 12.897644l-24.7874 0l0 -12.897644l-12.393692 0l24.7874 -12.897644z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m320.3832 301.4256l12.393707 0l0 -12.897644l24.787384 0l0 12.897644l12.393707 0l-24.787415 12.897644z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.3832 301.4256l12.393707 0l0 -12.897644l24.787384 0l0 12.897644l12.393707 0l-24.787415 12.897644z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m177.95801 415.4906l-12.393707 0l0 12.897644l-24.7874 0l0 -12.897644l-12.393692 0l24.7874 -12.897644z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m177.95801 415.4906l-12.393707 0l0 12.897644l-24.7874 0l0 -12.897644l-12.393692 0l24.7874 -12.897644z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m320.3832 415.4906l12.393707 0l0 -12.897644l24.787384 0l0 12.897644l12.393707 0l-24.787415 12.897644z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.3832 415.4906l12.393707 0l0 -12.897644l24.787384 0l0 12.897644l12.393707 0l-24.787415 12.897644z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m198.14961 44.009186l109.44881 0l0 53.70079l-109.44881 0z" fill-rule="evenodd"/><path fill="#000000" d="m224.98174 70.929184l2.78125 -13.359375l1.65625 0l-1.0 4.78125q0.78125 -0.71875 1.421875 -1.015625q0.640625 -0.296875 1.328125 -0.296875q1.359375 0 2.265625 1.015625q0.90625 1.0 0.90625 2.9375q0 1.28125 -0.375 2.359375q-0.359375 1.0625 -0.890625 1.78125q-0.53125 0.71875 -1.109375 1.15625q-0.578125 0.4375 -1.1875 0.640625q-0.59375 0.21875 -1.140625 0.21875q-0.96875 0 -1.703125 -0.5q-0.71875 -0.515625 -1.125 -1.546875l-0.390625 1.828125l-1.4375 0zm2.4375 -3.96875l-0.015625 0.3125q0 1.234375 0.59375 1.890625q0.59375 0.640625 1.484375 0.640625q0.859375 0 1.578125 -0.609375q0.734375 -0.609375 1.1875 -1.890625q0.46875 -1.28125 0.46875 -2.375q0 -1.21875 -0.59375 -1.890625q-0.578125 -0.671875 -1.4375 -0.671875q-0.90625 0 -1.65625 0.6875q-0.734375 0.6875 -1.234375 2.125q-0.375 1.0625 -0.375 1.78125zm14.531967 2.21875q-1.734375 1.96875 -3.5625 1.96875q-1.109375 0 -1.796875 -0.640625q-0.6875 -0.640625 -0.6875 -1.578125q0 -0.609375 0.296875 -2.09375l1.171875 -5.578125l1.65625 0l-1.296875 6.1875q-0.171875 0.765625 -0.171875 1.203125q0 0.546875 0.328125 0.859375q0.34375 0.296875 0.984375 0.296875q0.703125 0 1.359375 -0.328125q0.65625 -0.34375 1.125 -0.921875q0.484375 -0.578125 0.796875 -1.359375q0.1875 -0.5 0.453125 -1.765625l0.875 -4.171875l1.65625 0l-2.03125 9.671875l-1.515625 0l0.359375 -1.75zm4.000717 1.75l1.765625 -8.40625l-1.484375 0l0.265625 -1.265625l1.484375 0l0.28125 -1.375q0.21875 -1.03125 0.4375 -1.484375q0.234375 -0.453125 0.75 -0.75q0.53125 -0.296875 1.4375 -0.296875q0.625 0 1.828125 0.265625l-0.296875 1.4375q-0.84375 -0.21875 -1.40625 -0.21875q-0.484375 0 -0.734375 0.25q-0.25 0.234375 -0.4375 1.125l-0.21875 1.046875l1.84375 0l-0.265625 1.265625l-1.84375 0l-1.75 8.40625l-1.65625 0zm5.183304 0l1.765625 -8.40625l-1.484375 0l0.265625 -1.265625l1.484375 0l0.28125 -1.375q0.21875 -1.03125 0.4375 -1.484375q0.234375 -0.453125 0.75 -0.75q0.53125 -0.296875 1.4375 -0.296875q0.625 0 1.828125 0.265625l-0.296875 1.4375q-0.84375 -0.21875 -1.40625 -0.21875q-0.484375 0 -0.734375 0.25q-0.25 0.234375 -0.4375 1.125l-0.21875 1.046875l1.84375 0l-0.265625 1.265625l-1.84375 0l-1.75 8.40625l-1.65625 0zm12.058319 -3.28125l1.609375 0.15625q-0.34375 1.1875 -1.59375 2.265625q-1.234375 1.078125 -2.96875 1.078125q-1.0625 0 -1.96875 -0.5q-0.890625 -0.5 -1.359375 -1.4375q-0.46875 -0.953125 -0.46875 -2.15625q0 -1.59375 0.734375 -3.078125q0.734375 -1.484375 1.890625 -2.203125q1.171875 -0.734375 2.53125 -0.734375q1.734375 0 2.765625 1.078125q1.03125 1.0625 1.03125 2.921875q0 0.71875 -0.125 1.46875l-7.125 0q-0.046875 0.28125 -0.046875 0.5q0 1.359375 0.625 2.078125q0.625 0.71875 1.53125 0.71875q0.84375 0 1.65625 -0.546875q0.828125 -0.5625 1.28125 -1.609375zm-4.78125 -2.40625l5.421875 0q0.015625 -0.25 0.015625 -0.359375q0 -1.234375 -0.625 -1.890625q-0.625 -0.671875 -1.59375 -0.671875q-1.0625 0 -1.9375 0.734375q-0.859375 0.71875 -1.28125 2.1875zm8.063202 5.6875l2.015625 -9.671875l1.453125 0l-0.40625 1.96875q0.75 -1.109375 1.453125 -1.640625q0.71875 -0.546875 1.46875 -0.546875q0.5 0 1.21875 0.359375l-0.671875 1.53125q-0.4375 -0.3125 -0.9375 -0.3125q-0.875 0 -1.78125 0.96875q-0.90625 0.953125 -1.4375 3.46875l-0.8125 3.875l-1.5625 0zm6.368927 -3.3125l1.640625 -0.09375q0 0.703125 0.21875 1.21875q0.21875 0.5 0.796875 0.8125q0.59375 0.3125 1.375 0.3125q1.09375 0 1.640625 -0.4375q0.546875 -0.4375 0.546875 -1.015625q0 -0.4375 -0.328125 -0.8125q-0.328125 -0.390625 -1.640625 -0.953125q-1.3125 -0.5625 -1.671875 -0.78125q-0.609375 -0.375 -0.921875 -0.875q-0.3125 -0.515625 -0.3125 -1.171875q0 -1.140625 0.90625 -1.953125q0.921875 -0.828125 2.5625 -0.828125q1.828125 0 2.765625 0.84375q0.953125 0.84375 1.0 2.21875l-1.609375 0.109375q-0.046875 -0.875 -0.625 -1.390625q-0.578125 -0.515625 -1.65625 -0.515625q-0.84375 0 -1.328125 0.40625q-0.46875 0.390625 -0.46875 0.84375q0 0.453125 0.40625 0.796875q0.28125 0.234375 1.421875 0.734375q1.890625 0.8125 2.375 1.28125q0.78125 0.765625 0.78125 1.84375q0 0.71875 -0.4375 1.421875q-0.4375 0.6875 -1.34375 1.109375q-0.90625 0.40625 -2.140625 0.40625q-1.671875 0 -2.84375 -0.828125q-1.1875 -0.828125 -1.109375 -2.703125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m198.14961 164.00919l109.44881 0l0 53.70079l-109.44881 0z" fill-rule="evenodd"/><path fill="#000000" d="m211.6696 187.61668l1.640625 -0.09375q0 0.703125 0.21875 1.21875q0.21875 0.5 0.796875 0.8125q0.59375 0.3125 1.375 0.3125q1.09375 0 1.640625 -0.4375q0.546875 -0.4375 0.546875 -1.015625q0 -0.4375 -0.328125 -0.8125q-0.328125 -0.390625 -1.640625 -0.953125q-1.3125 -0.5625 -1.671875 -0.78125q-0.609375 -0.375 -0.921875 -0.875q-0.3125 -0.515625 -0.3125 -1.171875q0 -1.140625 0.90625 -1.953125q0.921875 -0.828125 2.5625 -0.828125q1.828125 0 2.765625 0.84375q0.953125 0.84375 1.0 2.21875l-1.609375 0.109375q-0.046875 -0.875 -0.625 -1.390625q-0.578125 -0.515625 -1.65625 -0.515625q-0.84375 0 -1.328125 0.40625q-0.46875 0.390625 -0.46875 0.84375q0 0.453125 0.40625 0.796875q0.28125 0.234375 1.421875 0.734375q1.890625 0.8125 2.375 1.28125q0.78125 0.765625 0.78125 1.84375q0 0.71875 -0.4375 1.421875q-0.4375 0.6875 -1.34375 1.109375q-0.90625 0.40625 -2.140625 0.40625q-1.671875 0 -2.84375 -0.828125q-1.1875 -0.828125 -1.109375 -2.703125zm15.84375 -0.21875l1.65625 0.171875q-0.625 1.8125 -1.765625 2.703125q-1.140625 0.875 -2.609375 0.875q-1.578125 0 -2.5625 -1.015625q-0.96875 -1.03125 -0.96875 -2.859375q0 -1.578125 0.625 -3.109375q0.640625 -1.53125 1.796875 -2.328125q1.171875 -0.796875 2.6875 -0.796875q1.546875 0 2.453125 0.875q0.921875 0.875 0.921875 2.328125l-1.625 0.109375q-0.015625 -0.921875 -0.546875 -1.4375q-0.515625 -0.515625 -1.359375 -0.515625q-1.0 0 -1.734375 0.625q-0.71875 0.625 -1.140625 1.90625q-0.40625 1.28125 -0.40625 2.46875q0 1.234375 0.546875 1.859375q0.546875 0.609375 1.34375 0.609375q0.796875 0 1.53125 -0.609375q0.734375 -0.609375 1.15625 -1.859375zm9.171875 2.328125q-0.859375 0.734375 -1.65625 1.078125q-0.78125 0.34375 -1.6875 0.34375q-1.34375 0 -2.171875 -0.78125q-0.8125 -0.796875 -0.8125 -2.03125q0 -0.796875 0.375 -1.421875q0.375 -0.625 0.984375 -1.0q0.609375 -0.390625 1.484375 -0.546875q0.5625 -0.109375 2.109375 -0.171875q1.5625 -0.0625 2.234375 -0.328125q0.1875 -0.671875 0.1875 -1.125q0 -0.578125 -0.421875 -0.90625q-0.5625 -0.453125 -1.671875 -0.453125q-1.03125 0 -1.703125 0.46875q-0.65625 0.453125 -0.953125 1.296875l-1.671875 -0.140625q0.515625 -1.4375 1.609375 -2.203125q1.109375 -0.765625 2.796875 -0.765625q1.796875 0 2.84375 0.859375q0.796875 0.625 0.796875 1.65625q0 0.765625 -0.21875 1.796875l-0.53125 2.40625q-0.265625 1.140625 -0.265625 1.859375q0 0.453125 0.203125 1.3125l-1.671875 0q-0.125 -0.46875 -0.1875 -1.203125zm0.609375 -3.703125q-0.34375 0.140625 -0.75 0.21875q-0.390625 0.0625 -1.3125 0.15625q-1.4375 0.125 -2.03125 0.328125q-0.59375 0.1875 -0.90625 0.625q-0.296875 0.421875 -0.296875 0.9375q0 0.6875 0.484375 1.140625q0.484375 0.4375 1.359375 0.4375q0.828125 0 1.578125 -0.421875q0.75 -0.4375 1.1875 -1.203125q0.4375 -0.78125 0.6875 -2.21875zm7.094467 3.5625l-0.265625 1.359375q-0.59375 0.140625 -1.15625 0.140625q-0.984375 0 -1.5625 -0.46875q-0.4375 -0.375 -0.4375 -1.0q0 -0.3125 0.234375 -1.46875l1.171875 -5.625l-1.296875 0l0.265625 -1.265625l1.296875 0l0.5 -2.375l1.890625 -1.140625l-0.734375 3.515625l1.625 0l-0.28125 1.265625l-1.609375 0l-1.125 5.359375q-0.203125 1.015625 -0.203125 1.21875q0 0.28125 0.15625 0.4375q0.171875 0.15625 0.5625 0.15625q0.546875 0 0.96875 -0.109375zm5.183304 0l-0.265625 1.359375q-0.59375 0.140625 -1.15625 0.140625q-0.984375 0 -1.5625 -0.46875q-0.4375 -0.375 -0.4375 -1.0q0 -0.3125 0.234375 -1.46875l1.171875 -5.625l-1.296875 0l0.265625 -1.265625l1.296875 0l0.5 -2.375l1.890625 -1.140625l-0.734375 3.515625l1.625 0l-0.28125 1.265625l-1.609375 0l-1.125 5.359375q-0.203125 1.015625 -0.203125 1.21875q0 0.28125 0.15625 0.4375q0.171875 0.15625 0.5625 0.15625q0.546875 0 0.96875 -0.109375zm8.433304 -1.9375l1.609375 0.15625q-0.34375 1.1875 -1.59375 2.265625q-1.234375 1.078125 -2.96875 1.078125q-1.0625 0 -1.96875 -0.5q-0.890625 -0.5 -1.359375 -1.4375q-0.46875 -0.953125 -0.46875 -2.15625q0 -1.59375 0.734375 -3.078125q0.734375 -1.484375 1.890625 -2.203125q1.171875 -0.734375 2.53125 -0.734375q1.734375 0 2.765625 1.078125q1.03125 1.0625 1.03125 2.921875q0 0.71875 -0.125 1.46875l-7.125 0q-0.046875 0.28125 -0.046875 0.5q0 1.359375 0.625 2.078125q0.625 0.71875 1.53125 0.71875q0.84375 0 1.65625 -0.546875q0.828125 -0.5625 1.28125 -1.609375zm-4.78125 -2.40625l5.421875 0q0.015625 -0.25 0.015625 -0.359375q0 -1.234375 -0.625 -1.890625q-0.625 -0.671875 -1.59375 -0.671875q-1.0625 0 -1.9375 0.734375q-0.859375 0.71875 -1.28125 2.1875zm8.063202 5.6875l2.015625 -9.671875l1.453125 0l-0.40625 1.96875q0.75 -1.109375 1.453125 -1.640625q0.71875 -0.546875 1.46875 -0.546875q0.5 0 1.21875 0.359375l-0.671875 1.53125q-0.4375 -0.3125 -0.9375 -0.3125q-0.875 0 -1.78125 0.96875q-0.90625 0.953125 -1.4375 3.46875l-0.8125 3.875l-1.5625 0zm11.255371 0l2.796875 -13.359375l1.640625 0l-2.78125 13.359375l-1.65625 0zm6.613556 -11.484375l0.40625 -1.875l1.625 0l-0.390625 1.875l-1.640625 0zm-2.390625 11.484375l2.015625 -9.671875l1.65625 0l-2.03125 9.671875l-1.640625 0zm4.3635864 -3.3125l1.640625 -0.09375q0 0.703125 0.21875 1.21875q0.21875 0.5 0.796875 0.8125q0.59375 0.3125 1.375 0.3125q1.09375 0 1.640625 -0.4375q0.546875 -0.4375 0.546875 -1.015625q0 -0.4375 -0.328125 -0.8125q-0.328125 -0.390625 -1.640625 -0.953125q-1.3125 -0.5625 -1.671875 -0.78125q-0.609375 -0.375 -0.921875 -0.875q-0.3125 -0.515625 -0.3125 -1.171875q0 -1.140625 0.90625 -1.953125q0.921875 -0.828125 2.5625 -0.828125q1.828125 0 2.765625 0.84375q0.953125 0.84375 1.0 2.21875l-1.609375 0.109375q-0.046875 -0.875 -0.625 -1.390625q-0.578125 -0.515625 -1.65625 -0.515625q-0.84375 0 -1.328125 0.40625q-0.46875 0.390625 -0.46875 0.84375q0 0.453125 0.40625 0.796875q0.28125 0.234375 1.421875 0.734375q1.890625 0.8125 2.375 1.28125q0.78125 0.765625 0.78125 1.84375q0 0.71875 -0.4375 1.421875q-0.4375 0.6875 -1.34375 1.109375q-0.90625 0.40625 -2.140625 0.40625q-1.671875 0 -2.84375 -0.828125q-1.1875 -0.828125 -1.109375 -2.703125zm13.015625 1.96875l-0.265625 1.359375q-0.59375 0.140625 -1.15625 0.140625q-0.984375 0 -1.5625 -0.46875q-0.4375 -0.375 -0.4375 -1.0q0 -0.3125 0.234375 -1.46875l1.171875 -5.625l-1.296875 0l0.265625 -1.265625l1.296875 0l0.5 -2.375l1.890625 -1.140625l-0.734375 3.515625l1.625 0l-0.28125 1.265625l-1.609375 0l-1.125 5.359375q-0.203125 1.015625 -0.203125 1.21875q0 0.28125 0.15625 0.4375q0.171875 0.15625 0.5625 0.15625q0.546875 0 0.96875 -0.109375z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m198.14961 280.1392l109.44881 0l0 53.700806l-109.44881 0z" fill-rule="evenodd"/><path fill="#000000" d="m223.58026 303.7467l1.640625 -0.09375q0 0.703125 0.21875 1.21875q0.21875 0.5 0.796875 0.8125q0.59375 0.3125 1.375 0.3125q1.09375 0 1.640625 -0.4375q0.546875 -0.4375 0.546875 -1.015625q0 -0.4375 -0.328125 -0.8125q-0.328125 -0.390625 -1.640625 -0.953125q-1.3125 -0.5625 -1.671875 -0.78125q-0.609375 -0.375 -0.921875 -0.875q-0.3125 -0.515625 -0.3125 -1.171875q0 -1.140625 0.90625 -1.953125q0.921875 -0.828125 2.5625 -0.828125q1.828125 0 2.765625 0.84375q0.953125 0.84375 1.0 2.21875l-1.609375 0.109375q-0.046875 -0.875 -0.625 -1.390625q-0.578125 -0.515625 -1.65625 -0.515625q-0.84375 0 -1.328125 0.40625q-0.46875 0.390625 -0.46875 0.84375q0 0.453125 0.40625 0.796875q0.28125 0.234375 1.421875 0.734375q1.890625 0.8125 2.375 1.28125q0.78125 0.765625 0.78125 1.84375q0 0.71875 -0.4375 1.421875q-0.4375 0.6875 -1.34375 1.109375q-0.90625 0.40625 -2.140625 0.40625q-1.671875 0 -2.84375 -0.828125q-1.1875 -0.828125 -1.109375 -2.703125zm9.1875 3.3125l2.796875 -13.359375l1.640625 0l-1.734375 8.28125l4.8125 -4.59375l2.171875 0l-4.125 3.609375l2.5 6.0625l-1.8125 0l-1.921875 -4.96875l-2.015625 1.734375l-0.671875 3.234375l-1.640625 0zm7.5 3.703125l0 -1.1875l10.875 0l0 1.1875l-10.875 0zm12.188217 -3.703125l2.78125 -13.359375l1.6562653 0l-1.0000153 4.78125q0.78126526 -0.71875 1.4218903 -1.015625q0.640625 -0.296875 1.328125 -0.296875q1.359375 0 2.265625 1.015625q0.90625 1.0 0.90625 2.9375q0 1.28125 -0.375 2.359375q-0.359375 1.0625 -0.890625 1.78125q-0.53125 0.71875 -1.109375 1.15625q-0.578125 0.4375 -1.1875 0.640625q-0.59375 0.21875 -1.140625 0.21875q-0.96875 0 -1.7031403 -0.5q-0.71875 -0.515625 -1.125 -1.546875l-0.390625 1.828125l-1.4375 0zm2.4375 -3.96875l-0.015625 0.3125q0 1.234375 0.59375 1.890625q0.59376526 0.640625 1.4843903 0.640625q0.859375 0 1.578125 -0.609375q0.734375 -0.609375 1.1875 -1.890625q0.46875 -1.28125 0.46875 -2.375q0 -1.21875 -0.59375 -1.890625q-0.578125 -0.671875 -1.4375 -0.671875q-0.90625 0 -1.65625 0.6875q-0.73439026 0.6875 -1.2343903 2.125q-0.375 1.0625 -0.375 1.78125zm14.531967 2.21875q-1.734375 1.96875 -3.5625 1.96875q-1.109375 0 -1.796875 -0.640625q-0.6875 -0.640625 -0.6875 -1.578125q0 -0.609375 0.296875 -2.09375l1.171875 -5.578125l1.65625 0l-1.296875 6.1875q-0.171875 0.765625 -0.171875 1.203125q0 0.546875 0.328125 0.859375q0.34375 0.296875 0.984375 0.296875q0.703125 0 1.359375 -0.328125q0.65625 -0.34375 1.125 -0.921875q0.484375 -0.578125 0.796875 -1.359375q0.1875 -0.5 0.453125 -1.765625l0.875 -4.171875l1.65625 0l-2.03125 9.671875l-1.515625 0l0.359375 -1.75zm4.0007324 1.75l1.765625 -8.40625l-1.484375 0l0.265625 -1.265625l1.484375 0l0.28125 -1.375q0.21875 -1.03125 0.4375 -1.484375q0.234375 -0.453125 0.75 -0.75q0.53125 -0.296875 1.4375 -0.296875q0.625 0 1.828125 0.265625l-0.296875 1.4375q-0.84375 -0.21875 -1.40625 -0.21875q-0.484375 0 -0.734375 0.25q-0.25 0.234375 -0.4375 1.125l-0.21875 1.046875l1.84375 0l-0.265625 1.265625l-1.84375 0l-1.75 8.40625l-1.65625 0zm5.1832886 0l1.765625 -8.40625l-1.484375 0l0.265625 -1.265625l1.484375 0l0.28125 -1.375q0.21875 -1.03125 0.4375 -1.484375q0.234375 -0.453125 0.75 -0.75q0.53125 -0.296875 1.4375 -0.296875q0.625 0 1.828125 0.265625l-0.296875 1.4375q-0.84375 -0.21875 -1.40625 -0.21875q-0.484375 0 -0.734375 0.25q-0.25 0.234375 -0.4375 1.125l-0.21875 1.046875l1.84375 0l-0.265625 1.265625l-1.84375 0l-1.75 8.40625l-1.65625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m163.81627 395.2362l178.11024 0l0 53.700806l-178.11024 0z" fill-rule="evenodd"/><path fill="#000000" d="m195.60925 418.62497l1.65625 0.171875q-0.625 1.8125 -1.765625 2.703125q-1.140625 0.875 -2.609375 0.875q-1.578125 0 -2.5625 -1.015625q-0.96875 -1.03125 -0.96875 -2.859375q0 -1.578125 0.625 -3.109375q0.640625 -1.53125 1.796875 -2.328125q1.171875 -0.796875 2.6875 -0.796875q1.546875 0 2.453125 0.875q0.921875 0.875 0.921875 2.328125l-1.625 0.109375q-0.015625 -0.921875 -0.546875 -1.4375q-0.515625 -0.515625 -1.359375 -0.515625q-1.0 0 -1.734375 0.625q-0.71875 0.625 -1.140625 1.90625q-0.40625 1.28125 -0.40625 2.46875q0 1.234375 0.546875 1.859375q0.546875 0.609375 1.34375 0.609375q0.796875 0 1.53125 -0.609375q0.734375 -0.609375 1.15625 -1.859375zm2.9375 -0.140625q0 -2.828125 1.671875 -4.6875q1.375 -1.53125 3.609375 -1.53125q1.75 0 2.8125 1.09375q1.078125 1.09375 1.078125 2.953125q0 1.65625 -0.671875 3.09375q-0.671875 1.4375 -1.921875 2.203125q-1.25 0.765625 -2.625 0.765625q-1.125 0 -2.046875 -0.484375q-0.921875 -0.484375 -1.421875 -1.359375q-0.484375 -0.890625 -0.484375 -2.046875zm1.65625 -0.15625q0 1.359375 0.65625 2.0625q0.65625 0.703125 1.65625 0.703125q0.53125 0 1.046875 -0.203125q0.53125 -0.21875 0.96875 -0.65625q0.453125 -0.4375 0.765625 -1.0q0.3125 -0.5625 0.5 -1.203125q0.28125 -0.90625 0.28125 -1.734375q0 -1.3125 -0.65625 -2.03125q-0.65625 -0.734375 -1.65625 -0.734375q-0.78125 0 -1.421875 0.375q-0.625 0.375 -1.140625 1.09375q-0.515625 0.703125 -0.765625 1.640625q-0.234375 0.9375 -0.234375 1.6875zm8.438217 3.828125l2.015625 -9.671875l1.5 0l-0.359375 1.6875q0.96875 -1.0 1.8125 -1.453125q0.859375 -0.453125 1.734375 -0.453125q1.1875 0 1.84375 0.640625q0.671875 0.625 0.671875 1.703125q0 0.53125 -0.234375 1.6875l-1.234375 5.859375l-1.640625 0l1.28125 -6.125q0.1875 -0.890625 0.1875 -1.328125q0 -0.484375 -0.328125 -0.78125q-0.328125 -0.296875 -0.953125 -0.296875q-1.28125 0 -2.265625 0.90625q-0.984375 0.90625 -1.453125 3.125l-0.9375 4.5l-1.640625 0zm14.219467 -1.34375l-0.265625 1.359375q-0.59375 0.140625 -1.15625 0.140625q-0.984375 0 -1.5625 -0.46875q-0.4375 -0.375 -0.4375 -1.0q0 -0.3125 0.234375 -1.46875l1.171875 -5.625l-1.296875 0l0.265625 -1.265625l1.296875 0l0.5 -2.375l1.890625 -1.140625l-0.734375 3.515625l1.625 0l-0.28125 1.265625l-1.609375 0l-1.125 5.359375q-0.203125 1.015625 -0.203125 1.21875q0 0.28125 0.15625 0.4375q0.171875 0.15625 0.5625 0.15625q0.546875 0 0.96875 -0.109375zm8.433304 -1.9375l1.609375 0.15625q-0.34375 1.1875 -1.59375 2.265625q-1.234375 1.078125 -2.96875 1.078125q-1.0625 0 -1.96875 -0.5q-0.890625 -0.5 -1.359375 -1.4375q-0.46875 -0.953125 -0.46875 -2.15625q0 -1.59375 0.734375 -3.078125q0.734375 -1.484375 1.890625 -2.203125q1.171875 -0.734375 2.53125 -0.734375q1.734375 0 2.765625 1.078125q1.03125 1.0625 1.03125 2.921875q0 0.71875 -0.125 1.46875l-7.125 0q-0.046875 0.28125 -0.046875 0.5q0 1.359375 0.625 2.078125q0.625 0.71875 1.53125 0.71875q0.84375 0 1.65625 -0.546875q0.828125 -0.5625 1.28125 -1.609375zm-4.78125 -2.40625l5.421875 0q0.015625 -0.25 0.015625 -0.359375q0 -1.234375 -0.625 -1.890625q-0.625 -0.671875 -1.59375 -0.671875q-1.0625 0 -1.9375 0.734375q-0.859375 0.71875 -1.28125 2.1875zm7.406967 5.6875l4.21875 -4.90625l-2.421875 -4.765625l1.828125 0l0.8125 1.71875q0.453125 0.96875 0.828125 1.84375l2.78125 -3.5625l2.015625 0l-4.0625 4.890625l2.4375 4.78125l-1.8125 0l-0.96875 -1.96875q-0.3125 -0.625 -0.703125 -1.5625l-2.875 3.53125l-2.078125 0zm13.828125 -1.34375l-0.265625 1.359375q-0.59375 0.140625 -1.15625 0.140625q-0.984375 0 -1.5625 -0.46875q-0.4375 -0.375 -0.4375 -1.0q0 -0.3125 0.234375 -1.46875l1.171875 -5.625l-1.296875 0l0.265625 -1.265625l1.296875 0l0.5 -2.375l1.890625 -1.140625l-0.734375 3.515625l1.625 0l-0.28125 1.265625l-1.609375 0l-1.125 5.359375q-0.203125 1.015625 -0.203125 1.21875q0 0.28125 0.15625 0.4375q0.171875 0.15625 0.5625 0.15625q0.546875 0 0.96875 -0.109375zm11.210358 -0.8125l0 -3.671875l-3.640625 0l0 -1.515625l3.640625 0l0 -3.640625l1.546875 0l0 3.640625l3.640625 0l0 1.515625l-3.640625 0l0 3.671875l-1.546875 0zm11.390778 2.15625l2.78125 -13.359375l1.65625 0l-1.0 4.78125q0.78125 -0.71875 1.421875 -1.015625q0.640625 -0.296875 1.328125 -0.296875q1.359375 0 2.265625 1.015625q0.90625 1.0 0.90625 2.9375q0 1.28125 -0.375 2.359375q-0.359375 1.0625 -0.890625 1.78125q-0.53125 0.71875 -1.109375 1.15625q-0.578125 0.4375 -1.1875 0.640625q-0.59375 0.21875 -1.140625 0.21875q-0.96875 0 -1.703125 -0.5q-0.71875 -0.515625 -1.125 -1.546875l-0.390625 1.828125l-1.4375 0zm2.4375 -3.96875l-0.015625 0.3125q0 1.234375 0.59375 1.890625q0.59375 0.640625 1.484375 0.640625q0.859375 0 1.578125 -0.609375q0.734375 -0.609375 1.1875 -1.890625q0.46875 -1.28125 0.46875 -2.375q0 -1.21875 -0.59375 -1.890625q-0.578125 -0.671875 -1.4375 -0.671875q-0.90625 0 -1.65625 0.6875q-0.734375 0.6875 -1.234375 2.125q-0.375 1.0625 -0.375 1.78125zm14.531952 2.21875q-1.734375 1.96875 -3.5625 1.96875q-1.109375 0 -1.796875 -0.640625q-0.6875 -0.640625 -0.6875 -1.578125q0 -0.609375 0.296875 -2.09375l1.171875 -5.578125l1.65625 0l-1.296875 6.1875q-0.171875 0.765625 -0.171875 1.203125q0 0.546875 0.328125 0.859375q0.34375 0.296875 0.984375 0.296875q0.703125 0 1.359375 -0.328125q0.65625 -0.34375 1.125 -0.921875q0.484375 -0.578125 0.796875 -1.359375q0.1875 -0.5 0.453125 -1.765625l0.875 -4.171875l1.65625 0l-2.03125 9.671875l-1.515625 0l0.359375 -1.75zm4.0007324 1.75l1.765625 -8.40625l-1.484375 0l0.265625 -1.265625l1.484375 0l0.28125 -1.375q0.21875 -1.03125 0.4375 -1.484375q0.234375 -0.453125 0.75 -0.75q0.53125 -0.296875 1.4375 -0.296875q0.625 0 1.828125 0.265625l-0.296875 1.4375q-0.84375 -0.21875 -1.40625 -0.21875q-0.484375 0 -0.734375 0.25q-0.25 0.234375 -0.4375 1.125l-0.21875 1.046875l1.84375 0l-0.265625 1.265625l-1.84375 0l-1.75 8.40625l-1.65625 0zm5.1832886 0l1.765625 -8.40625l-1.484375 0l0.265625 -1.265625l1.484375 0l0.28125 -1.375q0.21875 -1.03125 0.4375 -1.484375q0.234375 -0.453125 0.75 -0.75q0.53125 -0.296875 1.4375 -0.296875q0.625 0 1.828125 0.265625l-0.296875 1.4375q-0.84375 -0.21875 -1.40625 -0.21875q-0.484375 0 -0.734375 0.25q-0.25 0.234375 -0.4375 1.125l-0.21875 1.046875l1.84375 0l-0.265625 1.265625l-1.84375 0l-1.75 8.40625l-1.65625 0zm12.058319 -3.28125l1.609375 0.15625q-0.34375 1.1875 -1.59375 2.265625q-1.234375 1.078125 -2.96875 1.078125q-1.0625 0 -1.96875 -0.5q-0.890625 -0.5 -1.359375 -1.4375q-0.46875 -0.953125 -0.46875 -2.15625q0 -1.59375 0.734375 -3.078125q0.734375 -1.484375 1.890625 -2.203125q1.171875 -0.734375 2.53125 -0.734375q1.734375 0 2.765625 1.078125q1.03125 1.0625 1.03125 2.921875q0 0.71875 -0.125 1.46875l-7.125 0q-0.046875 0.28125 -0.046875 0.5q0 1.359375 0.625 2.078125q0.625 0.71875 1.53125 0.71875q0.84375 0 1.65625 -0.546875q0.828125 -0.5625 1.28125 -1.609375zm-4.78125 -2.40625l5.421875 0q0.015625 -0.25 0.015625 -0.359375q0 -1.234375 -0.625 -1.890625q-0.625 -0.671875 -1.59375 -0.671875q-1.0625 0 -1.9375 0.734375q-0.859375 0.71875 -1.28125 2.1875zm8.063202 5.6875l2.015625 -9.671875l1.453125 0l-0.40625 1.96875q0.75 -1.109375 1.453125 -1.640625q0.71875 -0.546875 1.46875 -0.546875q0.5 0 1.21875 0.359375l-0.671875 1.53125q-0.4375 -0.3125 -0.9375 -0.3125q-0.875 0 -1.78125 0.96875q-0.90625 0.953125 -1.4375 3.46875l-0.8125 3.875l-1.5625 0z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m0 165.96588l118.74016 0l0 40.25197l-118.74016 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m0 165.96588l118.74016 0l0 40.25197l-118.74016 0z" fill-rule="evenodd"/><path fill="#000000" d="m23.145836 190.12123l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm13.5625 1.421875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.5895538 1.46875l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm6.228302 3.703125l0 -13.375l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm15.203842 3.59375q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5625q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.1875q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm4.188217 4.859375l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm5.572052 -2.890625l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm16.609375 -0.21875l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.125717 5.765625l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0z" fill-rule="nonzero"/></g></svg>
diff --git a/Documentation/networking/tls-offload-reorder-bad.svg b/Documentation/networking/tls-offload-reorder-bad.svg
new file mode 100644
index 000000000000..d107aaf0f71e
--- /dev/null
+++ b/Documentation/networking/tls-offload-reorder-bad.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 672.0 68.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l960.0 0l0 720.0l-960.0 0l0 -720.0z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l960.0 0l0 720.0l-960.0 0z" fill-rule="evenodd"/><path fill="#b6d7a8" d="m0 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m0 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path fill="#000000" d="m15.953125 52.942722l-1.640625 0l0 -10.453125q-0.59375 0.5625 -1.5625 1.140625q-0.953125 0.5625 -1.71875 0.84375l0 -1.59375q1.375 -0.640625 2.40625 -1.5625q1.03125 -0.921875 1.453125 -1.78125l1.0625 0l0 13.40625z" fill-rule="nonzero"/><path fill="#c9daf8" d="m340.69897 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m340.69897 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path fill="#000000" d="m355.73022 52.942722l0 -3.203125l-5.796875 0l0 -1.5l6.09375 -8.65625l1.34375 0l0 8.65625l1.796875 0l0 1.5l-1.796875 0l0 3.203125l-1.640625 0zm0 -4.703125l0 -6.015625l-4.1875 6.015625l4.1875 0z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m225.37527 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m225.37527 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path fill="#000000" d="m235.15652 49.411472l1.640625 -0.21875q0.28125 1.40625 0.953125 2.015625q0.6875 0.609375 1.65625 0.609375q1.15625 0 1.953125 -0.796875q0.796875 -0.796875 0.796875 -1.984375q0 -1.125 -0.734375 -1.859375q-0.734375 -0.734375 -1.875 -0.734375q-0.46875 0 -1.15625 0.171875l0.1875 -1.4375q0.15625 0.015625 0.265625 0.015625q1.046875 0 1.875 -0.546875q0.84375 -0.546875 0.84375 -1.671875q0 -0.90625 -0.609375 -1.5q-0.609375 -0.59375 -1.578125 -0.59375q-0.953125 0 -1.59375 0.609375q-0.640625 0.59375 -0.8125 1.796875l-1.640625 -0.296875q0.296875 -1.640625 1.359375 -2.546875q1.0625 -0.90625 2.65625 -0.90625q1.09375 0 2.0 0.46875q0.921875 0.46875 1.40625 1.28125q0.5 0.8125 0.5 1.71875q0 0.859375 -0.46875 1.578125q-0.46875 0.703125 -1.375 1.125q1.1875 0.28125 1.84375 1.140625q0.65625 0.859375 0.65625 2.15625q0 1.734375 -1.28125 2.953125q-1.265625 1.21875 -3.21875 1.21875q-1.765625 0 -2.921875 -1.046875q-1.15625 -1.046875 -1.328125 -2.71875z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m572.3295 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m572.3295 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path fill="#000000" d="m590.72015 51.364597l0 1.578125l-8.828125 0q-0.015625 -0.59375 0.1875 -1.140625q0.34375 -0.90625 1.078125 -1.78125q0.75 -0.875 2.15625 -2.015625q2.171875 -1.78125 2.9375 -2.828125q0.765625 -1.046875 0.765625 -1.96875q0 -0.984375 -0.703125 -1.640625q-0.6875 -0.671875 -1.8125 -0.671875q-1.1875 0 -1.90625 0.71875q-0.703125 0.703125 -0.703125 1.953125l-1.6875 -0.171875q0.171875 -1.890625 1.296875 -2.875q1.140625 -0.984375 3.03125 -0.984375q1.921875 0 3.046875 1.0625q1.125 1.0625 1.125 2.640625q0 0.796875 -0.328125 1.578125q-0.328125 0.78125 -1.09375 1.640625q-0.75 0.84375 -2.53125 2.34375q-1.46875 1.234375 -1.890625 1.6875q-0.421875 0.4375 -0.6875 0.875l6.546875 0z" fill-rule="nonzero"/><path fill="#e06666" d="m615.56793 24.999102l6.5512085 0l0 42.04725l-6.5512085 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m615.56793 24.999102l6.5512085 0l0 42.04725l-6.5512085 0z" fill-rule="evenodd"/><path fill="#c9daf8" d="m456.51425 24.999102l99.02365 0l0 42.04725l-99.02365 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m456.51425 24.999102l99.02365 0l0 42.04725l-99.02365 0z" fill-rule="evenodd"/><path fill="#000000" d="m466.2955 49.442722l1.71875 -0.140625q0.1875 1.25 0.875 1.890625q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.890625q0.828125 -0.890625 0.828125 -2.359375q0 -1.40625 -0.796875 -2.21875q-0.78125 -0.8125 -2.0625 -0.8125q-0.78125 0 -1.421875 0.359375q-0.640625 0.359375 -1.0 0.9375l-1.546875 -0.203125l1.296875 -6.859375l6.640625 0l0 1.5625l-5.328125 0l-0.71875 3.59375q1.203125 -0.84375 2.515625 -0.84375q1.75 0 2.953125 1.21875q1.203125 1.203125 1.203125 3.109375q0 1.8125 -1.046875 3.140625q-1.296875 1.625 -3.515625 1.625q-1.8125 0 -2.96875 -1.015625q-1.15625 -1.03125 -1.3125 -2.71875z" fill-rule="nonzero"/><path fill="#e06666" d="m391.1985 24.999102l6.551178 0l0 42.04725l-6.551178 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m391.1985 24.999102l6.551178 0l0 42.04725l-6.551178 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m114.43843 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="4.0,3.0" d="m114.43843 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m163.95024 24.999102c0 -12.5 114.47246 -25.007874 228.9449 -25.0c114.47241 0.007874016 228.94489 12.531496 228.94489 25.062992" fill-rule="evenodd"/><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.95024 24.9991c0 -12.499998 114.47246 -25.007872 228.9449 -24.999998c57.236206 0.003937008 114.47244 3.136811 157.3996 7.835138c21.463562 2.3491635 39.349915 5.0896897 51.8703 8.026144c3.130127 0.7341137 5.9248657 1.4804726 8.356262 2.236023c0.30395508 0.09444237 0.6022339 0.1890316 0.89471436 0.28375435l0.37457275 0.12388611" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="2.0" stroke-linecap="butt" d="m609.98517 21.270555l9.406311 2.1936665l-5.7955933 -7.7266836z" fill-rule="evenodd"/><path fill="#e06666" d="m47.56793 24.999102l6.551182 0l0 42.04725l-6.551182 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m47.56793 24.999102l6.551182 0l0 42.04725l-6.551182 0z" fill-rule="evenodd"/></g></svg>
diff --git a/Documentation/networking/tls-offload-reorder-good.svg b/Documentation/networking/tls-offload-reorder-good.svg
new file mode 100644
index 000000000000..10e17d91f70c
--- /dev/null
+++ b/Documentation/networking/tls-offload-reorder-good.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 672.0 68.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l960.0 0l0 720.0l-960.0 0l0 -720.0z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l960.0 0l0 720.0l-960.0 0z" fill-rule="evenodd"/><path fill="#b6d7a8" d="m0 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m0 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path fill="#000000" d="m15.953125 52.942722l-1.640625 0l0 -10.453125q-0.59375 0.5625 -1.5625 1.140625q-0.953125 0.5625 -1.71875 0.84375l0 -1.59375q1.375 -0.640625 2.40625 -1.5625q1.03125 -0.921875 1.453125 -1.78125l1.0625 0l0 13.40625z" fill-rule="nonzero"/><path fill="#b6d7a8" d="m340.69897 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m340.69897 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path fill="#000000" d="m355.73022 52.942722l0 -3.203125l-5.796875 0l0 -1.5l6.09375 -8.65625l1.34375 0l0 8.65625l1.796875 0l0 1.5l-1.796875 0l0 3.203125l-1.640625 0zm0 -4.703125l0 -6.015625l-4.1875 6.015625l4.1875 0z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m225.37527 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m225.37527 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path fill="#000000" d="m235.15652 49.411472l1.640625 -0.21875q0.28125 1.40625 0.953125 2.015625q0.6875 0.609375 1.65625 0.609375q1.15625 0 1.953125 -0.796875q0.796875 -0.796875 0.796875 -1.984375q0 -1.125 -0.734375 -1.859375q-0.734375 -0.734375 -1.875 -0.734375q-0.46875 0 -1.15625 0.171875l0.1875 -1.4375q0.15625 0.015625 0.265625 0.015625q1.046875 0 1.875 -0.546875q0.84375 -0.546875 0.84375 -1.671875q0 -0.90625 -0.609375 -1.5q-0.609375 -0.59375 -1.578125 -0.59375q-0.953125 0 -1.59375 0.609375q-0.640625 0.59375 -0.8125 1.796875l-1.640625 -0.296875q0.296875 -1.640625 1.359375 -2.546875q1.0625 -0.90625 2.65625 -0.90625q1.09375 0 2.0 0.46875q0.921875 0.46875 1.40625 1.28125q0.5 0.8125 0.5 1.71875q0 0.859375 -0.46875 1.578125q-0.46875 0.703125 -1.375 1.125q1.1875 0.28125 1.84375 1.140625q0.65625 0.859375 0.65625 2.15625q0 1.734375 -1.28125 2.953125q-1.265625 1.21875 -3.21875 1.21875q-1.765625 0 -2.921875 -1.046875q-1.15625 -1.046875 -1.328125 -2.71875z" fill-rule="nonzero"/><path fill="#e06666" d="m271.56793 24.999102l6.551178 0l0 42.04725l-6.551178 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m271.56793 24.999102l6.551178 0l0 42.04725l-6.551178 0z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m572.3295 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m572.3295 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path fill="#000000" d="m590.72015 51.364597l0 1.578125l-8.828125 0q-0.015625 -0.59375 0.1875 -1.140625q0.34375 -0.90625 1.078125 -1.78125q0.75 -0.875 2.15625 -2.015625q2.171875 -1.78125 2.9375 -2.828125q0.765625 -1.046875 0.765625 -1.96875q0 -0.984375 -0.703125 -1.640625q-0.6875 -0.671875 -1.8125 -0.671875q-1.1875 0 -1.90625 0.71875q-0.703125 0.703125 -0.703125 1.953125l-1.6875 -0.171875q0.171875 -1.890625 1.296875 -2.875q1.140625 -0.984375 3.03125 -0.984375q1.921875 0 3.046875 1.0625q1.125 1.0625 1.125 2.640625q0 0.796875 -0.328125 1.578125q-0.328125 0.78125 -1.09375 1.640625q-0.75 0.84375 -2.53125 2.34375q-1.46875 1.234375 -1.890625 1.6875q-0.421875 0.4375 -0.6875 0.875l6.546875 0z" fill-rule="nonzero"/><path fill="#b6d7a8" d="m456.51425 24.999102l99.02365 0l0 42.04725l-99.02365 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m456.51425 24.999102l99.02365 0l0 42.04725l-99.02365 0z" fill-rule="evenodd"/><path fill="#000000" d="m466.2955 49.442722l1.71875 -0.140625q0.1875 1.25 0.875 1.890625q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.890625q0.828125 -0.890625 0.828125 -2.359375q0 -1.40625 -0.796875 -2.21875q-0.78125 -0.8125 -2.0625 -0.8125q-0.78125 0 -1.421875 0.359375q-0.640625 0.359375 -1.0 0.9375l-1.546875 -0.203125l1.296875 -6.859375l6.640625 0l0 1.5625l-5.328125 0l-0.71875 3.59375q1.203125 -0.84375 2.515625 -0.84375q1.75 0 2.953125 1.21875q1.203125 1.203125 1.203125 3.109375q0 1.8125 -1.046875 3.140625q-1.296875 1.625 -3.515625 1.625q-1.8125 0 -2.96875 -1.015625q-1.15625 -1.03125 -1.3125 -2.71875z" fill-rule="nonzero"/><path fill="#e06666" d="m503.1985 24.999102l6.551178 0l0 42.04725l-6.551178 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m503.1985 24.999102l6.551178 0l0 42.04725l-6.551178 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m114.43843 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="4.0,3.0" d="m114.43843 24.999102l99.02362 0l0 42.04725l-99.02362 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m163.95024 24.999102c0 -12.5 114.47246 -25.007874 228.9449 -25.0c114.47241 0.007874016 228.94489 12.531496 228.94489 25.062992" fill-rule="evenodd"/><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.95024 24.9991c0 -12.499998 114.47246 -25.007872 228.9449 -24.999998c57.236206 0.003937008 114.47244 3.136811 157.3996 7.835138c21.463562 2.3491635 39.349915 5.0896897 51.8703 8.026144c3.130127 0.7341137 5.9248657 1.4804726 8.356262 2.236023c0.30395508 0.09444237 0.6022339 0.1890316 0.89471436 0.28375435l0.37457275 0.12388611" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="2.0" stroke-linecap="butt" d="m609.98517 21.270555l9.406311 2.1936665l-5.7955933 -7.7266836z" fill-rule="evenodd"/><path fill="#e06666" d="m47.56793 24.999102l6.551182 0l0 42.04725l-6.551182 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m47.56793 24.999102l6.551182 0l0 42.04725l-6.551182 0z" fill-rule="evenodd"/></g></svg>
diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst
new file mode 100644
index 000000000000..048e5ca44824
--- /dev/null
+++ b/Documentation/networking/tls-offload.rst
@@ -0,0 +1,515 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+==================
+Kernel TLS offload
+==================
+
+Kernel TLS operation
+====================
+
+Linux kernel provides TLS connection offload infrastructure. Once a TCP
+connection is in ``ESTABLISHED`` state user space can enable the TLS Upper
+Layer Protocol (ULP) and install the cryptographic connection state.
+For details regarding the user-facing interface refer to the TLS
+documentation in :ref:`Documentation/networking/tls.rst <kernel_tls>`.
+
+``ktls`` can operate in three modes:
+
+ * Software crypto mode (``TLS_SW``) - CPU handles the cryptography.
+   In most basic cases only crypto operations synchronous with the CPU
+   can be used, but depending on calling context CPU may utilize
+   asynchronous crypto accelerators. The use of accelerators introduces extra
+   latency on socket reads (decryption only starts when a read syscall
+   is made) and additional I/O load on the system.
+ * Packet-based NIC offload mode (``TLS_HW``) - the NIC handles crypto
+   on a packet by packet basis, provided the packets arrive in order.
+   This mode integrates best with the kernel stack and is described in detail
+   in the remaining part of this document
+   (``ethtool`` flags ``tls-hw-tx-offload`` and ``tls-hw-rx-offload``).
+ * Full TCP NIC offload mode (``TLS_HW_RECORD``) - mode of operation where
+   NIC driver and firmware replace the kernel networking stack
+   with its own TCP handling, it is not usable in production environments
+   making use of the Linux networking stack for example any firewalling
+   abilities or QoS and packet scheduling (``ethtool`` flag ``tls-hw-record``).
+
+The operation mode is selected automatically based on device configuration,
+offload opt-in or opt-out on per-connection basis is not currently supported.
+
+TX
+--
+
+At a high level user write requests are turned into a scatter list, the TLS ULP
+intercepts them, inserts record framing, performs encryption (in ``TLS_SW``
+mode) and then hands the modified scatter list to the TCP layer. From this
+point on the TCP stack proceeds as normal.
+
+In ``TLS_HW`` mode the encryption is not performed in the TLS ULP.
+Instead packets reach a device driver, the driver will mark the packets
+for crypto offload based on the socket the packet is attached to,
+and send them to the device for encryption and transmission.
+
+RX
+--
+
+On the receive side if the device handled decryption and authentication
+successfully, the driver will set the decrypted bit in the associated
+:c:type:`struct sk_buff <sk_buff>`. The packets reach the TCP stack and
+are handled normally. ``ktls`` is informed when data is queued to the socket
+and the ``strparser`` mechanism is used to delineate the records. Upon read
+request, records are retrieved from the socket and passed to decryption routine.
+If device decrypted all the segments of the record the decryption is skipped,
+otherwise software path handles decryption.
+
+.. kernel-figure::  tls-offload-layers.svg
+   :alt:	TLS offload layers
+   :align:	center
+   :figwidth:	28em
+
+   Layers of Kernel TLS stack
+
+Device configuration
+====================
+
+During driver initialization device sets the ``NETIF_F_HW_TLS_RX`` and
+``NETIF_F_HW_TLS_TX`` features and installs its
+:c:type:`struct tlsdev_ops <tlsdev_ops>`
+pointer in the :c:member:`tlsdev_ops` member of the
+:c:type:`struct net_device <net_device>`.
+
+When TLS cryptographic connection state is installed on a ``ktls`` socket
+(note that it is done twice, once for RX and once for TX direction,
+and the two are completely independent), the kernel checks if the underlying
+network device is offload-capable and attempts the offload. In case offload
+fails the connection is handled entirely in software using the same mechanism
+as if the offload was never tried.
+
+Offload request is performed via the :c:member:`tls_dev_add` callback of
+:c:type:`struct tlsdev_ops <tlsdev_ops>`:
+
+.. code-block:: c
+
+	int (*tls_dev_add)(struct net_device *netdev, struct sock *sk,
+			   enum tls_offload_ctx_dir direction,
+			   struct tls_crypto_info *crypto_info,
+			   u32 start_offload_tcp_sn);
+
+``direction`` indicates whether the cryptographic information is for
+the received or transmitted packets. Driver uses the ``sk`` parameter
+to retrieve the connection 5-tuple and socket family (IPv4 vs IPv6).
+Cryptographic information in ``crypto_info`` includes the key, iv, salt
+as well as TLS record sequence number. ``start_offload_tcp_sn`` indicates
+which TCP sequence number corresponds to the beginning of the record with
+sequence number from ``crypto_info``. The driver can add its state
+at the end of kernel structures (see :c:member:`driver_state` members
+in ``include/net/tls.h``) to avoid additional allocations and pointer
+dereferences.
+
+TX
+--
+
+After TX state is installed, the stack guarantees that the first segment
+of the stream will start exactly at the ``start_offload_tcp_sn`` sequence
+number, simplifying TCP sequence number matching.
+
+TX offload being fully initialized does not imply that all segments passing
+through the driver and which belong to the offloaded socket will be after
+the expected sequence number and will have kernel record information.
+In particular, already encrypted data may have been queued to the socket
+before installing the connection state in the kernel.
+
+RX
+--
+
+In RX direction local networking stack has little control over the segmentation,
+so the initial records' TCP sequence number may be anywhere inside the segment.
+
+Normal operation
+================
+
+At the minimum the device maintains the following state for each connection, in
+each direction:
+
+ * crypto secrets (key, iv, salt)
+ * crypto processing state (partial blocks, partial authentication tag, etc.)
+ * record metadata (sequence number, processing offset and length)
+ * expected TCP sequence number
+
+There are no guarantees on record length or record segmentation. In particular
+segments may start at any point of a record and contain any number of records.
+Assuming segments are received in order, the device should be able to perform
+crypto operations and authentication regardless of segmentation. For this
+to be possible device has to keep small amount of segment-to-segment state.
+This includes at least:
+
+ * partial headers (if a segment carried only a part of the TLS header)
+ * partial data block
+ * partial authentication tag (all data had been seen but part of the
+   authentication tag has to be written or read from the subsequent segment)
+
+Record reassembly is not necessary for TLS offload. If the packets arrive
+in order the device should be able to handle them separately and make
+forward progress.
+
+TX
+--
+
+The kernel stack performs record framing reserving space for the authentication
+tag and populating all other TLS header and tailer fields.
+
+Both the device and the driver maintain expected TCP sequence numbers
+due to the possibility of retransmissions and the lack of software fallback
+once the packet reaches the device.
+For segments passed in order, the driver marks the packets with
+a connection identifier (note that a 5-tuple lookup is insufficient to identify
+packets requiring HW offload, see the :ref:`5tuple_problems` section)
+and hands them to the device. The device identifies the packet as requiring
+TLS handling and confirms the sequence number matches its expectation.
+The device performs encryption and authentication of the record data.
+It replaces the authentication tag and TCP checksum with correct values.
+
+RX
+--
+
+Before a packet is DMAed to the host (but after NIC's embedded switching
+and packet transformation functions) the device validates the Layer 4
+checksum and performs a 5-tuple lookup to find any TLS connection the packet
+may belong to (technically a 4-tuple
+lookup is sufficient - IP addresses and TCP port numbers, as the protocol
+is always TCP). If connection is matched device confirms if the TCP sequence
+number is the expected one and proceeds to TLS handling (record delineation,
+decryption, authentication for each record in the packet). The device leaves
+the record framing unmodified, the stack takes care of record decapsulation.
+Device indicates successful handling of TLS offload in the per-packet context
+(descriptor) passed to the host.
+
+Upon reception of a TLS offloaded packet, the driver sets
+the :c:member:`decrypted` mark in :c:type:`struct sk_buff <sk_buff>`
+corresponding to the segment. Networking stack makes sure decrypted
+and non-decrypted segments do not get coalesced (e.g. by GRO or socket layer)
+and takes care of partial decryption.
+
+Resync handling
+===============
+
+In presence of packet drops or network packet reordering, the device may lose
+synchronization with the TLS stream, and require a resync with the kernel's
+TCP stack.
+
+Note that resync is only attempted for connections which were successfully
+added to the device table and are in TLS_HW mode. For example,
+if the table was full when cryptographic state was installed in the kernel,
+such connection will never get offloaded. Therefore the resync request
+does not carry any cryptographic connection state.
+
+TX
+--
+
+Segments transmitted from an offloaded socket can get out of sync
+in similar ways to the receive side-retransmissions - local drops
+are possible, though network reorders are not. There are currently
+two mechanisms for dealing with out of order segments.
+
+Crypto state rebuilding
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Whenever an out of order segment is transmitted the driver provides
+the device with enough information to perform cryptographic operations.
+This means most likely that the part of the record preceding the current
+segment has to be passed to the device as part of the packet context,
+together with its TCP sequence number and TLS record number. The device
+can then initialize its crypto state, process and discard the preceding
+data (to be able to insert the authentication tag) and move onto handling
+the actual packet.
+
+In this mode depending on the implementation the driver can either ask
+for a continuation with the crypto state and the new sequence number
+(next expected segment is the one after the out of order one), or continue
+with the previous stream state - assuming that the out of order segment
+was just a retransmission. The former is simpler, and does not require
+retransmission detection therefore it is the recommended method until
+such time it is proven inefficient.
+
+Next record sync
+~~~~~~~~~~~~~~~~
+
+Whenever an out of order segment is detected the driver requests
+that the ``ktls`` software fallback code encrypt it. If the segment's
+sequence number is lower than expected the driver assumes retransmission
+and doesn't change device state. If the segment is in the future, it
+may imply a local drop, the driver asks the stack to sync the device
+to the next record state and falls back to software.
+
+Resync request is indicated with:
+
+.. code-block:: c
+
+  void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq)
+
+Until resync is complete driver should not access its expected TCP
+sequence number (as it will be updated from a different context).
+Following helper should be used to test if resync is complete:
+
+.. code-block:: c
+
+  bool tls_offload_tx_resync_pending(struct sock *sk)
+
+Next time ``ktls`` pushes a record it will first send its TCP sequence number
+and TLS record number to the driver. Stack will also make sure that
+the new record will start on a segment boundary (like it does when
+the connection is initially added).
+
+RX
+--
+
+A small amount of RX reorder events may not require a full resynchronization.
+In particular the device should not lose synchronization
+when record boundary can be recovered:
+
+.. kernel-figure::  tls-offload-reorder-good.svg
+   :alt:	reorder of non-header segment
+   :align:	center
+
+   Reorder of non-header segment
+
+Green segments are successfully decrypted, blue ones are passed
+as received on wire, red stripes mark start of new records.
+
+In above case segment 1 is received and decrypted successfully.
+Segment 2 was dropped so 3 arrives out of order. The device knows
+the next record starts inside 3, based on record length in segment 1.
+Segment 3 is passed untouched, because due to lack of data from segment 2
+the remainder of the previous record inside segment 3 cannot be handled.
+The device can, however, collect the authentication algorithm's state
+and partial block from the new record in segment 3 and when 4 and 5
+arrive continue decryption. Finally when 2 arrives it's completely outside
+of expected window of the device so it's passed as is without special
+handling. ``ktls`` software fallback handles the decryption of record
+spanning segments 1, 2 and 3. The device did not get out of sync,
+even though two segments did not get decrypted.
+
+Kernel synchronization may be necessary if the lost segment contained
+a record header and arrived after the next record header has already passed:
+
+.. kernel-figure::  tls-offload-reorder-bad.svg
+   :alt:	reorder of header segment
+   :align:	center
+
+   Reorder of segment with a TLS header
+
+In this example segment 2 gets dropped, and it contains a record header.
+Device can only detect that segment 4 also contains a TLS header
+if it knows the length of the previous record from segment 2. In this case
+the device will lose synchronization with the stream.
+
+Stream scan resynchronization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When the device gets out of sync and the stream reaches TCP sequence
+numbers more than a max size record past the expected TCP sequence number,
+the device starts scanning for a known header pattern. For example
+for TLS 1.2 and TLS 1.3 subsequent bytes of value ``0x03 0x03`` occur
+in the SSL/TLS version field of the header. Once pattern is matched
+the device continues attempting parsing headers at expected locations
+(based on the length fields at guessed locations).
+Whenever the expected location does not contain a valid header the scan
+is restarted.
+
+When the header is matched the device sends a confirmation request
+to the kernel, asking if the guessed location is correct (if a TLS record
+really starts there), and which record sequence number the given header had.
+The kernel confirms the guessed location was correct and tells the device
+the record sequence number. Meanwhile, the device had been parsing
+and counting all records since the just-confirmed one, it adds the number
+of records it had seen to the record number provided by the kernel.
+At this point the device is in sync and can resume decryption at next
+segment boundary.
+
+In a pathological case the device may latch onto a sequence of matching
+headers and never hear back from the kernel (there is no negative
+confirmation from the kernel). The implementation may choose to periodically
+restart scan. Given how unlikely falsely-matching stream is, however,
+periodic restart is not deemed necessary.
+
+Special care has to be taken if the confirmation request is passed
+asynchronously to the packet stream and record may get processed
+by the kernel before the confirmation request.
+
+Stack-driven resynchronization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The driver may also request the stack to perform resynchronization
+whenever it sees the records are no longer getting decrypted.
+If the connection is configured in this mode the stack automatically
+schedules resynchronization after it has received two completely encrypted
+records.
+
+The stack waits for the socket to drain and informs the device about
+the next expected record number and its TCP sequence number. If the
+records continue to be received fully encrypted stack retries the
+synchronization with an exponential back off (first after 2 encrypted
+records, then after 4 records, after 8, after 16... up until every
+128 records).
+
+Error handling
+==============
+
+TX
+--
+
+Packets may be redirected or rerouted by the stack to a different
+device than the selected TLS offload device. The stack will handle
+such condition using the :c:func:`sk_validate_xmit_skb` helper
+(TLS offload code installs :c:func:`tls_validate_xmit_skb` at this hook).
+Offload maintains information about all records until the data is
+fully acknowledged, so if skbs reach the wrong device they can be handled
+by software fallback.
+
+Any device TLS offload handling error on the transmission side must result
+in the packet being dropped. For example if a packet got out of order
+due to a bug in the stack or the device, reached the device and can't
+be encrypted such packet must be dropped.
+
+RX
+--
+
+If the device encounters any problems with TLS offload on the receive
+side it should pass the packet to the host's networking stack as it was
+received on the wire.
+
+For example authentication failure for any record in the segment should
+result in passing the unmodified packet to the software fallback. This means
+packets should not be modified "in place". Splitting segments to handle partial
+decryption is not advised. In other words either all records in the packet
+had been handled successfully and authenticated or the packet has to be passed
+to the host's stack as it was on the wire (recovering original packet in the
+driver if device provides precise error is sufficient).
+
+The Linux networking stack does not provide a way of reporting per-packet
+decryption and authentication errors, packets with errors must simply not
+have the :c:member:`decrypted` mark set.
+
+A packet should also not be handled by the TLS offload if it contains
+incorrect checksums.
+
+Performance metrics
+===================
+
+TLS offload can be characterized by the following basic metrics:
+
+ * max connection count
+ * connection installation rate
+ * connection installation latency
+ * total cryptographic performance
+
+Note that each TCP connection requires a TLS session in both directions,
+the performance may be reported treating each direction separately.
+
+Max connection count
+--------------------
+
+The number of connections device can support can be exposed via
+``devlink resource`` API.
+
+Total cryptographic performance
+-------------------------------
+
+Offload performance may depend on segment and record size.
+
+Overload of the cryptographic subsystem of the device should not have
+significant performance impact on non-offloaded streams.
+
+Statistics
+==========
+
+Following minimum set of TLS-related statistics should be reported
+by the driver:
+
+ * ``rx_tls_decrypted`` - number of successfully decrypted TLS segments
+ * ``tx_tls_encrypted`` - number of in-order TLS segments passed to device
+   for encryption
+ * ``tx_tls_ooo`` - number of TX packets which were part of a TLS stream
+   but did not arrive in the expected order
+ * ``tx_tls_drop_no_sync_data`` - number of TX packets dropped because
+   they arrived out of order and associated record could not be found
+
+Notable corner cases, exceptions and additional requirements
+============================================================
+
+.. _5tuple_problems:
+
+5-tuple matching limitations
+----------------------------
+
+The device can only recognize received packets based on the 5-tuple
+of the socket. Current ``ktls`` implementation will not offload sockets
+routed through software interfaces such as those used for tunneling
+or virtual networking. However, many packet transformations performed
+by the networking stack (most notably any BPF logic) do not require
+any intermediate software device, therefore a 5-tuple match may
+consistently miss at the device level. In such cases the device
+should still be able to perform TX offload (encryption) and should
+fallback cleanly to software decryption (RX).
+
+Out of order
+------------
+
+Introducing extra processing in NICs should not cause packets to be
+transmitted or received out of order, for example pure ACK packets
+should not be reordered with respect to data segments.
+
+Ingress reorder
+---------------
+
+A device is permitted to perform packet reordering for consecutive
+TCP segments (i.e. placing packets in the correct order) but any form
+of additional buffering is disallowed.
+
+Coexistence with standard networking offload features
+-----------------------------------------------------
+
+Offloaded ``ktls`` sockets should support standard TCP stack features
+transparently. Enabling device TLS offload should not cause any difference
+in packets as seen on the wire.
+
+Transport layer transparency
+----------------------------
+
+The device should not modify any packet headers for the purpose
+of the simplifying TLS offload.
+
+The device should not depend on any packet headers beyond what is strictly
+necessary for TLS offload.
+
+Segment drops
+-------------
+
+Dropping packets is acceptable only in the event of catastrophic
+system errors and should never be used as an error handling mechanism
+in cases arising from normal operation. In other words, reliance
+on TCP retransmissions to handle corner cases is not acceptable.
+
+TLS device features
+-------------------
+
+Drivers should ignore the changes to TLS the device feature flags.
+These flags will be acted upon accordingly by the core ``ktls`` code.
+TLS device feature flags only control adding of new TLS connection
+offloads, old connections will remain active after flags are cleared.
+
+Known bugs
+==========
+
+skb_orphan() leaks clear text
+-----------------------------
+
+Currently drivers depend on the :c:member:`sk` member of
+:c:type:`struct sk_buff <sk_buff>` to identify segments requiring
+encryption. Any operation which removes or does not preserve the socket
+association such as :c:func:`skb_orphan` or :c:func:`skb_clone`
+will cause the driver to miss the packets and lead to clear text leaks.
+
+Redirects leak clear text
+-------------------------
+
+In the RX direction, if segment has already been decrypted by the device
+and it gets redirected or mirrored - clear text will be transmitted out.
diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
new file mode 100644
index 000000000000..5bcbf75e2025
--- /dev/null
+++ b/Documentation/networking/tls.rst
@@ -0,0 +1,215 @@
+.. _kernel_tls:
+
+==========
+Kernel TLS
+==========
+
+Overview
+========
+
+Transport Layer Security (TLS) is a Upper Layer Protocol (ULP) that runs over
+TCP. TLS provides end-to-end data integrity and confidentiality.
+
+User interface
+==============
+
+Creating a TLS connection
+-------------------------
+
+First create a new TCP socket and set the TLS ULP.
+
+.. code-block:: c
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
+
+Setting the TLS ULP allows us to set/get TLS socket options. Currently
+only the symmetric encryption is handled in the kernel.  After the TLS
+handshake is complete, we have all the parameters required to move the
+data-path to the kernel. There is a separate socket option for moving
+the transmit and the receive into the kernel.
+
+.. code-block:: c
+
+  /* From linux/tls.h */
+  struct tls_crypto_info {
+          unsigned short version;
+          unsigned short cipher_type;
+  };
+
+  struct tls12_crypto_info_aes_gcm_128 {
+          struct tls_crypto_info info;
+          unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE];
+          unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
+          unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE];
+          unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE];
+  };
+
+
+  struct tls12_crypto_info_aes_gcm_128 crypto_info;
+
+  crypto_info.info.version = TLS_1_2_VERSION;
+  crypto_info.info.cipher_type = TLS_CIPHER_AES_GCM_128;
+  memcpy(crypto_info.iv, iv_write, TLS_CIPHER_AES_GCM_128_IV_SIZE);
+  memcpy(crypto_info.rec_seq, seq_number_write,
+					TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
+  memcpy(crypto_info.key, cipher_key_write, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+  memcpy(crypto_info.salt, implicit_iv_write, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+
+  setsockopt(sock, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));
+
+Transmit and receive are set separately, but the setup is the same, using either
+TLS_TX or TLS_RX.
+
+Sending TLS application data
+----------------------------
+
+After setting the TLS_TX socket option all application data sent over this
+socket is encrypted using TLS and the parameters provided in the socket option.
+For example, we can send an encrypted hello world record as follows:
+
+.. code-block:: c
+
+  const char *msg = "hello world\n";
+  send(sock, msg, strlen(msg));
+
+send() data is directly encrypted from the userspace buffer provided
+to the encrypted kernel send buffer if possible.
+
+The sendfile system call will send the file's data over TLS records of maximum
+length (2^14).
+
+.. code-block:: c
+
+  file = open(filename, O_RDONLY);
+  fstat(file, &stat);
+  sendfile(sock, file, &offset, stat.st_size);
+
+TLS records are created and sent after each send() call, unless
+MSG_MORE is passed.  MSG_MORE will delay creation of a record until
+MSG_MORE is not passed, or the maximum record size is reached.
+
+The kernel will need to allocate a buffer for the encrypted data.
+This buffer is allocated at the time send() is called, such that
+either the entire send() call will return -ENOMEM (or block waiting
+for memory), or the encryption will always succeed.  If send() returns
+-ENOMEM and some data was left on the socket buffer from a previous
+call using MSG_MORE, the MSG_MORE data is left on the socket buffer.
+
+Receiving TLS application data
+------------------------------
+
+After setting the TLS_RX socket option, all recv family socket calls
+are decrypted using TLS parameters provided.  A full TLS record must
+be received before decryption can happen.
+
+.. code-block:: c
+
+  char buffer[16384];
+  recv(sock, buffer, 16384);
+
+Received data is decrypted directly in to the user buffer if it is
+large enough, and no additional allocations occur.  If the userspace
+buffer is too small, data is decrypted in the kernel and copied to
+userspace.
+
+``EINVAL`` is returned if the TLS version in the received message does not
+match the version passed in setsockopt.
+
+``EMSGSIZE`` is returned if the received message is too big.
+
+``EBADMSG`` is returned if decryption failed for any other reason.
+
+Send TLS control messages
+-------------------------
+
+Other than application data, TLS has control messages such as alert
+messages (record type 21) and handshake messages (record type 22), etc.
+These messages can be sent over the socket by providing the TLS record type
+via a CMSG. For example the following function sends @data of @length bytes
+using a record of type @record_type.
+
+.. code-block:: c
+
+  /* send TLS control message using record_type */
+  static int klts_send_ctrl_message(int sock, unsigned char record_type,
+                                    void *data, size_t length)
+  {
+        struct msghdr msg = {0};
+        int cmsg_len = sizeof(record_type);
+        struct cmsghdr *cmsg;
+        char buf[CMSG_SPACE(cmsg_len)];
+        struct iovec msg_iov;   /* Vector of data to send/receive into.  */
+
+        msg.msg_control = buf;
+        msg.msg_controllen = sizeof(buf);
+        cmsg = CMSG_FIRSTHDR(&msg);
+        cmsg->cmsg_level = SOL_TLS;
+        cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
+        cmsg->cmsg_len = CMSG_LEN(cmsg_len);
+        *CMSG_DATA(cmsg) = record_type;
+        msg.msg_controllen = cmsg->cmsg_len;
+
+        msg_iov.iov_base = data;
+        msg_iov.iov_len = length;
+        msg.msg_iov = &msg_iov;
+        msg.msg_iovlen = 1;
+
+        return sendmsg(sock, &msg, 0);
+  }
+
+Control message data should be provided unencrypted, and will be
+encrypted by the kernel.
+
+Receiving TLS control messages
+------------------------------
+
+TLS control messages are passed in the userspace buffer, with message
+type passed via cmsg.  If no cmsg buffer is provided, an error is
+returned if a control message is received.  Data messages may be
+received without a cmsg buffer set.
+
+.. code-block:: c
+
+  char buffer[16384];
+  char cmsg[CMSG_SPACE(sizeof(unsigned char))];
+  struct msghdr msg = {0};
+  msg.msg_control = cmsg;
+  msg.msg_controllen = sizeof(cmsg);
+
+  struct iovec msg_iov;
+  msg_iov.iov_base = buffer;
+  msg_iov.iov_len = 16384;
+
+  msg.msg_iov = &msg_iov;
+  msg.msg_iovlen = 1;
+
+  int ret = recvmsg(sock, &msg, 0 /* flags */);
+
+  struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+  if (cmsg->cmsg_level == SOL_TLS &&
+      cmsg->cmsg_type == TLS_GET_RECORD_TYPE) {
+      int record_type = *((unsigned char *)CMSG_DATA(cmsg));
+      // Do something with record_type, and control message data in
+      // buffer.
+      //
+      // Note that record_type may be == to application data (23).
+  } else {
+      // Buffer contains application data.
+  }
+
+recv will never return data from mixed types of TLS records.
+
+Integrating in to userspace TLS library
+---------------------------------------
+
+At a high level, the kernel TLS ULP is a replacement for the record
+layer of a userspace TLS library.
+
+A patchset to OpenSSL to use ktls as the record layer is
+`here <https://github.com/Mellanox/openssl/commits/tls_rx2>`_.
+
+`An example <https://github.com/ktls/af_ktls-tool/commits/RX>`_
+of calling send directly after a handshake using gnutls.
+Since it doesn't implement a full record layer, control
+messages are not supported.
diff --git a/Documentation/networking/tls.txt b/Documentation/networking/tls.txt
deleted file mode 100644
index 58b5ef75f1b7..000000000000
--- a/Documentation/networking/tls.txt
+++ /dev/null
@@ -1,197 +0,0 @@
-Overview
-========
-
-Transport Layer Security (TLS) is a Upper Layer Protocol (ULP) that runs over
-TCP. TLS provides end-to-end data integrity and confidentiality.
-
-User interface
-==============
-
-Creating a TLS connection
--------------------------
-
-First create a new TCP socket and set the TLS ULP.
-
-  sock = socket(AF_INET, SOCK_STREAM, 0);
-  setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
-
-Setting the TLS ULP allows us to set/get TLS socket options. Currently
-only the symmetric encryption is handled in the kernel.  After the TLS
-handshake is complete, we have all the parameters required to move the
-data-path to the kernel. There is a separate socket option for moving
-the transmit and the receive into the kernel.
-
-  /* From linux/tls.h */
-  struct tls_crypto_info {
-          unsigned short version;
-          unsigned short cipher_type;
-  };
-
-  struct tls12_crypto_info_aes_gcm_128 {
-          struct tls_crypto_info info;
-          unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE];
-          unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
-          unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE];
-          unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE];
-  };
-
-
-  struct tls12_crypto_info_aes_gcm_128 crypto_info;
-
-  crypto_info.info.version = TLS_1_2_VERSION;
-  crypto_info.info.cipher_type = TLS_CIPHER_AES_GCM_128;
-  memcpy(crypto_info.iv, iv_write, TLS_CIPHER_AES_GCM_128_IV_SIZE);
-  memcpy(crypto_info.rec_seq, seq_number_write,
-					TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
-  memcpy(crypto_info.key, cipher_key_write, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
-  memcpy(crypto_info.salt, implicit_iv_write, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
-
-  setsockopt(sock, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));
-
-Transmit and receive are set separately, but the setup is the same, using either
-TLS_TX or TLS_RX.
-
-Sending TLS application data
-----------------------------
-
-After setting the TLS_TX socket option all application data sent over this
-socket is encrypted using TLS and the parameters provided in the socket option.
-For example, we can send an encrypted hello world record as follows:
-
-  const char *msg = "hello world\n";
-  send(sock, msg, strlen(msg));
-
-send() data is directly encrypted from the userspace buffer provided
-to the encrypted kernel send buffer if possible.
-
-The sendfile system call will send the file's data over TLS records of maximum
-length (2^14).
-
-  file = open(filename, O_RDONLY);
-  fstat(file, &stat);
-  sendfile(sock, file, &offset, stat.st_size);
-
-TLS records are created and sent after each send() call, unless
-MSG_MORE is passed.  MSG_MORE will delay creation of a record until
-MSG_MORE is not passed, or the maximum record size is reached.
-
-The kernel will need to allocate a buffer for the encrypted data.
-This buffer is allocated at the time send() is called, such that
-either the entire send() call will return -ENOMEM (or block waiting
-for memory), or the encryption will always succeed.  If send() returns
--ENOMEM and some data was left on the socket buffer from a previous
-call using MSG_MORE, the MSG_MORE data is left on the socket buffer.
-
-Receiving TLS application data
-------------------------------
-
-After setting the TLS_RX socket option, all recv family socket calls
-are decrypted using TLS parameters provided.  A full TLS record must
-be received before decryption can happen.
-
-  char buffer[16384];
-  recv(sock, buffer, 16384);
-
-Received data is decrypted directly in to the user buffer if it is
-large enough, and no additional allocations occur.  If the userspace
-buffer is too small, data is decrypted in the kernel and copied to
-userspace.
-
-EINVAL is returned if the TLS version in the received message does not
-match the version passed in setsockopt.
-
-EMSGSIZE is returned if the received message is too big.
-
-EBADMSG is returned if decryption failed for any other reason.
-
-Send TLS control messages
--------------------------
-
-Other than application data, TLS has control messages such as alert
-messages (record type 21) and handshake messages (record type 22), etc.
-These messages can be sent over the socket by providing the TLS record type
-via a CMSG. For example the following function sends @data of @length bytes
-using a record of type @record_type.
-
-/* send TLS control message using record_type */
-  static int klts_send_ctrl_message(int sock, unsigned char record_type,
-                                  void *data, size_t length)
-  {
-        struct msghdr msg = {0};
-        int cmsg_len = sizeof(record_type);
-        struct cmsghdr *cmsg;
-        char buf[CMSG_SPACE(cmsg_len)];
-        struct iovec msg_iov;   /* Vector of data to send/receive into.  */
-
-        msg.msg_control = buf;
-        msg.msg_controllen = sizeof(buf);
-        cmsg = CMSG_FIRSTHDR(&msg);
-        cmsg->cmsg_level = SOL_TLS;
-        cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
-        cmsg->cmsg_len = CMSG_LEN(cmsg_len);
-        *CMSG_DATA(cmsg) = record_type;
-        msg.msg_controllen = cmsg->cmsg_len;
-
-        msg_iov.iov_base = data;
-        msg_iov.iov_len = length;
-        msg.msg_iov = &msg_iov;
-        msg.msg_iovlen = 1;
-
-        return sendmsg(sock, &msg, 0);
-  }
-
-Control message data should be provided unencrypted, and will be
-encrypted by the kernel.
-
-Receiving TLS control messages
-------------------------------
-
-TLS control messages are passed in the userspace buffer, with message
-type passed via cmsg.  If no cmsg buffer is provided, an error is
-returned if a control message is received.  Data messages may be
-received without a cmsg buffer set.
-
-  char buffer[16384];
-  char cmsg[CMSG_SPACE(sizeof(unsigned char))];
-  struct msghdr msg = {0};
-  msg.msg_control = cmsg;
-  msg.msg_controllen = sizeof(cmsg);
-
-  struct iovec msg_iov;
-  msg_iov.iov_base = buffer;
-  msg_iov.iov_len = 16384;
-
-  msg.msg_iov = &msg_iov;
-  msg.msg_iovlen = 1;
-
-  int ret = recvmsg(sock, &msg, 0 /* flags */);
-
-  struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
-  if (cmsg->cmsg_level == SOL_TLS &&
-      cmsg->cmsg_type == TLS_GET_RECORD_TYPE) {
-      int record_type = *((unsigned char *)CMSG_DATA(cmsg));
-      // Do something with record_type, and control message data in
-      // buffer.
-      //
-      // Note that record_type may be == to application data (23).
-  } else {
-      // Buffer contains application data.
-  }
-
-recv will never return data from mixed types of TLS records.
-
-Integrating in to userspace TLS library
----------------------------------------
-
-At a high level, the kernel TLS ULP is a replacement for the record
-layer of a userspace TLS library.
-
-A patchset to OpenSSL to use ktls as the record layer is here:
-
-https://github.com/Mellanox/openssl/commits/tls_rx2
-
-An example of calling send directly after a handshake using
-gnutls.  Since it doesn't implement a full record layer, control
-messages are not supported:
-
-https://github.com/ktls/af_ktls-tool/commits/RX
diff --git a/Documentation/nfc/nfc-hci.txt b/Documentation/nfc/nfc-hci.txt
deleted file mode 100644
index 0dc078cab972..000000000000
--- a/Documentation/nfc/nfc-hci.txt
+++ /dev/null
@@ -1,290 +0,0 @@
-HCI backend for NFC Core
-
-Author: Eric Lapuyade, Samuel Ortiz
-Contact: eric.lapuyade@intel.com, samuel.ortiz@intel.com
-
-General
--------
-
-The HCI layer implements much of the ETSI TS 102 622 V10.2.0 specification. It
-enables easy writing of HCI-based NFC drivers. The HCI layer runs as an NFC Core
-backend, implementing an abstract nfc device and translating NFC Core API
-to HCI commands and events.
-
-HCI
----
-
-HCI registers as an nfc device with NFC Core. Requests coming from userspace are
-routed through netlink sockets to NFC Core and then to HCI. From this point,
-they are translated in a sequence of HCI commands sent to the HCI layer in the
-host controller (the chip). Commands can be executed synchronously (the sending
-context blocks waiting for response) or asynchronously (the response is returned
-from HCI Rx context).
-HCI events can also be received from the host controller. They will be handled
-and a translation will be forwarded to NFC Core as needed. There are hooks to
-let the HCI driver handle proprietary events or override standard behavior.
-HCI uses 2 execution contexts:
-- one for executing commands : nfc_hci_msg_tx_work(). Only one command
-can be executing at any given moment.
-- one for dispatching received events and commands : nfc_hci_msg_rx_work().
-
-HCI Session initialization:
----------------------------
-
-The Session initialization is an HCI standard which must unfortunately
-support proprietary gates. This is the reason why the driver will pass a list
-of proprietary gates that must be part of the session. HCI will ensure all
-those gates have pipes connected when the hci device is set up.
-In case the chip supports pre-opened gates and pseudo-static pipes, the driver
-can pass that information to HCI core.
-
-HCI Gates and Pipes
--------------------
-
-A gate defines the 'port' where some service can be found. In order to access
-a service, one must create a pipe to that gate and open it. In this
-implementation, pipes are totally hidden. The public API only knows gates.
-This is consistent with the driver need to send commands to proprietary gates
-without knowing the pipe connected to it.
-
-Driver interface
-----------------
-
-A driver is generally written in two parts : the physical link management and
-the HCI management. This makes it easier to maintain a driver for a chip that
-can be connected using various phy (i2c, spi, ...)
-
-HCI Management
---------------
-
-A driver would normally register itself with HCI and provide the following
-entry points:
-
-struct nfc_hci_ops {
-	int (*open)(struct nfc_hci_dev *hdev);
-	void (*close)(struct nfc_hci_dev *hdev);
-	int (*hci_ready) (struct nfc_hci_dev *hdev);
-	int (*xmit) (struct nfc_hci_dev *hdev, struct sk_buff *skb);
-	int (*start_poll) (struct nfc_hci_dev *hdev,
-			   u32 im_protocols, u32 tm_protocols);
-	int (*dep_link_up)(struct nfc_hci_dev *hdev, struct nfc_target *target,
-			   u8 comm_mode, u8 *gb, size_t gb_len);
-	int (*dep_link_down)(struct nfc_hci_dev *hdev);
-	int (*target_from_gate) (struct nfc_hci_dev *hdev, u8 gate,
-				 struct nfc_target *target);
-	int (*complete_target_discovered) (struct nfc_hci_dev *hdev, u8 gate,
-					   struct nfc_target *target);
-	int (*im_transceive) (struct nfc_hci_dev *hdev,
-			      struct nfc_target *target, struct sk_buff *skb,
-			      data_exchange_cb_t cb, void *cb_context);
-	int (*tm_send)(struct nfc_hci_dev *hdev, struct sk_buff *skb);
-	int (*check_presence)(struct nfc_hci_dev *hdev,
-			      struct nfc_target *target);
-	int (*event_received)(struct nfc_hci_dev *hdev, u8 gate, u8 event,
-			      struct sk_buff *skb);
-};
-
-- open() and close() shall turn the hardware on and off.
-- hci_ready() is an optional entry point that is called right after the hci
-session has been set up. The driver can use it to do additional initialization
-that must be performed using HCI commands.
-- xmit() shall simply write a frame to the physical link.
-- start_poll() is an optional entrypoint that shall set the hardware in polling
-mode. This must be implemented only if the hardware uses proprietary gates or a
-mechanism slightly different from the HCI standard.
-- dep_link_up() is called after a p2p target has been detected, to finish
-the p2p connection setup with hardware parameters that need to be passed back
-to nfc core.
-- dep_link_down() is called to bring the p2p link down.
-- target_from_gate() is an optional entrypoint to return the nfc protocols
-corresponding to a proprietary gate.
-- complete_target_discovered() is an optional entry point to let the driver
-perform additional proprietary processing necessary to auto activate the
-discovered target.
-- im_transceive() must be implemented by the driver if proprietary HCI commands
-are required to send data to the tag. Some tag types will require custom
-commands, others can be written to using the standard HCI commands. The driver
-can check the tag type and either do proprietary processing, or return 1 to ask
-for standard processing. The data exchange command itself must be sent
-asynchronously.
-- tm_send() is called to send data in the case of a p2p connection
-- check_presence() is an optional entry point that will be called regularly
-by the core to check that an activated tag is still in the field. If this is
-not implemented, the core will not be able to push tag_lost events to the user
-space
-- event_received() is called to handle an event coming from the chip. Driver
-can handle the event or return 1 to let HCI attempt standard processing.
-
-On the rx path, the driver is responsible to push incoming HCP frames to HCI
-using nfc_hci_recv_frame(). HCI will take care of re-aggregation and handling
-This must be done from a context that can sleep.
-
-PHY Management
---------------
-
-The physical link (i2c, ...) management is defined by the following structure:
-
-struct nfc_phy_ops {
-	int (*write)(void *dev_id, struct sk_buff *skb);
-	int (*enable)(void *dev_id);
-	void (*disable)(void *dev_id);
-};
-
-enable(): turn the phy on (power on), make it ready to transfer data
-disable(): turn the phy off
-write(): Send a data frame to the chip. Note that to enable higher
-layers such as an llc to store the frame for re-emission, this function must
-not alter the skb. It must also not return a positive result (return 0 for
-success, negative for failure).
-
-Data coming from the chip shall be sent directly to nfc_hci_recv_frame().
-
-LLC
----
-
-Communication between the CPU and the chip often requires some link layer
-protocol. Those are isolated as modules managed by the HCI layer. There are
-currently two modules : nop (raw transfert) and shdlc.
-A new llc must implement the following functions:
-
-struct nfc_llc_ops {
-	void *(*init) (struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv,
-		       rcv_to_hci_t rcv_to_hci, int tx_headroom,
-		       int tx_tailroom, int *rx_headroom, int *rx_tailroom,
-		       llc_failure_t llc_failure);
-	void (*deinit) (struct nfc_llc *llc);
-	int (*start) (struct nfc_llc *llc);
-	int (*stop) (struct nfc_llc *llc);
-	void (*rcv_from_drv) (struct nfc_llc *llc, struct sk_buff *skb);
-	int (*xmit_from_hci) (struct nfc_llc *llc, struct sk_buff *skb);
-};
-
-- init() : allocate and init your private storage
-- deinit() : cleanup
-- start() : establish the logical connection
-- stop () : terminate the logical connection
-- rcv_from_drv() : handle data coming from the chip, going to HCI
-- xmit_from_hci() : handle data sent by HCI, going to the chip
-
-The llc must be registered with nfc before it can be used. Do that by
-calling nfc_llc_register(const char *name, struct nfc_llc_ops *ops);
-
-Again, note that the llc does not handle the physical link. It is thus very
-easy to mix any physical link with any llc for a given chip driver.
-
-Included Drivers
-----------------
-
-An HCI based driver for an NXP PN544, connected through I2C bus, and using
-shdlc is included.
-
-Execution Contexts
-------------------
-
-The execution contexts are the following:
-- IRQ handler (IRQH):
-fast, cannot sleep. sends incoming frames to HCI where they are passed to
-the current llc. In case of shdlc, the frame is queued in shdlc rx queue.
-
-- SHDLC State Machine worker (SMW)
-Only when llc_shdlc is used: handles shdlc rx & tx queues.
-Dispatches HCI cmd responses.
-
-- HCI Tx Cmd worker (MSGTXWQ)
-Serializes execution of HCI commands. Completes execution in case of response
-timeout.
-
-- HCI Rx worker (MSGRXWQ)
-Dispatches incoming HCI commands or events.
-
-- Syscall context from a userspace call (SYSCALL)
-Any entrypoint in HCI called from NFC Core
-
-Workflow executing an HCI command (using shdlc)
------------------------------------------------
-
-Executing an HCI command can easily be performed synchronously using the
-following API:
-
-int nfc_hci_send_cmd (struct nfc_hci_dev *hdev, u8 gate, u8 cmd,
-			const u8 *param, size_t param_len, struct sk_buff **skb)
-
-The API must be invoked from a context that can sleep. Most of the time, this
-will be the syscall context. skb will return the result that was received in
-the response.
-
-Internally, execution is asynchronous. So all this API does is to enqueue the
-HCI command, setup a local wait queue on stack, and wait_event() for completion.
-The wait is not interruptible because it is guaranteed that the command will
-complete after some short timeout anyway.
-
-MSGTXWQ context will then be scheduled and invoke nfc_hci_msg_tx_work().
-This function will dequeue the next pending command and send its HCP fragments
-to the lower layer which happens to be shdlc. It will then start a timer to be
-able to complete the command with a timeout error if no response arrive.
-
-SMW context gets scheduled and invokes nfc_shdlc_sm_work(). This function
-handles shdlc framing in and out. It uses the driver xmit to send frames and
-receives incoming frames in an skb queue filled from the driver IRQ handler.
-SHDLC I(nformation) frames payload are HCP fragments. They are aggregated to
-form complete HCI frames, which can be a response, command, or event.
-
-HCI Responses are dispatched immediately from this context to unblock
-waiting command execution. Response processing involves invoking the completion
-callback that was provided by nfc_hci_msg_tx_work() when it sent the command.
-The completion callback will then wake the syscall context.
-
-It is also possible to execute the command asynchronously using this API:
-
-static int nfc_hci_execute_cmd_async(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
-			       const u8 *param, size_t param_len,
-			       data_exchange_cb_t cb, void *cb_context)
-
-The workflow is the same, except that the API call returns immediately, and
-the callback will be called with the result from the SMW context.
-
-Workflow receiving an HCI event or command
-------------------------------------------
-
-HCI commands or events are not dispatched from SMW context. Instead, they are
-queued to HCI rx_queue and will be dispatched from HCI rx worker
-context (MSGRXWQ). This is done this way to allow a cmd or event handler
-to also execute other commands (for example, handling the
-NFC_HCI_EVT_TARGET_DISCOVERED event from PN544 requires to issue an
-ANY_GET_PARAMETER to the reader A gate to get information on the target
-that was discovered).
-
-Typically, such an event will be propagated to NFC Core from MSGRXWQ context.
-
-Error management
-----------------
-
-Errors that occur synchronously with the execution of an NFC Core request are
-simply returned as the execution result of the request. These are easy.
-
-Errors that occur asynchronously (e.g. in a background protocol handling thread)
-must be reported such that upper layers don't stay ignorant that something
-went wrong below and know that expected events will probably never happen.
-Handling of these errors is done as follows:
-
-- driver (pn544) fails to deliver an incoming frame: it stores the error such
-that any subsequent call to the driver will result in this error. Then it calls
-the standard nfc_shdlc_recv_frame() with a NULL argument to report the problem
-above. shdlc stores a EREMOTEIO sticky status, which will trigger SMW to
-report above in turn.
-
-- SMW is basically a background thread to handle incoming and outgoing shdlc
-frames. This thread will also check the shdlc sticky status and report to HCI
-when it discovers it is not able to run anymore because of an unrecoverable
-error that happened within shdlc or below. If the problem occurs during shdlc
-connection, the error is reported through the connect completion.
-
-- HCI: if an internal HCI error happens (frame is lost), or HCI is reported an
-error from a lower layer, HCI will either complete the currently executing
-command with that error, or notify NFC Core directly if no command is executing.
-
-- NFC Core: when NFC Core is notified of an error from below and polling is
-active, it will send a tag discovered event with an empty tag list to the user
-space to let it know that the poll operation will never be able to detect a tag.
-If polling is not active and the error was sticky, lower levels will return it
-at next invocation.
diff --git a/Documentation/nfc/nfc-pn544.txt b/Documentation/nfc/nfc-pn544.txt
deleted file mode 100644
index b36ca14ca2d6..000000000000
--- a/Documentation/nfc/nfc-pn544.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-Kernel driver for the NXP Semiconductors PN544 Near Field
-Communication chip
-
-General
--------
-
-The PN544 is an integrated transmission module for contactless
-communication. The driver goes under drives/nfc/ and is compiled as a
-module named "pn544".
-
-Host Interfaces: I2C, SPI and HSU, this driver supports currently only I2C.
-
-Protocols
----------
-
-In the normal (HCI) mode and in the firmware update mode read and
-write functions behave a bit differently because the message formats
-or the protocols are different.
-
-In the normal (HCI) mode the protocol used is derived from the ETSI
-HCI specification. The firmware is updated using a specific protocol,
-which is different from HCI.
-
-HCI messages consist of an eight bit header and the message body. The
-header contains the message length. Maximum size for an HCI message is
-33. In HCI mode sent messages are tested for a correct
-checksum. Firmware update messages have the length in the second (MSB)
-and third (LSB) bytes of the message. The maximum FW message length is
-1024 bytes.
-
-For the ETSI HCI specification see
-http://www.etsi.org/WebSite/Technologies/ProtocolSpecification.aspx
diff --git a/Documentation/ntb.txt b/Documentation/ntb.txt
deleted file mode 100644
index 074a423c853c..000000000000
--- a/Documentation/ntb.txt
+++ /dev/null
@@ -1,236 +0,0 @@
-===========
-NTB Drivers
-===========
-
-NTB (Non-Transparent Bridge) is a type of PCI-Express bridge chip that connects
-the separate memory systems of two or more computers to the same PCI-Express
-fabric. Existing NTB hardware supports a common feature set: doorbell
-registers and memory translation windows, as well as non common features like
-scratchpad and message registers. Scratchpad registers are read-and-writable
-registers that are accessible from either side of the device, so that peers can
-exchange a small amount of information at a fixed address. Message registers can
-be utilized for the same purpose. Additionally they are provided with with
-special status bits to make sure the information isn't rewritten by another
-peer. Doorbell registers provide a way for peers to send interrupt events.
-Memory windows allow translated read and write access to the peer memory.
-
-NTB Core Driver (ntb)
-=====================
-
-The NTB core driver defines an api wrapping the common feature set, and allows
-clients interested in NTB features to discover NTB the devices supported by
-hardware drivers.  The term "client" is used here to mean an upper layer
-component making use of the NTB api.  The term "driver," or "hardware driver,"
-is used here to mean a driver for a specific vendor and model of NTB hardware.
-
-NTB Client Drivers
-==================
-
-NTB client drivers should register with the NTB core driver.  After
-registering, the client probe and remove functions will be called appropriately
-as ntb hardware, or hardware drivers, are inserted and removed.  The
-registration uses the Linux Device framework, so it should feel familiar to
-anyone who has written a pci driver.
-
-NTB Typical client driver implementation
-----------------------------------------
-
-Primary purpose of NTB is to share some peace of memory between at least two
-systems. So the NTB device features like Scratchpad/Message registers are
-mainly used to perform the proper memory window initialization. Typically
-there are two types of memory window interfaces supported by the NTB API:
-inbound translation configured on the local ntb port and outbound translation
-configured by the peer, on the peer ntb port. The first type is
-depicted on the next figure::
-
- Inbound translation:
-
- Memory:              Local NTB Port:      Peer NTB Port:      Peer MMIO:
-  ____________
- | dma-mapped |-ntb_mw_set_trans(addr)  |
- | memory     |        _v____________   |   ______________
- | (addr)     |<======| MW xlat addr |<====| MW base addr |<== memory-mapped IO
- |------------|       |--------------|  |  |--------------|
-
-So typical scenario of the first type memory window initialization looks:
-1) allocate a memory region, 2) put translated address to NTB config,
-3) somehow notify a peer device of performed initialization, 4) peer device
-maps corresponding outbound memory window so to have access to the shared
-memory region.
-
-The second type of interface, that implies the shared windows being
-initialized by a peer device, is depicted on the figure::
-
- Outbound translation:
-
- Memory:        Local NTB Port:    Peer NTB Port:      Peer MMIO:
-  ____________                      ______________
- | dma-mapped |                |   | MW base addr |<== memory-mapped IO
- | memory     |                |   |--------------|
- | (addr)     |<===================| MW xlat addr |<-ntb_peer_mw_set_trans(addr)
- |------------|                |   |--------------|
-
-Typical scenario of the second type interface initialization would be:
-1) allocate a memory region, 2) somehow deliver a translated address to a peer
-device, 3) peer puts the translated address to NTB config, 4) peer device maps
-outbound memory window so to have access to the shared memory region.
-
-As one can see the described scenarios can be combined in one portable
-algorithm.
-
- Local device:
-  1) Allocate memory for a shared window
-  2) Initialize memory window by translated address of the allocated region
-     (it may fail if local memory window initialization is unsupported)
-  3) Send the translated address and memory window index to a peer device
-
- Peer device:
-  1) Initialize memory window with retrieved address of the allocated
-     by another device memory region (it may fail if peer memory window
-     initialization is unsupported)
-  2) Map outbound memory window
-
-In accordance with this scenario, the NTB Memory Window API can be used as
-follows:
-
- Local device:
-  1) ntb_mw_count(pidx) - retrieve number of memory ranges, which can
-     be allocated for memory windows between local device and peer device
-     of port with specified index.
-  2) ntb_get_align(pidx, midx) - retrieve parameters restricting the
-     shared memory region alignment and size. Then memory can be properly
-     allocated.
-  3) Allocate physically contiguous memory region in compliance with
-     restrictions retrieved in 2).
-  4) ntb_mw_set_trans(pidx, midx) - try to set translation address of
-     the memory window with specified index for the defined peer device
-     (it may fail if local translated address setting is not supported)
-  5) Send translated base address (usually together with memory window
-     number) to the peer device using, for instance, scratchpad or message
-     registers.
-
- Peer device:
-  1) ntb_peer_mw_set_trans(pidx, midx) - try to set received from other
-     device (related to pidx) translated address for specified memory
-     window. It may fail if retrieved address, for instance, exceeds
-     maximum possible address or isn't properly aligned.
-  2) ntb_peer_mw_get_addr(widx) - retrieve MMIO address to map the memory
-     window so to have an access to the shared memory.
-
-Also it is worth to note, that method ntb_mw_count(pidx) should return the
-same value as ntb_peer_mw_count() on the peer with port index - pidx.
-
-NTB Transport Client (ntb\_transport) and NTB Netdev (ntb\_netdev)
-------------------------------------------------------------------
-
-The primary client for NTB is the Transport client, used in tandem with NTB
-Netdev.  These drivers function together to create a logical link to the peer,
-across the ntb, to exchange packets of network data.  The Transport client
-establishes a logical link to the peer, and creates queue pairs to exchange
-messages and data.  The NTB Netdev then creates an ethernet device using a
-Transport queue pair.  Network data is copied between socket buffers and the
-Transport queue pair buffer.  The Transport client may be used for other things
-besides Netdev, however no other applications have yet been written.
-
-NTB Ping Pong Test Client (ntb\_pingpong)
------------------------------------------
-
-The Ping Pong test client serves as a demonstration to exercise the doorbell
-and scratchpad registers of NTB hardware, and as an example simple NTB client.
-Ping Pong enables the link when started, waits for the NTB link to come up, and
-then proceeds to read and write the doorbell scratchpad registers of the NTB.
-The peers interrupt each other using a bit mask of doorbell bits, which is
-shifted by one in each round, to test the behavior of multiple doorbell bits
-and interrupt vectors.  The Ping Pong driver also reads the first local
-scratchpad, and writes the value plus one to the first peer scratchpad, each
-round before writing the peer doorbell register.
-
-Module Parameters:
-
-* unsafe - Some hardware has known issues with scratchpad and doorbell
-	registers.  By default, Ping Pong will not attempt to exercise such
-	hardware.  You may override this behavior at your own risk by setting
-	unsafe=1.
-* delay\_ms - Specify the delay between receiving a doorbell
-	interrupt event and setting the peer doorbell register for the next
-	round.
-* init\_db - Specify the doorbell bits to start new series of rounds.  A new
-	series begins once all the doorbell bits have been shifted out of
-	range.
-* dyndbg - It is suggested to specify dyndbg=+p when loading this module, and
-	then to observe debugging output on the console.
-
-NTB Tool Test Client (ntb\_tool)
---------------------------------
-
-The Tool test client serves for debugging, primarily, ntb hardware and drivers.
-The Tool provides access through debugfs for reading, setting, and clearing the
-NTB doorbell, and reading and writing scratchpads.
-
-The Tool does not currently have any module parameters.
-
-Debugfs Files:
-
-* *debugfs*/ntb\_tool/*hw*/
-	A directory in debugfs will be created for each
-	NTB device probed by the tool.  This directory is shortened to *hw*
-	below.
-* *hw*/db
-	This file is used to read, set, and clear the local doorbell.  Not
-	all operations may be supported by all hardware.  To read the doorbell,
-	read the file.  To set the doorbell, write `s` followed by the bits to
-	set (eg: `echo 's 0x0101' > db`).  To clear the doorbell, write `c`
-	followed by the bits to clear.
-* *hw*/mask
-	This file is used to read, set, and clear the local doorbell mask.
-	See *db* for details.
-* *hw*/peer\_db
-	This file is used to read, set, and clear the peer doorbell.
-	See *db* for details.
-* *hw*/peer\_mask
-	This file is used to read, set, and clear the peer doorbell
-	mask.  See *db* for details.
-* *hw*/spad
-	This file is used to read and write local scratchpads.  To read
-	the values of all scratchpads, read the file.  To write values, write a
-	series of pairs of scratchpad number and value
-	(eg: `echo '4 0x123 7 0xabc' > spad`
-	# to set scratchpads `4` and `7` to `0x123` and `0xabc`, respectively).
-* *hw*/peer\_spad
-	This file is used to read and write peer scratchpads.  See
-	*spad* for details.
-
-NTB Hardware Drivers
-====================
-
-NTB hardware drivers should register devices with the NTB core driver.  After
-registering, clients probe and remove functions will be called.
-
-NTB Intel Hardware Driver (ntb\_hw\_intel)
-------------------------------------------
-
-The Intel hardware driver supports NTB on Xeon and Atom CPUs.
-
-Module Parameters:
-
-* b2b\_mw\_idx
-	If the peer ntb is to be accessed via a memory window, then use
-	this memory window to access the peer ntb.  A value of zero or positive
-	starts from the first mw idx, and a negative value starts from the last
-	mw idx.  Both sides MUST set the same value here!  The default value is
-	`-1`.
-* b2b\_mw\_share
-	If the peer ntb is to be accessed via a memory window, and if
-	the memory window is large enough, still allow the client to use the
-	second half of the memory window for address translation to the peer.
-* xeon\_b2b\_usd\_bar2\_addr64
-	If using B2B topology on Xeon hardware, use
-	this 64 bit address on the bus between the NTB devices for the window
-	at BAR2, on the upstream side of the link.
-* xeon\_b2b\_usd\_bar4\_addr64 - See *xeon\_b2b\_bar2\_addr64*.
-* xeon\_b2b\_usd\_bar4\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
-* xeon\_b2b\_usd\_bar5\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
-* xeon\_b2b\_dsd\_bar2\_addr64 - See *xeon\_b2b\_bar2\_addr64*.
-* xeon\_b2b\_dsd\_bar4\_addr64 - See *xeon\_b2b\_bar2\_addr64*.
-* xeon\_b2b\_dsd\_bar4\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
-* xeon\_b2b\_dsd\_bar5\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
diff --git a/Documentation/nvdimm/btt.txt b/Documentation/nvdimm/btt.txt
deleted file mode 100644
index e293fb664924..000000000000
--- a/Documentation/nvdimm/btt.txt
+++ /dev/null
@@ -1,273 +0,0 @@
-BTT - Block Translation Table
-=============================
-
-
-1. Introduction
----------------
-
-Persistent memory based storage is able to perform IO at byte (or more
-accurately, cache line) granularity. However, we often want to expose such
-storage as traditional block devices. The block drivers for persistent memory
-will do exactly this. However, they do not provide any atomicity guarantees.
-Traditional SSDs typically provide protection against torn sectors in hardware,
-using stored energy in capacitors to complete in-flight block writes, or perhaps
-in firmware. We don't have this luxury with persistent memory - if a write is in
-progress, and we experience a power failure, the block will contain a mix of old
-and new data. Applications may not be prepared to handle such a scenario.
-
-The Block Translation Table (BTT) provides atomic sector update semantics for
-persistent memory devices, so that applications that rely on sector writes not
-being torn can continue to do so. The BTT manifests itself as a stacked block
-device, and reserves a portion of the underlying storage for its metadata. At
-the heart of it, is an indirection table that re-maps all the blocks on the
-volume. It can be thought of as an extremely simple file system that only
-provides atomic sector updates.
-
-
-2. Static Layout
-----------------
-
-The underlying storage on which a BTT can be laid out is not limited in any way.
-The BTT, however, splits the available space into chunks of up to 512 GiB,
-called "Arenas".
-
-Each arena follows the same layout for its metadata, and all references in an
-arena are internal to it (with the exception of one field that points to the
-next arena). The following depicts the "On-disk" metadata layout:
-
-
-  Backing Store     +------->  Arena
-+---------------+   |   +------------------+
-|               |   |   | Arena info block |
-|    Arena 0    +---+   |       4K         |
-|     512G      |       +------------------+
-|               |       |                  |
-+---------------+       |                  |
-|               |       |                  |
-|    Arena 1    |       |   Data Blocks    |
-|     512G      |       |                  |
-|               |       |                  |
-+---------------+       |                  |
-|       .       |       |                  |
-|       .       |       |                  |
-|       .       |       |                  |
-|               |       |                  |
-|               |       |                  |
-+---------------+       +------------------+
-                        |                  |
-                        |     BTT Map      |
-                        |                  |
-                        |                  |
-                        +------------------+
-                        |                  |
-                        |     BTT Flog     |
-                        |                  |
-                        +------------------+
-                        | Info block copy  |
-                        |       4K         |
-                        +------------------+
-
-
-3. Theory of Operation
-----------------------
-
-
-a. The BTT Map
---------------
-
-The map is a simple lookup/indirection table that maps an LBA to an internal
-block. Each map entry is 32 bits. The two most significant bits are special
-flags, and the remaining form the internal block number.
-
-Bit      Description
-31 - 30	: Error and Zero flags - Used in the following way:
-	 Bit		      Description
-	31 30
-	-----------------------------------------------------------------------
-	 00	Initial state. Reads return zeroes; Premap = Postmap
-	 01	Zero state: Reads return zeroes
-	 10	Error state: Reads fail; Writes clear 'E' bit
-	 11	Normal Block – has valid postmap
-
-
-29 - 0	: Mappings to internal 'postmap' blocks
-
-
-Some of the terminology that will be subsequently used:
-
-External LBA  : LBA as made visible to upper layers.
-ABA           : Arena Block Address - Block offset/number within an arena
-Premap ABA    : The block offset into an arena, which was decided upon by range
-		checking the External LBA
-Postmap ABA   : The block number in the "Data Blocks" area obtained after
-		indirection from the map
-nfree	      : The number of free blocks that are maintained at any given time.
-		This is the number of concurrent writes that can happen to the
-		arena.
-
-
-For example, after adding a BTT, we surface a disk of 1024G. We get a read for
-the external LBA at 768G. This falls into the second arena, and of the 512G
-worth of blocks that this arena contributes, this block is at 256G. Thus, the
-premap ABA is 256G. We now refer to the map, and find out the mapping for block
-'X' (256G) points to block 'Y', say '64'. Thus the postmap ABA is 64.
-
-
-b. The BTT Flog
----------------
-
-The BTT provides sector atomicity by making every write an "allocating write",
-i.e. Every write goes to a "free" block. A running list of free blocks is
-maintained in the form of the BTT flog. 'Flog' is a combination of the words
-"free list" and "log". The flog contains 'nfree' entries, and an entry contains:
-
-lba     : The premap ABA that is being written to
-old_map : The old postmap ABA - after 'this' write completes, this will be a
-	  free block.
-new_map : The new postmap ABA. The map will up updated to reflect this
-	  lba->postmap_aba mapping, but we log it here in case we have to
-	  recover.
-seq	: Sequence number to mark which of the 2 sections of this flog entry is
-	  valid/newest. It cycles between 01->10->11->01 (binary) under normal
-	  operation, with 00 indicating an uninitialized state.
-lba'	: alternate lba entry
-old_map': alternate old postmap entry
-new_map': alternate new postmap entry
-seq'	: alternate sequence number.
-
-Each of the above fields is 32-bit, making one entry 32 bytes. Entries are also
-padded to 64 bytes to avoid cache line sharing or aliasing. Flog updates are
-done such that for any entry being written, it:
-a. overwrites the 'old' section in the entry based on sequence numbers
-b. writes the 'new' section such that the sequence number is written last.
-
-
-c. The concept of lanes
------------------------
-
-While 'nfree' describes the number of concurrent IOs an arena can process
-concurrently, 'nlanes' is the number of IOs the BTT device as a whole can
-process.
- nlanes = min(nfree, num_cpus)
-A lane number is obtained at the start of any IO, and is used for indexing into
-all the on-disk and in-memory data structures for the duration of the IO. If
-there are more CPUs than the max number of available lanes, than lanes are
-protected by spinlocks.
-
-
-d. In-memory data structure: Read Tracking Table (RTT)
-------------------------------------------------------
-
-Consider a case where we have two threads, one doing reads and the other,
-writes. We can hit a condition where the writer thread grabs a free block to do
-a new IO, but the (slow) reader thread is still reading from it. In other words,
-the reader consulted a map entry, and started reading the corresponding block. A
-writer started writing to the same external LBA, and finished the write updating
-the map for that external LBA to point to its new postmap ABA. At this point the
-internal, postmap block that the reader is (still) reading has been inserted
-into the list of free blocks. If another write comes in for the same LBA, it can
-grab this free block, and start writing to it, causing the reader to read
-incorrect data. To prevent this, we introduce the RTT.
-
-The RTT is a simple, per arena table with 'nfree' entries. Every reader inserts
-into rtt[lane_number], the postmap ABA it is reading, and clears it after the
-read is complete. Every writer thread, after grabbing a free block, checks the
-RTT for its presence. If the postmap free block is in the RTT, it waits till the
-reader clears the RTT entry, and only then starts writing to it.
-
-
-e. In-memory data structure: map locks
---------------------------------------
-
-Consider a case where two writer threads are writing to the same LBA. There can
-be a race in the following sequence of steps:
-
-free[lane] = map[premap_aba]
-map[premap_aba] = postmap_aba
-
-Both threads can update their respective free[lane] with the same old, freed
-postmap_aba. This has made the layout inconsistent by losing a free entry, and
-at the same time, duplicating another free entry for two lanes.
-
-To solve this, we could have a single map lock (per arena) that has to be taken
-before performing the above sequence, but we feel that could be too contentious.
-Instead we use an array of (nfree) map_locks that is indexed by
-(premap_aba modulo nfree).
-
-
-f. Reconstruction from the Flog
--------------------------------
-
-On startup, we analyze the BTT flog to create our list of free blocks. We walk
-through all the entries, and for each lane, of the set of two possible
-'sections', we always look at the most recent one only (based on the sequence
-number). The reconstruction rules/steps are simple:
-- Read map[log_entry.lba].
-- If log_entry.new matches the map entry, then log_entry.old is free.
-- If log_entry.new does not match the map entry, then log_entry.new is free.
-  (This case can only be caused by power-fails/unsafe shutdowns)
-
-
-g. Summarizing - Read and Write flows
--------------------------------------
-
-Read:
-
-1.  Convert external LBA to arena number + pre-map ABA
-2.  Get a lane (and take lane_lock)
-3.  Read map to get the entry for this pre-map ABA
-4.  Enter post-map ABA into RTT[lane]
-5.  If TRIM flag set in map, return zeroes, and end IO (go to step 8)
-6.  If ERROR flag set in map, end IO with EIO (go to step 8)
-7.  Read data from this block
-8.  Remove post-map ABA entry from RTT[lane]
-9.  Release lane (and lane_lock)
-
-Write:
-
-1.  Convert external LBA to Arena number + pre-map ABA
-2.  Get a lane (and take lane_lock)
-3.  Use lane to index into in-memory free list and obtain a new block, next flog
-        index, next sequence number
-4.  Scan the RTT to check if free block is present, and spin/wait if it is.
-5.  Write data to this free block
-6.  Read map to get the existing post-map ABA entry for this pre-map ABA
-7.  Write flog entry: [premap_aba / old postmap_aba / new postmap_aba / seq_num]
-8.  Write new post-map ABA into map.
-9.  Write old post-map entry into the free list
-10. Calculate next sequence number and write into the free list entry
-11. Release lane (and lane_lock)
-
-
-4. Error Handling
-=================
-
-An arena would be in an error state if any of the metadata is corrupted
-irrecoverably, either due to a bug or a media error. The following conditions
-indicate an error:
-- Info block checksum does not match (and recovering from the copy also fails)
-- All internal available blocks are not uniquely and entirely addressed by the
-  sum of mapped blocks and free blocks (from the BTT flog).
-- Rebuilding free list from the flog reveals missing/duplicate/impossible
-  entries
-- A map entry is out of bounds
-
-If any of these error conditions are encountered, the arena is put into a read
-only state using a flag in the info block.
-
-
-5. Usage
-========
-
-The BTT can be set up on any disk (namespace) exposed by the libnvdimm subsystem
-(pmem, or blk mode). The easiest way to set up such a namespace is using the
-'ndctl' utility [1]:
-
-For example, the ndctl command line to setup a btt with a 4k sector size is:
-
-    ndctl create-namespace -f -e namespace0.0 -m sector -l 4k
-
-See ndctl create-namespace --help for more options.
-
-[1]: https://github.com/pmem/ndctl
-
diff --git a/Documentation/nvdimm/nvdimm.txt b/Documentation/nvdimm/nvdimm.txt
deleted file mode 100644
index e894de69915a..000000000000
--- a/Documentation/nvdimm/nvdimm.txt
+++ /dev/null
@@ -1,815 +0,0 @@
-			  LIBNVDIMM: Non-Volatile Devices
-	      libnvdimm - kernel / libndctl - userspace helper library
-			   linux-nvdimm@lists.01.org
-				      v13
-
-
-	Glossary
-	Overview
-	    Supporting Documents
-	    Git Trees
-	LIBNVDIMM PMEM and BLK
-	Why BLK?
-	    PMEM vs BLK
-	        BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
-	Example NVDIMM Platform
-	LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
-	    LIBNDCTL: Context
-	        libndctl: instantiate a new library context example
-	    LIBNVDIMM/LIBNDCTL: Bus
-	        libnvdimm: control class device in /sys/class
-	        libnvdimm: bus
-	        libndctl: bus enumeration example
-	    LIBNVDIMM/LIBNDCTL: DIMM (NMEM)
-	        libnvdimm: DIMM (NMEM)
-	        libndctl: DIMM enumeration example
-	    LIBNVDIMM/LIBNDCTL: Region
-	        libnvdimm: region
-	        libndctl: region enumeration example
-	        Why Not Encode the Region Type into the Region Name?
-	        How Do I Determine the Major Type of a Region?
-	    LIBNVDIMM/LIBNDCTL: Namespace
-	        libnvdimm: namespace
-	        libndctl: namespace enumeration example
-	        libndctl: namespace creation example
-	        Why the Term "namespace"?
-	    LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
-	        libnvdimm: btt layout
-	        libndctl: btt creation example
-	Summary LIBNDCTL Diagram
-
-
-Glossary
---------
-
-PMEM: A system-physical-address range where writes are persistent.  A
-block device composed of PMEM is capable of DAX.  A PMEM address range
-may span an interleave of several DIMMs.
-
-BLK: A set of one or more programmable memory mapped apertures provided
-by a DIMM to access its media.  This indirection precludes the
-performance benefit of interleaving, but enables DIMM-bounded failure
-modes.
-
-DPA: DIMM Physical Address, is a DIMM-relative offset.  With one DIMM in
-the system there would be a 1:1 system-physical-address:DPA association.
-Once more DIMMs are added a memory controller interleave must be
-decoded to determine the DPA associated with a given
-system-physical-address.  BLK capacity always has a 1:1 relationship
-with a single-DIMM's DPA range.
-
-DAX: File system extensions to bypass the page cache and block layer to
-mmap persistent memory, from a PMEM block device, directly into a
-process address space.
-
-DSM: Device Specific Method: ACPI method to to control specific
-device - in this case the firmware.
-
-DCR: NVDIMM Control Region Structure defined in ACPI 6 Section 5.2.25.5.
-It defines a vendor-id, device-id, and interface format for a given DIMM.
-
-BTT: Block Translation Table: Persistent memory is byte addressable.
-Existing software may have an expectation that the power-fail-atomicity
-of writes is at least one sector, 512 bytes.  The BTT is an indirection
-table with atomic update semantics to front a PMEM/BLK block device
-driver and present arbitrary atomic sector sizes.
-
-LABEL: Metadata stored on a DIMM device that partitions and identifies
-(persistently names) storage between PMEM and BLK.  It also partitions
-BLK storage to host BTTs with different parameters per BLK-partition.
-Note that traditional partition tables, GPT/MBR, are layered on top of a
-BLK or PMEM device.
-
-
-Overview
---------
-
-The LIBNVDIMM subsystem provides support for three types of NVDIMMs, namely,
-PMEM, BLK, and NVDIMM devices that can simultaneously support both PMEM
-and BLK mode access.  These three modes of operation are described by
-the "NVDIMM Firmware Interface Table" (NFIT) in ACPI 6.  While the LIBNVDIMM
-implementation is generic and supports pre-NFIT platforms, it was guided
-by the superset of capabilities need to support this ACPI 6 definition
-for NVDIMM resources.  The bulk of the kernel implementation is in place
-to handle the case where DPA accessible via PMEM is aliased with DPA
-accessible via BLK.  When that occurs a LABEL is needed to reserve DPA
-for exclusive access via one mode a time.
-
-Supporting Documents
-ACPI 6: http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
-NVDIMM Namespace: http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf
-DSM Interface Example: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf
-Driver Writer's Guide: http://pmem.io/documents/NVDIMM_Driver_Writers_Guide.pdf
-
-Git Trees
-LIBNVDIMM: https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git
-LIBNDCTL: https://github.com/pmem/ndctl.git
-PMEM: https://github.com/01org/prd
-
-
-LIBNVDIMM PMEM and BLK
-------------------
-
-Prior to the arrival of the NFIT, non-volatile memory was described to a
-system in various ad-hoc ways.  Usually only the bare minimum was
-provided, namely, a single system-physical-address range where writes
-are expected to be durable after a system power loss.  Now, the NFIT
-specification standardizes not only the description of PMEM, but also
-BLK and platform message-passing entry points for control and
-configuration.
-
-For each NVDIMM access method (PMEM, BLK), LIBNVDIMM provides a block
-device driver:
-
-    1. PMEM (nd_pmem.ko): Drives a system-physical-address range.  This
-    range is contiguous in system memory and may be interleaved (hardware
-    memory controller striped) across multiple DIMMs.  When interleaved the
-    platform may optionally provide details of which DIMMs are participating
-    in the interleave.
-
-    Note that while LIBNVDIMM describes system-physical-address ranges that may
-    alias with BLK access as ND_NAMESPACE_PMEM ranges and those without
-    alias as ND_NAMESPACE_IO ranges, to the nd_pmem driver there is no
-    distinction.  The different device-types are an implementation detail
-    that userspace can exploit to implement policies like "only interface
-    with address ranges from certain DIMMs".  It is worth noting that when
-    aliasing is present and a DIMM lacks a label, then no block device can
-    be created by default as userspace needs to do at least one allocation
-    of DPA to the PMEM range.  In contrast ND_NAMESPACE_IO ranges, once
-    registered, can be immediately attached to nd_pmem.
-
-    2. BLK (nd_blk.ko): This driver performs I/O using a set of platform
-    defined apertures.  A set of apertures will access just one DIMM.
-    Multiple windows (apertures) allow multiple concurrent accesses, much like
-    tagged-command-queuing, and would likely be used by different threads or
-    different CPUs.
-
-    The NFIT specification defines a standard format for a BLK-aperture, but
-    the spec also allows for vendor specific layouts, and non-NFIT BLK
-    implementations may have other designs for BLK I/O.  For this reason
-    "nd_blk" calls back into platform-specific code to perform the I/O.
-    One such implementation is defined in the "Driver Writer's Guide" and "DSM
-    Interface Example".
-
-
-Why BLK?
---------
-
-While PMEM provides direct byte-addressable CPU-load/store access to
-NVDIMM storage, it does not provide the best system RAS (recovery,
-availability, and serviceability) model.  An access to a corrupted
-system-physical-address address causes a CPU exception while an access
-to a corrupted address through an BLK-aperture causes that block window
-to raise an error status in a register.  The latter is more aligned with
-the standard error model that host-bus-adapter attached disks present.
-Also, if an administrator ever wants to replace a memory it is easier to
-service a system at DIMM module boundaries.  Compare this to PMEM where
-data could be interleaved in an opaque hardware specific manner across
-several DIMMs.
-
-PMEM vs BLK
-BLK-apertures solve these RAS problems, but their presence is also the
-major contributing factor to the complexity of the ND subsystem.  They
-complicate the implementation because PMEM and BLK alias in DPA space.
-Any given DIMM's DPA-range may contribute to one or more
-system-physical-address sets of interleaved DIMMs, *and* may also be
-accessed in its entirety through its BLK-aperture.  Accessing a DPA
-through a system-physical-address while simultaneously accessing the
-same DPA through a BLK-aperture has undefined results.  For this reason,
-DIMMs with this dual interface configuration include a DSM function to
-store/retrieve a LABEL.  The LABEL effectively partitions the DPA-space
-into exclusive system-physical-address and BLK-aperture accessible
-regions.  For simplicity a DIMM is allowed a PMEM "region" per each
-interleave set in which it is a member.  The remaining DPA space can be
-carved into an arbitrary number of BLK devices with discontiguous
-extents.
-
-BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
---------------------------------------------------
-
-One of the few
-reasons to allow multiple BLK namespaces per REGION is so that each
-BLK-namespace can be configured with a BTT with unique atomic sector
-sizes.  While a PMEM device can host a BTT the LABEL specification does
-not provide for a sector size to be specified for a PMEM namespace.
-This is due to the expectation that the primary usage model for PMEM is
-via DAX, and the BTT is incompatible with DAX.  However, for the cases
-where an application or filesystem still needs atomic sector update
-guarantees it can register a BTT on a PMEM device or partition.  See
-LIBNVDIMM/NDCTL: Block Translation Table "btt"
-
-
-Example NVDIMM Platform
------------------------
-
-For the remainder of this document the following diagram will be
-referenced for any example sysfs layouts.
-
-
-                             (a)               (b)           DIMM   BLK-REGION
-          +-------------------+--------+--------+--------+
-+------+  |       pm0.0       | blk2.0 | pm1.0  | blk2.1 |    0      region2
-| imc0 +--+- - - region0- - - +--------+        +--------+
-+--+---+  |       pm0.0       | blk3.0 | pm1.0  | blk3.1 |    1      region3
-   |      +-------------------+--------v        v--------+
-+--+---+                               |                 |
-| cpu0 |                                     region1
-+--+---+                               |                 |
-   |      +----------------------------^        ^--------+
-+--+---+  |           blk4.0           | pm1.0  | blk4.0 |    2      region4
-| imc1 +--+----------------------------|        +--------+
-+------+  |           blk5.0           | pm1.0  | blk5.0 |    3      region5
-          +----------------------------+--------+--------+
-
-In this platform we have four DIMMs and two memory controllers in one
-socket.  Each unique interface (BLK or PMEM) to DPA space is identified
-by a region device with a dynamically assigned id (REGION0 - REGION5).
-
-    1. The first portion of DIMM0 and DIMM1 are interleaved as REGION0. A
-    single PMEM namespace is created in the REGION0-SPA-range that spans most
-    of DIMM0 and DIMM1 with a user-specified name of "pm0.0". Some of that
-    interleaved system-physical-address range is reclaimed as BLK-aperture
-    accessed space starting at DPA-offset (a) into each DIMM.  In that
-    reclaimed space we create two BLK-aperture "namespaces" from REGION2 and
-    REGION3 where "blk2.0" and "blk3.0" are just human readable names that
-    could be set to any user-desired name in the LABEL.
-
-    2. In the last portion of DIMM0 and DIMM1 we have an interleaved
-    system-physical-address range, REGION1, that spans those two DIMMs as
-    well as DIMM2 and DIMM3.  Some of REGION1 is allocated to a PMEM namespace
-    named "pm1.0", the rest is reclaimed in 4 BLK-aperture namespaces (for
-    each DIMM in the interleave set), "blk2.1", "blk3.1", "blk4.0", and
-    "blk5.0".
-
-    3. The portion of DIMM2 and DIMM3 that do not participate in the REGION1
-    interleaved system-physical-address range (i.e. the DPA address past
-    offset (b) are also included in the "blk4.0" and "blk5.0" namespaces.
-    Note, that this example shows that BLK-aperture namespaces don't need to
-    be contiguous in DPA-space.
-
-    This bus is provided by the kernel under the device
-    /sys/devices/platform/nfit_test.0 when CONFIG_NFIT_TEST is enabled and
-    the nfit_test.ko module is loaded.  This not only test LIBNVDIMM but the
-    acpi_nfit.ko driver as well.
-
-
-LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
-----------------------------------------------------
-
-What follows is a description of the LIBNVDIMM sysfs layout and a
-corresponding object hierarchy diagram as viewed through the LIBNDCTL
-API.  The example sysfs paths and diagrams are relative to the Example
-NVDIMM Platform which is also the LIBNVDIMM bus used in the LIBNDCTL unit
-test.
-
-LIBNDCTL: Context
-Every API call in the LIBNDCTL library requires a context that holds the
-logging parameters and other library instance state.  The library is
-based on the libabc template:
-https://git.kernel.org/cgit/linux/kernel/git/kay/libabc.git
-
-LIBNDCTL: instantiate a new library context example
-
-	struct ndctl_ctx *ctx;
-
-	if (ndctl_new(&ctx) == 0)
-		return ctx;
-	else
-		return NULL;
-
-LIBNVDIMM/LIBNDCTL: Bus
--------------------
-
-A bus has a 1:1 relationship with an NFIT.  The current expectation for
-ACPI based systems is that there is only ever one platform-global NFIT.
-That said, it is trivial to register multiple NFITs, the specification
-does not preclude it.  The infrastructure supports multiple busses and
-we we use this capability to test multiple NFIT configurations in the
-unit test.
-
-LIBNVDIMM: control class device in /sys/class
-
-This character device accepts DSM messages to be passed to DIMM
-identified by its NFIT handle.
-
-	/sys/class/nd/ndctl0
-	|-- dev
-	|-- device -> ../../../ndbus0
-	|-- subsystem -> ../../../../../../../class/nd
-
-
-
-LIBNVDIMM: bus
-
-	struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
-	       struct nvdimm_bus_descriptor *nfit_desc);
-
-	/sys/devices/platform/nfit_test.0/ndbus0
-	|-- commands
-	|-- nd
-	|-- nfit
-	|-- nmem0
-	|-- nmem1
-	|-- nmem2
-	|-- nmem3
-	|-- power
-	|-- provider
-	|-- region0
-	|-- region1
-	|-- region2
-	|-- region3
-	|-- region4
-	|-- region5
-	|-- uevent
-	`-- wait_probe
-
-LIBNDCTL: bus enumeration example
-Find the bus handle that describes the bus from Example NVDIMM Platform
-
-	static struct ndctl_bus *get_bus_by_provider(struct ndctl_ctx *ctx,
-			const char *provider)
-	{
-		struct ndctl_bus *bus;
-
-		ndctl_bus_foreach(ctx, bus)
-			if (strcmp(provider, ndctl_bus_get_provider(bus)) == 0)
-				return bus;
-
-		return NULL;
-	}
-
-	bus = get_bus_by_provider(ctx, "nfit_test.0");
-
-
-LIBNVDIMM/LIBNDCTL: DIMM (NMEM)
----------------------------
-
-The DIMM device provides a character device for sending commands to
-hardware, and it is a container for LABELs.  If the DIMM is defined by
-NFIT then an optional 'nfit' attribute sub-directory is available to add
-NFIT-specifics.
-
-Note that the kernel device name for "DIMMs" is "nmemX".  The NFIT
-describes these devices via "Memory Device to System Physical Address
-Range Mapping Structure", and there is no requirement that they actually
-be physical DIMMs, so we use a more generic name.
-
-LIBNVDIMM: DIMM (NMEM)
-
-	struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
-			const struct attribute_group **groups, unsigned long flags,
-			unsigned long *dsm_mask);
-
-	/sys/devices/platform/nfit_test.0/ndbus0
-	|-- nmem0
-	|   |-- available_slots
-	|   |-- commands
-	|   |-- dev
-	|   |-- devtype
-	|   |-- driver -> ../../../../../bus/nd/drivers/nvdimm
-	|   |-- modalias
-	|   |-- nfit
-	|   |   |-- device
-	|   |   |-- format
-	|   |   |-- handle
-	|   |   |-- phys_id
-	|   |   |-- rev_id
-	|   |   |-- serial
-	|   |   `-- vendor
-	|   |-- state
-	|   |-- subsystem -> ../../../../../bus/nd
-	|   `-- uevent
-	|-- nmem1
-	[..]
-
-
-LIBNDCTL: DIMM enumeration example
-
-Note, in this example we are assuming NFIT-defined DIMMs which are
-identified by an "nfit_handle" a 32-bit value where:
-Bit 3:0 DIMM number within the memory channel
-Bit 7:4 memory channel number
-Bit 11:8 memory controller ID
-Bit 15:12 socket ID (within scope of a Node controller if node controller is present)
-Bit 27:16 Node Controller ID
-Bit 31:28 Reserved
-
-	static struct ndctl_dimm *get_dimm_by_handle(struct ndctl_bus *bus,
-	       unsigned int handle)
-	{
-		struct ndctl_dimm *dimm;
-
-		ndctl_dimm_foreach(bus, dimm)
-			if (ndctl_dimm_get_handle(dimm) == handle)
-				return dimm;
-
-		return NULL;
-	}
-
-	#define DIMM_HANDLE(n, s, i, c, d) \
-		(((n & 0xfff) << 16) | ((s & 0xf) << 12) | ((i & 0xf) << 8) \
-		 | ((c & 0xf) << 4) | (d & 0xf))
-
-	dimm = get_dimm_by_handle(bus, DIMM_HANDLE(0, 0, 0, 0, 0));
-
-LIBNVDIMM/LIBNDCTL: Region
-----------------------
-
-A generic REGION device is registered for each PMEM range or BLK-aperture
-set.  Per the example there are 6 regions: 2 PMEM and 4 BLK-aperture
-sets on the "nfit_test.0" bus.  The primary role of regions are to be a
-container of "mappings".  A mapping is a tuple of <DIMM,
-DPA-start-offset, length>.
-
-LIBNVDIMM provides a built-in driver for these REGION devices.  This driver
-is responsible for reconciling the aliased DPA mappings across all
-regions, parsing the LABEL, if present, and then emitting NAMESPACE
-devices with the resolved/exclusive DPA-boundaries for the nd_pmem or
-nd_blk device driver to consume.
-
-In addition to the generic attributes of "mapping"s, "interleave_ways"
-and "size" the REGION device also exports some convenience attributes.
-"nstype" indicates the integer type of namespace-device this region
-emits, "devtype" duplicates the DEVTYPE variable stored by udev at the
-'add' event, "modalias" duplicates the MODALIAS variable stored by udev
-at the 'add' event, and finally, the optional "spa_index" is provided in
-the case where the region is defined by a SPA.
-
-LIBNVDIMM: region
-
-	struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
-			struct nd_region_desc *ndr_desc);
-	struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
-			struct nd_region_desc *ndr_desc);
-
-	/sys/devices/platform/nfit_test.0/ndbus0
-	|-- region0
-	|   |-- available_size
-	|   |-- btt0
-	|   |-- btt_seed
-	|   |-- devtype
-	|   |-- driver -> ../../../../../bus/nd/drivers/nd_region
-	|   |-- init_namespaces
-	|   |-- mapping0
-	|   |-- mapping1
-	|   |-- mappings
-	|   |-- modalias
-	|   |-- namespace0.0
-	|   |-- namespace_seed
-	|   |-- numa_node
-	|   |-- nfit
-	|   |   `-- spa_index
-	|   |-- nstype
-	|   |-- set_cookie
-	|   |-- size
-	|   |-- subsystem -> ../../../../../bus/nd
-	|   `-- uevent
-	|-- region1
-	[..]
-
-LIBNDCTL: region enumeration example
-
-Sample region retrieval routines based on NFIT-unique data like
-"spa_index" (interleave set id) for PMEM and "nfit_handle" (dimm id) for
-BLK.
-
-	static struct ndctl_region *get_pmem_region_by_spa_index(struct ndctl_bus *bus,
-			unsigned int spa_index)
-	{
-		struct ndctl_region *region;
-
-		ndctl_region_foreach(bus, region) {
-			if (ndctl_region_get_type(region) != ND_DEVICE_REGION_PMEM)
-				continue;
-			if (ndctl_region_get_spa_index(region) == spa_index)
-				return region;
-		}
-		return NULL;
-	}
-
-	static struct ndctl_region *get_blk_region_by_dimm_handle(struct ndctl_bus *bus,
-			unsigned int handle)
-	{
-		struct ndctl_region *region;
-
-		ndctl_region_foreach(bus, region) {
-			struct ndctl_mapping *map;
-
-			if (ndctl_region_get_type(region) != ND_DEVICE_REGION_BLOCK)
-				continue;
-			ndctl_mapping_foreach(region, map) {
-				struct ndctl_dimm *dimm = ndctl_mapping_get_dimm(map);
-
-				if (ndctl_dimm_get_handle(dimm) == handle)
-					return region;
-			}
-		}
-		return NULL;
-	}
-
-
-Why Not Encode the Region Type into the Region Name?
-----------------------------------------------------
-
-At first glance it seems since NFIT defines just PMEM and BLK interface
-types that we should simply name REGION devices with something derived
-from those type names.  However, the ND subsystem explicitly keeps the
-REGION name generic and expects userspace to always consider the
-region-attributes for four reasons:
-
-    1. There are already more than two REGION and "namespace" types.  For
-    PMEM there are two subtypes.  As mentioned previously we have PMEM where
-    the constituent DIMM devices are known and anonymous PMEM.  For BLK
-    regions the NFIT specification already anticipates vendor specific
-    implementations.  The exact distinction of what a region contains is in
-    the region-attributes not the region-name or the region-devtype.
-
-    2. A region with zero child-namespaces is a possible configuration.  For
-    example, the NFIT allows for a DCR to be published without a
-    corresponding BLK-aperture.  This equates to a DIMM that can only accept
-    control/configuration messages, but no i/o through a descendant block
-    device.  Again, this "type" is advertised in the attributes ('mappings'
-    == 0) and the name does not tell you much.
-
-    3. What if a third major interface type arises in the future?  Outside
-    of vendor specific implementations, it's not difficult to envision a
-    third class of interface type beyond BLK and PMEM.  With a generic name
-    for the REGION level of the device-hierarchy old userspace
-    implementations can still make sense of new kernel advertised
-    region-types.  Userspace can always rely on the generic region
-    attributes like "mappings", "size", etc and the expected child devices
-    named "namespace".  This generic format of the device-model hierarchy
-    allows the LIBNVDIMM and LIBNDCTL implementations to be more uniform and
-    future-proof.
-
-    4. There are more robust mechanisms for determining the major type of a
-    region than a device name.  See the next section, How Do I Determine the
-    Major Type of a Region?
-
-How Do I Determine the Major Type of a Region?
-----------------------------------------------
-
-Outside of the blanket recommendation of "use libndctl", or simply
-looking at the kernel header (/usr/include/linux/ndctl.h) to decode the
-"nstype" integer attribute, here are some other options.
-
-    1. module alias lookup:
-
-    The whole point of region/namespace device type differentiation is to
-    decide which block-device driver will attach to a given LIBNVDIMM namespace.
-    One can simply use the modalias to lookup the resulting module.  It's
-    important to note that this method is robust in the presence of a
-    vendor-specific driver down the road.  If a vendor-specific
-    implementation wants to supplant the standard nd_blk driver it can with
-    minimal impact to the rest of LIBNVDIMM.
-
-    In fact, a vendor may also want to have a vendor-specific region-driver
-    (outside of nd_region).  For example, if a vendor defined its own LABEL
-    format it would need its own region driver to parse that LABEL and emit
-    the resulting namespaces.  The output from module resolution is more
-    accurate than a region-name or region-devtype.
-
-    2. udev:
-
-    The kernel "devtype" is registered in the udev database
-    # udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region0
-    P: /devices/platform/nfit_test.0/ndbus0/region0
-    E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region0
-    E: DEVTYPE=nd_pmem
-    E: MODALIAS=nd:t2
-    E: SUBSYSTEM=nd
-
-    # udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region4
-    P: /devices/platform/nfit_test.0/ndbus0/region4
-    E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region4
-    E: DEVTYPE=nd_blk
-    E: MODALIAS=nd:t3
-    E: SUBSYSTEM=nd
-
-    ...and is available as a region attribute, but keep in mind that the
-    "devtype" does not indicate sub-type variations and scripts should
-    really be understanding the other attributes.
-
-    3. type specific attributes:
-
-    As it currently stands a BLK-aperture region will never have a
-    "nfit/spa_index" attribute, but neither will a non-NFIT PMEM region.  A
-    BLK region with a "mappings" value of 0 is, as mentioned above, a DIMM
-    that does not allow I/O.  A PMEM region with a "mappings" value of zero
-    is a simple system-physical-address range.
-
-
-LIBNVDIMM/LIBNDCTL: Namespace
--------------------------
-
-A REGION, after resolving DPA aliasing and LABEL specified boundaries,
-surfaces one or more "namespace" devices.  The arrival of a "namespace"
-device currently triggers either the nd_blk or nd_pmem driver to load
-and register a disk/block device.
-
-LIBNVDIMM: namespace
-Here is a sample layout from the three major types of NAMESPACE where
-namespace0.0 represents DIMM-info-backed PMEM (note that it has a 'uuid'
-attribute), namespace2.0 represents a BLK namespace (note it has a
-'sector_size' attribute) that, and namespace6.0 represents an anonymous
-PMEM namespace (note that has no 'uuid' attribute due to not support a
-LABEL).
-
-	/sys/devices/platform/nfit_test.0/ndbus0/region0/namespace0.0
-	|-- alt_name
-	|-- devtype
-	|-- dpa_extents
-	|-- force_raw
-	|-- modalias
-	|-- numa_node
-	|-- resource
-	|-- size
-	|-- subsystem -> ../../../../../../bus/nd
-	|-- type
-	|-- uevent
-	`-- uuid
-	/sys/devices/platform/nfit_test.0/ndbus0/region2/namespace2.0
-	|-- alt_name
-	|-- devtype
-	|-- dpa_extents
-	|-- force_raw
-	|-- modalias
-	|-- numa_node
-	|-- sector_size
-	|-- size
-	|-- subsystem -> ../../../../../../bus/nd
-	|-- type
-	|-- uevent
-	`-- uuid
-	/sys/devices/platform/nfit_test.1/ndbus1/region6/namespace6.0
-	|-- block
-	|   `-- pmem0
-	|-- devtype
-	|-- driver -> ../../../../../../bus/nd/drivers/pmem
-	|-- force_raw
-	|-- modalias
-	|-- numa_node
-	|-- resource
-	|-- size
-	|-- subsystem -> ../../../../../../bus/nd
-	|-- type
-	`-- uevent
-
-LIBNDCTL: namespace enumeration example
-Namespaces are indexed relative to their parent region, example below.
-These indexes are mostly static from boot to boot, but subsystem makes
-no guarantees in this regard.  For a static namespace identifier use its
-'uuid' attribute.
-
-static struct ndctl_namespace *get_namespace_by_id(struct ndctl_region *region,
-                unsigned int id)
-{
-        struct ndctl_namespace *ndns;
-
-        ndctl_namespace_foreach(region, ndns)
-                if (ndctl_namespace_get_id(ndns) == id)
-                        return ndns;
-
-        return NULL;
-}
-
-LIBNDCTL: namespace creation example
-Idle namespaces are automatically created by the kernel if a given
-region has enough available capacity to create a new namespace.
-Namespace instantiation involves finding an idle namespace and
-configuring it.  For the most part the setting of namespace attributes
-can occur in any order, the only constraint is that 'uuid' must be set
-before 'size'.  This enables the kernel to track DPA allocations
-internally with a static identifier.
-
-static int configure_namespace(struct ndctl_region *region,
-                struct ndctl_namespace *ndns,
-                struct namespace_parameters *parameters)
-{
-        char devname[50];
-
-        snprintf(devname, sizeof(devname), "namespace%d.%d",
-                        ndctl_region_get_id(region), paramaters->id);
-
-        ndctl_namespace_set_alt_name(ndns, devname);
-        /* 'uuid' must be set prior to setting size! */
-        ndctl_namespace_set_uuid(ndns, paramaters->uuid);
-        ndctl_namespace_set_size(ndns, paramaters->size);
-        /* unlike pmem namespaces, blk namespaces have a sector size */
-        if (parameters->lbasize)
-                ndctl_namespace_set_sector_size(ndns, parameters->lbasize);
-        ndctl_namespace_enable(ndns);
-}
-
-
-Why the Term "namespace"?
-
-    1. Why not "volume" for instance?  "volume" ran the risk of confusing
-    ND (libnvdimm subsystem) to a volume manager like device-mapper.
-
-    2. The term originated to describe the sub-devices that can be created
-    within a NVME controller (see the nvme specification:
-    http://www.nvmexpress.org/specifications/), and NFIT namespaces are
-    meant to parallel the capabilities and configurability of
-    NVME-namespaces.
-
-
-LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
----------------------------------------------
-
-A BTT (design document: http://pmem.io/2014/09/23/btt.html) is a stacked
-block device driver that fronts either the whole block device or a
-partition of a block device emitted by either a PMEM or BLK NAMESPACE.
-
-LIBNVDIMM: btt layout
-Every region will start out with at least one BTT device which is the
-seed device.  To activate it set the "namespace", "uuid", and
-"sector_size" attributes and then bind the device to the nd_pmem or
-nd_blk driver depending on the region type.
-
-	/sys/devices/platform/nfit_test.1/ndbus0/region0/btt0/
-	|-- namespace
-	|-- delete
-	|-- devtype
-	|-- modalias
-	|-- numa_node
-	|-- sector_size
-	|-- subsystem -> ../../../../../bus/nd
-	|-- uevent
-	`-- uuid
-
-LIBNDCTL: btt creation example
-Similar to namespaces an idle BTT device is automatically created per
-region.  Each time this "seed" btt device is configured and enabled a new
-seed is created.  Creating a BTT configuration involves two steps of
-finding and idle BTT and assigning it to consume a PMEM or BLK namespace.
-
-	static struct ndctl_btt *get_idle_btt(struct ndctl_region *region)
-	{
-		struct ndctl_btt *btt;
-
-		ndctl_btt_foreach(region, btt)
-			if (!ndctl_btt_is_enabled(btt)
-					&& !ndctl_btt_is_configured(btt))
-				return btt;
-
-		return NULL;
-	}
-
-	static int configure_btt(struct ndctl_region *region,
-			struct btt_parameters *parameters)
-	{
-		btt = get_idle_btt(region);
-
-		ndctl_btt_set_uuid(btt, parameters->uuid);
-		ndctl_btt_set_sector_size(btt, parameters->sector_size);
-		ndctl_btt_set_namespace(btt, parameters->ndns);
-		/* turn off raw mode device */
-		ndctl_namespace_disable(parameters->ndns);
-		/* turn on btt access */
-		ndctl_btt_enable(btt);
-	}
-
-Once instantiated a new inactive btt seed device will appear underneath
-the region.
-
-Once a "namespace" is removed from a BTT that instance of the BTT device
-will be deleted or otherwise reset to default values.  This deletion is
-only at the device model level.  In order to destroy a BTT the "info
-block" needs to be destroyed.  Note, that to destroy a BTT the media
-needs to be written in raw mode.  By default, the kernel will autodetect
-the presence of a BTT and disable raw mode.  This autodetect behavior
-can be suppressed by enabling raw mode for the namespace via the
-ndctl_namespace_set_raw_mode() API.
-
-
-Summary LIBNDCTL Diagram
-------------------------
-
-For the given example above, here is the view of the objects as seen by the
-LIBNDCTL API:
-            +---+
-            |CTX|    +---------+   +--------------+  +---------------+
-            +-+-+  +-> REGION0 +---> NAMESPACE0.0 +--> PMEM8 "pm0.0" |
-              |    | +---------+   +--------------+  +---------------+
-+-------+     |    | +---------+   +--------------+  +---------------+
-| DIMM0 <-+   |    +-> REGION1 +---> NAMESPACE1.0 +--> PMEM6 "pm1.0" |
-+-------+ |   |    | +---------+   +--------------+  +---------------+
-| DIMM1 <-+ +-v--+ | +---------+   +--------------+  +---------------+
-+-------+ +-+BUS0+---> REGION2 +-+-> NAMESPACE2.0 +--> ND6  "blk2.0" |
-| DIMM2 <-+ +----+ | +---------+ | +--------------+  +----------------------+
-+-------+ |        |             +-> NAMESPACE2.1 +--> ND5  "blk2.1" | BTT2 |
-| DIMM3 <-+        |               +--------------+  +----------------------+
-+-------+          | +---------+   +--------------+  +---------------+
-                   +-> REGION3 +-+-> NAMESPACE3.0 +--> ND4  "blk3.0" |
-                   | +---------+ | +--------------+  +----------------------+
-                   |             +-> NAMESPACE3.1 +--> ND3  "blk3.1" | BTT1 |
-                   |               +--------------+  +----------------------+
-                   | +---------+   +--------------+  +---------------+
-                   +-> REGION4 +---> NAMESPACE4.0 +--> ND2  "blk4.0" |
-                   | +---------+   +--------------+  +---------------+
-                   | +---------+   +--------------+  +----------------------+
-                   +-> REGION5 +---> NAMESPACE5.0 +--> ND1  "blk5.0" | BTT0 |
-                     +---------+   +--------------+  +---------------+------+
-
-
diff --git a/Documentation/nvdimm/security.txt b/Documentation/nvdimm/security.txt
deleted file mode 100644
index 4c36c05ca98e..000000000000
--- a/Documentation/nvdimm/security.txt
+++ /dev/null
@@ -1,141 +0,0 @@
-NVDIMM SECURITY
-===============
-
-1. Introduction
----------------
-
-With the introduction of Intel Device Specific Methods (DSM) v1.8
-specification [1], security DSMs are introduced. The spec added the following
-security DSMs: "get security state", "set passphrase", "disable passphrase",
-"unlock unit", "freeze lock", "secure erase", and "overwrite". A security_ops
-data structure has been added to struct dimm in order to support the security
-operations and generic APIs are exposed to allow vendor neutral operations.
-
-2. Sysfs Interface
-------------------
-The "security" sysfs attribute is provided in the nvdimm sysfs directory. For
-example:
-/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/nmem0/security
-
-The "show" attribute of that attribute will display the security state for
-that DIMM. The following states are available: disabled, unlocked, locked,
-frozen, and overwrite. If security is not supported, the sysfs attribute
-will not be visible.
-
-The "store" attribute takes several commands when it is being written to
-in order to support some of the security functionalities:
-update <old_keyid> <new_keyid> - enable or update passphrase.
-disable <keyid> - disable enabled security and remove key.
-freeze - freeze changing of security states.
-erase <keyid> - delete existing user encryption key.
-overwrite <keyid> - wipe the entire nvdimm.
-master_update <keyid> <new_keyid> - enable or update master passphrase.
-master_erase <keyid> - delete existing user encryption key.
-
-3. Key Management
------------------
-
-The key is associated to the payload by the DIMM id. For example:
-# cat /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/nmem0/nfit/id
-8089-a2-1740-00000133
-The DIMM id would be provided along with the key payload (passphrase) to
-the kernel.
-
-The security keys are managed on the basis of a single key per DIMM. The
-key "passphrase" is expected to be 32bytes long. This is similar to the ATA
-security specification [2]. A key is initially acquired via the request_key()
-kernel API call during nvdimm unlock. It is up to the user to make sure that
-all the keys are in the kernel user keyring for unlock.
-
-A nvdimm encrypted-key of format enc32 has the description format of:
-nvdimm:<bus-provider-specific-unique-id>
-
-See file ``Documentation/security/keys/trusted-encrypted.rst`` for creating
-encrypted-keys of enc32 format. TPM usage with a master trusted key is
-preferred for sealing the encrypted-keys.
-
-4. Unlocking
-------------
-When the DIMMs are being enumerated by the kernel, the kernel will attempt to
-retrieve the key from the kernel user keyring. This is the only time
-a locked DIMM can be unlocked. Once unlocked, the DIMM will remain unlocked
-until reboot. Typically an entity (i.e. shell script) will inject all the
-relevant encrypted-keys into the kernel user keyring during the initramfs phase.
-This provides the unlock function access to all the related keys that contain
-the passphrase for the respective nvdimms.  It is also recommended that the
-keys are injected before libnvdimm is loaded by modprobe.
-
-5. Update
----------
-When doing an update, it is expected that the existing key is removed from
-the kernel user keyring and reinjected as different (old) key. It's irrelevant
-what the key description is for the old key since we are only interested in the
-keyid when doing the update operation. It is also expected that the new key
-is injected with the description format described from earlier in this
-document.  The update command written to the sysfs attribute will be with
-the format:
-update <old keyid> <new keyid>
-
-If there is no old keyid due to a security enabling, then a 0 should be
-passed in.
-
-6. Freeze
----------
-The freeze operation does not require any keys. The security config can be
-frozen by a user with root privelege.
-
-7. Disable
-----------
-The security disable command format is:
-disable <keyid>
-
-An key with the current passphrase payload that is tied to the nvdimm should be
-in the kernel user keyring.
-
-8. Secure Erase
----------------
-The command format for doing a secure erase is:
-erase <keyid>
-
-An key with the current passphrase payload that is tied to the nvdimm should be
-in the kernel user keyring.
-
-9. Overwrite
-------------
-The command format for doing an overwrite is:
-overwrite <keyid>
-
-Overwrite can be done without a key if security is not enabled. A key serial
-of 0 can be passed in to indicate no key.
-
-The sysfs attribute "security" can be polled to wait on overwrite completion.
-Overwrite can last tens of minutes or more depending on nvdimm size.
-
-An encrypted-key with the current user passphrase that is tied to the nvdimm
-should be injected and its keyid should be passed in via sysfs.
-
-10. Master Update
------------------
-The command format for doing a master update is:
-update <old keyid> <new keyid>
-
-The operating mechanism for master update is identical to update except the
-master passphrase key is passed to the kernel. The master passphrase key
-is just another encrypted-key.
-
-This command is only available when security is disabled.
-
-11. Master Erase
-----------------
-The command format for doing a master erase is:
-master_erase <current keyid>
-
-This command has the same operating mechanism as erase except the master
-passphrase key is passed to the kernel. The master passphrase key is just
-another encrypted-key.
-
-This command is only available when the master security is enabled, indicated
-by the extended security status.
-
-[1]: http://pmem.io/documents/NVDIMM_DSM_Interface-V1.8.pdf
-[2]: http://www.t13.org/documents/UploadedDocuments/docs2006/e05179r4-ACS-SecurityClarifications.pdf
diff --git a/Documentation/nvmem/nvmem.txt b/Documentation/nvmem/nvmem.txt
deleted file mode 100644
index fc2fe4b18655..000000000000
--- a/Documentation/nvmem/nvmem.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-			    NVMEM SUBSYSTEM
-	  Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
-
-This document explains the NVMEM Framework along with the APIs provided,
-and how to use it.
-
-1. Introduction
-===============
-*NVMEM* is the abbreviation for Non Volatile Memory layer. It is used to
-retrieve configuration of SOC or Device specific data from non volatile
-memories like eeprom, efuses and so on.
-
-Before this framework existed, NVMEM drivers like eeprom were stored in
-drivers/misc, where they all had to duplicate pretty much the same code to
-register a sysfs file, allow in-kernel users to access the content of the
-devices they were driving, etc.
-
-This was also a problem as far as other in-kernel users were involved, since
-the solutions used were pretty much different from one driver to another, there
-was a rather big abstraction leak.
-
-This framework aims at solve these problems. It also introduces DT
-representation for consumer devices to go get the data they require (MAC
-Addresses, SoC/Revision ID, part numbers, and so on) from the NVMEMs. This
-framework is based on regmap, so that most of the abstraction available in
-regmap can be reused, across multiple types of buses.
-
-NVMEM Providers
-+++++++++++++++
-
-NVMEM provider refers to an entity that implements methods to initialize, read
-and write the non-volatile memory.
-
-2. Registering/Unregistering the NVMEM provider
-===============================================
-
-A NVMEM provider can register with NVMEM core by supplying relevant
-nvmem configuration to nvmem_register(), on success core would return a valid
-nvmem_device pointer.
-
-nvmem_unregister(nvmem) is used to unregister a previously registered provider.
-
-For example, a simple qfprom case:
-
-static struct nvmem_config econfig = {
-	.name = "qfprom",
-	.owner = THIS_MODULE,
-};
-
-static int qfprom_probe(struct platform_device *pdev)
-{
-	...
-	econfig.dev = &pdev->dev;
-	nvmem = nvmem_register(&econfig);
-	...
-}
-
-It is mandatory that the NVMEM provider has a regmap associated with its
-struct device. Failure to do would return error code from nvmem_register().
-
-Users of board files can define and register nvmem cells using the
-nvmem_cell_table struct:
-
-static struct nvmem_cell_info foo_nvmem_cells[] = {
-	{
-		.name		= "macaddr",
-		.offset		= 0x7f00,
-		.bytes		= ETH_ALEN,
-	}
-};
-
-static struct nvmem_cell_table foo_nvmem_cell_table = {
-	.nvmem_name		= "i2c-eeprom",
-	.cells			= foo_nvmem_cells,
-	.ncells			= ARRAY_SIZE(foo_nvmem_cells),
-};
-
-nvmem_add_cell_table(&foo_nvmem_cell_table);
-
-Additionally it is possible to create nvmem cell lookup entries and register
-them with the nvmem framework from machine code as shown in the example below:
-
-static struct nvmem_cell_lookup foo_nvmem_lookup = {
-	.nvmem_name		= "i2c-eeprom",
-	.cell_name		= "macaddr",
-	.dev_id			= "foo_mac.0",
-	.con_id			= "mac-address",
-};
-
-nvmem_add_cell_lookups(&foo_nvmem_lookup, 1);
-
-NVMEM Consumers
-+++++++++++++++
-
-NVMEM consumers are the entities which make use of the NVMEM provider to
-read from and to NVMEM.
-
-3. NVMEM cell based consumer APIs
-=================================
-
-NVMEM cells are the data entries/fields in the NVMEM.
-The NVMEM framework provides 3 APIs to read/write NVMEM cells.
-
-struct nvmem_cell *nvmem_cell_get(struct device *dev, const char *name);
-struct nvmem_cell *devm_nvmem_cell_get(struct device *dev, const char *name);
-
-void nvmem_cell_put(struct nvmem_cell *cell);
-void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell);
-
-void *nvmem_cell_read(struct nvmem_cell *cell, ssize_t *len);
-int nvmem_cell_write(struct nvmem_cell *cell, void *buf, ssize_t len);
-
-*nvmem_cell_get() apis will get a reference to nvmem cell for a given id,
-and nvmem_cell_read/write() can then read or write to the cell.
-Once the usage of the cell is finished the consumer should call *nvmem_cell_put()
-to free all the allocation memory for the cell.
-
-4. Direct NVMEM device based consumer APIs
-==========================================
-
-In some instances it is necessary to directly read/write the NVMEM.
-To facilitate such consumers NVMEM framework provides below apis.
-
-struct nvmem_device *nvmem_device_get(struct device *dev, const char *name);
-struct nvmem_device *devm_nvmem_device_get(struct device *dev,
-					   const char *name);
-void nvmem_device_put(struct nvmem_device *nvmem);
-int nvmem_device_read(struct nvmem_device *nvmem, unsigned int offset,
-		      size_t bytes, void *buf);
-int nvmem_device_write(struct nvmem_device *nvmem, unsigned int offset,
-		       size_t bytes, void *buf);
-int nvmem_device_cell_read(struct nvmem_device *nvmem,
-			   struct nvmem_cell_info *info, void *buf);
-int nvmem_device_cell_write(struct nvmem_device *nvmem,
-			    struct nvmem_cell_info *info, void *buf);
-
-Before the consumers can read/write NVMEM directly, it should get hold
-of nvmem_controller from one of the *nvmem_device_get() api.
-
-The difference between these apis and cell based apis is that these apis always
-take nvmem_device as parameter.
-
-5. Releasing a reference to the NVMEM
-=====================================
-
-When a consumer no longer needs the NVMEM, it has to release the reference
-to the NVMEM it has obtained using the APIs mentioned in the above section.
-The NVMEM framework provides 2 APIs to release a reference to the NVMEM.
-
-void nvmem_cell_put(struct nvmem_cell *cell);
-void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell);
-void nvmem_device_put(struct nvmem_device *nvmem);
-void devm_nvmem_device_put(struct device *dev, struct nvmem_device *nvmem);
-
-Both these APIs are used to release a reference to the NVMEM and
-devm_nvmem_cell_put and devm_nvmem_device_put destroys the devres associated
-with this NVMEM.
-
-Userspace
-+++++++++
-
-6. Userspace binary interface
-==============================
-
-Userspace can read/write the raw NVMEM file located at
-/sys/bus/nvmem/devices/*/nvmem
-
-ex:
-
-hexdump /sys/bus/nvmem/devices/qfprom0/nvmem
-
-0000000 0000 0000 0000 0000 0000 0000 0000 0000
-*
-00000a0 db10 2240 0000 e000 0c00 0c00 0000 0c00
-0000000 0000 0000 0000 0000 0000 0000 0000 0000
-...
-*
-0001000
-
-7. DeviceTree Binding
-=====================
-
-See Documentation/devicetree/bindings/nvmem/nvmem.txt
diff --git a/Documentation/pcmcia/devicetable.rst b/Documentation/pcmcia/devicetable.rst
new file mode 100644
index 000000000000..fd1d60d12ca1
--- /dev/null
+++ b/Documentation/pcmcia/devicetable.rst
@@ -0,0 +1,37 @@
+============
+Device table
+============
+
+Matching of PCMCIA devices to drivers is done using one or more of the
+following criteria:
+
+- manufactor ID
+- card ID
+- product ID strings _and_ hashes of these strings
+- function ID
+- device function (actual and pseudo)
+
+You should use the helpers in include/pcmcia/device_id.h for generating the
+struct pcmcia_device_id[] entries which match devices to drivers.
+
+If you want to match product ID strings, you also need to pass the crc32
+hashes of the string to the macro, e.g. if you want to match the product ID
+string 1, you need to use
+
+PCMCIA_DEVICE_PROD_ID1("some_string", 0x(hash_of_some_string)),
+
+If the hash is incorrect, the kernel will inform you about this in "dmesg"
+upon module initialization, and tell you of the correct hash.
+
+You can determine the hash of the product ID strings by catting the file
+"modalias" in the sysfs directory of the PCMCIA device. It generates a string
+in the following form:
+pcmcia:m0149cC1ABf06pfn00fn00pa725B842DpbF1EFEE84pc0877B627pd00000000
+
+The hex value after "pa" is the hash of product ID string 1, after "pb" for
+string 2 and so on.
+
+Alternatively, you can use crc32hash (see tools/pcmcia/crc32hash.c)
+to determine the crc32 hash.  Simply pass the string you want to evaluate
+as argument to this program, e.g.:
+$ tools/pcmcia/crc32hash "Dual Speed"
diff --git a/Documentation/pcmcia/devicetable.txt b/Documentation/pcmcia/devicetable.txt
deleted file mode 100644
index 5f3e00ab54c4..000000000000
--- a/Documentation/pcmcia/devicetable.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-Matching of PCMCIA devices to drivers is done using one or more of the
-following criteria:
-
-- manufactor ID
-- card ID
-- product ID strings _and_ hashes of these strings
-- function ID
-- device function (actual and pseudo)
-
-You should use the helpers in include/pcmcia/device_id.h for generating the
-struct pcmcia_device_id[] entries which match devices to drivers.
-
-If you want to match product ID strings, you also need to pass the crc32
-hashes of the string to the macro, e.g. if you want to match the product ID
-string 1, you need to use
-
-PCMCIA_DEVICE_PROD_ID1("some_string", 0x(hash_of_some_string)),
-
-If the hash is incorrect, the kernel will inform you about this in "dmesg"
-upon module initialization, and tell you of the correct hash.
-
-You can determine the hash of the product ID strings by catting the file
-"modalias" in the sysfs directory of the PCMCIA device. It generates a string
-in the following form:
-pcmcia:m0149cC1ABf06pfn00fn00pa725B842DpbF1EFEE84pc0877B627pd00000000
-
-The hex value after "pa" is the hash of product ID string 1, after "pb" for
-string 2 and so on.
-
-Alternatively, you can use crc32hash (see tools/pcmcia/crc32hash.c)
-to determine the crc32 hash.  Simply pass the string you want to evaluate
-as argument to this program, e.g.:
-$ tools/pcmcia/crc32hash "Dual Speed"
diff --git a/Documentation/pcmcia/driver-changes.rst b/Documentation/pcmcia/driver-changes.rst
new file mode 100644
index 000000000000..33fe9ebec049
--- /dev/null
+++ b/Documentation/pcmcia/driver-changes.rst
@@ -0,0 +1,160 @@
+==============
+Driver changes
+==============
+
+This file details changes in 2.6 which affect PCMCIA card driver authors:
+
+* pcmcia_loop_config() and autoconfiguration (as of 2.6.36)
+   If `struct pcmcia_device *p_dev->config_flags` is set accordingly,
+   pcmcia_loop_config() now sets up certain configuration values
+   automatically, though the driver may still override the settings
+   in the callback function. The following autoconfiguration options
+   are provided at the moment:
+
+	- CONF_AUTO_CHECK_VCC : check for matching Vcc
+	- CONF_AUTO_SET_VPP   : set Vpp
+	- CONF_AUTO_AUDIO     : auto-enable audio line, if required
+	- CONF_AUTO_SET_IO    : set ioport resources (->resource[0,1])
+	- CONF_AUTO_SET_IOMEM : set first iomem resource (->resource[2])
+
+* pcmcia_request_configuration -> pcmcia_enable_device (as of 2.6.36)
+   pcmcia_request_configuration() got renamed to pcmcia_enable_device(),
+   as it mirrors pcmcia_disable_device(). Configuration settings are now
+   stored in struct pcmcia_device, e.g. in the fields config_flags,
+   config_index, config_base, vpp.
+
+* pcmcia_request_window changes (as of 2.6.36)
+   Instead of win_req_t, drivers are now requested to fill out
+   `struct pcmcia_device *p_dev->resource[2,3,4,5]` for up to four ioport
+   ranges. After a call to pcmcia_request_window(), the regions found there
+   are reserved and may be used immediately -- until pcmcia_release_window()
+   is called.
+
+* pcmcia_request_io changes (as of 2.6.36)
+   Instead of io_req_t, drivers are now requested to fill out
+   `struct pcmcia_device *p_dev->resource[0,1]` for up to two ioport
+   ranges. After a call to pcmcia_request_io(), the ports found there
+   are reserved, after calling pcmcia_request_configuration(), they may
+   be used.
+
+* No dev_info_t, no cs_types.h (as of 2.6.36)
+   dev_info_t and a few other typedefs are removed. No longer use them
+   in PCMCIA device drivers. Also, do not include pcmcia/cs_types.h, as
+   this file is gone.
+
+* No dev_node_t (as of 2.6.35)
+   There is no more need to fill out a "dev_node_t" structure.
+
+* New IRQ request rules (as of 2.6.35)
+   Instead of the old pcmcia_request_irq() interface, drivers may now
+   choose between:
+
+   - calling request_irq/free_irq directly. Use the IRQ from `*p_dev->irq`.
+   - use pcmcia_request_irq(p_dev, handler_t); the PCMCIA core will
+     clean up automatically on calls to pcmcia_disable_device() or
+     device ejection.
+
+* no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33)
+   Instead of the cs_error() callback or the CS_CHECK() macro, please use
+   Linux-style checking of return values, and -- if necessary -- debug
+   messages using "dev_dbg()" or "pr_debug()".
+
+* New CIS tuple access (as of 2.6.33)
+   Instead of pcmcia_get_{first,next}_tuple(), pcmcia_get_tuple_data() and
+   pcmcia_parse_tuple(), a driver shall use "pcmcia_get_tuple()" if it is
+   only interested in one (raw) tuple, or "pcmcia_loop_tuple()" if it is
+   interested in all tuples of one type. To decode the MAC from CISTPL_FUNCE,
+   a new helper "pcmcia_get_mac_from_cis()" was added.
+
+* New configuration loop helper (as of 2.6.28)
+   By calling pcmcia_loop_config(), a driver can iterate over all available
+   configuration options. During a driver's probe() phase, one doesn't need
+   to use pcmcia_get_{first,next}_tuple, pcmcia_get_tuple_data and
+   pcmcia_parse_tuple directly in most if not all cases.
+
+* New release helper (as of 2.6.17)
+   Instead of calling pcmcia_release_{configuration,io,irq,win}, all that's
+   necessary now is calling pcmcia_disable_device. As there is no valid
+   reason left to call pcmcia_release_io and pcmcia_release_irq, the
+   exports for them were removed.
+
+* Unify detach and REMOVAL event code, as well as attach and INSERTION
+  code (as of 2.6.16)::
+
+       void (*remove)          (struct pcmcia_device *dev);
+       int (*probe)            (struct pcmcia_device *dev);
+
+* Move suspend, resume and reset out of event handler (as of 2.6.16)::
+
+       int (*suspend)          (struct pcmcia_device *dev);
+       int (*resume)           (struct pcmcia_device *dev);
+
+  should be initialized in struct pcmcia_driver, and handle
+  (SUSPEND == RESET_PHYSICAL) and (RESUME == CARD_RESET) events
+
+* event handler initialization in struct pcmcia_driver (as of 2.6.13)
+   The event handler is notified of all events, and must be initialized
+   as the event() callback in the driver's struct pcmcia_driver.
+
+* pcmcia/version.h should not be used (as of 2.6.13)
+   This file will be removed eventually.
+
+* in-kernel device<->driver matching (as of 2.6.13)
+   PCMCIA devices and their correct drivers can now be matched in
+   kernelspace. See 'devicetable.txt' for details.
+
+* Device model integration (as of 2.6.11)
+   A struct pcmcia_device is registered with the device model core,
+   and can be used (e.g. for SET_NETDEV_DEV) by using
+   handle_to_dev(client_handle_t * handle).
+
+* Convert internal I/O port addresses to unsigned int (as of 2.6.11)
+   ioaddr_t should be replaced by unsigned int in PCMCIA card drivers.
+
+* irq_mask and irq_list parameters (as of 2.6.11)
+   The irq_mask and irq_list parameters should no longer be used in
+   PCMCIA card drivers. Instead, it is the job of the PCMCIA core to
+   determine which IRQ should be used. Therefore, link->irq.IRQInfo2
+   is ignored.
+
+* client->PendingEvents is gone (as of 2.6.11)
+   client->PendingEvents is no longer available.
+
+* client->Attributes are gone (as of 2.6.11)
+   client->Attributes is unused, therefore it is removed from all
+   PCMCIA card drivers
+
+* core functions no longer available (as of 2.6.11)
+   The following functions have been removed from the kernel source
+   because they are unused by all in-kernel drivers, and no external
+   driver was reported to rely on them::
+
+	pcmcia_get_first_region()
+	pcmcia_get_next_region()
+	pcmcia_modify_window()
+	pcmcia_set_event_mask()
+	pcmcia_get_first_window()
+	pcmcia_get_next_window()
+
+* device list iteration upon module removal (as of 2.6.10)
+   It is no longer necessary to iterate on the driver's internal
+   client list and call the ->detach() function upon module removal.
+
+* Resource management. (as of 2.6.8)
+   Although the PCMCIA subsystem will allocate resources for cards,
+   it no longer marks these resources busy. This means that driver
+   authors are now responsible for claiming your resources as per
+   other drivers in Linux. You should use request_region() to mark
+   your IO regions in-use, and request_mem_region() to mark your
+   memory regions in-use. The name argument should be a pointer to
+   your driver name. Eg, for pcnet_cs, name should point to the
+   string "pcnet_cs".
+
+* CardServices is gone
+  CardServices() in 2.4 is just a big switch statement to call various
+  services.  In 2.6, all of those entry points are exported and called
+  directly (except for pcmcia_report_error(), just use cs_error() instead).
+
+* struct pcmcia_driver
+  You need to use struct pcmcia_driver and pcmcia_{un,}register_driver
+  instead of {un,}register_pccard_driver
diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt
deleted file mode 100644
index 78355c4c268a..000000000000
--- a/Documentation/pcmcia/driver-changes.txt
+++ /dev/null
@@ -1,149 +0,0 @@
-This file details changes in 2.6 which affect PCMCIA card driver authors:
-* pcmcia_loop_config() and autoconfiguration (as of 2.6.36)
-   If struct pcmcia_device *p_dev->config_flags is set accordingly,
-   pcmcia_loop_config() now sets up certain configuration values
-   automatically, though the driver may still override the settings
-   in the callback function. The following autoconfiguration options
-   are provided at the moment:
-	CONF_AUTO_CHECK_VCC : check for matching Vcc
-	CONF_AUTO_SET_VPP   : set Vpp
-	CONF_AUTO_AUDIO     : auto-enable audio line, if required
-	CONF_AUTO_SET_IO    : set ioport resources (->resource[0,1])
-	CONF_AUTO_SET_IOMEM : set first iomem resource (->resource[2])
-
-* pcmcia_request_configuration -> pcmcia_enable_device (as of 2.6.36)
-   pcmcia_request_configuration() got renamed to pcmcia_enable_device(),
-   as it mirrors pcmcia_disable_device(). Configuration settings are now
-   stored in struct pcmcia_device, e.g. in the fields config_flags,
-   config_index, config_base, vpp.
-
-* pcmcia_request_window changes (as of 2.6.36)
-   Instead of win_req_t, drivers are now requested to fill out
-   struct pcmcia_device *p_dev->resource[2,3,4,5] for up to four ioport
-   ranges. After a call to pcmcia_request_window(), the regions found there
-   are reserved and may be used immediately -- until pcmcia_release_window()
-   is called.
-
-* pcmcia_request_io changes (as of 2.6.36)
-   Instead of io_req_t, drivers are now requested to fill out
-   struct pcmcia_device *p_dev->resource[0,1] for up to two ioport
-   ranges. After a call to pcmcia_request_io(), the ports found there
-   are reserved, after calling pcmcia_request_configuration(), they may
-   be used.
-
-* No dev_info_t, no cs_types.h (as of 2.6.36)
-   dev_info_t and a few other typedefs are removed. No longer use them
-   in PCMCIA device drivers. Also, do not include pcmcia/cs_types.h, as
-   this file is gone.
-
-* No dev_node_t (as of 2.6.35)
-   There is no more need to fill out a "dev_node_t" structure.
-
-* New IRQ request rules (as of 2.6.35)
-   Instead of the old pcmcia_request_irq() interface, drivers may now
-   choose between:
-   - calling request_irq/free_irq directly. Use the IRQ from *p_dev->irq.
-   - use pcmcia_request_irq(p_dev, handler_t); the PCMCIA core will
-     clean up automatically on calls to pcmcia_disable_device() or
-     device ejection.
-
-* no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33)
-   Instead of the cs_error() callback or the CS_CHECK() macro, please use
-   Linux-style checking of return values, and -- if necessary -- debug
-   messages using "dev_dbg()" or "pr_debug()".
-
-* New CIS tuple access (as of 2.6.33)
-   Instead of pcmcia_get_{first,next}_tuple(), pcmcia_get_tuple_data() and
-   pcmcia_parse_tuple(), a driver shall use "pcmcia_get_tuple()" if it is
-   only interested in one (raw) tuple, or "pcmcia_loop_tuple()" if it is
-   interested in all tuples of one type. To decode the MAC from CISTPL_FUNCE,
-   a new helper "pcmcia_get_mac_from_cis()" was added.
-
-* New configuration loop helper (as of 2.6.28)
-   By calling pcmcia_loop_config(), a driver can iterate over all available
-   configuration options. During a driver's probe() phase, one doesn't need
-   to use pcmcia_get_{first,next}_tuple, pcmcia_get_tuple_data and
-   pcmcia_parse_tuple directly in most if not all cases.
-
-* New release helper (as of 2.6.17)
-   Instead of calling pcmcia_release_{configuration,io,irq,win}, all that's
-   necessary now is calling pcmcia_disable_device. As there is no valid
-   reason left to call pcmcia_release_io and pcmcia_release_irq, the
-   exports for them were removed.
-
-* Unify detach and REMOVAL event code, as well as attach and INSERTION
-  code (as of 2.6.16)
-       void (*remove)          (struct pcmcia_device *dev);
-       int (*probe)            (struct pcmcia_device *dev);
-
-* Move suspend, resume and reset out of event handler (as of 2.6.16)
-       int (*suspend)          (struct pcmcia_device *dev);
-       int (*resume)           (struct pcmcia_device *dev);
-  should be initialized in struct pcmcia_driver, and handle
-  (SUSPEND == RESET_PHYSICAL) and (RESUME == CARD_RESET) events
-
-* event handler initialization in struct pcmcia_driver (as of 2.6.13)
-   The event handler is notified of all events, and must be initialized
-   as the event() callback in the driver's struct pcmcia_driver.
-
-* pcmcia/version.h should not be used (as of 2.6.13)
-   This file will be removed eventually.
-
-* in-kernel device<->driver matching (as of 2.6.13)
-   PCMCIA devices and their correct drivers can now be matched in
-   kernelspace. See 'devicetable.txt' for details.
-
-* Device model integration (as of 2.6.11)
-   A struct pcmcia_device is registered with the device model core,
-   and can be used (e.g. for SET_NETDEV_DEV) by using
-   handle_to_dev(client_handle_t * handle).
-
-* Convert internal I/O port addresses to unsigned int (as of 2.6.11)
-   ioaddr_t should be replaced by unsigned int in PCMCIA card drivers.
-
-* irq_mask and irq_list parameters (as of 2.6.11)
-   The irq_mask and irq_list parameters should no longer be used in
-   PCMCIA card drivers. Instead, it is the job of the PCMCIA core to
-   determine which IRQ should be used. Therefore, link->irq.IRQInfo2
-   is ignored.
-
-* client->PendingEvents is gone (as of 2.6.11)
-   client->PendingEvents is no longer available.
-
-* client->Attributes are gone (as of 2.6.11)
-   client->Attributes is unused, therefore it is removed from all
-   PCMCIA card drivers
-
-* core functions no longer available (as of 2.6.11)
-   The following functions have been removed from the kernel source
-   because they are unused by all in-kernel drivers, and no external
-   driver was reported to rely on them:
-	pcmcia_get_first_region()
-	pcmcia_get_next_region()
-	pcmcia_modify_window()
-	pcmcia_set_event_mask()
-	pcmcia_get_first_window()
-	pcmcia_get_next_window()
-
-* device list iteration upon module removal (as of 2.6.10)
-   It is no longer necessary to iterate on the driver's internal
-   client list and call the ->detach() function upon module removal.
-
-* Resource management. (as of 2.6.8)
-   Although the PCMCIA subsystem will allocate resources for cards,
-   it no longer marks these resources busy. This means that driver
-   authors are now responsible for claiming your resources as per
-   other drivers in Linux. You should use request_region() to mark
-   your IO regions in-use, and request_mem_region() to mark your
-   memory regions in-use. The name argument should be a pointer to
-   your driver name. Eg, for pcnet_cs, name should point to the
-   string "pcnet_cs".
-
-* CardServices is gone
-  CardServices() in 2.4 is just a big switch statement to call various
-  services.  In 2.6, all of those entry points are exported and called
-  directly (except for pcmcia_report_error(), just use cs_error() instead).
-
-* struct pcmcia_driver
-  You need to use struct pcmcia_driver and pcmcia_{un,}register_driver
-  instead of {un,}register_pccard_driver
diff --git a/Documentation/pcmcia/driver.rst b/Documentation/pcmcia/driver.rst
new file mode 100644
index 000000000000..5c4fe84d51c1
--- /dev/null
+++ b/Documentation/pcmcia/driver.rst
@@ -0,0 +1,30 @@
+=============
+PCMCIA Driver
+=============
+
+sysfs
+-----
+
+New PCMCIA IDs may be added to a device driver pcmcia_device_id table at
+runtime as shown below::
+
+  echo "match_flags manf_id card_id func_id function device_no \
+  prod_id_hash[0] prod_id_hash[1] prod_id_hash[2] prod_id_hash[3]" > \
+  /sys/bus/pcmcia/drivers/{driver}/new_id
+
+All fields are passed in as hexadecimal values (no leading 0x).
+The meaning is described in the PCMCIA specification, the match_flags is
+a bitwise or-ed combination from PCMCIA_DEV_ID_MATCH_* constants
+defined in include/linux/mod_devicetable.h.
+
+Once added, the driver probe routine will be invoked for any unclaimed
+PCMCIA device listed in its (newly updated) pcmcia_device_id list.
+
+A common use-case is to add a new device according to the manufacturer ID
+and the card ID (form the manf_id and card_id file in the device tree).
+For this, just use::
+
+  echo "0x3 manf_id card_id 0 0 0 0 0 0 0" > \
+    /sys/bus/pcmcia/drivers/{driver}/new_id
+
+after loading the driver.
diff --git a/Documentation/pcmcia/driver.txt b/Documentation/pcmcia/driver.txt
deleted file mode 100644
index 0ac167920778..000000000000
--- a/Documentation/pcmcia/driver.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-PCMCIA Driver
--------------
-
-
-sysfs
------
-
-New PCMCIA IDs may be added to a device driver pcmcia_device_id table at
-runtime as shown below:
-
-echo "match_flags manf_id card_id func_id function device_no \
-prod_id_hash[0] prod_id_hash[1] prod_id_hash[2] prod_id_hash[3]" > \
-/sys/bus/pcmcia/drivers/{driver}/new_id
-
-All fields are passed in as hexadecimal values (no leading 0x).
-The meaning is described in the PCMCIA specification, the match_flags is
-a bitwise or-ed combination from PCMCIA_DEV_ID_MATCH_* constants
-defined in include/linux/mod_devicetable.h.
-
-Once added, the driver probe routine will be invoked for any unclaimed
-PCMCIA device listed in its (newly updated) pcmcia_device_id list.
-
-A common use-case is to add a new device according to the manufacturer ID
-and the card ID (form the manf_id and card_id file in the device tree).
-For this, just use:
-
-echo "0x3 manf_id card_id 0 0 0 0 0 0 0" > \
-        /sys/bus/pcmcia/drivers/{driver}/new_id
-
-after loading the driver.
diff --git a/Documentation/pcmcia/index.rst b/Documentation/pcmcia/index.rst
new file mode 100644
index 000000000000..7ae1f62fca14
--- /dev/null
+++ b/Documentation/pcmcia/index.rst
@@ -0,0 +1,20 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======
+pcmcia
+======
+
+.. toctree::
+    :maxdepth: 1
+
+    driver
+    devicetable
+    locking
+    driver-changes
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/pcmcia/locking.rst b/Documentation/pcmcia/locking.rst
new file mode 100644
index 000000000000..e35257139c89
--- /dev/null
+++ b/Documentation/pcmcia/locking.rst
@@ -0,0 +1,133 @@
+=======
+Locking
+=======
+
+This file explains the locking and exclusion scheme used in the PCCARD
+and PCMCIA subsystems.
+
+
+A) Overview, Locking Hierarchy:
+===============================
+
+pcmcia_socket_list_rwsem
+	- protects only the list of sockets
+
+- skt_mutex
+	- serializes card insert / ejection
+
+  - ops_mutex
+	- serializes socket operation
+
+
+B) Exclusion
+============
+
+The following functions and callbacks to struct pcmcia_socket must
+be called with "skt_mutex" held::
+
+	socket_detect_change()
+	send_event()
+	socket_reset()
+	socket_shutdown()
+	socket_setup()
+	socket_remove()
+	socket_insert()
+	socket_early_resume()
+	socket_late_resume()
+	socket_resume()
+	socket_suspend()
+
+	struct pcmcia_callback	*callback
+
+The following functions and callbacks to struct pcmcia_socket must
+be called with "ops_mutex" held::
+
+	socket_reset()
+	socket_setup()
+
+	struct pccard_operations	*ops
+	struct pccard_resource_ops	*resource_ops;
+
+Note that send_event() and `struct pcmcia_callback *callback` must not be
+called with "ops_mutex" held.
+
+
+C) Protection
+=============
+
+1. Global Data:
+---------------
+struct list_head	pcmcia_socket_list;
+
+protected by pcmcia_socket_list_rwsem;
+
+
+2. Per-Socket Data:
+-------------------
+The resource_ops and their data are protected by ops_mutex.
+
+The "main" struct pcmcia_socket is protected as follows (read-only fields
+or single-use fields not mentioned):
+
+- by pcmcia_socket_list_rwsem::
+
+	struct list_head	socket_list;
+
+- by thread_lock::
+
+	unsigned int		thread_events;
+
+- by skt_mutex::
+
+	u_int			suspended_state;
+	void			(*tune_bridge);
+	struct pcmcia_callback	*callback;
+	int			resume_status;
+
+- by ops_mutex::
+
+	socket_state_t		socket;
+	u_int			state;
+	u_short			lock_count;
+	pccard_mem_map		cis_mem;
+	void __iomem 		*cis_virt;
+	struct { }		irq;
+	io_window_t		io[];
+	pccard_mem_map		win[];
+	struct list_head	cis_cache;
+	size_t			fake_cis_len;
+	u8			*fake_cis;
+	u_int			irq_mask;
+	void 			(*zoom_video);
+	int 			(*power_hook);
+	u8			resource...;
+	struct list_head	devices_list;
+	u8			device_count;
+	struct 			pcmcia_state;
+
+
+3. Per PCMCIA-device Data:
+--------------------------
+
+The "main" struct pcmcia_device is protected as follows (read-only fields
+or single-use fields not mentioned):
+
+
+- by pcmcia_socket->ops_mutex::
+
+	struct list_head	socket_device_list;
+	struct config_t		*function_config;
+	u16			_irq:1;
+	u16			_io:1;
+	u16			_win:4;
+	u16			_locked:1;
+	u16			allow_func_id_match:1;
+	u16			suspended:1;
+	u16			_removed:1;
+
+- by the PCMCIA driver::
+
+	io_req_t		io;
+	irq_req_t		irq;
+	config_req_t		conf;
+	window_handle_t		win;
diff --git a/Documentation/pcmcia/locking.txt b/Documentation/pcmcia/locking.txt
deleted file mode 100644
index b2c9b478906b..000000000000
--- a/Documentation/pcmcia/locking.txt
+++ /dev/null
@@ -1,118 +0,0 @@
-This file explains the locking and exclusion scheme used in the PCCARD
-and PCMCIA subsystems.
-
-
-A) Overview, Locking Hierarchy:
-===============================
-
-pcmcia_socket_list_rwsem	- protects only the list of sockets
-- skt_mutex			- serializes card insert / ejection
-  - ops_mutex			- serializes socket operation
-
-
-B) Exclusion
-============
-
-The following functions and callbacks to struct pcmcia_socket must
-be called with "skt_mutex" held:
-
-	socket_detect_change()
-	send_event()
-	socket_reset()
-	socket_shutdown()
-	socket_setup()
-	socket_remove()
-	socket_insert()
-	socket_early_resume()
-	socket_late_resume()
-	socket_resume()
-	socket_suspend()
-
-	struct pcmcia_callback	*callback
-
-The following functions and callbacks to struct pcmcia_socket must
-be called with "ops_mutex" held:
-
-	socket_reset()
-	socket_setup()
-
-	struct pccard_operations	*ops
-	struct pccard_resource_ops	*resource_ops;
-
-Note that send_event() and struct pcmcia_callback *callback must not be
-called with "ops_mutex" held.
-
-
-C) Protection
-=============
-
-1. Global Data:
----------------
-struct list_head	pcmcia_socket_list;
-
-protected by pcmcia_socket_list_rwsem;
-
-
-2. Per-Socket Data:
--------------------
-The resource_ops and their data are protected by ops_mutex.
-
-The "main" struct pcmcia_socket is protected as follows (read-only fields
-or single-use fields not mentioned):
-
-- by pcmcia_socket_list_rwsem:
-	struct list_head	socket_list;
-
-- by thread_lock:
-	unsigned int		thread_events;
-
-- by skt_mutex:
-	u_int			suspended_state;
-	void			(*tune_bridge);
-	struct pcmcia_callback	*callback;
-	int			resume_status;
-
-- by ops_mutex:
-	socket_state_t		socket;
-	u_int			state;
-	u_short			lock_count;
-	pccard_mem_map		cis_mem;
-	void __iomem 		*cis_virt;
-	struct { }		irq;
-	io_window_t		io[];
-	pccard_mem_map		win[];
-	struct list_head	cis_cache;
-	size_t			fake_cis_len;
-	u8			*fake_cis;
-	u_int			irq_mask;
-	void 			(*zoom_video);
-	int 			(*power_hook);
-	u8			resource...;
-	struct list_head	devices_list;
-	u8			device_count;
-	struct 			pcmcia_state;
-
-
-3. Per PCMCIA-device Data:
---------------------------
-
-The "main" struct pcmcia_device is protected as follows (read-only fields
-or single-use fields not mentioned):
-
-
-- by pcmcia_socket->ops_mutex:
-	struct list_head	socket_device_list;
-	struct config_t		*function_config;
-	u16			_irq:1;
-	u16			_io:1;
-	u16			_win:4;
-	u16			_locked:1;
-	u16			allow_func_id_match:1;
-	u16			suspended:1;
-	u16			_removed:1;
-
-- by the PCMCIA driver:
-	io_req_t		io;
-	irq_req_t		irq;
-	config_req_t		conf;
-	window_handle_t		win;
diff --git a/Documentation/perf/arm-ccn.txt b/Documentation/perf/arm-ccn.txt
deleted file mode 100644
index 15cdb7bc57c3..000000000000
--- a/Documentation/perf/arm-ccn.txt
+++ /dev/null
@@ -1,59 +0,0 @@
-ARM Cache Coherent Network
-==========================
-
-CCN-504 is a ring-bus interconnect consisting of 11 crosspoints
-(XPs), with each crosspoint supporting up to two device ports,
-so nodes (devices) 0 and 1 are connected to crosspoint 0,
-nodes 2 and 3 to crosspoint 1 etc.
-
-PMU (perf) driver
------------------
-
-The CCN driver registers a perf PMU driver, which provides
-description of available events and configuration options
-in sysfs, see /sys/bus/event_source/devices/ccn*.
-
-The "format" directory describes format of the config, config1
-and config2 fields of the perf_event_attr structure. The "events"
-directory provides configuration templates for all documented
-events, that can be used with perf tool. For example "xp_valid_flit"
-is an equivalent of "type=0x8,event=0x4". Other parameters must be
-explicitly specified.
-
-For events originating from device, "node" defines its index.
-
-Crosspoint PMU events require "xp" (index), "bus" (bus number)
-and "vc" (virtual channel ID).
-
-Crosspoint watchpoint-based events (special "event" value 0xfe)
-require "xp" and "vc" as as above plus "port" (device port index),
-"dir" (transmit/receive direction), comparator values ("cmp_l"
-and "cmp_h") and "mask", being index of the comparator mask.
-Masks are defined separately from the event description
-(due to limited number of the config values) in the "cmp_mask"
-directory, with first 8 configurable by user and additional
-4 hardcoded for the most frequent use cases.
-
-Cycle counter is described by a "type" value 0xff and does
-not require any other settings.
-
-The driver also provides a "cpumask" sysfs attribute, which contains
-a single CPU ID, of the processor which will be used to handle all
-the CCN PMU events. It is recommended that the user space tools
-request the events on this processor (if not, the perf_event->cpu value
-will be overwritten anyway). In case of this processor being offlined,
-the events are migrated to another one and the attribute is updated.
-
-Example of perf tool use:
-
-/ # perf list | grep ccn
-  ccn/cycles/                                        [Kernel PMU event]
-<...>
-  ccn/xp_valid_flit,xp=?,port=?,vc=?,dir=?/          [Kernel PMU event]
-<...>
-
-/ # perf stat -a -e ccn/cycles/,ccn/xp_valid_flit,xp=1,port=0,vc=1,dir=1/ \
-                                                                       sleep 1
-
-The driver does not support sampling, therefore "perf record" will
-not work. Per-task (without "-a") perf sessions are not supported.
diff --git a/Documentation/perf/arm_dsu_pmu.txt b/Documentation/perf/arm_dsu_pmu.txt
deleted file mode 100644
index d611e15f5add..000000000000
--- a/Documentation/perf/arm_dsu_pmu.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-ARM DynamIQ Shared Unit (DSU) PMU
-==================================
-
-ARM DynamIQ Shared Unit integrates one or more cores with an L3 memory system,
-control logic and external interfaces to form a multicore cluster. The PMU
-allows counting the various events related to the L3 cache, Snoop Control Unit
-etc, using 32bit independent counters. It also provides a 64bit cycle counter.
-
-The PMU can only be accessed via CPU system registers and are common to the
-cores connected to the same DSU. Like most of the other uncore PMUs, DSU
-PMU doesn't support process specific events and cannot be used in sampling mode.
-
-The DSU provides a bitmap for a subset of implemented events via hardware
-registers. There is no way for the driver to determine if the other events
-are available or not. Hence the driver exposes only those events advertised
-by the DSU, in "events" directory under :
-
-  /sys/bus/event_sources/devices/arm_dsu_<N>/
-
-The user should refer to the TRM of the product to figure out the supported events
-and use the raw event code for the unlisted events.
-
-The driver also exposes the CPUs connected to the DSU instance in "associated_cpus".
-
-
-e.g usage :
-
-	perf stat -a -e arm_dsu_0/cycles/
diff --git a/Documentation/perf/hisi-pmu.txt b/Documentation/perf/hisi-pmu.txt
deleted file mode 100644
index 267a028b2741..000000000000
--- a/Documentation/perf/hisi-pmu.txt
+++ /dev/null
@@ -1,53 +0,0 @@
-HiSilicon SoC uncore Performance Monitoring Unit (PMU)
-======================================================
-The HiSilicon SoC chip includes various independent system device PMUs
-such as L3 cache (L3C), Hydra Home Agent (HHA) and DDRC. These PMUs are
-independent and have hardware logic to gather statistics and performance
-information.
-
-The HiSilicon SoC encapsulates multiple CPU and IO dies. Each CPU cluster
-(CCL) is made up of 4 cpu cores sharing one L3 cache; each CPU die is
-called Super CPU cluster (SCCL) and is made up of 6 CCLs. Each SCCL has
-two HHAs (0 - 1) and four DDRCs (0 - 3), respectively.
-
-HiSilicon SoC uncore PMU driver
----------------------------------------
-Each device PMU has separate registers for event counting, control and
-interrupt, and the PMU driver shall register perf PMU drivers like L3C,
-HHA and DDRC etc. The available events and configuration options shall
-be described in the sysfs, see :
-/sys/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>/, or
-/sys/bus/event_source/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>.
-The "perf list" command shall list the available events from sysfs.
-
-Each L3C, HHA and DDRC is registered as a separate PMU with perf. The PMU
-name will appear in event listing as hisi_sccl<sccl-id>_module<index-id>.
-where "sccl-id" is the identifier of the SCCL and "index-id" is the index of
-module.
-e.g. hisi_sccl3_l3c0/rd_hit_cpipe is READ_HIT_CPIPE event of L3C index #0 in
-SCCL ID #3.
-e.g. hisi_sccl1_hha0/rx_operations is RX_OPERATIONS event of HHA index #0 in
-SCCL ID #1.
-
-The driver also provides a "cpumask" sysfs attribute, which shows the CPU core
-ID used to count the uncore PMU event.
-
-Example usage of perf:
-$# perf list
-hisi_sccl3_l3c0/rd_hit_cpipe/ [kernel PMU event]
-------------------------------------------
-hisi_sccl3_l3c0/wr_hit_cpipe/ [kernel PMU event]
-------------------------------------------
-hisi_sccl1_l3c0/rd_hit_cpipe/ [kernel PMU event]
-------------------------------------------
-hisi_sccl1_l3c0/wr_hit_cpipe/ [kernel PMU event]
-------------------------------------------
-
-$# perf stat -a -e hisi_sccl3_l3c0/rd_hit_cpipe/ sleep 5
-$# perf stat -a -e hisi_sccl3_l3c0/config=0x02/ sleep 5
-
-The current driver does not support sampling. So "perf record" is unsupported.
-Also attach to a task is unsupported as the events are all uncore.
-
-Note: Please contact the maintainer for a complete list of events supported for
-the PMU devices in the SoC and its information if needed.
diff --git a/Documentation/perf/qcom_l2_pmu.txt b/Documentation/perf/qcom_l2_pmu.txt
deleted file mode 100644
index b25b97659ab9..000000000000
--- a/Documentation/perf/qcom_l2_pmu.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-Qualcomm Technologies Level-2 Cache Performance Monitoring Unit (PMU)
-=====================================================================
-
-This driver supports the L2 cache clusters found in Qualcomm Technologies
-Centriq SoCs. There are multiple physical L2 cache clusters, each with their
-own PMU. Each cluster has one or more CPUs associated with it.
-
-There is one logical L2 PMU exposed, which aggregates the results from
-the physical PMUs.
-
-The driver provides a description of its available events and configuration
-options in sysfs, see /sys/devices/l2cache_0.
-
-The "format" directory describes the format of the events.
-
-Events can be envisioned as a 2-dimensional array. Each column represents
-a group of events. There are 8 groups. Only one entry from each
-group can be in use at a time. If multiple events from the same group
-are specified, the conflicting events cannot be counted at the same time.
-
-Events are specified as 0xCCG, where CC is 2 hex digits specifying
-the code (array row) and G specifies the group (column) 0-7.
-
-In addition there is a cycle counter event specified by the value 0xFE
-which is outside the above scheme.
-
-The driver provides a "cpumask" sysfs attribute which contains a mask
-consisting of one CPU per cluster which will be used to handle all the PMU
-events on that cluster.
-
-Examples for use with perf:
-
-  perf stat -e l2cache_0/config=0x001/,l2cache_0/config=0x042/ -a sleep 1
-
-  perf stat -e l2cache_0/config=0xfe/ -C 2 sleep 1
-
-The driver does not support sampling, therefore "perf record" will
-not work. Per-task perf sessions are not supported.
diff --git a/Documentation/perf/qcom_l3_pmu.txt b/Documentation/perf/qcom_l3_pmu.txt
deleted file mode 100644
index 96b3a9444a0d..000000000000
--- a/Documentation/perf/qcom_l3_pmu.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-Qualcomm Datacenter Technologies L3 Cache Performance Monitoring Unit (PMU)
-===========================================================================
-
-This driver supports the L3 cache PMUs found in Qualcomm Datacenter Technologies
-Centriq SoCs. The L3 cache on these SOCs is composed of multiple slices, shared
-by all cores within a socket. Each slice is exposed as a separate uncore perf
-PMU with device name l3cache_<socket>_<instance>. User space is responsible
-for aggregating across slices.
-
-The driver provides a description of its available events and configuration
-options in sysfs, see /sys/devices/l3cache*. Given that these are uncore PMUs
-the driver also exposes a "cpumask" sysfs attribute which contains a mask
-consisting of one CPU per socket which will be used to handle all the PMU
-events on that socket.
-
-The hardware implements 32bit event counters and has a flat 8bit event space
-exposed via the "event" format attribute. In addition to the 32bit physical
-counters the driver supports virtual 64bit hardware counters by using hardware
-counter chaining. This feature is exposed via the "lc" (long counter) format
-flag. E.g.:
-
-  perf stat -e l3cache_0_0/read-miss,lc/
-
-Given that these are uncore PMUs the driver does not support sampling, therefore
-"perf record" will not work. Per-task perf sessions are not supported.
diff --git a/Documentation/perf/thunderx2-pmu.txt b/Documentation/perf/thunderx2-pmu.txt
deleted file mode 100644
index dffc57143736..000000000000
--- a/Documentation/perf/thunderx2-pmu.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-Cavium ThunderX2 SoC Performance Monitoring Unit (PMU UNCORE)
-=============================================================
-
-The ThunderX2 SoC PMU consists of independent, system-wide, per-socket
-PMUs such as the Level 3 Cache (L3C) and DDR4 Memory Controller (DMC).
-
-The DMC has 8 interleaved channels and the L3C has 16 interleaved tiles.
-Events are counted for the default channel (i.e. channel 0) and prorated
-to the total number of channels/tiles.
-
-The DMC and L3C support up to 4 counters. Counters are independently
-programmable and can be started and stopped individually. Each counter
-can be set to a different event. Counters are 32-bit and do not support
-an overflow interrupt; they are read every 2 seconds.
-
-PMU UNCORE (perf) driver:
-
-The thunderx2_pmu driver registers per-socket perf PMUs for the DMC and
-L3C devices.  Each PMU can be used to count up to 4 events
-simultaneously. The PMUs provide a description of their available events
-and configuration options under sysfs, see
-/sys/devices/uncore_<l3c_S/dmc_S/>; S is the socket id.
-
-The driver does not support sampling, therefore "perf record" will not
-work. Per-task perf sessions are also not supported.
-
-Examples:
-
-# perf stat -a -e uncore_dmc_0/cnt_cycles/ sleep 1
-
-# perf stat -a -e \
-uncore_dmc_0/cnt_cycles/,\
-uncore_dmc_0/data_transfers/,\
-uncore_dmc_0/read_txns/,\
-uncore_dmc_0/write_txns/ sleep 1
-
-# perf stat -a -e \
-uncore_l3c_0/read_request/,\
-uncore_l3c_0/read_hit/,\
-uncore_l3c_0/inv_request/,\
-uncore_l3c_0/inv_hit/ sleep 1
diff --git a/Documentation/perf/xgene-pmu.txt b/Documentation/perf/xgene-pmu.txt
deleted file mode 100644
index d7cff4454e5b..000000000000
--- a/Documentation/perf/xgene-pmu.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-APM X-Gene SoC Performance Monitoring Unit (PMU)
-================================================
-
-X-Gene SoC PMU consists of various independent system device PMUs such as
-L3 cache(s), I/O bridge(s), memory controller bridge(s) and memory
-controller(s). These PMU devices are loosely architected to follow the
-same model as the PMU for ARM cores. The PMUs share the same top level
-interrupt and status CSR region.
-
-PMU (perf) driver
------------------
-
-The xgene-pmu driver registers several perf PMU drivers. Each of the perf
-driver provides description of its available events and configuration options
-in sysfs, see /sys/devices/<l3cX/iobX/mcbX/mcX>/.
-
-The "format" directory describes format of the config (event ID),
-config1 (agent ID) fields of the perf_event_attr structure. The "events"
-directory provides configuration templates for all supported event types that
-can be used with perf tool. For example, "l3c0/bank-fifo-full/" is an
-equivalent of "l3c0/config=0x0b/".
-
-Most of the SoC PMU has a specific list of agent ID used for monitoring
-performance of a specific datapath. For example, agents of a L3 cache can be
-a specific CPU or an I/O bridge. Each PMU has a set of 2 registers capable of
-masking the agents from which the request come from. If the bit with
-the bit number corresponding to the agent is set, the event is counted only if
-it is caused by a request from that agent. Each agent ID bit is inversely mapped
-to a corresponding bit in "config1" field. By default, the event will be
-counted for all agent requests (config1 = 0x0). For all the supported agents of
-each PMU, please refer to APM X-Gene User Manual.
-
-Each perf driver also provides a "cpumask" sysfs attribute, which contains a
-single CPU ID of the processor which will be used to handle all the PMU events.
-
-Example for perf tool use:
-
- / # perf list | grep -e l3c -e iob -e mcb -e mc
-   l3c0/ackq-full/                                    [Kernel PMU event]
- <...>
-   mcb1/mcb-csw-stall/                                [Kernel PMU event]
-
- / # perf stat -a -e l3c0/read-miss/,mcb1/csw-write-request/ sleep 1
-
- / # perf stat -a -e l3c0/read-miss,config1=0xfffffffffffffffe/ sleep 1
-
-The driver does not support sampling, therefore "perf record" will
-not work. Per-task (without "-a") perf sessions are not supported.
diff --git a/Documentation/phy.txt b/Documentation/phy.txt
deleted file mode 100644
index 457c3e0f86d6..000000000000
--- a/Documentation/phy.txt
+++ /dev/null
@@ -1,197 +0,0 @@
-=============
-PHY subsystem
-=============
-
-:Author: Kishon Vijay Abraham I <kishon@ti.com>
-
-This document explains the Generic PHY Framework along with the APIs provided,
-and how-to-use.
-
-Introduction
-============
-
-*PHY* is the abbreviation for physical layer. It is used to connect a device
-to the physical medium e.g., the USB controller has a PHY to provide functions
-such as serialization, de-serialization, encoding, decoding and is responsible
-for obtaining the required data transmission rate. Note that some USB
-controllers have PHY functionality embedded into it and others use an external
-PHY. Other peripherals that use PHY include Wireless LAN, Ethernet,
-SATA etc.
-
-The intention of creating this framework is to bring the PHY drivers spread
-all over the Linux kernel to drivers/phy to increase code re-use and for
-better code maintainability.
-
-This framework will be of use only to devices that use external PHY (PHY
-functionality is not embedded within the controller).
-
-Registering/Unregistering the PHY provider
-==========================================
-
-PHY provider refers to an entity that implements one or more PHY instances.
-For the simple case where the PHY provider implements only a single instance of
-the PHY, the framework provides its own implementation of of_xlate in
-of_phy_simple_xlate. If the PHY provider implements multiple instances, it
-should provide its own implementation of of_xlate. of_xlate is used only for
-dt boot case.
-
-::
-
-	#define of_phy_provider_register(dev, xlate)    \
-		__of_phy_provider_register((dev), NULL, THIS_MODULE, (xlate))
-
-	#define devm_of_phy_provider_register(dev, xlate)       \
-		__devm_of_phy_provider_register((dev), NULL, THIS_MODULE,
-						(xlate))
-
-of_phy_provider_register and devm_of_phy_provider_register macros can be used to
-register the phy_provider and it takes device and of_xlate as
-arguments. For the dt boot case, all PHY providers should use one of the above
-2 macros to register the PHY provider.
-
-Often the device tree nodes associated with a PHY provider will contain a set
-of children that each represent a single PHY. Some bindings may nest the child
-nodes within extra levels for context and extensibility, in which case the low
-level of_phy_provider_register_full() and devm_of_phy_provider_register_full()
-macros can be used to override the node containing the children.
-
-::
-
-	#define of_phy_provider_register_full(dev, children, xlate) \
-		__of_phy_provider_register(dev, children, THIS_MODULE, xlate)
-
-	#define devm_of_phy_provider_register_full(dev, children, xlate) \
-		__devm_of_phy_provider_register_full(dev, children,
-						     THIS_MODULE, xlate)
-
-	void devm_of_phy_provider_unregister(struct device *dev,
-		struct phy_provider *phy_provider);
-	void of_phy_provider_unregister(struct phy_provider *phy_provider);
-
-devm_of_phy_provider_unregister and of_phy_provider_unregister can be used to
-unregister the PHY.
-
-Creating the PHY
-================
-
-The PHY driver should create the PHY in order for other peripheral controllers
-to make use of it. The PHY framework provides 2 APIs to create the PHY.
-
-::
-
-	struct phy *phy_create(struct device *dev, struct device_node *node,
-			       const struct phy_ops *ops);
-	struct phy *devm_phy_create(struct device *dev,
-				    struct device_node *node,
-				    const struct phy_ops *ops);
-
-The PHY drivers can use one of the above 2 APIs to create the PHY by passing
-the device pointer and phy ops.
-phy_ops is a set of function pointers for performing PHY operations such as
-init, exit, power_on and power_off.
-
-Inorder to dereference the private data (in phy_ops), the phy provider driver
-can use phy_set_drvdata() after creating the PHY and use phy_get_drvdata() in
-phy_ops to get back the private data.
-
-4. Getting a reference to the PHY
-
-Before the controller can make use of the PHY, it has to get a reference to
-it. This framework provides the following APIs to get a reference to the PHY.
-
-::
-
-	struct phy *phy_get(struct device *dev, const char *string);
-	struct phy *phy_optional_get(struct device *dev, const char *string);
-	struct phy *devm_phy_get(struct device *dev, const char *string);
-	struct phy *devm_phy_optional_get(struct device *dev,
-					  const char *string);
-	struct phy *devm_of_phy_get_by_index(struct device *dev,
-					     struct device_node *np,
-					     int index);
-
-phy_get, phy_optional_get, devm_phy_get and devm_phy_optional_get can
-be used to get the PHY. In the case of dt boot, the string arguments
-should contain the phy name as given in the dt data and in the case of
-non-dt boot, it should contain the label of the PHY.  The two
-devm_phy_get associates the device with the PHY using devres on
-successful PHY get. On driver detach, release function is invoked on
-the devres data and devres data is freed. phy_optional_get and
-devm_phy_optional_get should be used when the phy is optional. These
-two functions will never return -ENODEV, but instead returns NULL when
-the phy cannot be found.Some generic drivers, such as ehci, may use multiple
-phys and for such drivers referencing phy(s) by name(s) does not make sense. In
-this case, devm_of_phy_get_by_index can be used to get a phy reference based on
-the index.
-
-It should be noted that NULL is a valid phy reference. All phy
-consumer calls on the NULL phy become NOPs. That is the release calls,
-the phy_init() and phy_exit() calls, and phy_power_on() and
-phy_power_off() calls are all NOP when applied to a NULL phy. The NULL
-phy is useful in devices for handling optional phy devices.
-
-Releasing a reference to the PHY
-================================
-
-When the controller no longer needs the PHY, it has to release the reference
-to the PHY it has obtained using the APIs mentioned in the above section. The
-PHY framework provides 2 APIs to release a reference to the PHY.
-
-::
-
-	void phy_put(struct phy *phy);
-	void devm_phy_put(struct device *dev, struct phy *phy);
-
-Both these APIs are used to release a reference to the PHY and devm_phy_put
-destroys the devres associated with this PHY.
-
-Destroying the PHY
-==================
-
-When the driver that created the PHY is unloaded, it should destroy the PHY it
-created using one of the following 2 APIs::
-
-	void phy_destroy(struct phy *phy);
-	void devm_phy_destroy(struct device *dev, struct phy *phy);
-
-Both these APIs destroy the PHY and devm_phy_destroy destroys the devres
-associated with this PHY.
-
-PM Runtime
-==========
-
-This subsystem is pm runtime enabled. So while creating the PHY,
-pm_runtime_enable of the phy device created by this subsystem is called and
-while destroying the PHY, pm_runtime_disable is called. Note that the phy
-device created by this subsystem will be a child of the device that calls
-phy_create (PHY provider device).
-
-So pm_runtime_get_sync of the phy_device created by this subsystem will invoke
-pm_runtime_get_sync of PHY provider device because of parent-child relationship.
-It should also be noted that phy_power_on and phy_power_off performs
-phy_pm_runtime_get_sync and phy_pm_runtime_put respectively.
-There are exported APIs like phy_pm_runtime_get, phy_pm_runtime_get_sync,
-phy_pm_runtime_put, phy_pm_runtime_put_sync, phy_pm_runtime_allow and
-phy_pm_runtime_forbid for performing PM operations.
-
-PHY Mappings
-============
-
-In order to get reference to a PHY without help from DeviceTree, the framework
-offers lookups which can be compared to clkdev that allow clk structures to be
-bound to devices. A lookup can be made be made during runtime when a handle to
-the struct phy already exists.
-
-The framework offers the following API for registering and unregistering the
-lookups::
-
-	int phy_create_lookup(struct phy *phy, const char *con_id,
-			      const char *dev_id);
-	void phy_remove_lookup(struct phy *phy, const char *con_id,
-			       const char *dev_id);
-
-DeviceTree Binding
-==================
-
-The documentation for PHY dt binding can be found @
-Documentation/devicetree/bindings/phy/phy-bindings.txt
diff --git a/Documentation/phy/samsung-usb2.txt b/Documentation/phy/samsung-usb2.txt
deleted file mode 100644
index ed12d437189d..000000000000
--- a/Documentation/phy/samsung-usb2.txt
+++ /dev/null
@@ -1,135 +0,0 @@
-.------------------------------------------------------------------------------+
-|			Samsung USB 2.0 PHY adaptation layer		       |
-+-----------------------------------------------------------------------------+'
-
-| 1. Description
-+----------------
-
-The architecture of the USB 2.0 PHY module in Samsung SoCs is similar
-among many SoCs. In spite of the similarities it proved difficult to
-create a one driver that would fit all these PHY controllers. Often
-the differences were minor and were found in particular bits of the
-registers of the PHY. In some rare cases the order of register writes or
-the PHY powering up process had to be altered. This adaptation layer is
-a compromise between having separate drivers and having a single driver
-with added support for many special cases.
-
-| 2. Files description
-+----------------------
-
-- phy-samsung-usb2.c
-   This is the main file of the adaptation layer. This file contains
-   the probe function and provides two callbacks to the Generic PHY
-   Framework. This two callbacks are used to power on and power off the
-   phy. They carry out the common work that has to be done on all version
-   of the PHY module. Depending on which SoC was chosen they execute SoC
-   specific callbacks. The specific SoC version is selected by choosing
-   the appropriate compatible string. In addition, this file contains
-   struct of_device_id definitions for particular SoCs.
-
-- phy-samsung-usb2.h
-   This is the include file. It declares the structures used by this
-   driver. In addition it should contain extern declarations for
-   structures that describe particular SoCs.
-
-| 3. Supporting SoCs
-+--------------------
-
-To support a new SoC a new file should be added to the drivers/phy
-directory. Each SoC's configuration is stored in an instance of the
-struct samsung_usb2_phy_config.
-
-struct samsung_usb2_phy_config {
-	const struct samsung_usb2_common_phy *phys;
-	int (*rate_to_clk)(unsigned long, u32 *);
-	unsigned int num_phys;
-	bool has_mode_switch;
-};
-
-The num_phys is the number of phys handled by the driver. *phys is an
-array that contains the configuration for each phy. The has_mode_switch
-property is a boolean flag that determines whether the SoC has USB host
-and device on a single pair of pins. If so, a special register has to
-be modified to change the internal routing of these pins between a USB
-device or host module.
-
-For example the configuration for Exynos 4210 is following:
-
-const struct samsung_usb2_phy_config exynos4210_usb2_phy_config = {
-	.has_mode_switch        = 0,
-	.num_phys		= EXYNOS4210_NUM_PHYS,
-	.phys			= exynos4210_phys,
-	.rate_to_clk		= exynos4210_rate_to_clk,
-}
-
-- int (*rate_to_clk)(unsigned long, u32 *)
-	The rate_to_clk callback is to convert the rate of the clock
-	used as the reference clock for the PHY module to the value
-	that should be written in the hardware register.
-
-The exynos4210_phys configuration array is as follows:
-
-static const struct samsung_usb2_common_phy exynos4210_phys[] = {
-	{
-		.label		= "device",
-		.id		= EXYNOS4210_DEVICE,
-		.power_on	= exynos4210_power_on,
-		.power_off	= exynos4210_power_off,
-	},
-	{
-		.label		= "host",
-		.id		= EXYNOS4210_HOST,
-		.power_on	= exynos4210_power_on,
-		.power_off	= exynos4210_power_off,
-	},
-	{
-		.label		= "hsic0",
-		.id		= EXYNOS4210_HSIC0,
-		.power_on	= exynos4210_power_on,
-		.power_off	= exynos4210_power_off,
-	},
-	{
-		.label		= "hsic1",
-		.id		= EXYNOS4210_HSIC1,
-		.power_on	= exynos4210_power_on,
-		.power_off	= exynos4210_power_off,
-	},
-	{},
-};
-
-- int (*power_on)(struct samsung_usb2_phy_instance *);
-- int (*power_off)(struct samsung_usb2_phy_instance *);
-	These two callbacks are used to power on and power off the phy
-	by modifying appropriate registers.
-
-Final change to the driver is adding appropriate compatible value to the
-phy-samsung-usb2.c file. In case of Exynos 4210 the following lines were
-added to the struct of_device_id samsung_usb2_phy_of_match[] array:
-
-#ifdef CONFIG_PHY_EXYNOS4210_USB2
-	{
-		.compatible = "samsung,exynos4210-usb2-phy",
-		.data = &exynos4210_usb2_phy_config,
-	},
-#endif
-
-To add further flexibility to the driver the Kconfig file enables to
-include support for selected SoCs in the compiled driver. The Kconfig
-entry for Exynos 4210 is following:
-
-config PHY_EXYNOS4210_USB2
-	bool "Support for Exynos 4210"
-	depends on PHY_SAMSUNG_USB2
-	depends on CPU_EXYNOS4210
-	help
-	  Enable USB PHY support for Exynos 4210. This option requires that
-	  Samsung USB 2.0 PHY driver is enabled and means that support for this
-	  particular SoC is compiled in the driver. In case of Exynos 4210 four
-	  phys are available - device, host, HSCI0 and HSCI1.
-
-The newly created file that supports the new SoC has to be also added to the
-Makefile. In case of Exynos 4210 the added line is following:
-
-obj-$(CONFIG_PHY_EXYNOS4210_USB2)       += phy-exynos4210-usb2.o
-
-After completing these steps the support for the new SoC should be ready.
diff --git a/Documentation/pi-futex.txt b/Documentation/pi-futex.txt
index b154f6c0c36e..c33ba2befbf8 100644
--- a/Documentation/pi-futex.txt
+++ b/Documentation/pi-futex.txt
@@ -119,4 +119,4 @@ properties of futexes, and all four combinations are possible: futex,
 robust-futex, PI-futex, robust+PI-futex.
 
 More details about priority inheritance can be found in
-Documentation/locking/rt-mutex.txt.
+Documentation/locking/rt-mutex.rst.
diff --git a/Documentation/platform/x86-laptop-drivers.txt b/Documentation/platform/x86-laptop-drivers.txt
deleted file mode 100644
index 01facd2590bb..000000000000
--- a/Documentation/platform/x86-laptop-drivers.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-compal-laptop
-=============
-List of supported hardware:
-
-by Compal:
-	Compal FL90/IFL90
-	Compal FL91/IFL91
-	Compal FL92/JFL92
-	Compal FT00/IFT00
-
-by Dell:
-	Dell Vostro 1200
-	Dell Mini 9 (Inspiron 910)
-	Dell Mini 10 (Inspiron 1010)
-	Dell Mini 10v (Inspiron 1011)
-	Dell Mini 1012 (Inspiron 1012)
-	Dell Inspiron 11z (Inspiron 1110)
-	Dell Mini 12 (Inspiron 1210)
diff --git a/Documentation/power/apm-acpi.rst b/Documentation/power/apm-acpi.rst
new file mode 100644
index 000000000000..5b90d947126d
--- /dev/null
+++ b/Documentation/power/apm-acpi.rst
@@ -0,0 +1,36 @@
+============
+APM or ACPI?
+============
+
+If you have a relatively recent x86 mobile, desktop, or server system,
+odds are it supports either Advanced Power Management (APM) or
+Advanced Configuration and Power Interface (ACPI).  ACPI is the newer
+of the two technologies and puts power management in the hands of the
+operating system, allowing for more intelligent power management than
+is possible with BIOS controlled APM.
+
+The best way to determine which, if either, your system supports is to
+build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is
+enabled by default).  If a working ACPI implementation is found, the
+ACPI driver will override and disable APM, otherwise the APM driver
+will be used.
+
+No, sorry, you cannot have both ACPI and APM enabled and running at
+once.  Some people with broken ACPI or broken APM implementations
+would like to use both to get a full set of working features, but you
+simply cannot mix and match the two.  Only one power management
+interface can be in control of the machine at once.  Think about it..
+
+User-space Daemons
+------------------
+Both APM and ACPI rely on user-space daemons, apmd and acpid
+respectively, to be completely functional.  Obtain both of these
+daemons from your Linux distribution or from the Internet (see below)
+and be sure that they are started sometime in the system boot process.
+Go ahead and start both.  If ACPI or APM is not available on your
+system the associated daemon will exit gracefully.
+
+  =====  =======================================
+  apmd   http://ftp.debian.org/pool/main/a/apmd/
+  acpid  http://acpid.sf.net/
+  =====  =======================================
diff --git a/Documentation/power/apm-acpi.txt b/Documentation/power/apm-acpi.txt
deleted file mode 100644
index 6cc423d3662e..000000000000
--- a/Documentation/power/apm-acpi.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-APM or ACPI?
-------------
-If you have a relatively recent x86 mobile, desktop, or server system,
-odds are it supports either Advanced Power Management (APM) or
-Advanced Configuration and Power Interface (ACPI).  ACPI is the newer
-of the two technologies and puts power management in the hands of the
-operating system, allowing for more intelligent power management than
-is possible with BIOS controlled APM.
-
-The best way to determine which, if either, your system supports is to
-build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is
-enabled by default).  If a working ACPI implementation is found, the
-ACPI driver will override and disable APM, otherwise the APM driver
-will be used.
-
-No, sorry, you cannot have both ACPI and APM enabled and running at
-once.  Some people with broken ACPI or broken APM implementations
-would like to use both to get a full set of working features, but you
-simply cannot mix and match the two.  Only one power management
-interface can be in control of the machine at once.  Think about it..
-
-User-space Daemons
-------------------
-Both APM and ACPI rely on user-space daemons, apmd and acpid
-respectively, to be completely functional.  Obtain both of these
-daemons from your Linux distribution or from the Internet (see below)
-and be sure that they are started sometime in the system boot process.
-Go ahead and start both.  If ACPI or APM is not available on your
-system the associated daemon will exit gracefully.
-
-  apmd:   http://ftp.debian.org/pool/main/a/apmd/
-  acpid:  http://acpid.sf.net/
diff --git a/Documentation/power/basic-pm-debugging.rst b/Documentation/power/basic-pm-debugging.rst
new file mode 100644
index 000000000000..69862e759c30
--- /dev/null
+++ b/Documentation/power/basic-pm-debugging.rst
@@ -0,0 +1,269 @@
+=================================
+Debugging hibernation and suspend
+=================================
+
+	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+1. Testing hibernation (aka suspend to disk or STD)
+===================================================
+
+To check if hibernation works, you can try to hibernate in the "reboot" mode::
+
+	# echo reboot > /sys/power/disk
+	# echo disk > /sys/power/state
+
+and the system should create a hibernation image, reboot, resume and get back to
+the command prompt where you have started the transition.  If that happens,
+hibernation is most likely to work correctly.  Still, you need to repeat the
+test at least a couple of times in a row for confidence.  [This is necessary,
+because some problems only show up on a second attempt at suspending and
+resuming the system.]  Moreover, hibernating in the "reboot" and "shutdown"
+modes causes the PM core to skip some platform-related callbacks which on ACPI
+systems might be necessary to make hibernation work.  Thus, if your machine
+fails to hibernate or resume in the "reboot" mode, you should try the
+"platform" mode::
+
+	# echo platform > /sys/power/disk
+	# echo disk > /sys/power/state
+
+which is the default and recommended mode of hibernation.
+
+Unfortunately, the "platform" mode of hibernation does not work on some systems
+with broken BIOSes.  In such cases the "shutdown" mode of hibernation might
+work::
+
+	# echo shutdown > /sys/power/disk
+	# echo disk > /sys/power/state
+
+(it is similar to the "reboot" mode, but it requires you to press the power
+button to make the system resume).
+
+If neither "platform" nor "shutdown" hibernation mode works, you will need to
+identify what goes wrong.
+
+a) Test modes of hibernation
+----------------------------
+
+To find out why hibernation fails on your system, you can use a special testing
+facility available if the kernel is compiled with CONFIG_PM_DEBUG set.  Then,
+there is the file /sys/power/pm_test that can be used to make the hibernation
+core run in a test mode.  There are 5 test modes available:
+
+freezer
+	- test the freezing of processes
+
+devices
+	- test the freezing of processes and suspending of devices
+
+platform
+	- test the freezing of processes, suspending of devices and platform
+	  global control methods [1]_
+
+processors
+	- test the freezing of processes, suspending of devices, platform
+	  global control methods [1]_ and the disabling of nonboot CPUs
+
+core
+	- test the freezing of processes, suspending of devices, platform global
+	  control methods\ [1]_, the disabling of nonboot CPUs and suspending
+	  of platform/system devices
+
+.. [1]
+
+    the platform global control methods are only available on ACPI systems
+    and are only tested if the hibernation mode is set to "platform"
+
+To use one of them it is necessary to write the corresponding string to
+/sys/power/pm_test (eg. "devices" to test the freezing of processes and
+suspending devices) and issue the standard hibernation commands.  For example,
+to use the "devices" test mode along with the "platform" mode of hibernation,
+you should do the following::
+
+	# echo devices > /sys/power/pm_test
+	# echo platform > /sys/power/disk
+	# echo disk > /sys/power/state
+
+Then, the kernel will try to freeze processes, suspend devices, wait a few
+seconds (5 by default, but configurable by the suspend.pm_test_delay module
+parameter), resume devices and thaw processes.  If "platform" is written to
+/sys/power/pm_test , then after suspending devices the kernel will additionally
+invoke the global control methods (eg. ACPI global control methods) used to
+prepare the platform firmware for hibernation.  Next, it will wait a
+configurable number of seconds and invoke the platform (eg. ACPI) global
+methods used to cancel hibernation etc.
+
+Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal
+hibernation/suspend operations.  Also, when open for reading, /sys/power/pm_test
+contains a space-separated list of all available tests (including "none" that
+represents the normal functionality) in which the current test level is
+indicated by square brackets.
+
+Generally, as you can see, each test level is more "invasive" than the previous
+one and the "core" level tests the hardware and drivers as deeply as possible
+without creating a hibernation image.  Obviously, if the "devices" test fails,
+the "platform" test will fail as well and so on.  Thus, as a rule of thumb, you
+should try the test modes starting from "freezer", through "devices", "platform"
+and "processors" up to "core" (repeat the test on each level a couple of times
+to make sure that any random factors are avoided).
+
+If the "freezer" test fails, there is a task that cannot be frozen (in that case
+it usually is possible to identify the offending task by analysing the output of
+dmesg obtained after the failing test).  Failure at this level usually means
+that there is a problem with the tasks freezer subsystem that should be
+reported.
+
+If the "devices" test fails, most likely there is a driver that cannot suspend
+or resume its device (in the latter case the system may hang or become unstable
+after the test, so please take that into consideration).  To find this driver,
+you can carry out a binary search according to the rules:
+
+- if the test fails, unload a half of the drivers currently loaded and repeat
+  (that would probably involve rebooting the system, so always note what drivers
+  have been loaded before the test),
+- if the test succeeds, load a half of the drivers you have unloaded most
+  recently and repeat.
+
+Once you have found the failing driver (there can be more than just one of
+them), you have to unload it every time before hibernation.  In that case please
+make sure to report the problem with the driver.
+
+It is also possible that the "devices" test will still fail after you have
+unloaded all modules. In that case, you may want to look in your kernel
+configuration for the drivers that can be compiled as modules (and test again
+with these drivers compiled as modules).  You may also try to use some special
+kernel command line options such as "noapic", "noacpi" or even "acpi=off".
+
+If the "platform" test fails, there is a problem with the handling of the
+platform (eg. ACPI) firmware on your system.  In that case the "platform" mode
+of hibernation is not likely to work.  You can try the "shutdown" mode, but that
+is rather a poor man's workaround.
+
+If the "processors" test fails, the disabling/enabling of nonboot CPUs does not
+work (of course, this only may be an issue on SMP systems) and the problem
+should be reported.  In that case you can also try to switch the nonboot CPUs
+off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and
+see if that works.
+
+If the "core" test fails, which means that suspending of the system/platform
+devices has failed (these devices are suspended on one CPU with interrupts off),
+the problem is most probably hardware-related and serious, so it should be
+reported.
+
+A failure of any of the "platform", "processors" or "core" tests may cause your
+system to hang or become unstable, so please beware.  Such a failure usually
+indicates a serious problem that very well may be related to the hardware, but
+please report it anyway.
+
+b) Testing minimal configuration
+--------------------------------
+
+If all of the hibernation test modes work, you can boot the system with the
+"init=/bin/bash" command line parameter and attempt to hibernate in the
+"reboot", "shutdown" and "platform" modes.  If that does not work, there
+probably is a problem with a driver statically compiled into the kernel and you
+can try to compile more drivers as modules, so that they can be tested
+individually.  Otherwise, there is a problem with a modular driver and you can
+find it by loading a half of the modules you normally use and binary searching
+in accordance with the algorithm:
+- if there are n modules loaded and the attempt to suspend and resume fails,
+unload n/2 of the modules and try again (that would probably involve rebooting
+the system),
+- if there are n modules loaded and the attempt to suspend and resume succeeds,
+load n/2 modules more and try again.
+
+Again, if you find the offending module(s), it(they) must be unloaded every time
+before hibernation, and please report the problem with it(them).
+
+c) Using the "test_resume" hibernation option
+---------------------------------------------
+
+/sys/power/disk generally tells the kernel what to do after creating a
+hibernation image.  One of the available options is "test_resume" which
+causes the just created image to be used for immediate restoration.  Namely,
+after doing::
+
+	# echo test_resume > /sys/power/disk
+	# echo disk > /sys/power/state
+
+a hibernation image will be created and a resume from it will be triggered
+immediately without involving the platform firmware in any way.
+
+That test can be used to check if failures to resume from hibernation are
+related to bad interactions with the platform firmware.  That is, if the above
+works every time, but resume from actual hibernation does not work or is
+unreliable, the platform firmware may be responsible for the failures.
+
+On architectures and platforms that support using different kernels to restore
+hibernation images (that is, the kernel used to read the image from storage and
+load it into memory is different from the one included in the image) or support
+kernel address space randomization, it also can be used to check if failures
+to resume may be related to the differences between the restore and image
+kernels.
+
+d) Advanced debugging
+---------------------
+
+In case that hibernation does not work on your system even in the minimal
+configuration and compiling more drivers as modules is not practical or some
+modules cannot be unloaded, you can use one of the more advanced debugging
+techniques to find the problem.  First, if there is a serial port in your box,
+you can boot the kernel with the 'no_console_suspend' parameter and try to log
+kernel messages using the serial console.  This may provide you with some
+information about the reasons of the suspend (resume) failure.  Alternatively,
+it may be possible to use a FireWire port for debugging with firescope
+(http://v3.sk/~lkundrak/firescope/).  On x86 it is also possible to
+use the PM_TRACE mechanism documented in Documentation/power/s2ram.rst .
+
+2. Testing suspend to RAM (STR)
+===============================
+
+To verify that the STR works, it is generally more convenient to use the s2ram
+tool available from http://suspend.sf.net and documented at
+http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK).
+
+Namely, after writing "freezer", "devices", "platform", "processors", or "core"
+into /sys/power/pm_test (available if the kernel is compiled with
+CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding
+to given string.  The STR test modes are defined in the same way as for
+hibernation, so please refer to Section 1 for more information about them.  In
+particular, the "core" test allows you to test everything except for the actual
+invocation of the platform firmware in order to put the system into the sleep
+state.
+
+Among other things, the testing with the help of /sys/power/pm_test may allow
+you to identify drivers that fail to suspend or resume their devices.  They
+should be unloaded every time before an STR transition.
+
+Next, you can follow the instructions at S2RAM_LINK to test the system, but if
+it does not work "out of the box", you may need to boot it with
+"init=/bin/bash" and test s2ram in the minimal configuration.  In that case,
+you may be able to search for failing drivers by following the procedure
+analogous to the one described in section 1.  If you find some failing drivers,
+you will have to unload them every time before an STR transition (ie. before
+you run s2ram), and please report the problems with them.
+
+There is a debugfs entry which shows the suspend to RAM statistics. Here is an
+example of its output::
+
+	# mount -t debugfs none /sys/kernel/debug
+	# cat /sys/kernel/debug/suspend_stats
+	success: 20
+	fail: 5
+	failed_freeze: 0
+	failed_prepare: 0
+	failed_suspend: 5
+	failed_suspend_noirq: 0
+	failed_resume: 0
+	failed_resume_noirq: 0
+	failures:
+	  last_failed_dev:	alarm
+				adc
+	  last_failed_errno:	-16
+				-16
+	  last_failed_step:	suspend
+				suspend
+
+Field success means the success number of suspend to RAM, and field fail means
+the failure number. Others are the failure number of different steps of suspend
+to RAM. suspend_stats just lists the last 2 failed devices, error number and
+failed step of suspend.
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt
deleted file mode 100644
index 708f87f78a75..000000000000
--- a/Documentation/power/basic-pm-debugging.txt
+++ /dev/null
@@ -1,254 +0,0 @@
-Debugging hibernation and suspend
-	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-
-1. Testing hibernation (aka suspend to disk or STD)
-
-To check if hibernation works, you can try to hibernate in the "reboot" mode:
-
-# echo reboot > /sys/power/disk
-# echo disk > /sys/power/state
-
-and the system should create a hibernation image, reboot, resume and get back to
-the command prompt where you have started the transition.  If that happens,
-hibernation is most likely to work correctly.  Still, you need to repeat the
-test at least a couple of times in a row for confidence.  [This is necessary,
-because some problems only show up on a second attempt at suspending and
-resuming the system.]  Moreover, hibernating in the "reboot" and "shutdown"
-modes causes the PM core to skip some platform-related callbacks which on ACPI
-systems might be necessary to make hibernation work.  Thus, if your machine fails
-to hibernate or resume in the "reboot" mode, you should try the "platform" mode:
-
-# echo platform > /sys/power/disk
-# echo disk > /sys/power/state
-
-which is the default and recommended mode of hibernation.
-
-Unfortunately, the "platform" mode of hibernation does not work on some systems
-with broken BIOSes.  In such cases the "shutdown" mode of hibernation might
-work:
-
-# echo shutdown > /sys/power/disk
-# echo disk > /sys/power/state
-
-(it is similar to the "reboot" mode, but it requires you to press the power
-button to make the system resume).
-
-If neither "platform" nor "shutdown" hibernation mode works, you will need to
-identify what goes wrong.
-
-a) Test modes of hibernation
-
-To find out why hibernation fails on your system, you can use a special testing
-facility available if the kernel is compiled with CONFIG_PM_DEBUG set.  Then,
-there is the file /sys/power/pm_test that can be used to make the hibernation
-core run in a test mode.  There are 5 test modes available:
-
-freezer
-- test the freezing of processes
-
-devices
-- test the freezing of processes and suspending of devices
-
-platform
-- test the freezing of processes, suspending of devices and platform
-  global control methods(*)
-
-processors
-- test the freezing of processes, suspending of devices, platform
-  global control methods(*) and the disabling of nonboot CPUs
-
-core
-- test the freezing of processes, suspending of devices, platform global
-  control methods(*), the disabling of nonboot CPUs and suspending of
-  platform/system devices
-
-(*) the platform global control methods are only available on ACPI systems
-    and are only tested if the hibernation mode is set to "platform"
-
-To use one of them it is necessary to write the corresponding string to
-/sys/power/pm_test (eg. "devices" to test the freezing of processes and
-suspending devices) and issue the standard hibernation commands.  For example,
-to use the "devices" test mode along with the "platform" mode of hibernation,
-you should do the following:
-
-# echo devices > /sys/power/pm_test
-# echo platform > /sys/power/disk
-# echo disk > /sys/power/state
-
-Then, the kernel will try to freeze processes, suspend devices, wait a few
-seconds (5 by default, but configurable by the suspend.pm_test_delay module
-parameter), resume devices and thaw processes.  If "platform" is written to
-/sys/power/pm_test , then after suspending devices the kernel will additionally
-invoke the global control methods (eg. ACPI global control methods) used to
-prepare the platform firmware for hibernation.  Next, it will wait a
-configurable number of seconds and invoke the platform (eg. ACPI) global
-methods used to cancel hibernation etc.
-
-Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal
-hibernation/suspend operations.  Also, when open for reading, /sys/power/pm_test
-contains a space-separated list of all available tests (including "none" that
-represents the normal functionality) in which the current test level is
-indicated by square brackets.
-
-Generally, as you can see, each test level is more "invasive" than the previous
-one and the "core" level tests the hardware and drivers as deeply as possible
-without creating a hibernation image.  Obviously, if the "devices" test fails,
-the "platform" test will fail as well and so on.  Thus, as a rule of thumb, you
-should try the test modes starting from "freezer", through "devices", "platform"
-and "processors" up to "core" (repeat the test on each level a couple of times
-to make sure that any random factors are avoided).
-
-If the "freezer" test fails, there is a task that cannot be frozen (in that case
-it usually is possible to identify the offending task by analysing the output of
-dmesg obtained after the failing test).  Failure at this level usually means
-that there is a problem with the tasks freezer subsystem that should be
-reported.
-
-If the "devices" test fails, most likely there is a driver that cannot suspend
-or resume its device (in the latter case the system may hang or become unstable
-after the test, so please take that into consideration).  To find this driver,
-you can carry out a binary search according to the rules:
-- if the test fails, unload a half of the drivers currently loaded and repeat
-(that would probably involve rebooting the system, so always note what drivers
-have been loaded before the test),
-- if the test succeeds, load a half of the drivers you have unloaded most
-recently and repeat.
-
-Once you have found the failing driver (there can be more than just one of
-them), you have to unload it every time before hibernation.  In that case please
-make sure to report the problem with the driver.
-
-It is also possible that the "devices" test will still fail after you have
-unloaded all modules. In that case, you may want to look in your kernel
-configuration for the drivers that can be compiled as modules (and test again
-with these drivers compiled as modules).  You may also try to use some special
-kernel command line options such as "noapic", "noacpi" or even "acpi=off".
-
-If the "platform" test fails, there is a problem with the handling of the
-platform (eg. ACPI) firmware on your system.  In that case the "platform" mode
-of hibernation is not likely to work.  You can try the "shutdown" mode, but that
-is rather a poor man's workaround.
-
-If the "processors" test fails, the disabling/enabling of nonboot CPUs does not
-work (of course, this only may be an issue on SMP systems) and the problem
-should be reported.  In that case you can also try to switch the nonboot CPUs
-off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and
-see if that works.
-
-If the "core" test fails, which means that suspending of the system/platform
-devices has failed (these devices are suspended on one CPU with interrupts off),
-the problem is most probably hardware-related and serious, so it should be
-reported.
-
-A failure of any of the "platform", "processors" or "core" tests may cause your
-system to hang or become unstable, so please beware.  Such a failure usually
-indicates a serious problem that very well may be related to the hardware, but
-please report it anyway.
-
-b) Testing minimal configuration
-
-If all of the hibernation test modes work, you can boot the system with the
-"init=/bin/bash" command line parameter and attempt to hibernate in the
-"reboot", "shutdown" and "platform" modes.  If that does not work, there
-probably is a problem with a driver statically compiled into the kernel and you
-can try to compile more drivers as modules, so that they can be tested
-individually.  Otherwise, there is a problem with a modular driver and you can
-find it by loading a half of the modules you normally use and binary searching
-in accordance with the algorithm:
-- if there are n modules loaded and the attempt to suspend and resume fails,
-unload n/2 of the modules and try again (that would probably involve rebooting
-the system),
-- if there are n modules loaded and the attempt to suspend and resume succeeds,
-load n/2 modules more and try again.
-
-Again, if you find the offending module(s), it(they) must be unloaded every time
-before hibernation, and please report the problem with it(them).
-
-c) Using the "test_resume" hibernation option
-
-/sys/power/disk generally tells the kernel what to do after creating a
-hibernation image.  One of the available options is "test_resume" which
-causes the just created image to be used for immediate restoration.  Namely,
-after doing:
-
-# echo test_resume > /sys/power/disk
-# echo disk > /sys/power/state
-
-a hibernation image will be created and a resume from it will be triggered
-immediately without involving the platform firmware in any way.
-
-That test can be used to check if failures to resume from hibernation are
-related to bad interactions with the platform firmware.  That is, if the above
-works every time, but resume from actual hibernation does not work or is
-unreliable, the platform firmware may be responsible for the failures.
-
-On architectures and platforms that support using different kernels to restore
-hibernation images (that is, the kernel used to read the image from storage and
-load it into memory is different from the one included in the image) or support
-kernel address space randomization, it also can be used to check if failures
-to resume may be related to the differences between the restore and image
-kernels.
-
-d) Advanced debugging
-
-In case that hibernation does not work on your system even in the minimal
-configuration and compiling more drivers as modules is not practical or some
-modules cannot be unloaded, you can use one of the more advanced debugging
-techniques to find the problem.  First, if there is a serial port in your box,
-you can boot the kernel with the 'no_console_suspend' parameter and try to log
-kernel messages using the serial console.  This may provide you with some
-information about the reasons of the suspend (resume) failure.  Alternatively,
-it may be possible to use a FireWire port for debugging with firescope
-(http://v3.sk/~lkundrak/firescope/).  On x86 it is also possible to
-use the PM_TRACE mechanism documented in Documentation/power/s2ram.txt .
-
-2. Testing suspend to RAM (STR)
-
-To verify that the STR works, it is generally more convenient to use the s2ram
-tool available from http://suspend.sf.net and documented at
-http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK).
-
-Namely, after writing "freezer", "devices", "platform", "processors", or "core"
-into /sys/power/pm_test (available if the kernel is compiled with
-CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding
-to given string.  The STR test modes are defined in the same way as for
-hibernation, so please refer to Section 1 for more information about them.  In
-particular, the "core" test allows you to test everything except for the actual
-invocation of the platform firmware in order to put the system into the sleep
-state.
-
-Among other things, the testing with the help of /sys/power/pm_test may allow
-you to identify drivers that fail to suspend or resume their devices.  They
-should be unloaded every time before an STR transition.
-
-Next, you can follow the instructions at S2RAM_LINK to test the system, but if
-it does not work "out of the box", you may need to boot it with
-"init=/bin/bash" and test s2ram in the minimal configuration.  In that case,
-you may be able to search for failing drivers by following the procedure
-analogous to the one described in section 1.  If you find some failing drivers,
-you will have to unload them every time before an STR transition (ie. before
-you run s2ram), and please report the problems with them.
-
-There is a debugfs entry which shows the suspend to RAM statistics. Here is an
-example of its output.
-	# mount -t debugfs none /sys/kernel/debug
-	# cat /sys/kernel/debug/suspend_stats
-	success: 20
-	fail: 5
-	failed_freeze: 0
-	failed_prepare: 0
-	failed_suspend: 5
-	failed_suspend_noirq: 0
-	failed_resume: 0
-	failed_resume_noirq: 0
-	failures:
-	  last_failed_dev:	alarm
-				adc
-	  last_failed_errno:	-16
-				-16
-	  last_failed_step:	suspend
-				suspend
-Field success means the success number of suspend to RAM, and field fail means
-the failure number. Others are the failure number of different steps of suspend
-to RAM. suspend_stats just lists the last 2 failed devices, error number and
-failed step of suspend.
diff --git a/Documentation/power/charger-manager.rst b/Documentation/power/charger-manager.rst
new file mode 100644
index 000000000000..84fab9376792
--- /dev/null
+++ b/Documentation/power/charger-manager.rst
@@ -0,0 +1,205 @@
+===============
+Charger Manager
+===============
+
+	(C) 2011 MyungJoo Ham <myungjoo.ham@samsung.com>, GPL
+
+Charger Manager provides in-kernel battery charger management that
+requires temperature monitoring during suspend-to-RAM state
+and where each battery may have multiple chargers attached and the userland
+wants to look at the aggregated information of the multiple chargers.
+
+Charger Manager is a platform_driver with power-supply-class entries.
+An instance of Charger Manager (a platform-device created with Charger-Manager)
+represents an independent battery with chargers. If there are multiple
+batteries with their own chargers acting independently in a system,
+the system may need multiple instances of Charger Manager.
+
+1. Introduction
+===============
+
+Charger Manager supports the following:
+
+* Support for multiple chargers (e.g., a device with USB, AC, and solar panels)
+	A system may have multiple chargers (or power sources) and some of
+	they may be activated at the same time. Each charger may have its
+	own power-supply-class and each power-supply-class can provide
+	different information about the battery status. This framework
+	aggregates charger-related information from multiple sources and
+	shows combined information as a single power-supply-class.
+
+* Support for in suspend-to-RAM polling (with suspend_again callback)
+	While the battery is being charged and the system is in suspend-to-RAM,
+	we may need to monitor the battery health by looking at the ambient or
+	battery temperature. We can accomplish this by waking up the system
+	periodically. However, such a method wakes up devices unnecessarily for
+	monitoring the battery health and tasks, and user processes that are
+	supposed to be kept suspended. That, in turn, incurs unnecessary power
+	consumption and slow down charging process. Or even, such peak power
+	consumption can stop chargers in the middle of charging
+	(external power input < device power consumption), which not
+	only affects the charging time, but the lifespan of the battery.
+
+	Charger Manager provides a function "cm_suspend_again" that can be
+	used as suspend_again callback of platform_suspend_ops. If the platform
+	requires tasks other than cm_suspend_again, it may implement its own
+	suspend_again callback that calls cm_suspend_again in the middle.
+	Normally, the platform will need to resume and suspend some devices
+	that are used by Charger Manager.
+
+* Support for premature full-battery event handling
+	If the battery voltage drops by "fullbatt_vchkdrop_uV" after
+	"fullbatt_vchkdrop_ms" from the full-battery event, the framework
+	restarts charging. This check is also performed while suspended by
+	setting wakeup time accordingly and using suspend_again.
+
+* Support for uevent-notify
+	With the charger-related events, the device sends
+	notification to users with UEVENT.
+
+2. Global Charger-Manager Data related with suspend_again
+=========================================================
+In order to setup Charger Manager with suspend-again feature
+(in-suspend monitoring), the user should provide charger_global_desc
+with setup_charger_manager(`struct charger_global_desc *`).
+This charger_global_desc data for in-suspend monitoring is global
+as the name suggests. Thus, the user needs to provide only once even
+if there are multiple batteries. If there are multiple batteries, the
+multiple instances of Charger Manager share the same charger_global_desc
+and it will manage in-suspend monitoring for all instances of Charger Manager.
+
+The user needs to provide all the three entries to `struct charger_global_desc`
+properly in order to activate in-suspend monitoring:
+
+`char *rtc_name;`
+	The name of rtc (e.g., "rtc0") used to wakeup the system from
+	suspend for Charger Manager. The alarm interrupt (AIE) of the rtc
+	should be able to wake up the system from suspend. Charger Manager
+	saves and restores the alarm value and use the previously-defined
+	alarm if it is going to go off earlier than Charger Manager so that
+	Charger Manager does not interfere with previously-defined alarms.
+
+`bool (*rtc_only_wakeup)(void);`
+	This callback should let CM know whether
+	the wakeup-from-suspend is caused only by the alarm of "rtc" in the
+	same struct. If there is any other wakeup source triggered the
+	wakeup, it should return false. If the "rtc" is the only wakeup
+	reason, it should return true.
+
+`bool assume_timer_stops_in_suspend;`
+	if true, Charger Manager assumes that
+	the timer (CM uses jiffies as timer) stops during suspend. Then, CM
+	assumes that the suspend-duration is same as the alarm length.
+
+
+3. How to setup suspend_again
+=============================
+Charger Manager provides a function "extern bool cm_suspend_again(void)".
+When cm_suspend_again is called, it monitors every battery. The suspend_ops
+callback of the system's platform_suspend_ops can call cm_suspend_again
+function to know whether Charger Manager wants to suspend again or not.
+If there are no other devices or tasks that want to use suspend_again
+feature, the platform_suspend_ops may directly refer to cm_suspend_again
+for its suspend_again callback.
+
+The cm_suspend_again() returns true (meaning "I want to suspend again")
+if the system was woken up by Charger Manager and the polling
+(in-suspend monitoring) results in "normal".
+
+4. Charger-Manager Data (struct charger_desc)
+=============================================
+For each battery charged independently from other batteries (if a series of
+batteries are charged by a single charger, they are counted as one independent
+battery), an instance of Charger Manager is attached to it. The following
+
+struct charger_desc elements:
+
+`char *psy_name;`
+	The power-supply-class name of the battery. Default is
+	"battery" if psy_name is NULL. Users can access the psy entries
+	at "/sys/class/power_supply/[psy_name]/".
+
+`enum polling_modes polling_mode;`
+	  CM_POLL_DISABLE:
+		do not poll this battery.
+	  CM_POLL_ALWAYS:
+		always poll this battery.
+	  CM_POLL_EXTERNAL_POWER_ONLY:
+		poll this battery if and only if an external power
+		source is attached.
+	  CM_POLL_CHARGING_ONLY:
+		poll this battery if and only if the battery is being charged.
+
+`unsigned int fullbatt_vchkdrop_ms; / unsigned int fullbatt_vchkdrop_uV;`
+	If both have non-zero values, Charger Manager will check the
+	battery voltage drop fullbatt_vchkdrop_ms after the battery is fully
+	charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger
+	Manager will try to recharge the battery by disabling and enabling
+	chargers. Recharge with voltage drop condition only (without delay
+	condition) is needed to be implemented with hardware interrupts from
+	fuel gauges or charger devices/chips.
+
+`unsigned int fullbatt_uV;`
+	If specified with a non-zero value, Charger Manager assumes
+	that the battery is full (capacity = 100) if the battery is not being
+	charged and the battery voltage is equal to or greater than
+	fullbatt_uV.
+
+`unsigned int polling_interval_ms;`
+	Required polling interval in ms. Charger Manager will poll
+	this battery every polling_interval_ms or more frequently.
+
+`enum data_source battery_present;`
+	CM_BATTERY_PRESENT:
+		assume that the battery exists.
+	CM_NO_BATTERY:
+		assume that the battery does not exists.
+	CM_FUEL_GAUGE:
+		get battery presence information from fuel gauge.
+	CM_CHARGER_STAT:
+		get battery presence from chargers.
+
+`char **psy_charger_stat;`
+	An array ending with NULL that has power-supply-class names of
+	chargers. Each power-supply-class should provide "PRESENT" (if
+	battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an
+	external power source is attached or not), and "STATUS" (shows whether
+	the battery is {"FULL" or not FULL} or {"FULL", "Charging",
+	"Discharging", "NotCharging"}).
+
+`int num_charger_regulators; / struct regulator_bulk_data *charger_regulators;`
+	Regulators representing the chargers in the form for
+	regulator framework's bulk functions.
+
+`char *psy_fuel_gauge;`
+	Power-supply-class name of the fuel gauge.
+
+`int (*temperature_out_of_range)(int *mC); / bool measure_battery_temp;`
+	This callback returns 0 if the temperature is safe for charging,
+	a positive number if it is too hot to charge, and a negative number
+	if it is too cold to charge. With the variable mC, the callback returns
+	the temperature in 1/1000 of centigrade.
+	The source of temperature can be battery or ambient one according to
+	the value of measure_battery_temp.
+
+
+5. Notify Charger-Manager of charger events: cm_notify_event()
+==============================================================
+If there is an charger event is required to notify
+Charger Manager, a charger device driver that triggers the event can call
+cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager.
+In the function, psy is the charger driver's power_supply pointer, which is
+associated with Charger-Manager. The parameter "type"
+is the same as irq's type (enum cm_event_types). The event message "msg" is
+optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS".
+
+6. Other Considerations
+=======================
+
+At the charger/battery-related events such as battery-pulled-out,
+charger-pulled-out, charger-inserted, DCIN-over/under-voltage, charger-stopped,
+and others critical to chargers, the system should be configured to wake up.
+At least the following should wake up the system from a suspend:
+a) charger-on/off b) external-power-in/out c) battery-in/out (while charging)
+
+It is usually accomplished by configuring the PMIC as a wakeup source.
diff --git a/Documentation/power/charger-manager.txt b/Documentation/power/charger-manager.txt
deleted file mode 100644
index 9ff1105e58d6..000000000000
--- a/Documentation/power/charger-manager.txt
+++ /dev/null
@@ -1,200 +0,0 @@
-Charger Manager
-	(C) 2011 MyungJoo Ham <myungjoo.ham@samsung.com>, GPL
-
-Charger Manager provides in-kernel battery charger management that
-requires temperature monitoring during suspend-to-RAM state
-and where each battery may have multiple chargers attached and the userland
-wants to look at the aggregated information of the multiple chargers.
-
-Charger Manager is a platform_driver with power-supply-class entries.
-An instance of Charger Manager (a platform-device created with Charger-Manager)
-represents an independent battery with chargers. If there are multiple
-batteries with their own chargers acting independently in a system,
-the system may need multiple instances of Charger Manager.
-
-1. Introduction
-===============
-
-Charger Manager supports the following:
-
-* Support for multiple chargers (e.g., a device with USB, AC, and solar panels)
-	A system may have multiple chargers (or power sources) and some of
-	they may be activated at the same time. Each charger may have its
-	own power-supply-class and each power-supply-class can provide
-	different information about the battery status. This framework
-	aggregates charger-related information from multiple sources and
-	shows combined information as a single power-supply-class.
-
-* Support for in suspend-to-RAM polling (with suspend_again callback)
-	While the battery is being charged and the system is in suspend-to-RAM,
-	we may need to monitor the battery health by looking at the ambient or
-	battery temperature. We can accomplish this by waking up the system
-	periodically. However, such a method wakes up devices unnecessarily for
-	monitoring the battery health and tasks, and user processes that are
-	supposed to be kept suspended. That, in turn, incurs unnecessary power
-	consumption and slow down charging process. Or even, such peak power
-	consumption can stop chargers in the middle of charging
-	(external power input < device power consumption), which not
-	only affects the charging time, but the lifespan of the battery.
-
-	Charger Manager provides a function "cm_suspend_again" that can be
-	used as suspend_again callback of platform_suspend_ops. If the platform
-	requires tasks other than cm_suspend_again, it may implement its own
-	suspend_again callback that calls cm_suspend_again in the middle.
-	Normally, the platform will need to resume and suspend some devices
-	that are used by Charger Manager.
-
-* Support for premature full-battery event handling
-	If the battery voltage drops by "fullbatt_vchkdrop_uV" after
-	"fullbatt_vchkdrop_ms" from the full-battery event, the framework
-	restarts charging. This check is also performed while suspended by
-	setting wakeup time accordingly and using suspend_again.
-
-* Support for uevent-notify
-	With the charger-related events, the device sends
-	notification to users with UEVENT.
-
-2. Global Charger-Manager Data related with suspend_again
-========================================================
-In order to setup Charger Manager with suspend-again feature
-(in-suspend monitoring), the user should provide charger_global_desc
-with setup_charger_manager(struct charger_global_desc *).
-This charger_global_desc data for in-suspend monitoring is global
-as the name suggests. Thus, the user needs to provide only once even
-if there are multiple batteries. If there are multiple batteries, the
-multiple instances of Charger Manager share the same charger_global_desc
-and it will manage in-suspend monitoring for all instances of Charger Manager.
-
-The user needs to provide all the three entries properly in order to activate
-in-suspend monitoring:
-
-struct charger_global_desc {
-
-char *rtc_name;
-	: The name of rtc (e.g., "rtc0") used to wakeup the system from
-	suspend for Charger Manager. The alarm interrupt (AIE) of the rtc
-	should be able to wake up the system from suspend. Charger Manager
-	saves and restores the alarm value and use the previously-defined
-	alarm if it is going to go off earlier than Charger Manager so that
-	Charger Manager does not interfere with previously-defined alarms.
-
-bool (*rtc_only_wakeup)(void);
-	: This callback should let CM know whether
-	the wakeup-from-suspend is caused only by the alarm of "rtc" in the
-	same struct. If there is any other wakeup source triggered the
-	wakeup, it should return false. If the "rtc" is the only wakeup
-	reason, it should return true.
-
-bool assume_timer_stops_in_suspend;
-	: if true, Charger Manager assumes that
-	the timer (CM uses jiffies as timer) stops during suspend. Then, CM
-	assumes that the suspend-duration is same as the alarm length.
-};
-
-3. How to setup suspend_again
-=============================
-Charger Manager provides a function "extern bool cm_suspend_again(void)".
-When cm_suspend_again is called, it monitors every battery. The suspend_ops
-callback of the system's platform_suspend_ops can call cm_suspend_again
-function to know whether Charger Manager wants to suspend again or not.
-If there are no other devices or tasks that want to use suspend_again
-feature, the platform_suspend_ops may directly refer to cm_suspend_again
-for its suspend_again callback.
-
-The cm_suspend_again() returns true (meaning "I want to suspend again")
-if the system was woken up by Charger Manager and the polling
-(in-suspend monitoring) results in "normal".
-
-4. Charger-Manager Data (struct charger_desc)
-=============================================
-For each battery charged independently from other batteries (if a series of
-batteries are charged by a single charger, they are counted as one independent
-battery), an instance of Charger Manager is attached to it.
-
-struct charger_desc {
-
-char *psy_name;
-	: The power-supply-class name of the battery. Default is
-	"battery" if psy_name is NULL. Users can access the psy entries
-	at "/sys/class/power_supply/[psy_name]/".
-
-enum polling_modes polling_mode;
-	: CM_POLL_DISABLE: do not poll this battery.
-	  CM_POLL_ALWAYS: always poll this battery.
-	  CM_POLL_EXTERNAL_POWER_ONLY: poll this battery if and only if
-				       an external power source is attached.
-	  CM_POLL_CHARGING_ONLY: poll this battery if and only if the
-				 battery is being charged.
-
-unsigned int fullbatt_vchkdrop_ms;
-unsigned int fullbatt_vchkdrop_uV;
-	: If both have non-zero values, Charger Manager will check the
-	battery voltage drop fullbatt_vchkdrop_ms after the battery is fully
-	charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger
-	Manager will try to recharge the battery by disabling and enabling
-	chargers. Recharge with voltage drop condition only (without delay
-	condition) is needed to be implemented with hardware interrupts from
-	fuel gauges or charger devices/chips.
-
-unsigned int fullbatt_uV;
-	: If specified with a non-zero value, Charger Manager assumes
-	that the battery is full (capacity = 100) if the battery is not being
-	charged and the battery voltage is equal to or greater than
-	fullbatt_uV.
-
-unsigned int polling_interval_ms;
-	: Required polling interval in ms. Charger Manager will poll
-	this battery every polling_interval_ms or more frequently.
-
-enum data_source battery_present;
-	: CM_BATTERY_PRESENT: assume that the battery exists.
-	CM_NO_BATTERY: assume that the battery does not exists.
-	CM_FUEL_GAUGE: get battery presence information from fuel gauge.
-	CM_CHARGER_STAT: get battery presence from chargers.
-
-char **psy_charger_stat;
-	: An array ending with NULL that has power-supply-class names of
-	chargers. Each power-supply-class should provide "PRESENT" (if
-	battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an
-	external power source is attached or not), and "STATUS" (shows whether
-	the battery is {"FULL" or not FULL} or {"FULL", "Charging",
-	"Discharging", "NotCharging"}).
-
-int num_charger_regulators;
-struct regulator_bulk_data *charger_regulators;
-	: Regulators representing the chargers in the form for
-	regulator framework's bulk functions.
-
-char *psy_fuel_gauge;
-	: Power-supply-class name of the fuel gauge.
-
-int (*temperature_out_of_range)(int *mC);
-bool measure_battery_temp;
-	: This callback returns 0 if the temperature is safe for charging,
-	a positive number if it is too hot to charge, and a negative number
-	if it is too cold to charge. With the variable mC, the callback returns
-	the temperature in 1/1000 of centigrade.
-	The source of temperature can be battery or ambient one according to
-	the value of measure_battery_temp.
-};
-
-5. Notify Charger-Manager of charger events: cm_notify_event()
-=========================================================
-If there is an charger event is required to notify
-Charger Manager, a charger device driver that triggers the event can call
-cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager.
-In the function, psy is the charger driver's power_supply pointer, which is
-associated with Charger-Manager. The parameter "type"
-is the same as irq's type (enum cm_event_types). The event message "msg" is
-optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS".
-
-6. Other Considerations
-=======================
-
-At the charger/battery-related events such as battery-pulled-out,
-charger-pulled-out, charger-inserted, DCIN-over/under-voltage, charger-stopped,
-and others critical to chargers, the system should be configured to wake up.
-At least the following should wake up the system from a suspend:
-a) charger-on/off b) external-power-in/out c) battery-in/out (while charging)
-
-It is usually accomplished by configuring the PMIC as a wakeup source.
diff --git a/Documentation/power/drivers-testing.rst b/Documentation/power/drivers-testing.rst
new file mode 100644
index 000000000000..e53f1999fc39
--- /dev/null
+++ b/Documentation/power/drivers-testing.rst
@@ -0,0 +1,51 @@
+====================================================
+Testing suspend and resume support in device drivers
+====================================================
+
+	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+1. Preparing the test system
+============================
+
+Unfortunately, to effectively test the support for the system-wide suspend and
+resume transitions in a driver, it is necessary to suspend and resume a fully
+functional system with this driver loaded.  Moreover, that should be done
+several times, preferably several times in a row, and separately for hibernation
+(aka suspend to disk or STD) and suspend to RAM (STR), because each of these
+cases involves slightly different operations and different interactions with
+the machine's BIOS.
+
+Of course, for this purpose the test system has to be known to suspend and
+resume without the driver being tested.  Thus, if possible, you should first
+resolve all suspend/resume-related problems in the test system before you start
+testing the new driver.  Please see Documentation/power/basic-pm-debugging.rst
+for more information about the debugging of suspend/resume functionality.
+
+2. Testing the driver
+=====================
+
+Once you have resolved the suspend/resume-related problems with your test system
+without the new driver, you are ready to test it:
+
+a) Build the driver as a module, load it and try the test modes of hibernation
+   (see: Documentation/power/basic-pm-debugging.rst, 1).
+
+b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and
+   "platform" modes (see: Documentation/power/basic-pm-debugging.rst, 1).
+
+c) Compile the driver directly into the kernel and try the test modes of
+   hibernation.
+
+d) Attempt to hibernate with the driver compiled directly into the kernel
+   in the "reboot", "shutdown" and "platform" modes.
+
+e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.rst,
+   2).  [As far as the STR tests are concerned, it should not matter whether or
+   not the driver is built as a module.]
+
+f) Attempt to suspend to RAM using the s2ram tool with the driver loaded
+   (see: Documentation/power/basic-pm-debugging.rst, 2).
+
+Each of the above tests should be repeated several times and the STD tests
+should be mixed with the STR tests.  If any of them fails, the driver cannot be
+regarded as suspend/resume-safe.
diff --git a/Documentation/power/drivers-testing.txt b/Documentation/power/drivers-testing.txt
deleted file mode 100644
index 638afdf4d6b8..000000000000
--- a/Documentation/power/drivers-testing.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-Testing suspend and resume support in device drivers
-	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-
-1. Preparing the test system
-
-Unfortunately, to effectively test the support for the system-wide suspend and
-resume transitions in a driver, it is necessary to suspend and resume a fully
-functional system with this driver loaded.  Moreover, that should be done
-several times, preferably several times in a row, and separately for hibernation
-(aka suspend to disk or STD) and suspend to RAM (STR), because each of these
-cases involves slightly different operations and different interactions with
-the machine's BIOS.
-
-Of course, for this purpose the test system has to be known to suspend and
-resume without the driver being tested.  Thus, if possible, you should first
-resolve all suspend/resume-related problems in the test system before you start
-testing the new driver.  Please see Documentation/power/basic-pm-debugging.txt
-for more information about the debugging of suspend/resume functionality.
-
-2. Testing the driver
-
-Once you have resolved the suspend/resume-related problems with your test system
-without the new driver, you are ready to test it:
-
-a) Build the driver as a module, load it and try the test modes of hibernation
-   (see: Documentation/power/basic-pm-debugging.txt, 1).
-
-b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and
-   "platform" modes (see: Documentation/power/basic-pm-debugging.txt, 1).
-
-c) Compile the driver directly into the kernel and try the test modes of
-   hibernation.
-
-d) Attempt to hibernate with the driver compiled directly into the kernel
-   in the "reboot", "shutdown" and "platform" modes.
-
-e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.txt,
-   2).  [As far as the STR tests are concerned, it should not matter whether or
-   not the driver is built as a module.]
-
-f) Attempt to suspend to RAM using the s2ram tool with the driver loaded
-   (see: Documentation/power/basic-pm-debugging.txt, 2).
-
-Each of the above tests should be repeated several times and the STD tests
-should be mixed with the STR tests.  If any of them fails, the driver cannot be
-regarded as suspend/resume-safe.
diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst
new file mode 100644
index 000000000000..90a345d57ae9
--- /dev/null
+++ b/Documentation/power/energy-model.rst
@@ -0,0 +1,147 @@
+====================
+Energy Model of CPUs
+====================
+
+1. Overview
+-----------
+
+The Energy Model (EM) framework serves as an interface between drivers knowing
+the power consumed by CPUs at various performance levels, and the kernel
+subsystems willing to use that information to make energy-aware decisions.
+
+The source of the information about the power consumed by CPUs can vary greatly
+from one platform to another. These power costs can be estimated using
+devicetree data in some cases. In others, the firmware will know better.
+Alternatively, userspace might be best positioned. And so on. In order to avoid
+each and every client subsystem to re-implement support for each and every
+possible source of information on its own, the EM framework intervenes as an
+abstraction layer which standardizes the format of power cost tables in the
+kernel, hence enabling to avoid redundant work.
+
+The figure below depicts an example of drivers (Arm-specific here, but the
+approach is applicable to any architecture) providing power costs to the EM
+framework, and interested clients reading the data from it::
+
+       +---------------+  +-----------------+  +---------------+
+       | Thermal (IPA) |  | Scheduler (EAS) |  |     Other     |
+       +---------------+  +-----------------+  +---------------+
+               |                   | em_pd_energy()    |
+               |                   | em_cpu_get()      |
+               +---------+         |         +---------+
+                         |         |         |
+                         v         v         v
+                        +---------------------+
+                        |    Energy Model     |
+                        |     Framework       |
+                        +---------------------+
+                           ^       ^       ^
+                           |       |       | em_register_perf_domain()
+                +----------+       |       +---------+
+                |                  |                 |
+        +---------------+  +---------------+  +--------------+
+        |  cpufreq-dt   |  |   arm_scmi    |  |    Other     |
+        +---------------+  +---------------+  +--------------+
+                ^                  ^                 ^
+                |                  |                 |
+        +--------------+   +---------------+  +--------------+
+        | Device Tree  |   |   Firmware    |  |      ?       |
+        +--------------+   +---------------+  +--------------+
+
+The EM framework manages power cost tables per 'performance domain' in the
+system. A performance domain is a group of CPUs whose performance is scaled
+together. Performance domains generally have a 1-to-1 mapping with CPUFreq
+policies. All CPUs in a performance domain are required to have the same
+micro-architecture. CPUs in different performance domains can have different
+micro-architectures.
+
+
+2. Core APIs
+------------
+
+2.1 Config options
+^^^^^^^^^^^^^^^^^^
+
+CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
+
+
+2.2 Registration of performance domains
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Drivers are expected to register performance domains into the EM framework by
+calling the following API::
+
+  int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
+			      struct em_data_callback *cb);
+
+Drivers must specify the CPUs of the performance domains using the cpumask
+argument, and provide a callback function returning <frequency, power> tuples
+for each capacity state. The callback function provided by the driver is free
+to fetch data from any relevant location (DT, firmware, ...), and by any mean
+deemed necessary. See Section 3. for an example of driver implementing this
+callback, and kernel/power/energy_model.c for further documentation on this
+API.
+
+
+2.3 Accessing performance domains
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Subsystems interested in the energy model of a CPU can retrieve it using the
+em_cpu_get() API. The energy model tables are allocated once upon creation of
+the performance domains, and kept in memory untouched.
+
+The energy consumed by a performance domain can be estimated using the
+em_pd_energy() API. The estimation is performed assuming that the schedutil
+CPUfreq governor is in use.
+
+More details about the above APIs can be found in include/linux/energy_model.h.
+
+
+3. Example driver
+-----------------
+
+This section provides a simple example of a CPUFreq driver registering a
+performance domain in the Energy Model framework using the (fake) 'foo'
+protocol. The driver implements an est_power() function to be provided to the
+EM framework::
+
+  -> drivers/cpufreq/foo_cpufreq.c
+
+  01	static int est_power(unsigned long *mW, unsigned long *KHz, int cpu)
+  02	{
+  03		long freq, power;
+  04
+  05		/* Use the 'foo' protocol to ceil the frequency */
+  06		freq = foo_get_freq_ceil(cpu, *KHz);
+  07		if (freq < 0);
+  08			return freq;
+  09
+  10		/* Estimate the power cost for the CPU at the relevant freq. */
+  11		power = foo_estimate_power(cpu, freq);
+  12		if (power < 0);
+  13			return power;
+  14
+  15		/* Return the values to the EM framework */
+  16		*mW = power;
+  17		*KHz = freq;
+  18
+  19		return 0;
+  20	}
+  21
+  22	static int foo_cpufreq_init(struct cpufreq_policy *policy)
+  23	{
+  24		struct em_data_callback em_cb = EM_DATA_CB(est_power);
+  25		int nr_opp, ret;
+  26
+  27		/* Do the actual CPUFreq init work ... */
+  28		ret = do_foo_cpufreq_init(policy);
+  29		if (ret)
+  30			return ret;
+  31
+  32		/* Find the number of OPPs for this policy */
+  33		nr_opp = foo_get_nr_opp(policy);
+  34
+  35		/* And register the new performance domain */
+  36		em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
+  37
+  38	        return 0;
+  39	}
diff --git a/Documentation/power/energy-model.txt b/Documentation/power/energy-model.txt
deleted file mode 100644
index a2b0ae4c76bd..000000000000
--- a/Documentation/power/energy-model.txt
+++ /dev/null
@@ -1,144 +0,0 @@
-                           ====================
-                           Energy Model of CPUs
-                           ====================
-
-1. Overview
------------
-
-The Energy Model (EM) framework serves as an interface between drivers knowing
-the power consumed by CPUs at various performance levels, and the kernel
-subsystems willing to use that information to make energy-aware decisions.
-
-The source of the information about the power consumed by CPUs can vary greatly
-from one platform to another. These power costs can be estimated using
-devicetree data in some cases. In others, the firmware will know better.
-Alternatively, userspace might be best positioned. And so on. In order to avoid
-each and every client subsystem to re-implement support for each and every
-possible source of information on its own, the EM framework intervenes as an
-abstraction layer which standardizes the format of power cost tables in the
-kernel, hence enabling to avoid redundant work.
-
-The figure below depicts an example of drivers (Arm-specific here, but the
-approach is applicable to any architecture) providing power costs to the EM
-framework, and interested clients reading the data from it.
-
-       +---------------+  +-----------------+  +---------------+
-       | Thermal (IPA) |  | Scheduler (EAS) |  |     Other     |
-       +---------------+  +-----------------+  +---------------+
-               |                   | em_pd_energy()    |
-               |                   | em_cpu_get()      |
-               +---------+         |         +---------+
-                         |         |         |
-                         v         v         v
-                        +---------------------+
-                        |    Energy Model     |
-                        |     Framework       |
-                        +---------------------+
-                           ^       ^       ^
-                           |       |       | em_register_perf_domain()
-                +----------+       |       +---------+
-                |                  |                 |
-        +---------------+  +---------------+  +--------------+
-        |  cpufreq-dt   |  |   arm_scmi    |  |    Other     |
-        +---------------+  +---------------+  +--------------+
-                ^                  ^                 ^
-                |                  |                 |
-        +--------------+   +---------------+  +--------------+
-        | Device Tree  |   |   Firmware    |  |      ?       |
-        +--------------+   +---------------+  +--------------+
-
-The EM framework manages power cost tables per 'performance domain' in the
-system. A performance domain is a group of CPUs whose performance is scaled
-together. Performance domains generally have a 1-to-1 mapping with CPUFreq
-policies. All CPUs in a performance domain are required to have the same
-micro-architecture. CPUs in different performance domains can have different
-micro-architectures.
-
-
-2. Core APIs
-------------
-
-  2.1 Config options
-
-CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
-
-
-  2.2 Registration of performance domains
-
-Drivers are expected to register performance domains into the EM framework by
-calling the following API:
-
-  int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
-			      struct em_data_callback *cb);
-
-Drivers must specify the CPUs of the performance domains using the cpumask
-argument, and provide a callback function returning <frequency, power> tuples
-for each capacity state. The callback function provided by the driver is free
-to fetch data from any relevant location (DT, firmware, ...), and by any mean
-deemed necessary. See Section 3. for an example of driver implementing this
-callback, and kernel/power/energy_model.c for further documentation on this
-API.
-
-
-  2.3 Accessing performance domains
-
-Subsystems interested in the energy model of a CPU can retrieve it using the
-em_cpu_get() API. The energy model tables are allocated once upon creation of
-the performance domains, and kept in memory untouched.
-
-The energy consumed by a performance domain can be estimated using the
-em_pd_energy() API. The estimation is performed assuming that the schedutil
-CPUfreq governor is in use.
-
-More details about the above APIs can be found in include/linux/energy_model.h.
-
-
-3. Example driver
------------------
-
-This section provides a simple example of a CPUFreq driver registering a
-performance domain in the Energy Model framework using the (fake) 'foo'
-protocol. The driver implements an est_power() function to be provided to the
-EM framework.
-
- -> drivers/cpufreq/foo_cpufreq.c
-
-01	static int est_power(unsigned long *mW, unsigned long *KHz, int cpu)
-02	{
-03		long freq, power;
-04
-05		/* Use the 'foo' protocol to ceil the frequency */
-06		freq = foo_get_freq_ceil(cpu, *KHz);
-07		if (freq < 0);
-08			return freq;
-09
-10		/* Estimate the power cost for the CPU at the relevant freq. */
-11		power = foo_estimate_power(cpu, freq);
-12		if (power < 0);
-13			return power;
-14
-15		/* Return the values to the EM framework */
-16		*mW = power;
-17		*KHz = freq;
-18
-19		return 0;
-20	}
-21
-22	static int foo_cpufreq_init(struct cpufreq_policy *policy)
-23	{
-24		struct em_data_callback em_cb = EM_DATA_CB(est_power);
-25		int nr_opp, ret;
-26
-27		/* Do the actual CPUFreq init work ... */
-28		ret = do_foo_cpufreq_init(policy);
-29		if (ret)
-30			return ret;
-31
-32		/* Find the number of OPPs for this policy */
-33		nr_opp = foo_get_nr_opp(policy);
-34
-35		/* And register the new performance domain */
-36		em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
-37
-38	        return 0;
-39	}
diff --git a/Documentation/power/freezing-of-tasks.rst b/Documentation/power/freezing-of-tasks.rst
new file mode 100644
index 000000000000..ef110fe55e82
--- /dev/null
+++ b/Documentation/power/freezing-of-tasks.rst
@@ -0,0 +1,244 @@
+=================
+Freezing of tasks
+=================
+
+(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+I. What is the freezing of tasks?
+=================================
+
+The freezing of tasks is a mechanism by which user space processes and some
+kernel threads are controlled during hibernation or system-wide suspend (on some
+architectures).
+
+II. How does it work?
+=====================
+
+There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN
+and PF_FREEZER_SKIP (the last one is auxiliary).  The tasks that have
+PF_NOFREEZE unset (all user space processes and some kernel threads) are
+regarded as 'freezable' and treated in a special way before the system enters a
+suspend state as well as before a hibernation image is created (in what follows
+we only consider hibernation, but the description also applies to suspend).
+
+Namely, as the first step of the hibernation procedure the function
+freeze_processes() (defined in kernel/power/process.c) is called.  A system-wide
+variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate
+whether the system is to undergo a freezing operation. And freeze_processes()
+sets this variable.  After this, it executes try_to_freeze_tasks() that sends a
+fake signal to all user space processes, and wakes up all the kernel threads.
+All freezable tasks must react to that by calling try_to_freeze(), which
+results in a call to __refrigerator() (defined in kernel/freezer.c), which sets
+the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes
+it loop until PF_FROZEN is cleared for it. Then, we say that the task is
+'frozen' and therefore the set of functions handling this mechanism is referred
+to as 'the freezer' (these functions are defined in kernel/power/process.c,
+kernel/freezer.c & include/linux/freezer.h). User space processes are generally
+frozen before kernel threads.
+
+__refrigerator() must not be called directly.  Instead, use the
+try_to_freeze() function (defined in include/linux/freezer.h), that checks
+if the task is to be frozen and makes the task enter __refrigerator().
+
+For user space processes try_to_freeze() is called automatically from the
+signal-handling code, but the freezable kernel threads need to call it
+explicitly in suitable places or use the wait_event_freezable() or
+wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
+that combine interruptible sleep with checking if the task is to be frozen and
+calling try_to_freeze().  The main loop of a freezable kernel thread may look
+like the following one::
+
+	set_freezable();
+	do {
+		hub_events();
+		wait_event_freezable(khubd_wait,
+				!list_empty(&hub_event_list) ||
+				kthread_should_stop());
+	} while (!kthread_should_stop() || !list_empty(&hub_event_list));
+
+(from drivers/usb/core/hub.c::hub_thread()).
+
+If a freezable kernel thread fails to call try_to_freeze() after the freezer has
+initiated a freezing operation, the freezing of tasks will fail and the entire
+hibernation operation will be cancelled.  For this reason, freezable kernel
+threads must call try_to_freeze() somewhere or use one of the
+wait_event_freezable() and wait_event_freezable_timeout() macros.
+
+After the system memory state has been restored from a hibernation image and
+devices have been reinitialized, the function thaw_processes() is called in
+order to clear the PF_FROZEN flag for each frozen task.  Then, the tasks that
+have been frozen leave __refrigerator() and continue running.
+
+
+Rationale behind the functions dealing with freezing and thawing of tasks
+-------------------------------------------------------------------------
+
+freeze_processes():
+  - freezes only userspace tasks
+
+freeze_kernel_threads():
+  - freezes all tasks (including kernel threads) because we can't freeze
+    kernel threads without freezing userspace tasks
+
+thaw_kernel_threads():
+  - thaws only kernel threads; this is particularly useful if we need to do
+    anything special in between thawing of kernel threads and thawing of
+    userspace tasks, or if we want to postpone the thawing of userspace tasks
+
+thaw_processes():
+  - thaws all tasks (including kernel threads) because we can't thaw userspace
+    tasks without thawing kernel threads
+
+
+III. Which kernel threads are freezable?
+========================================
+
+Kernel threads are not freezable by default.  However, a kernel thread may clear
+PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
+directly is not allowed).  From this point it is regarded as freezable
+and must call try_to_freeze() in a suitable place.
+
+IV. Why do we do that?
+======================
+
+Generally speaking, there is a couple of reasons to use the freezing of tasks:
+
+1. The principal reason is to prevent filesystems from being damaged after
+   hibernation.  At the moment we have no simple means of checkpointing
+   filesystems, so if there are any modifications made to filesystem data and/or
+   metadata on disks, we cannot bring them back to the state from before the
+   modifications.  At the same time each hibernation image contains some
+   filesystem-related information that must be consistent with the state of the
+   on-disk data and metadata after the system memory state has been restored
+   from the image (otherwise the filesystems will be damaged in a nasty way,
+   usually making them almost impossible to repair).  We therefore freeze
+   tasks that might cause the on-disk filesystems' data and metadata to be
+   modified after the hibernation image has been created and before the
+   system is finally powered off. The majority of these are user space
+   processes, but if any of the kernel threads may cause something like this
+   to happen, they have to be freezable.
+
+2. Next, to create the hibernation image we need to free a sufficient amount of
+   memory (approximately 50% of available RAM) and we need to do that before
+   devices are deactivated, because we generally need them for swapping out.
+   Then, after the memory for the image has been freed, we don't want tasks
+   to allocate additional memory and we prevent them from doing that by
+   freezing them earlier. [Of course, this also means that device drivers
+   should not allocate substantial amounts of memory from their .suspend()
+   callbacks before hibernation, but this is a separate issue.]
+
+3. The third reason is to prevent user space processes and some kernel threads
+   from interfering with the suspending and resuming of devices.  A user space
+   process running on a second CPU while we are suspending devices may, for
+   example, be troublesome and without the freezing of tasks we would need some
+   safeguards against race conditions that might occur in such a case.
+
+Although Linus Torvalds doesn't like the freezing of tasks, he said this in one
+of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608):
+
+"RJW:> Why we freeze tasks at all or why we freeze kernel threads?
+
+Linus: In many ways, 'at all'.
+
+I **do** realize the IO request queue issues, and that we cannot actually do
+s2ram with some devices in the middle of a DMA.  So we want to be able to
+avoid *that*, there's no question about that.  And I suspect that stopping
+user threads and then waiting for a sync is practically one of the easier
+ways to do so.
+
+So in practice, the 'at all' may become a 'why freeze kernel threads?' and
+freezing user threads I don't find really objectionable."
+
+Still, there are kernel threads that may want to be freezable.  For example, if
+a kernel thread that belongs to a device driver accesses the device directly, it
+in principle needs to know when the device is suspended, so that it doesn't try
+to access it at that time.  However, if the kernel thread is freezable, it will
+be frozen before the driver's .suspend() callback is executed and it will be
+thawed after the driver's .resume() callback has run, so it won't be accessing
+the device while it's suspended.
+
+4. Another reason for freezing tasks is to prevent user space processes from
+   realizing that hibernation (or suspend) operation takes place.  Ideally, user
+   space processes should not notice that such a system-wide operation has
+   occurred and should continue running without any problems after the restore
+   (or resume from suspend).  Unfortunately, in the most general case this
+   is quite difficult to achieve without the freezing of tasks.  Consider,
+   for example, a process that depends on all CPUs being online while it's
+   running.  Since we need to disable nonboot CPUs during the hibernation,
+   if this process is not frozen, it may notice that the number of CPUs has
+   changed and may start to work incorrectly because of that.
+
+V. Are there any problems related to the freezing of tasks?
+===========================================================
+
+Yes, there are.
+
+First of all, the freezing of kernel threads may be tricky if they depend one
+on another.  For example, if kernel thread A waits for a completion (in the
+TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B
+and B is frozen in the meantime, then A will be blocked until B is thawed, which
+may be undesirable.  That's why kernel threads are not freezable by default.
+
+Second, there are the following two problems related to the freezing of user
+space processes:
+
+1. Putting processes into an uninterruptible sleep distorts the load average.
+2. Now that we have FUSE, plus the framework for doing device drivers in
+   userspace, it gets even more complicated because some userspace processes are
+   now doing the sorts of things that kernel threads do
+   (https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html).
+
+The problem 1. seems to be fixable, although it hasn't been fixed so far.  The
+other one is more serious, but it seems that we can work around it by using
+hibernation (and suspend) notifiers (in that case, though, we won't be able to
+avoid the realization by the user space processes that the hibernation is taking
+place).
+
+There are also problems that the freezing of tasks tends to expose, although
+they are not directly related to it.  For example, if request_firmware() is
+called from a device driver's .resume() routine, it will timeout and eventually
+fail, because the user land process that should respond to the request is frozen
+at this point.  So, seemingly, the failure is due to the freezing of tasks.
+Suppose, however, that the firmware file is located on a filesystem accessible
+only through another device that hasn't been resumed yet.  In that case,
+request_firmware() will fail regardless of whether or not the freezing of tasks
+is used.  Consequently, the problem is not really related to the freezing of
+tasks, since it generally exists anyway.
+
+A driver must have all firmwares it may need in RAM before suspend() is called.
+If keeping them is not practical, for example due to their size, they must be
+requested early enough using the suspend notifier API described in
+Documentation/driver-api/pm/notifiers.rst.
+
+VI. Are there any precautions to be taken to prevent freezing failures?
+=======================================================================
+
+Yes, there are.
+
+First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code
+from system-wide sleep such as suspend/hibernation is not encouraged.
+If possible, that piece of code must instead hook onto the suspend/hibernation
+notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code
+(kernel/cpu.c) for an example.
+
+However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary,
+it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since
+that could lead to freezing failures, because if the suspend/hibernate code
+successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed
+to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE
+state. As a consequence, the freezer would not be able to freeze that task,
+leading to freezing failure.
+
+However, the [un]lock_system_sleep() APIs are safe to use in this scenario,
+since they ask the freezer to skip freezing this task, since it is anyway
+"frozen enough" as it is blocked on 'system_transition_mutex', which will be released
+only after the entire suspend/hibernation sequence is complete.
+So, to summarize, use [un]lock_system_sleep() instead of directly using
+mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures.
+
+V. Miscellaneous
+================
+
+/sys/power/pm_freeze_timeout controls how long it will cost at most to freeze
+all user space processes or all freezable kernel threads, in unit of millisecond.
+The default value is 20000, with range of unsigned integer.
diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
deleted file mode 100644
index cd283190855a..000000000000
--- a/Documentation/power/freezing-of-tasks.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Freezing of tasks
-	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-
-I. What is the freezing of tasks?
-
-The freezing of tasks is a mechanism by which user space processes and some
-kernel threads are controlled during hibernation or system-wide suspend (on some
-architectures).
-
-II. How does it work?
-
-There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN
-and PF_FREEZER_SKIP (the last one is auxiliary).  The tasks that have
-PF_NOFREEZE unset (all user space processes and some kernel threads) are
-regarded as 'freezable' and treated in a special way before the system enters a
-suspend state as well as before a hibernation image is created (in what follows
-we only consider hibernation, but the description also applies to suspend).
-
-Namely, as the first step of the hibernation procedure the function
-freeze_processes() (defined in kernel/power/process.c) is called.  A system-wide
-variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate
-whether the system is to undergo a freezing operation. And freeze_processes()
-sets this variable.  After this, it executes try_to_freeze_tasks() that sends a
-fake signal to all user space processes, and wakes up all the kernel threads.
-All freezable tasks must react to that by calling try_to_freeze(), which
-results in a call to __refrigerator() (defined in kernel/freezer.c), which sets
-the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes
-it loop until PF_FROZEN is cleared for it. Then, we say that the task is
-'frozen' and therefore the set of functions handling this mechanism is referred
-to as 'the freezer' (these functions are defined in kernel/power/process.c,
-kernel/freezer.c & include/linux/freezer.h). User space processes are generally
-frozen before kernel threads.
-
-__refrigerator() must not be called directly.  Instead, use the
-try_to_freeze() function (defined in include/linux/freezer.h), that checks
-if the task is to be frozen and makes the task enter __refrigerator().
-
-For user space processes try_to_freeze() is called automatically from the
-signal-handling code, but the freezable kernel threads need to call it
-explicitly in suitable places or use the wait_event_freezable() or
-wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
-that combine interruptible sleep with checking if the task is to be frozen and
-calling try_to_freeze().  The main loop of a freezable kernel thread may look
-like the following one:
-
-	set_freezable();
-	do {
-		hub_events();
-		wait_event_freezable(khubd_wait,
-				!list_empty(&hub_event_list) ||
-				kthread_should_stop());
-	} while (!kthread_should_stop() || !list_empty(&hub_event_list));
-
-(from drivers/usb/core/hub.c::hub_thread()).
-
-If a freezable kernel thread fails to call try_to_freeze() after the freezer has
-initiated a freezing operation, the freezing of tasks will fail and the entire
-hibernation operation will be cancelled.  For this reason, freezable kernel
-threads must call try_to_freeze() somewhere or use one of the
-wait_event_freezable() and wait_event_freezable_timeout() macros.
-
-After the system memory state has been restored from a hibernation image and
-devices have been reinitialized, the function thaw_processes() is called in
-order to clear the PF_FROZEN flag for each frozen task.  Then, the tasks that
-have been frozen leave __refrigerator() and continue running.
-
-
-Rationale behind the functions dealing with freezing and thawing of tasks:
--------------------------------------------------------------------------
-
-freeze_processes():
-  - freezes only userspace tasks
-
-freeze_kernel_threads():
-  - freezes all tasks (including kernel threads) because we can't freeze
-    kernel threads without freezing userspace tasks
-
-thaw_kernel_threads():
-  - thaws only kernel threads; this is particularly useful if we need to do
-    anything special in between thawing of kernel threads and thawing of
-    userspace tasks, or if we want to postpone the thawing of userspace tasks
-
-thaw_processes():
-  - thaws all tasks (including kernel threads) because we can't thaw userspace
-    tasks without thawing kernel threads
-
-
-III. Which kernel threads are freezable?
-
-Kernel threads are not freezable by default.  However, a kernel thread may clear
-PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
-directly is not allowed).  From this point it is regarded as freezable
-and must call try_to_freeze() in a suitable place.
-
-IV. Why do we do that?
-
-Generally speaking, there is a couple of reasons to use the freezing of tasks:
-
-1. The principal reason is to prevent filesystems from being damaged after
-hibernation.  At the moment we have no simple means of checkpointing
-filesystems, so if there are any modifications made to filesystem data and/or
-metadata on disks, we cannot bring them back to the state from before the
-modifications.  At the same time each hibernation image contains some
-filesystem-related information that must be consistent with the state of the
-on-disk data and metadata after the system memory state has been restored from
-the image (otherwise the filesystems will be damaged in a nasty way, usually
-making them almost impossible to repair).  We therefore freeze tasks that might
-cause the on-disk filesystems' data and metadata to be modified after the
-hibernation image has been created and before the system is finally powered off.
-The majority of these are user space processes, but if any of the kernel threads
-may cause something like this to happen, they have to be freezable.
-
-2. Next, to create the hibernation image we need to free a sufficient amount of
-memory (approximately 50% of available RAM) and we need to do that before
-devices are deactivated, because we generally need them for swapping out.  Then,
-after the memory for the image has been freed, we don't want tasks to allocate
-additional memory and we prevent them from doing that by freezing them earlier.
-[Of course, this also means that device drivers should not allocate substantial
-amounts of memory from their .suspend() callbacks before hibernation, but this
-is a separate issue.]
-
-3. The third reason is to prevent user space processes and some kernel threads
-from interfering with the suspending and resuming of devices.  A user space
-process running on a second CPU while we are suspending devices may, for
-example, be troublesome and without the freezing of tasks we would need some
-safeguards against race conditions that might occur in such a case.
-
-Although Linus Torvalds doesn't like the freezing of tasks, he said this in one
-of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608):
-
-"RJW:> Why we freeze tasks at all or why we freeze kernel threads?
-
-Linus: In many ways, 'at all'.
-
-I _do_ realize the IO request queue issues, and that we cannot actually do
-s2ram with some devices in the middle of a DMA.  So we want to be able to
-avoid *that*, there's no question about that.  And I suspect that stopping
-user threads and then waiting for a sync is practically one of the easier
-ways to do so.
-
-So in practice, the 'at all' may become a 'why freeze kernel threads?' and
-freezing user threads I don't find really objectionable."
-
-Still, there are kernel threads that may want to be freezable.  For example, if
-a kernel thread that belongs to a device driver accesses the device directly, it
-in principle needs to know when the device is suspended, so that it doesn't try
-to access it at that time.  However, if the kernel thread is freezable, it will
-be frozen before the driver's .suspend() callback is executed and it will be
-thawed after the driver's .resume() callback has run, so it won't be accessing
-the device while it's suspended.
-
-4. Another reason for freezing tasks is to prevent user space processes from
-realizing that hibernation (or suspend) operation takes place.  Ideally, user
-space processes should not notice that such a system-wide operation has occurred
-and should continue running without any problems after the restore (or resume
-from suspend).  Unfortunately, in the most general case this is quite difficult
-to achieve without the freezing of tasks.  Consider, for example, a process
-that depends on all CPUs being online while it's running.  Since we need to
-disable nonboot CPUs during the hibernation, if this process is not frozen, it
-may notice that the number of CPUs has changed and may start to work incorrectly
-because of that.
-
-V. Are there any problems related to the freezing of tasks?
-
-Yes, there are.
-
-First of all, the freezing of kernel threads may be tricky if they depend one
-on another.  For example, if kernel thread A waits for a completion (in the
-TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B
-and B is frozen in the meantime, then A will be blocked until B is thawed, which
-may be undesirable.  That's why kernel threads are not freezable by default.
-
-Second, there are the following two problems related to the freezing of user
-space processes:
-1. Putting processes into an uninterruptible sleep distorts the load average.
-2. Now that we have FUSE, plus the framework for doing device drivers in
-userspace, it gets even more complicated because some userspace processes are
-now doing the sorts of things that kernel threads do
-(https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html).
-
-The problem 1. seems to be fixable, although it hasn't been fixed so far.  The
-other one is more serious, but it seems that we can work around it by using
-hibernation (and suspend) notifiers (in that case, though, we won't be able to
-avoid the realization by the user space processes that the hibernation is taking
-place).
-
-There are also problems that the freezing of tasks tends to expose, although
-they are not directly related to it.  For example, if request_firmware() is
-called from a device driver's .resume() routine, it will timeout and eventually
-fail, because the user land process that should respond to the request is frozen
-at this point.  So, seemingly, the failure is due to the freezing of tasks.
-Suppose, however, that the firmware file is located on a filesystem accessible
-only through another device that hasn't been resumed yet.  In that case,
-request_firmware() will fail regardless of whether or not the freezing of tasks
-is used.  Consequently, the problem is not really related to the freezing of
-tasks, since it generally exists anyway.
-
-A driver must have all firmwares it may need in RAM before suspend() is called.
-If keeping them is not practical, for example due to their size, they must be
-requested early enough using the suspend notifier API described in
-Documentation/driver-api/pm/notifiers.rst.
-
-VI. Are there any precautions to be taken to prevent freezing failures?
-
-Yes, there are.
-
-First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code
-from system-wide sleep such as suspend/hibernation is not encouraged.
-If possible, that piece of code must instead hook onto the suspend/hibernation
-notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code
-(kernel/cpu.c) for an example.
-
-However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary,
-it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since
-that could lead to freezing failures, because if the suspend/hibernate code
-successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed
-to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE
-state. As a consequence, the freezer would not be able to freeze that task,
-leading to freezing failure.
-
-However, the [un]lock_system_sleep() APIs are safe to use in this scenario,
-since they ask the freezer to skip freezing this task, since it is anyway
-"frozen enough" as it is blocked on 'system_transition_mutex', which will be released
-only after the entire suspend/hibernation sequence is complete.
-So, to summarize, use [un]lock_system_sleep() instead of directly using
-mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures.
-
-V. Miscellaneous
-/sys/power/pm_freeze_timeout controls how long it will cost at most to freeze
-all user space processes or all freezable kernel threads, in unit of millisecond.
-The default value is 20000, with range of unsigned integer.
diff --git a/Documentation/power/index.rst b/Documentation/power/index.rst
new file mode 100644
index 000000000000..002e42745263
--- /dev/null
+++ b/Documentation/power/index.rst
@@ -0,0 +1,46 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+================
+Power Management
+================
+
+.. toctree::
+    :maxdepth: 1
+
+    apm-acpi
+    basic-pm-debugging
+    charger-manager
+    drivers-testing
+    energy-model
+    freezing-of-tasks
+    interface
+    opp
+    pci
+    pm_qos_interface
+    power_supply_class
+    runtime_pm
+    s2ram
+    suspend-and-cpuhotplug
+    suspend-and-interrupts
+    swsusp-and-swap-files
+    swsusp-dmcrypt
+    swsusp
+    video
+    tricks
+
+    userland-swsusp
+
+    powercap/powercap
+
+    regulator/consumer
+    regulator/design
+    regulator/machine
+    regulator/overview
+    regulator/regulator
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/power/interface.rst b/Documentation/power/interface.rst
new file mode 100644
index 000000000000..8d270ed27228
--- /dev/null
+++ b/Documentation/power/interface.rst
@@ -0,0 +1,79 @@
+===========================================
+Power Management Interface for System Sleep
+===========================================
+
+Copyright (c) 2016 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+The power management subsystem provides userspace with a unified sysfs interface
+for system sleep regardless of the underlying system architecture or platform.
+The interface is located in the /sys/power/ directory (assuming that sysfs is
+mounted at /sys).
+
+/sys/power/state is the system sleep state control file.
+
+Reading from it returns a list of supported sleep states, encoded as:
+
+- 'freeze' (Suspend-to-Idle)
+- 'standby' (Power-On Suspend)
+- 'mem' (Suspend-to-RAM)
+- 'disk' (Suspend-to-Disk)
+
+Suspend-to-Idle is always supported.  Suspend-to-Disk is always supported
+too as long the kernel has been configured to support hibernation at all
+(ie. CONFIG_HIBERNATION is set in the kernel configuration file).  Support
+for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the
+platform.
+
+If one of the strings listed in /sys/power/state is written to it, the system
+will attempt to transition into the corresponding sleep state.  Refer to
+Documentation/admin-guide/pm/sleep-states.rst for a description of each of
+those states.
+
+/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk).
+Specifically, it tells the kernel what to do after creating a hibernation image.
+
+Reading from it returns a list of supported options encoded as:
+
+- 'platform' (put the system into sleep using a platform-provided method)
+- 'shutdown' (shut the system down)
+- 'reboot' (reboot the system)
+- 'suspend' (trigger a Suspend-to-RAM transition)
+- 'test_resume' (resume-after-hibernation test mode)
+
+The currently selected option is printed in square brackets.
+
+The 'platform' option is only available if the platform provides a special
+mechanism to put the system to sleep after creating a hibernation image (ACPI
+does that, for example).  The 'suspend' option is available if Suspend-to-RAM
+is supported.  Refer to Documentation/power/basic-pm-debugging.rst for the
+description of the 'test_resume' option.
+
+To select an option, write the string representing it to /sys/power/disk.
+
+/sys/power/image_size controls the size of hibernation images.
+
+It can be written a string representing a non-negative integer that will be
+used as a best-effort upper limit of the image size, in bytes.  The hibernation
+core will do its best to ensure that the image size will not exceed that number.
+However, if that turns out to be impossible to achieve, a hibernation image will
+still be created and its size will be as small as possible.  In particular,
+writing '0' to this file will enforce hibernation images to be as small as
+possible.
+
+Reading from this file returns the current image size limit, which is set to
+around 2/5 of available RAM by default.
+
+/sys/power/pm_trace controls the PM trace mechanism saving the last suspend
+or resume event point in the RTC across reboots.
+
+It helps to debug hard lockups or reboots due to device driver failures that
+occur during system suspend or resume (which is more common) more effectively.
+
+If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume
+event point in turn will be stored in the RTC memory (overwriting the actual
+RTC information), so it will survive a system crash if one occurs right after
+storing it and it can be used later to identify the driver that caused the crash
+to happen (see Documentation/power/s2ram.rst for more information).
+
+Initially it contains '0' which may be changed to '1' by writing a string
+representing a nonzero integer into it.
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
deleted file mode 100644
index 27df7f98668a..000000000000
--- a/Documentation/power/interface.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-Power Management Interface for System Sleep
-
-Copyright (c) 2016 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-The power management subsystem provides userspace with a unified sysfs interface
-for system sleep regardless of the underlying system architecture or platform.
-The interface is located in the /sys/power/ directory (assuming that sysfs is
-mounted at /sys).
-
-/sys/power/state is the system sleep state control file.
-
-Reading from it returns a list of supported sleep states, encoded as:
-
-'freeze' (Suspend-to-Idle)
-'standby' (Power-On Suspend)
-'mem' (Suspend-to-RAM)
-'disk' (Suspend-to-Disk)
-
-Suspend-to-Idle is always supported.  Suspend-to-Disk is always supported
-too as long the kernel has been configured to support hibernation at all
-(ie. CONFIG_HIBERNATION is set in the kernel configuration file).  Support
-for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the
-platform.
-
-If one of the strings listed in /sys/power/state is written to it, the system
-will attempt to transition into the corresponding sleep state.  Refer to
-Documentation/admin-guide/pm/sleep-states.rst for a description of each of
-those states.
-
-/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk).
-Specifically, it tells the kernel what to do after creating a hibernation image.
-
-Reading from it returns a list of supported options encoded as:
-
-'platform' (put the system into sleep using a platform-provided method)
-'shutdown' (shut the system down)
-'reboot' (reboot the system)
-'suspend' (trigger a Suspend-to-RAM transition)
-'test_resume' (resume-after-hibernation test mode)
-
-The currently selected option is printed in square brackets.
-
-The 'platform' option is only available if the platform provides a special
-mechanism to put the system to sleep after creating a hibernation image (ACPI
-does that, for example).  The 'suspend' option is available if Suspend-to-RAM
-is supported.  Refer to Documentation/power/basic-pm-debugging.txt for the
-description of the 'test_resume' option.
-
-To select an option, write the string representing it to /sys/power/disk.
-
-/sys/power/image_size controls the size of hibernation images.
-
-It can be written a string representing a non-negative integer that will be
-used as a best-effort upper limit of the image size, in bytes.  The hibernation
-core will do its best to ensure that the image size will not exceed that number.
-However, if that turns out to be impossible to achieve, a hibernation image will
-still be created and its size will be as small as possible.  In particular,
-writing '0' to this file will enforce hibernation images to be as small as
-possible.
-
-Reading from this file returns the current image size limit, which is set to
-around 2/5 of available RAM by default.
-
-/sys/power/pm_trace controls the PM trace mechanism saving the last suspend
-or resume event point in the RTC across reboots.
-
-It helps to debug hard lockups or reboots due to device driver failures that
-occur during system suspend or resume (which is more common) more effectively.
-
-If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume
-event point in turn will be stored in the RTC memory (overwriting the actual
-RTC information), so it will survive a system crash if one occurs right after
-storing it and it can be used later to identify the driver that caused the crash
-to happen (see Documentation/power/s2ram.txt for more information).
-
-Initially it contains '0' which may be changed to '1' by writing a string
-representing a nonzero integer into it.
diff --git a/Documentation/power/opp.rst b/Documentation/power/opp.rst
new file mode 100644
index 000000000000..b3cf1def9dee
--- /dev/null
+++ b/Documentation/power/opp.rst
@@ -0,0 +1,379 @@
+==========================================
+Operating Performance Points (OPP) Library
+==========================================
+
+(C) 2009-2010 Nishanth Menon <nm@ti.com>, Texas Instruments Incorporated
+
+.. Contents
+
+  1. Introduction
+  2. Initial OPP List Registration
+  3. OPP Search Functions
+  4. OPP Availability Control Functions
+  5. OPP Data Retrieval Functions
+  6. Data Structures
+
+1. Introduction
+===============
+
+1.1 What is an Operating Performance Point (OPP)?
+-------------------------------------------------
+
+Complex SoCs of today consists of a multiple sub-modules working in conjunction.
+In an operational system executing varied use cases, not all modules in the SoC
+need to function at their highest performing frequency all the time. To
+facilitate this, sub-modules in a SoC are grouped into domains, allowing some
+domains to run at lower voltage and frequency while other domains run at
+voltage/frequency pairs that are higher.
+
+The set of discrete tuples consisting of frequency and voltage pairs that
+the device will support per domain are called Operating Performance Points or
+OPPs.
+
+As an example:
+
+Let us consider an MPU device which supports the following:
+{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V},
+{1GHz at minimum voltage of 1.3V}
+
+We can represent these as three OPPs as the following {Hz, uV} tuples:
+
+- {300000000, 1000000}
+- {800000000, 1200000}
+- {1000000000, 1300000}
+
+1.2 Operating Performance Points Library
+----------------------------------------
+
+OPP library provides a set of helper functions to organize and query the OPP
+information. The library is located in drivers/base/power/opp.c and the header
+is located in include/linux/pm_opp.h. OPP library can be enabled by enabling
+CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on
+CONFIG_PM as certain SoCs such as Texas Instrument's OMAP framework allows to
+optionally boot at a certain OPP without needing cpufreq.
+
+Typical usage of the OPP library is as follows::
+
+ (users)	-> registers a set of default OPPs		-> (library)
+ SoC framework	-> modifies on required cases certain OPPs	-> OPP layer
+		-> queries to search/retrieve information	->
+
+OPP layer expects each domain to be represented by a unique device pointer. SoC
+framework registers a set of initial OPPs per device with the OPP layer. This
+list is expected to be an optimally small number typically around 5 per device.
+This initial list contains a set of OPPs that the framework expects to be safely
+enabled by default in the system.
+
+Note on OPP Availability
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+As the system proceeds to operate, SoC framework may choose to make certain
+OPPs available or not available on each device based on various external
+factors. Example usage: Thermal management or other exceptional situations where
+SoC framework might choose to disable a higher frequency OPP to safely continue
+operations until that OPP could be re-enabled if possible.
+
+OPP library facilitates this concept in it's implementation. The following
+operational functions operate only on available opps:
+opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count
+
+dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then
+be used for dev_pm_opp_enable/disable functions to make an opp available as required.
+
+WARNING: Users of OPP library should refresh their availability count using
+get_opp_count if dev_pm_opp_enable/disable functions are invoked for a device, the
+exact mechanism to trigger these or the notification mechanism to other
+dependent subsystems such as cpufreq are left to the discretion of the SoC
+specific framework which uses the OPP library. Similar care needs to be taken
+care to refresh the cpufreq table in cases of these operations.
+
+2. Initial OPP List Registration
+================================
+The SoC implementation calls dev_pm_opp_add function iteratively to add OPPs per
+device. It is expected that the SoC framework will register the OPP entries
+optimally- typical numbers range to be less than 5. The list generated by
+registering the OPPs is maintained by OPP library throughout the device
+operation. The SoC framework can subsequently control the availability of the
+OPPs dynamically using the dev_pm_opp_enable / disable functions.
+
+dev_pm_opp_add
+	Add a new OPP for a specific domain represented by the device pointer.
+	The OPP is defined using the frequency and voltage. Once added, the OPP
+	is assumed to be available and control of it's availability can be done
+	with the dev_pm_opp_enable/disable functions. OPP library internally stores
+	and manages this information in the opp struct. This function may be
+	used by SoC framework to define a optimal list as per the demands of
+	SoC usage environment.
+
+	WARNING:
+		Do not use this function in interrupt context.
+
+	Example::
+
+	 soc_pm_init()
+	 {
+		/* Do things */
+		r = dev_pm_opp_add(mpu_dev, 1000000, 900000);
+		if (!r) {
+			pr_err("%s: unable to register mpu opp(%d)\n", r);
+			goto no_cpufreq;
+		}
+		/* Do cpufreq things */
+	 no_cpufreq:
+		/* Do remaining things */
+	 }
+
+3. OPP Search Functions
+=======================
+High level framework such as cpufreq operates on frequencies. To map the
+frequency back to the corresponding OPP, OPP library provides handy functions
+to search the OPP list that OPP library internally manages. These search
+functions return the matching pointer representing the opp if a match is
+found, else returns error. These errors are expected to be handled by standard
+error checks such as IS_ERR() and appropriate actions taken by the caller.
+
+Callers of these functions shall call dev_pm_opp_put() after they have used the
+OPP. Otherwise the memory for the OPP will never get freed and result in
+memleak.
+
+dev_pm_opp_find_freq_exact
+	Search for an OPP based on an *exact* frequency and
+	availability. This function is especially useful to enable an OPP which
+	is not available by default.
+	Example: In a case when SoC framework detects a situation where a
+	higher frequency could be made available, it can use this function to
+	find the OPP prior to call the dev_pm_opp_enable to actually make
+	it available::
+
+	 opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
+	 dev_pm_opp_put(opp);
+	 /* dont operate on the pointer.. just do a sanity check.. */
+	 if (IS_ERR(opp)) {
+		pr_err("frequency not disabled!\n");
+		/* trigger appropriate actions.. */
+	 } else {
+		dev_pm_opp_enable(dev,1000000000);
+	 }
+
+	NOTE:
+	  This is the only search function that operates on OPPs which are
+	  not available.
+
+dev_pm_opp_find_freq_floor
+	Search for an available OPP which is *at most* the
+	provided frequency. This function is useful while searching for a lesser
+	match OR operating on OPP information in the order of decreasing
+	frequency.
+	Example: To find the highest opp for a device::
+
+	 freq = ULONG_MAX;
+	 opp = dev_pm_opp_find_freq_floor(dev, &freq);
+	 dev_pm_opp_put(opp);
+
+dev_pm_opp_find_freq_ceil
+	Search for an available OPP which is *at least* the
+	provided frequency. This function is useful while searching for a
+	higher match OR operating on OPP information in the order of increasing
+	frequency.
+	Example 1: To find the lowest opp for a device::
+
+	 freq = 0;
+	 opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+	 dev_pm_opp_put(opp);
+
+	Example 2: A simplified implementation of a SoC cpufreq_driver->target::
+
+	 soc_cpufreq_target(..)
+	 {
+		/* Do stuff like policy checks etc. */
+		/* Find the best frequency match for the req */
+		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+		dev_pm_opp_put(opp);
+		if (!IS_ERR(opp))
+			soc_switch_to_freq_voltage(freq);
+		else
+			/* do something when we can't satisfy the req */
+		/* do other stuff */
+	 }
+
+4. OPP Availability Control Functions
+=====================================
+A default OPP list registered with the OPP library may not cater to all possible
+situation. The OPP library provides a set of functions to modify the
+availability of a OPP within the OPP list. This allows SoC frameworks to have
+fine grained dynamic control of which sets of OPPs are operationally available.
+These functions are intended to *temporarily* remove an OPP in conditions such
+as thermal considerations (e.g. don't use OPPx until the temperature drops).
+
+WARNING:
+	Do not use these functions in interrupt context.
+
+dev_pm_opp_enable
+	Make a OPP available for operation.
+	Example: Lets say that 1GHz OPP is to be made available only if the
+	SoC temperature is lower than a certain threshold. The SoC framework
+	implementation might choose to do something as follows::
+
+	 if (cur_temp < temp_low_thresh) {
+		/* Enable 1GHz if it was disabled */
+		opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
+		dev_pm_opp_put(opp);
+		/* just error check */
+		if (!IS_ERR(opp))
+			ret = dev_pm_opp_enable(dev, 1000000000);
+		else
+			goto try_something_else;
+	 }
+
+dev_pm_opp_disable
+	Make an OPP to be not available for operation
+	Example: Lets say that 1GHz OPP is to be disabled if the temperature
+	exceeds a threshold value. The SoC framework implementation might
+	choose to do something as follows::
+
+	 if (cur_temp > temp_high_thresh) {
+		/* Disable 1GHz if it was enabled */
+		opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true);
+		dev_pm_opp_put(opp);
+		/* just error check */
+		if (!IS_ERR(opp))
+			ret = dev_pm_opp_disable(dev, 1000000000);
+		else
+			goto try_something_else;
+	 }
+
+5. OPP Data Retrieval Functions
+===============================
+Since OPP library abstracts away the OPP information, a set of functions to pull
+information from the OPP structure is necessary. Once an OPP pointer is
+retrieved using the search functions, the following functions can be used by SoC
+framework to retrieve the information represented inside the OPP layer.
+
+dev_pm_opp_get_voltage
+	Retrieve the voltage represented by the opp pointer.
+	Example: At a cpufreq transition to a different frequency, SoC
+	framework requires to set the voltage represented by the OPP using
+	the regulator framework to the Power Management chip providing the
+	voltage::
+
+	 soc_switch_to_freq_voltage(freq)
+	 {
+		/* do things */
+		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+		v = dev_pm_opp_get_voltage(opp);
+		dev_pm_opp_put(opp);
+		if (v)
+			regulator_set_voltage(.., v);
+		/* do other things */
+	 }
+
+dev_pm_opp_get_freq
+	Retrieve the freq represented by the opp pointer.
+	Example: Lets say the SoC framework uses a couple of helper functions
+	we could pass opp pointers instead of doing additional parameters to
+	handle quiet a bit of data parameters::
+
+	 soc_cpufreq_target(..)
+	 {
+		/* do things.. */
+		 max_freq = ULONG_MAX;
+		 max_opp = dev_pm_opp_find_freq_floor(dev,&max_freq);
+		 requested_opp = dev_pm_opp_find_freq_ceil(dev,&freq);
+		 if (!IS_ERR(max_opp) && !IS_ERR(requested_opp))
+			r = soc_test_validity(max_opp, requested_opp);
+		 dev_pm_opp_put(max_opp);
+		 dev_pm_opp_put(requested_opp);
+		/* do other things */
+	 }
+	 soc_test_validity(..)
+	 {
+		 if(dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp))
+			 return -EINVAL;
+		 if(dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp))
+			 return -EINVAL;
+		/* do things.. */
+	 }
+
+dev_pm_opp_get_opp_count
+	Retrieve the number of available opps for a device
+	Example: Lets say a co-processor in the SoC needs to know the available
+	frequencies in a table, the main processor can notify as following::
+
+	 soc_notify_coproc_available_frequencies()
+	 {
+		/* Do things */
+		num_available = dev_pm_opp_get_opp_count(dev);
+		speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL);
+		/* populate the table in increasing order */
+		freq = 0;
+		while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) {
+			speeds[i] = freq;
+			freq++;
+			i++;
+			dev_pm_opp_put(opp);
+		}
+
+		soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available);
+		/* Do other things */
+	 }
+
+6. Data Structures
+==================
+Typically an SoC contains multiple voltage domains which are variable. Each
+domain is represented by a device pointer. The relationship to OPP can be
+represented as follows::
+
+  SoC
+   |- device 1
+   |	|- opp 1 (availability, freq, voltage)
+   |	|- opp 2 ..
+   ...	...
+   |	`- opp n ..
+   |- device 2
+   ...
+   `- device m
+
+OPP library maintains a internal list that the SoC framework populates and
+accessed by various functions as described above. However, the structures
+representing the actual OPPs and domains are internal to the OPP library itself
+to allow for suitable abstraction reusable across systems.
+
+struct dev_pm_opp
+	The internal data structure of OPP library which is used to
+	represent an OPP. In addition to the freq, voltage, availability
+	information, it also contains internal book keeping information required
+	for the OPP library to operate on.  Pointer to this structure is
+	provided back to the users such as SoC framework to be used as a
+	identifier for OPP in the interactions with OPP layer.
+
+	WARNING:
+	  The struct dev_pm_opp pointer should not be parsed or modified by the
+	  users. The defaults of for an instance is populated by
+	  dev_pm_opp_add, but the availability of the OPP can be modified
+	  by dev_pm_opp_enable/disable functions.
+
+struct device
+	This is used to identify a domain to the OPP layer. The
+	nature of the device and it's implementation is left to the user of
+	OPP library such as the SoC framework.
+
+Overall, in a simplistic view, the data structure operations is represented as
+following::
+
+  Initialization / modification:
+              +-----+        /- dev_pm_opp_enable
+  dev_pm_opp_add --> | opp | <-------
+    |         +-----+        \- dev_pm_opp_disable
+    \-------> domain_info(device)
+
+  Search functions:
+               /-- dev_pm_opp_find_freq_ceil  ---\   +-----+
+  domain_info<---- dev_pm_opp_find_freq_exact -----> | opp |
+               \-- dev_pm_opp_find_freq_floor ---/   +-----+
+
+  Retrieval functions:
+  +-----+     /- dev_pm_opp_get_voltage
+  | opp | <---
+  +-----+     \- dev_pm_opp_get_freq
+
+  domain_info <- dev_pm_opp_get_opp_count
diff --git a/Documentation/power/opp.txt b/Documentation/power/opp.txt
deleted file mode 100644
index 0c007e250cd1..000000000000
--- a/Documentation/power/opp.txt
+++ /dev/null
@@ -1,342 +0,0 @@
-Operating Performance Points (OPP) Library
-==========================================
-
-(C) 2009-2010 Nishanth Menon <nm@ti.com>, Texas Instruments Incorporated
-
-Contents
---------
-1. Introduction
-2. Initial OPP List Registration
-3. OPP Search Functions
-4. OPP Availability Control Functions
-5. OPP Data Retrieval Functions
-6. Data Structures
-
-1. Introduction
-===============
-1.1 What is an Operating Performance Point (OPP)?
-
-Complex SoCs of today consists of a multiple sub-modules working in conjunction.
-In an operational system executing varied use cases, not all modules in the SoC
-need to function at their highest performing frequency all the time. To
-facilitate this, sub-modules in a SoC are grouped into domains, allowing some
-domains to run at lower voltage and frequency while other domains run at
-voltage/frequency pairs that are higher.
-
-The set of discrete tuples consisting of frequency and voltage pairs that
-the device will support per domain are called Operating Performance Points or
-OPPs.
-
-As an example:
-Let us consider an MPU device which supports the following:
-{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V},
-{1GHz at minimum voltage of 1.3V}
-
-We can represent these as three OPPs as the following {Hz, uV} tuples:
-{300000000, 1000000}
-{800000000, 1200000}
-{1000000000, 1300000}
-
-1.2 Operating Performance Points Library
-
-OPP library provides a set of helper functions to organize and query the OPP
-information. The library is located in drivers/base/power/opp.c and the header
-is located in include/linux/pm_opp.h. OPP library can be enabled by enabling
-CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on
-CONFIG_PM as certain SoCs such as Texas Instrument's OMAP framework allows to
-optionally boot at a certain OPP without needing cpufreq.
-
-Typical usage of the OPP library is as follows:
-(users)		-> registers a set of default OPPs		-> (library)
-SoC framework	-> modifies on required cases certain OPPs	-> OPP layer
-		-> queries to search/retrieve information	->
-
-OPP layer expects each domain to be represented by a unique device pointer. SoC
-framework registers a set of initial OPPs per device with the OPP layer. This
-list is expected to be an optimally small number typically around 5 per device.
-This initial list contains a set of OPPs that the framework expects to be safely
-enabled by default in the system.
-
-Note on OPP Availability:
-------------------------
-As the system proceeds to operate, SoC framework may choose to make certain
-OPPs available or not available on each device based on various external
-factors. Example usage: Thermal management or other exceptional situations where
-SoC framework might choose to disable a higher frequency OPP to safely continue
-operations until that OPP could be re-enabled if possible.
-
-OPP library facilitates this concept in it's implementation. The following
-operational functions operate only on available opps:
-opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count
-
-dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then
-be used for dev_pm_opp_enable/disable functions to make an opp available as required.
-
-WARNING: Users of OPP library should refresh their availability count using
-get_opp_count if dev_pm_opp_enable/disable functions are invoked for a device, the
-exact mechanism to trigger these or the notification mechanism to other
-dependent subsystems such as cpufreq are left to the discretion of the SoC
-specific framework which uses the OPP library. Similar care needs to be taken
-care to refresh the cpufreq table in cases of these operations.
-
-2. Initial OPP List Registration
-================================
-The SoC implementation calls dev_pm_opp_add function iteratively to add OPPs per
-device. It is expected that the SoC framework will register the OPP entries
-optimally- typical numbers range to be less than 5. The list generated by
-registering the OPPs is maintained by OPP library throughout the device
-operation. The SoC framework can subsequently control the availability of the
-OPPs dynamically using the dev_pm_opp_enable / disable functions.
-
-dev_pm_opp_add - Add a new OPP for a specific domain represented by the device pointer.
-	The OPP is defined using the frequency and voltage. Once added, the OPP
-	is assumed to be available and control of it's availability can be done
-	with the dev_pm_opp_enable/disable functions. OPP library internally stores
-	and manages this information in the opp struct. This function may be
-	used by SoC framework to define a optimal list as per the demands of
-	SoC usage environment.
-
-	WARNING: Do not use this function in interrupt context.
-
-	Example:
-	 soc_pm_init()
-	 {
-		/* Do things */
-		r = dev_pm_opp_add(mpu_dev, 1000000, 900000);
-		if (!r) {
-			pr_err("%s: unable to register mpu opp(%d)\n", r);
-			goto no_cpufreq;
-		}
-		/* Do cpufreq things */
-	 no_cpufreq:
-		/* Do remaining things */
-	 }
-
-3. OPP Search Functions
-=======================
-High level framework such as cpufreq operates on frequencies. To map the
-frequency back to the corresponding OPP, OPP library provides handy functions
-to search the OPP list that OPP library internally manages. These search
-functions return the matching pointer representing the opp if a match is
-found, else returns error. These errors are expected to be handled by standard
-error checks such as IS_ERR() and appropriate actions taken by the caller.
-
-Callers of these functions shall call dev_pm_opp_put() after they have used the
-OPP. Otherwise the memory for the OPP will never get freed and result in
-memleak.
-
-dev_pm_opp_find_freq_exact - Search for an OPP based on an *exact* frequency and
-	availability. This function is especially useful to enable an OPP which
-	is not available by default.
-	Example: In a case when SoC framework detects a situation where a
-	higher frequency could be made available, it can use this function to
-	find the OPP prior to call the dev_pm_opp_enable to actually make it available.
-	 opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
-	 dev_pm_opp_put(opp);
-	 /* dont operate on the pointer.. just do a sanity check.. */
-	 if (IS_ERR(opp)) {
-		pr_err("frequency not disabled!\n");
-		/* trigger appropriate actions.. */
-	 } else {
-		dev_pm_opp_enable(dev,1000000000);
-	 }
-
-	NOTE: This is the only search function that operates on OPPs which are
-	not available.
-
-dev_pm_opp_find_freq_floor - Search for an available OPP which is *at most* the
-	provided frequency. This function is useful while searching for a lesser
-	match OR operating on OPP information in the order of decreasing
-	frequency.
-	Example: To find the highest opp for a device:
-	 freq = ULONG_MAX;
-	 opp = dev_pm_opp_find_freq_floor(dev, &freq);
-	 dev_pm_opp_put(opp);
-
-dev_pm_opp_find_freq_ceil - Search for an available OPP which is *at least* the
-	provided frequency. This function is useful while searching for a
-	higher match OR operating on OPP information in the order of increasing
-	frequency.
-	Example 1: To find the lowest opp for a device:
-	 freq = 0;
-	 opp = dev_pm_opp_find_freq_ceil(dev, &freq);
-	 dev_pm_opp_put(opp);
-	Example 2: A simplified implementation of a SoC cpufreq_driver->target:
-	 soc_cpufreq_target(..)
-	 {
-		/* Do stuff like policy checks etc. */
-		/* Find the best frequency match for the req */
-		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
-		dev_pm_opp_put(opp);
-		if (!IS_ERR(opp))
-			soc_switch_to_freq_voltage(freq);
-		else
-			/* do something when we can't satisfy the req */
-		/* do other stuff */
-	 }
-
-4. OPP Availability Control Functions
-=====================================
-A default OPP list registered with the OPP library may not cater to all possible
-situation. The OPP library provides a set of functions to modify the
-availability of a OPP within the OPP list. This allows SoC frameworks to have
-fine grained dynamic control of which sets of OPPs are operationally available.
-These functions are intended to *temporarily* remove an OPP in conditions such
-as thermal considerations (e.g. don't use OPPx until the temperature drops).
-
-WARNING: Do not use these functions in interrupt context.
-
-dev_pm_opp_enable - Make a OPP available for operation.
-	Example: Lets say that 1GHz OPP is to be made available only if the
-	SoC temperature is lower than a certain threshold. The SoC framework
-	implementation might choose to do something as follows:
-	 if (cur_temp < temp_low_thresh) {
-		/* Enable 1GHz if it was disabled */
-		opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
-		dev_pm_opp_put(opp);
-		/* just error check */
-		if (!IS_ERR(opp))
-			ret = dev_pm_opp_enable(dev, 1000000000);
-		else
-			goto try_something_else;
-	 }
-
-dev_pm_opp_disable - Make an OPP to be not available for operation
-	Example: Lets say that 1GHz OPP is to be disabled if the temperature
-	exceeds a threshold value. The SoC framework implementation might
-	choose to do something as follows:
-	 if (cur_temp > temp_high_thresh) {
-		/* Disable 1GHz if it was enabled */
-		opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true);
-		dev_pm_opp_put(opp);
-		/* just error check */
-		if (!IS_ERR(opp))
-			ret = dev_pm_opp_disable(dev, 1000000000);
-		else
-			goto try_something_else;
-	 }
-
-5. OPP Data Retrieval Functions
-===============================
-Since OPP library abstracts away the OPP information, a set of functions to pull
-information from the OPP structure is necessary. Once an OPP pointer is
-retrieved using the search functions, the following functions can be used by SoC
-framework to retrieve the information represented inside the OPP layer.
-
-dev_pm_opp_get_voltage - Retrieve the voltage represented by the opp pointer.
-	Example: At a cpufreq transition to a different frequency, SoC
-	framework requires to set the voltage represented by the OPP using
-	the regulator framework to the Power Management chip providing the
-	voltage.
-	 soc_switch_to_freq_voltage(freq)
-	 {
-		/* do things */
-		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
-		v = dev_pm_opp_get_voltage(opp);
-		dev_pm_opp_put(opp);
-		if (v)
-			regulator_set_voltage(.., v);
-		/* do other things */
-	 }
-
-dev_pm_opp_get_freq - Retrieve the freq represented by the opp pointer.
-	Example: Lets say the SoC framework uses a couple of helper functions
-	we could pass opp pointers instead of doing additional parameters to
-	handle quiet a bit of data parameters.
-	 soc_cpufreq_target(..)
-	 {
-		/* do things.. */
-		 max_freq = ULONG_MAX;
-		 max_opp = dev_pm_opp_find_freq_floor(dev,&max_freq);
-		 requested_opp = dev_pm_opp_find_freq_ceil(dev,&freq);
-		 if (!IS_ERR(max_opp) && !IS_ERR(requested_opp))
-			r = soc_test_validity(max_opp, requested_opp);
-		 dev_pm_opp_put(max_opp);
-		 dev_pm_opp_put(requested_opp);
-		/* do other things */
-	 }
-	 soc_test_validity(..)
-	 {
-		 if(dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp))
-			 return -EINVAL;
-		 if(dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp))
-			 return -EINVAL;
-		/* do things.. */
-	 }
-
-dev_pm_opp_get_opp_count - Retrieve the number of available opps for a device
-	Example: Lets say a co-processor in the SoC needs to know the available
-	frequencies in a table, the main processor can notify as following:
-	 soc_notify_coproc_available_frequencies()
-	 {
-		/* Do things */
-		num_available = dev_pm_opp_get_opp_count(dev);
-		speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL);
-		/* populate the table in increasing order */
-		freq = 0;
-		while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) {
-			speeds[i] = freq;
-			freq++;
-			i++;
-			dev_pm_opp_put(opp);
-		}
-
-		soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available);
-		/* Do other things */
-	 }
-
-6. Data Structures
-==================
-Typically an SoC contains multiple voltage domains which are variable. Each
-domain is represented by a device pointer. The relationship to OPP can be
-represented as follows:
-SoC
- |- device 1
- |	|- opp 1 (availability, freq, voltage)
- |	|- opp 2 ..
- ...	...
- |	`- opp n ..
- |- device 2
- ...
- `- device m
-
-OPP library maintains a internal list that the SoC framework populates and
-accessed by various functions as described above. However, the structures
-representing the actual OPPs and domains are internal to the OPP library itself
-to allow for suitable abstraction reusable across systems.
-
-struct dev_pm_opp - The internal data structure of OPP library which is used to
-	represent an OPP. In addition to the freq, voltage, availability
-	information, it also contains internal book keeping information required
-	for the OPP library to operate on.  Pointer to this structure is
-	provided back to the users such as SoC framework to be used as a
-	identifier for OPP in the interactions with OPP layer.
-
-	WARNING: The struct dev_pm_opp pointer should not be parsed or modified by the
-	users. The defaults of for an instance is populated by dev_pm_opp_add, but the
-	availability of the OPP can be modified by dev_pm_opp_enable/disable functions.
-
-struct device - This is used to identify a domain to the OPP layer. The
-	nature of the device and it's implementation is left to the user of
-	OPP library such as the SoC framework.
-
-Overall, in a simplistic view, the data structure operations is represented as
-following:
-
-Initialization / modification:
-            +-----+        /- dev_pm_opp_enable
-dev_pm_opp_add --> | opp | <-------
-  |         +-----+        \- dev_pm_opp_disable
-  \-------> domain_info(device)
-
-Search functions:
-             /-- dev_pm_opp_find_freq_ceil  ---\   +-----+
-domain_info<---- dev_pm_opp_find_freq_exact -----> | opp |
-             \-- dev_pm_opp_find_freq_floor ---/   +-----+
-
-Retrieval functions:
-+-----+     /- dev_pm_opp_get_voltage
-| opp | <---
-+-----+     \- dev_pm_opp_get_freq
-
-domain_info <- dev_pm_opp_get_opp_count
diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst
new file mode 100644
index 000000000000..0e2ef7429304
--- /dev/null
+++ b/Documentation/power/pci.rst
@@ -0,0 +1,1135 @@
+====================
+PCI Power Management
+====================
+
+Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+
+An overview of concepts and the Linux kernel's interfaces related to PCI power
+management.  Based on previous work by Patrick Mochel <mochel@transmeta.com>
+(and others).
+
+This document only covers the aspects of power management specific to PCI
+devices.  For general description of the kernel's interfaces related to device
+power management refer to Documentation/driver-api/pm/devices.rst and
+Documentation/power/runtime_pm.rst.
+
+.. contents:
+
+   1. Hardware and Platform Support for PCI Power Management
+   2. PCI Subsystem and Device Power Management
+   3. PCI Device Drivers and Power Management
+   4. Resources
+
+
+1. Hardware and Platform Support for PCI Power Management
+=========================================================
+
+1.1. Native and Platform-Based Power Management
+-----------------------------------------------
+
+In general, power management is a feature allowing one to save energy by putting
+devices into states in which they draw less power (low-power states) at the
+price of reduced functionality or performance.
+
+Usually, a device is put into a low-power state when it is underutilized or
+completely inactive.  However, when it is necessary to use the device once
+again, it has to be put back into the "fully functional" state (full-power
+state).  This may happen when there are some data for the device to handle or
+as a result of an external event requiring the device to be active, which may
+be signaled by the device itself.
+
+PCI devices may be put into low-power states in two ways, by using the device
+capabilities introduced by the PCI Bus Power Management Interface Specification,
+or with the help of platform firmware, such as an ACPI BIOS.  In the first
+approach, that is referred to as the native PCI power management (native PCI PM)
+in what follows, the device power state is changed as a result of writing a
+specific value into one of its standard configuration registers.  The second
+approach requires the platform firmware to provide special methods that may be
+used by the kernel to change the device's power state.
+
+Devices supporting the native PCI PM usually can generate wakeup signals called
+Power Management Events (PMEs) to let the kernel know about external events
+requiring the device to be active.  After receiving a PME the kernel is supposed
+to put the device that sent it into the full-power state.  However, the PCI Bus
+Power Management Interface Specification doesn't define any standard method of
+delivering the PME from the device to the CPU and the operating system kernel.
+It is assumed that the platform firmware will perform this task and therefore,
+even though a PCI device is set up to generate PMEs, it also may be necessary to
+prepare the platform firmware for notifying the CPU of the PMEs coming from the
+device (e.g. by generating interrupts).
+
+In turn, if the methods provided by the platform firmware are used for changing
+the power state of a device, usually the platform also provides a method for
+preparing the device to generate wakeup signals.  In that case, however, it
+often also is necessary to prepare the device for generating PMEs using the
+native PCI PM mechanism, because the method provided by the platform depends on
+that.
+
+Thus in many situations both the native and the platform-based power management
+mechanisms have to be used simultaneously to obtain the desired result.
+
+1.2. Native PCI Power Management
+--------------------------------
+
+The PCI Bus Power Management Interface Specification (PCI PM Spec) was
+introduced between the PCI 2.1 and PCI 2.2 Specifications.  It defined a
+standard interface for performing various operations related to power
+management.
+
+The implementation of the PCI PM Spec is optional for conventional PCI devices,
+but it is mandatory for PCI Express devices.  If a device supports the PCI PM
+Spec, it has an 8 byte power management capability field in its PCI
+configuration space.  This field is used to describe and control the standard
+features related to the native PCI power management.
+
+The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses
+(B0-B3).  The higher the number, the less power is drawn by the device or bus
+in that state.  However, the higher the number, the longer the latency for
+the device or bus to return to the full-power state (D0 or B0, respectively).
+
+There are two variants of the D3 state defined by the specification.  The first
+one is D3hot, referred to as the software accessible D3, because devices can be
+programmed to go into it.  The second one, D3cold, is the state that PCI devices
+are in when the supply voltage (Vcc) is removed from them.  It is not possible
+to program a PCI device to go into D3cold, although there may be a programmable
+interface for putting the bus the device is on into a state in which Vcc is
+removed from all devices on the bus.
+
+PCI bus power management, however, is not supported by the Linux kernel at the
+time of this writing and therefore it is not covered by this document.
+
+Note that every PCI device can be in the full-power state (D0) or in D3cold,
+regardless of whether or not it implements the PCI PM Spec.  In addition to
+that, if the PCI PM Spec is implemented by the device, it must support D3hot
+as well as D0.  The support for the D1 and D2 power states is optional.
+
+PCI devices supporting the PCI PM Spec can be programmed to go to any of the
+supported low-power states (except for D3cold).  While in D1-D3hot the
+standard configuration registers of the device must be accessible to software
+(i.e. the device is required to respond to PCI configuration accesses), although
+its I/O and memory spaces are then disabled.  This allows the device to be
+programmatically put into D0.  Thus the kernel can switch the device back and
+forth between D0 and the supported low-power states (except for D3cold) and the
+possible power state transitions the device can undergo are the following:
+
++----------------------------+
+| Current State | New State  |
++----------------------------+
+| D0            | D1, D2, D3 |
++----------------------------+
+| D1            | D2, D3     |
++----------------------------+
+| D2            | D3         |
++----------------------------+
+| D1, D2, D3    | D0         |
++----------------------------+
+
+The transition from D3cold to D0 occurs when the supply voltage is provided to
+the device (i.e. power is restored).  In that case the device returns to D0 with
+a full power-on reset sequence and the power-on defaults are restored to the
+device by hardware just as at initial power up.
+
+PCI devices supporting the PCI PM Spec can be programmed to generate PMEs
+while in a low-power state (D1-D3), but they are not required to be capable
+of generating PMEs from all supported low-power states.  In particular, the
+capability of generating PMEs from D3cold is optional and depends on the
+presence of additional voltage (3.3Vaux) allowing the device to remain
+sufficiently active to generate a wakeup signal.
+
+1.3. ACPI Device Power Management
+---------------------------------
+
+The platform firmware support for the power management of PCI devices is
+system-specific.  However, if the system in question is compliant with the
+Advanced Configuration and Power Interface (ACPI) Specification, like the
+majority of x86-based systems, it is supposed to implement device power
+management interfaces defined by the ACPI standard.
+
+For this purpose the ACPI BIOS provides special functions called "control
+methods" that may be executed by the kernel to perform specific tasks, such as
+putting a device into a low-power state.  These control methods are encoded
+using special byte-code language called the ACPI Machine Language (AML) and
+stored in the machine's BIOS.  The kernel loads them from the BIOS and executes
+them as needed using an AML interpreter that translates the AML byte code into
+computations and memory or I/O space accesses.  This way, in theory, a BIOS
+writer can provide the kernel with a means to perform actions depending
+on the system design in a system-specific fashion.
+
+ACPI control methods may be divided into global control methods, that are not
+associated with any particular devices, and device control methods, that have
+to be defined separately for each device supposed to be handled with the help of
+the platform.  This means, in particular, that ACPI device control methods can
+only be used to handle devices that the BIOS writer knew about in advance.  The
+ACPI methods used for device power management fall into that category.
+
+The ACPI specification assumes that devices can be in one of four power states
+labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM
+D0-D3 states (although the difference between D3hot and D3cold is not taken
+into account by ACPI).  Moreover, for each power state of a device there is a
+set of power resources that have to be enabled for the device to be put into
+that state.  These power resources are controlled (i.e. enabled or disabled)
+with the help of their own control methods, _ON and _OFF, that have to be
+defined individually for each of them.
+
+To put a device into the ACPI power state Dx (where x is a number between 0 and
+3 inclusive) the kernel is supposed to (1) enable the power resources required
+by the device in this state using their _ON control methods and (2) execute the
+_PSx control method defined for the device.  In addition to that, if the device
+is going to be put into a low-power state (D1-D3) and is supposed to generate
+wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI
+3.0) control method defined for it has to be executed before _PSx.  Power
+resources that are not required by the device in the target power state and are
+not required any more by any other device should be disabled (by executing their
+_OFF control methods).  If the current power state of the device is D3, it can
+only be put into D0 this way.
+
+However, quite often the power states of devices are changed during a
+system-wide transition into a sleep state or back into the working state.  ACPI
+defines four system sleep states, S1, S2, S3, and S4, and denotes the system
+working state as S0.  In general, the target system sleep (or working) state
+determines the highest power (lowest number) state the device can be put
+into and the kernel is supposed to obtain this information by executing the
+device's _SxD control method (where x is a number between 0 and 4 inclusive).
+If the device is required to wake up the system from the target sleep state, the
+lowest power (highest number) state it can be put into is also determined by the
+target state of the system.  The kernel is then supposed to use the device's
+_SxW control method to obtain the number of that state.  It also is supposed to
+use the device's _PRW control method to learn which power resources need to be
+enabled for the device to be able to generate wakeup signals.
+
+1.4. Wakeup Signaling
+---------------------
+
+Wakeup signals generated by PCI devices, either as native PCI PMEs, or as
+a result of the execution of the _DSW (or _PSW) ACPI control method before
+putting the device into a low-power state, have to be caught and handled as
+appropriate.  If they are sent while the system is in the working state
+(ACPI S0), they should be translated into interrupts so that the kernel can
+put the devices generating them into the full-power state and take care of the
+events that triggered them.  In turn, if they are sent while the system is
+sleeping, they should cause the system's core logic to trigger wakeup.
+
+On ACPI-based systems wakeup signals sent by conventional PCI devices are
+converted into ACPI General-Purpose Events (GPEs) which are hardware signals
+from the system core logic generated in response to various events that need to
+be acted upon.  Every GPE is associated with one or more sources of potentially
+interesting events.  In particular, a GPE may be associated with a PCI device
+capable of signaling wakeup.  The information on the connections between GPEs
+and event sources is recorded in the system's ACPI BIOS from where it can be
+read by the kernel.
+
+If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE
+associated with it (if there is one) is triggered.  The GPEs associated with PCI
+bridges may also be triggered in response to a wakeup signal from one of the
+devices below the bridge (this also is the case for root bridges) and, for
+example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be
+handled this way.
+
+A GPE may be triggered when the system is sleeping (i.e. when it is in one of
+the ACPI S1-S4 states), in which case system wakeup is started by its core logic
+(the device that was the source of the signal causing the system wakeup to occur
+may be identified later).  The GPEs used in such situations are referred to as
+wakeup GPEs.
+
+Usually, however, GPEs are also triggered when the system is in the working
+state (ACPI S0) and in that case the system's core logic generates a System
+Control Interrupt (SCI) to notify the kernel of the event.  Then, the SCI
+handler identifies the GPE that caused the interrupt to be generated which,
+in turn, allows the kernel to identify the source of the event (that may be
+a PCI device signaling wakeup).  The GPEs used for notifying the kernel of
+events occurring while the system is in the working state are referred to as
+runtime GPEs.
+
+Unfortunately, there is no standard way of handling wakeup signals sent by
+conventional PCI devices on systems that are not ACPI-based, but there is one
+for PCI Express devices.  Namely, the PCI Express Base Specification introduced
+a native mechanism for converting native PCI PMEs into interrupts generated by
+root ports.  For conventional PCI devices native PMEs are out-of-band, so they
+are routed separately and they need not pass through bridges (in principle they
+may be routed directly to the system's core logic), but for PCI Express devices
+they are in-band messages that have to pass through the PCI Express hierarchy,
+including the root port on the path from the device to the Root Complex.  Thus
+it was possible to introduce a mechanism by which a root port generates an
+interrupt whenever it receives a PME message from one of the devices below it.
+The PCI Express Requester ID of the device that sent the PME message is then
+recorded in one of the root port's configuration registers from where it may be
+read by the interrupt handler allowing the device to be identified.  [PME
+messages sent by PCI Express endpoints integrated with the Root Complex don't
+pass through root ports, but instead they cause a Root Complex Event Collector
+(if there is one) to generate interrupts.]
+
+In principle the native PCI Express PME signaling may also be used on ACPI-based
+systems along with the GPEs, but to use it the kernel has to ask the system's
+ACPI BIOS to release control of root port configuration registers.  The ACPI
+BIOS, however, is not required to allow the kernel to control these registers
+and if it doesn't do that, the kernel must not modify their contents.  Of course
+the native PCI Express PME signaling cannot be used by the kernel in that case.
+
+
+2. PCI Subsystem and Device Power Management
+============================================
+
+2.1. Device Power Management Callbacks
+--------------------------------------
+
+The PCI Subsystem participates in the power management of PCI devices in a
+number of ways.  First of all, it provides an intermediate code layer between
+the device power management core (PM core) and PCI device drivers.
+Specifically, the pm field of the PCI subsystem's struct bus_type object,
+pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing
+pointers to several device power management callbacks::
+
+  const struct dev_pm_ops pci_dev_pm_ops = {
+	.prepare = pci_pm_prepare,
+	.complete = pci_pm_complete,
+	.suspend = pci_pm_suspend,
+	.resume = pci_pm_resume,
+	.freeze = pci_pm_freeze,
+	.thaw = pci_pm_thaw,
+	.poweroff = pci_pm_poweroff,
+	.restore = pci_pm_restore,
+	.suspend_noirq = pci_pm_suspend_noirq,
+	.resume_noirq = pci_pm_resume_noirq,
+	.freeze_noirq = pci_pm_freeze_noirq,
+	.thaw_noirq = pci_pm_thaw_noirq,
+	.poweroff_noirq = pci_pm_poweroff_noirq,
+	.restore_noirq = pci_pm_restore_noirq,
+	.runtime_suspend = pci_pm_runtime_suspend,
+	.runtime_resume = pci_pm_runtime_resume,
+	.runtime_idle = pci_pm_runtime_idle,
+  };
+
+These callbacks are executed by the PM core in various situations related to
+device power management and they, in turn, execute power management callbacks
+provided by PCI device drivers.  They also perform power management operations
+involving some standard configuration registers of PCI devices that device
+drivers need not know or care about.
+
+The structure representing a PCI device, struct pci_dev, contains several fields
+that these callbacks operate on::
+
+  struct pci_dev {
+	...
+	pci_power_t     current_state;  /* Current operating state. */
+	int		pm_cap;		/* PM capability offset in the
+					   configuration space */
+	unsigned int	pme_support:5;	/* Bitmask of states from which PME#
+					   can be generated */
+	unsigned int	pme_interrupt:1;/* Is native PCIe PME signaling used? */
+	unsigned int	d1_support:1;	/* Low power state D1 is supported */
+	unsigned int	d2_support:1;	/* Low power state D2 is supported */
+	unsigned int	no_d1d2:1;	/* D1 and D2 are forbidden */
+	unsigned int	wakeup_prepared:1;  /* Device prepared for wake up */
+	unsigned int	d3_delay;	/* D3->D0 transition time in ms */
+	...
+  };
+
+They also indirectly use some fields of the struct device that is embedded in
+struct pci_dev.
+
+2.2. Device Initialization
+--------------------------
+
+The PCI subsystem's first task related to device power management is to
+prepare the device for power management and initialize the fields of struct
+pci_dev used for this purpose.  This happens in two functions defined in
+drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init().
+
+The first of these functions checks if the device supports native PCI PM
+and if that's the case the offset of its power management capability structure
+in the configuration space is stored in the pm_cap field of the device's struct
+pci_dev object.  Next, the function checks which PCI low-power states are
+supported by the device and from which low-power states the device can generate
+native PCI PMEs.  The power management fields of the device's struct pci_dev and
+the struct device embedded in it are updated accordingly and the generation of
+PMEs by the device is disabled.
+
+The second function checks if the device can be prepared to signal wakeup with
+the help of the platform firmware, such as the ACPI BIOS.  If that is the case,
+the function updates the wakeup fields in struct device embedded in the
+device's struct pci_dev and uses the firmware-provided method to prevent the
+device from signaling wakeup.
+
+At this point the device is ready for power management.  For driverless devices,
+however, this functionality is limited to a few basic operations carried out
+during system-wide transitions to a sleep state and back to the working state.
+
+2.3. Runtime Device Power Management
+------------------------------------
+
+The PCI subsystem plays a vital role in the runtime power management of PCI
+devices.  For this purpose it uses the general runtime power management
+(runtime PM) framework described in Documentation/power/runtime_pm.rst.
+Namely, it provides subsystem-level callbacks::
+
+	pci_pm_runtime_suspend()
+	pci_pm_runtime_resume()
+	pci_pm_runtime_idle()
+
+that are executed by the core runtime PM routines.  It also implements the
+entire mechanics necessary for handling runtime wakeup signals from PCI devices
+in low-power states, which at the time of this writing works for both the native
+PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in
+Section 1.
+
+First, a PCI device is put into a low-power state, or suspended, with the help
+of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call
+pci_pm_runtime_suspend() to do the actual job.  For this to work, the device's
+driver has to provide a pm->runtime_suspend() callback (see below), which is
+run by pci_pm_runtime_suspend() as the first action.  If the driver's callback
+returns successfully, the device's standard configuration registers are saved,
+the device is prepared to generate wakeup signals and, finally, it is put into
+the target low-power state.
+
+The low-power state to put the device into is the lowest-power (highest number)
+state from which it can signal wakeup.  The exact method of signaling wakeup is
+system-dependent and is determined by the PCI subsystem on the basis of the
+reported capabilities of the device and the platform firmware.  To prepare the
+device for signaling wakeup and put it into the selected low-power state, the
+PCI subsystem can use the platform firmware as well as the device's native PCI
+PM capabilities, if supported.
+
+It is expected that the device driver's pm->runtime_suspend() callback will
+not attempt to prepare the device for signaling wakeup or to put it into a
+low-power state.  The driver ought to leave these tasks to the PCI subsystem
+that has all of the information necessary to perform them.
+
+A suspended device is brought back into the "active" state, or resumed,
+with the help of pm_request_resume() or pm_runtime_resume() which both call
+pci_pm_runtime_resume() for PCI devices.  Again, this only works if the device's
+driver provides a pm->runtime_resume() callback (see below).  However, before
+the driver's callback is executed, pci_pm_runtime_resume() brings the device
+back into the full-power state, prevents it from signaling wakeup while in that
+state and restores its standard configuration registers.  Thus the driver's
+callback need not worry about the PCI-specific aspects of the device resume.
+
+Note that generally pci_pm_runtime_resume() may be called in two different
+situations.  First, it may be called at the request of the device's driver, for
+example if there are some data for it to process.  Second, it may be called
+as a result of a wakeup signal from the device itself (this sometimes is
+referred to as "remote wakeup").  Of course, for this purpose the wakeup signal
+is handled in one of the ways described in Section 1 and finally converted into
+a notification for the PCI subsystem after the source device has been
+identified.
+
+The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
+and pm_request_idle(), executes the device driver's pm->runtime_idle()
+callback, if defined, and if that callback doesn't return error code (or is not
+present at all), suspends the device with the help of pm_runtime_suspend().
+Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
+example, it is called right after the device has just been resumed), in which
+cases it is expected to suspend the device if that makes sense.  Usually,
+however, the PCI subsystem doesn't really know if the device really can be
+suspended, so it lets the device's driver decide by running its
+pm->runtime_idle() callback.
+
+2.4. System-Wide Power Transitions
+----------------------------------
+There are a few different types of system-wide power transitions, described in
+Documentation/driver-api/pm/devices.rst.  Each of them requires devices to be handled
+in a specific way and the PM core executes subsystem-level power management
+callbacks for this purpose.  They are executed in phases such that each phase
+involves executing the same subsystem-level callback for every device belonging
+to the given subsystem before the next phase begins.  These phases always run
+after tasks have been frozen.
+
+2.4.1. System Suspend
+^^^^^^^^^^^^^^^^^^^^^
+
+When the system is going into a sleep state in which the contents of memory will
+be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
+
+	prepare, suspend, suspend_noirq.
+
+The following PCI bus type's callbacks, respectively, are used in these phases::
+
+	pci_pm_prepare()
+	pci_pm_suspend()
+	pci_pm_suspend_noirq()
+
+The pci_pm_prepare() routine first puts the device into the "fully functional"
+state with the help of pm_runtime_resume().  Then, it executes the device
+driver's pm->prepare() callback if defined (i.e. if the driver's struct
+dev_pm_ops object is present and the prepare pointer in that object is valid).
+
+The pci_pm_suspend() routine first checks if the device's driver implements
+legacy PCI suspend routines (see Section 3), in which case the driver's legacy
+suspend callback is executed, if present, and its result is returned.  Next, if
+the device's driver doesn't provide a struct dev_pm_ops object (containing
+pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
+simply turns off the device's bus master capability and runs
+pcibios_disable_device() to disable it, unless the device is a bridge (PCI
+bridges are ignored by this routine).  Next, the device driver's pm->suspend()
+callback is executed, if defined, and its result is returned if it fails.
+Finally, pci_fixup_device() is called to apply hardware suspend quirks related
+to the device if necessary.
+
+Note that the suspend phase is carried out asynchronously for PCI devices, so
+the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
+devices that don't depend on each other in a known way (i.e. none of the paths
+in the device tree from the root bridge to a leaf device contains both of them).
+
+The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
+been called, which means that the device driver's interrupt handler won't be
+invoked while this routine is running.  It first checks if the device's driver
+implements legacy PCI suspends routines (Section 3), in which case the legacy
+late suspend routine is called and its result is returned (the standard
+configuration registers of the device are saved if the driver's callback hasn't
+done that).  Second, if the device driver's struct dev_pm_ops object is not
+present, the device's standard configuration registers are saved and the routine
+returns success.  Otherwise the device driver's pm->suspend_noirq() callback is
+executed, if present, and its result is returned if it fails.  Next, if the
+device's standard configuration registers haven't been saved yet (one of the
+device driver's callbacks executed before might do that), pci_pm_suspend_noirq()
+saves them, prepares the device to signal wakeup (if necessary) and puts it into
+a low-power state.
+
+The low-power state to put the device into is the lowest-power (highest number)
+state from which it can signal wakeup while the system is in the target sleep
+state.  Just like in the runtime PM case described above, the mechanism of
+signaling wakeup is system-dependent and determined by the PCI subsystem, which
+is also responsible for preparing the device to signal wakeup from the system's
+target sleep state as appropriate.
+
+PCI device drivers (that don't implement legacy power management callbacks) are
+generally not expected to prepare devices for signaling wakeup or to put them
+into low-power states.  However, if one of the driver's suspend callbacks
+(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
+registers, pci_pm_suspend_noirq() will assume that the device has been prepared
+to signal wakeup and put into a low-power state by the driver (the driver is
+then assumed to have used the helper functions provided by the PCI subsystem for
+this purpose).  PCI device drivers are not encouraged to do that, but in some
+rare cases doing that in the driver may be the optimum approach.
+
+2.4.2. System Resume
+^^^^^^^^^^^^^^^^^^^^
+
+When the system is undergoing a transition from a sleep state in which the
+contents of memory have been preserved, such as one of the ACPI sleep states
+S1-S3, into the working state (ACPI S0), the phases are:
+
+	resume_noirq, resume, complete.
+
+The following PCI bus type's callbacks, respectively, are executed in these
+phases::
+
+	pci_pm_resume_noirq()
+	pci_pm_resume()
+	pci_pm_complete()
+
+The pci_pm_resume_noirq() routine first puts the device into the full-power
+state, restores its standard configuration registers and applies early resume
+hardware quirks related to the device, if necessary.  This is done
+unconditionally, regardless of whether or not the device's driver implements
+legacy PCI power management callbacks (this way all PCI devices are in the
+full-power state and their standard configuration registers have been restored
+when their interrupt handlers are invoked for the first time during resume,
+which allows the kernel to avoid problems with the handling of shared interrupts
+by drivers whose devices are still suspended).  If legacy PCI power management
+callbacks (see Section 3) are implemented by the device's driver, the legacy
+early resume callback is executed and its result is returned.  Otherwise, the
+device driver's pm->resume_noirq() callback is executed, if defined, and its
+result is returned.
+
+The pci_pm_resume() routine first checks if the device's standard configuration
+registers have been restored and restores them if that's not the case (this
+only is necessary in the error path during a failing suspend).  Next, resume
+hardware quirks related to the device are applied, if necessary, and if the
+device's driver implements legacy PCI power management callbacks (see
+Section 3), the driver's legacy resume callback is executed and its result is
+returned.  Otherwise, the device's wakeup signaling mechanisms are blocked and
+its driver's pm->resume() callback is executed, if defined (the callback's
+result is then returned).
+
+The resume phase is carried out asynchronously for PCI devices, like the
+suspend phase described above, which means that if two PCI devices don't depend
+on each other in a known way, the pci_pm_resume() routine may be executed for
+the both of them in parallel.
+
+The pci_pm_complete() routine only executes the device driver's pm->complete()
+callback, if defined.
+
+2.4.3. System Hibernation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+System hibernation is more complicated than system suspend, because it requires
+a system image to be created and written into a persistent storage medium.  The
+image is created atomically and all devices are quiesced, or frozen, before that
+happens.
+
+The freezing of devices is carried out after enough memory has been freed (at
+the time of this writing the image creation requires at least 50% of system RAM
+to be free) in the following three phases:
+
+	prepare, freeze, freeze_noirq
+
+that correspond to the PCI bus type's callbacks::
+
+	pci_pm_prepare()
+	pci_pm_freeze()
+	pci_pm_freeze_noirq()
+
+This means that the prepare phase is exactly the same as for system suspend.
+The other two phases, however, are different.
+
+The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
+the device driver's pm->freeze() callback, if defined, instead of pm->suspend(),
+and it doesn't apply the suspend-related hardware quirks.  It is executed
+asynchronously for different PCI devices that don't depend on each other in a
+known way.
+
+The pci_pm_freeze_noirq() routine, in turn, is similar to
+pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
+routine instead of pm->suspend_noirq().  It also doesn't attempt to prepare the
+device for signaling wakeup and put it into a low-power state.  Still, it saves
+the device's standard configuration registers if they haven't been saved by one
+of the driver's callbacks.
+
+Once the image has been created, it has to be saved.  However, at this point all
+devices are frozen and they cannot handle I/O, while their ability to handle
+I/O is obviously necessary for the image saving.  Thus they have to be brought
+back to the fully functional state and this is done in the following phases:
+
+	thaw_noirq, thaw, complete
+
+using the following PCI bus type's callbacks::
+
+	pci_pm_thaw_noirq()
+	pci_pm_thaw()
+	pci_pm_complete()
+
+respectively.
+
+The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
+but it doesn't put the device into the full power state and doesn't attempt to
+restore its standard configuration registers.  It also executes the device
+driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
+
+The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
+driver's pm->thaw() callback instead of pm->resume().  It is executed
+asynchronously for different PCI devices that don't depend on each other in a
+known way.
+
+The complete phase it the same as for system resume.
+
+After saving the image, devices need to be powered down before the system can
+enter the target sleep state (ACPI S4 for ACPI-based systems).  This is done in
+three phases:
+
+	prepare, poweroff, poweroff_noirq
+
+where the prepare phase is exactly the same as for system suspend.  The other
+two phases are analogous to the suspend and suspend_noirq phases, respectively.
+The PCI subsystem-level callbacks they correspond to::
+
+	pci_pm_poweroff()
+	pci_pm_poweroff_noirq()
+
+work in analogy with pci_pm_suspend() and pci_pm_poweroff_noirq(), respectively,
+although they don't attempt to save the device's standard configuration
+registers.
+
+2.4.4. System Restore
+^^^^^^^^^^^^^^^^^^^^^
+
+System restore requires a hibernation image to be loaded into memory and the
+pre-hibernation memory contents to be restored before the pre-hibernation system
+activity can be resumed.
+
+As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded
+into memory by a fresh instance of the kernel, called the boot kernel, which in
+turn is loaded and run by a boot loader in the usual way.  After the boot kernel
+has loaded the image, it needs to replace its own code and data with the code
+and data of the "hibernated" kernel stored within the image, called the image
+kernel.  For this purpose all devices are frozen just like before creating
+the image during hibernation, in the
+
+	prepare, freeze, freeze_noirq
+
+phases described above.  However, the devices affected by these phases are only
+those having drivers in the boot kernel; other devices will still be in whatever
+state the boot loader left them.
+
+Should the restoration of the pre-hibernation memory contents fail, the boot
+kernel would go through the "thawing" procedure described above, using the
+thaw_noirq, thaw, and complete phases (that will only affect the devices having
+drivers in the boot kernel), and then continue running normally.
+
+If the pre-hibernation memory contents are restored successfully, which is the
+usual situation, control is passed to the image kernel, which then becomes
+responsible for bringing the system back to the working state.  To achieve this,
+it must restore the devices' pre-hibernation functionality, which is done much
+like waking up from the memory sleep state, although it involves different
+phases:
+
+	restore_noirq, restore, complete
+
+The first two of these are analogous to the resume_noirq and resume phases
+described above, respectively, and correspond to the following PCI subsystem
+callbacks::
+
+	pci_pm_restore_noirq()
+	pci_pm_restore()
+
+These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(),
+respectively, but they execute the device driver's pm->restore_noirq() and
+pm->restore() callbacks, if available.
+
+The complete phase is carried out in exactly the same way as during system
+resume.
+
+
+3. PCI Device Drivers and Power Management
+==========================================
+
+3.1. Power Management Callbacks
+-------------------------------
+
+PCI device drivers participate in power management by providing callbacks to be
+executed by the PCI subsystem's power management routines described above and by
+controlling the runtime power management of their devices.
+
+At the time of this writing there are two ways to define power management
+callbacks for a PCI device driver, the recommended one, based on using a
+dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the
+"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and
+.resume() callbacks from struct pci_driver are used.  The legacy approach,
+however, doesn't allow one to define runtime power management callbacks and is
+not really suitable for any new drivers.  Therefore it is not covered by this
+document (refer to the source code to learn more about it).
+
+It is recommended that all PCI device drivers define a struct dev_pm_ops object
+containing pointers to power management (PM) callbacks that will be executed by
+the PCI subsystem's PM routines in various circumstances.  A pointer to the
+driver's struct dev_pm_ops object has to be assigned to the driver.pm field in
+its struct pci_driver object.  Once that has happened, the "legacy" PM callbacks
+in struct pci_driver are ignored (even if they are not NULL).
+
+The PM callbacks in struct dev_pm_ops are not mandatory and if they are not
+defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI
+subsystem will handle the device in a simplified default manner.  If they are
+defined, though, they are expected to behave as described in the following
+subsections.
+
+3.1.1. prepare()
+^^^^^^^^^^^^^^^^
+
+The prepare() callback is executed during system suspend, during hibernation
+(when a hibernation image is about to be created), during power-off after
+saving a hibernation image and during system restore, when a hibernation image
+has just been loaded into memory.
+
+This callback is only necessary if the driver's device has children that in
+general may be registered at any time.  In that case the role of the prepare()
+callback is to prevent new children of the device from being registered until
+one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run.
+
+In addition to that the prepare() callback may carry out some operations
+preparing the device to be suspended, although it should not allocate memory
+(if additional memory is required to suspend the device, it has to be
+preallocated earlier, for example in a suspend/hibernate notifier as described
+in Documentation/driver-api/pm/notifiers.rst).
+
+3.1.2. suspend()
+^^^^^^^^^^^^^^^^
+
+The suspend() callback is only executed during system suspend, after prepare()
+callbacks have been executed for all devices in the system.
+
+This callback is expected to quiesce the device and prepare it to be put into a
+low-power state by the PCI subsystem.  It is not required (in fact it even is
+not recommended) that a PCI driver's suspend() callback save the standard
+configuration registers of the device, prepare it for waking up the system, or
+put it into a low-power state.  All of these operations can very well be taken
+care of by the PCI subsystem, without the driver's participation.
+
+However, in some rare case it is convenient to carry out these operations in
+a PCI driver.  Then, pci_save_state(), pci_prepare_to_sleep(), and
+pci_set_power_state() should be used to save the device's standard configuration
+registers, to prepare it for system wakeup (if necessary), and to put it into a
+low-power state, respectively.  Moreover, if the driver calls pci_save_state(),
+the PCI subsystem will not execute either pci_prepare_to_sleep(), or
+pci_set_power_state() for its device, so the driver is then responsible for
+handling the device as appropriate.
+
+While the suspend() callback is being executed, the driver's interrupt handler
+can be invoked to handle an interrupt from the device, so all suspend-related
+operations relying on the driver's ability to handle interrupts should be
+carried out in this callback.
+
+3.1.3. suspend_noirq()
+^^^^^^^^^^^^^^^^^^^^^^
+
+The suspend_noirq() callback is only executed during system suspend, after
+suspend() callbacks have been executed for all devices in the system and
+after device interrupts have been disabled by the PM core.
+
+The difference between suspend_noirq() and suspend() is that the driver's
+interrupt handler will not be invoked while suspend_noirq() is running.  Thus
+suspend_noirq() can carry out operations that would cause race conditions to
+arise if they were performed in suspend().
+
+3.1.4. freeze()
+^^^^^^^^^^^^^^^
+
+The freeze() callback is hibernation-specific and is executed in two situations,
+during hibernation, after prepare() callbacks have been executed for all devices
+in preparation for the creation of a system image, and during restore,
+after a system image has been loaded into memory from persistent storage and the
+prepare() callbacks have been executed for all devices.
+
+The role of this callback is analogous to the role of the suspend() callback
+described above.  In fact, they only need to be different in the rare cases when
+the driver takes the responsibility for putting the device into a low-power
+state.
+
+In that cases the freeze() callback should not prepare the device system wakeup
+or put it into a low-power state.  Still, either it or freeze_noirq() should
+save the device's standard configuration registers using pci_save_state().
+
+3.1.5. freeze_noirq()
+^^^^^^^^^^^^^^^^^^^^^
+
+The freeze_noirq() callback is hibernation-specific.  It is executed during
+hibernation, after prepare() and freeze() callbacks have been executed for all
+devices in preparation for the creation of a system image, and during restore,
+after a system image has been loaded into memory and after prepare() and
+freeze() callbacks have been executed for all devices.  It is always executed
+after device interrupts have been disabled by the PM core.
+
+The role of this callback is analogous to the role of the suspend_noirq()
+callback described above and it very rarely is necessary to define
+freeze_noirq().
+
+The difference between freeze_noirq() and freeze() is analogous to the
+difference between suspend_noirq() and suspend().
+
+3.1.6. poweroff()
+^^^^^^^^^^^^^^^^^
+
+The poweroff() callback is hibernation-specific.  It is executed when the system
+is about to be powered off after saving a hibernation image to a persistent
+storage.  prepare() callbacks are executed for all devices before poweroff() is
+called.
+
+The role of this callback is analogous to the role of the suspend() and freeze()
+callbacks described above, although it does not need to save the contents of
+the device's registers.  In particular, if the driver wants to put the device
+into a low-power state itself instead of allowing the PCI subsystem to do that,
+the poweroff() callback should use pci_prepare_to_sleep() and
+pci_set_power_state() to prepare the device for system wakeup and to put it
+into a low-power state, respectively, but it need not save the device's standard
+configuration registers.
+
+3.1.7. poweroff_noirq()
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The poweroff_noirq() callback is hibernation-specific.  It is executed after
+poweroff() callbacks have been executed for all devices in the system.
+
+The role of this callback is analogous to the role of the suspend_noirq() and
+freeze_noirq() callbacks described above, but it does not need to save the
+contents of the device's registers.
+
+The difference between poweroff_noirq() and poweroff() is analogous to the
+difference between suspend_noirq() and suspend().
+
+3.1.8. resume_noirq()
+^^^^^^^^^^^^^^^^^^^^^
+
+The resume_noirq() callback is only executed during system resume, after the
+PM core has enabled the non-boot CPUs.  The driver's interrupt handler will not
+be invoked while resume_noirq() is running, so this callback can carry out
+operations that might race with the interrupt handler.
+
+Since the PCI subsystem unconditionally puts all devices into the full power
+state in the resume_noirq phase of system resume and restores their standard
+configuration registers, resume_noirq() is usually not necessary.  In general
+it should only be used for performing operations that would lead to race
+conditions if carried out by resume().
+
+3.1.9. resume()
+^^^^^^^^^^^^^^^
+
+The resume() callback is only executed during system resume, after
+resume_noirq() callbacks have been executed for all devices in the system and
+device interrupts have been enabled by the PM core.
+
+This callback is responsible for restoring the pre-suspend configuration of the
+device and bringing it back to the fully functional state.  The device should be
+able to process I/O in a usual way after resume() has returned.
+
+3.1.10. thaw_noirq()
+^^^^^^^^^^^^^^^^^^^^
+
+The thaw_noirq() callback is hibernation-specific.  It is executed after a
+system image has been created and the non-boot CPUs have been enabled by the PM
+core, in the thaw_noirq phase of hibernation.  It also may be executed if the
+loading of a hibernation image fails during system restore (it is then executed
+after enabling the non-boot CPUs).  The driver's interrupt handler will not be
+invoked while thaw_noirq() is running.
+
+The role of this callback is analogous to the role of resume_noirq().  The
+difference between these two callbacks is that thaw_noirq() is executed after
+freeze() and freeze_noirq(), so in general it does not need to modify the
+contents of the device's registers.
+
+3.1.11. thaw()
+^^^^^^^^^^^^^^
+
+The thaw() callback is hibernation-specific.  It is executed after thaw_noirq()
+callbacks have been executed for all devices in the system and after device
+interrupts have been enabled by the PM core.
+
+This callback is responsible for restoring the pre-freeze configuration of
+the device, so that it will work in a usual way after thaw() has returned.
+
+3.1.12. restore_noirq()
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The restore_noirq() callback is hibernation-specific.  It is executed in the
+restore_noirq phase of hibernation, when the boot kernel has passed control to
+the image kernel and the non-boot CPUs have been enabled by the image kernel's
+PM core.
+
+This callback is analogous to resume_noirq() with the exception that it cannot
+make any assumption on the previous state of the device, even if the BIOS (or
+generally the platform firmware) is known to preserve that state over a
+suspend-resume cycle.
+
+For the vast majority of PCI device drivers there is no difference between
+resume_noirq() and restore_noirq().
+
+3.1.13. restore()
+^^^^^^^^^^^^^^^^^
+
+The restore() callback is hibernation-specific.  It is executed after
+restore_noirq() callbacks have been executed for all devices in the system and
+after the PM core has enabled device drivers' interrupt handlers to be invoked.
+
+This callback is analogous to resume(), just like restore_noirq() is analogous
+to resume_noirq().  Consequently, the difference between restore_noirq() and
+restore() is analogous to the difference between resume_noirq() and resume().
+
+For the vast majority of PCI device drivers there is no difference between
+resume() and restore().
+
+3.1.14. complete()
+^^^^^^^^^^^^^^^^^^
+
+The complete() callback is executed in the following situations:
+
+  - during system resume, after resume() callbacks have been executed for all
+    devices,
+  - during hibernation, before saving the system image, after thaw() callbacks
+    have been executed for all devices,
+  - during system restore, when the system is going back to its pre-hibernation
+    state, after restore() callbacks have been executed for all devices.
+
+It also may be executed if the loading of a hibernation image into memory fails
+(in that case it is run after thaw() callbacks have been executed for all
+devices that have drivers in the boot kernel).
+
+This callback is entirely optional, although it may be necessary if the
+prepare() callback performs operations that need to be reversed.
+
+3.1.15. runtime_suspend()
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The runtime_suspend() callback is specific to device runtime power management
+(runtime PM).  It is executed by the PM core's runtime PM framework when the
+device is about to be suspended (i.e. quiesced and put into a low-power state)
+at run time.
+
+This callback is responsible for freezing the device and preparing it to be
+put into a low-power state, but it must allow the PCI subsystem to perform all
+of the PCI-specific actions necessary for suspending the device.
+
+3.1.16. runtime_resume()
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The runtime_resume() callback is specific to device runtime PM.  It is executed
+by the PM core's runtime PM framework when the device is about to be resumed
+(i.e. put into the full-power state and programmed to process I/O normally) at
+run time.
+
+This callback is responsible for restoring the normal functionality of the
+device after it has been put into the full-power state by the PCI subsystem.
+The device is expected to be able to process I/O in the usual way after
+runtime_resume() has returned.
+
+3.1.17. runtime_idle()
+^^^^^^^^^^^^^^^^^^^^^^
+
+The runtime_idle() callback is specific to device runtime PM.  It is executed
+by the PM core's runtime PM framework whenever it may be desirable to suspend
+the device according to the PM core's information.  In particular, it is
+automatically executed right after runtime_resume() has returned in case the
+resume of the device has happened as a result of a spurious event.
+
+This callback is optional, but if it is not implemented or if it returns 0, the
+PCI subsystem will call pm_runtime_suspend() for the device, which in turn will
+cause the driver's runtime_suspend() callback to be executed.
+
+3.1.18. Pointing Multiple Callback Pointers to One Routine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Although in principle each of the callbacks described in the previous
+subsections can be defined as a separate function, it often is convenient to
+point two or more members of struct dev_pm_ops to the same routine.  There are
+a few convenience macros that can be used for this purpose.
+
+The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
+suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
+members and one resume routine pointed to by the .resume(), .thaw(), and
+.restore() members.  The other function pointers in this struct dev_pm_ops are
+unset.
+
+The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
+additionally sets the .runtime_resume() pointer to the same value as
+.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
+the same value as .suspend() (and .freeze() and .poweroff()).
+
+The SET_SYSTEM_SLEEP_PM_OPS can be used inside of a declaration of struct
+dev_pm_ops to indicate that one suspend routine is to be pointed to by the
+.suspend(), .freeze(), and .poweroff() members and one resume routine is to
+be pointed to by the .resume(), .thaw(), and .restore() members.
+
+3.1.19. Driver Flags for Power Management
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The PM core allows device drivers to set flags that influence the handling of
+power management for the devices by the core itself and by middle layer code
+including the PCI bus type.  The flags should be set once at the driver probe
+time with the help of the dev_pm_set_driver_flags() function and they should not
+be updated directly afterwards.
+
+The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete
+mechanism allowing device suspend/resume callbacks to be skipped if the device
+is in runtime suspend when the system suspend starts.  That also affects all of
+the ancestors of the device, so this flag should only be used if absolutely
+necessary.
+
+The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a
+positive value from pci_pm_prepare() if the ->prepare callback provided by the
+driver of the device returns a positive value.  That allows the driver to opt
+out from using the direct-complete mechanism dynamically.
+
+The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's
+perspective the device can be safely left in runtime suspend during system
+suspend.  That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff()
+to skip resuming the device from runtime suspend unless there are PCI-specific
+reasons for doing that.  Also, it causes pci_pm_suspend_late/noirq(),
+pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early
+if the device remains in runtime suspend in the beginning of the "late" phase
+of the system-wide transition under way.  Moreover, if the device is in
+runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime
+power management status will be changed to "active" (as it is going to be put
+into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(),
+the function will set the power.direct_complete flag for it (to make the PM core
+skip the subsequent "thaw" callbacks for it) and return.
+
+Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
+device to be left in suspend after system-wide transitions to the working state.
+This flag is checked by the PM core, but the PCI bus type informs the PM core
+which devices may be left in suspend from its perspective (that happens during
+the "noirq" phase of system-wide suspend and analogous transitions) and next it
+uses the dev_pm_may_skip_resume() helper to decide whether or not to return from
+pci_pm_resume_noirq() early, as the PM core will skip the remaining resume
+callbacks for the device during the transition under way and will set its
+runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for
+it.
+
+3.2. Device Runtime Power Management
+------------------------------------
+
+In addition to providing device power management callbacks PCI device drivers
+are responsible for controlling the runtime power management (runtime PM) of
+their devices.
+
+The PCI device runtime PM is optional, but it is recommended that PCI device
+drivers implement it at least in the cases where there is a reliable way of
+verifying that the device is not used (like when the network cable is detached
+from an Ethernet adapter or there are no devices attached to a USB controller).
+
+To support the PCI runtime PM the driver first needs to implement the
+runtime_suspend() and runtime_resume() callbacks.  It also may need to implement
+the runtime_idle() callback to prevent the device from being suspended again
+every time right after the runtime_resume() callback has returned
+(alternatively, the runtime_suspend() callback will have to check if the
+device should really be suspended and return -EAGAIN if that is not the case).
+
+The runtime PM of PCI devices is enabled by default by the PCI core.  PCI
+device drivers do not need to enable it and should not attempt to do so.
+However, it is blocked by pci_pm_init() that runs the pm_runtime_forbid()
+helper function.  In addition to that, the runtime PM usage counter of
+each PCI device is incremented by local_pci_probe() before executing the
+probe callback provided by the device's driver.
+
+If a PCI driver implements the runtime PM callbacks and intends to use the
+runtime PM framework provided by the PM core and the PCI subsystem, it needs
+to decrement the device's runtime PM usage counter in its probe callback
+function.  If it doesn't do that, the counter will always be different from
+zero for the device and it will never be runtime-suspended.  The simplest
+way to do that is by calling pm_runtime_put_noidle(), but if the driver
+wants to schedule an autosuspend right away, for example, it may call
+pm_runtime_put_autosuspend() instead for this purpose.  Generally, it
+just needs to call a function that decrements the devices usage counter
+from its probe routine to make runtime PM work for the device.
+
+It is important to remember that the driver's runtime_suspend() callback
+may be executed right after the usage counter has been decremented, because
+user space may already have caused the pm_runtime_allow() helper function
+unblocking the runtime PM of the device to run via sysfs, so the driver must
+be prepared to cope with that.
+
+The driver itself should not call pm_runtime_allow(), though.  Instead, it
+should let user space or some platform-specific code do that (user space can
+do it via sysfs as stated above), but it must be prepared to handle the
+runtime PM of the device correctly as soon as pm_runtime_allow() is called
+(which may happen at any time, even before the driver is loaded).
+
+When the driver's remove callback runs, it has to balance the decrementation
+of the device's runtime PM usage counter at the probe time.  For this reason,
+if it has decremented the counter in its probe callback, it must run
+pm_runtime_get_noresume() in its remove callback.  [Since the core carries
+out a runtime resume of the device and bumps up the device's usage counter
+before running the driver's remove callback, the runtime PM of the device
+is effectively disabled for the duration of the remove execution and all
+runtime PM helper functions incrementing the device's usage counter are
+then effectively equivalent to pm_runtime_get_noresume().]
+
+The runtime PM framework works by processing requests to suspend or resume
+devices, or to check if they are idle (in which cases it is reasonable to
+subsequently request that they be suspended).  These requests are represented
+by work items put into the power management workqueue, pm_wq.  Although there
+are a few situations in which power management requests are automatically
+queued by the PM core (for example, after processing a request to resume a
+device the PM core automatically queues a request to check if the device is
+idle), device drivers are generally responsible for queuing power management
+requests for their devices.  For this purpose they should use the runtime PM
+helper functions provided by the PM core, discussed in
+Documentation/power/runtime_pm.rst.
+
+Devices can also be suspended and resumed synchronously, without placing a
+request into pm_wq.  In the majority of cases this also is done by their
+drivers that use helper functions provided by the PM core for this purpose.
+
+For more information on the runtime PM of devices refer to
+Documentation/power/runtime_pm.rst.
+
+
+4. Resources
+============
+
+PCI Local Bus Specification, Rev. 3.0
+
+PCI Bus Power Management Interface Specification, Rev. 1.2
+
+Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
+
+PCI Express Base Specification, Rev. 2.0
+
+Documentation/driver-api/pm/devices.rst
+
+Documentation/power/runtime_pm.rst
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt
deleted file mode 100644
index 8eaf9ee24d43..000000000000
--- a/Documentation/power/pci.txt
+++ /dev/null
@@ -1,1094 +0,0 @@
-PCI Power Management
-
-Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
-
-An overview of concepts and the Linux kernel's interfaces related to PCI power
-management.  Based on previous work by Patrick Mochel <mochel@transmeta.com>
-(and others).
-
-This document only covers the aspects of power management specific to PCI
-devices.  For general description of the kernel's interfaces related to device
-power management refer to Documentation/driver-api/pm/devices.rst and
-Documentation/power/runtime_pm.txt.
-
----------------------------------------------------------------------------
-
-1. Hardware and Platform Support for PCI Power Management
-2. PCI Subsystem and Device Power Management
-3. PCI Device Drivers and Power Management
-4. Resources
-
-
-1. Hardware and Platform Support for PCI Power Management
-=========================================================
-
-1.1. Native and Platform-Based Power Management
------------------------------------------------
-In general, power management is a feature allowing one to save energy by putting
-devices into states in which they draw less power (low-power states) at the
-price of reduced functionality or performance.
-
-Usually, a device is put into a low-power state when it is underutilized or
-completely inactive.  However, when it is necessary to use the device once
-again, it has to be put back into the "fully functional" state (full-power
-state).  This may happen when there are some data for the device to handle or
-as a result of an external event requiring the device to be active, which may
-be signaled by the device itself.
-
-PCI devices may be put into low-power states in two ways, by using the device
-capabilities introduced by the PCI Bus Power Management Interface Specification,
-or with the help of platform firmware, such as an ACPI BIOS.  In the first
-approach, that is referred to as the native PCI power management (native PCI PM)
-in what follows, the device power state is changed as a result of writing a
-specific value into one of its standard configuration registers.  The second
-approach requires the platform firmware to provide special methods that may be
-used by the kernel to change the device's power state.
-
-Devices supporting the native PCI PM usually can generate wakeup signals called
-Power Management Events (PMEs) to let the kernel know about external events
-requiring the device to be active.  After receiving a PME the kernel is supposed
-to put the device that sent it into the full-power state.  However, the PCI Bus
-Power Management Interface Specification doesn't define any standard method of
-delivering the PME from the device to the CPU and the operating system kernel.
-It is assumed that the platform firmware will perform this task and therefore,
-even though a PCI device is set up to generate PMEs, it also may be necessary to
-prepare the platform firmware for notifying the CPU of the PMEs coming from the
-device (e.g. by generating interrupts).
-
-In turn, if the methods provided by the platform firmware are used for changing
-the power state of a device, usually the platform also provides a method for
-preparing the device to generate wakeup signals.  In that case, however, it
-often also is necessary to prepare the device for generating PMEs using the
-native PCI PM mechanism, because the method provided by the platform depends on
-that.
-
-Thus in many situations both the native and the platform-based power management
-mechanisms have to be used simultaneously to obtain the desired result.
-
-1.2. Native PCI Power Management
---------------------------------
-The PCI Bus Power Management Interface Specification (PCI PM Spec) was
-introduced between the PCI 2.1 and PCI 2.2 Specifications.  It defined a
-standard interface for performing various operations related to power
-management.
-
-The implementation of the PCI PM Spec is optional for conventional PCI devices,
-but it is mandatory for PCI Express devices.  If a device supports the PCI PM
-Spec, it has an 8 byte power management capability field in its PCI
-configuration space.  This field is used to describe and control the standard
-features related to the native PCI power management.
-
-The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses
-(B0-B3).  The higher the number, the less power is drawn by the device or bus
-in that state.  However, the higher the number, the longer the latency for
-the device or bus to return to the full-power state (D0 or B0, respectively).
-
-There are two variants of the D3 state defined by the specification.  The first
-one is D3hot, referred to as the software accessible D3, because devices can be
-programmed to go into it.  The second one, D3cold, is the state that PCI devices
-are in when the supply voltage (Vcc) is removed from them.  It is not possible
-to program a PCI device to go into D3cold, although there may be a programmable
-interface for putting the bus the device is on into a state in which Vcc is
-removed from all devices on the bus.
-
-PCI bus power management, however, is not supported by the Linux kernel at the
-time of this writing and therefore it is not covered by this document.
-
-Note that every PCI device can be in the full-power state (D0) or in D3cold,
-regardless of whether or not it implements the PCI PM Spec.  In addition to
-that, if the PCI PM Spec is implemented by the device, it must support D3hot
-as well as D0.  The support for the D1 and D2 power states is optional.
-
-PCI devices supporting the PCI PM Spec can be programmed to go to any of the
-supported low-power states (except for D3cold).  While in D1-D3hot the
-standard configuration registers of the device must be accessible to software
-(i.e. the device is required to respond to PCI configuration accesses), although
-its I/O and memory spaces are then disabled.  This allows the device to be
-programmatically put into D0.  Thus the kernel can switch the device back and
-forth between D0 and the supported low-power states (except for D3cold) and the
-possible power state transitions the device can undergo are the following:
-
-+----------------------------+
-| Current State | New State  |
-+----------------------------+
-| D0            | D1, D2, D3 |
-+----------------------------+
-| D1            | D2, D3     |
-+----------------------------+
-| D2            | D3         |
-+----------------------------+
-| D1, D2, D3    | D0         |
-+----------------------------+
-
-The transition from D3cold to D0 occurs when the supply voltage is provided to
-the device (i.e. power is restored).  In that case the device returns to D0 with
-a full power-on reset sequence and the power-on defaults are restored to the
-device by hardware just as at initial power up.
-
-PCI devices supporting the PCI PM Spec can be programmed to generate PMEs
-while in a low-power state (D1-D3), but they are not required to be capable
-of generating PMEs from all supported low-power states.  In particular, the
-capability of generating PMEs from D3cold is optional and depends on the
-presence of additional voltage (3.3Vaux) allowing the device to remain
-sufficiently active to generate a wakeup signal.
-
-1.3. ACPI Device Power Management
----------------------------------
-The platform firmware support for the power management of PCI devices is
-system-specific.  However, if the system in question is compliant with the
-Advanced Configuration and Power Interface (ACPI) Specification, like the
-majority of x86-based systems, it is supposed to implement device power
-management interfaces defined by the ACPI standard.
-
-For this purpose the ACPI BIOS provides special functions called "control
-methods" that may be executed by the kernel to perform specific tasks, such as
-putting a device into a low-power state.  These control methods are encoded
-using special byte-code language called the ACPI Machine Language (AML) and
-stored in the machine's BIOS.  The kernel loads them from the BIOS and executes
-them as needed using an AML interpreter that translates the AML byte code into
-computations and memory or I/O space accesses.  This way, in theory, a BIOS
-writer can provide the kernel with a means to perform actions depending
-on the system design in a system-specific fashion.
-
-ACPI control methods may be divided into global control methods, that are not
-associated with any particular devices, and device control methods, that have
-to be defined separately for each device supposed to be handled with the help of
-the platform.  This means, in particular, that ACPI device control methods can
-only be used to handle devices that the BIOS writer knew about in advance.  The
-ACPI methods used for device power management fall into that category.
-
-The ACPI specification assumes that devices can be in one of four power states
-labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM
-D0-D3 states (although the difference between D3hot and D3cold is not taken
-into account by ACPI).  Moreover, for each power state of a device there is a
-set of power resources that have to be enabled for the device to be put into
-that state.  These power resources are controlled (i.e. enabled or disabled)
-with the help of their own control methods, _ON and _OFF, that have to be
-defined individually for each of them.
-
-To put a device into the ACPI power state Dx (where x is a number between 0 and
-3 inclusive) the kernel is supposed to (1) enable the power resources required
-by the device in this state using their _ON control methods and (2) execute the
-_PSx control method defined for the device.  In addition to that, if the device
-is going to be put into a low-power state (D1-D3) and is supposed to generate
-wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI
-3.0) control method defined for it has to be executed before _PSx.  Power
-resources that are not required by the device in the target power state and are
-not required any more by any other device should be disabled (by executing their
-_OFF control methods).  If the current power state of the device is D3, it can
-only be put into D0 this way.
-
-However, quite often the power states of devices are changed during a
-system-wide transition into a sleep state or back into the working state.  ACPI
-defines four system sleep states, S1, S2, S3, and S4, and denotes the system
-working state as S0.  In general, the target system sleep (or working) state
-determines the highest power (lowest number) state the device can be put
-into and the kernel is supposed to obtain this information by executing the
-device's _SxD control method (where x is a number between 0 and 4 inclusive).
-If the device is required to wake up the system from the target sleep state, the
-lowest power (highest number) state it can be put into is also determined by the
-target state of the system.  The kernel is then supposed to use the device's
-_SxW control method to obtain the number of that state.  It also is supposed to
-use the device's _PRW control method to learn which power resources need to be
-enabled for the device to be able to generate wakeup signals.
-
-1.4. Wakeup Signaling
----------------------
-Wakeup signals generated by PCI devices, either as native PCI PMEs, or as
-a result of the execution of the _DSW (or _PSW) ACPI control method before
-putting the device into a low-power state, have to be caught and handled as
-appropriate.  If they are sent while the system is in the working state
-(ACPI S0), they should be translated into interrupts so that the kernel can
-put the devices generating them into the full-power state and take care of the
-events that triggered them.  In turn, if they are sent while the system is
-sleeping, they should cause the system's core logic to trigger wakeup.
-
-On ACPI-based systems wakeup signals sent by conventional PCI devices are
-converted into ACPI General-Purpose Events (GPEs) which are hardware signals
-from the system core logic generated in response to various events that need to
-be acted upon.  Every GPE is associated with one or more sources of potentially
-interesting events.  In particular, a GPE may be associated with a PCI device
-capable of signaling wakeup.  The information on the connections between GPEs
-and event sources is recorded in the system's ACPI BIOS from where it can be
-read by the kernel.
-
-If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE
-associated with it (if there is one) is triggered.  The GPEs associated with PCI
-bridges may also be triggered in response to a wakeup signal from one of the
-devices below the bridge (this also is the case for root bridges) and, for
-example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be
-handled this way.
-
-A GPE may be triggered when the system is sleeping (i.e. when it is in one of
-the ACPI S1-S4 states), in which case system wakeup is started by its core logic
-(the device that was the source of the signal causing the system wakeup to occur
-may be identified later).  The GPEs used in such situations are referred to as
-wakeup GPEs.
-
-Usually, however, GPEs are also triggered when the system is in the working
-state (ACPI S0) and in that case the system's core logic generates a System
-Control Interrupt (SCI) to notify the kernel of the event.  Then, the SCI
-handler identifies the GPE that caused the interrupt to be generated which,
-in turn, allows the kernel to identify the source of the event (that may be
-a PCI device signaling wakeup).  The GPEs used for notifying the kernel of
-events occurring while the system is in the working state are referred to as
-runtime GPEs.
-
-Unfortunately, there is no standard way of handling wakeup signals sent by
-conventional PCI devices on systems that are not ACPI-based, but there is one
-for PCI Express devices.  Namely, the PCI Express Base Specification introduced
-a native mechanism for converting native PCI PMEs into interrupts generated by
-root ports.  For conventional PCI devices native PMEs are out-of-band, so they
-are routed separately and they need not pass through bridges (in principle they
-may be routed directly to the system's core logic), but for PCI Express devices
-they are in-band messages that have to pass through the PCI Express hierarchy,
-including the root port on the path from the device to the Root Complex.  Thus
-it was possible to introduce a mechanism by which a root port generates an
-interrupt whenever it receives a PME message from one of the devices below it.
-The PCI Express Requester ID of the device that sent the PME message is then
-recorded in one of the root port's configuration registers from where it may be
-read by the interrupt handler allowing the device to be identified.  [PME
-messages sent by PCI Express endpoints integrated with the Root Complex don't
-pass through root ports, but instead they cause a Root Complex Event Collector
-(if there is one) to generate interrupts.]
-
-In principle the native PCI Express PME signaling may also be used on ACPI-based
-systems along with the GPEs, but to use it the kernel has to ask the system's
-ACPI BIOS to release control of root port configuration registers.  The ACPI
-BIOS, however, is not required to allow the kernel to control these registers
-and if it doesn't do that, the kernel must not modify their contents.  Of course
-the native PCI Express PME signaling cannot be used by the kernel in that case.
-
-
-2. PCI Subsystem and Device Power Management
-============================================
-
-2.1. Device Power Management Callbacks
---------------------------------------
-The PCI Subsystem participates in the power management of PCI devices in a
-number of ways.  First of all, it provides an intermediate code layer between
-the device power management core (PM core) and PCI device drivers.
-Specifically, the pm field of the PCI subsystem's struct bus_type object,
-pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing
-pointers to several device power management callbacks:
-
-const struct dev_pm_ops pci_dev_pm_ops = {
-	.prepare = pci_pm_prepare,
-	.complete = pci_pm_complete,
-	.suspend = pci_pm_suspend,
-	.resume = pci_pm_resume,
-	.freeze = pci_pm_freeze,
-	.thaw = pci_pm_thaw,
-	.poweroff = pci_pm_poweroff,
-	.restore = pci_pm_restore,
-	.suspend_noirq = pci_pm_suspend_noirq,
-	.resume_noirq = pci_pm_resume_noirq,
-	.freeze_noirq = pci_pm_freeze_noirq,
-	.thaw_noirq = pci_pm_thaw_noirq,
-	.poweroff_noirq = pci_pm_poweroff_noirq,
-	.restore_noirq = pci_pm_restore_noirq,
-	.runtime_suspend = pci_pm_runtime_suspend,
-	.runtime_resume = pci_pm_runtime_resume,
-	.runtime_idle = pci_pm_runtime_idle,
-};
-
-These callbacks are executed by the PM core in various situations related to
-device power management and they, in turn, execute power management callbacks
-provided by PCI device drivers.  They also perform power management operations
-involving some standard configuration registers of PCI devices that device
-drivers need not know or care about.
-
-The structure representing a PCI device, struct pci_dev, contains several fields
-that these callbacks operate on:
-
-struct pci_dev {
-	...
-	pci_power_t     current_state;  /* Current operating state. */
-	int		pm_cap;		/* PM capability offset in the
-					   configuration space */
-	unsigned int	pme_support:5;	/* Bitmask of states from which PME#
-					   can be generated */
-	unsigned int	pme_interrupt:1;/* Is native PCIe PME signaling used? */
-	unsigned int	d1_support:1;	/* Low power state D1 is supported */
-	unsigned int	d2_support:1;	/* Low power state D2 is supported */
-	unsigned int	no_d1d2:1;	/* D1 and D2 are forbidden */
-	unsigned int	wakeup_prepared:1;  /* Device prepared for wake up */
-	unsigned int	d3_delay;	/* D3->D0 transition time in ms */
-	...
-};
-
-They also indirectly use some fields of the struct device that is embedded in
-struct pci_dev.
-
-2.2. Device Initialization
---------------------------
-The PCI subsystem's first task related to device power management is to
-prepare the device for power management and initialize the fields of struct
-pci_dev used for this purpose.  This happens in two functions defined in
-drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init().
-
-The first of these functions checks if the device supports native PCI PM
-and if that's the case the offset of its power management capability structure
-in the configuration space is stored in the pm_cap field of the device's struct
-pci_dev object.  Next, the function checks which PCI low-power states are
-supported by the device and from which low-power states the device can generate
-native PCI PMEs.  The power management fields of the device's struct pci_dev and
-the struct device embedded in it are updated accordingly and the generation of
-PMEs by the device is disabled.
-
-The second function checks if the device can be prepared to signal wakeup with
-the help of the platform firmware, such as the ACPI BIOS.  If that is the case,
-the function updates the wakeup fields in struct device embedded in the
-device's struct pci_dev and uses the firmware-provided method to prevent the
-device from signaling wakeup.
-
-At this point the device is ready for power management.  For driverless devices,
-however, this functionality is limited to a few basic operations carried out
-during system-wide transitions to a sleep state and back to the working state.
-
-2.3. Runtime Device Power Management
-------------------------------------
-The PCI subsystem plays a vital role in the runtime power management of PCI
-devices.  For this purpose it uses the general runtime power management
-(runtime PM) framework described in Documentation/power/runtime_pm.txt.
-Namely, it provides subsystem-level callbacks:
-
-	pci_pm_runtime_suspend()
-	pci_pm_runtime_resume()
-	pci_pm_runtime_idle()
-
-that are executed by the core runtime PM routines.  It also implements the
-entire mechanics necessary for handling runtime wakeup signals from PCI devices
-in low-power states, which at the time of this writing works for both the native
-PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in
-Section 1.
-
-First, a PCI device is put into a low-power state, or suspended, with the help
-of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call
-pci_pm_runtime_suspend() to do the actual job.  For this to work, the device's
-driver has to provide a pm->runtime_suspend() callback (see below), which is
-run by pci_pm_runtime_suspend() as the first action.  If the driver's callback
-returns successfully, the device's standard configuration registers are saved,
-the device is prepared to generate wakeup signals and, finally, it is put into
-the target low-power state.
-
-The low-power state to put the device into is the lowest-power (highest number)
-state from which it can signal wakeup.  The exact method of signaling wakeup is
-system-dependent and is determined by the PCI subsystem on the basis of the
-reported capabilities of the device and the platform firmware.  To prepare the
-device for signaling wakeup and put it into the selected low-power state, the
-PCI subsystem can use the platform firmware as well as the device's native PCI
-PM capabilities, if supported.
-
-It is expected that the device driver's pm->runtime_suspend() callback will
-not attempt to prepare the device for signaling wakeup or to put it into a
-low-power state.  The driver ought to leave these tasks to the PCI subsystem
-that has all of the information necessary to perform them.
-
-A suspended device is brought back into the "active" state, or resumed,
-with the help of pm_request_resume() or pm_runtime_resume() which both call
-pci_pm_runtime_resume() for PCI devices.  Again, this only works if the device's
-driver provides a pm->runtime_resume() callback (see below).  However, before
-the driver's callback is executed, pci_pm_runtime_resume() brings the device
-back into the full-power state, prevents it from signaling wakeup while in that
-state and restores its standard configuration registers.  Thus the driver's
-callback need not worry about the PCI-specific aspects of the device resume.
-
-Note that generally pci_pm_runtime_resume() may be called in two different
-situations.  First, it may be called at the request of the device's driver, for
-example if there are some data for it to process.  Second, it may be called
-as a result of a wakeup signal from the device itself (this sometimes is
-referred to as "remote wakeup").  Of course, for this purpose the wakeup signal
-is handled in one of the ways described in Section 1 and finally converted into
-a notification for the PCI subsystem after the source device has been
-identified.
-
-The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
-and pm_request_idle(), executes the device driver's pm->runtime_idle()
-callback, if defined, and if that callback doesn't return error code (or is not
-present at all), suspends the device with the help of pm_runtime_suspend().
-Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
-example, it is called right after the device has just been resumed), in which
-cases it is expected to suspend the device if that makes sense.  Usually,
-however, the PCI subsystem doesn't really know if the device really can be
-suspended, so it lets the device's driver decide by running its
-pm->runtime_idle() callback.
-
-2.4. System-Wide Power Transitions
-----------------------------------
-There are a few different types of system-wide power transitions, described in
-Documentation/driver-api/pm/devices.rst.  Each of them requires devices to be handled
-in a specific way and the PM core executes subsystem-level power management
-callbacks for this purpose.  They are executed in phases such that each phase
-involves executing the same subsystem-level callback for every device belonging
-to the given subsystem before the next phase begins.  These phases always run
-after tasks have been frozen.
-
-2.4.1. System Suspend
-
-When the system is going into a sleep state in which the contents of memory will
-be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
-
-	prepare, suspend, suspend_noirq.
-
-The following PCI bus type's callbacks, respectively, are used in these phases:
-
-	pci_pm_prepare()
-	pci_pm_suspend()
-	pci_pm_suspend_noirq()
-
-The pci_pm_prepare() routine first puts the device into the "fully functional"
-state with the help of pm_runtime_resume().  Then, it executes the device
-driver's pm->prepare() callback if defined (i.e. if the driver's struct
-dev_pm_ops object is present and the prepare pointer in that object is valid).
-
-The pci_pm_suspend() routine first checks if the device's driver implements
-legacy PCI suspend routines (see Section 3), in which case the driver's legacy
-suspend callback is executed, if present, and its result is returned.  Next, if
-the device's driver doesn't provide a struct dev_pm_ops object (containing
-pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
-simply turns off the device's bus master capability and runs
-pcibios_disable_device() to disable it, unless the device is a bridge (PCI
-bridges are ignored by this routine).  Next, the device driver's pm->suspend()
-callback is executed, if defined, and its result is returned if it fails.
-Finally, pci_fixup_device() is called to apply hardware suspend quirks related
-to the device if necessary.
-
-Note that the suspend phase is carried out asynchronously for PCI devices, so
-the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
-devices that don't depend on each other in a known way (i.e. none of the paths
-in the device tree from the root bridge to a leaf device contains both of them).
-
-The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
-been called, which means that the device driver's interrupt handler won't be
-invoked while this routine is running.  It first checks if the device's driver
-implements legacy PCI suspends routines (Section 3), in which case the legacy
-late suspend routine is called and its result is returned (the standard
-configuration registers of the device are saved if the driver's callback hasn't
-done that).  Second, if the device driver's struct dev_pm_ops object is not
-present, the device's standard configuration registers are saved and the routine
-returns success.  Otherwise the device driver's pm->suspend_noirq() callback is
-executed, if present, and its result is returned if it fails.  Next, if the
-device's standard configuration registers haven't been saved yet (one of the
-device driver's callbacks executed before might do that), pci_pm_suspend_noirq()
-saves them, prepares the device to signal wakeup (if necessary) and puts it into
-a low-power state.
-
-The low-power state to put the device into is the lowest-power (highest number)
-state from which it can signal wakeup while the system is in the target sleep
-state.  Just like in the runtime PM case described above, the mechanism of
-signaling wakeup is system-dependent and determined by the PCI subsystem, which
-is also responsible for preparing the device to signal wakeup from the system's
-target sleep state as appropriate.
-
-PCI device drivers (that don't implement legacy power management callbacks) are
-generally not expected to prepare devices for signaling wakeup or to put them
-into low-power states.  However, if one of the driver's suspend callbacks
-(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
-registers, pci_pm_suspend_noirq() will assume that the device has been prepared
-to signal wakeup and put into a low-power state by the driver (the driver is
-then assumed to have used the helper functions provided by the PCI subsystem for
-this purpose).  PCI device drivers are not encouraged to do that, but in some
-rare cases doing that in the driver may be the optimum approach.
-
-2.4.2. System Resume
-
-When the system is undergoing a transition from a sleep state in which the
-contents of memory have been preserved, such as one of the ACPI sleep states
-S1-S3, into the working state (ACPI S0), the phases are:
-
-	resume_noirq, resume, complete.
-
-The following PCI bus type's callbacks, respectively, are executed in these
-phases:
-
-	pci_pm_resume_noirq()
-	pci_pm_resume()
-	pci_pm_complete()
-
-The pci_pm_resume_noirq() routine first puts the device into the full-power
-state, restores its standard configuration registers and applies early resume
-hardware quirks related to the device, if necessary.  This is done
-unconditionally, regardless of whether or not the device's driver implements
-legacy PCI power management callbacks (this way all PCI devices are in the
-full-power state and their standard configuration registers have been restored
-when their interrupt handlers are invoked for the first time during resume,
-which allows the kernel to avoid problems with the handling of shared interrupts
-by drivers whose devices are still suspended).  If legacy PCI power management
-callbacks (see Section 3) are implemented by the device's driver, the legacy
-early resume callback is executed and its result is returned.  Otherwise, the
-device driver's pm->resume_noirq() callback is executed, if defined, and its
-result is returned.
-
-The pci_pm_resume() routine first checks if the device's standard configuration
-registers have been restored and restores them if that's not the case (this
-only is necessary in the error path during a failing suspend).  Next, resume
-hardware quirks related to the device are applied, if necessary, and if the
-device's driver implements legacy PCI power management callbacks (see
-Section 3), the driver's legacy resume callback is executed and its result is
-returned.  Otherwise, the device's wakeup signaling mechanisms are blocked and
-its driver's pm->resume() callback is executed, if defined (the callback's
-result is then returned).
-
-The resume phase is carried out asynchronously for PCI devices, like the
-suspend phase described above, which means that if two PCI devices don't depend
-on each other in a known way, the pci_pm_resume() routine may be executed for
-the both of them in parallel.
-
-The pci_pm_complete() routine only executes the device driver's pm->complete()
-callback, if defined.
-
-2.4.3. System Hibernation
-
-System hibernation is more complicated than system suspend, because it requires
-a system image to be created and written into a persistent storage medium.  The
-image is created atomically and all devices are quiesced, or frozen, before that
-happens.
-
-The freezing of devices is carried out after enough memory has been freed (at
-the time of this writing the image creation requires at least 50% of system RAM
-to be free) in the following three phases:
-
-	prepare, freeze, freeze_noirq
-
-that correspond to the PCI bus type's callbacks:
-
-	pci_pm_prepare()
-	pci_pm_freeze()
-	pci_pm_freeze_noirq()
-
-This means that the prepare phase is exactly the same as for system suspend.
-The other two phases, however, are different.
-
-The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
-the device driver's pm->freeze() callback, if defined, instead of pm->suspend(),
-and it doesn't apply the suspend-related hardware quirks.  It is executed
-asynchronously for different PCI devices that don't depend on each other in a
-known way.
-
-The pci_pm_freeze_noirq() routine, in turn, is similar to
-pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
-routine instead of pm->suspend_noirq().  It also doesn't attempt to prepare the
-device for signaling wakeup and put it into a low-power state.  Still, it saves
-the device's standard configuration registers if they haven't been saved by one
-of the driver's callbacks.
-
-Once the image has been created, it has to be saved.  However, at this point all
-devices are frozen and they cannot handle I/O, while their ability to handle
-I/O is obviously necessary for the image saving.  Thus they have to be brought
-back to the fully functional state and this is done in the following phases:
-
-	thaw_noirq, thaw, complete
-
-using the following PCI bus type's callbacks:
-
-	pci_pm_thaw_noirq()
-	pci_pm_thaw()
-	pci_pm_complete()
-
-respectively.
-
-The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
-but it doesn't put the device into the full power state and doesn't attempt to
-restore its standard configuration registers.  It also executes the device
-driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
-
-The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
-driver's pm->thaw() callback instead of pm->resume().  It is executed
-asynchronously for different PCI devices that don't depend on each other in a
-known way.
-
-The complete phase it the same as for system resume.
-
-After saving the image, devices need to be powered down before the system can
-enter the target sleep state (ACPI S4 for ACPI-based systems).  This is done in
-three phases:
-
-	prepare, poweroff, poweroff_noirq
-
-where the prepare phase is exactly the same as for system suspend.  The other
-two phases are analogous to the suspend and suspend_noirq phases, respectively.
-The PCI subsystem-level callbacks they correspond to
-
-	pci_pm_poweroff()
-	pci_pm_poweroff_noirq()
-
-work in analogy with pci_pm_suspend() and pci_pm_poweroff_noirq(), respectively,
-although they don't attempt to save the device's standard configuration
-registers.
-
-2.4.4. System Restore
-
-System restore requires a hibernation image to be loaded into memory and the
-pre-hibernation memory contents to be restored before the pre-hibernation system
-activity can be resumed.
-
-As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded
-into memory by a fresh instance of the kernel, called the boot kernel, which in
-turn is loaded and run by a boot loader in the usual way.  After the boot kernel
-has loaded the image, it needs to replace its own code and data with the code
-and data of the "hibernated" kernel stored within the image, called the image
-kernel.  For this purpose all devices are frozen just like before creating
-the image during hibernation, in the
-
-	prepare, freeze, freeze_noirq
-
-phases described above.  However, the devices affected by these phases are only
-those having drivers in the boot kernel; other devices will still be in whatever
-state the boot loader left them.
-
-Should the restoration of the pre-hibernation memory contents fail, the boot
-kernel would go through the "thawing" procedure described above, using the
-thaw_noirq, thaw, and complete phases (that will only affect the devices having
-drivers in the boot kernel), and then continue running normally.
-
-If the pre-hibernation memory contents are restored successfully, which is the
-usual situation, control is passed to the image kernel, which then becomes
-responsible for bringing the system back to the working state.  To achieve this,
-it must restore the devices' pre-hibernation functionality, which is done much
-like waking up from the memory sleep state, although it involves different
-phases:
-
-	restore_noirq, restore, complete
-
-The first two of these are analogous to the resume_noirq and resume phases
-described above, respectively, and correspond to the following PCI subsystem
-callbacks:
-
-	pci_pm_restore_noirq()
-	pci_pm_restore()
-
-These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(),
-respectively, but they execute the device driver's pm->restore_noirq() and
-pm->restore() callbacks, if available.
-
-The complete phase is carried out in exactly the same way as during system
-resume.
-
-
-3. PCI Device Drivers and Power Management
-==========================================
-
-3.1. Power Management Callbacks
--------------------------------
-PCI device drivers participate in power management by providing callbacks to be
-executed by the PCI subsystem's power management routines described above and by
-controlling the runtime power management of their devices.
-
-At the time of this writing there are two ways to define power management
-callbacks for a PCI device driver, the recommended one, based on using a
-dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the
-"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and
-.resume() callbacks from struct pci_driver are used.  The legacy approach,
-however, doesn't allow one to define runtime power management callbacks and is
-not really suitable for any new drivers.  Therefore it is not covered by this
-document (refer to the source code to learn more about it).
-
-It is recommended that all PCI device drivers define a struct dev_pm_ops object
-containing pointers to power management (PM) callbacks that will be executed by
-the PCI subsystem's PM routines in various circumstances.  A pointer to the
-driver's struct dev_pm_ops object has to be assigned to the driver.pm field in
-its struct pci_driver object.  Once that has happened, the "legacy" PM callbacks
-in struct pci_driver are ignored (even if they are not NULL).
-
-The PM callbacks in struct dev_pm_ops are not mandatory and if they are not
-defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI
-subsystem will handle the device in a simplified default manner.  If they are
-defined, though, they are expected to behave as described in the following
-subsections.
-
-3.1.1. prepare()
-
-The prepare() callback is executed during system suspend, during hibernation
-(when a hibernation image is about to be created), during power-off after
-saving a hibernation image and during system restore, when a hibernation image
-has just been loaded into memory.
-
-This callback is only necessary if the driver's device has children that in
-general may be registered at any time.  In that case the role of the prepare()
-callback is to prevent new children of the device from being registered until
-one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run.
-
-In addition to that the prepare() callback may carry out some operations
-preparing the device to be suspended, although it should not allocate memory
-(if additional memory is required to suspend the device, it has to be
-preallocated earlier, for example in a suspend/hibernate notifier as described
-in Documentation/driver-api/pm/notifiers.rst).
-
-3.1.2. suspend()
-
-The suspend() callback is only executed during system suspend, after prepare()
-callbacks have been executed for all devices in the system.
-
-This callback is expected to quiesce the device and prepare it to be put into a
-low-power state by the PCI subsystem.  It is not required (in fact it even is
-not recommended) that a PCI driver's suspend() callback save the standard
-configuration registers of the device, prepare it for waking up the system, or
-put it into a low-power state.  All of these operations can very well be taken
-care of by the PCI subsystem, without the driver's participation.
-
-However, in some rare case it is convenient to carry out these operations in
-a PCI driver.  Then, pci_save_state(), pci_prepare_to_sleep(), and
-pci_set_power_state() should be used to save the device's standard configuration
-registers, to prepare it for system wakeup (if necessary), and to put it into a
-low-power state, respectively.  Moreover, if the driver calls pci_save_state(),
-the PCI subsystem will not execute either pci_prepare_to_sleep(), or
-pci_set_power_state() for its device, so the driver is then responsible for
-handling the device as appropriate.
-
-While the suspend() callback is being executed, the driver's interrupt handler
-can be invoked to handle an interrupt from the device, so all suspend-related
-operations relying on the driver's ability to handle interrupts should be
-carried out in this callback.
-
-3.1.3. suspend_noirq()
-
-The suspend_noirq() callback is only executed during system suspend, after
-suspend() callbacks have been executed for all devices in the system and
-after device interrupts have been disabled by the PM core.
-
-The difference between suspend_noirq() and suspend() is that the driver's
-interrupt handler will not be invoked while suspend_noirq() is running.  Thus
-suspend_noirq() can carry out operations that would cause race conditions to
-arise if they were performed in suspend().
-
-3.1.4. freeze()
-
-The freeze() callback is hibernation-specific and is executed in two situations,
-during hibernation, after prepare() callbacks have been executed for all devices
-in preparation for the creation of a system image, and during restore,
-after a system image has been loaded into memory from persistent storage and the
-prepare() callbacks have been executed for all devices.
-
-The role of this callback is analogous to the role of the suspend() callback
-described above.  In fact, they only need to be different in the rare cases when
-the driver takes the responsibility for putting the device into a low-power
-state.
-
-In that cases the freeze() callback should not prepare the device system wakeup
-or put it into a low-power state.  Still, either it or freeze_noirq() should
-save the device's standard configuration registers using pci_save_state().
-
-3.1.5. freeze_noirq()
-
-The freeze_noirq() callback is hibernation-specific.  It is executed during
-hibernation, after prepare() and freeze() callbacks have been executed for all
-devices in preparation for the creation of a system image, and during restore,
-after a system image has been loaded into memory and after prepare() and
-freeze() callbacks have been executed for all devices.  It is always executed
-after device interrupts have been disabled by the PM core.
-
-The role of this callback is analogous to the role of the suspend_noirq()
-callback described above and it very rarely is necessary to define
-freeze_noirq().
-
-The difference between freeze_noirq() and freeze() is analogous to the
-difference between suspend_noirq() and suspend().
-
-3.1.6. poweroff()
-
-The poweroff() callback is hibernation-specific.  It is executed when the system
-is about to be powered off after saving a hibernation image to a persistent
-storage.  prepare() callbacks are executed for all devices before poweroff() is
-called.
-
-The role of this callback is analogous to the role of the suspend() and freeze()
-callbacks described above, although it does not need to save the contents of
-the device's registers.  In particular, if the driver wants to put the device
-into a low-power state itself instead of allowing the PCI subsystem to do that,
-the poweroff() callback should use pci_prepare_to_sleep() and
-pci_set_power_state() to prepare the device for system wakeup and to put it
-into a low-power state, respectively, but it need not save the device's standard
-configuration registers.
-
-3.1.7. poweroff_noirq()
-
-The poweroff_noirq() callback is hibernation-specific.  It is executed after
-poweroff() callbacks have been executed for all devices in the system.
-
-The role of this callback is analogous to the role of the suspend_noirq() and
-freeze_noirq() callbacks described above, but it does not need to save the
-contents of the device's registers.
-
-The difference between poweroff_noirq() and poweroff() is analogous to the
-difference between suspend_noirq() and suspend().
-
-3.1.8. resume_noirq()
-
-The resume_noirq() callback is only executed during system resume, after the
-PM core has enabled the non-boot CPUs.  The driver's interrupt handler will not
-be invoked while resume_noirq() is running, so this callback can carry out
-operations that might race with the interrupt handler.
-
-Since the PCI subsystem unconditionally puts all devices into the full power
-state in the resume_noirq phase of system resume and restores their standard
-configuration registers, resume_noirq() is usually not necessary.  In general
-it should only be used for performing operations that would lead to race
-conditions if carried out by resume().
-
-3.1.9. resume()
-
-The resume() callback is only executed during system resume, after
-resume_noirq() callbacks have been executed for all devices in the system and
-device interrupts have been enabled by the PM core.
-
-This callback is responsible for restoring the pre-suspend configuration of the
-device and bringing it back to the fully functional state.  The device should be
-able to process I/O in a usual way after resume() has returned.
-
-3.1.10. thaw_noirq()
-
-The thaw_noirq() callback is hibernation-specific.  It is executed after a
-system image has been created and the non-boot CPUs have been enabled by the PM
-core, in the thaw_noirq phase of hibernation.  It also may be executed if the
-loading of a hibernation image fails during system restore (it is then executed
-after enabling the non-boot CPUs).  The driver's interrupt handler will not be
-invoked while thaw_noirq() is running.
-
-The role of this callback is analogous to the role of resume_noirq().  The
-difference between these two callbacks is that thaw_noirq() is executed after
-freeze() and freeze_noirq(), so in general it does not need to modify the
-contents of the device's registers.
-
-3.1.11. thaw()
-
-The thaw() callback is hibernation-specific.  It is executed after thaw_noirq()
-callbacks have been executed for all devices in the system and after device
-interrupts have been enabled by the PM core.
-
-This callback is responsible for restoring the pre-freeze configuration of
-the device, so that it will work in a usual way after thaw() has returned.
-
-3.1.12. restore_noirq()
-
-The restore_noirq() callback is hibernation-specific.  It is executed in the
-restore_noirq phase of hibernation, when the boot kernel has passed control to
-the image kernel and the non-boot CPUs have been enabled by the image kernel's
-PM core.
-
-This callback is analogous to resume_noirq() with the exception that it cannot
-make any assumption on the previous state of the device, even if the BIOS (or
-generally the platform firmware) is known to preserve that state over a
-suspend-resume cycle.
-
-For the vast majority of PCI device drivers there is no difference between
-resume_noirq() and restore_noirq().
-
-3.1.13. restore()
-
-The restore() callback is hibernation-specific.  It is executed after
-restore_noirq() callbacks have been executed for all devices in the system and
-after the PM core has enabled device drivers' interrupt handlers to be invoked.
-
-This callback is analogous to resume(), just like restore_noirq() is analogous
-to resume_noirq().  Consequently, the difference between restore_noirq() and
-restore() is analogous to the difference between resume_noirq() and resume().
-
-For the vast majority of PCI device drivers there is no difference between
-resume() and restore().
-
-3.1.14. complete()
-
-The complete() callback is executed in the following situations:
-  - during system resume, after resume() callbacks have been executed for all
-    devices,
-  - during hibernation, before saving the system image, after thaw() callbacks
-    have been executed for all devices,
-  - during system restore, when the system is going back to its pre-hibernation
-    state, after restore() callbacks have been executed for all devices.
-It also may be executed if the loading of a hibernation image into memory fails
-(in that case it is run after thaw() callbacks have been executed for all
-devices that have drivers in the boot kernel).
-
-This callback is entirely optional, although it may be necessary if the
-prepare() callback performs operations that need to be reversed.
-
-3.1.15. runtime_suspend()
-
-The runtime_suspend() callback is specific to device runtime power management
-(runtime PM).  It is executed by the PM core's runtime PM framework when the
-device is about to be suspended (i.e. quiesced and put into a low-power state)
-at run time.
-
-This callback is responsible for freezing the device and preparing it to be
-put into a low-power state, but it must allow the PCI subsystem to perform all
-of the PCI-specific actions necessary for suspending the device.
-
-3.1.16. runtime_resume()
-
-The runtime_resume() callback is specific to device runtime PM.  It is executed
-by the PM core's runtime PM framework when the device is about to be resumed
-(i.e. put into the full-power state and programmed to process I/O normally) at
-run time.
-
-This callback is responsible for restoring the normal functionality of the
-device after it has been put into the full-power state by the PCI subsystem.
-The device is expected to be able to process I/O in the usual way after
-runtime_resume() has returned.
-
-3.1.17. runtime_idle()
-
-The runtime_idle() callback is specific to device runtime PM.  It is executed
-by the PM core's runtime PM framework whenever it may be desirable to suspend
-the device according to the PM core's information.  In particular, it is
-automatically executed right after runtime_resume() has returned in case the
-resume of the device has happened as a result of a spurious event.
-
-This callback is optional, but if it is not implemented or if it returns 0, the
-PCI subsystem will call pm_runtime_suspend() for the device, which in turn will
-cause the driver's runtime_suspend() callback to be executed.
-
-3.1.18. Pointing Multiple Callback Pointers to One Routine
-
-Although in principle each of the callbacks described in the previous
-subsections can be defined as a separate function, it often is convenient to
-point two or more members of struct dev_pm_ops to the same routine.  There are
-a few convenience macros that can be used for this purpose.
-
-The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
-suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
-members and one resume routine pointed to by the .resume(), .thaw(), and
-.restore() members.  The other function pointers in this struct dev_pm_ops are
-unset.
-
-The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
-additionally sets the .runtime_resume() pointer to the same value as
-.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
-the same value as .suspend() (and .freeze() and .poweroff()).
-
-The SET_SYSTEM_SLEEP_PM_OPS can be used inside of a declaration of struct
-dev_pm_ops to indicate that one suspend routine is to be pointed to by the
-.suspend(), .freeze(), and .poweroff() members and one resume routine is to
-be pointed to by the .resume(), .thaw(), and .restore() members.
-
-3.1.19. Driver Flags for Power Management
-
-The PM core allows device drivers to set flags that influence the handling of
-power management for the devices by the core itself and by middle layer code
-including the PCI bus type.  The flags should be set once at the driver probe
-time with the help of the dev_pm_set_driver_flags() function and they should not
-be updated directly afterwards.
-
-The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete
-mechanism allowing device suspend/resume callbacks to be skipped if the device
-is in runtime suspend when the system suspend starts.  That also affects all of
-the ancestors of the device, so this flag should only be used if absolutely
-necessary.
-
-The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a
-positive value from pci_pm_prepare() if the ->prepare callback provided by the
-driver of the device returns a positive value.  That allows the driver to opt
-out from using the direct-complete mechanism dynamically.
-
-The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's
-perspective the device can be safely left in runtime suspend during system
-suspend.  That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff()
-to skip resuming the device from runtime suspend unless there are PCI-specific
-reasons for doing that.  Also, it causes pci_pm_suspend_late/noirq(),
-pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early
-if the device remains in runtime suspend in the beginning of the "late" phase
-of the system-wide transition under way.  Moreover, if the device is in
-runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime
-power management status will be changed to "active" (as it is going to be put
-into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(),
-the function will set the power.direct_complete flag for it (to make the PM core
-skip the subsequent "thaw" callbacks for it) and return.
-
-Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
-device to be left in suspend after system-wide transitions to the working state.
-This flag is checked by the PM core, but the PCI bus type informs the PM core
-which devices may be left in suspend from its perspective (that happens during
-the "noirq" phase of system-wide suspend and analogous transitions) and next it
-uses the dev_pm_may_skip_resume() helper to decide whether or not to return from
-pci_pm_resume_noirq() early, as the PM core will skip the remaining resume
-callbacks for the device during the transition under way and will set its
-runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for
-it.
-
-3.2. Device Runtime Power Management
-------------------------------------
-In addition to providing device power management callbacks PCI device drivers
-are responsible for controlling the runtime power management (runtime PM) of
-their devices.
-
-The PCI device runtime PM is optional, but it is recommended that PCI device
-drivers implement it at least in the cases where there is a reliable way of
-verifying that the device is not used (like when the network cable is detached
-from an Ethernet adapter or there are no devices attached to a USB controller).
-
-To support the PCI runtime PM the driver first needs to implement the
-runtime_suspend() and runtime_resume() callbacks.  It also may need to implement
-the runtime_idle() callback to prevent the device from being suspended again
-every time right after the runtime_resume() callback has returned
-(alternatively, the runtime_suspend() callback will have to check if the
-device should really be suspended and return -EAGAIN if that is not the case).
-
-The runtime PM of PCI devices is enabled by default by the PCI core.  PCI
-device drivers do not need to enable it and should not attempt to do so.
-However, it is blocked by pci_pm_init() that runs the pm_runtime_forbid()
-helper function.  In addition to that, the runtime PM usage counter of
-each PCI device is incremented by local_pci_probe() before executing the
-probe callback provided by the device's driver.
-
-If a PCI driver implements the runtime PM callbacks and intends to use the
-runtime PM framework provided by the PM core and the PCI subsystem, it needs
-to decrement the device's runtime PM usage counter in its probe callback
-function.  If it doesn't do that, the counter will always be different from
-zero for the device and it will never be runtime-suspended.  The simplest
-way to do that is by calling pm_runtime_put_noidle(), but if the driver
-wants to schedule an autosuspend right away, for example, it may call
-pm_runtime_put_autosuspend() instead for this purpose.  Generally, it
-just needs to call a function that decrements the devices usage counter
-from its probe routine to make runtime PM work for the device.
-
-It is important to remember that the driver's runtime_suspend() callback
-may be executed right after the usage counter has been decremented, because
-user space may already have caused the pm_runtime_allow() helper function
-unblocking the runtime PM of the device to run via sysfs, so the driver must
-be prepared to cope with that.
-
-The driver itself should not call pm_runtime_allow(), though.  Instead, it
-should let user space or some platform-specific code do that (user space can
-do it via sysfs as stated above), but it must be prepared to handle the
-runtime PM of the device correctly as soon as pm_runtime_allow() is called
-(which may happen at any time, even before the driver is loaded).
-
-When the driver's remove callback runs, it has to balance the decrementation
-of the device's runtime PM usage counter at the probe time.  For this reason,
-if it has decremented the counter in its probe callback, it must run
-pm_runtime_get_noresume() in its remove callback.  [Since the core carries
-out a runtime resume of the device and bumps up the device's usage counter
-before running the driver's remove callback, the runtime PM of the device
-is effectively disabled for the duration of the remove execution and all
-runtime PM helper functions incrementing the device's usage counter are
-then effectively equivalent to pm_runtime_get_noresume().]
-
-The runtime PM framework works by processing requests to suspend or resume
-devices, or to check if they are idle (in which cases it is reasonable to
-subsequently request that they be suspended).  These requests are represented
-by work items put into the power management workqueue, pm_wq.  Although there
-are a few situations in which power management requests are automatically
-queued by the PM core (for example, after processing a request to resume a
-device the PM core automatically queues a request to check if the device is
-idle), device drivers are generally responsible for queuing power management
-requests for their devices.  For this purpose they should use the runtime PM
-helper functions provided by the PM core, discussed in
-Documentation/power/runtime_pm.txt.
-
-Devices can also be suspended and resumed synchronously, without placing a
-request into pm_wq.  In the majority of cases this also is done by their
-drivers that use helper functions provided by the PM core for this purpose.
-
-For more information on the runtime PM of devices refer to
-Documentation/power/runtime_pm.txt.
-
-
-4. Resources
-============
-
-PCI Local Bus Specification, Rev. 3.0
-PCI Bus Power Management Interface Specification, Rev. 1.2
-Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
-PCI Express Base Specification, Rev. 2.0
-Documentation/driver-api/pm/devices.rst
-Documentation/power/runtime_pm.txt
diff --git a/Documentation/power/pm_qos_interface.rst b/Documentation/power/pm_qos_interface.rst
new file mode 100644
index 000000000000..69921f072ce1
--- /dev/null
+++ b/Documentation/power/pm_qos_interface.rst
@@ -0,0 +1,227 @@
+===============================
+PM Quality Of Service Interface
+===============================
+
+This interface provides a kernel and user mode interface for registering
+performance expectations by drivers, subsystems and user space applications on
+one of the parameters.
+
+Two different PM QoS frameworks are available:
+1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput,
+memory_bandwidth.
+2. the per-device PM QoS framework provides the API to manage the per-device latency
+constraints and PM QoS flags.
+
+Each parameters have defined units:
+
+ * latency: usec
+ * timeout: usec
+ * throughput: kbs (kilo bit / sec)
+ * memory bandwidth: mbs (mega bit / sec)
+
+
+1. PM QoS framework
+===================
+
+The infrastructure exposes multiple misc device nodes one per implemented
+parameter.  The set of parameters implement is defined by pm_qos_power_init()
+and pm_qos_params.h.  This is done because having the available parameters
+being runtime configurable or changeable from a driver was seen as too easy to
+abuse.
+
+For each parameter a list of performance requests is maintained along with
+an aggregated target value.  The aggregated target value is updated with
+changes to the request list or elements of the list.  Typically the
+aggregated target value is simply the max or min of the request values held
+in the parameter list elements.
+Note: the aggregated target value is implemented as an atomic variable so that
+reading the aggregated value does not require any locking mechanism.
+
+
+From kernel mode the use of this interface is simple:
+
+void pm_qos_add_request(handle, param_class, target_value):
+  Will insert an element into the list for that identified PM QoS class with the
+  target value.  Upon change to this list the new target is recomputed and any
+  registered notifiers are called only if the target value is now different.
+  Clients of pm_qos need to save the returned handle for future use in other
+  pm_qos API functions.
+
+void pm_qos_update_request(handle, new_target_value):
+  Will update the list element pointed to by the handle with the new target value
+  and recompute the new aggregated target, calling the notification tree if the
+  target is changed.
+
+void pm_qos_remove_request(handle):
+  Will remove the element.  After removal it will update the aggregate target and
+  call the notification tree if the target was changed as a result of removing
+  the request.
+
+int pm_qos_request(param_class):
+  Returns the aggregated value for a given PM QoS class.
+
+int pm_qos_request_active(handle):
+  Returns if the request is still active, i.e. it has not been removed from a
+  PM QoS class constraints list.
+
+int pm_qos_add_notifier(param_class, notifier):
+  Adds a notification callback function to the PM QoS class. The callback is
+  called when the aggregated value for the PM QoS class is changed.
+
+int pm_qos_remove_notifier(int param_class, notifier):
+  Removes the notification callback function for the PM QoS class.
+
+
+From user mode:
+
+Only processes can register a pm_qos request.  To provide for automatic
+cleanup of a process, the interface requires the process to register its
+parameter requests in the following way:
+
+To register the default pm_qos target for the specific parameter, the process
+must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
+
+As long as the device node is held open that process has a registered
+request on the parameter.
+
+To change the requested target value the process needs to write an s32 value to
+the open device node.  Alternatively the user mode program could write a hex
+string for the value using 10 char long format e.g. "0x12345678".  This
+translates to a pm_qos_update_request call.
+
+To remove the user mode request for a target value simply close the device
+node.
+
+
+2. PM QoS per-device latency and flags framework
+================================================
+
+For each device, there are three lists of PM QoS requests. Two of them are
+maintained along with the aggregated targets of resume latency and active
+state latency tolerance (in microseconds) and the third one is for PM QoS flags.
+Values are updated in response to changes of the request list.
+
+The target values of resume latency and active state latency tolerance are
+simply the minimum of the request values held in the parameter list elements.
+The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements'
+values.  One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF.
+
+Note: The aggregated target values are implemented in such a way that reading
+the aggregated value does not require any locking mechanism.
+
+
+From kernel mode the use of this interface is the following:
+
+int dev_pm_qos_add_request(device, handle, type, value):
+  Will insert an element into the list for that identified device with the
+  target value.  Upon change to this list the new target is recomputed and any
+  registered notifiers are called only if the target value is now different.
+  Clients of dev_pm_qos need to save the handle for future use in other
+  dev_pm_qos API functions.
+
+int dev_pm_qos_update_request(handle, new_value):
+  Will update the list element pointed to by the handle with the new target
+  value and recompute the new aggregated target, calling the notification
+  trees if the target is changed.
+
+int dev_pm_qos_remove_request(handle):
+  Will remove the element.  After removal it will update the aggregate target
+  and call the notification trees if the target was changed as a result of
+  removing the request.
+
+s32 dev_pm_qos_read_value(device, type):
+  Returns the aggregated value for a given device's constraints list.
+
+enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
+  Check PM QoS flags of the given device against the given mask of flags.
+  The meaning of the return values is as follows:
+
+	PM_QOS_FLAGS_ALL:
+		All flags from the mask are set
+	PM_QOS_FLAGS_SOME:
+		Some flags from the mask are set
+	PM_QOS_FLAGS_NONE:
+		No flags from the mask are set
+	PM_QOS_FLAGS_UNDEFINED:
+		The device's PM QoS structure has not been initialized
+		or the list of requests is empty.
+
+int dev_pm_qos_add_ancestor_request(dev, handle, type, value)
+  Add a PM QoS request for the first direct ancestor of the given device whose
+  power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests)
+  or whose power.set_latency_tolerance callback pointer is not NULL (for
+  DEV_PM_QOS_LATENCY_TOLERANCE requests).
+
+int dev_pm_qos_expose_latency_limit(device, value)
+  Add a request to the device's PM QoS list of resume latency constraints and
+  create a sysfs attribute pm_qos_resume_latency_us under the device's power
+  directory allowing user space to manipulate that request.
+
+void dev_pm_qos_hide_latency_limit(device)
+  Drop the request added by dev_pm_qos_expose_latency_limit() from the device's
+  PM QoS list of resume latency constraints and remove sysfs attribute
+  pm_qos_resume_latency_us from the device's power directory.
+
+int dev_pm_qos_expose_flags(device, value)
+  Add a request to the device's PM QoS list of flags and create sysfs attribute
+  pm_qos_no_power_off under the device's power directory allowing user space to
+  change the value of the PM_QOS_FLAG_NO_POWER_OFF flag.
+
+void dev_pm_qos_hide_flags(device)
+  Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list
+  of flags and remove sysfs attribute pm_qos_no_power_off from the device's power
+  directory.
+
+Notification mechanisms:
+
+The per-device PM QoS framework has a per-device notification tree.
+
+int dev_pm_qos_add_notifier(device, notifier, type):
+  Adds a notification callback function for the device for a particular request
+  type.
+
+  The callback is called when the aggregated value of the device constraints list
+  is changed.
+
+int dev_pm_qos_remove_notifier(device, notifier, type):
+  Removes the notification callback function for the device.
+
+
+Active state latency tolerance
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This device PM QoS type is used to support systems in which hardware may switch
+to energy-saving operation modes on the fly.  In those systems, if the operation
+mode chosen by the hardware attempts to save energy in an overly aggressive way,
+it may cause excess latencies to be visible to software, causing it to miss
+certain protocol requirements or target frame or sample rates etc.
+
+If there is a latency tolerance control mechanism for a given device available
+to software, the .set_latency_tolerance callback in that device's dev_pm_info
+structure should be populated.  The routine pointed to by it is should implement
+whatever is necessary to transfer the effective requirement value to the
+hardware.
+
+Whenever the effective latency tolerance changes for the device, its
+.set_latency_tolerance() callback will be executed and the effective value will
+be passed to it.  If that value is negative, which means that the list of
+latency tolerance requirements for the device is empty, the callback is expected
+to switch the underlying hardware latency tolerance control mechanism to an
+autonomous mode if available.  If that value is PM_QOS_LATENCY_ANY, in turn, and
+the hardware supports a special "no requirement" setting, the callback is
+expected to use it.  That allows software to prevent the hardware from
+automatically updating the device's latency tolerance in response to its power
+state changes (e.g. during transitions from D3cold to D0), which generally may
+be done in the autonomous latency tolerance control mode.
+
+If .set_latency_tolerance() is present for the device, sysfs attribute
+pm_qos_latency_tolerance_us will be present in the devivce's power directory.
+Then, user space can use that attribute to specify its latency tolerance
+requirement for the device, if any.  Writing "any" to it means "no requirement,
+but do not let the hardware control latency tolerance" and writing "auto" to it
+allows the hardware to be switched to the autonomous mode if there are no other
+requirements from the kernel side in the device's list.
+
+Kernel code can use the functions described above along with the
+DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update
+latency tolerance requirements for devices.
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
deleted file mode 100644
index 19c5f7b1a7ba..000000000000
--- a/Documentation/power/pm_qos_interface.txt
+++ /dev/null
@@ -1,212 +0,0 @@
-PM Quality Of Service Interface.
-
-This interface provides a kernel and user mode interface for registering
-performance expectations by drivers, subsystems and user space applications on
-one of the parameters.
-
-Two different PM QoS frameworks are available:
-1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput,
-memory_bandwidth.
-2. the per-device PM QoS framework provides the API to manage the per-device latency
-constraints and PM QoS flags.
-
-Each parameters have defined units:
- * latency: usec
- * timeout: usec
- * throughput: kbs (kilo bit / sec)
- * memory bandwidth: mbs (mega bit / sec)
-
-
-1. PM QoS framework
-
-The infrastructure exposes multiple misc device nodes one per implemented
-parameter.  The set of parameters implement is defined by pm_qos_power_init()
-and pm_qos_params.h.  This is done because having the available parameters
-being runtime configurable or changeable from a driver was seen as too easy to
-abuse.
-
-For each parameter a list of performance requests is maintained along with
-an aggregated target value.  The aggregated target value is updated with
-changes to the request list or elements of the list.  Typically the
-aggregated target value is simply the max or min of the request values held
-in the parameter list elements.
-Note: the aggregated target value is implemented as an atomic variable so that
-reading the aggregated value does not require any locking mechanism.
-
-
-From kernel mode the use of this interface is simple:
-
-void pm_qos_add_request(handle, param_class, target_value):
-Will insert an element into the list for that identified PM QoS class with the
-target value.  Upon change to this list the new target is recomputed and any
-registered notifiers are called only if the target value is now different.
-Clients of pm_qos need to save the returned handle for future use in other
-pm_qos API functions.
-
-void pm_qos_update_request(handle, new_target_value):
-Will update the list element pointed to by the handle with the new target value
-and recompute the new aggregated target, calling the notification tree if the
-target is changed.
-
-void pm_qos_remove_request(handle):
-Will remove the element.  After removal it will update the aggregate target and
-call the notification tree if the target was changed as a result of removing
-the request.
-
-int pm_qos_request(param_class):
-Returns the aggregated value for a given PM QoS class.
-
-int pm_qos_request_active(handle):
-Returns if the request is still active, i.e. it has not been removed from a
-PM QoS class constraints list.
-
-int pm_qos_add_notifier(param_class, notifier):
-Adds a notification callback function to the PM QoS class. The callback is
-called when the aggregated value for the PM QoS class is changed.
-
-int pm_qos_remove_notifier(int param_class, notifier):
-Removes the notification callback function for the PM QoS class.
-
-
-From user mode:
-Only processes can register a pm_qos request.  To provide for automatic
-cleanup of a process, the interface requires the process to register its
-parameter requests in the following way:
-
-To register the default pm_qos target for the specific parameter, the process
-must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
-
-As long as the device node is held open that process has a registered
-request on the parameter.
-
-To change the requested target value the process needs to write an s32 value to
-the open device node.  Alternatively the user mode program could write a hex
-string for the value using 10 char long format e.g. "0x12345678".  This
-translates to a pm_qos_update_request call.
-
-To remove the user mode request for a target value simply close the device
-node.
-
-
-2. PM QoS per-device latency and flags framework
-
-For each device, there are three lists of PM QoS requests. Two of them are
-maintained along with the aggregated targets of resume latency and active
-state latency tolerance (in microseconds) and the third one is for PM QoS flags.
-Values are updated in response to changes of the request list.
-
-The target values of resume latency and active state latency tolerance are
-simply the minimum of the request values held in the parameter list elements.
-The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements'
-values.  One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF.
-
-Note: The aggregated target values are implemented in such a way that reading
-the aggregated value does not require any locking mechanism.
-
-
-From kernel mode the use of this interface is the following:
-
-int dev_pm_qos_add_request(device, handle, type, value):
-Will insert an element into the list for that identified device with the
-target value.  Upon change to this list the new target is recomputed and any
-registered notifiers are called only if the target value is now different.
-Clients of dev_pm_qos need to save the handle for future use in other
-dev_pm_qos API functions.
-
-int dev_pm_qos_update_request(handle, new_value):
-Will update the list element pointed to by the handle with the new target value
-and recompute the new aggregated target, calling the notification trees if the
-target is changed.
-
-int dev_pm_qos_remove_request(handle):
-Will remove the element.  After removal it will update the aggregate target and
-call the notification trees if the target was changed as a result of removing
-the request.
-
-s32 dev_pm_qos_read_value(device):
-Returns the aggregated value for a given device's constraints list.
-
-enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
-Check PM QoS flags of the given device against the given mask of flags.
-The meaning of the return values is as follows:
-	PM_QOS_FLAGS_ALL: All flags from the mask are set
-	PM_QOS_FLAGS_SOME: Some flags from the mask are set
-	PM_QOS_FLAGS_NONE: No flags from the mask are set
-	PM_QOS_FLAGS_UNDEFINED: The device's PM QoS structure has not been
-			initialized or the list of requests is empty.
-
-int dev_pm_qos_add_ancestor_request(dev, handle, type, value)
-Add a PM QoS request for the first direct ancestor of the given device whose
-power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests)
-or whose power.set_latency_tolerance callback pointer is not NULL (for
-DEV_PM_QOS_LATENCY_TOLERANCE requests).
-
-int dev_pm_qos_expose_latency_limit(device, value)
-Add a request to the device's PM QoS list of resume latency constraints and
-create a sysfs attribute pm_qos_resume_latency_us under the device's power
-directory allowing user space to manipulate that request.
-
-void dev_pm_qos_hide_latency_limit(device)
-Drop the request added by dev_pm_qos_expose_latency_limit() from the device's
-PM QoS list of resume latency constraints and remove sysfs attribute
-pm_qos_resume_latency_us from the device's power directory.
-
-int dev_pm_qos_expose_flags(device, value)
-Add a request to the device's PM QoS list of flags and create sysfs attribute
-pm_qos_no_power_off under the device's power directory allowing user space to
-change the value of the PM_QOS_FLAG_NO_POWER_OFF flag.
-
-void dev_pm_qos_hide_flags(device)
-Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list
-of flags and remove sysfs attribute pm_qos_no_power_off from the device's power
-directory.
-
-Notification mechanisms:
-The per-device PM QoS framework has a per-device notification tree.
-
-int dev_pm_qos_add_notifier(device, notifier):
-Adds a notification callback function for the device.
-The callback is called when the aggregated value of the device constraints list
-is changed (for resume latency device PM QoS only).
-
-int dev_pm_qos_remove_notifier(device, notifier):
-Removes the notification callback function for the device.
-
-
-Active state latency tolerance
-
-This device PM QoS type is used to support systems in which hardware may switch
-to energy-saving operation modes on the fly.  In those systems, if the operation
-mode chosen by the hardware attempts to save energy in an overly aggressive way,
-it may cause excess latencies to be visible to software, causing it to miss
-certain protocol requirements or target frame or sample rates etc.
-
-If there is a latency tolerance control mechanism for a given device available
-to software, the .set_latency_tolerance callback in that device's dev_pm_info
-structure should be populated.  The routine pointed to by it is should implement
-whatever is necessary to transfer the effective requirement value to the
-hardware.
-
-Whenever the effective latency tolerance changes for the device, its
-.set_latency_tolerance() callback will be executed and the effective value will
-be passed to it.  If that value is negative, which means that the list of
-latency tolerance requirements for the device is empty, the callback is expected
-to switch the underlying hardware latency tolerance control mechanism to an
-autonomous mode if available.  If that value is PM_QOS_LATENCY_ANY, in turn, and
-the hardware supports a special "no requirement" setting, the callback is
-expected to use it.  That allows software to prevent the hardware from
-automatically updating the device's latency tolerance in response to its power
-state changes (e.g. during transitions from D3cold to D0), which generally may
-be done in the autonomous latency tolerance control mode.
-
-If .set_latency_tolerance() is present for the device, sysfs attribute
-pm_qos_latency_tolerance_us will be present in the devivce's power directory.
-Then, user space can use that attribute to specify its latency tolerance
-requirement for the device, if any.  Writing "any" to it means "no requirement,
-but do not let the hardware control latency tolerance" and writing "auto" to it
-allows the hardware to be switched to the autonomous mode if there are no other
-requirements from the kernel side in the device's list.
-
-Kernel code can use the functions described above along with the
-DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update
-latency tolerance requirements for devices.
diff --git a/Documentation/power/power_supply_class.rst b/Documentation/power/power_supply_class.rst
new file mode 100644
index 000000000000..7b8c42f8b1de
--- /dev/null
+++ b/Documentation/power/power_supply_class.rst
@@ -0,0 +1,288 @@
+========================
+Linux power supply class
+========================
+
+Synopsis
+~~~~~~~~
+Power supply class used to represent battery, UPS, AC or DC power supply
+properties to user-space.
+
+It defines core set of attributes, which should be applicable to (almost)
+every power supply out there. Attributes are available via sysfs and uevent
+interfaces.
+
+Each attribute has well defined meaning, up to unit of measure used. While
+the attributes provided are believed to be universally applicable to any
+power supply, specific monitoring hardware may not be able to provide them
+all, so any of them may be skipped.
+
+Power supply class is extensible, and allows to define drivers own attributes.
+The core attribute set is subject to the standard Linux evolution (i.e.
+if it will be found that some attribute is applicable to many power supply
+types or their drivers, it can be added to the core set).
+
+It also integrates with LED framework, for the purpose of providing
+typically expected feedback of battery charging/fully charged status and
+AC/USB power supply online status. (Note that specific details of the
+indication (including whether to use it at all) are fully controllable by
+user and/or specific machine defaults, per design principles of LED
+framework).
+
+
+Attributes/properties
+~~~~~~~~~~~~~~~~~~~~~
+Power supply class has predefined set of attributes, this eliminates code
+duplication across drivers. Power supply class insist on reusing its
+predefined attributes *and* their units.
+
+So, userspace gets predictable set of attributes and their units for any
+kind of power supply, and can process/present them to a user in consistent
+manner. Results for different power supplies and machines are also directly
+comparable.
+
+See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c
+for the example how to declare and handle attributes.
+
+
+Units
+~~~~~
+Quoting include/linux/power_supply.h:
+
+  All voltages, currents, charges, energies, time and temperatures in µV,
+  µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise
+  stated. It's driver's job to convert its raw values to units in which
+  this class operates.
+
+
+Attributes/properties detailed
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++--------------------------------------------------------------------------+
+|               **Charge/Energy/Capacity - how to not confuse**            |
++--------------------------------------------------------------------------+
+| **Because both "charge" (µAh) and "energy" (µWh) represents "capacity"   |
+| of battery, this class distinguish these terms. Don't mix them!**        |
+|                                                                          |
+| - `CHARGE_*`                                                             |
+|	attributes represents capacity in µAh only.                        |
+| - `ENERGY_*`                                                             |
+|	attributes represents capacity in µWh only.                        |
+| - `CAPACITY`                                                             |
+|	attribute represents capacity in *percents*, from 0 to 100.        |
++--------------------------------------------------------------------------+
+
+Postfixes:
+
+_AVG
+  *hardware* averaged value, use it if your hardware is really able to
+  report averaged values.
+_NOW
+  momentary/instantaneous values.
+
+STATUS
+  this attribute represents operating status (charging, full,
+  discharging (i.e. powering a load), etc.). This corresponds to
+  `BATTERY_STATUS_*` values, as defined in battery.h.
+
+CHARGE_TYPE
+  batteries can typically charge at different rates.
+  This defines trickle and fast charges.  For batteries that
+  are already charged or discharging, 'n/a' can be displayed (or
+  'unknown', if the status is not known).
+
+AUTHENTIC
+  indicates the power supply (battery or charger) connected
+  to the platform is authentic(1) or non authentic(0).
+
+HEALTH
+  represents health of the battery, values corresponds to
+  POWER_SUPPLY_HEALTH_*, defined in battery.h.
+
+VOLTAGE_OCV
+  open circuit voltage of the battery.
+
+VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN
+  design values for maximal and minimal power supply voltages.
+  Maximal/minimal means values of voltages when battery considered
+  "full"/"empty" at normal conditions. Yes, there is no direct relation
+  between voltage and battery capacity, but some dumb
+  batteries use voltage for very approximated calculation of capacity.
+  Battery driver also can use this attribute just to inform userspace
+  about maximal and minimal voltage thresholds of a given battery.
+
+VOLTAGE_MAX, VOLTAGE_MIN
+  same as _DESIGN voltage values except that these ones should be used
+  if hardware could only guess (measure and retain) the thresholds of a
+  given power supply.
+
+VOLTAGE_BOOT
+  Reports the voltage measured during boot
+
+CURRENT_BOOT
+  Reports the current measured during boot
+
+CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN
+  design charge values, when battery considered full/empty.
+
+ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN
+  same as above but for energy.
+
+CHARGE_FULL, CHARGE_EMPTY
+  These attributes means "last remembered value of charge when battery
+  became full/empty". It also could mean "value of charge when battery
+  considered full/empty at given conditions (temperature, age)".
+  I.e. these attributes represents real thresholds, not design values.
+
+ENERGY_FULL, ENERGY_EMPTY
+  same as above but for energy.
+
+CHARGE_COUNTER
+  the current charge counter (in µAh).  This could easily
+  be negative; there is no empty or full value.  It is only useful for
+  relative, time-based measurements.
+
+PRECHARGE_CURRENT
+  the maximum charge current during precharge phase of charge cycle
+  (typically 20% of battery capacity).
+
+CHARGE_TERM_CURRENT
+  Charge termination current. The charge cycle terminates when battery
+  voltage is above recharge threshold, and charge current is below
+  this setting (typically 10% of battery capacity).
+
+CONSTANT_CHARGE_CURRENT
+  constant charge current programmed by charger.
+
+
+CONSTANT_CHARGE_CURRENT_MAX
+  maximum charge current supported by the power supply object.
+
+CONSTANT_CHARGE_VOLTAGE
+  constant charge voltage programmed by charger.
+CONSTANT_CHARGE_VOLTAGE_MAX
+  maximum charge voltage supported by the power supply object.
+
+INPUT_CURRENT_LIMIT
+  input current limit programmed by charger. Indicates
+  the current drawn from a charging source.
+INPUT_VOLTAGE_LIMIT
+  input voltage limit programmed by charger. Indicates
+  the voltage limit from a charging source.
+INPUT_POWER_LIMIT
+  input power limit programmed by charger. Indicates
+  the power limit from a charging source.
+
+CHARGE_CONTROL_LIMIT
+  current charge control limit setting
+CHARGE_CONTROL_LIMIT_MAX
+  maximum charge control limit setting
+
+CALIBRATE
+  battery or coulomb counter calibration status
+
+CAPACITY
+  capacity in percents.
+CAPACITY_ALERT_MIN
+  minimum capacity alert value in percents.
+CAPACITY_ALERT_MAX
+  maximum capacity alert value in percents.
+CAPACITY_LEVEL
+  capacity level. This corresponds to POWER_SUPPLY_CAPACITY_LEVEL_*.
+
+TEMP
+  temperature of the power supply.
+TEMP_ALERT_MIN
+  minimum battery temperature alert.
+TEMP_ALERT_MAX
+  maximum battery temperature alert.
+TEMP_AMBIENT
+  ambient temperature.
+TEMP_AMBIENT_ALERT_MIN
+  minimum ambient temperature alert.
+TEMP_AMBIENT_ALERT_MAX
+  maximum ambient temperature alert.
+TEMP_MIN
+  minimum operatable temperature
+TEMP_MAX
+  maximum operatable temperature
+
+TIME_TO_EMPTY
+  seconds left for battery to be considered empty
+  (i.e. while battery powers a load)
+TIME_TO_FULL
+  seconds left for battery to be considered full
+  (i.e. while battery is charging)
+
+
+Battery <-> external power supply interaction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Often power supplies are acting as supplies and supplicants at the same
+time. Batteries are good example. So, batteries usually care if they're
+externally powered or not.
+
+For that case, power supply class implements notification mechanism for
+batteries.
+
+External power supply (AC) lists supplicants (batteries) names in
+"supplied_to" struct member, and each power_supply_changed() call
+issued by external power supply will notify supplicants via
+external_power_changed callback.
+
+
+Devicetree battery characteristics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Drivers should call power_supply_get_battery_info() to obtain battery
+characteristics from a devicetree battery node, defined in
+Documentation/devicetree/bindings/power/supply/battery.txt. This is
+implemented in drivers/power/supply/bq27xxx_battery.c.
+
+Properties in struct power_supply_battery_info and their counterparts in the
+battery node have names corresponding to elements in enum power_supply_property,
+for naming consistency between sysfs attributes and battery node properties.
+
+
+QA
+~~
+
+Q:
+   Where is POWER_SUPPLY_PROP_XYZ attribute?
+A:
+   If you cannot find attribute suitable for your driver needs, feel free
+   to add it and send patch along with your driver.
+
+   The attributes available currently are the ones currently provided by the
+   drivers written.
+
+   Good candidates to add in future: model/part#, cycle_time, manufacturer,
+   etc.
+
+
+Q:
+   I have some very specific attribute (e.g. battery color), should I add
+   this attribute to standard ones?
+A:
+   Most likely, no. Such attribute can be placed in the driver itself, if
+   it is useful. Of course, if the attribute in question applicable to
+   large set of batteries, provided by many drivers, and/or comes from
+   some general battery specification/standard, it may be a candidate to
+   be added to the core attribute set.
+
+
+Q:
+   Suppose, my battery monitoring chip/firmware does not provides capacity
+   in percents, but provides charge_{now,full,empty}. Should I calculate
+   percentage capacity manually, inside the driver, and register CAPACITY
+   attribute? The same question about time_to_empty/time_to_full.
+A:
+   Most likely, no. This class is designed to export properties which are
+   directly measurable by the specific hardware available.
+
+   Inferring not available properties using some heuristics or mathematical
+   model is not subject of work for a battery driver. Such functionality
+   should be factored out, and in fact, apm_power, the driver to serve
+   legacy APM API on top of power supply class, uses a simple heuristic of
+   approximating remaining battery capacity based on its charge, current,
+   voltage and so on. But full-fledged battery model is likely not subject
+   for kernel at all, as it would require floating point calculation to deal
+   with things like differential equations and Kalman filters. This is
+   better be handled by batteryd/libbattery, yet to be written.
diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt
deleted file mode 100644
index 300d37896e51..000000000000
--- a/Documentation/power/power_supply_class.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Linux power supply class
-========================
-
-Synopsis
-~~~~~~~~
-Power supply class used to represent battery, UPS, AC or DC power supply
-properties to user-space.
-
-It defines core set of attributes, which should be applicable to (almost)
-every power supply out there. Attributes are available via sysfs and uevent
-interfaces.
-
-Each attribute has well defined meaning, up to unit of measure used. While
-the attributes provided are believed to be universally applicable to any
-power supply, specific monitoring hardware may not be able to provide them
-all, so any of them may be skipped.
-
-Power supply class is extensible, and allows to define drivers own attributes.
-The core attribute set is subject to the standard Linux evolution (i.e.
-if it will be found that some attribute is applicable to many power supply
-types or their drivers, it can be added to the core set).
-
-It also integrates with LED framework, for the purpose of providing
-typically expected feedback of battery charging/fully charged status and
-AC/USB power supply online status. (Note that specific details of the
-indication (including whether to use it at all) are fully controllable by
-user and/or specific machine defaults, per design principles of LED
-framework).
-
-
-Attributes/properties
-~~~~~~~~~~~~~~~~~~~~~
-Power supply class has predefined set of attributes, this eliminates code
-duplication across drivers. Power supply class insist on reusing its
-predefined attributes *and* their units.
-
-So, userspace gets predictable set of attributes and their units for any
-kind of power supply, and can process/present them to a user in consistent
-manner. Results for different power supplies and machines are also directly
-comparable.
-
-See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c
-for the example how to declare and handle attributes.
-
-
-Units
-~~~~~
-Quoting include/linux/power_supply.h:
-
-  All voltages, currents, charges, energies, time and temperatures in µV,
-  µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise
-  stated. It's driver's job to convert its raw values to units in which
-  this class operates.
-
-
-Attributes/properties detailed
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-~ ~ ~ ~ ~ ~ ~  Charge/Energy/Capacity - how to not confuse  ~ ~ ~ ~ ~ ~ ~
-~                                                                       ~
-~ Because both "charge" (µAh) and "energy" (µWh) represents "capacity"  ~
-~ of battery, this class distinguish these terms. Don't mix them!       ~
-~                                                                       ~
-~ CHARGE_* attributes represents capacity in µAh only.                  ~
-~ ENERGY_* attributes represents capacity in µWh only.                  ~
-~ CAPACITY attribute represents capacity in *percents*, from 0 to 100.  ~
-~                                                                       ~
-~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
-
-Postfixes:
-_AVG - *hardware* averaged value, use it if your hardware is really able to
-report averaged values.
-_NOW - momentary/instantaneous values.
-
-STATUS - this attribute represents operating status (charging, full,
-discharging (i.e. powering a load), etc.). This corresponds to
-BATTERY_STATUS_* values, as defined in battery.h.
-
-CHARGE_TYPE - batteries can typically charge at different rates.
-This defines trickle and fast charges.  For batteries that
-are already charged or discharging, 'n/a' can be displayed (or
-'unknown', if the status is not known).
-
-AUTHENTIC - indicates the power supply (battery or charger) connected
-to the platform is authentic(1) or non authentic(0).
-
-HEALTH - represents health of the battery, values corresponds to
-POWER_SUPPLY_HEALTH_*, defined in battery.h.
-
-VOLTAGE_OCV - open circuit voltage of the battery.
-
-VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN - design values for maximal and
-minimal power supply voltages. Maximal/minimal means values of voltages
-when battery considered "full"/"empty" at normal conditions. Yes, there is
-no direct relation between voltage and battery capacity, but some dumb
-batteries use voltage for very approximated calculation of capacity.
-Battery driver also can use this attribute just to inform userspace
-about maximal and minimal voltage thresholds of a given battery.
-
-VOLTAGE_MAX, VOLTAGE_MIN - same as _DESIGN voltage values except that
-these ones should be used if hardware could only guess (measure and
-retain) the thresholds of a given power supply.
-
-VOLTAGE_BOOT - Reports the voltage measured during boot
-
-CURRENT_BOOT - Reports the current measured during boot
-
-CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN - design charge values, when
-battery considered full/empty.
-
-ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN - same as above but for energy.
-
-CHARGE_FULL, CHARGE_EMPTY - These attributes means "last remembered value
-of charge when battery became full/empty". It also could mean "value of
-charge when battery considered full/empty at given conditions (temperature,
-age)". I.e. these attributes represents real thresholds, not design values.
-
-ENERGY_FULL, ENERGY_EMPTY - same as above but for energy.
-
-CHARGE_COUNTER - the current charge counter (in µAh).  This could easily
-be negative; there is no empty or full value.  It is only useful for
-relative, time-based measurements.
-
-PRECHARGE_CURRENT - the maximum charge current during precharge phase
-of charge cycle (typically 20% of battery capacity).
-CHARGE_TERM_CURRENT - Charge termination current. The charge cycle
-terminates when battery voltage is above recharge threshold, and charge
-current is below this setting (typically 10% of battery capacity).
-
-CONSTANT_CHARGE_CURRENT - constant charge current programmed by charger.
-CONSTANT_CHARGE_CURRENT_MAX - maximum charge current supported by the
-power supply object.
-
-CONSTANT_CHARGE_VOLTAGE - constant charge voltage programmed by charger.
-CONSTANT_CHARGE_VOLTAGE_MAX - maximum charge voltage supported by the
-power supply object.
-
-INPUT_CURRENT_LIMIT - input current limit programmed by charger. Indicates
-the current drawn from a charging source.
-
-CHARGE_CONTROL_LIMIT - current charge control limit setting
-CHARGE_CONTROL_LIMIT_MAX - maximum charge control limit setting
-
-CALIBRATE - battery or coulomb counter calibration status
-
-CAPACITY - capacity in percents.
-CAPACITY_ALERT_MIN - minimum capacity alert value in percents.
-CAPACITY_ALERT_MAX - maximum capacity alert value in percents.
-CAPACITY_LEVEL - capacity level. This corresponds to
-POWER_SUPPLY_CAPACITY_LEVEL_*.
-
-TEMP - temperature of the power supply.
-TEMP_ALERT_MIN - minimum battery temperature alert.
-TEMP_ALERT_MAX - maximum battery temperature alert.
-TEMP_AMBIENT - ambient temperature.
-TEMP_AMBIENT_ALERT_MIN - minimum ambient temperature alert.
-TEMP_AMBIENT_ALERT_MAX - maximum ambient temperature alert.
-TEMP_MIN - minimum operatable temperature
-TEMP_MAX - maximum operatable temperature
-
-TIME_TO_EMPTY - seconds left for battery to be considered empty (i.e.
-while battery powers a load)
-TIME_TO_FULL - seconds left for battery to be considered full (i.e.
-while battery is charging)
-
-
-Battery <-> external power supply interaction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Often power supplies are acting as supplies and supplicants at the same
-time. Batteries are good example. So, batteries usually care if they're
-externally powered or not.
-
-For that case, power supply class implements notification mechanism for
-batteries.
-
-External power supply (AC) lists supplicants (batteries) names in
-"supplied_to" struct member, and each power_supply_changed() call
-issued by external power supply will notify supplicants via
-external_power_changed callback.
-
-
-Devicetree battery characteristics
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Drivers should call power_supply_get_battery_info() to obtain battery
-characteristics from a devicetree battery node, defined in
-Documentation/devicetree/bindings/power/supply/battery.txt. This is
-implemented in drivers/power/supply/bq27xxx_battery.c.
-
-Properties in struct power_supply_battery_info and their counterparts in the
-battery node have names corresponding to elements in enum power_supply_property,
-for naming consistency between sysfs attributes and battery node properties.
-
-
-QA
-~~
-Q: Where is POWER_SUPPLY_PROP_XYZ attribute?
-A: If you cannot find attribute suitable for your driver needs, feel free
-   to add it and send patch along with your driver.
-
-   The attributes available currently are the ones currently provided by the
-   drivers written.
-
-   Good candidates to add in future: model/part#, cycle_time, manufacturer,
-   etc.
-
-
-Q: I have some very specific attribute (e.g. battery color), should I add
-   this attribute to standard ones?
-A: Most likely, no. Such attribute can be placed in the driver itself, if
-   it is useful. Of course, if the attribute in question applicable to
-   large set of batteries, provided by many drivers, and/or comes from
-   some general battery specification/standard, it may be a candidate to
-   be added to the core attribute set.
-
-
-Q: Suppose, my battery monitoring chip/firmware does not provides capacity
-   in percents, but provides charge_{now,full,empty}. Should I calculate
-   percentage capacity manually, inside the driver, and register CAPACITY
-   attribute? The same question about time_to_empty/time_to_full.
-A: Most likely, no. This class is designed to export properties which are
-   directly measurable by the specific hardware available.
-
-   Inferring not available properties using some heuristics or mathematical
-   model is not subject of work for a battery driver. Such functionality
-   should be factored out, and in fact, apm_power, the driver to serve
-   legacy APM API on top of power supply class, uses a simple heuristic of
-   approximating remaining battery capacity based on its charge, current,
-   voltage and so on. But full-fledged battery model is likely not subject
-   for kernel at all, as it would require floating point calculation to deal
-   with things like differential equations and Kalman filters. This is
-   better be handled by batteryd/libbattery, yet to be written.
diff --git a/Documentation/power/powercap/powercap.rst b/Documentation/power/powercap/powercap.rst
new file mode 100644
index 000000000000..7ae3b44c7624
--- /dev/null
+++ b/Documentation/power/powercap/powercap.rst
@@ -0,0 +1,257 @@
+=======================
+Power Capping Framework
+=======================
+
+The power capping framework provides a consistent interface between the kernel
+and the user space that allows power capping drivers to expose the settings to
+user space in a uniform way.
+
+Terminology
+===========
+
+The framework exposes power capping devices to user space via sysfs in the
+form of a tree of objects. The objects at the root level of the tree represent
+'control types', which correspond to different methods of power capping.  For
+example, the intel-rapl control type represents the Intel "Running Average
+Power Limit" (RAPL) technology, whereas the 'idle-injection' control type
+corresponds to the use of idle injection for controlling power.
+
+Power zones represent different parts of the system, which can be controlled and
+monitored using the power capping method determined by the control type the
+given zone belongs to. They each contain attributes for monitoring power, as
+well as controls represented in the form of power constraints.  If the parts of
+the system represented by different power zones are hierarchical (that is, one
+bigger part consists of multiple smaller parts that each have their own power
+controls), those power zones may also be organized in a hierarchy with one
+parent power zone containing multiple subzones and so on to reflect the power
+control topology of the system.  In that case, it is possible to apply power
+capping to a set of devices together using the parent power zone and if more
+fine grained control is required, it can be applied through the subzones.
+
+
+Example sysfs interface tree::
+
+  /sys/devices/virtual/powercap
+  └──intel-rapl
+      ├──intel-rapl:0
+      │   ├──constraint_0_name
+      │   ├──constraint_0_power_limit_uw
+      │   ├──constraint_0_time_window_us
+      │   ├──constraint_1_name
+      │   ├──constraint_1_power_limit_uw
+      │   ├──constraint_1_time_window_us
+      │   ├──device -> ../../intel-rapl
+      │   ├──energy_uj
+      │   ├──intel-rapl:0:0
+      │   │   ├──constraint_0_name
+      │   │   ├──constraint_0_power_limit_uw
+      │   │   ├──constraint_0_time_window_us
+      │   │   ├──constraint_1_name
+      │   │   ├──constraint_1_power_limit_uw
+      │   │   ├──constraint_1_time_window_us
+      │   │   ├──device -> ../../intel-rapl:0
+      │   │   ├──energy_uj
+      │   │   ├──max_energy_range_uj
+      │   │   ├──name
+      │   │   ├──enabled
+      │   │   ├──power
+      │   │   │   ├──async
+      │   │   │   []
+      │   │   ├──subsystem -> ../../../../../../class/power_cap
+      │   │   └──uevent
+      │   ├──intel-rapl:0:1
+      │   │   ├──constraint_0_name
+      │   │   ├──constraint_0_power_limit_uw
+      │   │   ├──constraint_0_time_window_us
+      │   │   ├──constraint_1_name
+      │   │   ├──constraint_1_power_limit_uw
+      │   │   ├──constraint_1_time_window_us
+      │   │   ├──device -> ../../intel-rapl:0
+      │   │   ├──energy_uj
+      │   │   ├──max_energy_range_uj
+      │   │   ├──name
+      │   │   ├──enabled
+      │   │   ├──power
+      │   │   │   ├──async
+      │   │   │   []
+      │   │   ├──subsystem -> ../../../../../../class/power_cap
+      │   │   └──uevent
+      │   ├──max_energy_range_uj
+      │   ├──max_power_range_uw
+      │   ├──name
+      │   ├──enabled
+      │   ├──power
+      │   │   ├──async
+      │   │   []
+      │   ├──subsystem -> ../../../../../class/power_cap
+      │   ├──enabled
+      │   ├──uevent
+      ├──intel-rapl:1
+      │   ├──constraint_0_name
+      │   ├──constraint_0_power_limit_uw
+      │   ├──constraint_0_time_window_us
+      │   ├──constraint_1_name
+      │   ├──constraint_1_power_limit_uw
+      │   ├──constraint_1_time_window_us
+      │   ├──device -> ../../intel-rapl
+      │   ├──energy_uj
+      │   ├──intel-rapl:1:0
+      │   │   ├──constraint_0_name
+      │   │   ├──constraint_0_power_limit_uw
+      │   │   ├──constraint_0_time_window_us
+      │   │   ├──constraint_1_name
+      │   │   ├──constraint_1_power_limit_uw
+      │   │   ├──constraint_1_time_window_us
+      │   │   ├──device -> ../../intel-rapl:1
+      │   │   ├──energy_uj
+      │   │   ├──max_energy_range_uj
+      │   │   ├──name
+      │   │   ├──enabled
+      │   │   ├──power
+      │   │   │   ├──async
+      │   │   │   []
+      │   │   ├──subsystem -> ../../../../../../class/power_cap
+      │   │   └──uevent
+      │   ├──intel-rapl:1:1
+      │   │   ├──constraint_0_name
+      │   │   ├──constraint_0_power_limit_uw
+      │   │   ├──constraint_0_time_window_us
+      │   │   ├──constraint_1_name
+      │   │   ├──constraint_1_power_limit_uw
+      │   │   ├──constraint_1_time_window_us
+      │   │   ├──device -> ../../intel-rapl:1
+      │   │   ├──energy_uj
+      │   │   ├──max_energy_range_uj
+      │   │   ├──name
+      │   │   ├──enabled
+      │   │   ├──power
+      │   │   │   ├──async
+      │   │   │   []
+      │   │   ├──subsystem -> ../../../../../../class/power_cap
+      │   │   └──uevent
+      │   ├──max_energy_range_uj
+      │   ├──max_power_range_uw
+      │   ├──name
+      │   ├──enabled
+      │   ├──power
+      │   │   ├──async
+      │   │   []
+      │   ├──subsystem -> ../../../../../class/power_cap
+      │   ├──uevent
+      ├──power
+      │   ├──async
+      │   []
+      ├──subsystem -> ../../../../class/power_cap
+      ├──enabled
+      └──uevent
+
+The above example illustrates a case in which the Intel RAPL technology,
+available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one
+control type called intel-rapl which contains two power zones, intel-rapl:0 and
+intel-rapl:1, representing CPU packages.  Each of these power zones contains
+two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the
+"core" and the "uncore" parts of the given CPU package, respectively.  All of
+the zones and subzones contain energy monitoring attributes (energy_uj,
+max_energy_range_uj) and constraint attributes (constraint_*) allowing controls
+to be applied (the constraints in the 'package' power zones apply to the whole
+CPU packages and the subzone constraints only apply to the respective parts of
+the given package individually). Since Intel RAPL doesn't provide instantaneous
+power value, there is no power_uw attribute.
+
+In addition to that, each power zone contains a name attribute, allowing the
+part of the system represented by that zone to be identified.
+For example::
+
+	cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name
+
+package-0
+---------
+
+The Intel RAPL technology allows two constraints, short term and long term,
+with two different time windows to be applied to each power zone.  Thus for
+each zone there are 2 attributes representing the constraint names, 2 power
+limits and 2 attributes representing the sizes of the time windows. Such that,
+constraint_j_* attributes correspond to the jth constraint (j = 0,1).
+
+For example::
+
+	constraint_0_name
+	constraint_0_power_limit_uw
+	constraint_0_time_window_us
+	constraint_1_name
+	constraint_1_power_limit_uw
+	constraint_1_time_window_us
+
+Power Zone Attributes
+=====================
+
+Monitoring attributes
+---------------------
+
+energy_uj (rw)
+	Current energy counter in micro joules. Write "0" to reset.
+	If the counter can not be reset, then this attribute is read only.
+
+max_energy_range_uj (ro)
+	Range of the above energy counter in micro-joules.
+
+power_uw (ro)
+	Current power in micro watts.
+
+max_power_range_uw (ro)
+	Range of the above power value in micro-watts.
+
+name (ro)
+	Name of this power zone.
+
+It is possible that some domains have both power ranges and energy counter ranges;
+however, only one is mandatory.
+
+Constraints
+-----------
+
+constraint_X_power_limit_uw (rw)
+	Power limit in micro watts, which should be applicable for the
+	time window specified by "constraint_X_time_window_us".
+
+constraint_X_time_window_us (rw)
+	Time window in micro seconds.
+
+constraint_X_name (ro)
+	An optional name of the constraint
+
+constraint_X_max_power_uw(ro)
+	Maximum allowed power in micro watts.
+
+constraint_X_min_power_uw(ro)
+	Minimum allowed power in micro watts.
+
+constraint_X_max_time_window_us(ro)
+	Maximum allowed time window in micro seconds.
+
+constraint_X_min_time_window_us(ro)
+	Minimum allowed time window in micro seconds.
+
+Except power_limit_uw and time_window_us other fields are optional.
+
+Common zone and control type attributes
+---------------------------------------
+
+enabled (rw): Enable/Disable controls at zone level or for all zones using
+a control type.
+
+Power Cap Client Driver Interface
+=================================
+
+The API summary:
+
+Call powercap_register_control_type() to register control type object.
+Call powercap_register_zone() to register a power zone (under a given
+control type), either as a top-level power zone or as a subzone of another
+power zone registered earlier.
+The number of constraints in a power zone and the corresponding callbacks have
+to be defined prior to calling powercap_register_zone() to register that zone.
+
+To Free a power zone call powercap_unregister_zone().
+To free a control type object call powercap_unregister_control_type().
+Detailed API can be generated using kernel-doc on include/linux/powercap.h.
diff --git a/Documentation/power/powercap/powercap.txt b/Documentation/power/powercap/powercap.txt
deleted file mode 100644
index 1e6ef164e07a..000000000000
--- a/Documentation/power/powercap/powercap.txt
+++ /dev/null
@@ -1,236 +0,0 @@
-Power Capping Framework
-==================================
-
-The power capping framework provides a consistent interface between the kernel
-and the user space that allows power capping drivers to expose the settings to
-user space in a uniform way.
-
-Terminology
-=========================
-The framework exposes power capping devices to user space via sysfs in the
-form of a tree of objects. The objects at the root level of the tree represent
-'control types', which correspond to different methods of power capping.  For
-example, the intel-rapl control type represents the Intel "Running Average
-Power Limit" (RAPL) technology, whereas the 'idle-injection' control type
-corresponds to the use of idle injection for controlling power.
-
-Power zones represent different parts of the system, which can be controlled and
-monitored using the power capping method determined by the control type the
-given zone belongs to. They each contain attributes for monitoring power, as
-well as controls represented in the form of power constraints.  If the parts of
-the system represented by different power zones are hierarchical (that is, one
-bigger part consists of multiple smaller parts that each have their own power
-controls), those power zones may also be organized in a hierarchy with one
-parent power zone containing multiple subzones and so on to reflect the power
-control topology of the system.  In that case, it is possible to apply power
-capping to a set of devices together using the parent power zone and if more
-fine grained control is required, it can be applied through the subzones.
-
-
-Example sysfs interface tree:
-
-/sys/devices/virtual/powercap
-??? intel-rapl
-    ??? intel-rapl:0
-    ?   ??? constraint_0_name
-    ?   ??? constraint_0_power_limit_uw
-    ?   ??? constraint_0_time_window_us
-    ?   ??? constraint_1_name
-    ?   ??? constraint_1_power_limit_uw
-    ?   ??? constraint_1_time_window_us
-    ?   ??? device -> ../../intel-rapl
-    ?   ??? energy_uj
-    ?   ??? intel-rapl:0:0
-    ?   ?   ??? constraint_0_name
-    ?   ?   ??? constraint_0_power_limit_uw
-    ?   ?   ??? constraint_0_time_window_us
-    ?   ?   ??? constraint_1_name
-    ?   ?   ??? constraint_1_power_limit_uw
-    ?   ?   ??? constraint_1_time_window_us
-    ?   ?   ??? device -> ../../intel-rapl:0
-    ?   ?   ??? energy_uj
-    ?   ?   ??? max_energy_range_uj
-    ?   ?   ??? name
-    ?   ?   ??? enabled
-    ?   ?   ??? power
-    ?   ?   ?   ??? async
-    ?   ?   ?   []
-    ?   ?   ??? subsystem -> ../../../../../../class/power_cap
-    ?   ?   ??? uevent
-    ?   ??? intel-rapl:0:1
-    ?   ?   ??? constraint_0_name
-    ?   ?   ??? constraint_0_power_limit_uw
-    ?   ?   ??? constraint_0_time_window_us
-    ?   ?   ??? constraint_1_name
-    ?   ?   ??? constraint_1_power_limit_uw
-    ?   ?   ??? constraint_1_time_window_us
-    ?   ?   ??? device -> ../../intel-rapl:0
-    ?   ?   ??? energy_uj
-    ?   ?   ??? max_energy_range_uj
-    ?   ?   ??? name
-    ?   ?   ??? enabled
-    ?   ?   ??? power
-    ?   ?   ?   ??? async
-    ?   ?   ?   []
-    ?   ?   ??? subsystem -> ../../../../../../class/power_cap
-    ?   ?   ??? uevent
-    ?   ??? max_energy_range_uj
-    ?   ??? max_power_range_uw
-    ?   ??? name
-    ?   ??? enabled
-    ?   ??? power
-    ?   ?   ??? async
-    ?   ?   []
-    ?   ??? subsystem -> ../../../../../class/power_cap
-    ?   ??? enabled
-    ?   ??? uevent
-    ??? intel-rapl:1
-    ?   ??? constraint_0_name
-    ?   ??? constraint_0_power_limit_uw
-    ?   ??? constraint_0_time_window_us
-    ?   ??? constraint_1_name
-    ?   ??? constraint_1_power_limit_uw
-    ?   ??? constraint_1_time_window_us
-    ?   ??? device -> ../../intel-rapl
-    ?   ??? energy_uj
-    ?   ??? intel-rapl:1:0
-    ?   ?   ??? constraint_0_name
-    ?   ?   ??? constraint_0_power_limit_uw
-    ?   ?   ??? constraint_0_time_window_us
-    ?   ?   ??? constraint_1_name
-    ?   ?   ??? constraint_1_power_limit_uw
-    ?   ?   ??? constraint_1_time_window_us
-    ?   ?   ??? device -> ../../intel-rapl:1
-    ?   ?   ??? energy_uj
-    ?   ?   ??? max_energy_range_uj
-    ?   ?   ??? name
-    ?   ?   ??? enabled
-    ?   ?   ??? power
-    ?   ?   ?   ??? async
-    ?   ?   ?   []
-    ?   ?   ??? subsystem -> ../../../../../../class/power_cap
-    ?   ?   ??? uevent
-    ?   ??? intel-rapl:1:1
-    ?   ?   ??? constraint_0_name
-    ?   ?   ??? constraint_0_power_limit_uw
-    ?   ?   ??? constraint_0_time_window_us
-    ?   ?   ??? constraint_1_name
-    ?   ?   ??? constraint_1_power_limit_uw
-    ?   ?   ??? constraint_1_time_window_us
-    ?   ?   ??? device -> ../../intel-rapl:1
-    ?   ?   ??? energy_uj
-    ?   ?   ??? max_energy_range_uj
-    ?   ?   ??? name
-    ?   ?   ??? enabled
-    ?   ?   ??? power
-    ?   ?   ?   ??? async
-    ?   ?   ?   []
-    ?   ?   ??? subsystem -> ../../../../../../class/power_cap
-    ?   ?   ??? uevent
-    ?   ??? max_energy_range_uj
-    ?   ??? max_power_range_uw
-    ?   ??? name
-    ?   ??? enabled
-    ?   ??? power
-    ?   ?   ??? async
-    ?   ?   []
-    ?   ??? subsystem -> ../../../../../class/power_cap
-    ?   ??? uevent
-    ??? power
-    ?   ??? async
-    ?   []
-    ??? subsystem -> ../../../../class/power_cap
-    ??? enabled
-    ??? uevent
-
-The above example illustrates a case in which the Intel RAPL technology,
-available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one
-control type called intel-rapl which contains two power zones, intel-rapl:0 and
-intel-rapl:1, representing CPU packages.  Each of these power zones contains
-two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the
-"core" and the "uncore" parts of the given CPU package, respectively.  All of
-the zones and subzones contain energy monitoring attributes (energy_uj,
-max_energy_range_uj) and constraint attributes (constraint_*) allowing controls
-to be applied (the constraints in the 'package' power zones apply to the whole
-CPU packages and the subzone constraints only apply to the respective parts of
-the given package individually). Since Intel RAPL doesn't provide instantaneous
-power value, there is no power_uw attribute.
-
-In addition to that, each power zone contains a name attribute, allowing the
-part of the system represented by that zone to be identified.
-For example:
-
-cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name
-package-0
-
-The Intel RAPL technology allows two constraints, short term and long term,
-with two different time windows to be applied to each power zone.  Thus for
-each zone there are 2 attributes representing the constraint names, 2 power
-limits and 2 attributes representing the sizes of the time windows. Such that,
-constraint_j_* attributes correspond to the jth constraint (j = 0,1).
-
-For example:
-	constraint_0_name
-	constraint_0_power_limit_uw
-	constraint_0_time_window_us
-	constraint_1_name
-	constraint_1_power_limit_uw
-	constraint_1_time_window_us
-
-Power Zone Attributes
-=================================
-Monitoring attributes
-----------------------
-
-energy_uj (rw): Current energy counter in micro joules. Write "0" to reset.
-If the counter can not be reset, then this attribute is read only.
-
-max_energy_range_uj (ro): Range of the above energy counter in micro-joules.
-
-power_uw (ro): Current power in micro watts.
-
-max_power_range_uw (ro): Range of the above power value in micro-watts.
-
-name (ro): Name of this power zone.
-
-It is possible that some domains have both power ranges and energy counter ranges;
-however, only one is mandatory.
-
-Constraints
-----------------
-constraint_X_power_limit_uw (rw): Power limit in micro watts, which should be
-applicable for the time window specified by "constraint_X_time_window_us".
-
-constraint_X_time_window_us (rw): Time window in micro seconds.
-
-constraint_X_name (ro): An optional name of the constraint
-
-constraint_X_max_power_uw(ro): Maximum allowed power in micro watts.
-
-constraint_X_min_power_uw(ro): Minimum allowed power in micro watts.
-
-constraint_X_max_time_window_us(ro): Maximum allowed time window in micro seconds.
-
-constraint_X_min_time_window_us(ro): Minimum allowed time window in micro seconds.
-
-Except power_limit_uw and time_window_us other fields are optional.
-
-Common zone and control type attributes
-----------------------------------------
-enabled (rw): Enable/Disable controls at zone level or for all zones using
-a control type.
-
-Power Cap Client Driver Interface
-==================================
-The API summary:
-
-Call powercap_register_control_type() to register control type object.
-Call powercap_register_zone() to register a power zone (under a given
-control type), either as a top-level power zone or as a subzone of another
-power zone registered earlier.
-The number of constraints in a power zone and the corresponding callbacks have
-to be defined prior to calling powercap_register_zone() to register that zone.
-
-To Free a power zone call powercap_unregister_zone().
-To free a control type object call powercap_unregister_control_type().
-Detailed API can be generated using kernel-doc on include/linux/powercap.h.
diff --git a/Documentation/power/regulator/consumer.rst b/Documentation/power/regulator/consumer.rst
new file mode 100644
index 000000000000..0cd8cc1275a7
--- /dev/null
+++ b/Documentation/power/regulator/consumer.rst
@@ -0,0 +1,229 @@
+===================================
+Regulator Consumer Driver Interface
+===================================
+
+This text describes the regulator interface for consumer device drivers.
+Please see overview.txt for a description of the terms used in this text.
+
+
+1. Consumer Regulator Access (static & dynamic drivers)
+=======================================================
+
+A consumer driver can get access to its supply regulator by calling ::
+
+	regulator = regulator_get(dev, "Vcc");
+
+The consumer passes in its struct device pointer and power supply ID. The core
+then finds the correct regulator by consulting a machine specific lookup table.
+If the lookup is successful then this call will return a pointer to the struct
+regulator that supplies this consumer.
+
+To release the regulator the consumer driver should call ::
+
+	regulator_put(regulator);
+
+Consumers can be supplied by more than one regulator e.g. codec consumer with
+analog and digital supplies ::
+
+	digital = regulator_get(dev, "Vcc");  /* digital core */
+	analog = regulator_get(dev, "Avdd");  /* analog */
+
+The regulator access functions regulator_get() and regulator_put() will
+usually be called in your device drivers probe() and remove() respectively.
+
+
+2. Regulator Output Enable & Disable (static & dynamic drivers)
+===============================================================
+
+
+A consumer can enable its power supply by calling::
+
+	int regulator_enable(regulator);
+
+NOTE:
+  The supply may already be enabled before regulator_enabled() is called.
+  This may happen if the consumer shares the regulator or the regulator has been
+  previously enabled by bootloader or kernel board initialization code.
+
+A consumer can determine if a regulator is enabled by calling::
+
+	int regulator_is_enabled(regulator);
+
+This will return > zero when the regulator is enabled.
+
+
+A consumer can disable its supply when no longer needed by calling::
+
+	int regulator_disable(regulator);
+
+NOTE:
+  This may not disable the supply if it's shared with other consumers. The
+  regulator will only be disabled when the enabled reference count is zero.
+
+Finally, a regulator can be forcefully disabled in the case of an emergency::
+
+	int regulator_force_disable(regulator);
+
+NOTE:
+  this will immediately and forcefully shutdown the regulator output. All
+  consumers will be powered off.
+
+
+3. Regulator Voltage Control & Status (dynamic drivers)
+=======================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+voltage to match system operating points. e.g. CPUfreq drivers can scale
+voltage along with frequency to save power, SD drivers may need to select the
+correct card voltage, etc.
+
+Consumers can control their supply voltage by calling::
+
+	int regulator_set_voltage(regulator, min_uV, max_uV);
+
+Where min_uV and max_uV are the minimum and maximum acceptable voltages in
+microvolts.
+
+NOTE: this can be called when the regulator is enabled or disabled. If called
+when enabled, then the voltage changes instantly, otherwise the voltage
+configuration changes and the voltage is physically set when the regulator is
+next enabled.
+
+The regulators configured voltage output can be found by calling::
+
+	int regulator_get_voltage(regulator);
+
+NOTE:
+  get_voltage() will return the configured output voltage whether the
+  regulator is enabled or disabled and should NOT be used to determine regulator
+  output state. However this can be used in conjunction with is_enabled() to
+  determine the regulator physical output voltage.
+
+
+4. Regulator Current Limit Control & Status (dynamic drivers)
+=============================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+current limit to match system operating points. e.g. LCD backlight driver can
+change the current limit to vary the backlight brightness, USB drivers may want
+to set the limit to 500mA when supplying power.
+
+Consumers can control their supply current limit by calling::
+
+	int regulator_set_current_limit(regulator, min_uA, max_uA);
+
+Where min_uA and max_uA are the minimum and maximum acceptable current limit in
+microamps.
+
+NOTE:
+  this can be called when the regulator is enabled or disabled. If called
+  when enabled, then the current limit changes instantly, otherwise the current
+  limit configuration changes and the current limit is physically set when the
+  regulator is next enabled.
+
+A regulators current limit can be found by calling::
+
+	int regulator_get_current_limit(regulator);
+
+NOTE:
+  get_current_limit() will return the current limit whether the regulator
+  is enabled or disabled and should not be used to determine regulator current
+  load.
+
+
+5. Regulator Operating Mode Control & Status (dynamic drivers)
+==============================================================
+
+Some consumers can further save system power by changing the operating mode of
+their supply regulator to be more efficient when the consumers operating state
+changes. e.g. consumer driver is idle and subsequently draws less current
+
+Regulator operating mode can be changed indirectly or directly.
+
+Indirect operating mode control.
+--------------------------------
+Consumer drivers can request a change in their supply regulator operating mode
+by calling::
+
+	int regulator_set_load(struct regulator *regulator, int load_uA);
+
+This will cause the core to recalculate the total load on the regulator (based
+on all its consumers) and change operating mode (if necessary and permitted)
+to best match the current operating load.
+
+The load_uA value can be determined from the consumer's datasheet. e.g. most
+datasheets have tables showing the maximum current consumed in certain
+situations.
+
+Most consumers will use indirect operating mode control since they have no
+knowledge of the regulator or whether the regulator is shared with other
+consumers.
+
+Direct operating mode control.
+------------------------------
+
+Bespoke or tightly coupled drivers may want to directly control regulator
+operating mode depending on their operating point. This can be achieved by
+calling::
+
+	int regulator_set_mode(struct regulator *regulator, unsigned int mode);
+	unsigned int regulator_get_mode(struct regulator *regulator);
+
+Direct mode will only be used by consumers that *know* about the regulator and
+are not sharing the regulator with other consumers.
+
+
+6. Regulator Events
+===================
+
+Regulators can notify consumers of external events. Events could be received by
+consumers under regulator stress or failure conditions.
+
+Consumers can register interest in regulator events by calling::
+
+	int regulator_register_notifier(struct regulator *regulator,
+					struct notifier_block *nb);
+
+Consumers can unregister interest by calling::
+
+	int regulator_unregister_notifier(struct regulator *regulator,
+					  struct notifier_block *nb);
+
+Regulators use the kernel notifier framework to send event to their interested
+consumers.
+
+7. Regulator Direct Register Access
+===================================
+
+Some kinds of power management hardware or firmware are designed such that
+they need to do low-level hardware access to regulators, with no involvement
+from the kernel. Examples of such devices are:
+
+- clocksource with a voltage-controlled oscillator and control logic to change
+  the supply voltage over I2C to achieve a desired output clock rate
+- thermal management firmware that can issue an arbitrary I2C transaction to
+  perform system poweroff during overtemperature conditions
+
+To set up such a device/firmware, various parameters like I2C address of the
+regulator, addresses of various regulator registers etc. need to be configured
+to it. The regulator framework provides the following helpers for querying
+these details.
+
+Bus-specific details, like I2C addresses or transfer rates are handled by the
+regmap framework. To get the regulator's regmap (if supported), use::
+
+	struct regmap *regulator_get_regmap(struct regulator *regulator);
+
+To obtain the hardware register offset and bitmask for the regulator's voltage
+selector register, use::
+
+	int regulator_get_hardware_vsel_register(struct regulator *regulator,
+						 unsigned *vsel_reg,
+						 unsigned *vsel_mask);
+
+To convert a regulator framework voltage selector code (used by
+regulator_list_voltage) to a hardware-specific voltage selector that can be
+directly written to the voltage selector register, use::
+
+	int regulator_list_hardware_vsel(struct regulator *regulator,
+					 unsigned selector);
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
deleted file mode 100644
index e51564c1a140..000000000000
--- a/Documentation/power/regulator/consumer.txt
+++ /dev/null
@@ -1,218 +0,0 @@
-Regulator Consumer Driver Interface
-===================================
-
-This text describes the regulator interface for consumer device drivers.
-Please see overview.txt for a description of the terms used in this text.
-
-
-1. Consumer Regulator Access (static & dynamic drivers)
-=======================================================
-
-A consumer driver can get access to its supply regulator by calling :-
-
-regulator = regulator_get(dev, "Vcc");
-
-The consumer passes in its struct device pointer and power supply ID. The core
-then finds the correct regulator by consulting a machine specific lookup table.
-If the lookup is successful then this call will return a pointer to the struct
-regulator that supplies this consumer.
-
-To release the regulator the consumer driver should call :-
-
-regulator_put(regulator);
-
-Consumers can be supplied by more than one regulator e.g. codec consumer with
-analog and digital supplies :-
-
-digital = regulator_get(dev, "Vcc");  /* digital core */
-analog = regulator_get(dev, "Avdd");  /* analog */
-
-The regulator access functions regulator_get() and regulator_put() will
-usually be called in your device drivers probe() and remove() respectively.
-
-
-2. Regulator Output Enable & Disable (static & dynamic drivers)
-====================================================================
-
-A consumer can enable its power supply by calling:-
-
-int regulator_enable(regulator);
-
-NOTE: The supply may already be enabled before regulator_enabled() is called.
-This may happen if the consumer shares the regulator or the regulator has been
-previously enabled by bootloader or kernel board initialization code.
-
-A consumer can determine if a regulator is enabled by calling :-
-
-int regulator_is_enabled(regulator);
-
-This will return > zero when the regulator is enabled.
-
-
-A consumer can disable its supply when no longer needed by calling :-
-
-int regulator_disable(regulator);
-
-NOTE: This may not disable the supply if it's shared with other consumers. The
-regulator will only be disabled when the enabled reference count is zero.
-
-Finally, a regulator can be forcefully disabled in the case of an emergency :-
-
-int regulator_force_disable(regulator);
-
-NOTE: this will immediately and forcefully shutdown the regulator output. All
-consumers will be powered off.
-
-
-3. Regulator Voltage Control & Status (dynamic drivers)
-======================================================
-
-Some consumer drivers need to be able to dynamically change their supply
-voltage to match system operating points. e.g. CPUfreq drivers can scale
-voltage along with frequency to save power, SD drivers may need to select the
-correct card voltage, etc.
-
-Consumers can control their supply voltage by calling :-
-
-int regulator_set_voltage(regulator, min_uV, max_uV);
-
-Where min_uV and max_uV are the minimum and maximum acceptable voltages in
-microvolts.
-
-NOTE: this can be called when the regulator is enabled or disabled. If called
-when enabled, then the voltage changes instantly, otherwise the voltage
-configuration changes and the voltage is physically set when the regulator is
-next enabled.
-
-The regulators configured voltage output can be found by calling :-
-
-int regulator_get_voltage(regulator);
-
-NOTE: get_voltage() will return the configured output voltage whether the
-regulator is enabled or disabled and should NOT be used to determine regulator
-output state. However this can be used in conjunction with is_enabled() to
-determine the regulator physical output voltage.
-
-
-4. Regulator Current Limit Control & Status (dynamic drivers)
-===========================================================
-
-Some consumer drivers need to be able to dynamically change their supply
-current limit to match system operating points. e.g. LCD backlight driver can
-change the current limit to vary the backlight brightness, USB drivers may want
-to set the limit to 500mA when supplying power.
-
-Consumers can control their supply current limit by calling :-
-
-int regulator_set_current_limit(regulator, min_uA, max_uA);
-
-Where min_uA and max_uA are the minimum and maximum acceptable current limit in
-microamps.
-
-NOTE: this can be called when the regulator is enabled or disabled. If called
-when enabled, then the current limit changes instantly, otherwise the current
-limit configuration changes and the current limit is physically set when the
-regulator is next enabled.
-
-A regulators current limit can be found by calling :-
-
-int regulator_get_current_limit(regulator);
-
-NOTE: get_current_limit() will return the current limit whether the regulator
-is enabled or disabled and should not be used to determine regulator current
-load.
-
-
-5. Regulator Operating Mode Control & Status (dynamic drivers)
-=============================================================
-
-Some consumers can further save system power by changing the operating mode of
-their supply regulator to be more efficient when the consumers operating state
-changes. e.g. consumer driver is idle and subsequently draws less current
-
-Regulator operating mode can be changed indirectly or directly.
-
-Indirect operating mode control.
---------------------------------
-Consumer drivers can request a change in their supply regulator operating mode
-by calling :-
-
-int regulator_set_load(struct regulator *regulator, int load_uA);
-
-This will cause the core to recalculate the total load on the regulator (based
-on all its consumers) and change operating mode (if necessary and permitted)
-to best match the current operating load.
-
-The load_uA value can be determined from the consumer's datasheet. e.g. most
-datasheets have tables showing the maximum current consumed in certain
-situations.
-
-Most consumers will use indirect operating mode control since they have no
-knowledge of the regulator or whether the regulator is shared with other
-consumers.
-
-Direct operating mode control.
-------------------------------
-Bespoke or tightly coupled drivers may want to directly control regulator
-operating mode depending on their operating point. This can be achieved by
-calling :-
-
-int regulator_set_mode(struct regulator *regulator, unsigned int mode);
-unsigned int regulator_get_mode(struct regulator *regulator);
-
-Direct mode will only be used by consumers that *know* about the regulator and
-are not sharing the regulator with other consumers.
-
-
-6. Regulator Events
-===================
-Regulators can notify consumers of external events. Events could be received by
-consumers under regulator stress or failure conditions.
-
-Consumers can register interest in regulator events by calling :-
-
-int regulator_register_notifier(struct regulator *regulator,
-			      struct notifier_block *nb);
-
-Consumers can unregister interest by calling :-
-
-int regulator_unregister_notifier(struct regulator *regulator,
-				struct notifier_block *nb);
-
-Regulators use the kernel notifier framework to send event to their interested
-consumers.
-
-7. Regulator Direct Register Access
-===================================
-Some kinds of power management hardware or firmware are designed such that
-they need to do low-level hardware access to regulators, with no involvement
-from the kernel. Examples of such devices are:
-
-- clocksource with a voltage-controlled oscillator and control logic to change
-  the supply voltage over I2C to achieve a desired output clock rate
-- thermal management firmware that can issue an arbitrary I2C transaction to
-  perform system poweroff during overtemperature conditions
-
-To set up such a device/firmware, various parameters like I2C address of the
-regulator, addresses of various regulator registers etc. need to be configured
-to it. The regulator framework provides the following helpers for querying
-these details.
-
-Bus-specific details, like I2C addresses or transfer rates are handled by the
-regmap framework. To get the regulator's regmap (if supported), use :-
-
-struct regmap *regulator_get_regmap(struct regulator *regulator);
-
-To obtain the hardware register offset and bitmask for the regulator's voltage
-selector register, use :-
-
-int regulator_get_hardware_vsel_register(struct regulator *regulator,
-					 unsigned *vsel_reg,
-					 unsigned *vsel_mask);
-
-To convert a regulator framework voltage selector code (used by
-regulator_list_voltage) to a hardware-specific voltage selector that can be
-directly written to the voltage selector register, use :-
-
-int regulator_list_hardware_vsel(struct regulator *regulator,
-				 unsigned selector);
diff --git a/Documentation/power/regulator/design.rst b/Documentation/power/regulator/design.rst
new file mode 100644
index 000000000000..3b09c6841dc4
--- /dev/null
+++ b/Documentation/power/regulator/design.rst
@@ -0,0 +1,38 @@
+==========================
+Regulator API design notes
+==========================
+
+This document provides a brief, partially structured, overview of some
+of the design considerations which impact the regulator API design.
+
+Safety
+------
+
+ - Errors in regulator configuration can have very serious consequences
+   for the system, potentially including lasting hardware damage.
+ - It is not possible to automatically determine the power configuration
+   of the system - software-equivalent variants of the same chip may
+   have different power requirements, and not all components with power
+   requirements are visible to software.
+
+.. note::
+
+     The API should make no changes to the hardware state unless it has
+     specific knowledge that these changes are safe to perform on this
+     particular system.
+
+Consumer use cases
+------------------
+
+ - The overwhelming majority of devices in a system will have no
+   requirement to do any runtime configuration of their power beyond
+   being able to turn it on or off.
+
+ - Many of the power supplies in the system will be shared between many
+   different consumers.
+
+.. note::
+
+     The consumer API should be structured so that these use cases are
+     very easy to handle and so that consumers will work with shared
+     supplies without any additional effort.
diff --git a/Documentation/power/regulator/design.txt b/Documentation/power/regulator/design.txt
deleted file mode 100644
index fdd919b96830..000000000000
--- a/Documentation/power/regulator/design.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-Regulator API design notes
-==========================
-
-This document provides a brief, partially structured, overview of some
-of the design considerations which impact the regulator API design.
-
-Safety
-------
-
- - Errors in regulator configuration can have very serious consequences
-   for the system, potentially including lasting hardware damage.
- - It is not possible to automatically determine the power configuration
-   of the system - software-equivalent variants of the same chip may
-   have different power requirements, and not all components with power
-   requirements are visible to software.
-
-  => The API should make no changes to the hardware state unless it has
-     specific knowledge that these changes are safe to perform on this
-     particular system.
-
-Consumer use cases
-------------------
-
- - The overwhelming majority of devices in a system will have no
-   requirement to do any runtime configuration of their power beyond
-   being able to turn it on or off.
-
- - Many of the power supplies in the system will be shared between many
-   different consumers.
-
-  => The consumer API should be structured so that these use cases are
-     very easy to handle and so that consumers will work with shared
-     supplies without any additional effort.
diff --git a/Documentation/power/regulator/machine.rst b/Documentation/power/regulator/machine.rst
new file mode 100644
index 000000000000..22fffefaa3ad
--- /dev/null
+++ b/Documentation/power/regulator/machine.rst
@@ -0,0 +1,97 @@
+==================================
+Regulator Machine Driver Interface
+==================================
+
+The regulator machine driver interface is intended for board/machine specific
+initialisation code to configure the regulator subsystem.
+
+Consider the following machine::
+
+  Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V]
+               |
+               +-> [Consumer B @ 3.3V]
+
+The drivers for consumers A & B must be mapped to the correct regulator in
+order to control their power supplies. This mapping can be achieved in machine
+initialisation code by creating a struct regulator_consumer_supply for
+each regulator::
+
+  struct regulator_consumer_supply {
+	const char *dev_name;	/* consumer dev_name() */
+	const char *supply;	/* consumer supply - e.g. "vcc" */
+  };
+
+e.g. for the machine above::
+
+  static struct regulator_consumer_supply regulator1_consumers[] = {
+	REGULATOR_SUPPLY("Vcc", "consumer B"),
+  };
+
+  static struct regulator_consumer_supply regulator2_consumers[] = {
+	REGULATOR_SUPPLY("Vcc", "consumer A"),
+  };
+
+This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2
+to the 'Vcc' supply for Consumer A.
+
+Constraints can now be registered by defining a struct regulator_init_data
+for each regulator power domain. This structure also maps the consumers
+to their supply regulators::
+
+  static struct regulator_init_data regulator1_data = {
+	.constraints = {
+		.name = "Regulator-1",
+		.min_uV = 3300000,
+		.max_uV = 3300000,
+		.valid_modes_mask = REGULATOR_MODE_NORMAL,
+	},
+	.num_consumer_supplies = ARRAY_SIZE(regulator1_consumers),
+	.consumer_supplies = regulator1_consumers,
+  };
+
+The name field should be set to something that is usefully descriptive
+for the board for configuration of supplies for other regulators and
+for use in logging and other diagnostic output.  Normally the name
+used for the supply rail in the schematic is a good choice.  If no
+name is provided then the subsystem will choose one.
+
+Regulator-1 supplies power to Regulator-2. This relationship must be registered
+with the core so that Regulator-1 is also enabled when Consumer A enables its
+supply (Regulator-2). The supply regulator is set by the supply_regulator
+field below and co::
+
+  static struct regulator_init_data regulator2_data = {
+	.supply_regulator = "Regulator-1",
+	.constraints = {
+		.min_uV = 1800000,
+		.max_uV = 2000000,
+		.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
+		.valid_modes_mask = REGULATOR_MODE_NORMAL,
+	},
+	.num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
+	.consumer_supplies = regulator2_consumers,
+  };
+
+Finally the regulator devices must be registered in the usual manner::
+
+  static struct platform_device regulator_devices[] = {
+	{
+		.name = "regulator",
+		.id = DCDC_1,
+		.dev = {
+			.platform_data = &regulator1_data,
+		},
+	},
+	{
+		.name = "regulator",
+		.id = DCDC_2,
+		.dev = {
+			.platform_data = &regulator2_data,
+		},
+	},
+  };
+  /* register regulator 1 device */
+  platform_device_register(&regulator_devices[0]);
+
+  /* register regulator 2 device */
+  platform_device_register(&regulator_devices[1]);
diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt
deleted file mode 100644
index eff4dcaaa252..000000000000
--- a/Documentation/power/regulator/machine.txt
+++ /dev/null
@@ -1,96 +0,0 @@
-Regulator Machine Driver Interface
-===================================
-
-The regulator machine driver interface is intended for board/machine specific
-initialisation code to configure the regulator subsystem.
-
-Consider the following machine :-
-
-  Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V]
-               |
-               +-> [Consumer B @ 3.3V]
-
-The drivers for consumers A & B must be mapped to the correct regulator in
-order to control their power supplies. This mapping can be achieved in machine
-initialisation code by creating a struct regulator_consumer_supply for
-each regulator.
-
-struct regulator_consumer_supply {
-	const char *dev_name;	/* consumer dev_name() */
-	const char *supply;	/* consumer supply - e.g. "vcc" */
-};
-
-e.g. for the machine above
-
-static struct regulator_consumer_supply regulator1_consumers[] = {
-	REGULATOR_SUPPLY("Vcc", "consumer B"),
-};
-
-static struct regulator_consumer_supply regulator2_consumers[] = {
-	REGULATOR_SUPPLY("Vcc", "consumer A"),
-};
-
-This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2
-to the 'Vcc' supply for Consumer A.
-
-Constraints can now be registered by defining a struct regulator_init_data
-for each regulator power domain. This structure also maps the consumers
-to their supply regulators :-
-
-static struct regulator_init_data regulator1_data = {
-	.constraints = {
-		.name = "Regulator-1",
-		.min_uV = 3300000,
-		.max_uV = 3300000,
-		.valid_modes_mask = REGULATOR_MODE_NORMAL,
-	},
-	.num_consumer_supplies = ARRAY_SIZE(regulator1_consumers),
-	.consumer_supplies = regulator1_consumers,
-};
-
-The name field should be set to something that is usefully descriptive
-for the board for configuration of supplies for other regulators and
-for use in logging and other diagnostic output.  Normally the name
-used for the supply rail in the schematic is a good choice.  If no
-name is provided then the subsystem will choose one.
-
-Regulator-1 supplies power to Regulator-2. This relationship must be registered
-with the core so that Regulator-1 is also enabled when Consumer A enables its
-supply (Regulator-2). The supply regulator is set by the supply_regulator
-field below and co:-
-
-static struct regulator_init_data regulator2_data = {
-	.supply_regulator = "Regulator-1",
-	.constraints = {
-		.min_uV = 1800000,
-		.max_uV = 2000000,
-		.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
-		.valid_modes_mask = REGULATOR_MODE_NORMAL,
-	},
-	.num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
-	.consumer_supplies = regulator2_consumers,
-};
-
-Finally the regulator devices must be registered in the usual manner.
-
-static struct platform_device regulator_devices[] = {
-	{
-		.name = "regulator",
-		.id = DCDC_1,
-		.dev = {
-			.platform_data = &regulator1_data,
-		},
-	},
-	{
-		.name = "regulator",
-		.id = DCDC_2,
-		.dev = {
-			.platform_data = &regulator2_data,
-		},
-	},
-};
-/* register regulator 1 device */
-platform_device_register(&regulator_devices[0]);
-
-/* register regulator 2 device */
-platform_device_register(&regulator_devices[1]);
diff --git a/Documentation/power/regulator/overview.rst b/Documentation/power/regulator/overview.rst
new file mode 100644
index 000000000000..ee494c70a7c4
--- /dev/null
+++ b/Documentation/power/regulator/overview.rst
@@ -0,0 +1,178 @@
+=============================================
+Linux voltage and current regulator framework
+=============================================
+
+About
+=====
+
+This framework is designed to provide a standard kernel interface to control
+voltage and current regulators.
+
+The intention is to allow systems to dynamically control regulator power output
+in order to save power and prolong battery life. This applies to both voltage
+regulators (where voltage output is controllable) and current sinks (where
+current limit is controllable).
+
+(C) 2008  Wolfson Microelectronics PLC.
+
+Author: Liam Girdwood <lrg@slimlogic.co.uk>
+
+
+Nomenclature
+============
+
+Some terms used in this document:
+
+  - Regulator
+                 - Electronic device that supplies power to other devices.
+                   Most regulators can enable and disable their output while
+                   some can control their output voltage and or current.
+
+                   Input Voltage -> Regulator -> Output Voltage
+
+
+  - PMIC
+                 - Power Management IC. An IC that contains numerous
+                   regulators and often contains other subsystems.
+
+
+  - Consumer
+                 - Electronic device that is supplied power by a regulator.
+                   Consumers can be classified into two types:-
+
+                   Static: consumer does not change its supply voltage or
+                   current limit. It only needs to enable or disable its
+                   power supply. Its supply voltage is set by the hardware,
+                   bootloader, firmware or kernel board initialisation code.
+
+                   Dynamic: consumer needs to change its supply voltage or
+                   current limit to meet operation demands.
+
+
+  - Power Domain
+                 - Electronic circuit that is supplied its input power by the
+                   output power of a regulator, switch or by another power
+                   domain.
+
+                   The supply regulator may be behind a switch(s). i.e.::
+
+                     Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
+                                |             |
+                                |             +-> [Consumer B], [Consumer C]
+                                |
+                                +-> [Consumer D], [Consumer E]
+
+                   That is one regulator and three power domains:
+
+                   - Domain 1: Switch-1, Consumers D & E.
+                   - Domain 2: Switch-2, Consumers B & C.
+                   - Domain 3: Consumer A.
+
+                   and this represents a "supplies" relationship:
+
+                   Domain-1 --> Domain-2 --> Domain-3.
+
+                   A power domain may have regulators that are supplied power
+                   by other regulators. i.e.::
+
+                     Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
+                                  |
+                                  +-> [Consumer B]
+
+                   This gives us two regulators and two power domains:
+
+                   - Domain 1: Regulator-2, Consumer B.
+                   - Domain 2: Consumer A.
+
+                   and a "supplies" relationship:
+
+                   Domain-1 --> Domain-2
+
+
+  - Constraints
+                 - Constraints are used to define power levels for performance
+                   and hardware protection. Constraints exist at three levels:
+
+                   Regulator Level: This is defined by the regulator hardware
+                   operating parameters and is specified in the regulator
+                   datasheet. i.e.
+
+                     - voltage output is in the range 800mV -> 3500mV.
+                     - regulator current output limit is 20mA @ 5V but is
+                       10mA @ 10V.
+
+                   Power Domain Level: This is defined in software by kernel
+                   level board initialisation code. It is used to constrain a
+                   power domain to a particular power range. i.e.
+
+                     - Domain-1 voltage is 3300mV
+                     - Domain-2 voltage is 1400mV -> 1600mV
+                     - Domain-3 current limit is 0mA -> 20mA.
+
+                   Consumer Level: This is defined by consumer drivers
+                   dynamically setting voltage or current limit levels.
+
+                   e.g. a consumer backlight driver asks for a current increase
+                   from 5mA to 10mA to increase LCD illumination. This passes
+                   to through the levels as follows :-
+
+                   Consumer: need to increase LCD brightness. Lookup and
+                   request next current mA value in brightness table (the
+                   consumer driver could be used on several different
+                   personalities based upon the same reference device).
+
+                   Power Domain: is the new current limit within the domain
+                   operating limits for this domain and system state (e.g.
+                   battery power, USB power)
+
+                   Regulator Domains: is the new current limit within the
+                   regulator operating parameters for input/output voltage.
+
+                   If the regulator request passes all the constraint tests
+                   then the new regulator value is applied.
+
+
+Design
+======
+
+The framework is designed and targeted at SoC based devices but may also be
+relevant to non SoC devices and is split into the following four interfaces:-
+
+
+   1. Consumer driver interface.
+
+      This uses a similar API to the kernel clock interface in that consumer
+      drivers can get and put a regulator (like they can with clocks atm) and
+      get/set voltage, current limit, mode, enable and disable. This should
+      allow consumers complete control over their supply voltage and current
+      limit. This also compiles out if not in use so drivers can be reused in
+      systems with no regulator based power control.
+
+        See Documentation/power/regulator/consumer.rst
+
+   2. Regulator driver interface.
+
+      This allows regulator drivers to register their regulators and provide
+      operations to the core. It also has a notifier call chain for propagating
+      regulator events to clients.
+
+        See Documentation/power/regulator/regulator.rst
+
+   3. Machine interface.
+
+      This interface is for machine specific code and allows the creation of
+      voltage/current domains (with constraints) for each regulator. It can
+      provide regulator constraints that will prevent device damage through
+      overvoltage or overcurrent caused by buggy client drivers. It also
+      allows the creation of a regulator tree whereby some regulators are
+      supplied by others (similar to a clock tree).
+
+        See Documentation/power/regulator/machine.rst
+
+   4. Userspace ABI.
+
+      The framework also exports a lot of useful voltage/current/opmode data to
+      userspace via sysfs. This could be used to help monitor device power
+      consumption and status.
+
+        See Documentation/ABI/testing/sysfs-class-regulator
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
deleted file mode 100644
index 721b4739ec32..000000000000
--- a/Documentation/power/regulator/overview.txt
+++ /dev/null
@@ -1,171 +0,0 @@
-Linux voltage and current regulator framework
-=============================================
-
-About
-=====
-
-This framework is designed to provide a standard kernel interface to control
-voltage and current regulators.
-
-The intention is to allow systems to dynamically control regulator power output
-in order to save power and prolong battery life. This applies to both voltage
-regulators (where voltage output is controllable) and current sinks (where
-current limit is controllable).
-
-(C) 2008  Wolfson Microelectronics PLC.
-Author: Liam Girdwood <lrg@slimlogic.co.uk>
-
-
-Nomenclature
-============
-
-Some terms used in this document:-
-
-  o Regulator    - Electronic device that supplies power to other devices.
-                   Most regulators can enable and disable their output while
-                   some can control their output voltage and or current.
-
-                   Input Voltage -> Regulator -> Output Voltage
-
-
-  o PMIC         - Power Management IC. An IC that contains numerous regulators
-                   and often contains other subsystems.
-
-
-  o Consumer     - Electronic device that is supplied power by a regulator.
-                   Consumers can be classified into two types:-
-
-                   Static: consumer does not change its supply voltage or
-                   current limit. It only needs to enable or disable its
-                   power supply. Its supply voltage is set by the hardware,
-                   bootloader, firmware or kernel board initialisation code.
-
-                   Dynamic: consumer needs to change its supply voltage or
-                   current limit to meet operation demands.
-
-
-  o Power Domain - Electronic circuit that is supplied its input power by the
-                   output power of a regulator, switch or by another power
-                   domain.
-
-                   The supply regulator may be behind a switch(s). i.e.
-
-                   Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
-                              |             |
-                              |             +-> [Consumer B], [Consumer C]
-                              |
-                              +-> [Consumer D], [Consumer E]
-
-                   That is one regulator and three power domains:
-
-                   Domain 1: Switch-1, Consumers D & E.
-                   Domain 2: Switch-2, Consumers B & C.
-                   Domain 3: Consumer A.
-
-                   and this represents a "supplies" relationship:
-
-                   Domain-1 --> Domain-2 --> Domain-3.
-
-                   A power domain may have regulators that are supplied power
-                   by other regulators. i.e.
-
-                   Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
-                                |
-                                +-> [Consumer B]
-
-                   This gives us two regulators and two power domains:
-
-                   Domain 1: Regulator-2, Consumer B.
-                   Domain 2: Consumer A.
-
-                   and a "supplies" relationship:
-
-                   Domain-1 --> Domain-2
-
-
-  o Constraints  - Constraints are used to define power levels for performance
-                   and hardware protection. Constraints exist at three levels:
-
-                   Regulator Level: This is defined by the regulator hardware
-                   operating parameters and is specified in the regulator
-                   datasheet. i.e.
-
-                     - voltage output is in the range 800mV -> 3500mV.
-                     - regulator current output limit is 20mA @ 5V but is
-                       10mA @ 10V.
-
-                   Power Domain Level: This is defined in software by kernel
-                   level board initialisation code. It is used to constrain a
-                   power domain to a particular power range. i.e.
-
-                     - Domain-1 voltage is 3300mV
-                     - Domain-2 voltage is 1400mV -> 1600mV
-                     - Domain-3 current limit is 0mA -> 20mA.
-
-                   Consumer Level: This is defined by consumer drivers
-                   dynamically setting voltage or current limit levels.
-
-                   e.g. a consumer backlight driver asks for a current increase
-                   from 5mA to 10mA to increase LCD illumination. This passes
-                   to through the levels as follows :-
-
-                   Consumer: need to increase LCD brightness. Lookup and
-                   request next current mA value in brightness table (the
-                   consumer driver could be used on several different
-                   personalities based upon the same reference device).
-
-                   Power Domain: is the new current limit within the domain
-                   operating limits for this domain and system state (e.g.
-                   battery power, USB power)
-
-                   Regulator Domains: is the new current limit within the
-                   regulator operating parameters for input/output voltage.
-
-                   If the regulator request passes all the constraint tests
-                   then the new regulator value is applied.
-
-
-Design
-======
-
-The framework is designed and targeted at SoC based devices but may also be
-relevant to non SoC devices and is split into the following four interfaces:-
-
-
-   1. Consumer driver interface.
-
-      This uses a similar API to the kernel clock interface in that consumer
-      drivers can get and put a regulator (like they can with clocks atm) and
-      get/set voltage, current limit, mode, enable and disable. This should
-      allow consumers complete control over their supply voltage and current
-      limit. This also compiles out if not in use so drivers can be reused in
-      systems with no regulator based power control.
-
-        See Documentation/power/regulator/consumer.txt
-
-   2. Regulator driver interface.
-
-      This allows regulator drivers to register their regulators and provide
-      operations to the core. It also has a notifier call chain for propagating
-      regulator events to clients.
-
-        See Documentation/power/regulator/regulator.txt
-
-   3. Machine interface.
-
-      This interface is for machine specific code and allows the creation of
-      voltage/current domains (with constraints) for each regulator. It can
-      provide regulator constraints that will prevent device damage through
-      overvoltage or overcurrent caused by buggy client drivers. It also
-      allows the creation of a regulator tree whereby some regulators are
-      supplied by others (similar to a clock tree).
-
-        See Documentation/power/regulator/machine.txt
-
-   4. Userspace ABI.
-
-      The framework also exports a lot of useful voltage/current/opmode data to
-      userspace via sysfs. This could be used to help monitor device power
-      consumption and status.
-
-        See Documentation/ABI/testing/sysfs-class-regulator
diff --git a/Documentation/power/regulator/regulator.rst b/Documentation/power/regulator/regulator.rst
new file mode 100644
index 000000000000..794b3256fbb9
--- /dev/null
+++ b/Documentation/power/regulator/regulator.rst
@@ -0,0 +1,32 @@
+==========================
+Regulator Driver Interface
+==========================
+
+The regulator driver interface is relatively simple and designed to allow
+regulator drivers to register their services with the core framework.
+
+
+Registration
+============
+
+Drivers can register a regulator by calling::
+
+  struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
+					   const struct regulator_config *config);
+
+This will register the regulator's capabilities and operations to the regulator
+core.
+
+Regulators can be unregistered by calling::
+
+  void regulator_unregister(struct regulator_dev *rdev);
+
+
+Regulator Events
+================
+
+Regulators can send events (e.g. overtemperature, undervoltage, etc) to
+consumer drivers by calling::
+
+  int regulator_notifier_call_chain(struct regulator_dev *rdev,
+				    unsigned long event, void *data);
diff --git a/Documentation/power/regulator/regulator.txt b/Documentation/power/regulator/regulator.txt
deleted file mode 100644
index b17e5833ce21..000000000000
--- a/Documentation/power/regulator/regulator.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-Regulator Driver Interface
-==========================
-
-The regulator driver interface is relatively simple and designed to allow
-regulator drivers to register their services with the core framework.
-
-
-Registration
-============
-
-Drivers can register a regulator by calling :-
-
-struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
-					 const struct regulator_config *config);
-
-This will register the regulator's capabilities and operations to the regulator
-core.
-
-Regulators can be unregistered by calling :-
-
-void regulator_unregister(struct regulator_dev *rdev);
-
-
-Regulator Events
-================
-Regulators can send events (e.g. overtemperature, undervoltage, etc) to
-consumer drivers by calling :-
-
-int regulator_notifier_call_chain(struct regulator_dev *rdev,
-				  unsigned long event, void *data);
diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst
new file mode 100644
index 000000000000..2c2ec99b5088
--- /dev/null
+++ b/Documentation/power/runtime_pm.rst
@@ -0,0 +1,940 @@
+==================================================
+Runtime Power Management Framework for I/O Devices
+==================================================
+
+(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+
+(C) 2010 Alan Stern <stern@rowland.harvard.edu>
+
+(C) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+1. Introduction
+===============
+
+Support for runtime power management (runtime PM) of I/O devices is provided
+at the power management core (PM core) level by means of:
+
+* The power management workqueue pm_wq in which bus types and device drivers can
+  put their PM-related work items.  It is strongly recommended that pm_wq be
+  used for queuing all work items related to runtime PM, because this allows
+  them to be synchronized with system-wide power transitions (suspend to RAM,
+  hibernation and resume from system sleep states).  pm_wq is declared in
+  include/linux/pm_runtime.h and defined in kernel/power/main.c.
+
+* A number of runtime PM fields in the 'power' member of 'struct device' (which
+  is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can
+  be used for synchronizing runtime PM operations with one another.
+
+* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in
+  include/linux/pm.h).
+
+* A set of helper functions defined in drivers/base/power/runtime.c that can be
+  used for carrying out runtime PM operations in such a way that the
+  synchronization between them is taken care of by the PM core.  Bus types and
+  device drivers are encouraged to use these functions.
+
+The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM
+fields of 'struct dev_pm_info' and the core helper functions provided for
+runtime PM are described below.
+
+2. Device Runtime PM Callbacks
+==============================
+
+There are three device runtime PM callbacks defined in 'struct dev_pm_ops'::
+
+  struct dev_pm_ops {
+	...
+	int (*runtime_suspend)(struct device *dev);
+	int (*runtime_resume)(struct device *dev);
+	int (*runtime_idle)(struct device *dev);
+	...
+  };
+
+The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks
+are executed by the PM core for the device's subsystem that may be either of
+the following:
+
+  1. PM domain of the device, if the device's PM domain object, dev->pm_domain,
+     is present.
+
+  2. Device type of the device, if both dev->type and dev->type->pm are present.
+
+  3. Device class of the device, if both dev->class and dev->class->pm are
+     present.
+
+  4. Bus type of the device, if both dev->bus and dev->bus->pm are present.
+
+If the subsystem chosen by applying the above rules doesn't provide the relevant
+callback, the PM core will invoke the corresponding driver callback stored in
+dev->driver->pm directly (if present).
+
+The PM core always checks which callback to use in the order given above, so the
+priority order of callbacks from high to low is: PM domain, device type, class
+and bus type.  Moreover, the high-priority one will always take precedence over
+a low-priority one.  The PM domain, bus type, device type and class callbacks
+are referred to as subsystem-level callbacks in what follows.
+
+By default, the callbacks are always invoked in process context with interrupts
+enabled.  However, the pm_runtime_irq_safe() helper function can be used to tell
+the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume()
+and ->runtime_idle() callbacks for the given device in atomic context with
+interrupts disabled.  This implies that the callback routines in question must
+not block or sleep, but it also means that the synchronous helper functions
+listed at the end of Section 4 may be used for that device within an interrupt
+handler or generally in an atomic context.
+
+The subsystem-level suspend callback, if present, is _entirely_ _responsible_
+for handling the suspend of the device as appropriate, which may, but need not
+include executing the device driver's own ->runtime_suspend() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_suspend()
+callback in a device driver as long as the subsystem-level suspend callback
+knows what to do to handle the device).
+
+  * Once the subsystem-level suspend callback (or the driver suspend callback,
+    if invoked directly) has completed successfully for the given device, the PM
+    core regards the device as suspended, which need not mean that it has been
+    put into a low power state.  It is supposed to mean, however, that the
+    device will not process data and will not communicate with the CPU(s) and
+    RAM until the appropriate resume callback is executed for it.  The runtime
+    PM status of a device after successful execution of the suspend callback is
+    'suspended'.
+
+  * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM
+    status remains 'active', which means that the device _must_ be fully
+    operational afterwards.
+
+  * If the suspend callback returns an error code different from -EBUSY and
+    -EAGAIN, the PM core regards this as a fatal error and will refuse to run
+    the helper functions described in Section 4 for the device until its status
+    is directly set to  either 'active', or 'suspended' (the PM core provides
+    special helper functions for this purpose).
+
+In particular, if the driver requires remote wakeup capability (i.e. hardware
+mechanism allowing the device to request a change of its power state, such as
+PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the
+device, then ->runtime_suspend() should return -EBUSY.  On the other hand, if
+device_can_wakeup() returns 'true' for the device and the device is put into a
+low-power state during the execution of the suspend callback, it is expected
+that remote wakeup will be enabled for the device.  Generally, remote wakeup
+should be enabled for all input devices put into low-power states at run time.
+
+The subsystem-level resume callback, if present, is **entirely responsible** for
+handling the resume of the device as appropriate, which may, but need not
+include executing the device driver's own ->runtime_resume() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_resume()
+callback in a device driver as long as the subsystem-level resume callback knows
+what to do to handle the device).
+
+  * Once the subsystem-level resume callback (or the driver resume callback, if
+    invoked directly) has completed successfully, the PM core regards the device
+    as fully operational, which means that the device _must_ be able to complete
+    I/O operations as needed.  The runtime PM status of the device is then
+    'active'.
+
+  * If the resume callback returns an error code, the PM core regards this as a
+    fatal error and will refuse to run the helper functions described in Section
+    4 for the device, until its status is directly set to either 'active', or
+    'suspended' (by means of special helper functions provided by the PM core
+    for this purpose).
+
+The idle callback (a subsystem-level one, if present, or the driver one) is
+executed by the PM core whenever the device appears to be idle, which is
+indicated to the PM core by two counters, the device's usage counter and the
+counter of 'active' children of the device.
+
+  * If any of these counters is decreased using a helper function provided by
+    the PM core and it turns out to be equal to zero, the other counter is
+    checked.  If that counter also is equal to zero, the PM core executes the
+    idle callback with the device as its argument.
+
+The action performed by the idle callback is totally dependent on the subsystem
+(or driver) in question, but the expected and recommended action is to check
+if the device can be suspended (i.e. if all of the conditions necessary for
+suspending the device are satisfied) and to queue up a suspend request for the
+device in that case.  If there is no idle callback, or if the callback returns
+0, then the PM core will attempt to carry out a runtime suspend of the device,
+also respecting devices configured for autosuspend.  In essence this means a
+call to pm_runtime_autosuspend() (do note that drivers needs to update the
+device last busy mark, pm_runtime_mark_last_busy(), to control the delay under
+this circumstance).  To prevent this (for example, if the callback routine has
+started a delayed suspend), the routine must return a non-zero value.  Negative
+error return codes are ignored by the PM core.
+
+The helper functions provided by the PM core, described in Section 4, guarantee
+that the following constraints are met with respect to runtime PM callbacks for
+one device:
+
+(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
+    ->runtime_suspend() in parallel with ->runtime_resume() or with another
+    instance of ->runtime_suspend() for the same device) with the exception that
+    ->runtime_suspend() or ->runtime_resume() can be executed in parallel with
+    ->runtime_idle() (although ->runtime_idle() will not be started while any
+    of the other callbacks is being executed for the same device).
+
+(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active'
+    devices (i.e. the PM core will only execute ->runtime_idle() or
+    ->runtime_suspend() for the devices the runtime PM status of which is
+    'active').
+
+(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device
+    the usage counter of which is equal to zero _and_ either the counter of
+    'active' children of which is equal to zero, or the 'power.ignore_children'
+    flag of which is set.
+
+(4) ->runtime_resume() can only be executed for 'suspended' devices  (i.e. the
+    PM core will only execute ->runtime_resume() for the devices the runtime
+    PM status of which is 'suspended').
+
+Additionally, the helper functions provided by the PM core obey the following
+rules:
+
+  * If ->runtime_suspend() is about to be executed or there's a pending request
+    to execute it, ->runtime_idle() will not be executed for the same device.
+
+  * A request to execute or to schedule the execution of ->runtime_suspend()
+    will cancel any pending requests to execute ->runtime_idle() for the same
+    device.
+
+  * If ->runtime_resume() is about to be executed or there's a pending request
+    to execute it, the other callbacks will not be executed for the same device.
+
+  * A request to execute ->runtime_resume() will cancel any pending or
+    scheduled requests to execute the other callbacks for the same device,
+    except for scheduled autosuspends.
+
+3. Runtime PM Device Fields
+===========================
+
+The following device runtime PM fields are present in 'struct dev_pm_info', as
+defined in include/linux/pm.h:
+
+  `struct timer_list suspend_timer;`
+    - timer used for scheduling (delayed) suspend and autosuspend requests
+
+  `unsigned long timer_expires;`
+    - timer expiration time, in jiffies (if this is different from zero, the
+      timer is running and will expire at that time, otherwise the timer is not
+      running)
+
+  `struct work_struct work;`
+    - work structure used for queuing up requests (i.e. work items in pm_wq)
+
+  `wait_queue_head_t wait_queue;`
+    - wait queue used if any of the helper functions needs to wait for another
+      one to complete
+
+  `spinlock_t lock;`
+    - lock used for synchronization
+
+  `atomic_t usage_count;`
+    - the usage counter of the device
+
+  `atomic_t child_count;`
+    - the count of 'active' children of the device
+
+  `unsigned int ignore_children;`
+    - if set, the value of child_count is ignored (but still updated)
+
+  `unsigned int disable_depth;`
+    - used for disabling the helper functions (they work normally if this is
+      equal to zero); the initial value of it is 1 (i.e. runtime PM is
+      initially disabled for all devices)
+
+  `int runtime_error;`
+    - if set, there was a fatal error (one of the callbacks returned error code
+      as described in Section 2), so the helper functions will not work until
+      this flag is cleared; this is the error code returned by the failing
+      callback
+
+  `unsigned int idle_notification;`
+    - if set, ->runtime_idle() is being executed
+
+  `unsigned int request_pending;`
+    - if set, there's a pending request (i.e. a work item queued up into pm_wq)
+
+  `enum rpm_request request;`
+    - type of request that's pending (valid if request_pending is set)
+
+  `unsigned int deferred_resume;`
+    - set if ->runtime_resume() is about to be run while ->runtime_suspend() is
+      being executed for that device and it is not practical to wait for the
+      suspend to complete; means "start a resume as soon as you've suspended"
+
+  `enum rpm_status runtime_status;`
+    - the runtime PM status of the device; this field's initial value is
+      RPM_SUSPENDED, which means that each device is initially regarded by the
+      PM core as 'suspended', regardless of its real hardware status
+
+  `unsigned int runtime_auto;`
+    - if set, indicates that the user space has allowed the device driver to
+      power manage the device at run time via the /sys/devices/.../power/control
+      `interface;` it may only be modified with the help of the pm_runtime_allow()
+      and pm_runtime_forbid() helper functions
+
+  `unsigned int no_callbacks;`
+    - indicates that the device does not use the runtime PM callbacks (see
+      Section 8); it may be modified only by the pm_runtime_no_callbacks()
+      helper function
+
+  `unsigned int irq_safe;`
+    - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks
+      will be invoked with the spinlock held and interrupts disabled
+
+  `unsigned int use_autosuspend;`
+    - indicates that the device's driver supports delayed autosuspend (see
+      Section 9); it may be modified only by the
+      pm_runtime{_dont}_use_autosuspend() helper functions
+
+  `unsigned int timer_autosuspends;`
+    - indicates that the PM core should attempt to carry out an autosuspend
+      when the timer expires rather than a normal suspend
+
+  `int autosuspend_delay;`
+    - the delay time (in milliseconds) to be used for autosuspend
+
+  `unsigned long last_busy;`
+    - the time (in jiffies) when the pm_runtime_mark_last_busy() helper
+      function was last called for this device; used in calculating inactivity
+      periods for autosuspend
+
+All of the above fields are members of the 'power' member of 'struct device'.
+
+4. Runtime PM Device Helper Functions
+=====================================
+
+The following runtime PM helper functions are defined in
+drivers/base/power/runtime.c and include/linux/pm_runtime.h:
+
+  `void pm_runtime_init(struct device *dev);`
+    - initialize the device runtime PM fields in 'struct dev_pm_info'
+
+  `void pm_runtime_remove(struct device *dev);`
+    - make sure that the runtime PM of the device will be disabled after
+      removing the device from device hierarchy
+
+  `int pm_runtime_idle(struct device *dev);`
+    - execute the subsystem-level idle callback for the device; returns an
+      error code on failure, where -EINPROGRESS means that ->runtime_idle() is
+      already being executed; if there is no callback or the callback returns 0
+      then run pm_runtime_autosuspend(dev) and return its result
+
+  `int pm_runtime_suspend(struct device *dev);`
+    - execute the subsystem-level suspend callback for the device; returns 0 on
+      success, 1 if the device's runtime PM status was already 'suspended', or
+      error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
+      to suspend the device again in future and -EACCES means that
+      'power.disable_depth' is different from 0
+
+  `int pm_runtime_autosuspend(struct device *dev);`
+    - same as pm_runtime_suspend() except that the autosuspend delay is taken
+      `into account;` if pm_runtime_autosuspend_expiration() says the delay has
+      not yet expired then an autosuspend is scheduled for the appropriate time
+      and 0 is returned
+
+  `int pm_runtime_resume(struct device *dev);`
+    - execute the subsystem-level resume callback for the device; returns 0 on
+      success, 1 if the device's runtime PM status was already 'active' or
+      error code on failure, where -EAGAIN means it may be safe to attempt to
+      resume the device again in future, but 'power.runtime_error' should be
+      checked additionally, and -EACCES means that 'power.disable_depth' is
+      different from 0
+
+  `int pm_request_idle(struct device *dev);`
+    - submit a request to execute the subsystem-level idle callback for the
+      device (the request is represented by a work item in pm_wq); returns 0 on
+      success or error code if the request has not been queued up
+
+  `int pm_request_autosuspend(struct device *dev);`
+    - schedule the execution of the subsystem-level suspend callback for the
+      device when the autosuspend delay has expired; if the delay has already
+      expired then the work item is queued up immediately
+
+  `int pm_schedule_suspend(struct device *dev, unsigned int delay);`
+    - schedule the execution of the subsystem-level suspend callback for the
+      device in future, where 'delay' is the time to wait before queuing up a
+      suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work
+      item is queued up immediately); returns 0 on success, 1 if the device's PM
+      runtime status was already 'suspended', or error code if the request
+      hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
+      ->runtime_suspend() is already scheduled and not yet expired, the new
+      value of 'delay' will be used as the time to wait
+
+  `int pm_request_resume(struct device *dev);`
+    - submit a request to execute the subsystem-level resume callback for the
+      device (the request is represented by a work item in pm_wq); returns 0 on
+      success, 1 if the device's runtime PM status was already 'active', or
+      error code if the request hasn't been queued up
+
+  `void pm_runtime_get_noresume(struct device *dev);`
+    - increment the device's usage counter
+
+  `int pm_runtime_get(struct device *dev);`
+    - increment the device's usage counter, run pm_request_resume(dev) and
+      return its result
+
+  `int pm_runtime_get_sync(struct device *dev);`
+    - increment the device's usage counter, run pm_runtime_resume(dev) and
+      return its result
+
+  `int pm_runtime_get_if_in_use(struct device *dev);`
+    - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the
+      runtime PM status is RPM_ACTIVE and the runtime PM usage counter is
+      nonzero, increment the counter and return 1; otherwise return 0 without
+      changing the counter
+
+  `void pm_runtime_put_noidle(struct device *dev);`
+    - decrement the device's usage counter
+
+  `int pm_runtime_put(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_request_idle(dev) and return its result
+
+  `int pm_runtime_put_autosuspend(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_request_autosuspend(dev) and return its result
+
+  `int pm_runtime_put_sync(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_runtime_idle(dev) and return its result
+
+  `int pm_runtime_put_sync_suspend(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_runtime_suspend(dev) and return its result
+
+  `int pm_runtime_put_sync_autosuspend(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_runtime_autosuspend(dev) and return its result
+
+  `void pm_runtime_enable(struct device *dev);`
+    - decrement the device's 'power.disable_depth' field; if that field is equal
+      to zero, the runtime PM helper functions can execute subsystem-level
+      callbacks described in Section 2 for the device
+
+  `int pm_runtime_disable(struct device *dev);`
+    - increment the device's 'power.disable_depth' field (if the value of that
+      field was previously zero, this prevents subsystem-level runtime PM
+      callbacks from being run for the device), make sure that all of the
+      pending runtime PM operations on the device are either completed or
+      canceled; returns 1 if there was a resume request pending and it was
+      necessary to execute the subsystem-level resume callback for the device
+      to satisfy that request, otherwise 0 is returned
+
+  `int pm_runtime_barrier(struct device *dev);`
+    - check if there's a resume request pending for the device and resume it
+      (synchronously) in that case, cancel any other pending runtime PM requests
+      regarding it and wait for all runtime PM operations on it in progress to
+      complete; returns 1 if there was a resume request pending and it was
+      necessary to execute the subsystem-level resume callback for the device to
+      satisfy that request, otherwise 0 is returned
+
+  `void pm_suspend_ignore_children(struct device *dev, bool enable);`
+    - set/unset the power.ignore_children flag of the device
+
+  `int pm_runtime_set_active(struct device *dev);`
+    - clear the device's 'power.runtime_error' flag, set the device's runtime
+      PM status to 'active' and update its parent's counter of 'active'
+      children as appropriate (it is only valid to use this function if
+      'power.runtime_error' is set or 'power.disable_depth' is greater than
+      zero); it will fail and return error code if the device has a parent
+      which is not active and the 'power.ignore_children' flag of which is unset
+
+  `void pm_runtime_set_suspended(struct device *dev);`
+    - clear the device's 'power.runtime_error' flag, set the device's runtime
+      PM status to 'suspended' and update its parent's counter of 'active'
+      children as appropriate (it is only valid to use this function if
+      'power.runtime_error' is set or 'power.disable_depth' is greater than
+      zero)
+
+  `bool pm_runtime_active(struct device *dev);`
+    - return true if the device's runtime PM status is 'active' or its
+      'power.disable_depth' field is not equal to zero, or false otherwise
+
+  `bool pm_runtime_suspended(struct device *dev);`
+    - return true if the device's runtime PM status is 'suspended' and its
+      'power.disable_depth' field is equal to zero, or false otherwise
+
+  `bool pm_runtime_status_suspended(struct device *dev);`
+    - return true if the device's runtime PM status is 'suspended'
+
+  `void pm_runtime_allow(struct device *dev);`
+    - set the power.runtime_auto flag for the device and decrease its usage
+      counter (used by the /sys/devices/.../power/control interface to
+      effectively allow the device to be power managed at run time)
+
+  `void pm_runtime_forbid(struct device *dev);`
+    - unset the power.runtime_auto flag for the device and increase its usage
+      counter (used by the /sys/devices/.../power/control interface to
+      effectively prevent the device from being power managed at run time)
+
+  `void pm_runtime_no_callbacks(struct device *dev);`
+    - set the power.no_callbacks flag for the device and remove the runtime
+      PM attributes from /sys/devices/.../power (or prevent them from being
+      added when the device is registered)
+
+  `void pm_runtime_irq_safe(struct device *dev);`
+    - set the power.irq_safe flag for the device, causing the runtime-PM
+      callbacks to be invoked with interrupts off
+
+  `bool pm_runtime_is_irq_safe(struct device *dev);`
+    - return true if power.irq_safe flag was set for the device, causing
+      the runtime-PM callbacks to be invoked with interrupts off
+
+  `void pm_runtime_mark_last_busy(struct device *dev);`
+    - set the power.last_busy field to the current time
+
+  `void pm_runtime_use_autosuspend(struct device *dev);`
+    - set the power.use_autosuspend flag, enabling autosuspend delays; call
+      pm_runtime_get_sync if the flag was previously cleared and
+      power.autosuspend_delay is negative
+
+  `void pm_runtime_dont_use_autosuspend(struct device *dev);`
+    - clear the power.use_autosuspend flag, disabling autosuspend delays;
+      decrement the device's usage counter if the flag was previously set and
+      power.autosuspend_delay is negative; call pm_runtime_idle
+
+  `void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);`
+    - set the power.autosuspend_delay value to 'delay' (expressed in
+      milliseconds); if 'delay' is negative then runtime suspends are
+      prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be
+      called or the device's usage counter may be decremented and
+      pm_runtime_idle called depending on if power.autosuspend_delay is
+      changed to or from a negative value; if power.use_autosuspend is clear,
+      pm_runtime_idle is called
+
+  `unsigned long pm_runtime_autosuspend_expiration(struct device *dev);`
+    - calculate the time when the current autosuspend delay period will expire,
+      based on power.last_busy and power.autosuspend_delay; if the delay time
+      is 1000 ms or larger then the expiration time is rounded up to the
+      nearest second; returns 0 if the delay period has already expired or
+      power.use_autosuspend isn't set, otherwise returns the expiration time
+      in jiffies
+
+It is safe to execute the following helper functions from interrupt context:
+
+- pm_request_idle()
+- pm_request_autosuspend()
+- pm_schedule_suspend()
+- pm_request_resume()
+- pm_runtime_get_noresume()
+- pm_runtime_get()
+- pm_runtime_put_noidle()
+- pm_runtime_put()
+- pm_runtime_put_autosuspend()
+- pm_runtime_enable()
+- pm_suspend_ignore_children()
+- pm_runtime_set_active()
+- pm_runtime_set_suspended()
+- pm_runtime_suspended()
+- pm_runtime_mark_last_busy()
+- pm_runtime_autosuspend_expiration()
+
+If pm_runtime_irq_safe() has been called for a device then the following helper
+functions may also be used in interrupt context:
+
+- pm_runtime_idle()
+- pm_runtime_suspend()
+- pm_runtime_autosuspend()
+- pm_runtime_resume()
+- pm_runtime_get_sync()
+- pm_runtime_put_sync()
+- pm_runtime_put_sync_suspend()
+- pm_runtime_put_sync_autosuspend()
+
+5. Runtime PM Initialization, Device Probing and Removal
+========================================================
+
+Initially, the runtime PM is disabled for all devices, which means that the
+majority of the runtime PM helper functions described in Section 4 will return
+-EAGAIN until pm_runtime_enable() is called for the device.
+
+In addition to that, the initial runtime PM status of all devices is
+'suspended', but it need not reflect the actual physical state of the device.
+Thus, if the device is initially active (i.e. it is able to process I/O), its
+runtime PM status must be changed to 'active', with the help of
+pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
+
+However, if the device has a parent and the parent's runtime PM is enabled,
+calling pm_runtime_set_active() for the device will affect the parent, unless
+the parent's 'power.ignore_children' flag is set.  Namely, in that case the
+parent won't be able to suspend at run time, using the PM core's helper
+functions, as long as the child's status is 'active', even if the child's
+runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
+the child yet or pm_runtime_disable() has been called for it).  For this reason,
+once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
+should be called for it too as soon as reasonably possible or its runtime PM
+status should be changed back to 'suspended' with the help of
+pm_runtime_set_suspended().
+
+If the default initial runtime PM status of the device (i.e. 'suspended')
+reflects the actual state of the device, its bus type's or its driver's
+->probe() callback will likely need to wake it up using one of the PM core's
+helper functions described in Section 4.  In that case, pm_runtime_resume()
+should be used.  Of course, for this purpose the device's runtime PM has to be
+enabled earlier by calling pm_runtime_enable().
+
+Note, if the device may execute pm_runtime calls during the probe (such as
+if it is registers with a subsystem that may call back in) then the
+pm_runtime_get_sync() call paired with a pm_runtime_put() call will be
+appropriate to ensure that the device is not put back to sleep during the
+probe. This can happen with systems such as the network device layer.
+
+It may be desirable to suspend the device once ->probe() has finished.
+Therefore the driver core uses the asynchronous pm_request_idle() to submit a
+request to execute the subsystem-level idle callback for the device at that
+time.  A driver that makes use of the runtime autosuspend feature, may want to
+update the last busy mark before returning from ->probe().
+
+Moreover, the driver core prevents runtime PM callbacks from racing with the bus
+notifier callback in __device_release_driver(), which is necessary, because the
+notifier is used by some subsystems to carry out operations affecting the
+runtime PM functionality.  It does so by calling pm_runtime_get_sync() before
+driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications.  This
+resumes the device if it's in the suspended state and prevents it from
+being suspended again while those routines are being executed.
+
+To allow bus types and drivers to put devices into the suspended state by
+calling pm_runtime_suspend() from their ->remove() routines, the driver core
+executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER
+notifications in __device_release_driver().  This requires bus types and
+drivers to make their ->remove() callbacks avoid races with runtime PM directly,
+but also it allows of more flexibility in the handling of devices during the
+removal of their drivers.
+
+Drivers in ->remove() callback should undo the runtime PM changes done
+in ->probe(). Usually this means calling pm_runtime_disable(),
+pm_runtime_dont_use_autosuspend() etc.
+
+The user space can effectively disallow the driver of the device to power manage
+it at run time by changing the value of its /sys/devices/.../power/control
+attribute to "on", which causes pm_runtime_forbid() to be called.  In principle,
+this mechanism may also be used by the driver to effectively turn off the
+runtime power management of the device until the user space turns it on.
+Namely, during the initialization the driver can make sure that the runtime PM
+status of the device is 'active' and call pm_runtime_forbid().  It should be
+noted, however, that if the user space has already intentionally changed the
+value of /sys/devices/.../power/control to "auto" to allow the driver to power
+manage the device at run time, the driver may confuse it by using
+pm_runtime_forbid() this way.
+
+6. Runtime PM and System Sleep
+==============================
+
+Runtime PM and system sleep (i.e., system suspend and hibernation, also known
+as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of
+ways.  If a device is active when a system sleep starts, everything is
+straightforward.  But what should happen if the device is already suspended?
+
+The device may have different wake-up settings for runtime PM and system sleep.
+For example, remote wake-up may be enabled for runtime suspend but disallowed
+for system sleep (device_may_wakeup(dev) returns 'false').  When this happens,
+the subsystem-level system suspend callback is responsible for changing the
+device's wake-up setting (it may leave that to the device driver's system
+suspend routine).  It may be necessary to resume the device and suspend it again
+in order to do so.  The same is true if the driver uses different power levels
+or other settings for runtime suspend and system sleep.
+
+During system resume, the simplest approach is to bring all devices back to full
+power, even if they had been suspended before the system suspend began.  There
+are several reasons for this, including:
+
+  * The device might need to switch power levels, wake-up settings, etc.
+
+  * Remote wake-up events might have been lost by the firmware.
+
+  * The device's children may need the device to be at full power in order
+    to resume themselves.
+
+  * The driver's idea of the device state may not agree with the device's
+    physical state.  This can happen during resume from hibernation.
+
+  * The device might need to be reset.
+
+  * Even though the device was suspended, if its usage counter was > 0 then most
+    likely it would need a runtime resume in the near future anyway.
+
+If the device had been suspended before the system suspend began and it's
+brought back to full power during resume, then its runtime PM status will have
+to be updated to reflect the actual post-system sleep status.  The way to do
+this is:
+
+	 - pm_runtime_disable(dev);
+	 - pm_runtime_set_active(dev);
+	 - pm_runtime_enable(dev);
+
+The PM core always increments the runtime usage counter before calling the
+->suspend() callback and decrements it after calling the ->resume() callback.
+Hence disabling runtime PM temporarily like this will not cause any runtime
+suspend attempts to be permanently lost.  If the usage count goes to zero
+following the return of the ->resume() callback, the ->runtime_idle() callback
+will be invoked as usual.
+
+On some systems, however, system sleep is not entered through a global firmware
+or hardware operation.  Instead, all hardware components are put into low-power
+states directly by the kernel in a coordinated way.  Then, the system sleep
+state effectively follows from the states the hardware components end up in
+and the system is woken up from that state by a hardware interrupt or a similar
+mechanism entirely under the kernel's control.  As a result, the kernel never
+gives control away and the states of all devices during resume are precisely
+known to it.  If that is the case and none of the situations listed above takes
+place (in particular, if the system is not waking up from hibernation), it may
+be more efficient to leave the devices that had been suspended before the system
+suspend began in the suspended state.
+
+To this end, the PM core provides a mechanism allowing some coordination between
+different levels of device hierarchy.  Namely, if a system suspend .prepare()
+callback returns a positive number for a device, that indicates to the PM core
+that the device appears to be runtime-suspended and its state is fine, so it
+may be left in runtime suspend provided that all of its descendants are also
+left in runtime suspend.  If that happens, the PM core will not execute any
+system suspend and resume callbacks for all of those devices, except for the
+complete callback, which is then entirely responsible for handling the device
+as appropriate.  This only applies to system suspend transitions that are not
+related to hibernation (see Documentation/driver-api/pm/devices.rst for more
+information).
+
+The PM core does its best to reduce the probability of race conditions between
+the runtime PM and system suspend/resume (and hibernation) callbacks by carrying
+out the following operations:
+
+  * During system suspend pm_runtime_get_noresume() is called for every device
+    right before executing the subsystem-level .prepare() callback for it and
+    pm_runtime_barrier() is called for every device right before executing the
+    subsystem-level .suspend() callback for it.  In addition to that the PM core
+    calls  __pm_runtime_disable() with 'false' as the second argument for every
+    device right before executing the subsystem-level .suspend_late() callback
+    for it.
+
+  * During system resume pm_runtime_enable() and pm_runtime_put() are called for
+    every device right after executing the subsystem-level .resume_early()
+    callback and right after executing the subsystem-level .complete() callback
+    for it, respectively.
+
+7. Generic subsystem callbacks
+
+Subsystems may wish to conserve code space by using the set of generic power
+management callbacks provided by the PM core, defined in
+driver/base/power/generic_ops.c:
+
+  `int pm_generic_runtime_suspend(struct device *dev);`
+    - invoke the ->runtime_suspend() callback provided by the driver of this
+      device and return its result, or return 0 if not defined
+
+  `int pm_generic_runtime_resume(struct device *dev);`
+    - invoke the ->runtime_resume() callback provided by the driver of this
+      device and return its result, or return 0 if not defined
+
+  `int pm_generic_suspend(struct device *dev);`
+    - if the device has not been suspended at run time, invoke the ->suspend()
+      callback provided by its driver and return its result, or return 0 if not
+      defined
+
+  `int pm_generic_suspend_noirq(struct device *dev);`
+    - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
+  `int pm_generic_resume(struct device *dev);`
+    - invoke the ->resume() callback provided by the driver of this device and,
+      if successful, change the device's runtime PM status to 'active'
+
+  `int pm_generic_resume_noirq(struct device *dev);`
+    - invoke the ->resume_noirq() callback provided by the driver of this device
+
+  `int pm_generic_freeze(struct device *dev);`
+    - if the device has not been suspended at run time, invoke the ->freeze()
+      callback provided by its driver and return its result, or return 0 if not
+      defined
+
+  `int pm_generic_freeze_noirq(struct device *dev);`
+    - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
+  `int pm_generic_thaw(struct device *dev);`
+    - if the device has not been suspended at run time, invoke the ->thaw()
+      callback provided by its driver and return its result, or return 0 if not
+      defined
+
+  `int pm_generic_thaw_noirq(struct device *dev);`
+    - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
+  `int pm_generic_poweroff(struct device *dev);`
+    - if the device has not been suspended at run time, invoke the ->poweroff()
+      callback provided by its driver and return its result, or return 0 if not
+      defined
+
+  `int pm_generic_poweroff_noirq(struct device *dev);`
+    - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
+  `int pm_generic_restore(struct device *dev);`
+    - invoke the ->restore() callback provided by the driver of this device and,
+      if successful, change the device's runtime PM status to 'active'
+
+  `int pm_generic_restore_noirq(struct device *dev);`
+    - invoke the ->restore_noirq() callback provided by the device's driver
+
+These functions are the defaults used by the PM core, if a subsystem doesn't
+provide its own callbacks for ->runtime_idle(), ->runtime_suspend(),
+->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(),
+->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(),
+->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the
+subsystem-level dev_pm_ops structure.
+
+Device drivers that wish to use the same function as a system suspend, freeze,
+poweroff and runtime suspend callback, and similarly for system resume, thaw,
+restore, and runtime resume, can achieve this with the help of the
+UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its
+last argument to NULL).
+
+8. "No-Callback" Devices
+========================
+
+Some "devices" are only logical sub-devices of their parent and cannot be
+power-managed on their own.  (The prototype example is a USB interface.  Entire
+USB devices can go into low-power mode or send wake-up requests, but neither is
+possible for individual interfaces.)  The drivers for these devices have no
+need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend()
+and ->runtime_resume() would always return 0 without doing anything else and
+->runtime_idle() would always call pm_runtime_suspend().
+
+Subsystems can tell the PM core about these devices by calling
+pm_runtime_no_callbacks().  This should be done after the device structure is
+initialized and before it is registered (although after device registration is
+also okay).  The routine will set the device's power.no_callbacks flag and
+prevent the non-debugging runtime PM sysfs attributes from being created.
+
+When power.no_callbacks is set, the PM core will not invoke the
+->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks.
+Instead it will assume that suspends and resumes always succeed and that idle
+devices should be suspended.
+
+As a consequence, the PM core will never directly inform the device's subsystem
+or driver about runtime power changes.  Instead, the driver for the device's
+parent must take responsibility for telling the device's driver when the
+parent's power state changes.
+
+9. Autosuspend, or automatically-delayed suspends
+=================================================
+
+Changing a device's power state isn't free; it requires both time and energy.
+A device should be put in a low-power state only when there's some reason to
+think it will remain in that state for a substantial time.  A common heuristic
+says that a device which hasn't been used for a while is liable to remain
+unused; following this advice, drivers should not allow devices to be suspended
+at runtime until they have been inactive for some minimum period.  Even when
+the heuristic ends up being non-optimal, it will still prevent devices from
+"bouncing" too rapidly between low-power and full-power states.
+
+The term "autosuspend" is an historical remnant.  It doesn't mean that the
+device is automatically suspended (the subsystem or driver still has to call
+the appropriate PM routines); rather it means that runtime suspends will
+automatically be delayed until the desired period of inactivity has elapsed.
+
+Inactivity is determined based on the power.last_busy field.  Drivers should
+call pm_runtime_mark_last_busy() to update this field after carrying out I/O,
+typically just before calling pm_runtime_put_autosuspend().  The desired length
+of the inactivity period is a matter of policy.  Subsystems can set this length
+initially by calling pm_runtime_set_autosuspend_delay(), but after device
+registration the length should be controlled by user space, using the
+/sys/devices/.../power/autosuspend_delay_ms attribute.
+
+In order to use autosuspend, subsystems or drivers must call
+pm_runtime_use_autosuspend() (preferably before registering the device), and
+thereafter they should use the various `*_autosuspend()` helper functions
+instead of the non-autosuspend counterparts::
+
+	Instead of: pm_runtime_suspend    use: pm_runtime_autosuspend;
+	Instead of: pm_schedule_suspend   use: pm_request_autosuspend;
+	Instead of: pm_runtime_put        use: pm_runtime_put_autosuspend;
+	Instead of: pm_runtime_put_sync   use: pm_runtime_put_sync_autosuspend.
+
+Drivers may also continue to use the non-autosuspend helper functions; they
+will behave normally, which means sometimes taking the autosuspend delay into
+account (see pm_runtime_idle).
+
+Under some circumstances a driver or subsystem may want to prevent a device
+from autosuspending immediately, even though the usage counter is zero and the
+autosuspend delay time has expired.  If the ->runtime_suspend() callback
+returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is
+in the future (as it normally would be if the callback invoked
+pm_runtime_mark_last_busy()), the PM core will automatically reschedule the
+autosuspend.  The ->runtime_suspend() callback can't do this rescheduling
+itself because no suspend requests of any kind are accepted while the device is
+suspending (i.e., while the callback is running).
+
+The implementation is well suited for asynchronous use in interrupt contexts.
+However such use inevitably involves races, because the PM core can't
+synchronize ->runtime_suspend() callbacks with the arrival of I/O requests.
+This synchronization must be handled by the driver, using its private lock.
+Here is a schematic pseudo-code example::
+
+	foo_read_or_write(struct foo_priv *foo, void *data)
+	{
+		lock(&foo->private_lock);
+		add_request_to_io_queue(foo, data);
+		if (foo->num_pending_requests++ == 0)
+			pm_runtime_get(&foo->dev);
+		if (!foo->is_suspended)
+			foo_process_next_request(foo);
+		unlock(&foo->private_lock);
+	}
+
+	foo_io_completion(struct foo_priv *foo, void *req)
+	{
+		lock(&foo->private_lock);
+		if (--foo->num_pending_requests == 0) {
+			pm_runtime_mark_last_busy(&foo->dev);
+			pm_runtime_put_autosuspend(&foo->dev);
+		} else {
+			foo_process_next_request(foo);
+		}
+		unlock(&foo->private_lock);
+		/* Send req result back to the user ... */
+	}
+
+	int foo_runtime_suspend(struct device *dev)
+	{
+		struct foo_priv foo = container_of(dev, ...);
+		int ret = 0;
+
+		lock(&foo->private_lock);
+		if (foo->num_pending_requests > 0) {
+			ret = -EBUSY;
+		} else {
+			/* ... suspend the device ... */
+			foo->is_suspended = 1;
+		}
+		unlock(&foo->private_lock);
+		return ret;
+	}
+
+	int foo_runtime_resume(struct device *dev)
+	{
+		struct foo_priv foo = container_of(dev, ...);
+
+		lock(&foo->private_lock);
+		/* ... resume the device ... */
+		foo->is_suspended = 0;
+		pm_runtime_mark_last_busy(&foo->dev);
+		if (foo->num_pending_requests > 0)
+			foo_process_next_request(foo);
+		unlock(&foo->private_lock);
+		return 0;
+	}
+
+The important point is that after foo_io_completion() asks for an autosuspend,
+the foo_runtime_suspend() callback may race with foo_read_or_write().
+Therefore foo_runtime_suspend() has to check whether there are any pending I/O
+requests (while holding the private lock) before allowing the suspend to
+proceed.
+
+In addition, the power.autosuspend_delay field can be changed by user space at
+any time.  If a driver cares about this, it can call
+pm_runtime_autosuspend_expiration() from within the ->runtime_suspend()
+callback while holding its private lock.  If the function returns a nonzero
+value then the delay has not yet expired and the callback should return
+-EAGAIN.
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
deleted file mode 100644
index 937e33c46211..000000000000
--- a/Documentation/power/runtime_pm.txt
+++ /dev/null
@@ -1,928 +0,0 @@
-Runtime Power Management Framework for I/O Devices
-
-(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
-(C) 2010 Alan Stern <stern@rowland.harvard.edu>
-(C) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-1. Introduction
-
-Support for runtime power management (runtime PM) of I/O devices is provided
-at the power management core (PM core) level by means of:
-
-* The power management workqueue pm_wq in which bus types and device drivers can
-  put their PM-related work items.  It is strongly recommended that pm_wq be
-  used for queuing all work items related to runtime PM, because this allows
-  them to be synchronized with system-wide power transitions (suspend to RAM,
-  hibernation and resume from system sleep states).  pm_wq is declared in
-  include/linux/pm_runtime.h and defined in kernel/power/main.c.
-
-* A number of runtime PM fields in the 'power' member of 'struct device' (which
-  is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can
-  be used for synchronizing runtime PM operations with one another.
-
-* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in
-  include/linux/pm.h).
-
-* A set of helper functions defined in drivers/base/power/runtime.c that can be
-  used for carrying out runtime PM operations in such a way that the
-  synchronization between them is taken care of by the PM core.  Bus types and
-  device drivers are encouraged to use these functions.
-
-The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM
-fields of 'struct dev_pm_info' and the core helper functions provided for
-runtime PM are described below.
-
-2. Device Runtime PM Callbacks
-
-There are three device runtime PM callbacks defined in 'struct dev_pm_ops':
-
-struct dev_pm_ops {
-	...
-	int (*runtime_suspend)(struct device *dev);
-	int (*runtime_resume)(struct device *dev);
-	int (*runtime_idle)(struct device *dev);
-	...
-};
-
-The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks
-are executed by the PM core for the device's subsystem that may be either of
-the following:
-
-  1. PM domain of the device, if the device's PM domain object, dev->pm_domain,
-     is present.
-
-  2. Device type of the device, if both dev->type and dev->type->pm are present.
-
-  3. Device class of the device, if both dev->class and dev->class->pm are
-     present.
-
-  4. Bus type of the device, if both dev->bus and dev->bus->pm are present.
-
-If the subsystem chosen by applying the above rules doesn't provide the relevant
-callback, the PM core will invoke the corresponding driver callback stored in
-dev->driver->pm directly (if present).
-
-The PM core always checks which callback to use in the order given above, so the
-priority order of callbacks from high to low is: PM domain, device type, class
-and bus type.  Moreover, the high-priority one will always take precedence over
-a low-priority one.  The PM domain, bus type, device type and class callbacks
-are referred to as subsystem-level callbacks in what follows.
-
-By default, the callbacks are always invoked in process context with interrupts
-enabled.  However, the pm_runtime_irq_safe() helper function can be used to tell
-the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume()
-and ->runtime_idle() callbacks for the given device in atomic context with
-interrupts disabled.  This implies that the callback routines in question must
-not block or sleep, but it also means that the synchronous helper functions
-listed at the end of Section 4 may be used for that device within an interrupt
-handler or generally in an atomic context.
-
-The subsystem-level suspend callback, if present, is _entirely_ _responsible_
-for handling the suspend of the device as appropriate, which may, but need not
-include executing the device driver's own ->runtime_suspend() callback (from the
-PM core's point of view it is not necessary to implement a ->runtime_suspend()
-callback in a device driver as long as the subsystem-level suspend callback
-knows what to do to handle the device).
-
-  * Once the subsystem-level suspend callback (or the driver suspend callback,
-    if invoked directly) has completed successfully for the given device, the PM
-    core regards the device as suspended, which need not mean that it has been
-    put into a low power state.  It is supposed to mean, however, that the
-    device will not process data and will not communicate with the CPU(s) and
-    RAM until the appropriate resume callback is executed for it.  The runtime
-    PM status of a device after successful execution of the suspend callback is
-    'suspended'.
-
-  * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM
-    status remains 'active', which means that the device _must_ be fully
-    operational afterwards.
-
-  * If the suspend callback returns an error code different from -EBUSY and
-    -EAGAIN, the PM core regards this as a fatal error and will refuse to run
-    the helper functions described in Section 4 for the device until its status
-    is directly set to  either 'active', or 'suspended' (the PM core provides
-    special helper functions for this purpose).
-
-In particular, if the driver requires remote wakeup capability (i.e. hardware
-mechanism allowing the device to request a change of its power state, such as
-PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the
-device, then ->runtime_suspend() should return -EBUSY.  On the other hand, if
-device_can_wakeup() returns 'true' for the device and the device is put into a
-low-power state during the execution of the suspend callback, it is expected
-that remote wakeup will be enabled for the device.  Generally, remote wakeup
-should be enabled for all input devices put into low-power states at run time.
-
-The subsystem-level resume callback, if present, is _entirely_ _responsible_ for
-handling the resume of the device as appropriate, which may, but need not
-include executing the device driver's own ->runtime_resume() callback (from the
-PM core's point of view it is not necessary to implement a ->runtime_resume()
-callback in a device driver as long as the subsystem-level resume callback knows
-what to do to handle the device).
-
-  * Once the subsystem-level resume callback (or the driver resume callback, if
-    invoked directly) has completed successfully, the PM core regards the device
-    as fully operational, which means that the device _must_ be able to complete
-    I/O operations as needed.  The runtime PM status of the device is then
-    'active'.
-
-  * If the resume callback returns an error code, the PM core regards this as a
-    fatal error and will refuse to run the helper functions described in Section
-    4 for the device, until its status is directly set to either 'active', or
-    'suspended' (by means of special helper functions provided by the PM core
-    for this purpose).
-
-The idle callback (a subsystem-level one, if present, or the driver one) is
-executed by the PM core whenever the device appears to be idle, which is
-indicated to the PM core by two counters, the device's usage counter and the
-counter of 'active' children of the device.
-
-  * If any of these counters is decreased using a helper function provided by
-    the PM core and it turns out to be equal to zero, the other counter is
-    checked.  If that counter also is equal to zero, the PM core executes the
-    idle callback with the device as its argument.
-
-The action performed by the idle callback is totally dependent on the subsystem
-(or driver) in question, but the expected and recommended action is to check
-if the device can be suspended (i.e. if all of the conditions necessary for
-suspending the device are satisfied) and to queue up a suspend request for the
-device in that case.  If there is no idle callback, or if the callback returns
-0, then the PM core will attempt to carry out a runtime suspend of the device,
-also respecting devices configured for autosuspend.  In essence this means a
-call to pm_runtime_autosuspend() (do note that drivers needs to update the
-device last busy mark, pm_runtime_mark_last_busy(), to control the delay under
-this circumstance).  To prevent this (for example, if the callback routine has
-started a delayed suspend), the routine must return a non-zero value.  Negative
-error return codes are ignored by the PM core.
-
-The helper functions provided by the PM core, described in Section 4, guarantee
-that the following constraints are met with respect to runtime PM callbacks for
-one device:
-
-(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
-    ->runtime_suspend() in parallel with ->runtime_resume() or with another
-    instance of ->runtime_suspend() for the same device) with the exception that
-    ->runtime_suspend() or ->runtime_resume() can be executed in parallel with
-    ->runtime_idle() (although ->runtime_idle() will not be started while any
-    of the other callbacks is being executed for the same device).
-
-(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active'
-    devices (i.e. the PM core will only execute ->runtime_idle() or
-    ->runtime_suspend() for the devices the runtime PM status of which is
-    'active').
-
-(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device
-    the usage counter of which is equal to zero _and_ either the counter of
-    'active' children of which is equal to zero, or the 'power.ignore_children'
-    flag of which is set.
-
-(4) ->runtime_resume() can only be executed for 'suspended' devices  (i.e. the
-    PM core will only execute ->runtime_resume() for the devices the runtime
-    PM status of which is 'suspended').
-
-Additionally, the helper functions provided by the PM core obey the following
-rules:
-
-  * If ->runtime_suspend() is about to be executed or there's a pending request
-    to execute it, ->runtime_idle() will not be executed for the same device.
-
-  * A request to execute or to schedule the execution of ->runtime_suspend()
-    will cancel any pending requests to execute ->runtime_idle() for the same
-    device.
-
-  * If ->runtime_resume() is about to be executed or there's a pending request
-    to execute it, the other callbacks will not be executed for the same device.
-
-  * A request to execute ->runtime_resume() will cancel any pending or
-    scheduled requests to execute the other callbacks for the same device,
-    except for scheduled autosuspends.
-
-3. Runtime PM Device Fields
-
-The following device runtime PM fields are present in 'struct dev_pm_info', as
-defined in include/linux/pm.h:
-
-  struct timer_list suspend_timer;
-    - timer used for scheduling (delayed) suspend and autosuspend requests
-
-  unsigned long timer_expires;
-    - timer expiration time, in jiffies (if this is different from zero, the
-      timer is running and will expire at that time, otherwise the timer is not
-      running)
-
-  struct work_struct work;
-    - work structure used for queuing up requests (i.e. work items in pm_wq)
-
-  wait_queue_head_t wait_queue;
-    - wait queue used if any of the helper functions needs to wait for another
-      one to complete
-
-  spinlock_t lock;
-    - lock used for synchronization
-
-  atomic_t usage_count;
-    - the usage counter of the device
-
-  atomic_t child_count;
-    - the count of 'active' children of the device
-
-  unsigned int ignore_children;
-    - if set, the value of child_count is ignored (but still updated)
-
-  unsigned int disable_depth;
-    - used for disabling the helper functions (they work normally if this is
-      equal to zero); the initial value of it is 1 (i.e. runtime PM is
-      initially disabled for all devices)
-
-  int runtime_error;
-    - if set, there was a fatal error (one of the callbacks returned error code
-      as described in Section 2), so the helper functions will not work until
-      this flag is cleared; this is the error code returned by the failing
-      callback
-
-  unsigned int idle_notification;
-    - if set, ->runtime_idle() is being executed
-
-  unsigned int request_pending;
-    - if set, there's a pending request (i.e. a work item queued up into pm_wq)
-
-  enum rpm_request request;
-    - type of request that's pending (valid if request_pending is set)
-
-  unsigned int deferred_resume;
-    - set if ->runtime_resume() is about to be run while ->runtime_suspend() is
-      being executed for that device and it is not practical to wait for the
-      suspend to complete; means "start a resume as soon as you've suspended"
-
-  enum rpm_status runtime_status;
-    - the runtime PM status of the device; this field's initial value is
-      RPM_SUSPENDED, which means that each device is initially regarded by the
-      PM core as 'suspended', regardless of its real hardware status
-
-  unsigned int runtime_auto;
-    - if set, indicates that the user space has allowed the device driver to
-      power manage the device at run time via the /sys/devices/.../power/control
-      interface; it may only be modified with the help of the pm_runtime_allow()
-      and pm_runtime_forbid() helper functions
-
-  unsigned int no_callbacks;
-    - indicates that the device does not use the runtime PM callbacks (see
-      Section 8); it may be modified only by the pm_runtime_no_callbacks()
-      helper function
-
-  unsigned int irq_safe;
-    - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks
-      will be invoked with the spinlock held and interrupts disabled
-
-  unsigned int use_autosuspend;
-    - indicates that the device's driver supports delayed autosuspend (see
-      Section 9); it may be modified only by the
-      pm_runtime{_dont}_use_autosuspend() helper functions
-
-  unsigned int timer_autosuspends;
-    - indicates that the PM core should attempt to carry out an autosuspend
-      when the timer expires rather than a normal suspend
-
-  int autosuspend_delay;
-    - the delay time (in milliseconds) to be used for autosuspend
-
-  unsigned long last_busy;
-    - the time (in jiffies) when the pm_runtime_mark_last_busy() helper
-      function was last called for this device; used in calculating inactivity
-      periods for autosuspend
-
-All of the above fields are members of the 'power' member of 'struct device'.
-
-4. Runtime PM Device Helper Functions
-
-The following runtime PM helper functions are defined in
-drivers/base/power/runtime.c and include/linux/pm_runtime.h:
-
-  void pm_runtime_init(struct device *dev);
-    - initialize the device runtime PM fields in 'struct dev_pm_info'
-
-  void pm_runtime_remove(struct device *dev);
-    - make sure that the runtime PM of the device will be disabled after
-      removing the device from device hierarchy
-
-  int pm_runtime_idle(struct device *dev);
-    - execute the subsystem-level idle callback for the device; returns an
-      error code on failure, where -EINPROGRESS means that ->runtime_idle() is
-      already being executed; if there is no callback or the callback returns 0
-      then run pm_runtime_autosuspend(dev) and return its result
-
-  int pm_runtime_suspend(struct device *dev);
-    - execute the subsystem-level suspend callback for the device; returns 0 on
-      success, 1 if the device's runtime PM status was already 'suspended', or
-      error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
-      to suspend the device again in future and -EACCES means that
-      'power.disable_depth' is different from 0
-
-  int pm_runtime_autosuspend(struct device *dev);
-    - same as pm_runtime_suspend() except that the autosuspend delay is taken
-      into account; if pm_runtime_autosuspend_expiration() says the delay has
-      not yet expired then an autosuspend is scheduled for the appropriate time
-      and 0 is returned
-
-  int pm_runtime_resume(struct device *dev);
-    - execute the subsystem-level resume callback for the device; returns 0 on
-      success, 1 if the device's runtime PM status was already 'active' or
-      error code on failure, where -EAGAIN means it may be safe to attempt to
-      resume the device again in future, but 'power.runtime_error' should be
-      checked additionally, and -EACCES means that 'power.disable_depth' is
-      different from 0
-
-  int pm_request_idle(struct device *dev);
-    - submit a request to execute the subsystem-level idle callback for the
-      device (the request is represented by a work item in pm_wq); returns 0 on
-      success or error code if the request has not been queued up
-
-  int pm_request_autosuspend(struct device *dev);
-    - schedule the execution of the subsystem-level suspend callback for the
-      device when the autosuspend delay has expired; if the delay has already
-      expired then the work item is queued up immediately
-
-  int pm_schedule_suspend(struct device *dev, unsigned int delay);
-    - schedule the execution of the subsystem-level suspend callback for the
-      device in future, where 'delay' is the time to wait before queuing up a
-      suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work
-      item is queued up immediately); returns 0 on success, 1 if the device's PM
-      runtime status was already 'suspended', or error code if the request
-      hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
-      ->runtime_suspend() is already scheduled and not yet expired, the new
-      value of 'delay' will be used as the time to wait
-
-  int pm_request_resume(struct device *dev);
-    - submit a request to execute the subsystem-level resume callback for the
-      device (the request is represented by a work item in pm_wq); returns 0 on
-      success, 1 if the device's runtime PM status was already 'active', or
-      error code if the request hasn't been queued up
-
-  void pm_runtime_get_noresume(struct device *dev);
-    - increment the device's usage counter
-
-  int pm_runtime_get(struct device *dev);
-    - increment the device's usage counter, run pm_request_resume(dev) and
-      return its result
-
-  int pm_runtime_get_sync(struct device *dev);
-    - increment the device's usage counter, run pm_runtime_resume(dev) and
-      return its result
-
-  int pm_runtime_get_if_in_use(struct device *dev);
-    - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the
-      runtime PM status is RPM_ACTIVE and the runtime PM usage counter is
-      nonzero, increment the counter and return 1; otherwise return 0 without
-      changing the counter
-
-  void pm_runtime_put_noidle(struct device *dev);
-    - decrement the device's usage counter
-
-  int pm_runtime_put(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_request_idle(dev) and return its result
-
-  int pm_runtime_put_autosuspend(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_request_autosuspend(dev) and return its result
-
-  int pm_runtime_put_sync(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_runtime_idle(dev) and return its result
-
-  int pm_runtime_put_sync_suspend(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_runtime_suspend(dev) and return its result
-
-  int pm_runtime_put_sync_autosuspend(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_runtime_autosuspend(dev) and return its result
-
-  void pm_runtime_enable(struct device *dev);
-    - decrement the device's 'power.disable_depth' field; if that field is equal
-      to zero, the runtime PM helper functions can execute subsystem-level
-      callbacks described in Section 2 for the device
-
-  int pm_runtime_disable(struct device *dev);
-    - increment the device's 'power.disable_depth' field (if the value of that
-      field was previously zero, this prevents subsystem-level runtime PM
-      callbacks from being run for the device), make sure that all of the
-      pending runtime PM operations on the device are either completed or
-      canceled; returns 1 if there was a resume request pending and it was
-      necessary to execute the subsystem-level resume callback for the device
-      to satisfy that request, otherwise 0 is returned
-
-  int pm_runtime_barrier(struct device *dev);
-    - check if there's a resume request pending for the device and resume it
-      (synchronously) in that case, cancel any other pending runtime PM requests
-      regarding it and wait for all runtime PM operations on it in progress to
-      complete; returns 1 if there was a resume request pending and it was
-      necessary to execute the subsystem-level resume callback for the device to
-      satisfy that request, otherwise 0 is returned
-
-  void pm_suspend_ignore_children(struct device *dev, bool enable);
-    - set/unset the power.ignore_children flag of the device
-
-  int pm_runtime_set_active(struct device *dev);
-    - clear the device's 'power.runtime_error' flag, set the device's runtime
-      PM status to 'active' and update its parent's counter of 'active'
-      children as appropriate (it is only valid to use this function if
-      'power.runtime_error' is set or 'power.disable_depth' is greater than
-      zero); it will fail and return error code if the device has a parent
-      which is not active and the 'power.ignore_children' flag of which is unset
-
-  void pm_runtime_set_suspended(struct device *dev);
-    - clear the device's 'power.runtime_error' flag, set the device's runtime
-      PM status to 'suspended' and update its parent's counter of 'active'
-      children as appropriate (it is only valid to use this function if
-      'power.runtime_error' is set or 'power.disable_depth' is greater than
-      zero)
-
-  bool pm_runtime_active(struct device *dev);
-    - return true if the device's runtime PM status is 'active' or its
-      'power.disable_depth' field is not equal to zero, or false otherwise
-
-  bool pm_runtime_suspended(struct device *dev);
-    - return true if the device's runtime PM status is 'suspended' and its
-      'power.disable_depth' field is equal to zero, or false otherwise
-
-  bool pm_runtime_status_suspended(struct device *dev);
-    - return true if the device's runtime PM status is 'suspended'
-
-  void pm_runtime_allow(struct device *dev);
-    - set the power.runtime_auto flag for the device and decrease its usage
-      counter (used by the /sys/devices/.../power/control interface to
-      effectively allow the device to be power managed at run time)
-
-  void pm_runtime_forbid(struct device *dev);
-    - unset the power.runtime_auto flag for the device and increase its usage
-      counter (used by the /sys/devices/.../power/control interface to
-      effectively prevent the device from being power managed at run time)
-
-  void pm_runtime_no_callbacks(struct device *dev);
-    - set the power.no_callbacks flag for the device and remove the runtime
-      PM attributes from /sys/devices/.../power (or prevent them from being
-      added when the device is registered)
-
-  void pm_runtime_irq_safe(struct device *dev);
-    - set the power.irq_safe flag for the device, causing the runtime-PM
-      callbacks to be invoked with interrupts off
-
-  bool pm_runtime_is_irq_safe(struct device *dev);
-    - return true if power.irq_safe flag was set for the device, causing
-      the runtime-PM callbacks to be invoked with interrupts off
-
-  void pm_runtime_mark_last_busy(struct device *dev);
-    - set the power.last_busy field to the current time
-
-  void pm_runtime_use_autosuspend(struct device *dev);
-    - set the power.use_autosuspend flag, enabling autosuspend delays; call
-      pm_runtime_get_sync if the flag was previously cleared and
-      power.autosuspend_delay is negative
-
-  void pm_runtime_dont_use_autosuspend(struct device *dev);
-    - clear the power.use_autosuspend flag, disabling autosuspend delays;
-      decrement the device's usage counter if the flag was previously set and
-      power.autosuspend_delay is negative; call pm_runtime_idle
-
-  void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
-    - set the power.autosuspend_delay value to 'delay' (expressed in
-      milliseconds); if 'delay' is negative then runtime suspends are
-      prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be
-      called or the device's usage counter may be decremented and
-      pm_runtime_idle called depending on if power.autosuspend_delay is
-      changed to or from a negative value; if power.use_autosuspend is clear,
-      pm_runtime_idle is called
-
-  unsigned long pm_runtime_autosuspend_expiration(struct device *dev);
-    - calculate the time when the current autosuspend delay period will expire,
-      based on power.last_busy and power.autosuspend_delay; if the delay time
-      is 1000 ms or larger then the expiration time is rounded up to the
-      nearest second; returns 0 if the delay period has already expired or
-      power.use_autosuspend isn't set, otherwise returns the expiration time
-      in jiffies
-
-It is safe to execute the following helper functions from interrupt context:
-
-pm_request_idle()
-pm_request_autosuspend()
-pm_schedule_suspend()
-pm_request_resume()
-pm_runtime_get_noresume()
-pm_runtime_get()
-pm_runtime_put_noidle()
-pm_runtime_put()
-pm_runtime_put_autosuspend()
-pm_runtime_enable()
-pm_suspend_ignore_children()
-pm_runtime_set_active()
-pm_runtime_set_suspended()
-pm_runtime_suspended()
-pm_runtime_mark_last_busy()
-pm_runtime_autosuspend_expiration()
-
-If pm_runtime_irq_safe() has been called for a device then the following helper
-functions may also be used in interrupt context:
-
-pm_runtime_idle()
-pm_runtime_suspend()
-pm_runtime_autosuspend()
-pm_runtime_resume()
-pm_runtime_get_sync()
-pm_runtime_put_sync()
-pm_runtime_put_sync_suspend()
-pm_runtime_put_sync_autosuspend()
-
-5. Runtime PM Initialization, Device Probing and Removal
-
-Initially, the runtime PM is disabled for all devices, which means that the
-majority of the runtime PM helper functions described in Section 4 will return
--EAGAIN until pm_runtime_enable() is called for the device.
-
-In addition to that, the initial runtime PM status of all devices is
-'suspended', but it need not reflect the actual physical state of the device.
-Thus, if the device is initially active (i.e. it is able to process I/O), its
-runtime PM status must be changed to 'active', with the help of
-pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
-
-However, if the device has a parent and the parent's runtime PM is enabled,
-calling pm_runtime_set_active() for the device will affect the parent, unless
-the parent's 'power.ignore_children' flag is set.  Namely, in that case the
-parent won't be able to suspend at run time, using the PM core's helper
-functions, as long as the child's status is 'active', even if the child's
-runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
-the child yet or pm_runtime_disable() has been called for it).  For this reason,
-once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
-should be called for it too as soon as reasonably possible or its runtime PM
-status should be changed back to 'suspended' with the help of
-pm_runtime_set_suspended().
-
-If the default initial runtime PM status of the device (i.e. 'suspended')
-reflects the actual state of the device, its bus type's or its driver's
-->probe() callback will likely need to wake it up using one of the PM core's
-helper functions described in Section 4.  In that case, pm_runtime_resume()
-should be used.  Of course, for this purpose the device's runtime PM has to be
-enabled earlier by calling pm_runtime_enable().
-
-Note, if the device may execute pm_runtime calls during the probe (such as
-if it is registers with a subsystem that may call back in) then the
-pm_runtime_get_sync() call paired with a pm_runtime_put() call will be
-appropriate to ensure that the device is not put back to sleep during the
-probe. This can happen with systems such as the network device layer.
-
-It may be desirable to suspend the device once ->probe() has finished.
-Therefore the driver core uses the asynchronous pm_request_idle() to submit a
-request to execute the subsystem-level idle callback for the device at that
-time.  A driver that makes use of the runtime autosuspend feature, may want to
-update the last busy mark before returning from ->probe().
-
-Moreover, the driver core prevents runtime PM callbacks from racing with the bus
-notifier callback in __device_release_driver(), which is necessary, because the
-notifier is used by some subsystems to carry out operations affecting the
-runtime PM functionality.  It does so by calling pm_runtime_get_sync() before
-driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications.  This
-resumes the device if it's in the suspended state and prevents it from
-being suspended again while those routines are being executed.
-
-To allow bus types and drivers to put devices into the suspended state by
-calling pm_runtime_suspend() from their ->remove() routines, the driver core
-executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER
-notifications in __device_release_driver().  This requires bus types and
-drivers to make their ->remove() callbacks avoid races with runtime PM directly,
-but also it allows of more flexibility in the handling of devices during the
-removal of their drivers.
-
-Drivers in ->remove() callback should undo the runtime PM changes done
-in ->probe(). Usually this means calling pm_runtime_disable(),
-pm_runtime_dont_use_autosuspend() etc.
-
-The user space can effectively disallow the driver of the device to power manage
-it at run time by changing the value of its /sys/devices/.../power/control
-attribute to "on", which causes pm_runtime_forbid() to be called.  In principle,
-this mechanism may also be used by the driver to effectively turn off the
-runtime power management of the device until the user space turns it on.
-Namely, during the initialization the driver can make sure that the runtime PM
-status of the device is 'active' and call pm_runtime_forbid().  It should be
-noted, however, that if the user space has already intentionally changed the
-value of /sys/devices/.../power/control to "auto" to allow the driver to power
-manage the device at run time, the driver may confuse it by using
-pm_runtime_forbid() this way.
-
-6. Runtime PM and System Sleep
-
-Runtime PM and system sleep (i.e., system suspend and hibernation, also known
-as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of
-ways.  If a device is active when a system sleep starts, everything is
-straightforward.  But what should happen if the device is already suspended?
-
-The device may have different wake-up settings for runtime PM and system sleep.
-For example, remote wake-up may be enabled for runtime suspend but disallowed
-for system sleep (device_may_wakeup(dev) returns 'false').  When this happens,
-the subsystem-level system suspend callback is responsible for changing the
-device's wake-up setting (it may leave that to the device driver's system
-suspend routine).  It may be necessary to resume the device and suspend it again
-in order to do so.  The same is true if the driver uses different power levels
-or other settings for runtime suspend and system sleep.
-
-During system resume, the simplest approach is to bring all devices back to full
-power, even if they had been suspended before the system suspend began.  There
-are several reasons for this, including:
-
-  * The device might need to switch power levels, wake-up settings, etc.
-
-  * Remote wake-up events might have been lost by the firmware.
-
-  * The device's children may need the device to be at full power in order
-    to resume themselves.
-
-  * The driver's idea of the device state may not agree with the device's
-    physical state.  This can happen during resume from hibernation.
-
-  * The device might need to be reset.
-
-  * Even though the device was suspended, if its usage counter was > 0 then most
-    likely it would need a runtime resume in the near future anyway.
-
-If the device had been suspended before the system suspend began and it's
-brought back to full power during resume, then its runtime PM status will have
-to be updated to reflect the actual post-system sleep status.  The way to do
-this is:
-
-	pm_runtime_disable(dev);
-	pm_runtime_set_active(dev);
-	pm_runtime_enable(dev);
-
-The PM core always increments the runtime usage counter before calling the
-->suspend() callback and decrements it after calling the ->resume() callback.
-Hence disabling runtime PM temporarily like this will not cause any runtime
-suspend attempts to be permanently lost.  If the usage count goes to zero
-following the return of the ->resume() callback, the ->runtime_idle() callback
-will be invoked as usual.
-
-On some systems, however, system sleep is not entered through a global firmware
-or hardware operation.  Instead, all hardware components are put into low-power
-states directly by the kernel in a coordinated way.  Then, the system sleep
-state effectively follows from the states the hardware components end up in
-and the system is woken up from that state by a hardware interrupt or a similar
-mechanism entirely under the kernel's control.  As a result, the kernel never
-gives control away and the states of all devices during resume are precisely
-known to it.  If that is the case and none of the situations listed above takes
-place (in particular, if the system is not waking up from hibernation), it may
-be more efficient to leave the devices that had been suspended before the system
-suspend began in the suspended state.
-
-To this end, the PM core provides a mechanism allowing some coordination between
-different levels of device hierarchy.  Namely, if a system suspend .prepare()
-callback returns a positive number for a device, that indicates to the PM core
-that the device appears to be runtime-suspended and its state is fine, so it
-may be left in runtime suspend provided that all of its descendants are also
-left in runtime suspend.  If that happens, the PM core will not execute any
-system suspend and resume callbacks for all of those devices, except for the
-complete callback, which is then entirely responsible for handling the device
-as appropriate.  This only applies to system suspend transitions that are not
-related to hibernation (see Documentation/driver-api/pm/devices.rst for more
-information).
-
-The PM core does its best to reduce the probability of race conditions between
-the runtime PM and system suspend/resume (and hibernation) callbacks by carrying
-out the following operations:
-
-  * During system suspend pm_runtime_get_noresume() is called for every device
-    right before executing the subsystem-level .prepare() callback for it and
-    pm_runtime_barrier() is called for every device right before executing the
-    subsystem-level .suspend() callback for it.  In addition to that the PM core
-    calls  __pm_runtime_disable() with 'false' as the second argument for every
-    device right before executing the subsystem-level .suspend_late() callback
-    for it.
-
-  * During system resume pm_runtime_enable() and pm_runtime_put() are called for
-    every device right after executing the subsystem-level .resume_early()
-    callback and right after executing the subsystem-level .complete() callback
-    for it, respectively.
-
-7. Generic subsystem callbacks
-
-Subsystems may wish to conserve code space by using the set of generic power
-management callbacks provided by the PM core, defined in
-driver/base/power/generic_ops.c:
-
-  int pm_generic_runtime_suspend(struct device *dev);
-    - invoke the ->runtime_suspend() callback provided by the driver of this
-      device and return its result, or return 0 if not defined
-
-  int pm_generic_runtime_resume(struct device *dev);
-    - invoke the ->runtime_resume() callback provided by the driver of this
-      device and return its result, or return 0 if not defined
-
-  int pm_generic_suspend(struct device *dev);
-    - if the device has not been suspended at run time, invoke the ->suspend()
-      callback provided by its driver and return its result, or return 0 if not
-      defined
-
-  int pm_generic_suspend_noirq(struct device *dev);
-    - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq()
-      callback provided by the device's driver and return its result, or return
-      0 if not defined
-
-  int pm_generic_resume(struct device *dev);
-    - invoke the ->resume() callback provided by the driver of this device and,
-      if successful, change the device's runtime PM status to 'active'
-
-  int pm_generic_resume_noirq(struct device *dev);
-    - invoke the ->resume_noirq() callback provided by the driver of this device
-
-  int pm_generic_freeze(struct device *dev);
-    - if the device has not been suspended at run time, invoke the ->freeze()
-      callback provided by its driver and return its result, or return 0 if not
-      defined
-
-  int pm_generic_freeze_noirq(struct device *dev);
-    - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq()
-      callback provided by the device's driver and return its result, or return
-      0 if not defined
-
-  int pm_generic_thaw(struct device *dev);
-    - if the device has not been suspended at run time, invoke the ->thaw()
-      callback provided by its driver and return its result, or return 0 if not
-      defined
-
-  int pm_generic_thaw_noirq(struct device *dev);
-    - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq()
-      callback provided by the device's driver and return its result, or return
-      0 if not defined
-
-  int pm_generic_poweroff(struct device *dev);
-    - if the device has not been suspended at run time, invoke the ->poweroff()
-      callback provided by its driver and return its result, or return 0 if not
-      defined
-
-  int pm_generic_poweroff_noirq(struct device *dev);
-    - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq()
-      callback provided by the device's driver and return its result, or return
-      0 if not defined
-
-  int pm_generic_restore(struct device *dev);
-    - invoke the ->restore() callback provided by the driver of this device and,
-      if successful, change the device's runtime PM status to 'active'
-
-  int pm_generic_restore_noirq(struct device *dev);
-    - invoke the ->restore_noirq() callback provided by the device's driver
-
-These functions are the defaults used by the PM core, if a subsystem doesn't
-provide its own callbacks for ->runtime_idle(), ->runtime_suspend(),
-->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(),
-->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(),
-->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the
-subsystem-level dev_pm_ops structure.
-
-Device drivers that wish to use the same function as a system suspend, freeze,
-poweroff and runtime suspend callback, and similarly for system resume, thaw,
-restore, and runtime resume, can achieve this with the help of the
-UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its
-last argument to NULL).
-
-8. "No-Callback" Devices
-
-Some "devices" are only logical sub-devices of their parent and cannot be
-power-managed on their own.  (The prototype example is a USB interface.  Entire
-USB devices can go into low-power mode or send wake-up requests, but neither is
-possible for individual interfaces.)  The drivers for these devices have no
-need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend()
-and ->runtime_resume() would always return 0 without doing anything else and
-->runtime_idle() would always call pm_runtime_suspend().
-
-Subsystems can tell the PM core about these devices by calling
-pm_runtime_no_callbacks().  This should be done after the device structure is
-initialized and before it is registered (although after device registration is
-also okay).  The routine will set the device's power.no_callbacks flag and
-prevent the non-debugging runtime PM sysfs attributes from being created.
-
-When power.no_callbacks is set, the PM core will not invoke the
-->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks.
-Instead it will assume that suspends and resumes always succeed and that idle
-devices should be suspended.
-
-As a consequence, the PM core will never directly inform the device's subsystem
-or driver about runtime power changes.  Instead, the driver for the device's
-parent must take responsibility for telling the device's driver when the
-parent's power state changes.
-
-9. Autosuspend, or automatically-delayed suspends
-
-Changing a device's power state isn't free; it requires both time and energy.
-A device should be put in a low-power state only when there's some reason to
-think it will remain in that state for a substantial time.  A common heuristic
-says that a device which hasn't been used for a while is liable to remain
-unused; following this advice, drivers should not allow devices to be suspended
-at runtime until they have been inactive for some minimum period.  Even when
-the heuristic ends up being non-optimal, it will still prevent devices from
-"bouncing" too rapidly between low-power and full-power states.
-
-The term "autosuspend" is an historical remnant.  It doesn't mean that the
-device is automatically suspended (the subsystem or driver still has to call
-the appropriate PM routines); rather it means that runtime suspends will
-automatically be delayed until the desired period of inactivity has elapsed.
-
-Inactivity is determined based on the power.last_busy field.  Drivers should
-call pm_runtime_mark_last_busy() to update this field after carrying out I/O,
-typically just before calling pm_runtime_put_autosuspend().  The desired length
-of the inactivity period is a matter of policy.  Subsystems can set this length
-initially by calling pm_runtime_set_autosuspend_delay(), but after device
-registration the length should be controlled by user space, using the
-/sys/devices/.../power/autosuspend_delay_ms attribute.
-
-In order to use autosuspend, subsystems or drivers must call
-pm_runtime_use_autosuspend() (preferably before registering the device), and
-thereafter they should use the various *_autosuspend() helper functions instead
-of the non-autosuspend counterparts:
-
-	Instead of: pm_runtime_suspend    use: pm_runtime_autosuspend;
-	Instead of: pm_schedule_suspend   use: pm_request_autosuspend;
-	Instead of: pm_runtime_put        use: pm_runtime_put_autosuspend;
-	Instead of: pm_runtime_put_sync   use: pm_runtime_put_sync_autosuspend.
-
-Drivers may also continue to use the non-autosuspend helper functions; they
-will behave normally, which means sometimes taking the autosuspend delay into
-account (see pm_runtime_idle).
-
-Under some circumstances a driver or subsystem may want to prevent a device
-from autosuspending immediately, even though the usage counter is zero and the
-autosuspend delay time has expired.  If the ->runtime_suspend() callback
-returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is
-in the future (as it normally would be if the callback invoked
-pm_runtime_mark_last_busy()), the PM core will automatically reschedule the
-autosuspend.  The ->runtime_suspend() callback can't do this rescheduling
-itself because no suspend requests of any kind are accepted while the device is
-suspending (i.e., while the callback is running).
-
-The implementation is well suited for asynchronous use in interrupt contexts.
-However such use inevitably involves races, because the PM core can't
-synchronize ->runtime_suspend() callbacks with the arrival of I/O requests.
-This synchronization must be handled by the driver, using its private lock.
-Here is a schematic pseudo-code example:
-
-	foo_read_or_write(struct foo_priv *foo, void *data)
-	{
-		lock(&foo->private_lock);
-		add_request_to_io_queue(foo, data);
-		if (foo->num_pending_requests++ == 0)
-			pm_runtime_get(&foo->dev);
-		if (!foo->is_suspended)
-			foo_process_next_request(foo);
-		unlock(&foo->private_lock);
-	}
-
-	foo_io_completion(struct foo_priv *foo, void *req)
-	{
-		lock(&foo->private_lock);
-		if (--foo->num_pending_requests == 0) {
-			pm_runtime_mark_last_busy(&foo->dev);
-			pm_runtime_put_autosuspend(&foo->dev);
-		} else {
-			foo_process_next_request(foo);
-		}
-		unlock(&foo->private_lock);
-		/* Send req result back to the user ... */
-	}
-
-	int foo_runtime_suspend(struct device *dev)
-	{
-		struct foo_priv foo = container_of(dev, ...);
-		int ret = 0;
-
-		lock(&foo->private_lock);
-		if (foo->num_pending_requests > 0) {
-			ret = -EBUSY;
-		} else {
-			/* ... suspend the device ... */
-			foo->is_suspended = 1;
-		}
-		unlock(&foo->private_lock);
-		return ret;
-	}
-
-	int foo_runtime_resume(struct device *dev)
-	{
-		struct foo_priv foo = container_of(dev, ...);
-
-		lock(&foo->private_lock);
-		/* ... resume the device ... */
-		foo->is_suspended = 0;
-		pm_runtime_mark_last_busy(&foo->dev);
-		if (foo->num_pending_requests > 0)
-			foo_process_next_request(foo);
-		unlock(&foo->private_lock);
-		return 0;
-	}
-
-The important point is that after foo_io_completion() asks for an autosuspend,
-the foo_runtime_suspend() callback may race with foo_read_or_write().
-Therefore foo_runtime_suspend() has to check whether there are any pending I/O
-requests (while holding the private lock) before allowing the suspend to
-proceed.
-
-In addition, the power.autosuspend_delay field can be changed by user space at
-any time.  If a driver cares about this, it can call
-pm_runtime_autosuspend_expiration() from within the ->runtime_suspend()
-callback while holding its private lock.  If the function returns a nonzero
-value then the delay has not yet expired and the callback should return
--EAGAIN.
diff --git a/Documentation/power/s2ram.rst b/Documentation/power/s2ram.rst
new file mode 100644
index 000000000000..d739aa7c742c
--- /dev/null
+++ b/Documentation/power/s2ram.rst
@@ -0,0 +1,87 @@
+========================
+How to get s2ram working
+========================
+
+2006 Linus Torvalds
+2006 Pavel Machek
+
+1) Check suspend.sf.net, program s2ram there has long whitelist of
+   "known ok" machines, along with tricks to use on each one.
+
+2) If that does not help, try reading tricks.txt and
+   video.txt. Perhaps problem is as simple as broken module, and
+   simple module unload can fix it.
+
+3) You can use Linus' TRACE_RESUME infrastructure, described below.
+
+Using TRACE_RESUME
+~~~~~~~~~~~~~~~~~~
+
+I've been working at making the machines I have able to STR, and almost
+always it's a driver that is buggy. Thank God for the suspend/resume
+debugging - the thing that Chuck tried to disable. That's often the _only_
+way to debug these things, and it's actually pretty powerful (but
+time-consuming - having to insert TRACE_RESUME() markers into the device
+driver that doesn't resume and recompile and reboot).
+
+Anyway, the way to debug this for people who are interested (have a
+machine that doesn't boot) is:
+
+ - enable PM_DEBUG, and PM_TRACE
+
+ - use a script like this::
+
+	#!/bin/sh
+	sync
+	echo 1 > /sys/power/pm_trace
+	echo mem > /sys/power/state
+
+   to suspend
+
+ - if it doesn't come back up (which is usually the problem), reboot by
+   holding the power button down, and look at the dmesg output for things
+   like::
+
+	Magic number: 4:156:725
+	hash matches drivers/base/power/resume.c:28
+	hash matches device 0000:01:00.0
+
+   which means that the last trace event was just before trying to resume
+   device 0000:01:00.0. Then figure out what driver is controlling that
+   device (lspci and /sys/devices/pci* is your friend), and see if you can
+   fix it, disable it, or trace into its resume function.
+
+   If no device matches the hash (or any matches appear to be false positives),
+   the culprit may be a device from a loadable kernel module that is not loaded
+   until after the hash is checked. You can check the hash against the current
+   devices again after more modules are loaded using sysfs::
+
+	cat /sys/power/pm_trace_dev_match
+
+For example, the above happens to be the VGA device on my EVO, which I
+used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out
+that "radeonfb" simply cannot resume that device - it tries to set the
+PLL's, and it just _hangs_. Using the regular VGA console and letting X
+resume it instead works fine.
+
+NOTE
+====
+pm_trace uses the system's Real Time Clock (RTC) to save the magic number.
+Reason for this is that the RTC is the only reliably available piece of
+hardware during resume operations where a value can be set that will
+survive a reboot.
+
+pm_trace is not compatible with asynchronous suspend, so it turns
+asynchronous suspend off (which may work around timing or
+ordering-sensitive bugs).
+
+Consequence is that after a resume (even if it is successful) your system
+clock will have a value corresponding to the magic number instead of the
+correct date/time! It is therefore advisable to use a program like ntp-date
+or rdate to reset the correct date/time from an external time source when
+using this trace option.
+
+As the clock keeps ticking it is also essential that the reboot is done
+quickly after the resume failure. The trace option does not use the seconds
+or the low order bits of the minutes of the RTC, but a too long delay will
+corrupt the magic value.
diff --git a/Documentation/power/s2ram.txt b/Documentation/power/s2ram.txt
deleted file mode 100644
index 4685aee197fd..000000000000
--- a/Documentation/power/s2ram.txt
+++ /dev/null
@@ -1,85 +0,0 @@
-			How to get s2ram working
-			~~~~~~~~~~~~~~~~~~~~~~~~
-			2006 Linus Torvalds
-			2006 Pavel Machek
-
-1) Check suspend.sf.net, program s2ram there has long whitelist of
-   "known ok" machines, along with tricks to use on each one.
-
-2) If that does not help, try reading tricks.txt and
-   video.txt. Perhaps problem is as simple as broken module, and
-   simple module unload can fix it.
-
-3) You can use Linus' TRACE_RESUME infrastructure, described below.
-
-		      Using TRACE_RESUME
-		      ~~~~~~~~~~~~~~~~~~
-
-I've been working at making the machines I have able to STR, and almost
-always it's a driver that is buggy. Thank God for the suspend/resume
-debugging - the thing that Chuck tried to disable. That's often the _only_
-way to debug these things, and it's actually pretty powerful (but
-time-consuming - having to insert TRACE_RESUME() markers into the device
-driver that doesn't resume and recompile and reboot).
-
-Anyway, the way to debug this for people who are interested (have a
-machine that doesn't boot) is:
-
- - enable PM_DEBUG, and PM_TRACE
-
- - use a script like this:
-
-	#!/bin/sh
-	sync
-	echo 1 > /sys/power/pm_trace
-	echo mem > /sys/power/state
-
-   to suspend
-
- - if it doesn't come back up (which is usually the problem), reboot by
-   holding the power button down, and look at the dmesg output for things
-   like
-
-	Magic number: 4:156:725
-	hash matches drivers/base/power/resume.c:28
-	hash matches device 0000:01:00.0
-
-   which means that the last trace event was just before trying to resume
-   device 0000:01:00.0. Then figure out what driver is controlling that
-   device (lspci and /sys/devices/pci* is your friend), and see if you can
-   fix it, disable it, or trace into its resume function.
-
-   If no device matches the hash (or any matches appear to be false positives),
-   the culprit may be a device from a loadable kernel module that is not loaded
-   until after the hash is checked. You can check the hash against the current
-   devices again after more modules are loaded using sysfs:
-
-	cat /sys/power/pm_trace_dev_match
-
-For example, the above happens to be the VGA device on my EVO, which I
-used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out
-that "radeonfb" simply cannot resume that device - it tries to set the
-PLL's, and it just _hangs_. Using the regular VGA console and letting X
-resume it instead works fine.
-
-NOTE
-====
-pm_trace uses the system's Real Time Clock (RTC) to save the magic number.
-Reason for this is that the RTC is the only reliably available piece of
-hardware during resume operations where a value can be set that will
-survive a reboot.
-
-pm_trace is not compatible with asynchronous suspend, so it turns
-asynchronous suspend off (which may work around timing or
-ordering-sensitive bugs).
-
-Consequence is that after a resume (even if it is successful) your system
-clock will have a value corresponding to the magic number instead of the
-correct date/time! It is therefore advisable to use a program like ntp-date
-or rdate to reset the correct date/time from an external time source when
-using this trace option.
-
-As the clock keeps ticking it is also essential that the reboot is done
-quickly after the resume failure. The trace option does not use the seconds
-or the low order bits of the minutes of the RTC, but a too long delay will
-corrupt the magic value.
diff --git a/Documentation/power/suspend-and-cpuhotplug.rst b/Documentation/power/suspend-and-cpuhotplug.rst
new file mode 100644
index 000000000000..7ac8e1f549f4
--- /dev/null
+++ b/Documentation/power/suspend-and-cpuhotplug.rst
@@ -0,0 +1,286 @@
+====================================================================
+Interaction of Suspend code (S3) with the CPU hotplug infrastructure
+====================================================================
+
+(C) 2011 - 2014 Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
+
+
+I. Differences between CPU hotplug and Suspend-to-RAM
+======================================================
+
+How does the regular CPU hotplug code differ from how the Suspend-to-RAM
+infrastructure uses it internally? And where do they share common code?
+
+Well, a picture is worth a thousand words... So ASCII art follows :-)
+
+[This depicts the current design in the kernel, and focusses only on the
+interactions involving the freezer and CPU hotplug and also tries to explain
+the locking involved. It outlines the notifications involved as well.
+But please note that here, only the call paths are illustrated, with the aim
+of describing where they take different paths and where they share code.
+What happens when regular CPU hotplug and Suspend-to-RAM race with each other
+is not depicted here.]
+
+On a high level, the suspend-resume cycle goes like this::
+
+  |Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw |
+  |tasks |    |     cpus      |    |          |    |     cpus     |    |tasks|
+
+
+More details follow::
+
+                                Suspend call path
+                                -----------------
+
+                                  Write 'mem' to
+                                /sys/power/state
+                                    sysfs file
+                                        |
+                                        v
+                               Acquire system_transition_mutex lock
+                                        |
+                                        v
+                             Send PM_SUSPEND_PREPARE
+                                   notifications
+                                        |
+                                        v
+                                   Freeze tasks
+                                        |
+                                        |
+                                        v
+                              disable_nonboot_cpus()
+                                   /* start */
+                                        |
+                                        v
+                            Acquire cpu_add_remove_lock
+                                        |
+                                        v
+                             Iterate over CURRENTLY
+                                   online CPUs
+                                        |
+                                        |
+                                        |                ----------
+                                        v                          | L
+             ======>               _cpu_down()                     |
+            |              [This takes cpuhotplug.lock             |
+  Common    |               before taking down the CPU             |
+   code     |               and releases it when done]             | O
+            |            While it is at it, notifications          |
+            |            are sent when notable events occur,       |
+             ======>     by running all registered callbacks.      |
+                                        |                          | O
+                                        |                          |
+                                        |                          |
+                                        v                          |
+                            Note down these cpus in                | P
+                                frozen_cpus mask         ----------
+                                        |
+                                        v
+                           Disable regular cpu hotplug
+                        by increasing cpu_hotplug_disabled
+                                        |
+                                        v
+                            Release cpu_add_remove_lock
+                                        |
+                                        v
+                       /* disable_nonboot_cpus() complete */
+                                        |
+                                        v
+                                   Do suspend
+
+
+
+Resuming back is likewise, with the counterparts being (in the order of
+execution during resume):
+
+* enable_nonboot_cpus() which involves::
+
+   |  Acquire cpu_add_remove_lock
+   |  Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug
+   |  Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop]
+   |  Release cpu_add_remove_lock
+   v
+
+* thaw tasks
+* send PM_POST_SUSPEND notifications
+* Release system_transition_mutex lock.
+
+
+It is to be noted here that the system_transition_mutex lock is acquired at the very
+beginning, when we are just starting out to suspend, and then released only
+after the entire cycle is complete (i.e., suspend + resume).
+
+::
+
+
+
+                          Regular CPU hotplug call path
+                          -----------------------------
+
+                                Write 0 (or 1) to
+                       /sys/devices/system/cpu/cpu*/online
+                                    sysfs file
+                                        |
+                                        |
+                                        v
+                                    cpu_down()
+                                        |
+                                        v
+                           Acquire cpu_add_remove_lock
+                                        |
+                                        v
+                          If cpu_hotplug_disabled > 0
+                                return gracefully
+                                        |
+                                        |
+                                        v
+             ======>                _cpu_down()
+            |              [This takes cpuhotplug.lock
+  Common    |               before taking down the CPU
+   code     |               and releases it when done]
+            |            While it is at it, notifications
+            |           are sent when notable events occur,
+             ======>    by running all registered callbacks.
+                                        |
+                                        |
+                                        v
+                          Release cpu_add_remove_lock
+                               [That's it!, for
+                              regular CPU hotplug]
+
+
+
+So, as can be seen from the two diagrams (the parts marked as "Common code"),
+regular CPU hotplug and the suspend code path converge at the _cpu_down() and
+_cpu_up() functions. They differ in the arguments passed to these functions,
+in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen'
+argument. But during suspend, since the tasks are already frozen by the time
+the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called
+with the 'tasks_frozen' argument set to 1.
+[See below for some known issues regarding this.]
+
+
+Important files and functions/entry points:
+-------------------------------------------
+
+- kernel/power/process.c : freeze_processes(), thaw_processes()
+- kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish()
+- kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus()
+
+
+
+II. What are the issues involved in CPU hotplug?
+------------------------------------------------
+
+There are some interesting situations involving CPU hotplug and microcode
+update on the CPUs, as discussed below:
+
+[Please bear in mind that the kernel requests the microcode images from
+userspace, using the request_firmware() function defined in
+drivers/base/firmware_loader/main.c]
+
+
+a. When all the CPUs are identical:
+
+   This is the most common situation and it is quite straightforward: we want
+   to apply the same microcode revision to each of the CPUs.
+   To give an example of x86, the collect_cpu_info() function defined in
+   arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU
+   and thereby in applying the correct microcode revision to it.
+   But note that the kernel does not maintain a common microcode image for the
+   all CPUs, in order to handle case 'b' described below.
+
+
+b. When some of the CPUs are different than the rest:
+
+   In this case since we probably need to apply different microcode revisions
+   to different CPUs, the kernel maintains a copy of the correct microcode
+   image for each CPU (after appropriate CPU type/model discovery using
+   functions such as collect_cpu_info()).
+
+
+c. When a CPU is physically hot-unplugged and a new (and possibly different
+   type of) CPU is hot-plugged into the system:
+
+   In the current design of the kernel, whenever a CPU is taken offline during
+   a regular CPU hotplug operation, upon receiving the CPU_DEAD notification
+   (which is sent by the CPU hotplug code), the microcode update driver's
+   callback for that event reacts by freeing the kernel's copy of the
+   microcode image for that CPU.
+
+   Hence, when a new CPU is brought online, since the kernel finds that it
+   doesn't have the microcode image, it does the CPU type/model discovery
+   afresh and then requests the userspace for the appropriate microcode image
+   for that CPU, which is subsequently applied.
+
+   For example, in x86, the mc_cpu_callback() function (which is the microcode
+   update driver's callback registered for CPU hotplug events) calls
+   microcode_update_cpu() which would call microcode_init_cpu() in this case,
+   instead of microcode_resume_cpu() when it finds that the kernel doesn't
+   have a valid microcode image. This ensures that the CPU type/model
+   discovery is performed and the right microcode is applied to the CPU after
+   getting it from userspace.
+
+
+d. Handling microcode update during suspend/hibernate:
+
+   Strictly speaking, during a CPU hotplug operation which does not involve
+   physically removing or inserting CPUs, the CPUs are not actually powered
+   off during a CPU offline. They are just put to the lowest C-states possible.
+   Hence, in such a case, it is not really necessary to re-apply microcode
+   when the CPUs are brought back online, since they wouldn't have lost the
+   image during the CPU offline operation.
+
+   This is the usual scenario encountered during a resume after a suspend.
+   However, in the case of hibernation, since all the CPUs are completely
+   powered off, during restore it becomes necessary to apply the microcode
+   images to all the CPUs.
+
+   [Note that we don't expect someone to physically pull out nodes and insert
+   nodes with a different type of CPUs in-between a suspend-resume or a
+   hibernate/restore cycle.]
+
+   In the current design of the kernel however, during a CPU offline operation
+   as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set),
+   the existing copy of microcode image in the kernel is not freed up.
+   And during the CPU online operations (during resume/restore), since the
+   kernel finds that it already has copies of the microcode images for all the
+   CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU
+   type/model and the need for validating whether the microcode revisions are
+   right for the CPUs or not (due to the above assumption that physical CPU
+   hotplug will not be done in-between suspend/resume or hibernate/restore
+   cycles).
+
+
+III. Known problems
+===================
+
+Are there any known problems when regular CPU hotplug and suspend race
+with each other?
+
+Yes, they are listed below:
+
+1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to
+   the _cpu_down() and _cpu_up() functions is *always* 0.
+   This might not reflect the true current state of the system, since the
+   tasks could have been frozen by an out-of-band event such as a suspend
+   operation in progress. Hence, the cpuhp_tasks_frozen variable will not
+   reflect the frozen state and the CPU hotplug callbacks which evaluate
+   that variable might execute the wrong code path.
+
+2. If a regular CPU hotplug stress test happens to race with the freezer due
+   to a suspend operation in progress at the same time, then we could hit the
+   situation described below:
+
+    * A regular cpu online operation continues its journey from userspace
+      into the kernel, since the freezing has not yet begun.
+    * Then freezer gets to work and freezes userspace.
+    * If cpu online has not yet completed the microcode update stuff by now,
+      it will now start waiting on the frozen userspace in the
+      TASK_UNINTERRUPTIBLE state, in order to get the microcode image.
+    * Now the freezer continues and tries to freeze the remaining tasks. But
+      due to this wait mentioned above, the freezer won't be able to freeze
+      the cpu online hotplug task and hence freezing of tasks fails.
+
+   As a result of this task freezing failure, the suspend operation gets
+   aborted.
diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt
deleted file mode 100644
index a8751b8df10e..000000000000
--- a/Documentation/power/suspend-and-cpuhotplug.txt
+++ /dev/null
@@ -1,274 +0,0 @@
-Interaction of Suspend code (S3) with the CPU hotplug infrastructure
-
-     (C) 2011 - 2014 Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
-
-
-I. How does the regular CPU hotplug code differ from how the Suspend-to-RAM
-   infrastructure uses it internally? And where do they share common code?
-
-Well, a picture is worth a thousand words... So ASCII art follows :-)
-
-[This depicts the current design in the kernel, and focusses only on the
-interactions involving the freezer and CPU hotplug and also tries to explain
-the locking involved. It outlines the notifications involved as well.
-But please note that here, only the call paths are illustrated, with the aim
-of describing where they take different paths and where they share code.
-What happens when regular CPU hotplug and Suspend-to-RAM race with each other
-is not depicted here.]
-
-On a high level, the suspend-resume cycle goes like this:
-
-|Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw |
-|tasks |    |     cpus      |    |          |    |     cpus     |    |tasks|
-
-
-More details follow:
-
-                                Suspend call path
-                                -----------------
-
-                                  Write 'mem' to
-                                /sys/power/state
-                                    sysfs file
-                                        |
-                                        v
-                               Acquire system_transition_mutex lock
-                                        |
-                                        v
-                             Send PM_SUSPEND_PREPARE
-                                   notifications
-                                        |
-                                        v
-                                   Freeze tasks
-                                        |
-                                        |
-                                        v
-                              disable_nonboot_cpus()
-                                   /* start */
-                                        |
-                                        v
-                            Acquire cpu_add_remove_lock
-                                        |
-                                        v
-                             Iterate over CURRENTLY
-                                   online CPUs
-                                        |
-                                        |
-                                        |                ----------
-                                        v                          | L
-             ======>               _cpu_down()                     |
-            |              [This takes cpuhotplug.lock             |
-  Common    |               before taking down the CPU             |
-   code     |               and releases it when done]             | O
-            |            While it is at it, notifications          |
-            |            are sent when notable events occur,       |
-             ======>     by running all registered callbacks.      |
-                                        |                          | O
-                                        |                          |
-                                        |                          |
-                                        v                          |
-                            Note down these cpus in                | P
-                                frozen_cpus mask         ----------
-                                        |
-                                        v
-                           Disable regular cpu hotplug
-                        by increasing cpu_hotplug_disabled
-                                        |
-                                        v
-                            Release cpu_add_remove_lock
-                                        |
-                                        v
-                       /* disable_nonboot_cpus() complete */
-                                        |
-                                        v
-                                   Do suspend
-
-
-
-Resuming back is likewise, with the counterparts being (in the order of
-execution during resume):
-* enable_nonboot_cpus() which involves:
-   |  Acquire cpu_add_remove_lock
-   |  Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug
-   |  Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop]
-   |  Release cpu_add_remove_lock
-   v
-
-* thaw tasks
-* send PM_POST_SUSPEND notifications
-* Release system_transition_mutex lock.
-
-
-It is to be noted here that the system_transition_mutex lock is acquired at the very
-beginning, when we are just starting out to suspend, and then released only
-after the entire cycle is complete (i.e., suspend + resume).
-
-
-
-                          Regular CPU hotplug call path
-                          -----------------------------
-
-                                Write 0 (or 1) to
-                       /sys/devices/system/cpu/cpu*/online
-                                    sysfs file
-                                        |
-                                        |
-                                        v
-                                    cpu_down()
-                                        |
-                                        v
-                           Acquire cpu_add_remove_lock
-                                        |
-                                        v
-                          If cpu_hotplug_disabled > 0
-                                return gracefully
-                                        |
-                                        |
-                                        v
-             ======>                _cpu_down()
-            |              [This takes cpuhotplug.lock
-  Common    |               before taking down the CPU
-   code     |               and releases it when done]
-            |            While it is at it, notifications
-            |           are sent when notable events occur,
-             ======>    by running all registered callbacks.
-                                        |
-                                        |
-                                        v
-                          Release cpu_add_remove_lock
-                               [That's it!, for
-                              regular CPU hotplug]
-
-
-
-So, as can be seen from the two diagrams (the parts marked as "Common code"),
-regular CPU hotplug and the suspend code path converge at the _cpu_down() and
-_cpu_up() functions. They differ in the arguments passed to these functions,
-in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen'
-argument. But during suspend, since the tasks are already frozen by the time
-the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called
-with the 'tasks_frozen' argument set to 1.
-[See below for some known issues regarding this.]
-
-
-Important files and functions/entry points:
-------------------------------------------
-
-kernel/power/process.c : freeze_processes(), thaw_processes()
-kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish()
-kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus()
-
-
-
-II. What are the issues involved in CPU hotplug?
-    -------------------------------------------
-
-There are some interesting situations involving CPU hotplug and microcode
-update on the CPUs, as discussed below:
-
-[Please bear in mind that the kernel requests the microcode images from
-userspace, using the request_firmware() function defined in
-drivers/base/firmware_loader/main.c]
-
-
-a. When all the CPUs are identical:
-
-   This is the most common situation and it is quite straightforward: we want
-   to apply the same microcode revision to each of the CPUs.
-   To give an example of x86, the collect_cpu_info() function defined in
-   arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU
-   and thereby in applying the correct microcode revision to it.
-   But note that the kernel does not maintain a common microcode image for the
-   all CPUs, in order to handle case 'b' described below.
-
-
-b. When some of the CPUs are different than the rest:
-
-   In this case since we probably need to apply different microcode revisions
-   to different CPUs, the kernel maintains a copy of the correct microcode
-   image for each CPU (after appropriate CPU type/model discovery using
-   functions such as collect_cpu_info()).
-
-
-c. When a CPU is physically hot-unplugged and a new (and possibly different
-   type of) CPU is hot-plugged into the system:
-
-   In the current design of the kernel, whenever a CPU is taken offline during
-   a regular CPU hotplug operation, upon receiving the CPU_DEAD notification
-   (which is sent by the CPU hotplug code), the microcode update driver's
-   callback for that event reacts by freeing the kernel's copy of the
-   microcode image for that CPU.
-
-   Hence, when a new CPU is brought online, since the kernel finds that it
-   doesn't have the microcode image, it does the CPU type/model discovery
-   afresh and then requests the userspace for the appropriate microcode image
-   for that CPU, which is subsequently applied.
-
-   For example, in x86, the mc_cpu_callback() function (which is the microcode
-   update driver's callback registered for CPU hotplug events) calls
-   microcode_update_cpu() which would call microcode_init_cpu() in this case,
-   instead of microcode_resume_cpu() when it finds that the kernel doesn't
-   have a valid microcode image. This ensures that the CPU type/model
-   discovery is performed and the right microcode is applied to the CPU after
-   getting it from userspace.
-
-
-d. Handling microcode update during suspend/hibernate:
-
-   Strictly speaking, during a CPU hotplug operation which does not involve
-   physically removing or inserting CPUs, the CPUs are not actually powered
-   off during a CPU offline. They are just put to the lowest C-states possible.
-   Hence, in such a case, it is not really necessary to re-apply microcode
-   when the CPUs are brought back online, since they wouldn't have lost the
-   image during the CPU offline operation.
-
-   This is the usual scenario encountered during a resume after a suspend.
-   However, in the case of hibernation, since all the CPUs are completely
-   powered off, during restore it becomes necessary to apply the microcode
-   images to all the CPUs.
-
-   [Note that we don't expect someone to physically pull out nodes and insert
-   nodes with a different type of CPUs in-between a suspend-resume or a
-   hibernate/restore cycle.]
-
-   In the current design of the kernel however, during a CPU offline operation
-   as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set),
-   the existing copy of microcode image in the kernel is not freed up.
-   And during the CPU online operations (during resume/restore), since the
-   kernel finds that it already has copies of the microcode images for all the
-   CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU
-   type/model and the need for validating whether the microcode revisions are
-   right for the CPUs or not (due to the above assumption that physical CPU
-   hotplug will not be done in-between suspend/resume or hibernate/restore
-   cycles).
-
-
-III. Are there any known problems when regular CPU hotplug and suspend race
-     with each other?
-
-Yes, they are listed below:
-
-1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to
-   the _cpu_down() and _cpu_up() functions is *always* 0.
-   This might not reflect the true current state of the system, since the
-   tasks could have been frozen by an out-of-band event such as a suspend
-   operation in progress. Hence, the cpuhp_tasks_frozen variable will not
-   reflect the frozen state and the CPU hotplug callbacks which evaluate
-   that variable might execute the wrong code path.
-
-2. If a regular CPU hotplug stress test happens to race with the freezer due
-   to a suspend operation in progress at the same time, then we could hit the
-   situation described below:
-
-    * A regular cpu online operation continues its journey from userspace
-      into the kernel, since the freezing has not yet begun.
-    * Then freezer gets to work and freezes userspace.
-    * If cpu online has not yet completed the microcode update stuff by now,
-      it will now start waiting on the frozen userspace in the
-      TASK_UNINTERRUPTIBLE state, in order to get the microcode image.
-    * Now the freezer continues and tries to freeze the remaining tasks. But
-      due to this wait mentioned above, the freezer won't be able to freeze
-      the cpu online hotplug task and hence freezing of tasks fails.
-
-   As a result of this task freezing failure, the suspend operation gets
-   aborted.
diff --git a/Documentation/power/suspend-and-interrupts.rst b/Documentation/power/suspend-and-interrupts.rst
new file mode 100644
index 000000000000..4cda6617709a
--- /dev/null
+++ b/Documentation/power/suspend-and-interrupts.rst
@@ -0,0 +1,137 @@
+====================================
+System Suspend and Device Interrupts
+====================================
+
+Copyright (C) 2014 Intel Corp.
+Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+
+Suspending and Resuming Device IRQs
+-----------------------------------
+
+Device interrupt request lines (IRQs) are generally disabled during system
+suspend after the "late" phase of suspending devices (that is, after all of the
+->prepare, ->suspend and ->suspend_late callbacks have been executed for all
+devices).  That is done by suspend_device_irqs().
+
+The rationale for doing so is that after the "late" phase of device suspend
+there is no legitimate reason why any interrupts from suspended devices should
+trigger and if any devices have not been suspended properly yet, it is better to
+block interrupts from them anyway.  Also, in the past we had problems with
+interrupt handlers for shared IRQs that device drivers implementing them were
+not prepared for interrupts triggering after their devices had been suspended.
+In some cases they would attempt to access, for example, memory address spaces
+of suspended devices and cause unpredictable behavior to ensue as a result.
+Unfortunately, such problems are very difficult to debug and the introduction
+of suspend_device_irqs(), along with the "noirq" phase of device suspend and
+resume, was the only practical way to mitigate them.
+
+Device IRQs are re-enabled during system resume, right before the "early" phase
+of resuming devices (that is, before starting to execute ->resume_early
+callbacks for devices).  The function doing that is resume_device_irqs().
+
+
+The IRQF_NO_SUSPEND Flag
+------------------------
+
+There are interrupts that can legitimately trigger during the entire system
+suspend-resume cycle, including the "noirq" phases of suspending and resuming
+devices as well as during the time when nonboot CPUs are taken offline and
+brought back online.  That applies to timer interrupts in the first place,
+but also to IPIs and to some other special-purpose interrupts.
+
+The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when
+requesting a special-purpose interrupt.  It causes suspend_device_irqs() to
+leave the corresponding IRQ enabled so as to allow the interrupt to work as
+expected during the suspend-resume cycle, but does not guarantee that the
+interrupt will wake the system from a suspended state -- for such cases it is
+necessary to use enable_irq_wake().
+
+Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one
+user of it.  Thus, if the IRQ is shared, all of the interrupt handlers installed
+for it will be executed as usual after suspend_device_irqs(), even if the
+IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of
+the IRQ's users.  For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the
+same time should be avoided.
+
+
+System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake()
+------------------------------------------------------------------
+
+System wakeup interrupts generally need to be configured to wake up the system
+from sleep states, especially if they are used for different purposes (e.g. as
+I/O interrupts) in the working state.
+
+That may involve turning on a special signal handling logic within the platform
+(such as an SoC) so that signals from a given line are routed in a different way
+during system sleep so as to trigger a system wakeup when needed.  For example,
+the platform may include a dedicated interrupt controller used specifically for
+handling system wakeup events.  Then, if a given interrupt line is supposed to
+wake up the system from sleep sates, the corresponding input of that interrupt
+controller needs to be enabled to receive signals from the line in question.
+After wakeup, it generally is better to disable that input to prevent the
+dedicated controller from triggering interrupts unnecessarily.
+
+The IRQ subsystem provides two helper functions to be used by device drivers for
+those purposes.  Namely, enable_irq_wake() turns on the platform's logic for
+handling the given IRQ as a system wakeup interrupt line and disable_irq_wake()
+turns that logic off.
+
+Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ
+in a special way.  Namely, the IRQ remains enabled, by on the first interrupt
+it will be disabled, marked as pending and "suspended" so that it will be
+re-enabled by resume_device_irqs() during the subsequent system resume.  Also
+the PM core is notified about the event which causes the system suspend in
+progress to be aborted (that doesn't have to happen immediately, but at one
+of the points where the suspend thread looks for pending wakeup events).
+
+This way every interrupt from a wakeup interrupt source will either cause the
+system suspend currently in progress to be aborted or wake up the system if
+already suspended.  However, after suspend_device_irqs() interrupt handlers are
+not executed for system wakeup IRQs.  They are only executed for IRQF_NO_SUSPEND
+IRQs at that time, but those IRQs should not be configured for system wakeup
+using enable_irq_wake().
+
+
+Interrupts and Suspend-to-Idle
+------------------------------
+
+Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new
+system sleep state that works by idling all of the processors and waiting for
+interrupts right after the "noirq" phase of suspending devices.
+
+Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag
+set will bring CPUs out of idle while in that state, but they will not cause the
+IRQ subsystem to trigger a system wakeup.
+
+System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in
+analogy with what they do in the full system suspend case.  The only difference
+is that the wakeup from suspend-to-idle is signaled using the usual working
+state interrupt delivery mechanisms and doesn't require the platform to use
+any special interrupt handling logic for it to work.
+
+
+IRQF_NO_SUSPEND and enable_irq_wake()
+-------------------------------------
+
+There are very few valid reasons to use both enable_irq_wake() and the
+IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the
+same device.
+
+First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND
+interrupts (interrupt handlers are invoked after suspend_device_irqs()) are
+directly at odds with the rules for handling system wakeup interrupts (interrupt
+handlers are not invoked after suspend_device_irqs()).
+
+Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not
+to individual interrupt handlers, so sharing an IRQ between a system wakeup
+interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally
+make sense.
+
+In rare cases an IRQ can be shared between a wakeup device driver and an
+IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver
+must be able to discern spurious IRQs from genuine wakeup events (signalling
+the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to
+ensure that the IRQ will function as a wakeup source, and must request the IRQ
+with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If
+these requirements are not met, it is not valid to use IRQF_COND_SUSPEND.
diff --git a/Documentation/power/suspend-and-interrupts.txt b/Documentation/power/suspend-and-interrupts.txt
deleted file mode 100644
index 8afb29a8604a..000000000000
--- a/Documentation/power/suspend-and-interrupts.txt
+++ /dev/null
@@ -1,135 +0,0 @@
-System Suspend and Device Interrupts
-
-Copyright (C) 2014 Intel Corp.
-Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-
-Suspending and Resuming Device IRQs
------------------------------------
-
-Device interrupt request lines (IRQs) are generally disabled during system
-suspend after the "late" phase of suspending devices (that is, after all of the
-->prepare, ->suspend and ->suspend_late callbacks have been executed for all
-devices).  That is done by suspend_device_irqs().
-
-The rationale for doing so is that after the "late" phase of device suspend
-there is no legitimate reason why any interrupts from suspended devices should
-trigger and if any devices have not been suspended properly yet, it is better to
-block interrupts from them anyway.  Also, in the past we had problems with
-interrupt handlers for shared IRQs that device drivers implementing them were
-not prepared for interrupts triggering after their devices had been suspended.
-In some cases they would attempt to access, for example, memory address spaces
-of suspended devices and cause unpredictable behavior to ensue as a result.
-Unfortunately, such problems are very difficult to debug and the introduction
-of suspend_device_irqs(), along with the "noirq" phase of device suspend and
-resume, was the only practical way to mitigate them.
-
-Device IRQs are re-enabled during system resume, right before the "early" phase
-of resuming devices (that is, before starting to execute ->resume_early
-callbacks for devices).  The function doing that is resume_device_irqs().
-
-
-The IRQF_NO_SUSPEND Flag
-------------------------
-
-There are interrupts that can legitimately trigger during the entire system
-suspend-resume cycle, including the "noirq" phases of suspending and resuming
-devices as well as during the time when nonboot CPUs are taken offline and
-brought back online.  That applies to timer interrupts in the first place,
-but also to IPIs and to some other special-purpose interrupts.
-
-The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when
-requesting a special-purpose interrupt.  It causes suspend_device_irqs() to
-leave the corresponding IRQ enabled so as to allow the interrupt to work as
-expected during the suspend-resume cycle, but does not guarantee that the
-interrupt will wake the system from a suspended state -- for such cases it is
-necessary to use enable_irq_wake().
-
-Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one
-user of it.  Thus, if the IRQ is shared, all of the interrupt handlers installed
-for it will be executed as usual after suspend_device_irqs(), even if the
-IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of
-the IRQ's users.  For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the
-same time should be avoided.
-
-
-System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake()
-------------------------------------------------------------------
-
-System wakeup interrupts generally need to be configured to wake up the system
-from sleep states, especially if they are used for different purposes (e.g. as
-I/O interrupts) in the working state.
-
-That may involve turning on a special signal handling logic within the platform
-(such as an SoC) so that signals from a given line are routed in a different way
-during system sleep so as to trigger a system wakeup when needed.  For example,
-the platform may include a dedicated interrupt controller used specifically for
-handling system wakeup events.  Then, if a given interrupt line is supposed to
-wake up the system from sleep sates, the corresponding input of that interrupt
-controller needs to be enabled to receive signals from the line in question.
-After wakeup, it generally is better to disable that input to prevent the
-dedicated controller from triggering interrupts unnecessarily.
-
-The IRQ subsystem provides two helper functions to be used by device drivers for
-those purposes.  Namely, enable_irq_wake() turns on the platform's logic for
-handling the given IRQ as a system wakeup interrupt line and disable_irq_wake()
-turns that logic off.
-
-Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ
-in a special way.  Namely, the IRQ remains enabled, by on the first interrupt
-it will be disabled, marked as pending and "suspended" so that it will be
-re-enabled by resume_device_irqs() during the subsequent system resume.  Also
-the PM core is notified about the event which causes the system suspend in
-progress to be aborted (that doesn't have to happen immediately, but at one
-of the points where the suspend thread looks for pending wakeup events).
-
-This way every interrupt from a wakeup interrupt source will either cause the
-system suspend currently in progress to be aborted or wake up the system if
-already suspended.  However, after suspend_device_irqs() interrupt handlers are
-not executed for system wakeup IRQs.  They are only executed for IRQF_NO_SUSPEND
-IRQs at that time, but those IRQs should not be configured for system wakeup
-using enable_irq_wake().
-
-
-Interrupts and Suspend-to-Idle
-------------------------------
-
-Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new
-system sleep state that works by idling all of the processors and waiting for
-interrupts right after the "noirq" phase of suspending devices.
-
-Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag
-set will bring CPUs out of idle while in that state, but they will not cause the
-IRQ subsystem to trigger a system wakeup.
-
-System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in
-analogy with what they do in the full system suspend case.  The only difference
-is that the wakeup from suspend-to-idle is signaled using the usual working
-state interrupt delivery mechanisms and doesn't require the platform to use
-any special interrupt handling logic for it to work.
-
-
-IRQF_NO_SUSPEND and enable_irq_wake()
--------------------------------------
-
-There are very few valid reasons to use both enable_irq_wake() and the
-IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the
-same device.
-
-First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND
-interrupts (interrupt handlers are invoked after suspend_device_irqs()) are
-directly at odds with the rules for handling system wakeup interrupts (interrupt
-handlers are not invoked after suspend_device_irqs()).
-
-Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not
-to individual interrupt handlers, so sharing an IRQ between a system wakeup
-interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally
-make sense.
-
-In rare cases an IRQ can be shared between a wakeup device driver and an
-IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver
-must be able to discern spurious IRQs from genuine wakeup events (signalling
-the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to
-ensure that the IRQ will function as a wakeup source, and must request the IRQ
-with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If
-these requirements are not met, it is not valid to use IRQF_COND_SUSPEND.
diff --git a/Documentation/power/swsusp-and-swap-files.rst b/Documentation/power/swsusp-and-swap-files.rst
new file mode 100644
index 000000000000..a33a2919dbe4
--- /dev/null
+++ b/Documentation/power/swsusp-and-swap-files.rst
@@ -0,0 +1,63 @@
+===============================================
+Using swap files with software suspend (swsusp)
+===============================================
+
+	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+The Linux kernel handles swap files almost in the same way as it handles swap
+partitions and there are only two differences between these two types of swap
+areas:
+(1) swap files need not be contiguous,
+(2) the header of a swap file is not in the first block of the partition that
+holds it.  From the swsusp's point of view (1) is not a problem, because it is
+already taken care of by the swap-handling code, but (2) has to be taken into
+consideration.
+
+In principle the location of a swap file's header may be determined with the
+help of appropriate filesystem driver.  Unfortunately, however, it requires the
+filesystem holding the swap file to be mounted, and if this filesystem is
+journaled, it cannot be mounted during resume from disk.  For this reason to
+identify a swap file swsusp uses the name of the partition that holds the file
+and the offset from the beginning of the partition at which the swap file's
+header is located.  For convenience, this offset is expressed in <PAGE_SIZE>
+units.
+
+In order to use a swap file with swsusp, you need to:
+
+1) Create the swap file and make it active, eg.::
+
+    # dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
+    # mkswap <swap_file_path>
+    # swapon <swap_file_path>
+
+2) Use an application that will bmap the swap file with the help of the
+FIBMAP ioctl and determine the location of the file's swap header, as the
+offset, in <PAGE_SIZE> units, from the beginning of the partition which
+holds the swap file.
+
+3) Add the following parameters to the kernel command line::
+
+    resume=<swap_file_partition> resume_offset=<swap_file_offset>
+
+where <swap_file_partition> is the partition on which the swap file is located
+and <swap_file_offset> is the offset of the swap header determined by the
+application in 2) (of course, this step may be carried out automatically
+by the same application that determines the swap file's header offset using the
+FIBMAP ioctl)
+
+OR
+
+Use a userland suspend application that will set the partition and offset
+with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
+Documentation/power/userland-swsusp.rst (this is the only method to suspend
+to a swap file allowing the resume to be initiated from an initrd or initramfs
+image).
+
+Now, swsusp will use the swap file in the same way in which it would use a swap
+partition.  In particular, the swap file has to be active (ie. be present in
+/proc/swaps) so that it can be used for suspending.
+
+Note that if the swap file used for suspending is deleted and recreated,
+the location of its header need not be the same as before.  Thus every time
+this happens the value of the "resume_offset=" kernel command line parameter
+has to be updated.
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
deleted file mode 100644
index f281886de490..000000000000
--- a/Documentation/power/swsusp-and-swap-files.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-Using swap files with software suspend (swsusp)
-	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
-
-The Linux kernel handles swap files almost in the same way as it handles swap
-partitions and there are only two differences between these two types of swap
-areas:
-(1) swap files need not be contiguous,
-(2) the header of a swap file is not in the first block of the partition that
-holds it.  From the swsusp's point of view (1) is not a problem, because it is
-already taken care of by the swap-handling code, but (2) has to be taken into
-consideration.
-
-In principle the location of a swap file's header may be determined with the
-help of appropriate filesystem driver.  Unfortunately, however, it requires the
-filesystem holding the swap file to be mounted, and if this filesystem is
-journaled, it cannot be mounted during resume from disk.  For this reason to
-identify a swap file swsusp uses the name of the partition that holds the file
-and the offset from the beginning of the partition at which the swap file's
-header is located.  For convenience, this offset is expressed in <PAGE_SIZE>
-units.
-
-In order to use a swap file with swsusp, you need to:
-
-1) Create the swap file and make it active, eg.
-
-# dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
-# mkswap <swap_file_path>
-# swapon <swap_file_path>
-
-2) Use an application that will bmap the swap file with the help of the
-FIBMAP ioctl and determine the location of the file's swap header, as the
-offset, in <PAGE_SIZE> units, from the beginning of the partition which
-holds the swap file.
-
-3) Add the following parameters to the kernel command line:
-
-resume=<swap_file_partition> resume_offset=<swap_file_offset>
-
-where <swap_file_partition> is the partition on which the swap file is located
-and <swap_file_offset> is the offset of the swap header determined by the
-application in 2) (of course, this step may be carried out automatically
-by the same application that determines the swap file's header offset using the
-FIBMAP ioctl)
-
-OR
-
-Use a userland suspend application that will set the partition and offset
-with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
-Documentation/power/userland-swsusp.txt (this is the only method to suspend
-to a swap file allowing the resume to be initiated from an initrd or initramfs
-image).
-
-Now, swsusp will use the swap file in the same way in which it would use a swap
-partition.  In particular, the swap file has to be active (ie. be present in
-/proc/swaps) so that it can be used for suspending.
-
-Note that if the swap file used for suspending is deleted and recreated,
-the location of its header need not be the same as before.  Thus every time
-this happens the value of the "resume_offset=" kernel command line parameter
-has to be updated.
diff --git a/Documentation/power/swsusp-dmcrypt.rst b/Documentation/power/swsusp-dmcrypt.rst
new file mode 100644
index 000000000000..426df59172cd
--- /dev/null
+++ b/Documentation/power/swsusp-dmcrypt.rst
@@ -0,0 +1,140 @@
+=======================================
+How to use dm-crypt and swsusp together
+=======================================
+
+Author: Andreas Steinmetz <ast@domdv.de>
+
+
+
+Some prerequisites:
+You know how dm-crypt works. If not, visit the following web page:
+http://www.saout.de/misc/dm-crypt/
+You have read Documentation/power/swsusp.rst and understand it.
+You did read Documentation/admin-guide/initrd.rst and know how an initrd works.
+You know how to create or how to modify an initrd.
+
+Now your system is properly set up, your disk is encrypted except for
+the swap device(s) and the boot partition which may contain a mini
+system for crypto setup and/or rescue purposes. You may even have
+an initrd that does your current crypto setup already.
+
+At this point you want to encrypt your swap, too. Still you want to
+be able to suspend using swsusp. This, however, means that you
+have to be able to either enter a passphrase or that you read
+the key(s) from an external device like a pcmcia flash disk
+or an usb stick prior to resume. So you need an initrd, that sets
+up dm-crypt and then asks swsusp to resume from the encrypted
+swap device.
+
+The most important thing is that you set up dm-crypt in such
+a way that the swap device you suspend to/resume from has
+always the same major/minor within the initrd as well as
+within your running system. The easiest way to achieve this is
+to always set up this swap device first with dmsetup, so that
+it will always look like the following::
+
+  brw-------  1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0
+
+Now set up your kernel to use /dev/mapper/swap0 as the default
+resume partition, so your kernel .config contains::
+
+  CONFIG_PM_STD_PARTITION="/dev/mapper/swap0"
+
+Prepare your boot loader to use the initrd you will create or
+modify. For lilo the simplest setup looks like the following
+lines::
+
+  image=/boot/vmlinuz
+  initrd=/boot/initrd.gz
+  label=linux
+  append="root=/dev/ram0 init=/linuxrc rw"
+
+Finally you need to create or modify your initrd. Lets assume
+you create an initrd that reads the required dm-crypt setup
+from a pcmcia flash disk card. The card is formatted with an ext2
+fs which resides on /dev/hde1 when the card is inserted. The
+card contains at least the encrypted swap setup in a file
+named "swapkey". /etc/fstab of your initrd contains something
+like the following::
+
+  /dev/hda1   /mnt    ext3      ro                            0 0
+  none        /proc   proc      defaults,noatime,nodiratime   0 0
+  none        /sys    sysfs     defaults,noatime,nodiratime   0 0
+
+/dev/hda1 contains an unencrypted mini system that sets up all
+of your crypto devices, again by reading the setup from the
+pcmcia flash disk. What follows now is a /linuxrc for your
+initrd that allows you to resume from encrypted swap and that
+continues boot with your mini system on /dev/hda1 if resume
+does not happen::
+
+  #!/bin/sh
+  PATH=/sbin:/bin:/usr/sbin:/usr/bin
+  mount /proc
+  mount /sys
+  mapped=0
+  noresume=`grep -c noresume /proc/cmdline`
+  if [ "$*" != "" ]
+  then
+    noresume=1
+  fi
+  dmesg -n 1
+  /sbin/cardmgr -q
+  for i in 1 2 3 4 5 6 7 8 9 0
+  do
+    if [ -f /proc/ide/hde/media ]
+    then
+      usleep 500000
+      mount -t ext2 -o ro /dev/hde1 /mnt
+      if [ -f /mnt/swapkey ]
+      then
+        dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1
+      fi
+      umount /mnt
+      break
+    fi
+    usleep 500000
+  done
+  killproc /sbin/cardmgr
+  dmesg -n 6
+  if [ $mapped = 1 ]
+  then
+    if [ $noresume != 0 ]
+    then
+      mkswap /dev/mapper/swap0 > /dev/null 2>&1
+    fi
+    echo 254:0 > /sys/power/resume
+    dmsetup remove swap0
+  fi
+  umount /sys
+  mount /mnt
+  umount /proc
+  cd /mnt
+  pivot_root . mnt
+  mount /proc
+  umount -l /mnt
+  umount /proc
+  exec chroot . /sbin/init $* < dev/console > dev/console 2>&1
+
+Please don't mind the weird loop above, busybox's msh doesn't know
+the let statement. Now, what is happening in the script?
+First we have to decide if we want to try to resume, or not.
+We will not resume if booting with "noresume" or any parameters
+for init like "single" or "emergency" as boot parameters.
+
+Then we need to set up dmcrypt with the setup data from the
+pcmcia flash disk. If this succeeds we need to reset the swap
+device if we don't want to resume. The line "echo 254:0 > /sys/power/resume"
+then attempts to resume from the first device mapper device.
+Note that it is important to set the device in /sys/power/resume,
+regardless if resuming or not, otherwise later suspend will fail.
+If resume starts, script execution terminates here.
+
+Otherwise we just remove the encrypted swap device and leave it to the
+mini system on /dev/hda1 to set the whole crypto up (it is up to
+you to modify this to your taste).
+
+What then follows is the well known process to change the root
+file system and continue booting from there. I prefer to unmount
+the initrd prior to continue booting but it is up to you to modify
+this.
diff --git a/Documentation/power/swsusp-dmcrypt.txt b/Documentation/power/swsusp-dmcrypt.txt
deleted file mode 100644
index b802fbfd95ef..000000000000
--- a/Documentation/power/swsusp-dmcrypt.txt
+++ /dev/null
@@ -1,138 +0,0 @@
-Author: Andreas Steinmetz <ast@domdv.de>
-
-
-How to use dm-crypt and swsusp together:
-========================================
-
-Some prerequisites:
-You know how dm-crypt works. If not, visit the following web page:
-http://www.saout.de/misc/dm-crypt/
-You have read Documentation/power/swsusp.txt and understand it.
-You did read Documentation/admin-guide/initrd.rst and know how an initrd works.
-You know how to create or how to modify an initrd.
-
-Now your system is properly set up, your disk is encrypted except for
-the swap device(s) and the boot partition which may contain a mini
-system for crypto setup and/or rescue purposes. You may even have
-an initrd that does your current crypto setup already.
-
-At this point you want to encrypt your swap, too. Still you want to
-be able to suspend using swsusp. This, however, means that you
-have to be able to either enter a passphrase or that you read
-the key(s) from an external device like a pcmcia flash disk
-or an usb stick prior to resume. So you need an initrd, that sets
-up dm-crypt and then asks swsusp to resume from the encrypted
-swap device.
-
-The most important thing is that you set up dm-crypt in such
-a way that the swap device you suspend to/resume from has
-always the same major/minor within the initrd as well as
-within your running system. The easiest way to achieve this is
-to always set up this swap device first with dmsetup, so that
-it will always look like the following:
-
-brw-------  1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0
-
-Now set up your kernel to use /dev/mapper/swap0 as the default
-resume partition, so your kernel .config contains:
-
-CONFIG_PM_STD_PARTITION="/dev/mapper/swap0"
-
-Prepare your boot loader to use the initrd you will create or
-modify. For lilo the simplest setup looks like the following
-lines:
-
-image=/boot/vmlinuz
-initrd=/boot/initrd.gz
-label=linux
-append="root=/dev/ram0 init=/linuxrc rw"
-
-Finally you need to create or modify your initrd. Lets assume
-you create an initrd that reads the required dm-crypt setup
-from a pcmcia flash disk card. The card is formatted with an ext2
-fs which resides on /dev/hde1 when the card is inserted. The
-card contains at least the encrypted swap setup in a file
-named "swapkey". /etc/fstab of your initrd contains something
-like the following:
-
-/dev/hda1   /mnt    ext3      ro                            0 0
-none        /proc   proc      defaults,noatime,nodiratime   0 0
-none        /sys    sysfs     defaults,noatime,nodiratime   0 0
-
-/dev/hda1 contains an unencrypted mini system that sets up all
-of your crypto devices, again by reading the setup from the
-pcmcia flash disk. What follows now is a /linuxrc for your
-initrd that allows you to resume from encrypted swap and that
-continues boot with your mini system on /dev/hda1 if resume
-does not happen:
-
-#!/bin/sh
-PATH=/sbin:/bin:/usr/sbin:/usr/bin
-mount /proc
-mount /sys
-mapped=0
-noresume=`grep -c noresume /proc/cmdline`
-if [ "$*" != "" ]
-then
-  noresume=1
-fi
-dmesg -n 1
-/sbin/cardmgr -q
-for i in 1 2 3 4 5 6 7 8 9 0
-do
-  if [ -f /proc/ide/hde/media ]
-  then
-    usleep 500000
-    mount -t ext2 -o ro /dev/hde1 /mnt
-    if [ -f /mnt/swapkey ]
-    then
-      dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1
-    fi
-    umount /mnt
-    break
-  fi
-  usleep 500000
-done
-killproc /sbin/cardmgr
-dmesg -n 6
-if [ $mapped = 1 ]
-then
-  if [ $noresume != 0 ]
-  then
-    mkswap /dev/mapper/swap0 > /dev/null 2>&1
-  fi
-  echo 254:0 > /sys/power/resume
-  dmsetup remove swap0
-fi
-umount /sys
-mount /mnt
-umount /proc
-cd /mnt
-pivot_root . mnt
-mount /proc
-umount -l /mnt
-umount /proc
-exec chroot . /sbin/init $* < dev/console > dev/console 2>&1
-
-Please don't mind the weird loop above, busybox's msh doesn't know
-the let statement. Now, what is happening in the script?
-First we have to decide if we want to try to resume, or not.
-We will not resume if booting with "noresume" or any parameters
-for init like "single" or "emergency" as boot parameters.
-
-Then we need to set up dmcrypt with the setup data from the
-pcmcia flash disk. If this succeeds we need to reset the swap
-device if we don't want to resume. The line "echo 254:0 > /sys/power/resume"
-then attempts to resume from the first device mapper device.
-Note that it is important to set the device in /sys/power/resume,
-regardless if resuming or not, otherwise later suspend will fail.
-If resume starts, script execution terminates here.
-
-Otherwise we just remove the encrypted swap device and leave it to the
-mini system on /dev/hda1 to set the whole crypto up (it is up to
-you to modify this to your taste).
-
-What then follows is the well known process to change the root
-file system and continue booting from there. I prefer to unmount
-the initrd prior to continue booting but it is up to you to modify
-this.
diff --git a/Documentation/power/swsusp.rst b/Documentation/power/swsusp.rst
new file mode 100644
index 000000000000..d000312f6965
--- /dev/null
+++ b/Documentation/power/swsusp.rst
@@ -0,0 +1,501 @@
+============
+Swap suspend
+============
+
+Some warnings, first.
+
+.. warning::
+
+   **BIG FAT WARNING**
+
+   If you touch anything on disk between suspend and resume...
+				...kiss your data goodbye.
+
+   If you do resume from initrd after your filesystems are mounted...
+				...bye bye root partition.
+
+			[this is actually same case as above]
+
+   If you have unsupported ( ) devices using DMA, you may have some
+   problems. If your disk driver does not support suspend... (IDE does),
+   it may cause some problems, too. If you change kernel command line
+   between suspend and resume, it may do something wrong. If you change
+   your hardware while system is suspended... well, it was not good idea;
+   but it will probably only crash.
+
+   ( ) suspend/resume support is needed to make it safe.
+
+   If you have any filesystems on USB devices mounted before software suspend,
+   they won't be accessible after resume and you may lose data, as though
+   you have unplugged the USB devices with mounted filesystems on them;
+   see the FAQ below for details.  (This is not true for more traditional
+   power states like "standby", which normally don't turn USB off.)
+
+Swap partition:
+  You need to append resume=/dev/your_swap_partition to kernel command
+  line or specify it using /sys/power/resume.
+
+Swap file:
+  If using a swapfile you can also specify a resume offset using
+  resume_offset=<number> on the kernel command line or specify it
+  in /sys/power/resume_offset.
+
+After preparing then you suspend by::
+
+	echo shutdown > /sys/power/disk; echo disk > /sys/power/state
+
+- If you feel ACPI works pretty well on your system, you might try::
+
+	echo platform > /sys/power/disk; echo disk > /sys/power/state
+
+- If you would like to write hibernation image to swap and then suspend
+  to RAM (provided your platform supports it), you can try::
+
+	echo suspend > /sys/power/disk; echo disk > /sys/power/state
+
+- If you have SATA disks, you'll need recent kernels with SATA suspend
+  support. For suspend and resume to work, make sure your disk drivers
+  are built into kernel -- not modules. [There's way to make
+  suspend/resume with modular disk drivers, see FAQ, but you probably
+  should not do that.]
+
+If you want to limit the suspend image size to N bytes, do::
+
+	echo N > /sys/power/image_size
+
+before suspend (it is limited to around 2/5 of available RAM by default).
+
+- The resume process checks for the presence of the resume device,
+  if found, it then checks the contents for the hibernation image signature.
+  If both are found, it resumes the hibernation image.
+
+- The resume process may be triggered in two ways:
+
+  1) During lateinit:  If resume=/dev/your_swap_partition is specified on
+     the kernel command line, lateinit runs the resume process.  If the
+     resume device has not been probed yet, the resume process fails and
+     bootup continues.
+  2) Manually from an initrd or initramfs:  May be run from
+     the init script by using the /sys/power/resume file.  It is vital
+     that this be done prior to remounting any filesystems (even as
+     read-only) otherwise data may be corrupted.
+
+Article about goals and implementation of Software Suspend for Linux
+====================================================================
+
+Author: Gábor Kuti
+Last revised: 2003-10-20 by Pavel Machek
+
+Idea and goals to achieve
+-------------------------
+
+Nowadays it is common in several laptops that they have a suspend button. It
+saves the state of the machine to a filesystem or to a partition and switches
+to standby mode. Later resuming the machine the saved state is loaded back to
+ram and the machine can continue its work. It has two real benefits. First we
+save ourselves the time machine goes down and later boots up, energy costs
+are real high when running from batteries. The other gain is that we don't have
+to interrupt our programs so processes that are calculating something for a long
+time shouldn't need to be written interruptible.
+
+swsusp saves the state of the machine into active swaps and then reboots or
+powerdowns.  You must explicitly specify the swap partition to resume from with
+`resume=` kernel option. If signature is found it loads and restores saved
+state. If the option `noresume` is specified as a boot parameter, it skips
+the resuming.  If the option `hibernate=nocompress` is specified as a boot
+parameter, it saves hibernation image without compression.
+
+In the meantime while the system is suspended you should not add/remove any
+of the hardware, write to the filesystems, etc.
+
+Sleep states summary
+====================
+
+There are three different interfaces you can use, /proc/acpi should
+work like this:
+
+In a really perfect world::
+
+  echo 1 > /proc/acpi/sleep       # for standby
+  echo 2 > /proc/acpi/sleep       # for suspend to ram
+  echo 3 > /proc/acpi/sleep       # for suspend to ram, but with more power conservative
+  echo 4 > /proc/acpi/sleep       # for suspend to disk
+  echo 5 > /proc/acpi/sleep       # for shutdown unfriendly the system
+
+and perhaps::
+
+  echo 4b > /proc/acpi/sleep      # for suspend to disk via s4bios
+
+Frequently Asked Questions
+==========================
+
+Q:
+  well, suspending a server is IMHO a really stupid thing,
+  but... (Diego Zuccato):
+
+A:
+  You bought new UPS for your server. How do you install it without
+  bringing machine down? Suspend to disk, rearrange power cables,
+  resume.
+
+  You have your server on UPS. Power died, and UPS is indicating 30
+  seconds to failure. What do you do? Suspend to disk.
+
+
+Q:
+  Maybe I'm missing something, but why don't the regular I/O paths work?
+
+A:
+  We do use the regular I/O paths. However we cannot restore the data
+  to its original location as we load it. That would create an
+  inconsistent kernel state which would certainly result in an oops.
+  Instead, we load the image into unused memory and then atomically copy
+  it back to it original location. This implies, of course, a maximum
+  image size of half the amount of memory.
+
+  There are two solutions to this:
+
+  * require half of memory to be free during suspend. That way you can
+    read "new" data onto free spots, then cli and copy
+
+  * assume we had special "polling" ide driver that only uses memory
+    between 0-640KB. That way, I'd have to make sure that 0-640KB is free
+    during suspending, but otherwise it would work...
+
+  suspend2 shares this fundamental limitation, but does not include user
+  data and disk caches into "used memory" by saving them in
+  advance. That means that the limitation goes away in practice.
+
+Q:
+  Does linux support ACPI S4?
+
+A:
+  Yes. That's what echo platform > /sys/power/disk does.
+
+Q:
+  What is 'suspend2'?
+
+A:
+  suspend2 is 'Software Suspend 2', a forked implementation of
+  suspend-to-disk which is available as separate patches for 2.4 and 2.6
+  kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB
+  highmem and preemption. It also has a extensible architecture that
+  allows for arbitrary transformations on the image (compression,
+  encryption) and arbitrary backends for writing the image (eg to swap
+  or an NFS share[Work In Progress]). Questions regarding suspend2
+  should be sent to the mailing list available through the suspend2
+  website, and not to the Linux Kernel Mailing List. We are working
+  toward merging suspend2 into the mainline kernel.
+
+Q:
+  What is the freezing of tasks and why are we using it?
+
+A:
+  The freezing of tasks is a mechanism by which user space processes and some
+  kernel threads are controlled during hibernation or system-wide suspend (on some
+  architectures).  See freezing-of-tasks.txt for details.
+
+Q:
+  What is the difference between "platform" and "shutdown"?
+
+A:
+  shutdown:
+	save state in linux, then tell bios to powerdown
+
+  platform:
+	save state in linux, then tell bios to powerdown and blink
+        "suspended led"
+
+  "platform" is actually right thing to do where supported, but
+  "shutdown" is most reliable (except on ACPI systems).
+
+Q:
+  I do not understand why you have such strong objections to idea of
+  selective suspend.
+
+A:
+  Do selective suspend during runtime power management, that's okay. But
+  it's useless for suspend-to-disk. (And I do not see how you could use
+  it for suspend-to-ram, I hope you do not want that).
+
+  Lets see, so you suggest to
+
+  * SUSPEND all but swap device and parents
+  * Snapshot
+  * Write image to disk
+  * SUSPEND swap device and parents
+  * Powerdown
+
+  Oh no, that does not work, if swap device or its parents uses DMA,
+  you've corrupted data. You'd have to do
+
+  * SUSPEND all but swap device and parents
+  * FREEZE swap device and parents
+  * Snapshot
+  * UNFREEZE swap device and parents
+  * Write
+  * SUSPEND swap device and parents
+
+  Which means that you still need that FREEZE state, and you get more
+  complicated code. (And I have not yet introduce details like system
+  devices).
+
+Q:
+  There don't seem to be any generally useful behavioral
+  distinctions between SUSPEND and FREEZE.
+
+A:
+  Doing SUSPEND when you are asked to do FREEZE is always correct,
+  but it may be unnecessarily slow. If you want your driver to stay simple,
+  slowness may not matter to you. It can always be fixed later.
+
+  For devices like disk it does matter, you do not want to spindown for
+  FREEZE.
+
+Q:
+  After resuming, system is paging heavily, leading to very bad interactivity.
+
+A:
+  Try running::
+
+    cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file
+    do
+      test -f "$file" && cat "$file" > /dev/null
+    done
+
+  after resume. swapoff -a; swapon -a may also be useful.
+
+Q:
+  What happens to devices during swsusp? They seem to be resumed
+  during system suspend?
+
+A:
+  That's correct. We need to resume them if we want to write image to
+  disk. Whole sequence goes like
+
+      **Suspend part**
+
+      running system, user asks for suspend-to-disk
+
+      user processes are stopped
+
+      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+      with state snapshot
+
+      state snapshot: copy of whole used memory is taken with interrupts disabled
+
+      resume(): devices are woken up so that we can write image to swap
+
+      write image to swap
+
+      suspend(PMSG_SUSPEND): suspend devices so that we can power off
+
+      turn the power off
+
+      **Resume part**
+
+      (is actually pretty similar)
+
+      running system, user asks for suspend-to-disk
+
+      user processes are stopped (in common case there are none,
+      but with resume-from-initrd, no one knows)
+
+      read image from disk
+
+      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+      with image restoration
+
+      image restoration: rewrite memory with image
+
+      resume(): devices are woken up so that system can continue
+
+      thaw all user processes
+
+Q:
+  What is this 'Encrypt suspend image' for?
+
+A:
+  First of all: it is not a replacement for dm-crypt encrypted swap.
+  It cannot protect your computer while it is suspended. Instead it does
+  protect from leaking sensitive data after resume from suspend.
+
+  Think of the following: you suspend while an application is running
+  that keeps sensitive data in memory. The application itself prevents
+  the data from being swapped out. Suspend, however, must write these
+  data to swap to be able to resume later on. Without suspend encryption
+  your sensitive data are then stored in plaintext on disk.  This means
+  that after resume your sensitive data are accessible to all
+  applications having direct access to the swap device which was used
+  for suspend. If you don't need swap after resume these data can remain
+  on disk virtually forever. Thus it can happen that your system gets
+  broken in weeks later and sensitive data which you thought were
+  encrypted and protected are retrieved and stolen from the swap device.
+  To prevent this situation you should use 'Encrypt suspend image'.
+
+  During suspend a temporary key is created and this key is used to
+  encrypt the data written to disk. When, during resume, the data was
+  read back into memory the temporary key is destroyed which simply
+  means that all data written to disk during suspend are then
+  inaccessible so they can't be stolen later on.  The only thing that
+  you must then take care of is that you call 'mkswap' for the swap
+  partition used for suspend as early as possible during regular
+  boot. This asserts that any temporary key from an oopsed suspend or
+  from a failed or aborted resume is erased from the swap device.
+
+  As a rule of thumb use encrypted swap to protect your data while your
+  system is shut down or suspended. Additionally use the encrypted
+  suspend image to prevent sensitive data from being stolen after
+  resume.
+
+Q:
+  Can I suspend to a swap file?
+
+A:
+  Generally, yes, you can.  However, it requires you to use the "resume=" and
+  "resume_offset=" kernel command line parameters, so the resume from a swap file
+  cannot be initiated from an initrd or initramfs image.  See
+  swsusp-and-swap-files.txt for details.
+
+Q:
+  Is there a maximum system RAM size that is supported by swsusp?
+
+A:
+  It should work okay with highmem.
+
+Q:
+  Does swsusp (to disk) use only one swap partition or can it use
+  multiple swap partitions (aggregate them into one logical space)?
+
+A:
+  Only one swap partition, sorry.
+
+Q:
+  If my application(s) causes lots of memory & swap space to be used
+  (over half of the total system RAM), is it correct that it is likely
+  to be useless to try to suspend to disk while that app is running?
+
+A:
+  No, it should work okay, as long as your app does not mlock()
+  it. Just prepare big enough swap partition.
+
+Q:
+  What information is useful for debugging suspend-to-disk problems?
+
+A:
+  Well, last messages on the screen are always useful. If something
+  is broken, it is usually some kernel driver, therefore trying with as
+  little as possible modules loaded helps a lot. I also prefer people to
+  suspend from console, preferably without X running. Booting with
+  init=/bin/bash, then swapon and starting suspend sequence manually
+  usually does the trick. Then it is good idea to try with latest
+  vanilla kernel.
+
+Q:
+  How can distributions ship a swsusp-supporting kernel with modular
+  disk drivers (especially SATA)?
+
+A:
+  Well, it can be done, load the drivers, then do echo into
+  /sys/power/resume file from initrd. Be sure not to mount
+  anything, not even read-only mount, or you are going to lose your
+  data.
+
+Q:
+  How do I make suspend more verbose?
+
+A:
+  If you want to see any non-error kernel messages on the virtual
+  terminal the kernel switches to during suspend, you have to set the
+  kernel console loglevel to at least 4 (KERN_WARNING), for example by
+  doing::
+
+	# save the old loglevel
+	read LOGLEVEL DUMMY < /proc/sys/kernel/printk
+	# set the loglevel so we see the progress bar.
+	# if the level is higher than needed, we leave it alone.
+	if [ $LOGLEVEL -lt 5 ]; then
+	        echo 5 > /proc/sys/kernel/printk
+		fi
+
+        IMG_SZ=0
+        read IMG_SZ < /sys/power/image_size
+        echo -n disk > /sys/power/state
+        RET=$?
+        #
+        # the logic here is:
+        # if image_size > 0 (without kernel support, IMG_SZ will be zero),
+        # then try again with image_size set to zero.
+	if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size
+                echo 0 > /sys/power/image_size
+                echo -n disk > /sys/power/state
+                RET=$?
+        fi
+
+	# restore previous loglevel
+	echo $LOGLEVEL > /proc/sys/kernel/printk
+	exit $RET
+
+Q:
+  Is this true that if I have a mounted filesystem on a USB device and
+  I suspend to disk, I can lose data unless the filesystem has been mounted
+  with "sync"?
+
+A:
+  That's right ... if you disconnect that device, you may lose data.
+  In fact, even with "-o sync" you can lose data if your programs have
+  information in buffers they haven't written out to a disk you disconnect,
+  or if you disconnect before the device finished saving data you wrote.
+
+  Software suspend normally powers down USB controllers, which is equivalent
+  to disconnecting all USB devices attached to your system.
+
+  Your system might well support low-power modes for its USB controllers
+  while the system is asleep, maintaining the connection, using true sleep
+  modes like "suspend-to-RAM" or "standby".  (Don't write "disk" to the
+  /sys/power/state file; write "standby" or "mem".)  We've not seen any
+  hardware that can use these modes through software suspend, although in
+  theory some systems might support "platform" modes that won't break the
+  USB connections.
+
+  Remember that it's always a bad idea to unplug a disk drive containing a
+  mounted filesystem.  That's true even when your system is asleep!  The
+  safest thing is to unmount all filesystems on removable media (such USB,
+  Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays)
+  before suspending; then remount them after resuming.
+
+  There is a work-around for this problem.  For more information, see
+  Documentation/driver-api/usb/persist.rst.
+
+Q:
+  Can I suspend-to-disk using a swap partition under LVM?
+
+A:
+  Yes and No.  You can suspend successfully, but the kernel will not be able
+  to resume on its own.  You need an initramfs that can recognize the resume
+  situation, activate the logical volume containing the swap volume (but not
+  touch any filesystems!), and eventually call::
+
+    echo -n "$major:$minor" > /sys/power/resume
+
+  where $major and $minor are the respective major and minor device numbers of
+  the swap volume.
+
+  uswsusp works with LVM, too.  See http://suspend.sourceforge.net/
+
+Q:
+  I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
+  compiled with the similar configuration files. Anyway I found that
+  suspend to disk (and resume) is much slower on 2.6.16 compared to
+  2.6.15. Any idea for why that might happen or how can I speed it up?
+
+A:
+  This is because the size of the suspend image is now greater than
+  for 2.6.15 (by saving more data we can get more responsive system
+  after resume).
+
+  There's the /sys/power/image_size knob that controls the size of the
+  image.  If you set it to 0 (eg. by echo 0 > /sys/power/image_size as
+  root), the 2.6.15 behavior should be restored.  If it is still too
+  slow, take a look at suspend.sf.net -- userland suspend is faster and
+  supports LZF compression to speed it up further.
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
deleted file mode 100644
index 236d1fb13640..000000000000
--- a/Documentation/power/swsusp.txt
+++ /dev/null
@@ -1,446 +0,0 @@
-Some warnings, first.
-
- * BIG FAT WARNING *********************************************************
- *
- * If you touch anything on disk between suspend and resume...
- *				...kiss your data goodbye.
- *
- * If you do resume from initrd after your filesystems are mounted...
- *				...bye bye root partition.
- *			[this is actually same case as above]
- *
- * If you have unsupported (*) devices using DMA, you may have some
- * problems. If your disk driver does not support suspend... (IDE does),
- * it may cause some problems, too. If you change kernel command line
- * between suspend and resume, it may do something wrong. If you change
- * your hardware while system is suspended... well, it was not good idea;
- * but it will probably only crash.
- *
- * (*) suspend/resume support is needed to make it safe.
- *
- * If you have any filesystems on USB devices mounted before software suspend,
- * they won't be accessible after resume and you may lose data, as though
- * you have unplugged the USB devices with mounted filesystems on them;
- * see the FAQ below for details.  (This is not true for more traditional
- * power states like "standby", which normally don't turn USB off.)
-
-Swap partition:
-You need to append resume=/dev/your_swap_partition to kernel command
-line or specify it using /sys/power/resume.
-
-Swap file:
-If using a swapfile you can also specify a resume offset using
-resume_offset=<number> on the kernel command line or specify it
-in /sys/power/resume_offset.
-
-After preparing then you suspend by
-
-echo shutdown > /sys/power/disk; echo disk > /sys/power/state
-
-. If you feel ACPI works pretty well on your system, you might try
-
-echo platform > /sys/power/disk; echo disk > /sys/power/state
-
-. If you would like to write hibernation image to swap and then suspend
-to RAM (provided your platform supports it), you can try
-
-echo suspend > /sys/power/disk; echo disk > /sys/power/state
-
-. If you have SATA disks, you'll need recent kernels with SATA suspend
-support. For suspend and resume to work, make sure your disk drivers
-are built into kernel -- not modules. [There's way to make
-suspend/resume with modular disk drivers, see FAQ, but you probably
-should not do that.]
-
-If you want to limit the suspend image size to N bytes, do
-
-echo N > /sys/power/image_size
-
-before suspend (it is limited to around 2/5 of available RAM by default).
-
-. The resume process checks for the presence of the resume device,
-if found, it then checks the contents for the hibernation image signature.
-If both are found, it resumes the hibernation image.
-
-. The resume process may be triggered in two ways:
-  1) During lateinit:  If resume=/dev/your_swap_partition is specified on
-     the kernel command line, lateinit runs the resume process.  If the
-     resume device has not been probed yet, the resume process fails and
-     bootup continues.
-  2) Manually from an initrd or initramfs:  May be run from
-     the init script by using the /sys/power/resume file.  It is vital
-     that this be done prior to remounting any filesystems (even as
-     read-only) otherwise data may be corrupted.
-
-Article about goals and implementation of Software Suspend for Linux
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Author: Gábor Kuti
-Last revised: 2003-10-20 by Pavel Machek
-
-Idea and goals to achieve
-
-Nowadays it is common in several laptops that they have a suspend button. It
-saves the state of the machine to a filesystem or to a partition and switches
-to standby mode. Later resuming the machine the saved state is loaded back to
-ram and the machine can continue its work. It has two real benefits. First we
-save ourselves the time machine goes down and later boots up, energy costs
-are real high when running from batteries. The other gain is that we don't have to
-interrupt our programs so processes that are calculating something for a long
-time shouldn't need to be written interruptible.
-
-swsusp saves the state of the machine into active swaps and then reboots or
-powerdowns.  You must explicitly specify the swap partition to resume from with
-``resume='' kernel option. If signature is found it loads and restores saved
-state. If the option ``noresume'' is specified as a boot parameter, it skips
-the resuming.  If the option ``hibernate=nocompress'' is specified as a boot
-parameter, it saves hibernation image without compression.
-
-In the meantime while the system is suspended you should not add/remove any
-of the hardware, write to the filesystems, etc.
-
-Sleep states summary
-====================
-
-There are three different interfaces you can use, /proc/acpi should
-work like this:
-
-In a really perfect world:
-echo 1 > /proc/acpi/sleep       # for standby
-echo 2 > /proc/acpi/sleep       # for suspend to ram
-echo 3 > /proc/acpi/sleep       # for suspend to ram, but with more power conservative
-echo 4 > /proc/acpi/sleep       # for suspend to disk
-echo 5 > /proc/acpi/sleep       # for shutdown unfriendly the system
-
-and perhaps
-echo 4b > /proc/acpi/sleep      # for suspend to disk via s4bios
-
-Frequently Asked Questions
-==========================
-
-Q: well, suspending a server is IMHO a really stupid thing,
-but... (Diego Zuccato):
-
-A: You bought new UPS for your server. How do you install it without
-bringing machine down? Suspend to disk, rearrange power cables,
-resume.
-
-You have your server on UPS. Power died, and UPS is indicating 30
-seconds to failure. What do you do? Suspend to disk.
-
-
-Q: Maybe I'm missing something, but why don't the regular I/O paths work?
-
-A: We do use the regular I/O paths. However we cannot restore the data
-to its original location as we load it. That would create an
-inconsistent kernel state which would certainly result in an oops.
-Instead, we load the image into unused memory and then atomically copy
-it back to it original location. This implies, of course, a maximum
-image size of half the amount of memory.
-
-There are two solutions to this:
-
-* require half of memory to be free during suspend. That way you can
-read "new" data onto free spots, then cli and copy
-
-* assume we had special "polling" ide driver that only uses memory
-between 0-640KB. That way, I'd have to make sure that 0-640KB is free
-during suspending, but otherwise it would work...
-
-suspend2 shares this fundamental limitation, but does not include user
-data and disk caches into "used memory" by saving them in
-advance. That means that the limitation goes away in practice.
-
-Q: Does linux support ACPI S4?
-
-A: Yes. That's what echo platform > /sys/power/disk does.
-
-Q: What is 'suspend2'?
-
-A: suspend2 is 'Software Suspend 2', a forked implementation of
-suspend-to-disk which is available as separate patches for 2.4 and 2.6
-kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB
-highmem and preemption. It also has a extensible architecture that
-allows for arbitrary transformations on the image (compression,
-encryption) and arbitrary backends for writing the image (eg to swap
-or an NFS share[Work In Progress]). Questions regarding suspend2
-should be sent to the mailing list available through the suspend2
-website, and not to the Linux Kernel Mailing List. We are working
-toward merging suspend2 into the mainline kernel.
-
-Q: What is the freezing of tasks and why are we using it?
-
-A: The freezing of tasks is a mechanism by which user space processes and some
-kernel threads are controlled during hibernation or system-wide suspend (on some
-architectures).  See freezing-of-tasks.txt for details.
-
-Q: What is the difference between "platform" and "shutdown"?
-
-A:
-
-shutdown: save state in linux, then tell bios to powerdown
-
-platform: save state in linux, then tell bios to powerdown and blink
-          "suspended led"
-
-"platform" is actually right thing to do where supported, but
-"shutdown" is most reliable (except on ACPI systems).
-
-Q: I do not understand why you have such strong objections to idea of
-selective suspend.
-
-A: Do selective suspend during runtime power management, that's okay. But
-it's useless for suspend-to-disk. (And I do not see how you could use
-it for suspend-to-ram, I hope you do not want that).
-
-Lets see, so you suggest to
-
-* SUSPEND all but swap device and parents
-* Snapshot
-* Write image to disk
-* SUSPEND swap device and parents
-* Powerdown
-
-Oh no, that does not work, if swap device or its parents uses DMA,
-you've corrupted data. You'd have to do
-
-* SUSPEND all but swap device and parents
-* FREEZE swap device and parents
-* Snapshot
-* UNFREEZE swap device and parents
-* Write
-* SUSPEND swap device and parents
-
-Which means that you still need that FREEZE state, and you get more
-complicated code. (And I have not yet introduce details like system
-devices).
-
-Q: There don't seem to be any generally useful behavioral
-distinctions between SUSPEND and FREEZE.
-
-A: Doing SUSPEND when you are asked to do FREEZE is always correct,
-but it may be unnecessarily slow. If you want your driver to stay simple,
-slowness may not matter to you. It can always be fixed later.
-
-For devices like disk it does matter, you do not want to spindown for
-FREEZE.
-
-Q: After resuming, system is paging heavily, leading to very bad interactivity.
-
-A: Try running
-
-cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file
-do
-  test -f "$file" && cat "$file" > /dev/null
-done
-
-after resume. swapoff -a; swapon -a may also be useful.
-
-Q: What happens to devices during swsusp? They seem to be resumed
-during system suspend?
-
-A: That's correct. We need to resume them if we want to write image to
-disk. Whole sequence goes like
-
-      Suspend part
-      ~~~~~~~~~~~~
-      running system, user asks for suspend-to-disk
-
-      user processes are stopped
-
-      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
-      		      with state snapshot
-
-      state snapshot: copy of whole used memory is taken with interrupts disabled
-
-      resume(): devices are woken up so that we can write image to swap
-
-      write image to swap
-
-      suspend(PMSG_SUSPEND): suspend devices so that we can power off
-
-      turn the power off
-
-      Resume part
-      ~~~~~~~~~~~
-      (is actually pretty similar)
-
-      running system, user asks for suspend-to-disk
-
-      user processes are stopped (in common case there are none, but with resume-from-initrd, no one knows)
-
-      read image from disk
-
-      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
-      		      with image restoration
-
-      image restoration: rewrite memory with image
-
-      resume(): devices are woken up so that system can continue
-
-      thaw all user processes
-
-Q: What is this 'Encrypt suspend image' for?
-
-A: First of all: it is not a replacement for dm-crypt encrypted swap.
-It cannot protect your computer while it is suspended. Instead it does
-protect from leaking sensitive data after resume from suspend.
-
-Think of the following: you suspend while an application is running
-that keeps sensitive data in memory. The application itself prevents
-the data from being swapped out. Suspend, however, must write these
-data to swap to be able to resume later on. Without suspend encryption
-your sensitive data are then stored in plaintext on disk.  This means
-that after resume your sensitive data are accessible to all
-applications having direct access to the swap device which was used
-for suspend. If you don't need swap after resume these data can remain
-on disk virtually forever. Thus it can happen that your system gets
-broken in weeks later and sensitive data which you thought were
-encrypted and protected are retrieved and stolen from the swap device.
-To prevent this situation you should use 'Encrypt suspend image'.
-
-During suspend a temporary key is created and this key is used to
-encrypt the data written to disk. When, during resume, the data was
-read back into memory the temporary key is destroyed which simply
-means that all data written to disk during suspend are then
-inaccessible so they can't be stolen later on.  The only thing that
-you must then take care of is that you call 'mkswap' for the swap
-partition used for suspend as early as possible during regular
-boot. This asserts that any temporary key from an oopsed suspend or
-from a failed or aborted resume is erased from the swap device.
-
-As a rule of thumb use encrypted swap to protect your data while your
-system is shut down or suspended. Additionally use the encrypted
-suspend image to prevent sensitive data from being stolen after
-resume.
-
-Q: Can I suspend to a swap file?
-
-A: Generally, yes, you can.  However, it requires you to use the "resume=" and
-"resume_offset=" kernel command line parameters, so the resume from a swap file
-cannot be initiated from an initrd or initramfs image.  See
-swsusp-and-swap-files.txt for details.
-
-Q: Is there a maximum system RAM size that is supported by swsusp?
-
-A: It should work okay with highmem.
-
-Q: Does swsusp (to disk) use only one swap partition or can it use
-multiple swap partitions (aggregate them into one logical space)?
-
-A: Only one swap partition, sorry.
-
-Q: If my application(s) causes lots of memory & swap space to be used
-(over half of the total system RAM), is it correct that it is likely
-to be useless to try to suspend to disk while that app is running?
-
-A: No, it should work okay, as long as your app does not mlock()
-it. Just prepare big enough swap partition.
-
-Q: What information is useful for debugging suspend-to-disk problems?
-
-A: Well, last messages on the screen are always useful. If something
-is broken, it is usually some kernel driver, therefore trying with as
-little as possible modules loaded helps a lot. I also prefer people to
-suspend from console, preferably without X running. Booting with
-init=/bin/bash, then swapon and starting suspend sequence manually
-usually does the trick. Then it is good idea to try with latest
-vanilla kernel.
-
-Q: How can distributions ship a swsusp-supporting kernel with modular
-disk drivers (especially SATA)?
-
-A: Well, it can be done, load the drivers, then do echo into
-/sys/power/resume file from initrd. Be sure not to mount
-anything, not even read-only mount, or you are going to lose your
-data.
-
-Q: How do I make suspend more verbose?
-
-A: If you want to see any non-error kernel messages on the virtual
-terminal the kernel switches to during suspend, you have to set the
-kernel console loglevel to at least 4 (KERN_WARNING), for example by
-doing
-
-	# save the old loglevel
-	read LOGLEVEL DUMMY < /proc/sys/kernel/printk
-	# set the loglevel so we see the progress bar.
-	# if the level is higher than needed, we leave it alone.
-	if [ $LOGLEVEL -lt 5 ]; then
-	        echo 5 > /proc/sys/kernel/printk
-		fi
-
-        IMG_SZ=0
-        read IMG_SZ < /sys/power/image_size
-        echo -n disk > /sys/power/state
-        RET=$?
-        #
-        # the logic here is:
-        # if image_size > 0 (without kernel support, IMG_SZ will be zero),
-        # then try again with image_size set to zero.
-	if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size
-                echo 0 > /sys/power/image_size
-                echo -n disk > /sys/power/state
-                RET=$?
-        fi
-
-	# restore previous loglevel
-	echo $LOGLEVEL > /proc/sys/kernel/printk
-	exit $RET
-
-Q: Is this true that if I have a mounted filesystem on a USB device and
-I suspend to disk, I can lose data unless the filesystem has been mounted
-with "sync"?
-
-A: That's right ... if you disconnect that device, you may lose data.
-In fact, even with "-o sync" you can lose data if your programs have
-information in buffers they haven't written out to a disk you disconnect,
-or if you disconnect before the device finished saving data you wrote.
-
-Software suspend normally powers down USB controllers, which is equivalent
-to disconnecting all USB devices attached to your system.
-
-Your system might well support low-power modes for its USB controllers
-while the system is asleep, maintaining the connection, using true sleep
-modes like "suspend-to-RAM" or "standby".  (Don't write "disk" to the
-/sys/power/state file; write "standby" or "mem".)  We've not seen any
-hardware that can use these modes through software suspend, although in
-theory some systems might support "platform" modes that won't break the
-USB connections.
-
-Remember that it's always a bad idea to unplug a disk drive containing a
-mounted filesystem.  That's true even when your system is asleep!  The
-safest thing is to unmount all filesystems on removable media (such USB,
-Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays)
-before suspending; then remount them after resuming.
-
-There is a work-around for this problem.  For more information, see
-Documentation/driver-api/usb/persist.rst.
-
-Q: Can I suspend-to-disk using a swap partition under LVM?
-
-A: Yes and No.  You can suspend successfully, but the kernel will not be able
-to resume on its own.  You need an initramfs that can recognize the resume
-situation, activate the logical volume containing the swap volume (but not
-touch any filesystems!), and eventually call
-
-echo -n "$major:$minor" > /sys/power/resume
-
-where $major and $minor are the respective major and minor device numbers of
-the swap volume.
-
-uswsusp works with LVM, too.  See http://suspend.sourceforge.net/
-
-Q: I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
-compiled with the similar configuration files. Anyway I found that
-suspend to disk (and resume) is much slower on 2.6.16 compared to
-2.6.15. Any idea for why that might happen or how can I speed it up?
-
-A: This is because the size of the suspend image is now greater than
-for 2.6.15 (by saving more data we can get more responsive system
-after resume).
-
-There's the /sys/power/image_size knob that controls the size of the
-image.  If you set it to 0 (eg. by echo 0 > /sys/power/image_size as
-root), the 2.6.15 behavior should be restored.  If it is still too
-slow, take a look at suspend.sf.net -- userland suspend is faster and
-supports LZF compression to speed it up further.
diff --git a/Documentation/power/tricks.rst b/Documentation/power/tricks.rst
new file mode 100644
index 000000000000..ca787f142c3f
--- /dev/null
+++ b/Documentation/power/tricks.rst
@@ -0,0 +1,29 @@
+================
+swsusp/S3 tricks
+================
+
+Pavel Machek <pavel@ucw.cz>
+
+If you want to trick swsusp/S3 into working, you might want to try:
+
+* go with minimal config, turn off drivers like USB, AGP you don't
+  really need
+
+* turn off APIC and preempt
+
+* use ext2. At least it has working fsck. [If something seems to go
+  wrong, force fsck when you have a chance]
+
+* turn off modules
+
+* use vga text console, shut down X. [If you really want X, you might
+  want to try vesafb later]
+
+* try running as few processes as possible, preferably go to single
+  user mode.
+
+* due to video issues, swsusp should be easier to get working than
+  S3. Try that first.
+
+When you make it work, try to find out what exactly was it that broke
+suspend, and preferably fix that.
diff --git a/Documentation/power/tricks.txt b/Documentation/power/tricks.txt
deleted file mode 100644
index a1b8f7249f4c..000000000000
--- a/Documentation/power/tricks.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-	swsusp/S3 tricks
-	~~~~~~~~~~~~~~~~
-Pavel Machek <pavel@ucw.cz>
-
-If you want to trick swsusp/S3 into working, you might want to try:
-
-* go with minimal config, turn off drivers like USB, AGP you don't
-  really need
-
-* turn off APIC and preempt
-
-* use ext2. At least it has working fsck. [If something seems to go
-  wrong, force fsck when you have a chance]
-
-* turn off modules
-
-* use vga text console, shut down X. [If you really want X, you might
-  want to try vesafb later]
-
-* try running as few processes as possible, preferably go to single
-  user mode.
-
-* due to video issues, swsusp should be easier to get working than
-  S3. Try that first.
-
-When you make it work, try to find out what exactly was it that broke
-suspend, and preferably fix that.
diff --git a/Documentation/power/userland-swsusp.rst b/Documentation/power/userland-swsusp.rst
new file mode 100644
index 000000000000..a0fa51bb1a4d
--- /dev/null
+++ b/Documentation/power/userland-swsusp.rst
@@ -0,0 +1,191 @@
+=====================================================
+Documentation for userland software suspend interface
+=====================================================
+
+	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+First, the warnings at the beginning of swsusp.txt still apply.
+
+Second, you should read the FAQ in swsusp.txt _now_ if you have not
+done it already.
+
+Now, to use the userland interface for software suspend you need special
+utilities that will read/write the system memory snapshot from/to the
+kernel.  Such utilities are available, for example, from
+<http://suspend.sourceforge.net>.  You may want to have a look at them if you
+are going to develop your own suspend/resume utilities.
+
+The interface consists of a character device providing the open(),
+release(), read(), and write() operations as well as several ioctl()
+commands defined in include/linux/suspend_ioctls.h .  The major and minor
+numbers of the device are, respectively, 10 and 231, and they can
+be read from /sys/class/misc/snapshot/dev.
+
+The device can be open either for reading or for writing.  If open for
+reading, it is considered to be in the suspend mode.  Otherwise it is
+assumed to be in the resume mode.  The device cannot be open for simultaneous
+reading and writing.  It is also impossible to have the device open more than
+once at a time.
+
+Even opening the device has side effects. Data structures are
+allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
+called.
+
+The ioctl() commands recognized by the device are:
+
+SNAPSHOT_FREEZE
+	freeze user space processes (the current process is
+	not frozen); this is required for SNAPSHOT_CREATE_IMAGE
+	and SNAPSHOT_ATOMIC_RESTORE to succeed
+
+SNAPSHOT_UNFREEZE
+	thaw user space processes frozen by SNAPSHOT_FREEZE
+
+SNAPSHOT_CREATE_IMAGE
+	create a snapshot of the system memory; the
+	last argument of ioctl() should be a pointer to an int variable,
+	the value of which will indicate whether the call returned after
+	creating the snapshot (1) or after restoring the system memory state
+	from it (0) (after resume the system finds itself finishing the
+	SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot
+	has been created the read() operation can be used to transfer
+	it out of the kernel
+
+SNAPSHOT_ATOMIC_RESTORE
+	restore the system memory state from the
+	uploaded snapshot image; before calling it you should transfer
+	the system memory snapshot back to the kernel using the write()
+	operation; this call will not succeed if the snapshot
+	image is not available to the kernel
+
+SNAPSHOT_FREE
+	free memory allocated for the snapshot image
+
+SNAPSHOT_PREF_IMAGE_SIZE
+	set the preferred maximum size of the image
+	(the kernel will do its best to ensure the image size will not exceed
+	this number, but if it turns out to be impossible, the kernel will
+	create the smallest image possible)
+
+SNAPSHOT_GET_IMAGE_SIZE
+	return the actual size of the hibernation image
+
+SNAPSHOT_AVAIL_SWAP_SIZE
+	return the amount of available swap in bytes (the
+	last argument should be a pointer to an unsigned int variable that will
+	contain the result if the call is successful).
+
+SNAPSHOT_ALLOC_SWAP_PAGE
+	allocate a swap page from the resume partition
+	(the last argument should be a pointer to a loff_t variable that
+	will contain the swap page offset if the call is successful)
+
+SNAPSHOT_FREE_SWAP_PAGES
+	free all swap pages allocated by
+	SNAPSHOT_ALLOC_SWAP_PAGE
+
+SNAPSHOT_SET_SWAP_AREA
+	set the resume partition and the offset (in <PAGE_SIZE>
+	units) from the beginning of the partition at which the swap header is
+	located (the last ioctl() argument should point to a struct
+	resume_swap_area, as defined in kernel/power/suspend_ioctls.h,
+	containing the resume device specification and the offset); for swap
+	partitions the offset is always 0, but it is different from zero for
+	swap files (see Documentation/power/swsusp-and-swap-files.rst for
+	details).
+
+SNAPSHOT_PLATFORM_SUPPORT
+	enable/disable the hibernation platform support,
+	depending on the argument value (enable, if the argument is nonzero)
+
+SNAPSHOT_POWER_OFF
+	make the kernel transition the system to the hibernation
+	state (eg. ACPI S4) using the platform (eg. ACPI) driver
+
+SNAPSHOT_S2RAM
+	suspend to RAM; using this call causes the kernel to
+	immediately enter the suspend-to-RAM state, so this call must always
+	be preceded by the SNAPSHOT_FREEZE call and it is also necessary
+	to use the SNAPSHOT_UNFREEZE call after the system wakes up.  This call
+	is needed to implement the suspend-to-both mechanism in which the
+	suspend image is first created, as though the system had been suspended
+	to disk, and then the system is suspended to RAM (this makes it possible
+	to resume the system from RAM if there's enough battery power or restore
+	its state on the basis of the saved suspend image otherwise)
+
+The device's read() operation can be used to transfer the snapshot image from
+the kernel.  It has the following limitations:
+
+- you cannot read() more than one virtual memory page at a time
+- read()s across page boundaries are impossible (ie. if you read() 1/2 of
+  a page in the previous call, you will only be able to read()
+  **at most** 1/2 of the page in the next call)
+
+The device's write() operation is used for uploading the system memory snapshot
+into the kernel.  It has the same limitations as the read() operation.
+
+The release() operation frees all memory allocated for the snapshot image
+and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any).
+Thus it is not necessary to use either SNAPSHOT_FREE or
+SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
+unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
+still frozen when the device is being closed).
+
+Currently it is assumed that the userland utilities reading/writing the
+snapshot image from/to the kernel will use a swap partition, called the resume
+partition, or a swap file as storage space (if a swap file is used, the resume
+partition is the partition that holds this file).  However, this is not really
+required, as they can use, for example, a special (blank) suspend partition or
+a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and
+mounted afterwards.
+
+These utilities MUST NOT make any assumptions regarding the ordering of
+data within the snapshot image.  The contents of the image are entirely owned
+by the kernel and its structure may be changed in future kernel releases.
+
+The snapshot image MUST be written to the kernel unaltered (ie. all of the image
+data, metadata and header MUST be written in _exactly_ the same amount, form
+and order in which they have been read).  Otherwise, the behavior of the
+resumed system may be totally unpredictable.
+
+While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
+structure of the snapshot image is consistent with the information stored
+in the image header.  If any inconsistencies are detected,
+SNAPSHOT_ATOMIC_RESTORE will not succeed.  Still, this is not a fool-proof
+mechanism and the userland utilities using the interface SHOULD use additional
+means, such as checksums, to ensure the integrity of the snapshot image.
+
+The suspending and resuming utilities MUST lock themselves in memory,
+preferably using mlockall(), before calling SNAPSHOT_FREEZE.
+
+The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE
+in the memory location pointed to by the last argument of ioctl() and proceed
+in accordance with it:
+
+1. 	If the value is 1 (ie. the system memory snapshot has just been
+	created and the system is ready for saving it):
+
+	(a)	The suspending utility MUST NOT close the snapshot device
+		_unless_ the whole suspend procedure is to be cancelled, in
+		which case, if the snapshot image has already been saved, the
+		suspending utility SHOULD destroy it, preferably by zapping
+		its header.  If the suspend is not to be cancelled, the
+		system MUST be powered off or rebooted after the snapshot
+		image has been saved.
+	(b)	The suspending utility SHOULD NOT attempt to perform any
+		file system operations (including reads) on the file systems
+		that were mounted before SNAPSHOT_CREATE_IMAGE has been
+		called.  However, it MAY mount a file system that was not
+		mounted at that time and perform some operations on it (eg.
+		use it for saving the image).
+
+2.	If the value is 0 (ie. the system state has just been restored from
+	the snapshot image), the suspending utility MUST close the snapshot
+	device.  Afterwards it will be treated as a regular userland process,
+	so it need not exit.
+
+The resuming utility SHOULD NOT attempt to mount any file systems that could
+be mounted before suspend and SHOULD NOT attempt to perform any operations
+involving such file systems.
+
+For details, please refer to the source code.
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
deleted file mode 100644
index bbfcd1bbedc5..000000000000
--- a/Documentation/power/userland-swsusp.txt
+++ /dev/null
@@ -1,170 +0,0 @@
-Documentation for userland software suspend interface
-	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
-
-First, the warnings at the beginning of swsusp.txt still apply.
-
-Second, you should read the FAQ in swsusp.txt _now_ if you have not
-done it already.
-
-Now, to use the userland interface for software suspend you need special
-utilities that will read/write the system memory snapshot from/to the
-kernel.  Such utilities are available, for example, from
-<http://suspend.sourceforge.net>.  You may want to have a look at them if you
-are going to develop your own suspend/resume utilities.
-
-The interface consists of a character device providing the open(),
-release(), read(), and write() operations as well as several ioctl()
-commands defined in include/linux/suspend_ioctls.h .  The major and minor
-numbers of the device are, respectively, 10 and 231, and they can
-be read from /sys/class/misc/snapshot/dev.
-
-The device can be open either for reading or for writing.  If open for
-reading, it is considered to be in the suspend mode.  Otherwise it is
-assumed to be in the resume mode.  The device cannot be open for simultaneous
-reading and writing.  It is also impossible to have the device open more than
-once at a time.
-
-Even opening the device has side effects. Data structures are
-allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
-called.
-
-The ioctl() commands recognized by the device are:
-
-SNAPSHOT_FREEZE - freeze user space processes (the current process is
-	not frozen); this is required for SNAPSHOT_CREATE_IMAGE
-	and SNAPSHOT_ATOMIC_RESTORE to succeed
-
-SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE
-
-SNAPSHOT_CREATE_IMAGE - create a snapshot of the system memory; the
-	last argument of ioctl() should be a pointer to an int variable,
-	the value of which will indicate whether the call returned after
-	creating the snapshot (1) or after restoring the system memory state
-	from it (0) (after resume the system finds itself finishing the
-	SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot
-	has been created the read() operation can be used to transfer
-	it out of the kernel
-
-SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the
-	uploaded snapshot image; before calling it you should transfer
-	the system memory snapshot back to the kernel using the write()
-	operation; this call will not succeed if the snapshot
-	image is not available to the kernel
-
-SNAPSHOT_FREE - free memory allocated for the snapshot image
-
-SNAPSHOT_PREF_IMAGE_SIZE - set the preferred maximum size of the image
-	(the kernel will do its best to ensure the image size will not exceed
-	this number, but if it turns out to be impossible, the kernel will
-	create the smallest image possible)
-
-SNAPSHOT_GET_IMAGE_SIZE - return the actual size of the hibernation image
-
-SNAPSHOT_AVAIL_SWAP_SIZE - return the amount of available swap in bytes (the
-	last argument should be a pointer to an unsigned int variable that will
-	contain the result if the call is successful).
-
-SNAPSHOT_ALLOC_SWAP_PAGE - allocate a swap page from the resume partition
-	(the last argument should be a pointer to a loff_t variable that
-	will contain the swap page offset if the call is successful)
-
-SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated by
-	SNAPSHOT_ALLOC_SWAP_PAGE
-
-SNAPSHOT_SET_SWAP_AREA - set the resume partition and the offset (in <PAGE_SIZE>
-	units) from the beginning of the partition at which the swap header is
-	located (the last ioctl() argument should point to a struct
-	resume_swap_area, as defined in kernel/power/suspend_ioctls.h,
-	containing the resume device specification and the offset); for swap
-	partitions the offset is always 0, but it is different from zero for
-	swap files (see Documentation/power/swsusp-and-swap-files.txt for
-	details).
-
-SNAPSHOT_PLATFORM_SUPPORT - enable/disable the hibernation platform support,
-	depending on the argument value (enable, if the argument is nonzero)
-
-SNAPSHOT_POWER_OFF - make the kernel transition the system to the hibernation
-	state (eg. ACPI S4) using the platform (eg. ACPI) driver
-
-SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to
-	immediately enter the suspend-to-RAM state, so this call must always
-	be preceded by the SNAPSHOT_FREEZE call and it is also necessary
-	to use the SNAPSHOT_UNFREEZE call after the system wakes up.  This call
-	is needed to implement the suspend-to-both mechanism in which the
-	suspend image is first created, as though the system had been suspended
-	to disk, and then the system is suspended to RAM (this makes it possible
-	to resume the system from RAM if there's enough battery power or restore
-	its state on the basis of the saved suspend image otherwise)
-
-The device's read() operation can be used to transfer the snapshot image from
-the kernel.  It has the following limitations:
-- you cannot read() more than one virtual memory page at a time
-- read()s across page boundaries are impossible (ie. if you read() 1/2 of
-	a page in the previous call, you will only be able to read()
-	_at_ _most_ 1/2 of the page in the next call)
-
-The device's write() operation is used for uploading the system memory snapshot
-into the kernel.  It has the same limitations as the read() operation.
-
-The release() operation frees all memory allocated for the snapshot image
-and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any).
-Thus it is not necessary to use either SNAPSHOT_FREE or
-SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
-unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
-still frozen when the device is being closed).
-
-Currently it is assumed that the userland utilities reading/writing the
-snapshot image from/to the kernel will use a swap partition, called the resume
-partition, or a swap file as storage space (if a swap file is used, the resume
-partition is the partition that holds this file).  However, this is not really
-required, as they can use, for example, a special (blank) suspend partition or
-a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and
-mounted afterwards.
-
-These utilities MUST NOT make any assumptions regarding the ordering of
-data within the snapshot image.  The contents of the image are entirely owned
-by the kernel and its structure may be changed in future kernel releases.
-
-The snapshot image MUST be written to the kernel unaltered (ie. all of the image
-data, metadata and header MUST be written in _exactly_ the same amount, form
-and order in which they have been read).  Otherwise, the behavior of the
-resumed system may be totally unpredictable.
-
-While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
-structure of the snapshot image is consistent with the information stored
-in the image header.  If any inconsistencies are detected,
-SNAPSHOT_ATOMIC_RESTORE will not succeed.  Still, this is not a fool-proof
-mechanism and the userland utilities using the interface SHOULD use additional
-means, such as checksums, to ensure the integrity of the snapshot image.
-
-The suspending and resuming utilities MUST lock themselves in memory,
-preferably using mlockall(), before calling SNAPSHOT_FREEZE.
-
-The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE
-in the memory location pointed to by the last argument of ioctl() and proceed
-in accordance with it:
-1. 	If the value is 1 (ie. the system memory snapshot has just been
-	created and the system is ready for saving it):
-	(a)	The suspending utility MUST NOT close the snapshot device
-		_unless_ the whole suspend procedure is to be cancelled, in
-		which case, if the snapshot image has already been saved, the
-		suspending utility SHOULD destroy it, preferably by zapping
-		its header.  If the suspend is not to be cancelled, the
-		system MUST be powered off or rebooted after the snapshot
-		image has been saved.
-	(b)	The suspending utility SHOULD NOT attempt to perform any
-		file system operations (including reads) on the file systems
-		that were mounted before SNAPSHOT_CREATE_IMAGE has been
-		called.  However, it MAY mount a file system that was not
-		mounted at that time and perform some operations on it (eg.
-		use it for saving the image).
-2.	If the value is 0 (ie. the system state has just been restored from
-	the snapshot image), the suspending utility MUST close the snapshot
-	device.  Afterwards it will be treated as a regular userland process,
-	so it need not exit.
-
-The resuming utility SHOULD NOT attempt to mount any file systems that could
-be mounted before suspend and SHOULD NOT attempt to perform any operations
-involving such file systems.
-
-For details, please refer to the source code.
diff --git a/Documentation/power/video.rst b/Documentation/power/video.rst
new file mode 100644
index 000000000000..337a2ba9f32f
--- /dev/null
+++ b/Documentation/power/video.rst
@@ -0,0 +1,213 @@
+===========================
+Video issues with S3 resume
+===========================
+
+2003-2006, Pavel Machek
+
+During S3 resume, hardware needs to be reinitialized. For most
+devices, this is easy, and kernel driver knows how to do
+it. Unfortunately there's one exception: video card. Those are usually
+initialized by BIOS, and kernel does not have enough information to
+boot video card. (Kernel usually does not even contain video card
+driver -- vesafb and vgacon are widely used).
+
+This is not problem for swsusp, because during swsusp resume, BIOS is
+run normally so video card is normally initialized. It should not be
+problem for S1 standby, because hardware should retain its state over
+that.
+
+We either have to run video BIOS during early resume, or interpret it
+using vbetool later, or maybe nothing is necessary on particular
+system because video state is preserved. Unfortunately different
+methods work on different systems, and no known method suits all of
+them.
+
+Userland application called s2ram has been developed; it contains long
+whitelist of systems, and automatically selects working method for a
+given system. It can be downloaded from CVS at
+www.sf.net/projects/suspend . If you get a system that is not in the
+whitelist, please try to find a working solution, and submit whitelist
+entry so that work does not need to be repeated.
+
+Currently, VBE_SAVE method (6 below) works on most
+systems. Unfortunately, vbetool only runs after userland is resumed,
+so it makes debugging of early resume problems
+hard/impossible. Methods that do not rely on userland are preferable.
+
+Details
+~~~~~~~
+
+There are a few types of systems where video works after S3 resume:
+
+(1) systems where video state is preserved over S3.
+
+(2) systems where it is possible to call the video BIOS during S3
+    resume. Unfortunately, it is not correct to call the video BIOS at
+    that point, but it happens to work on some machines. Use
+    acpi_sleep=s3_bios.
+
+(3) systems that initialize video card into vga text mode and where
+    the BIOS works well enough to be able to set video mode. Use
+    acpi_sleep=s3_mode on these.
+
+(4) on some systems s3_bios kicks video into text mode, and
+    acpi_sleep=s3_bios,s3_mode is needed.
+
+(5) radeon systems, where X can soft-boot your video card. You'll need
+    a new enough X, and a plain text console (no vesafb or radeonfb). See
+    http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information.
+    Alternatively, you should use vbetool (6) instead.
+
+(6) other radeon systems, where vbetool is enough to bring system back
+    to life. It needs text console to be working. Do vbetool vbestate
+    save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool
+    vbestate restore < /tmp/delme; setfont <whatever>, and your video
+    should work.
+
+(7) on some systems, it is possible to boot most of kernel, and then
+    POSTing bios works. Ole Rohne has patch to do just that at
+    http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2.
+
+(8) on some systems, you can use the video_post utility and or
+    do echo 3 > /sys/power/state  && /usr/sbin/video_post - which will
+    initialize the display in console mode. If you are in X, you can switch
+    to a virtual terminal and back to X using  CTRL+ALT+F1 - CTRL+ALT+F7 to get
+    the display working in graphical mode again.
+
+Now, if you pass acpi_sleep=something, and it does not work with your
+bios, you'll get a hard crash during resume. Be careful. Also it is
+safest to do your experiments with plain old VGA console. The vesafb
+and radeonfb (etc) drivers have a tendency to crash the machine during
+resume.
+
+You may have a system where none of above works. At that point you
+either invent another ugly hack that works, or write proper driver for
+your video card (good luck getting docs :-(). Maybe suspending from X
+(proper X, knowing your hardware, not XF68_FBcon) might have better
+chance of working.
+
+Table of known working notebooks:
+
+
+=============================== ===============================================
+Model                           hack (or "how to do it")
+=============================== ===============================================
+Acer Aspire 1406LC		ole's late BIOS init (7), turn off DRI
+Acer TM 230			s3_bios (2)
+Acer TM 242FX			vbetool (6)
+Acer TM C110			video_post (8)
+Acer TM C300                    vga=normal (only suspend on console, not in X),
+				vbetool (6) or video_post (8)
+Acer TM 4052LCi		        s3_bios (2)
+Acer TM 636Lci			s3_bios,s3_mode (4)
+Acer TM 650 (Radeon M7)		vga=normal plus boot-radeon (5) gets text
+				console back
+Acer TM 660			??? [#f1]_
+Acer TM 800			vga=normal, X patches, see webpage (5)
+				or vbetool (6)
+Acer TM 803			vga=normal, X patches, see webpage (5)
+				or vbetool (6)
+Acer TM 803LCi			vga=normal, vbetool (6)
+Arima W730a			vbetool needed (6)
+Asus L2400D                     s3_mode (3) [#f2]_ (S1 also works OK)
+Asus L3350M (SiS 740)           (6)
+Asus L3800C (Radeon M7)		s3_bios (2) (S1 also works OK)
+Asus M6887Ne			vga=normal, s3_bios (2), use radeon driver
+				instead of fglrx in x.org
+Athlon64 desktop prototype	s3_bios (2)
+Compal CL-50			??? [#f1]_
+Compaq Armada E500 - P3-700     none (1) (S1 also works OK)
+Compaq Evo N620c		vga=normal, s3_bios (2)
+Dell 600m, ATI R250 Lf		none (1), but needs xorg-x11-6.8.1.902-1
+Dell D600, ATI RV250            vga=normal and X, or try vbestate (6)
+Dell D610			vga=normal and X (possibly vbestate (6) too,
+				but not tested)
+Dell Inspiron 4000		??? [#f1]_
+Dell Inspiron 500m		??? [#f1]_
+Dell Inspiron 510m		???
+Dell Inspiron 5150		vbetool needed (6)
+Dell Inspiron 600m		??? [#f1]_
+Dell Inspiron 8200		??? [#f1]_
+Dell Inspiron 8500		??? [#f1]_
+Dell Inspiron 8600		??? [#f1]_
+eMachines athlon64 machines	vbetool needed (6) (someone please get
+				me model #s)
+HP NC6000			s3_bios, may not use radeonfb (2);
+				or vbetool (6)
+HP NX7000			??? [#f1]_
+HP Pavilion ZD7000		vbetool post needed, need open-source nv
+				driver for X
+HP Omnibook XE3	athlon version	none (1)
+HP Omnibook XE3GC		none (1), video is S3 Savage/IX-MV
+HP Omnibook XE3L-GF		vbetool (6)
+HP Omnibook 5150		none (1), (S1 also works OK)
+IBM TP T20, model 2647-44G	none (1), video is S3 Inc. 86C270-294
+				Savage/IX-MV, vesafb gets "interesting"
+				but X work.
+IBM TP A31 / Type 2652-M5G      s3_mode (3) [works ok with
+				BIOS 1.04 2002-08-23, but not at all with
+				BIOS 1.11 2004-11-05 :-(]
+IBM TP R32 / Type 2658-MMG      none (1)
+IBM TP R40 2722B3G		??? [#f1]_
+IBM TP R50p / Type 1832-22U     s3_bios (2)
+IBM TP R51			none (1)
+IBM TP T30	236681A		??? [#f1]_
+IBM TP T40 / Type 2373-MU4      none (1)
+IBM TP T40p			none (1)
+IBM TP R40p			s3_bios (2)
+IBM TP T41p			s3_bios (2), switch to X after resume
+IBM TP T42			s3_bios (2)
+IBM ThinkPad T42p (2373-GTG)	s3_bios (2)
+IBM TP X20			??? [#f1]_
+IBM TP X30			s3_bios, s3_mode (4)
+IBM TP X31 / Type 2672-XXH      none (1), use radeontool
+				(http://fdd.com/software/radeon/) to
+				turn off backlight.
+IBM TP X32			none (1), but backlight is on and video is
+				trashed after long suspend. s3_bios,
+				s3_mode (4) works too. Perhaps that gets
+				better results?
+IBM Thinkpad X40 Type 2371-7JG  s3_bios,s3_mode (4)
+IBM TP 600e			none(1), but a switch to console and
+				back to X is needed
+Medion MD4220			??? [#f1]_
+Samsung P35			vbetool needed (6)
+Sharp PC-AR10 (ATI rage)	none (1), backlight does not switch off
+Sony Vaio PCG-C1VRX/K		s3_bios (2)
+Sony Vaio PCG-F403		??? [#f1]_
+Sony Vaio PCG-GRT995MP		none (1), works with 'nv' X driver
+Sony Vaio PCG-GR7/K		none (1), but needs radeonfb, use
+				radeontool (http://fdd.com/software/radeon/)
+				to turn off backlight.
+Sony Vaio PCG-N505SN		??? [#f1]_
+Sony Vaio vgn-s260		X or boot-radeon can init it (5)
+Sony Vaio vgn-S580BH		vga=normal, but suspend from X. Console will
+				be blank unless you return to X.
+Sony Vaio vgn-FS115B		s3_bios (2),s3_mode (4)
+Toshiba Libretto L5		none (1)
+Toshiba Libretto 100CT/110CT    vbetool (6)
+Toshiba Portege 3020CT		s3_mode (3)
+Toshiba Satellite 4030CDT	s3_mode (3) (S1 also works OK)
+Toshiba Satellite 4080XCDT      s3_mode (3) (S1 also works OK)
+Toshiba Satellite 4090XCDT      ??? [#f1]_
+Toshiba Satellite P10-554       s3_bios,s3_mode (4)[#f3]_
+Toshiba M30                     (2) xor X with nvidia driver using internal AGP
+Uniwill 244IIO			??? [#f1]_
+=============================== ===============================================
+
+Known working desktop systems
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+=================== ============================= ========================
+Mainboard	    Graphics card                 hack (or "how to do it")
+=================== ============================= ========================
+Asus A7V8X	    nVidia RIVA TNT2 model 64	  s3_bios,s3_mode (4)
+=================== ============================= ========================
+
+
+.. [#f1] from https://wiki.ubuntu.com/HoaryPMResults, not sure
+         which options to use. If you know, please tell me.
+
+.. [#f2] To be tested with a newer kernel.
+
+.. [#f3] Not with SMP kernel, UP only.
diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt
deleted file mode 100644
index 3e6272bc4472..000000000000
--- a/Documentation/power/video.txt
+++ /dev/null
@@ -1,185 +0,0 @@
-
-		Video issues with S3 resume
-		~~~~~~~~~~~~~~~~~~~~~~~~~~~
-		  2003-2006, Pavel Machek
-
-During S3 resume, hardware needs to be reinitialized. For most
-devices, this is easy, and kernel driver knows how to do
-it. Unfortunately there's one exception: video card. Those are usually
-initialized by BIOS, and kernel does not have enough information to
-boot video card. (Kernel usually does not even contain video card
-driver -- vesafb and vgacon are widely used).
-
-This is not problem for swsusp, because during swsusp resume, BIOS is
-run normally so video card is normally initialized. It should not be
-problem for S1 standby, because hardware should retain its state over
-that.
-
-We either have to run video BIOS during early resume, or interpret it
-using vbetool later, or maybe nothing is necessary on particular
-system because video state is preserved. Unfortunately different
-methods work on different systems, and no known method suits all of
-them.
-
-Userland application called s2ram has been developed; it contains long
-whitelist of systems, and automatically selects working method for a
-given system. It can be downloaded from CVS at
-www.sf.net/projects/suspend . If you get a system that is not in the
-whitelist, please try to find a working solution, and submit whitelist
-entry so that work does not need to be repeated.
-
-Currently, VBE_SAVE method (6 below) works on most
-systems. Unfortunately, vbetool only runs after userland is resumed,
-so it makes debugging of early resume problems
-hard/impossible. Methods that do not rely on userland are preferable.
-
-Details
-~~~~~~~
-
-There are a few types of systems where video works after S3 resume:
-
-(1) systems where video state is preserved over S3.
-
-(2) systems where it is possible to call the video BIOS during S3
-  resume. Unfortunately, it is not correct to call the video BIOS at
-  that point, but it happens to work on some machines. Use
-  acpi_sleep=s3_bios.
-
-(3) systems that initialize video card into vga text mode and where
-  the BIOS works well enough to be able to set video mode. Use
-  acpi_sleep=s3_mode on these.
-
-(4) on some systems s3_bios kicks video into text mode, and
-  acpi_sleep=s3_bios,s3_mode is needed.
-
-(5) radeon systems, where X can soft-boot your video card. You'll need
-  a new enough X, and a plain text console (no vesafb or radeonfb). See
-  http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information.
-  Alternatively, you should use vbetool (6) instead.
-
-(6) other radeon systems, where vbetool is enough to bring system back
-  to life. It needs text console to be working. Do vbetool vbestate
-  save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool
-  vbestate restore < /tmp/delme; setfont <whatever>, and your video
-  should work.
-
-(7) on some systems, it is possible to boot most of kernel, and then
-  POSTing bios works. Ole Rohne has patch to do just that at
-  http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2.
-
-(8) on some systems, you can use the video_post utility and or 
-  do echo 3 > /sys/power/state  && /usr/sbin/video_post - which will 
-  initialize the display in console mode. If you are in X, you can switch
-  to a virtual terminal and back to X using  CTRL+ALT+F1 - CTRL+ALT+F7 to get
-  the display working in graphical mode again.
-
-Now, if you pass acpi_sleep=something, and it does not work with your
-bios, you'll get a hard crash during resume. Be careful. Also it is
-safest to do your experiments with plain old VGA console. The vesafb
-and radeonfb (etc) drivers have a tendency to crash the machine during
-resume.
-
-You may have a system where none of above works. At that point you
-either invent another ugly hack that works, or write proper driver for
-your video card (good luck getting docs :-(). Maybe suspending from X
-(proper X, knowing your hardware, not XF68_FBcon) might have better
-chance of working.
-
-Table of known working notebooks:
-
-Model                           hack (or "how to do it")
-------------------------------------------------------------------------------
-Acer Aspire 1406LC		ole's late BIOS init (7), turn off DRI
-Acer TM 230			s3_bios (2)
-Acer TM 242FX			vbetool (6)
-Acer TM C110			video_post (8)
-Acer TM C300                    vga=normal (only suspend on console, not in X), vbetool (6) or video_post (8)
-Acer TM 4052LCi		        s3_bios (2)
-Acer TM 636Lci			s3_bios,s3_mode (4)
-Acer TM 650 (Radeon M7)		vga=normal plus boot-radeon (5) gets text console back
-Acer TM 660			??? (*)
-Acer TM 800			vga=normal, X patches, see webpage (5) or vbetool (6)
-Acer TM 803			vga=normal, X patches, see webpage (5) or vbetool (6)
-Acer TM 803LCi			vga=normal, vbetool (6)
-Arima W730a			vbetool needed (6)
-Asus L2400D                     s3_mode (3)(***) (S1 also works OK)
-Asus L3350M (SiS 740)           (6)
-Asus L3800C (Radeon M7)		s3_bios (2) (S1 also works OK)
-Asus M6887Ne			vga=normal, s3_bios (2), use radeon driver instead of fglrx in x.org
-Athlon64 desktop prototype	s3_bios (2)
-Compal CL-50			??? (*)
-Compaq Armada E500 - P3-700     none (1) (S1 also works OK)
-Compaq Evo N620c		vga=normal, s3_bios (2)
-Dell 600m, ATI R250 Lf		none (1), but needs xorg-x11-6.8.1.902-1
-Dell D600, ATI RV250            vga=normal and X, or try vbestate (6)
-Dell D610			vga=normal and X (possibly vbestate (6) too, but not tested)
-Dell Inspiron 4000		??? (*)
-Dell Inspiron 500m		??? (*)
-Dell Inspiron 510m		???
-Dell Inspiron 5150		vbetool needed (6)
-Dell Inspiron 600m		??? (*)
-Dell Inspiron 8200		??? (*)
-Dell Inspiron 8500		??? (*)
-Dell Inspiron 8600		??? (*)
-eMachines athlon64 machines	vbetool needed (6) (someone please get me model #s)
-HP NC6000			s3_bios, may not use radeonfb (2); or vbetool (6)
-HP NX7000			??? (*)
-HP Pavilion ZD7000		vbetool post needed, need open-source nv driver for X
-HP Omnibook XE3	athlon version	none (1)
-HP Omnibook XE3GC		none (1), video is S3 Savage/IX-MV
-HP Omnibook XE3L-GF		vbetool (6)
-HP Omnibook 5150		none (1), (S1 also works OK)
-IBM TP T20, model 2647-44G	none (1), video is S3 Inc. 86C270-294 Savage/IX-MV, vesafb gets "interesting" but X work.
-IBM TP A31 / Type 2652-M5G      s3_mode (3) [works ok with BIOS 1.04 2002-08-23, but not at all with BIOS 1.11 2004-11-05 :-(]
-IBM TP R32 / Type 2658-MMG      none (1)
-IBM TP R40 2722B3G		??? (*)
-IBM TP R50p / Type 1832-22U     s3_bios (2)
-IBM TP R51			none (1)
-IBM TP T30	236681A		??? (*)
-IBM TP T40 / Type 2373-MU4      none (1)
-IBM TP T40p			none (1)
-IBM TP R40p			s3_bios (2)
-IBM TP T41p			s3_bios (2), switch to X after resume
-IBM TP T42			s3_bios (2)
-IBM ThinkPad T42p (2373-GTG)	s3_bios (2)
-IBM TP X20			??? (*)
-IBM TP X30			s3_bios, s3_mode (4)
-IBM TP X31 / Type 2672-XXH      none (1), use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
-IBM TP X32			none (1), but backlight is on and video is trashed after long suspend. s3_bios,s3_mode (4) works too. Perhaps that gets better results?
-IBM Thinkpad X40 Type 2371-7JG  s3_bios,s3_mode (4)
-IBM TP 600e			none(1), but a switch to console and back to X is needed
-Medion MD4220			??? (*)
-Samsung P35			vbetool needed (6)
-Sharp PC-AR10 (ATI rage)	none (1), backlight does not switch off
-Sony Vaio PCG-C1VRX/K		s3_bios (2)
-Sony Vaio PCG-F403		??? (*)
-Sony Vaio PCG-GRT995MP		none (1), works with 'nv' X driver
-Sony Vaio PCG-GR7/K		none (1), but needs radeonfb, use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
-Sony Vaio PCG-N505SN		??? (*)
-Sony Vaio vgn-s260		X or boot-radeon can init it (5)
-Sony Vaio vgn-S580BH		vga=normal, but suspend from X. Console will be blank unless you return to X.
-Sony Vaio vgn-FS115B		s3_bios (2),s3_mode (4)
-Toshiba Libretto L5		none (1)
-Toshiba Libretto 100CT/110CT    vbetool (6)
-Toshiba Portege 3020CT		s3_mode (3)
-Toshiba Satellite 4030CDT	s3_mode (3) (S1 also works OK)
-Toshiba Satellite 4080XCDT      s3_mode (3) (S1 also works OK)
-Toshiba Satellite 4090XCDT      ??? (*)
-Toshiba Satellite P10-554       s3_bios,s3_mode (4)(****)
-Toshiba M30                     (2) xor X with nvidia driver using internal AGP
-Uniwill 244IIO			??? (*)
-
-Known working desktop systems
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Mainboard	    Graphics card                 hack (or "how to do it")
-------------------------------------------------------------------------------
-Asus A7V8X	    nVidia RIVA TNT2 model 64	  s3_bios,s3_mode (4)
-
-
-(*) from https://wiki.ubuntu.com/HoaryPMResults, not sure
-    which options to use. If you know, please tell me.
-
-(***) To be tested with a newer kernel.
-
-(****) Not with SMP kernel, UP only.
diff --git a/Documentation/powerpc/DAWR-POWER9.txt b/Documentation/powerpc/DAWR-POWER9.txt
deleted file mode 100644
index ecdbb076438c..000000000000
--- a/Documentation/powerpc/DAWR-POWER9.txt
+++ /dev/null
@@ -1,90 +0,0 @@
-DAWR issues on POWER9
-============================
-
-On POWER9 the Data Address Watchpoint Register (DAWR) can cause a checkstop
-if it points to cache inhibited (CI) memory. Currently Linux has no way to
-disinguish CI memory when configuring the DAWR, so (for now) the DAWR is
-disabled by this commit:
-
-    commit 9654153158d3e0684a1bdb76dbababdb7111d5a0
-    Author: Michael Neuling <mikey@neuling.org>
-    Date:   Tue Mar 27 15:37:24 2018 +1100
-    powerpc: Disable DAWR in the base POWER9 CPU features
-
-Technical Details:
-============================
-
-DAWR has 6 different ways of being set.
-1) ptrace
-2) h_set_mode(DAWR)
-3) h_set_dabr()
-4) kvmppc_set_one_reg()
-5) xmon
-
-For ptrace, we now advertise zero breakpoints on POWER9 via the
-PPC_PTRACE_GETHWDBGINFO call. This results in GDB falling back to
-software emulation of the watchpoint (which is slow).
-
-h_set_mode(DAWR) and h_set_dabr() will now return an error to the
-guest on a POWER9 host. Current Linux guests ignore this error, so
-they will silently not get the DAWR.
-
-kvmppc_set_one_reg() will store the value in the vcpu but won't
-actually set it on POWER9 hardware. This is done so we don't break
-migration from POWER8 to POWER9, at the cost of silently losing the
-DAWR on the migration.
-
-For xmon, the 'bd' command will return an error on P9.
-
-Consequences for users
-============================
-
-For GDB watchpoints (ie 'watch' command) on POWER9 bare metal , GDB
-will accept the command. Unfortunately since there is no hardware
-support for the watchpoint, GDB will software emulate the watchpoint
-making it run very slowly.
-
-The same will also be true for any guests started on a POWER9
-host. The watchpoint will fail and GDB will fall back to software
-emulation.
-
-If a guest is started on a POWER8 host, GDB will accept the watchpoint
-and configure the hardware to use the DAWR. This will run at full
-speed since it can use the hardware emulation. Unfortunately if this
-guest is migrated to a POWER9 host, the watchpoint will be lost on the
-POWER9. Loads and stores to the watchpoint locations will not be
-trapped in GDB. The watchpoint is remembered, so if the guest is
-migrated back to the POWER8 host, it will start working again.
-
-Force enabling the DAWR
-=============================
-Kernels (since ~v5.2) have an option to force enable the DAWR via:
-
-  echo Y > /sys/kernel/debug/powerpc/dawr_enable_dangerous
-
-This enables the DAWR even on POWER9.
-
-This is a dangerous setting, USE AT YOUR OWN RISK.
-
-Some users may not care about a bad user crashing their box
-(ie. single user/desktop systems) and really want the DAWR.  This
-allows them to force enable DAWR.
-
-This flag can also be used to disable DAWR access. Once this is
-cleared, all DAWR access should be cleared immediately and your
-machine once again safe from crashing.
-
-Userspace may get confused by toggling this. If DAWR is force
-enabled/disabled between getting the number of breakpoints (via
-PTRACE_GETHWDBGINFO) and setting the breakpoint, userspace will get an
-inconsistent view of what's available. Similarly for guests.
-
-For the DAWR to be enabled in a KVM guest, the DAWR needs to be force
-enabled in the host AND the guest. For this reason, this won't work on
-POWERVM as it doesn't allow the HCALL to work. Writes of 'Y' to the
-dawr_enable_dangerous file will fail if the hypervisor doesn't support
-writing the DAWR.
-
-To double check the DAWR is working, run this kernel selftest:
-  tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
-Any errors/failures/skips mean something is wrong.
diff --git a/Documentation/powerpc/bootwrapper.rst b/Documentation/powerpc/bootwrapper.rst
new file mode 100644
index 000000000000..a6292afba573
--- /dev/null
+++ b/Documentation/powerpc/bootwrapper.rst
@@ -0,0 +1,155 @@
+========================
+The PowerPC boot wrapper
+========================
+
+Copyright (C) Secret Lab Technologies Ltd.
+
+PowerPC image targets compresses and wraps the kernel image (vmlinux) with
+a boot wrapper to make it usable by the system firmware.  There is no
+standard PowerPC firmware interface, so the boot wrapper is designed to
+be adaptable for each kind of image that needs to be built.
+
+The boot wrapper can be found in the arch/powerpc/boot/ directory.  The
+Makefile in that directory has targets for all the available image types.
+The different image types are used to support all of the various firmware
+interfaces found on PowerPC platforms.  OpenFirmware is the most commonly
+used firmware type on general purpose PowerPC systems from Apple, IBM and
+others.  U-Boot is typically found on embedded PowerPC hardware, but there
+are a handful of other firmware implementations which are also popular.  Each
+firmware interface requires a different image format.
+
+The boot wrapper is built from the makefile in arch/powerpc/boot/Makefile and
+it uses the wrapper script (arch/powerpc/boot/wrapper) to generate target
+image.  The details of the build system is discussed in the next section.
+Currently, the following image format targets exist:
+
+   ==================== ========================================================
+   cuImage.%:		Backwards compatible uImage for older version of
+			U-Boot (for versions that don't understand the device
+			tree).  This image embeds a device tree blob inside
+			the image.  The boot wrapper, kernel and device tree
+			are all embedded inside the U-Boot uImage file format
+			with boot wrapper code that extracts data from the old
+			bd_info structure and loads the data into the device
+			tree before jumping into the kernel.
+
+			Because of the series of #ifdefs found in the
+			bd_info structure used in the old U-Boot interfaces,
+			cuImages are platform specific.  Each specific
+			U-Boot platform has a different platform init file
+			which populates the embedded device tree with data
+			from the platform specific bd_info file.  The platform
+			specific cuImage platform init code can be found in
+			`arch/powerpc/boot/cuboot.*.c`. Selection of the correct
+			cuImage init code for a specific board can be found in
+			the wrapper structure.
+
+   dtbImage.%:		Similar to zImage, except device tree blob is embedded
+			inside the image instead of provided by firmware.  The
+			output image file can be either an elf file or a flat
+			binary depending on the platform.
+
+			dtbImages are used on systems which do not have an
+			interface for passing a device tree directly.
+			dtbImages are similar to simpleImages except that
+			dtbImages have platform specific code for extracting
+			data from the board firmware, but simpleImages do not
+			talk to the firmware at all.
+
+			PlayStation 3 support uses dtbImage.  So do Embedded
+			Planet boards using the PlanetCore firmware.  Board
+			specific initialization code is typically found in a
+			file named arch/powerpc/boot/<platform>.c; but this
+			can be overridden by the wrapper script.
+
+   simpleImage.%:	Firmware independent compressed image that does not
+			depend on any particular firmware interface and embeds
+			a device tree blob.  This image is a flat binary that
+			can be loaded to any location in RAM and jumped to.
+			Firmware cannot pass any configuration data to the
+			kernel with this image type and it depends entirely on
+			the embedded device tree for all information.
+
+			The simpleImage is useful for booting systems with
+			an unknown firmware interface or for booting from
+			a debugger when no firmware is present (such as on
+			the Xilinx Virtex platform).  The only assumption that
+			simpleImage makes is that RAM is correctly initialized
+			and that the MMU is either off or has RAM mapped to
+			base address 0.
+
+			simpleImage also supports inserting special platform
+			specific initialization code to the start of the bootup
+			sequence.  The virtex405 platform uses this feature to
+			ensure that the cache is invalidated before caching
+			is enabled.  Platform specific initialization code is
+			added as part of the wrapper script and is keyed on
+			the image target name.  For example, all
+			simpleImage.virtex405-* targets will add the
+			virtex405-head.S initialization code (This also means
+			that the dts file for virtex405 targets should be
+			named (virtex405-<board>.dts).  Search the wrapper
+			script for 'virtex405' and see the file
+			arch/powerpc/boot/virtex405-head.S for details.
+
+   treeImage.%;		Image format for used with OpenBIOS firmware found
+			on some ppc4xx hardware.  This image embeds a device
+			tree blob inside the image.
+
+   uImage:		Native image format used by U-Boot.  The uImage target
+			does not add any boot code.  It just wraps a compressed
+			vmlinux in the uImage data structure.  This image
+			requires a version of U-Boot that is able to pass
+			a device tree to the kernel at boot.  If using an older
+			version of U-Boot, then you need to use a cuImage
+			instead.
+
+   zImage.%:		Image format which does not embed a device tree.
+			Used by OpenFirmware and other firmware interfaces
+			which are able to supply a device tree.  This image
+			expects firmware to provide the device tree at boot.
+			Typically, if you have general purpose PowerPC
+			hardware then you want this image format.
+   ==================== ========================================================
+
+Image types which embed a device tree blob (simpleImage, dtbImage, treeImage,
+and cuImage) all generate the device tree blob from a file in the
+arch/powerpc/boot/dts/ directory.  The Makefile selects the correct device
+tree source based on the name of the target.  Therefore, if the kernel is
+built with 'make treeImage.walnut simpleImage.virtex405-ml403', then the
+build system will use arch/powerpc/boot/dts/walnut.dts to build
+treeImage.walnut and arch/powerpc/boot/dts/virtex405-ml403.dts to build
+the simpleImage.virtex405-ml403.
+
+Two special targets called 'zImage' and 'zImage.initrd' also exist.  These
+targets build all the default images as selected by the kernel configuration.
+Default images are selected by the boot wrapper Makefile
+(arch/powerpc/boot/Makefile) by adding targets to the $image-y variable.  Look
+at the Makefile to see which default image targets are available.
+
+How it is built
+---------------
+arch/powerpc is designed to support multiplatform kernels, which means
+that a single vmlinux image can be booted on many different target boards.
+It also means that the boot wrapper must be able to wrap for many kinds of
+images on a single build.  The design decision was made to not use any
+conditional compilation code (#ifdef, etc) in the boot wrapper source code.
+All of the boot wrapper pieces are buildable at any time regardless of the
+kernel configuration.  Building all the wrapper bits on every kernel build
+also ensures that obscure parts of the wrapper are at the very least compile
+tested in a large variety of environments.
+
+The wrapper is adapted for different image types at link time by linking in
+just the wrapper bits that are appropriate for the image type.  The 'wrapper
+script' (found in arch/powerpc/boot/wrapper) is called by the Makefile and
+is responsible for selecting the correct wrapper bits for the image type.
+The arguments are well documented in the script's comment block, so they
+are not repeated here.  However, it is worth mentioning that the script
+uses the -p (platform) argument as the main method of deciding which wrapper
+bits to compile in.  Look for the large 'case "$platform" in' block in the
+middle of the script.  This is also the place where platform specific fixups
+can be selected by changing the link order.
+
+In particular, care should be taken when working with cuImages.  cuImage
+wrapper bits are very board specific and care should be taken to make sure
+the target you are trying to build is supported by the wrapper bits.
diff --git a/Documentation/powerpc/bootwrapper.txt b/Documentation/powerpc/bootwrapper.txt
deleted file mode 100644
index d60fced5e1cc..000000000000
--- a/Documentation/powerpc/bootwrapper.txt
+++ /dev/null
@@ -1,141 +0,0 @@
-The PowerPC boot wrapper
-------------------------
-Copyright (C) Secret Lab Technologies Ltd.
-
-PowerPC image targets compresses and wraps the kernel image (vmlinux) with
-a boot wrapper to make it usable by the system firmware.  There is no
-standard PowerPC firmware interface, so the boot wrapper is designed to
-be adaptable for each kind of image that needs to be built.
-
-The boot wrapper can be found in the arch/powerpc/boot/ directory.  The
-Makefile in that directory has targets for all the available image types.
-The different image types are used to support all of the various firmware
-interfaces found on PowerPC platforms.  OpenFirmware is the most commonly
-used firmware type on general purpose PowerPC systems from Apple, IBM and
-others.  U-Boot is typically found on embedded PowerPC hardware, but there
-are a handful of other firmware implementations which are also popular.  Each
-firmware interface requires a different image format.
-
-The boot wrapper is built from the makefile in arch/powerpc/boot/Makefile and
-it uses the wrapper script (arch/powerpc/boot/wrapper) to generate target
-image.  The details of the build system is discussed in the next section.
-Currently, the following image format targets exist:
-
-   cuImage.%:		Backwards compatible uImage for older version of
-			U-Boot (for versions that don't understand the device
-			tree).  This image embeds a device tree blob inside
-			the image.  The boot wrapper, kernel and device tree
-			are all embedded inside the U-Boot uImage file format
-			with boot wrapper code that extracts data from the old
-			bd_info structure and loads the data into the device
-			tree before jumping into the kernel.
-			  Because of the series of #ifdefs found in the
-			bd_info structure used in the old U-Boot interfaces,
-			cuImages are platform specific.  Each specific
-			U-Boot platform has a different platform init file
-			which populates the embedded device tree with data
-			from the platform specific bd_info file.  The platform
-			specific cuImage platform init code can be found in
-			arch/powerpc/boot/cuboot.*.c.  Selection of the correct
-			cuImage init code for a specific board can be found in
-			the wrapper structure.
-   dtbImage.%:		Similar to zImage, except device tree blob is embedded
-			inside the image instead of provided by firmware.  The
-			output image file can be either an elf file or a flat
-			binary depending on the platform.
-			  dtbImages are used on systems which do not have an
-			interface for passing a device tree directly.
-			dtbImages are similar to simpleImages except that
-			dtbImages have platform specific code for extracting
-			data from the board firmware, but simpleImages do not
-			talk to the firmware at all.
-			  PlayStation 3 support uses dtbImage.  So do Embedded
-			Planet boards using the PlanetCore firmware.  Board
-			specific initialization code is typically found in a
-			file named arch/powerpc/boot/<platform>.c; but this
-			can be overridden by the wrapper script.
-   simpleImage.%:	Firmware independent compressed image that does not
-			depend on any particular firmware interface and embeds
-			a device tree blob.  This image is a flat binary that
-			can be loaded to any location in RAM and jumped to.
-			Firmware cannot pass any configuration data to the
-			kernel with this image type and it depends entirely on
-			the embedded device tree for all information.
-			  The simpleImage is useful for booting systems with
-			an unknown firmware interface or for booting from
-			a debugger when no firmware is present (such as on
-			the Xilinx Virtex platform).  The only assumption that
-			simpleImage makes is that RAM is correctly initialized
-			and that the MMU is either off or has RAM mapped to
-			base address 0.
-			  simpleImage also supports inserting special platform
-			specific initialization code to the start of the bootup
-			sequence.  The virtex405 platform uses this feature to
-			ensure that the cache is invalidated before caching
-			is enabled.  Platform specific initialization code is
-			added as part of the wrapper script and is keyed on
-			the image target name.  For example, all
-			simpleImage.virtex405-* targets will add the
-			virtex405-head.S initialization code (This also means
-			that the dts file for virtex405 targets should be
-			named (virtex405-<board>.dts).  Search the wrapper
-			script for 'virtex405' and see the file
-			arch/powerpc/boot/virtex405-head.S for details.
-   treeImage.%;		Image format for used with OpenBIOS firmware found
-			on some ppc4xx hardware.  This image embeds a device
-			tree blob inside the image.
-   uImage:		Native image format used by U-Boot.  The uImage target
-			does not add any boot code.  It just wraps a compressed
-			vmlinux in the uImage data structure.  This image
-			requires a version of U-Boot that is able to pass
-			a device tree to the kernel at boot.  If using an older
-			version of U-Boot, then you need to use a cuImage
-			instead.
-   zImage.%:		Image format which does not embed a device tree.
-			Used by OpenFirmware and other firmware interfaces
-			which are able to supply a device tree.  This image
-			expects firmware to provide the device tree at boot.
-			Typically, if you have general purpose PowerPC
-			hardware then you want this image format.
-
-Image types which embed a device tree blob (simpleImage, dtbImage, treeImage,
-and cuImage) all generate the device tree blob from a file in the
-arch/powerpc/boot/dts/ directory.  The Makefile selects the correct device
-tree source based on the name of the target.  Therefore, if the kernel is
-built with 'make treeImage.walnut simpleImage.virtex405-ml403', then the
-build system will use arch/powerpc/boot/dts/walnut.dts to build
-treeImage.walnut and arch/powerpc/boot/dts/virtex405-ml403.dts to build
-the simpleImage.virtex405-ml403.
-
-Two special targets called 'zImage' and 'zImage.initrd' also exist.  These
-targets build all the default images as selected by the kernel configuration.
-Default images are selected by the boot wrapper Makefile
-(arch/powerpc/boot/Makefile) by adding targets to the $image-y variable.  Look
-at the Makefile to see which default image targets are available.
-
-How it is built
----------------
-arch/powerpc is designed to support multiplatform kernels, which means
-that a single vmlinux image can be booted on many different target boards.
-It also means that the boot wrapper must be able to wrap for many kinds of
-images on a single build.  The design decision was made to not use any
-conditional compilation code (#ifdef, etc) in the boot wrapper source code.
-All of the boot wrapper pieces are buildable at any time regardless of the
-kernel configuration.  Building all the wrapper bits on every kernel build
-also ensures that obscure parts of the wrapper are at the very least compile
-tested in a large variety of environments.
-
-The wrapper is adapted for different image types at link time by linking in
-just the wrapper bits that are appropriate for the image type.  The 'wrapper
-script' (found in arch/powerpc/boot/wrapper) is called by the Makefile and
-is responsible for selecting the correct wrapper bits for the image type.
-The arguments are well documented in the script's comment block, so they
-are not repeated here.  However, it is worth mentioning that the script
-uses the -p (platform) argument as the main method of deciding which wrapper
-bits to compile in.  Look for the large 'case "$platform" in' block in the
-middle of the script.  This is also the place where platform specific fixups
-can be selected by changing the link order.
-
-In particular, care should be taken when working with cuImages.  cuImage
-wrapper bits are very board specific and care should be taken to make sure
-the target you are trying to build is supported by the wrapper bits.
diff --git a/Documentation/powerpc/cpu_families.rst b/Documentation/powerpc/cpu_families.rst
new file mode 100644
index 000000000000..1e063c5440c3
--- /dev/null
+++ b/Documentation/powerpc/cpu_families.rst
@@ -0,0 +1,222 @@
+============
+CPU Families
+============
+
+This document tries to summarise some of the different cpu families that exist
+and are supported by arch/powerpc.
+
+
+Book3S (aka sPAPR)
+------------------
+
+- Hash MMU
+- Mix of 32 & 64 bit::
+
+   +--------------+                 +----------------+
+   |  Old POWER   | --------------> | RS64 (threads) |
+   +--------------+                 +----------------+
+          |
+          |
+          v
+   +--------------+                 +----------------+      +------+
+   |     601      | --------------> |      603       | ---> | e300 |
+   +--------------+                 +----------------+      +------+
+          |                                 |
+          |                                 |
+          v                                 v
+   +--------------+                 +----------------+      +-------+
+   |     604      |                 |    750 (G3)    | ---> | 750CX |
+   +--------------+                 +----------------+      +-------+
+          |                                 |                   |
+          |                                 |                   |
+          v                                 v                   v
+   +--------------+                 +----------------+      +-------+
+   | 620 (64 bit) |                 |      7400      |      | 750CL |
+   +--------------+                 +----------------+      +-------+
+          |                                 |                   |
+          |                                 |                   |
+          v                                 v                   v
+   +--------------+                 +----------------+      +-------+
+   |  POWER3/630  |                 |      7410      |      | 750FX |
+   +--------------+                 +----------------+      +-------+
+          |                                 |
+          |                                 |
+          v                                 v
+   +--------------+                 +----------------+
+   |   POWER3+    |                 |      7450      |
+   +--------------+                 +----------------+
+          |                                 |
+          |                                 |
+          v                                 v
+   +--------------+                 +----------------+
+   |    POWER4    |                 |      7455      |
+   +--------------+                 +----------------+
+          |                                 |
+          |                                 |
+          v                                 v
+   +--------------+     +-------+   +----------------+
+   |   POWER4+    | --> |  970  |   |      7447      |
+   +--------------+     +-------+   +----------------+
+          |                 |               |
+          |                 |               |
+          v                 v               v
+   +--------------+     +-------+   +----------------+
+   |    POWER5    |     | 970FX |   |      7448      |
+   +--------------+     +-------+   +----------------+
+          |                 |               |
+          |                 |               |
+          v                 v               v
+   +--------------+     +-------+   +----------------+
+   |   POWER5+    |     | 970MP |   |      e600      |
+   +--------------+     +-------+   +----------------+
+          |
+          |
+          v
+   +--------------+
+   |   POWER5++   |
+   +--------------+
+          |
+          |
+          v
+   +--------------+       +-------+
+   |    POWER6    | <-?-> | Cell  |
+   +--------------+       +-------+
+          |
+          |
+          v
+   +--------------+
+   |    POWER7    |
+   +--------------+
+          |
+          |
+          v
+   +--------------+
+   |   POWER7+    |
+   +--------------+
+          |
+          |
+          v
+   +--------------+
+   |    POWER8    |
+   +--------------+
+
+
+   +---------------+
+   | PA6T (64 bit) |
+   +---------------+
+
+
+IBM BookE
+---------
+
+- Software loaded TLB.
+- All 32 bit::
+
+   +--------------+
+   |     401      |
+   +--------------+
+          |
+          |
+          v
+   +--------------+
+   |     403      |
+   +--------------+
+          |
+          |
+          v
+   +--------------+
+   |     405      |
+   +--------------+
+          |
+          |
+          v
+   +--------------+
+   |     440      |
+   +--------------+
+          |
+          |
+          v
+   +--------------+     +----------------+
+   |     450      | --> |      BG/P      |
+   +--------------+     +----------------+
+          |
+          |
+          v
+   +--------------+
+   |     460      |
+   +--------------+
+          |
+          |
+          v
+   +--------------+
+   |     476      |
+   +--------------+
+
+
+Motorola/Freescale 8xx
+----------------------
+
+- Software loaded with hardware assist.
+- All 32 bit::
+
+   +-------------+
+   | MPC8xx Core |
+   +-------------+
+
+
+Freescale BookE
+---------------
+
+- Software loaded TLB.
+- e6500 adds HW loaded indirect TLB entries.
+- Mix of 32 & 64 bit::
+
+   +--------------+
+   |     e200     |
+   +--------------+
+
+
+   +--------------------------------+
+   |              e500              |
+   +--------------------------------+
+                   |
+                   |
+                   v
+   +--------------------------------+
+   |             e500v2             |
+   +--------------------------------+
+                   |
+                   |
+                   v
+   +--------------------------------+
+   |        e500mc (Book3e)         |
+   +--------------------------------+
+                   |
+                   |
+                   v
+   +--------------------------------+
+   |          e5500 (64 bit)        |
+   +--------------------------------+
+                   |
+                   |
+                   v
+   +--------------------------------+
+   | e6500 (HW TLB) (Multithreaded) |
+   +--------------------------------+
+
+
+IBM A2 core
+-----------
+
+- Book3E, software loaded TLB + HW loaded indirect TLB entries.
+- 64 bit::
+
+   +--------------+     +----------------+
+   |   A2 core    | --> |      WSP       |
+   +--------------+     +----------------+
+           |
+           |
+           v
+   +--------------+
+   |     BG/Q     |
+   +--------------+
diff --git a/Documentation/powerpc/cpu_families.txt b/Documentation/powerpc/cpu_families.txt
deleted file mode 100644
index fc08e22feb1a..000000000000
--- a/Documentation/powerpc/cpu_families.txt
+++ /dev/null
@@ -1,221 +0,0 @@
-CPU Families
-============
-
-This document tries to summarise some of the different cpu families that exist
-and are supported by arch/powerpc.
-
-
-Book3S (aka sPAPR)
-------------------
-
- - Hash MMU
- - Mix of 32 & 64 bit
-
-   +--------------+                 +----------------+
-   |  Old POWER   | --------------> | RS64 (threads) |
-   +--------------+                 +----------------+
-          |
-          |
-          v
-   +--------------+                 +----------------+      +------+
-   |     601      | --------------> |      603       | ---> | e300 |
-   +--------------+                 +----------------+      +------+
-          |                                 |
-          |                                 |
-          v                                 v
-   +--------------+                 +----------------+      +-------+
-   |     604      |                 |    750 (G3)    | ---> | 750CX |
-   +--------------+                 +----------------+      +-------+
-          |                                 |                   |
-          |                                 |                   |
-          v                                 v                   v
-   +--------------+                 +----------------+      +-------+
-   | 620 (64 bit) |                 |      7400      |      | 750CL |
-   +--------------+                 +----------------+      +-------+
-          |                                 |                   |
-          |                                 |                   |
-          v                                 v                   v
-   +--------------+                 +----------------+      +-------+
-   |  POWER3/630  |                 |      7410      |      | 750FX |
-   +--------------+                 +----------------+      +-------+
-          |                                 |
-          |                                 |
-          v                                 v
-   +--------------+                 +----------------+
-   |   POWER3+    |                 |      7450      |
-   +--------------+                 +----------------+
-          |                                 |
-          |                                 |
-          v                                 v
-   +--------------+                 +----------------+
-   |    POWER4    |                 |      7455      |
-   +--------------+                 +----------------+
-          |                                 |
-          |                                 |
-          v                                 v
-   +--------------+     +-------+   +----------------+
-   |   POWER4+    | --> |  970  |   |      7447      |
-   +--------------+     +-------+   +----------------+
-          |                 |               |
-          |                 |               |
-          v                 v               v
-   +--------------+     +-------+   +----------------+
-   |    POWER5    |     | 970FX |   |      7448      |
-   +--------------+     +-------+   +----------------+
-          |                 |               |
-          |                 |               |
-          v                 v               v
-   +--------------+     +-------+   +----------------+
-   |   POWER5+    |     | 970MP |   |      e600      |
-   +--------------+     +-------+   +----------------+
-          |
-          |
-          v
-   +--------------+
-   |   POWER5++   |
-   +--------------+
-          |
-          |
-          v
-   +--------------+       +-------+
-   |    POWER6    | <-?-> | Cell  |
-   +--------------+       +-------+
-          |
-          |
-          v
-   +--------------+
-   |    POWER7    |
-   +--------------+
-          |
-          |
-          v
-   +--------------+
-   |   POWER7+    |
-   +--------------+
-          |
-          |
-          v
-   +--------------+
-   |    POWER8    |
-   +--------------+
-
-
-   +---------------+
-   | PA6T (64 bit) |
-   +---------------+
-
-
-IBM BookE
----------
-
- - Software loaded TLB.
- - All 32 bit
-
-   +--------------+
-   |     401      |
-   +--------------+
-          |
-          |
-          v
-   +--------------+
-   |     403      |
-   +--------------+
-          |
-          |
-          v
-   +--------------+
-   |     405      |
-   +--------------+
-          |
-          |
-          v
-   +--------------+
-   |     440      |
-   +--------------+
-          |
-          |
-          v
-   +--------------+     +----------------+
-   |     450      | --> |      BG/P      |
-   +--------------+     +----------------+
-          |
-          |
-          v
-   +--------------+
-   |     460      |
-   +--------------+
-          |
-          |
-          v
-   +--------------+
-   |     476      |
-   +--------------+
-
-
-Motorola/Freescale 8xx
-----------------------
-
- - Software loaded with hardware assist.
- - All 32 bit
-
-   +-------------+
-   | MPC8xx Core |
-   +-------------+
-
-
-Freescale BookE
----------------
-
- - Software loaded TLB.
- - e6500 adds HW loaded indirect TLB entries.
- - Mix of 32 & 64 bit
-
-   +--------------+
-   |     e200     |
-   +--------------+
-
-
-   +--------------------------------+
-   |              e500              |
-   +--------------------------------+
-                   |
-                   |
-                   v
-   +--------------------------------+
-   |             e500v2             |
-   +--------------------------------+
-                   |
-                   |
-                   v
-   +--------------------------------+
-   |        e500mc (Book3e)         |
-   +--------------------------------+
-                   |
-                   |
-                   v
-   +--------------------------------+
-   |          e5500 (64 bit)        |
-   +--------------------------------+
-                   |
-                   |
-                   v
-   +--------------------------------+
-   | e6500 (HW TLB) (Multithreaded) |
-   +--------------------------------+
-
-
-IBM A2 core
------------
-
- - Book3E, software loaded TLB + HW loaded indirect TLB entries.
- - 64 bit
-
-   +--------------+     +----------------+
-   |   A2 core    | --> |      WSP       |
-   +--------------+     +----------------+
-           |
-           |
-           v
-   +--------------+
-   |     BG/Q     |
-   +--------------+
diff --git a/Documentation/powerpc/cpu_features.rst b/Documentation/powerpc/cpu_features.rst
new file mode 100644
index 000000000000..b7bcdd2f41bb
--- /dev/null
+++ b/Documentation/powerpc/cpu_features.rst
@@ -0,0 +1,60 @@
+============
+CPU Features
+============
+
+Hollis Blanchard <hollis@austin.ibm.com>
+5 Jun 2002
+
+This document describes the system (including self-modifying code) used in the
+PPC Linux kernel to support a variety of PowerPC CPUs without requiring
+compile-time selection.
+
+Early in the boot process the ppc32 kernel detects the current CPU type and
+chooses a set of features accordingly. Some examples include Altivec support,
+split instruction and data caches, and if the CPU supports the DOZE and NAP
+sleep modes.
+
+Detection of the feature set is simple. A list of processors can be found in
+arch/powerpc/kernel/cputable.c. The PVR register is masked and compared with
+each value in the list. If a match is found, the cpu_features of cur_cpu_spec
+is assigned to the feature bitmask for this processor and a __setup_cpu
+function is called.
+
+C code may test 'cur_cpu_spec[smp_processor_id()]->cpu_features' for a
+particular feature bit. This is done in quite a few places, for example
+in ppc_setup_l2cr().
+
+Implementing cpufeatures in assembly is a little more involved. There are
+several paths that are performance-critical and would suffer if an array
+index, structure dereference, and conditional branch were added. To avoid the
+performance penalty but still allow for runtime (rather than compile-time) CPU
+selection, unused code is replaced by 'nop' instructions. This nop'ing is
+based on CPU 0's capabilities, so a multi-processor system with non-identical
+processors will not work (but such a system would likely have other problems
+anyways).
+
+After detecting the processor type, the kernel patches out sections of code
+that shouldn't be used by writing nop's over it. Using cpufeatures requires
+just 2 macros (found in arch/powerpc/include/asm/cputable.h), as seen in head.S
+transfer_to_handler::
+
+	#ifdef CONFIG_ALTIVEC
+	BEGIN_FTR_SECTION
+		mfspr	r22,SPRN_VRSAVE		/* if G4, save vrsave register value */
+		stw	r22,THREAD_VRSAVE(r23)
+	END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+	#endif /* CONFIG_ALTIVEC */
+
+If CPU 0 supports Altivec, the code is left untouched. If it doesn't, both
+instructions are replaced with nop's.
+
+The END_FTR_SECTION macro has two simpler variations: END_FTR_SECTION_IFSET
+and END_FTR_SECTION_IFCLR. These simply test if a flag is set (in
+cur_cpu_spec[0]->cpu_features) or is cleared, respectively. These two macros
+should be used in the majority of cases.
+
+The END_FTR_SECTION macros are implemented by storing information about this
+code in the '__ftr_fixup' ELF section. When do_cpu_ftr_fixups
+(arch/powerpc/kernel/misc.S) is invoked, it will iterate over the records in
+__ftr_fixup, and if the required feature is not present it will loop writing
+nop's from each BEGIN_FTR_SECTION to END_FTR_SECTION.
diff --git a/Documentation/powerpc/cpu_features.txt b/Documentation/powerpc/cpu_features.txt
deleted file mode 100644
index ae09df8722c8..000000000000
--- a/Documentation/powerpc/cpu_features.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-Hollis Blanchard <hollis@austin.ibm.com>
-5 Jun 2002
-
-This document describes the system (including self-modifying code) used in the
-PPC Linux kernel to support a variety of PowerPC CPUs without requiring
-compile-time selection.
-
-Early in the boot process the ppc32 kernel detects the current CPU type and
-chooses a set of features accordingly. Some examples include Altivec support,
-split instruction and data caches, and if the CPU supports the DOZE and NAP
-sleep modes.
-
-Detection of the feature set is simple. A list of processors can be found in
-arch/powerpc/kernel/cputable.c. The PVR register is masked and compared with
-each value in the list. If a match is found, the cpu_features of cur_cpu_spec
-is assigned to the feature bitmask for this processor and a __setup_cpu
-function is called.
-
-C code may test 'cur_cpu_spec[smp_processor_id()]->cpu_features' for a
-particular feature bit. This is done in quite a few places, for example
-in ppc_setup_l2cr().
-
-Implementing cpufeatures in assembly is a little more involved. There are
-several paths that are performance-critical and would suffer if an array
-index, structure dereference, and conditional branch were added. To avoid the
-performance penalty but still allow for runtime (rather than compile-time) CPU
-selection, unused code is replaced by 'nop' instructions. This nop'ing is
-based on CPU 0's capabilities, so a multi-processor system with non-identical
-processors will not work (but such a system would likely have other problems
-anyways).
-
-After detecting the processor type, the kernel patches out sections of code
-that shouldn't be used by writing nop's over it. Using cpufeatures requires
-just 2 macros (found in arch/powerpc/include/asm/cputable.h), as seen in head.S
-transfer_to_handler:
-
-	#ifdef CONFIG_ALTIVEC
-	BEGIN_FTR_SECTION
-		mfspr	r22,SPRN_VRSAVE		/* if G4, save vrsave register value */
-		stw	r22,THREAD_VRSAVE(r23)
-	END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
-	#endif /* CONFIG_ALTIVEC */
-
-If CPU 0 supports Altivec, the code is left untouched. If it doesn't, both
-instructions are replaced with nop's.
-
-The END_FTR_SECTION macro has two simpler variations: END_FTR_SECTION_IFSET
-and END_FTR_SECTION_IFCLR. These simply test if a flag is set (in
-cur_cpu_spec[0]->cpu_features) or is cleared, respectively. These two macros
-should be used in the majority of cases.
-
-The END_FTR_SECTION macros are implemented by storing information about this
-code in the '__ftr_fixup' ELF section. When do_cpu_ftr_fixups
-(arch/powerpc/kernel/misc.S) is invoked, it will iterate over the records in
-__ftr_fixup, and if the required feature is not present it will loop writing
-nop's from each BEGIN_FTR_SECTION to END_FTR_SECTION.
diff --git a/Documentation/powerpc/cxl.rst b/Documentation/powerpc/cxl.rst
new file mode 100644
index 000000000000..920546d81326
--- /dev/null
+++ b/Documentation/powerpc/cxl.rst
@@ -0,0 +1,467 @@
+====================================
+Coherent Accelerator Interface (CXL)
+====================================
+
+Introduction
+============
+
+    The coherent accelerator interface is designed to allow the
+    coherent connection of accelerators (FPGAs and other devices) to a
+    POWER system. These devices need to adhere to the Coherent
+    Accelerator Interface Architecture (CAIA).
+
+    IBM refers to this as the Coherent Accelerator Processor Interface
+    or CAPI. In the kernel it's referred to by the name CXL to avoid
+    confusion with the ISDN CAPI subsystem.
+
+    Coherent in this context means that the accelerator and CPUs can
+    both access system memory directly and with the same effective
+    addresses.
+
+
+Hardware overview
+=================
+
+    ::
+
+         POWER8/9             FPGA
+       +----------+        +---------+
+       |          |        |         |
+       |   CPU    |        |   AFU   |
+       |          |        |         |
+       |          |        |         |
+       |          |        |         |
+       +----------+        +---------+
+       |   PHB    |        |         |
+       |   +------+        |   PSL   |
+       |   | CAPP |<------>|         |
+       +---+------+  PCIE  +---------+
+
+    The POWER8/9 chip has a Coherently Attached Processor Proxy (CAPP)
+    unit which is part of the PCIe Host Bridge (PHB). This is managed
+    by Linux by calls into OPAL. Linux doesn't directly program the
+    CAPP.
+
+    The FPGA (or coherently attached device) consists of two parts.
+    The POWER Service Layer (PSL) and the Accelerator Function Unit
+    (AFU). The AFU is used to implement specific functionality behind
+    the PSL. The PSL, among other things, provides memory address
+    translation services to allow each AFU direct access to userspace
+    memory.
+
+    The AFU is the core part of the accelerator (eg. the compression,
+    crypto etc function). The kernel has no knowledge of the function
+    of the AFU. Only userspace interacts directly with the AFU.
+
+    The PSL provides the translation and interrupt services that the
+    AFU needs. This is what the kernel interacts with. For example, if
+    the AFU needs to read a particular effective address, it sends
+    that address to the PSL, the PSL then translates it, fetches the
+    data from memory and returns it to the AFU. If the PSL has a
+    translation miss, it interrupts the kernel and the kernel services
+    the fault. The context to which this fault is serviced is based on
+    who owns that acceleration function.
+
+    - POWER8 and PSL Version 8 are compliant to the CAIA Version 1.0.
+    - POWER9 and PSL Version 9 are compliant to the CAIA Version 2.0.
+
+    This PSL Version 9 provides new features such as:
+
+    * Interaction with the nest MMU on the P9 chip.
+    * Native DMA support.
+    * Supports sending ASB_Notify messages for host thread wakeup.
+    * Supports Atomic operations.
+    * etc.
+
+    Cards with a PSL9 won't work on a POWER8 system and cards with a
+    PSL8 won't work on a POWER9 system.
+
+AFU Modes
+=========
+
+    There are two programming modes supported by the AFU. Dedicated
+    and AFU directed. AFU may support one or both modes.
+
+    When using dedicated mode only one MMU context is supported. In
+    this mode, only one userspace process can use the accelerator at
+    time.
+
+    When using AFU directed mode, up to 16K simultaneous contexts can
+    be supported. This means up to 16K simultaneous userspace
+    applications may use the accelerator (although specific AFUs may
+    support fewer). In this mode, the AFU sends a 16 bit context ID
+    with each of its requests. This tells the PSL which context is
+    associated with each operation. If the PSL can't translate an
+    operation, the ID can also be accessed by the kernel so it can
+    determine the userspace context associated with an operation.
+
+
+MMIO space
+==========
+
+    A portion of the accelerator MMIO space can be directly mapped
+    from the AFU to userspace. Either the whole space can be mapped or
+    just a per context portion. The hardware is self describing, hence
+    the kernel can determine the offset and size of the per context
+    portion.
+
+
+Interrupts
+==========
+
+    AFUs may generate interrupts that are destined for userspace. These
+    are received by the kernel as hardware interrupts and passed onto
+    userspace by a read syscall documented below.
+
+    Data storage faults and error interrupts are handled by the kernel
+    driver.
+
+
+Work Element Descriptor (WED)
+=============================
+
+    The WED is a 64-bit parameter passed to the AFU when a context is
+    started. Its format is up to the AFU hence the kernel has no
+    knowledge of what it represents. Typically it will be the
+    effective address of a work queue or status block where the AFU
+    and userspace can share control and status information.
+
+
+
+
+User API
+========
+
+1. AFU character devices
+
+    For AFUs operating in AFU directed mode, two character device
+    files will be created. /dev/cxl/afu0.0m will correspond to a
+    master context and /dev/cxl/afu0.0s will correspond to a slave
+    context. Master contexts have access to the full MMIO space an
+    AFU provides. Slave contexts have access to only the per process
+    MMIO space an AFU provides.
+
+    For AFUs operating in dedicated process mode, the driver will
+    only create a single character device per AFU called
+    /dev/cxl/afu0.0d. This will have access to the entire MMIO space
+    that the AFU provides (like master contexts in AFU directed).
+
+    The types described below are defined in include/uapi/misc/cxl.h
+
+    The following file operations are supported on both slave and
+    master devices.
+
+    A userspace library libcxl is available here:
+
+	https://github.com/ibm-capi/libcxl
+
+    This provides a C interface to this kernel API.
+
+open
+----
+
+    Opens the device and allocates a file descriptor to be used with
+    the rest of the API.
+
+    A dedicated mode AFU only has one context and only allows the
+    device to be opened once.
+
+    An AFU directed mode AFU can have many contexts, the device can be
+    opened once for each context that is available.
+
+    When all available contexts are allocated the open call will fail
+    and return -ENOSPC.
+
+    Note:
+	  IRQs need to be allocated for each context, which may limit
+          the number of contexts that can be created, and therefore
+          how many times the device can be opened. The POWER8 CAPP
+          supports 2040 IRQs and 3 are used by the kernel, so 2037 are
+          left. If 1 IRQ is needed per context, then only 2037
+          contexts can be allocated. If 4 IRQs are needed per context,
+          then only 2037/4 = 509 contexts can be allocated.
+
+
+ioctl
+-----
+
+    CXL_IOCTL_START_WORK:
+        Starts the AFU context and associates it with the current
+        process. Once this ioctl is successfully executed, all memory
+        mapped into this process is accessible to this AFU context
+        using the same effective addresses. No additional calls are
+        required to map/unmap memory. The AFU memory context will be
+        updated as userspace allocates and frees memory. This ioctl
+        returns once the AFU context is started.
+
+        Takes a pointer to a struct cxl_ioctl_start_work
+
+            ::
+
+                struct cxl_ioctl_start_work {
+                        __u64 flags;
+                        __u64 work_element_descriptor;
+                        __u64 amr;
+                        __s16 num_interrupts;
+                        __s16 reserved1;
+                        __s32 reserved2;
+                        __u64 reserved3;
+                        __u64 reserved4;
+                        __u64 reserved5;
+                        __u64 reserved6;
+                };
+
+            flags:
+                Indicates which optional fields in the structure are
+                valid.
+
+            work_element_descriptor:
+                The Work Element Descriptor (WED) is a 64-bit argument
+                defined by the AFU. Typically this is an effective
+                address pointing to an AFU specific structure
+                describing what work to perform.
+
+            amr:
+                Authority Mask Register (AMR), same as the powerpc
+                AMR. This field is only used by the kernel when the
+                corresponding CXL_START_WORK_AMR value is specified in
+                flags. If not specified the kernel will use a default
+                value of 0.
+
+            num_interrupts:
+                Number of userspace interrupts to request. This field
+                is only used by the kernel when the corresponding
+                CXL_START_WORK_NUM_IRQS value is specified in flags.
+                If not specified the minimum number required by the
+                AFU will be allocated. The min and max number can be
+                obtained from sysfs.
+
+            reserved fields:
+                For ABI padding and future extensions
+
+    CXL_IOCTL_GET_PROCESS_ELEMENT:
+        Get the current context id, also known as the process element.
+        The value is returned from the kernel as a __u32.
+
+
+mmap
+----
+
+    An AFU may have an MMIO space to facilitate communication with the
+    AFU. If it does, the MMIO space can be accessed via mmap. The size
+    and contents of this area are specific to the particular AFU. The
+    size can be discovered via sysfs.
+
+    In AFU directed mode, master contexts are allowed to map all of
+    the MMIO space and slave contexts are allowed to only map the per
+    process MMIO space associated with the context. In dedicated
+    process mode the entire MMIO space can always be mapped.
+
+    This mmap call must be done after the START_WORK ioctl.
+
+    Care should be taken when accessing MMIO space. Only 32 and 64-bit
+    accesses are supported by POWER8. Also, the AFU will be designed
+    with a specific endianness, so all MMIO accesses should consider
+    endianness (recommend endian(3) variants like: le64toh(),
+    be64toh() etc). These endian issues equally apply to shared memory
+    queues the WED may describe.
+
+
+read
+----
+
+    Reads events from the AFU. Blocks if no events are pending
+    (unless O_NONBLOCK is supplied). Returns -EIO in the case of an
+    unrecoverable error or if the card is removed.
+
+    read() will always return an integral number of events.
+
+    The buffer passed to read() must be at least 4K bytes.
+
+    The result of the read will be a buffer of one or more events,
+    each event is of type struct cxl_event, of varying size::
+
+            struct cxl_event {
+                    struct cxl_event_header header;
+                    union {
+                            struct cxl_event_afu_interrupt irq;
+                            struct cxl_event_data_storage fault;
+                            struct cxl_event_afu_error afu_error;
+                    };
+            };
+
+    The struct cxl_event_header is defined as
+
+        ::
+
+            struct cxl_event_header {
+                    __u16 type;
+                    __u16 size;
+                    __u16 process_element;
+                    __u16 reserved1;
+            };
+
+        type:
+            This defines the type of event. The type determines how
+            the rest of the event is structured. These types are
+            described below and defined by enum cxl_event_type.
+
+        size:
+            This is the size of the event in bytes including the
+            struct cxl_event_header. The start of the next event can
+            be found at this offset from the start of the current
+            event.
+
+        process_element:
+            Context ID of the event.
+
+        reserved field:
+            For future extensions and padding.
+
+    If the event type is CXL_EVENT_AFU_INTERRUPT then the event
+    structure is defined as
+
+        ::
+
+            struct cxl_event_afu_interrupt {
+                    __u16 flags;
+                    __u16 irq; /* Raised AFU interrupt number */
+                    __u32 reserved1;
+            };
+
+        flags:
+            These flags indicate which optional fields are present
+            in this struct. Currently all fields are mandatory.
+
+        irq:
+            The IRQ number sent by the AFU.
+
+        reserved field:
+            For future extensions and padding.
+
+    If the event type is CXL_EVENT_DATA_STORAGE then the event
+    structure is defined as
+
+        ::
+
+            struct cxl_event_data_storage {
+                    __u16 flags;
+                    __u16 reserved1;
+                    __u32 reserved2;
+                    __u64 addr;
+                    __u64 dsisr;
+                    __u64 reserved3;
+            };
+
+        flags:
+            These flags indicate which optional fields are present in
+            this struct. Currently all fields are mandatory.
+
+        address:
+            The address that the AFU unsuccessfully attempted to
+            access. Valid accesses will be handled transparently by the
+            kernel but invalid accesses will generate this event.
+
+        dsisr:
+            This field gives information on the type of fault. It is a
+            copy of the DSISR from the PSL hardware when the address
+            fault occurred. The form of the DSISR is as defined in the
+            CAIA.
+
+        reserved fields:
+            For future extensions
+
+    If the event type is CXL_EVENT_AFU_ERROR then the event structure
+    is defined as
+
+        ::
+
+            struct cxl_event_afu_error {
+                    __u16 flags;
+                    __u16 reserved1;
+                    __u32 reserved2;
+                    __u64 error;
+            };
+
+        flags:
+            These flags indicate which optional fields are present in
+            this struct. Currently all fields are Mandatory.
+
+        error:
+            Error status from the AFU. Defined by the AFU.
+
+        reserved fields:
+            For future extensions and padding
+
+
+2. Card character device (powerVM guest only)
+
+    In a powerVM guest, an extra character device is created for the
+    card. The device is only used to write (flash) a new image on the
+    FPGA accelerator. Once the image is written and verified, the
+    device tree is updated and the card is reset to reload the updated
+    image.
+
+open
+----
+
+    Opens the device and allocates a file descriptor to be used with
+    the rest of the API. The device can only be opened once.
+
+ioctl
+-----
+
+CXL_IOCTL_DOWNLOAD_IMAGE / CXL_IOCTL_VALIDATE_IMAGE:
+    Starts and controls flashing a new FPGA image. Partial
+    reconfiguration is not supported (yet), so the image must contain
+    a copy of the PSL and AFU(s). Since an image can be quite large,
+    the caller may have to iterate, splitting the image in smaller
+    chunks.
+
+    Takes a pointer to a struct cxl_adapter_image::
+
+        struct cxl_adapter_image {
+            __u64 flags;
+            __u64 data;
+            __u64 len_data;
+            __u64 len_image;
+            __u64 reserved1;
+            __u64 reserved2;
+            __u64 reserved3;
+            __u64 reserved4;
+        };
+
+    flags:
+        These flags indicate which optional fields are present in
+        this struct. Currently all fields are mandatory.
+
+    data:
+        Pointer to a buffer with part of the image to write to the
+        card.
+
+    len_data:
+        Size of the buffer pointed to by data.
+
+    len_image:
+        Full size of the image.
+
+
+Sysfs Class
+===========
+
+    A cxl sysfs class is added under /sys/class/cxl to facilitate
+    enumeration and tuning of the accelerators. Its layout is
+    described in Documentation/ABI/testing/sysfs-class-cxl
+
+
+Udev rules
+==========
+
+    The following udev rules could be used to create a symlink to the
+    most logical chardev to use in any programming mode (afuX.Yd for
+    dedicated, afuX.Ys for afu directed), since the API is virtually
+    identical for each::
+
+	SUBSYSTEM=="cxl", ATTRS{mode}=="dedicated_process", SYMLINK="cxl/%b"
+	SUBSYSTEM=="cxl", ATTRS{mode}=="afu_directed", \
+	                  KERNEL=="afu[0-9]*.[0-9]*s", SYMLINK="cxl/%b"
diff --git a/Documentation/powerpc/cxl.txt b/Documentation/powerpc/cxl.txt
deleted file mode 100644
index c5e8d5098ed3..000000000000
--- a/Documentation/powerpc/cxl.txt
+++ /dev/null
@@ -1,449 +0,0 @@
-Coherent Accelerator Interface (CXL)
-====================================
-
-Introduction
-============
-
-    The coherent accelerator interface is designed to allow the
-    coherent connection of accelerators (FPGAs and other devices) to a
-    POWER system. These devices need to adhere to the Coherent
-    Accelerator Interface Architecture (CAIA).
-
-    IBM refers to this as the Coherent Accelerator Processor Interface
-    or CAPI. In the kernel it's referred to by the name CXL to avoid
-    confusion with the ISDN CAPI subsystem.
-
-    Coherent in this context means that the accelerator and CPUs can
-    both access system memory directly and with the same effective
-    addresses.
-
-
-Hardware overview
-=================
-
-         POWER8/9             FPGA
-       +----------+        +---------+
-       |          |        |         |
-       |   CPU    |        |   AFU   |
-       |          |        |         |
-       |          |        |         |
-       |          |        |         |
-       +----------+        +---------+
-       |   PHB    |        |         |
-       |   +------+        |   PSL   |
-       |   | CAPP |<------>|         |
-       +---+------+  PCIE  +---------+
-
-    The POWER8/9 chip has a Coherently Attached Processor Proxy (CAPP)
-    unit which is part of the PCIe Host Bridge (PHB). This is managed
-    by Linux by calls into OPAL. Linux doesn't directly program the
-    CAPP.
-
-    The FPGA (or coherently attached device) consists of two parts.
-    The POWER Service Layer (PSL) and the Accelerator Function Unit
-    (AFU). The AFU is used to implement specific functionality behind
-    the PSL. The PSL, among other things, provides memory address
-    translation services to allow each AFU direct access to userspace
-    memory.
-
-    The AFU is the core part of the accelerator (eg. the compression,
-    crypto etc function). The kernel has no knowledge of the function
-    of the AFU. Only userspace interacts directly with the AFU.
-
-    The PSL provides the translation and interrupt services that the
-    AFU needs. This is what the kernel interacts with. For example, if
-    the AFU needs to read a particular effective address, it sends
-    that address to the PSL, the PSL then translates it, fetches the
-    data from memory and returns it to the AFU. If the PSL has a
-    translation miss, it interrupts the kernel and the kernel services
-    the fault. The context to which this fault is serviced is based on
-    who owns that acceleration function.
-
-    POWER8 <-----> PSL Version 8 is compliant to the CAIA Version 1.0.
-    POWER9 <-----> PSL Version 9 is compliant to the CAIA Version 2.0.
-    This PSL Version 9 provides new features such as:
-    * Interaction with the nest MMU on the P9 chip.
-    * Native DMA support.
-    * Supports sending ASB_Notify messages for host thread wakeup.
-    * Supports Atomic operations.
-    * ....
-
-    Cards with a PSL9 won't work on a POWER8 system and cards with a
-    PSL8 won't work on a POWER9 system.
-
-AFU Modes
-=========
-
-    There are two programming modes supported by the AFU. Dedicated
-    and AFU directed. AFU may support one or both modes.
-
-    When using dedicated mode only one MMU context is supported. In
-    this mode, only one userspace process can use the accelerator at
-    time.
-
-    When using AFU directed mode, up to 16K simultaneous contexts can
-    be supported. This means up to 16K simultaneous userspace
-    applications may use the accelerator (although specific AFUs may
-    support fewer). In this mode, the AFU sends a 16 bit context ID
-    with each of its requests. This tells the PSL which context is
-    associated with each operation. If the PSL can't translate an
-    operation, the ID can also be accessed by the kernel so it can
-    determine the userspace context associated with an operation.
-
-
-MMIO space
-==========
-
-    A portion of the accelerator MMIO space can be directly mapped
-    from the AFU to userspace. Either the whole space can be mapped or
-    just a per context portion. The hardware is self describing, hence
-    the kernel can determine the offset and size of the per context
-    portion.
-
-
-Interrupts
-==========
-
-    AFUs may generate interrupts that are destined for userspace. These
-    are received by the kernel as hardware interrupts and passed onto
-    userspace by a read syscall documented below.
-
-    Data storage faults and error interrupts are handled by the kernel
-    driver.
-
-
-Work Element Descriptor (WED)
-=============================
-
-    The WED is a 64-bit parameter passed to the AFU when a context is
-    started. Its format is up to the AFU hence the kernel has no
-    knowledge of what it represents. Typically it will be the
-    effective address of a work queue or status block where the AFU
-    and userspace can share control and status information.
-
-
-
-
-User API
-========
-
-1. AFU character devices
-
-    For AFUs operating in AFU directed mode, two character device
-    files will be created. /dev/cxl/afu0.0m will correspond to a
-    master context and /dev/cxl/afu0.0s will correspond to a slave
-    context. Master contexts have access to the full MMIO space an
-    AFU provides. Slave contexts have access to only the per process
-    MMIO space an AFU provides.
-
-    For AFUs operating in dedicated process mode, the driver will
-    only create a single character device per AFU called
-    /dev/cxl/afu0.0d. This will have access to the entire MMIO space
-    that the AFU provides (like master contexts in AFU directed).
-
-    The types described below are defined in include/uapi/misc/cxl.h
-
-    The following file operations are supported on both slave and
-    master devices.
-
-    A userspace library libcxl is available here:
-	https://github.com/ibm-capi/libcxl
-    This provides a C interface to this kernel API.
-
-open
-----
-
-    Opens the device and allocates a file descriptor to be used with
-    the rest of the API.
-
-    A dedicated mode AFU only has one context and only allows the
-    device to be opened once.
-
-    An AFU directed mode AFU can have many contexts, the device can be
-    opened once for each context that is available.
-
-    When all available contexts are allocated the open call will fail
-    and return -ENOSPC.
-
-    Note: IRQs need to be allocated for each context, which may limit
-          the number of contexts that can be created, and therefore
-          how many times the device can be opened. The POWER8 CAPP
-          supports 2040 IRQs and 3 are used by the kernel, so 2037 are
-          left. If 1 IRQ is needed per context, then only 2037
-          contexts can be allocated. If 4 IRQs are needed per context,
-          then only 2037/4 = 509 contexts can be allocated.
-
-
-ioctl
------
-
-    CXL_IOCTL_START_WORK:
-        Starts the AFU context and associates it with the current
-        process. Once this ioctl is successfully executed, all memory
-        mapped into this process is accessible to this AFU context
-        using the same effective addresses. No additional calls are
-        required to map/unmap memory. The AFU memory context will be
-        updated as userspace allocates and frees memory. This ioctl
-        returns once the AFU context is started.
-
-        Takes a pointer to a struct cxl_ioctl_start_work:
-
-                struct cxl_ioctl_start_work {
-                        __u64 flags;
-                        __u64 work_element_descriptor;
-                        __u64 amr;
-                        __s16 num_interrupts;
-                        __s16 reserved1;
-                        __s32 reserved2;
-                        __u64 reserved3;
-                        __u64 reserved4;
-                        __u64 reserved5;
-                        __u64 reserved6;
-                };
-
-            flags:
-                Indicates which optional fields in the structure are
-                valid.
-
-            work_element_descriptor:
-                The Work Element Descriptor (WED) is a 64-bit argument
-                defined by the AFU. Typically this is an effective
-                address pointing to an AFU specific structure
-                describing what work to perform.
-
-            amr:
-                Authority Mask Register (AMR), same as the powerpc
-                AMR. This field is only used by the kernel when the
-                corresponding CXL_START_WORK_AMR value is specified in
-                flags. If not specified the kernel will use a default
-                value of 0.
-
-            num_interrupts:
-                Number of userspace interrupts to request. This field
-                is only used by the kernel when the corresponding
-                CXL_START_WORK_NUM_IRQS value is specified in flags.
-                If not specified the minimum number required by the
-                AFU will be allocated. The min and max number can be
-                obtained from sysfs.
-
-            reserved fields:
-                For ABI padding and future extensions
-
-    CXL_IOCTL_GET_PROCESS_ELEMENT:
-        Get the current context id, also known as the process element.
-        The value is returned from the kernel as a __u32.
-
-
-mmap
-----
-
-    An AFU may have an MMIO space to facilitate communication with the
-    AFU. If it does, the MMIO space can be accessed via mmap. The size
-    and contents of this area are specific to the particular AFU. The
-    size can be discovered via sysfs.
-
-    In AFU directed mode, master contexts are allowed to map all of
-    the MMIO space and slave contexts are allowed to only map the per
-    process MMIO space associated with the context. In dedicated
-    process mode the entire MMIO space can always be mapped.
-
-    This mmap call must be done after the START_WORK ioctl.
-
-    Care should be taken when accessing MMIO space. Only 32 and 64-bit
-    accesses are supported by POWER8. Also, the AFU will be designed
-    with a specific endianness, so all MMIO accesses should consider
-    endianness (recommend endian(3) variants like: le64toh(),
-    be64toh() etc). These endian issues equally apply to shared memory
-    queues the WED may describe.
-
-
-read
-----
-
-    Reads events from the AFU. Blocks if no events are pending
-    (unless O_NONBLOCK is supplied). Returns -EIO in the case of an
-    unrecoverable error or if the card is removed.
-
-    read() will always return an integral number of events.
-
-    The buffer passed to read() must be at least 4K bytes.
-
-    The result of the read will be a buffer of one or more events,
-    each event is of type struct cxl_event, of varying size.
-
-            struct cxl_event {
-                    struct cxl_event_header header;
-                    union {
-                            struct cxl_event_afu_interrupt irq;
-                            struct cxl_event_data_storage fault;
-                            struct cxl_event_afu_error afu_error;
-                    };
-            };
-
-    The struct cxl_event_header is defined as:
-
-            struct cxl_event_header {
-                    __u16 type;
-                    __u16 size;
-                    __u16 process_element;
-                    __u16 reserved1;
-            };
-
-        type:
-            This defines the type of event. The type determines how
-            the rest of the event is structured. These types are
-            described below and defined by enum cxl_event_type.
-
-        size:
-            This is the size of the event in bytes including the
-            struct cxl_event_header. The start of the next event can
-            be found at this offset from the start of the current
-            event.
-
-        process_element:
-            Context ID of the event.
-
-        reserved field:
-            For future extensions and padding.
-
-    If the event type is CXL_EVENT_AFU_INTERRUPT then the event
-    structure is defined as:
-
-            struct cxl_event_afu_interrupt {
-                    __u16 flags;
-                    __u16 irq; /* Raised AFU interrupt number */
-                    __u32 reserved1;
-            };
-
-        flags:
-            These flags indicate which optional fields are present
-            in this struct. Currently all fields are mandatory.
-
-        irq:
-            The IRQ number sent by the AFU.
-
-        reserved field:
-            For future extensions and padding.
-
-    If the event type is CXL_EVENT_DATA_STORAGE then the event
-    structure is defined as:
-
-            struct cxl_event_data_storage {
-                    __u16 flags;
-                    __u16 reserved1;
-                    __u32 reserved2;
-                    __u64 addr;
-                    __u64 dsisr;
-                    __u64 reserved3;
-            };
-
-        flags:
-            These flags indicate which optional fields are present in
-            this struct. Currently all fields are mandatory.
-
-        address:
-            The address that the AFU unsuccessfully attempted to
-            access. Valid accesses will be handled transparently by the
-            kernel but invalid accesses will generate this event.
-
-        dsisr:
-            This field gives information on the type of fault. It is a
-            copy of the DSISR from the PSL hardware when the address
-            fault occurred. The form of the DSISR is as defined in the
-            CAIA.
-
-        reserved fields:
-            For future extensions
-
-    If the event type is CXL_EVENT_AFU_ERROR then the event structure
-    is defined as:
-
-            struct cxl_event_afu_error {
-                    __u16 flags;
-                    __u16 reserved1;
-                    __u32 reserved2;
-                    __u64 error;
-            };
-
-        flags:
-            These flags indicate which optional fields are present in
-            this struct. Currently all fields are Mandatory.
-
-        error:
-            Error status from the AFU. Defined by the AFU.
-
-        reserved fields:
-            For future extensions and padding
-
-
-2. Card character device (powerVM guest only)
-
-    In a powerVM guest, an extra character device is created for the
-    card. The device is only used to write (flash) a new image on the
-    FPGA accelerator. Once the image is written and verified, the
-    device tree is updated and the card is reset to reload the updated
-    image.
-
-open
-----
-
-    Opens the device and allocates a file descriptor to be used with
-    the rest of the API. The device can only be opened once.
-
-ioctl
------
-
-CXL_IOCTL_DOWNLOAD_IMAGE:
-CXL_IOCTL_VALIDATE_IMAGE:
-    Starts and controls flashing a new FPGA image. Partial
-    reconfiguration is not supported (yet), so the image must contain
-    a copy of the PSL and AFU(s). Since an image can be quite large,
-    the caller may have to iterate, splitting the image in smaller
-    chunks.
-
-    Takes a pointer to a struct cxl_adapter_image:
-        struct cxl_adapter_image {
-            __u64 flags;
-            __u64 data;
-            __u64 len_data;
-            __u64 len_image;
-            __u64 reserved1;
-            __u64 reserved2;
-            __u64 reserved3;
-            __u64 reserved4;
-        };
-
-    flags:
-        These flags indicate which optional fields are present in
-        this struct. Currently all fields are mandatory.
-
-    data:
-        Pointer to a buffer with part of the image to write to the
-        card.
-
-    len_data:
-        Size of the buffer pointed to by data.
-
-    len_image:
-        Full size of the image.
-
-
-Sysfs Class
-===========
-
-    A cxl sysfs class is added under /sys/class/cxl to facilitate
-    enumeration and tuning of the accelerators. Its layout is
-    described in Documentation/ABI/testing/sysfs-class-cxl
-
-
-Udev rules
-==========
-
-    The following udev rules could be used to create a symlink to the
-    most logical chardev to use in any programming mode (afuX.Yd for
-    dedicated, afuX.Ys for afu directed), since the API is virtually
-    identical for each:
-
-	SUBSYSTEM=="cxl", ATTRS{mode}=="dedicated_process", SYMLINK="cxl/%b"
-	SUBSYSTEM=="cxl", ATTRS{mode}=="afu_directed", \
-	                  KERNEL=="afu[0-9]*.[0-9]*s", SYMLINK="cxl/%b"
diff --git a/Documentation/powerpc/cxlflash.rst b/Documentation/powerpc/cxlflash.rst
new file mode 100644
index 000000000000..cea67931b3b9
--- /dev/null
+++ b/Documentation/powerpc/cxlflash.rst
@@ -0,0 +1,433 @@
+================================
+Coherent Accelerator (CXL) Flash
+================================
+
+Introduction
+============
+
+    The IBM Power architecture provides support for CAPI (Coherent
+    Accelerator Power Interface), which is available to certain PCIe slots
+    on Power 8 systems. CAPI can be thought of as a special tunneling
+    protocol through PCIe that allow PCIe adapters to look like special
+    purpose co-processors which can read or write an application's
+    memory and generate page faults. As a result, the host interface to
+    an adapter running in CAPI mode does not require the data buffers to
+    be mapped to the device's memory (IOMMU bypass) nor does it require
+    memory to be pinned.
+
+    On Linux, Coherent Accelerator (CXL) kernel services present CAPI
+    devices as a PCI device by implementing a virtual PCI host bridge.
+    This abstraction simplifies the infrastructure and programming
+    model, allowing for drivers to look similar to other native PCI
+    device drivers.
+
+    CXL provides a mechanism by which user space applications can
+    directly talk to a device (network or storage) bypassing the typical
+    kernel/device driver stack. The CXL Flash Adapter Driver enables a
+    user space application direct access to Flash storage.
+
+    The CXL Flash Adapter Driver is a kernel module that sits in the
+    SCSI stack as a low level device driver (below the SCSI disk and
+    protocol drivers) for the IBM CXL Flash Adapter. This driver is
+    responsible for the initialization of the adapter, setting up the
+    special path for user space access, and performing error recovery. It
+    communicates directly the Flash Accelerator Functional Unit (AFU)
+    as described in Documentation/powerpc/cxl.rst.
+
+    The cxlflash driver supports two, mutually exclusive, modes of
+    operation at the device (LUN) level:
+
+        - Any flash device (LUN) can be configured to be accessed as a
+          regular disk device (i.e.: /dev/sdc). This is the default mode.
+
+        - Any flash device (LUN) can be configured to be accessed from
+          user space with a special block library. This mode further
+          specifies the means of accessing the device and provides for
+          either raw access to the entire LUN (referred to as direct
+          or physical LUN access) or access to a kernel/AFU-mediated
+          partition of the LUN (referred to as virtual LUN access). The
+          segmentation of a disk device into virtual LUNs is assisted
+          by special translation services provided by the Flash AFU.
+
+Overview
+========
+
+    The Coherent Accelerator Interface Architecture (CAIA) introduces a
+    concept of a master context. A master typically has special privileges
+    granted to it by the kernel or hypervisor allowing it to perform AFU
+    wide management and control. The master may or may not be involved
+    directly in each user I/O, but at the minimum is involved in the
+    initial setup before the user application is allowed to send requests
+    directly to the AFU.
+
+    The CXL Flash Adapter Driver establishes a master context with the
+    AFU. It uses memory mapped I/O (MMIO) for this control and setup. The
+    Adapter Problem Space Memory Map looks like this::
+
+                     +-------------------------------+
+                     |    512 * 64 KB User MMIO      |
+                     |        (per context)          |
+                     |       User Accessible         |
+                     +-------------------------------+
+                     |    512 * 128 B per context    |
+                     |    Provisioning and Control   |
+                     |   Trusted Process accessible  |
+                     +-------------------------------+
+                     |         64 KB Global          |
+                     |   Trusted Process accessible  |
+                     +-------------------------------+
+
+    This driver configures itself into the SCSI software stack as an
+    adapter driver. The driver is the only entity that is considered a
+    Trusted Process to program the Provisioning and Control and Global
+    areas in the MMIO Space shown above.  The master context driver
+    discovers all LUNs attached to the CXL Flash adapter and instantiates
+    scsi block devices (/dev/sdb, /dev/sdc etc.) for each unique LUN
+    seen from each path.
+
+    Once these scsi block devices are instantiated, an application
+    written to a specification provided by the block library may get
+    access to the Flash from user space (without requiring a system call).
+
+    This master context driver also provides a series of ioctls for this
+    block library to enable this user space access.  The driver supports
+    two modes for accessing the block device.
+
+    The first mode is called a virtual mode. In this mode a single scsi
+    block device (/dev/sdb) may be carved up into any number of distinct
+    virtual LUNs. The virtual LUNs may be resized as long as the sum of
+    the sizes of all the virtual LUNs, along with the meta-data associated
+    with it does not exceed the physical capacity.
+
+    The second mode is called the physical mode. In this mode a single
+    block device (/dev/sdb) may be opened directly by the block library
+    and the entire space for the LUN is available to the application.
+
+    Only the physical mode provides persistence of the data.  i.e. The
+    data written to the block device will survive application exit and
+    restart and also reboot. The virtual LUNs do not persist (i.e. do
+    not survive after the application terminates or the system reboots).
+
+
+Block library API
+=================
+
+    Applications intending to get access to the CXL Flash from user
+    space should use the block library, as it abstracts the details of
+    interfacing directly with the cxlflash driver that are necessary for
+    performing administrative actions (i.e.: setup, tear down, resize).
+    The block library can be thought of as a 'user' of services,
+    implemented as IOCTLs, that are provided by the cxlflash driver
+    specifically for devices (LUNs) operating in user space access
+    mode. While it is not a requirement that applications understand
+    the interface between the block library and the cxlflash driver,
+    a high-level overview of each supported service (IOCTL) is provided
+    below.
+
+    The block library can be found on GitHub:
+    http://github.com/open-power/capiflash
+
+
+CXL Flash Driver LUN IOCTLs
+===========================
+
+    Users, such as the block library, that wish to interface with a flash
+    device (LUN) via user space access need to use the services provided
+    by the cxlflash driver. As these services are implemented as ioctls,
+    a file descriptor handle must first be obtained in order to establish
+    the communication channel between a user and the kernel.  This file
+    descriptor is obtained by opening the device special file associated
+    with the scsi disk device (/dev/sdb) that was created during LUN
+    discovery. As per the location of the cxlflash driver within the
+    SCSI protocol stack, this open is actually not seen by the cxlflash
+    driver. Upon successful open, the user receives a file descriptor
+    (herein referred to as fd1) that should be used for issuing the
+    subsequent ioctls listed below.
+
+    The structure definitions for these IOCTLs are available in:
+    uapi/scsi/cxlflash_ioctl.h
+
+DK_CXLFLASH_ATTACH
+------------------
+
+    This ioctl obtains, initializes, and starts a context using the CXL
+    kernel services. These services specify a context id (u16) by which
+    to uniquely identify the context and its allocated resources. The
+    services additionally provide a second file descriptor (herein
+    referred to as fd2) that is used by the block library to initiate
+    memory mapped I/O (via mmap()) to the CXL flash device and poll for
+    completion events. This file descriptor is intentionally installed by
+    this driver and not the CXL kernel services to allow for intermediary
+    notification and access in the event of a non-user-initiated close(),
+    such as a killed process. This design point is described in further
+    detail in the description for the DK_CXLFLASH_DETACH ioctl.
+
+    There are a few important aspects regarding the "tokens" (context id
+    and fd2) that are provided back to the user:
+
+        - These tokens are only valid for the process under which they
+          were created. The child of a forked process cannot continue
+          to use the context id or file descriptor created by its parent
+          (see DK_CXLFLASH_VLUN_CLONE for further details).
+
+        - These tokens are only valid for the lifetime of the context and
+          the process under which they were created. Once either is
+          destroyed, the tokens are to be considered stale and subsequent
+          usage will result in errors.
+
+	- A valid adapter file descriptor (fd2 >= 0) is only returned on
+	  the initial attach for a context. Subsequent attaches to an
+	  existing context (DK_CXLFLASH_ATTACH_REUSE_CONTEXT flag present)
+	  do not provide the adapter file descriptor as it was previously
+	  made known to the application.
+
+        - When a context is no longer needed, the user shall detach from
+          the context via the DK_CXLFLASH_DETACH ioctl. When this ioctl
+	  returns with a valid adapter file descriptor and the return flag
+	  DK_CXLFLASH_APP_CLOSE_ADAP_FD is present, the application _must_
+	  close the adapter file descriptor following a successful detach.
+
+	- When this ioctl returns with a valid fd2 and the return flag
+	  DK_CXLFLASH_APP_CLOSE_ADAP_FD is present, the application _must_
+	  close fd2 in the following circumstances:
+
+		+ Following a successful detach of the last user of the context
+		+ Following a successful recovery on the context's original fd2
+		+ In the child process of a fork(), following a clone ioctl,
+		  on the fd2 associated with the source context
+
+        - At any time, a close on fd2 will invalidate the tokens. Applications
+	  should exercise caution to only close fd2 when appropriate (outlined
+	  in the previous bullet) to avoid premature loss of I/O.
+
+DK_CXLFLASH_USER_DIRECT
+-----------------------
+    This ioctl is responsible for transitioning the LUN to direct
+    (physical) mode access and configuring the AFU for direct access from
+    user space on a per-context basis. Additionally, the block size and
+    last logical block address (LBA) are returned to the user.
+
+    As mentioned previously, when operating in user space access mode,
+    LUNs may be accessed in whole or in part. Only one mode is allowed
+    at a time and if one mode is active (outstanding references exist),
+    requests to use the LUN in a different mode are denied.
+
+    The AFU is configured for direct access from user space by adding an
+    entry to the AFU's resource handle table. The index of the entry is
+    treated as a resource handle that is returned to the user. The user
+    is then able to use the handle to reference the LUN during I/O.
+
+DK_CXLFLASH_USER_VIRTUAL
+------------------------
+    This ioctl is responsible for transitioning the LUN to virtual mode
+    of access and configuring the AFU for virtual access from user space
+    on a per-context basis. Additionally, the block size and last logical
+    block address (LBA) are returned to the user.
+
+    As mentioned previously, when operating in user space access mode,
+    LUNs may be accessed in whole or in part. Only one mode is allowed
+    at a time and if one mode is active (outstanding references exist),
+    requests to use the LUN in a different mode are denied.
+
+    The AFU is configured for virtual access from user space by adding
+    an entry to the AFU's resource handle table. The index of the entry
+    is treated as a resource handle that is returned to the user. The
+    user is then able to use the handle to reference the LUN during I/O.
+
+    By default, the virtual LUN is created with a size of 0. The user
+    would need to use the DK_CXLFLASH_VLUN_RESIZE ioctl to adjust the grow
+    the virtual LUN to a desired size. To avoid having to perform this
+    resize for the initial creation of the virtual LUN, the user has the
+    option of specifying a size as part of the DK_CXLFLASH_USER_VIRTUAL
+    ioctl, such that when success is returned to the user, the
+    resource handle that is provided is already referencing provisioned
+    storage. This is reflected by the last LBA being a non-zero value.
+
+    When a LUN is accessible from more than one port, this ioctl will
+    return with the DK_CXLFLASH_ALL_PORTS_ACTIVE return flag set. This
+    provides the user with a hint that I/O can be retried in the event
+    of an I/O error as the LUN can be reached over multiple paths.
+
+DK_CXLFLASH_VLUN_RESIZE
+-----------------------
+    This ioctl is responsible for resizing a previously created virtual
+    LUN and will fail if invoked upon a LUN that is not in virtual
+    mode. Upon success, an updated last LBA is returned to the user
+    indicating the new size of the virtual LUN associated with the
+    resource handle.
+
+    The partitioning of virtual LUNs is jointly mediated by the cxlflash
+    driver and the AFU. An allocation table is kept for each LUN that is
+    operating in the virtual mode and used to program a LUN translation
+    table that the AFU references when provided with a resource handle.
+
+    This ioctl can return -EAGAIN if an AFU sync operation takes too long.
+    In addition to returning a failure to user, cxlflash will also schedule
+    an asynchronous AFU reset. Should the user choose to retry the operation,
+    it is expected to succeed. If this ioctl fails with -EAGAIN, the user
+    can either retry the operation or treat it as a failure.
+
+DK_CXLFLASH_RELEASE
+-------------------
+    This ioctl is responsible for releasing a previously obtained
+    reference to either a physical or virtual LUN. This can be
+    thought of as the inverse of the DK_CXLFLASH_USER_DIRECT or
+    DK_CXLFLASH_USER_VIRTUAL ioctls. Upon success, the resource handle
+    is no longer valid and the entry in the resource handle table is
+    made available to be used again.
+
+    As part of the release process for virtual LUNs, the virtual LUN
+    is first resized to 0 to clear out and free the translation tables
+    associated with the virtual LUN reference.
+
+DK_CXLFLASH_DETACH
+------------------
+    This ioctl is responsible for unregistering a context with the
+    cxlflash driver and release outstanding resources that were
+    not explicitly released via the DK_CXLFLASH_RELEASE ioctl. Upon
+    success, all "tokens" which had been provided to the user from the
+    DK_CXLFLASH_ATTACH onward are no longer valid.
+
+    When the DK_CXLFLASH_APP_CLOSE_ADAP_FD flag was returned on a successful
+    attach, the application _must_ close the fd2 associated with the context
+    following the detach of the final user of the context.
+
+DK_CXLFLASH_VLUN_CLONE
+----------------------
+    This ioctl is responsible for cloning a previously created
+    context to a more recently created context. It exists solely to
+    support maintaining user space access to storage after a process
+    forks. Upon success, the child process (which invoked the ioctl)
+    will have access to the same LUNs via the same resource handle(s)
+    as the parent, but under a different context.
+
+    Context sharing across processes is not supported with CXL and
+    therefore each fork must be met with establishing a new context
+    for the child process. This ioctl simplifies the state management
+    and playback required by a user in such a scenario. When a process
+    forks, child process can clone the parents context by first creating
+    a context (via DK_CXLFLASH_ATTACH) and then using this ioctl to
+    perform the clone from the parent to the child.
+
+    The clone itself is fairly simple. The resource handle and lun
+    translation tables are copied from the parent context to the child's
+    and then synced with the AFU.
+
+    When the DK_CXLFLASH_APP_CLOSE_ADAP_FD flag was returned on a successful
+    attach, the application _must_ close the fd2 associated with the source
+    context (still resident/accessible in the parent process) following the
+    clone. This is to avoid a stale entry in the file descriptor table of the
+    child process.
+
+    This ioctl can return -EAGAIN if an AFU sync operation takes too long.
+    In addition to returning a failure to user, cxlflash will also schedule
+    an asynchronous AFU reset. Should the user choose to retry the operation,
+    it is expected to succeed. If this ioctl fails with -EAGAIN, the user
+    can either retry the operation or treat it as a failure.
+
+DK_CXLFLASH_VERIFY
+------------------
+    This ioctl is used to detect various changes such as the capacity of
+    the disk changing, the number of LUNs visible changing, etc. In cases
+    where the changes affect the application (such as a LUN resize), the
+    cxlflash driver will report the changed state to the application.
+
+    The user calls in when they want to validate that a LUN hasn't been
+    changed in response to a check condition. As the user is operating out
+    of band from the kernel, they will see these types of events without
+    the kernel's knowledge. When encountered, the user's architected
+    behavior is to call in to this ioctl, indicating what they want to
+    verify and passing along any appropriate information. For now, only
+    verifying a LUN change (ie: size different) with sense data is
+    supported.
+
+DK_CXLFLASH_RECOVER_AFU
+-----------------------
+    This ioctl is used to drive recovery (if such an action is warranted)
+    of a specified user context. Any state associated with the user context
+    is re-established upon successful recovery.
+
+    User contexts are put into an error condition when the device needs to
+    be reset or is terminating. Users are notified of this error condition
+    by seeing all 0xF's on an MMIO read. Upon encountering this, the
+    architected behavior for a user is to call into this ioctl to recover
+    their context. A user may also call into this ioctl at any time to
+    check if the device is operating normally. If a failure is returned
+    from this ioctl, the user is expected to gracefully clean up their
+    context via release/detach ioctls. Until they do, the context they
+    hold is not relinquished. The user may also optionally exit the process
+    at which time the context/resources they held will be freed as part of
+    the release fop.
+
+    When the DK_CXLFLASH_APP_CLOSE_ADAP_FD flag was returned on a successful
+    attach, the application _must_ unmap and close the fd2 associated with the
+    original context following this ioctl returning success and indicating that
+    the context was recovered (DK_CXLFLASH_RECOVER_AFU_CONTEXT_RESET).
+
+DK_CXLFLASH_MANAGE_LUN
+----------------------
+    This ioctl is used to switch a LUN from a mode where it is available
+    for file-system access (legacy), to a mode where it is set aside for
+    exclusive user space access (superpipe). In case a LUN is visible
+    across multiple ports and adapters, this ioctl is used to uniquely
+    identify each LUN by its World Wide Node Name (WWNN).
+
+
+CXL Flash Driver Host IOCTLs
+============================
+
+    Each host adapter instance that is supported by the cxlflash driver
+    has a special character device associated with it to enable a set of
+    host management function. These character devices are hosted in a
+    class dedicated for cxlflash and can be accessed via `/dev/cxlflash/*`.
+
+    Applications can be written to perform various functions using the
+    host ioctl APIs below.
+
+    The structure definitions for these IOCTLs are available in:
+    uapi/scsi/cxlflash_ioctl.h
+
+HT_CXLFLASH_LUN_PROVISION
+-------------------------
+    This ioctl is used to create and delete persistent LUNs on cxlflash
+    devices that lack an external LUN management interface. It is only
+    valid when used with AFUs that support the LUN provision capability.
+
+    When sufficient space is available, LUNs can be created by specifying
+    the target port to host the LUN and a desired size in 4K blocks. Upon
+    success, the LUN ID and WWID of the created LUN will be returned and
+    the SCSI bus can be scanned to detect the change in LUN topology. Note
+    that partial allocations are not supported. Should a creation fail due
+    to a space issue, the target port can be queried for its current LUN
+    geometry.
+
+    To remove a LUN, the device must first be disassociated from the Linux
+    SCSI subsystem. The LUN deletion can then be initiated by specifying a
+    target port and LUN ID. Upon success, the LUN geometry associated with
+    the port will be updated to reflect new number of provisioned LUNs and
+    available capacity.
+
+    To query the LUN geometry of a port, the target port is specified and
+    upon success, the following information is presented:
+
+        - Maximum number of provisioned LUNs allowed for the port
+        - Current number of provisioned LUNs for the port
+        - Maximum total capacity of provisioned LUNs for the port (4K blocks)
+        - Current total capacity of provisioned LUNs for the port (4K blocks)
+
+    With this information, the number of available LUNs and capacity can be
+    can be calculated.
+
+HT_CXLFLASH_AFU_DEBUG
+---------------------
+    This ioctl is used to debug AFUs by supporting a command pass-through
+    interface. It is only valid when used with AFUs that support the AFU
+    debug capability.
+
+    With exception of buffer management, AFU debug commands are opaque to
+    cxlflash and treated as pass-through. For debug commands that do require
+    data transfer, the user supplies an adequately sized data buffer and must
+    specify the data transfer direction with respect to the host. There is a
+    maximum transfer size of 256K imposed. Note that partial read completions
+    are not supported - when errors are experienced with a host read data
+    transfer, the data buffer is not copied back to the user.
diff --git a/Documentation/powerpc/cxlflash.txt b/Documentation/powerpc/cxlflash.txt
deleted file mode 100644
index a64bdaa0a1cf..000000000000
--- a/Documentation/powerpc/cxlflash.txt
+++ /dev/null
@@ -1,429 +0,0 @@
-Introduction
-============
-
-    The IBM Power architecture provides support for CAPI (Coherent
-    Accelerator Power Interface), which is available to certain PCIe slots
-    on Power 8 systems. CAPI can be thought of as a special tunneling
-    protocol through PCIe that allow PCIe adapters to look like special
-    purpose co-processors which can read or write an application's
-    memory and generate page faults. As a result, the host interface to
-    an adapter running in CAPI mode does not require the data buffers to
-    be mapped to the device's memory (IOMMU bypass) nor does it require
-    memory to be pinned.
-
-    On Linux, Coherent Accelerator (CXL) kernel services present CAPI
-    devices as a PCI device by implementing a virtual PCI host bridge.
-    This abstraction simplifies the infrastructure and programming
-    model, allowing for drivers to look similar to other native PCI
-    device drivers.
-
-    CXL provides a mechanism by which user space applications can
-    directly talk to a device (network or storage) bypassing the typical
-    kernel/device driver stack. The CXL Flash Adapter Driver enables a
-    user space application direct access to Flash storage.
-
-    The CXL Flash Adapter Driver is a kernel module that sits in the
-    SCSI stack as a low level device driver (below the SCSI disk and
-    protocol drivers) for the IBM CXL Flash Adapter. This driver is
-    responsible for the initialization of the adapter, setting up the
-    special path for user space access, and performing error recovery. It
-    communicates directly the Flash Accelerator Functional Unit (AFU)
-    as described in Documentation/powerpc/cxl.txt.
-
-    The cxlflash driver supports two, mutually exclusive, modes of
-    operation at the device (LUN) level:
-
-        - Any flash device (LUN) can be configured to be accessed as a
-          regular disk device (i.e.: /dev/sdc). This is the default mode.
-
-        - Any flash device (LUN) can be configured to be accessed from
-          user space with a special block library. This mode further
-          specifies the means of accessing the device and provides for
-          either raw access to the entire LUN (referred to as direct
-          or physical LUN access) or access to a kernel/AFU-mediated
-          partition of the LUN (referred to as virtual LUN access). The
-          segmentation of a disk device into virtual LUNs is assisted
-          by special translation services provided by the Flash AFU.
-
-Overview
-========
-
-    The Coherent Accelerator Interface Architecture (CAIA) introduces a
-    concept of a master context. A master typically has special privileges
-    granted to it by the kernel or hypervisor allowing it to perform AFU
-    wide management and control. The master may or may not be involved
-    directly in each user I/O, but at the minimum is involved in the
-    initial setup before the user application is allowed to send requests
-    directly to the AFU.
-
-    The CXL Flash Adapter Driver establishes a master context with the
-    AFU. It uses memory mapped I/O (MMIO) for this control and setup. The
-    Adapter Problem Space Memory Map looks like this:
-
-                     +-------------------------------+
-                     |    512 * 64 KB User MMIO      |
-                     |        (per context)          |
-                     |       User Accessible         |
-                     +-------------------------------+
-                     |    512 * 128 B per context    |
-                     |    Provisioning and Control   |
-                     |   Trusted Process accessible  |
-                     +-------------------------------+
-                     |         64 KB Global          |
-                     |   Trusted Process accessible  |
-                     +-------------------------------+
-
-    This driver configures itself into the SCSI software stack as an
-    adapter driver. The driver is the only entity that is considered a
-    Trusted Process to program the Provisioning and Control and Global
-    areas in the MMIO Space shown above.  The master context driver
-    discovers all LUNs attached to the CXL Flash adapter and instantiates
-    scsi block devices (/dev/sdb, /dev/sdc etc.) for each unique LUN
-    seen from each path.
-
-    Once these scsi block devices are instantiated, an application
-    written to a specification provided by the block library may get
-    access to the Flash from user space (without requiring a system call).
-
-    This master context driver also provides a series of ioctls for this
-    block library to enable this user space access.  The driver supports
-    two modes for accessing the block device.
-
-    The first mode is called a virtual mode. In this mode a single scsi
-    block device (/dev/sdb) may be carved up into any number of distinct
-    virtual LUNs. The virtual LUNs may be resized as long as the sum of
-    the sizes of all the virtual LUNs, along with the meta-data associated
-    with it does not exceed the physical capacity.
-
-    The second mode is called the physical mode. In this mode a single
-    block device (/dev/sdb) may be opened directly by the block library
-    and the entire space for the LUN is available to the application.
-
-    Only the physical mode provides persistence of the data.  i.e. The
-    data written to the block device will survive application exit and
-    restart and also reboot. The virtual LUNs do not persist (i.e. do
-    not survive after the application terminates or the system reboots).
-
-
-Block library API
-=================
-
-    Applications intending to get access to the CXL Flash from user
-    space should use the block library, as it abstracts the details of
-    interfacing directly with the cxlflash driver that are necessary for
-    performing administrative actions (i.e.: setup, tear down, resize).
-    The block library can be thought of as a 'user' of services,
-    implemented as IOCTLs, that are provided by the cxlflash driver
-    specifically for devices (LUNs) operating in user space access
-    mode. While it is not a requirement that applications understand
-    the interface between the block library and the cxlflash driver,
-    a high-level overview of each supported service (IOCTL) is provided
-    below.
-
-    The block library can be found on GitHub:
-    http://github.com/open-power/capiflash
-
-
-CXL Flash Driver LUN IOCTLs
-===========================
-
-    Users, such as the block library, that wish to interface with a flash
-    device (LUN) via user space access need to use the services provided
-    by the cxlflash driver. As these services are implemented as ioctls,
-    a file descriptor handle must first be obtained in order to establish
-    the communication channel between a user and the kernel.  This file
-    descriptor is obtained by opening the device special file associated
-    with the scsi disk device (/dev/sdb) that was created during LUN
-    discovery. As per the location of the cxlflash driver within the
-    SCSI protocol stack, this open is actually not seen by the cxlflash
-    driver. Upon successful open, the user receives a file descriptor
-    (herein referred to as fd1) that should be used for issuing the
-    subsequent ioctls listed below.
-
-    The structure definitions for these IOCTLs are available in:
-    uapi/scsi/cxlflash_ioctl.h
-
-DK_CXLFLASH_ATTACH
-------------------
-
-    This ioctl obtains, initializes, and starts a context using the CXL
-    kernel services. These services specify a context id (u16) by which
-    to uniquely identify the context and its allocated resources. The
-    services additionally provide a second file descriptor (herein
-    referred to as fd2) that is used by the block library to initiate
-    memory mapped I/O (via mmap()) to the CXL flash device and poll for
-    completion events. This file descriptor is intentionally installed by
-    this driver and not the CXL kernel services to allow for intermediary
-    notification and access in the event of a non-user-initiated close(),
-    such as a killed process. This design point is described in further
-    detail in the description for the DK_CXLFLASH_DETACH ioctl.
-
-    There are a few important aspects regarding the "tokens" (context id
-    and fd2) that are provided back to the user:
-
-        - These tokens are only valid for the process under which they
-          were created. The child of a forked process cannot continue
-          to use the context id or file descriptor created by its parent
-          (see DK_CXLFLASH_VLUN_CLONE for further details).
-
-        - These tokens are only valid for the lifetime of the context and
-          the process under which they were created. Once either is
-          destroyed, the tokens are to be considered stale and subsequent
-          usage will result in errors.
-
-	- A valid adapter file descriptor (fd2 >= 0) is only returned on
-	  the initial attach for a context. Subsequent attaches to an
-	  existing context (DK_CXLFLASH_ATTACH_REUSE_CONTEXT flag present)
-	  do not provide the adapter file descriptor as it was previously
-	  made known to the application.
-
-        - When a context is no longer needed, the user shall detach from
-          the context via the DK_CXLFLASH_DETACH ioctl. When this ioctl
-	  returns with a valid adapter file descriptor and the return flag
-	  DK_CXLFLASH_APP_CLOSE_ADAP_FD is present, the application _must_
-	  close the adapter file descriptor following a successful detach.
-
-	- When this ioctl returns with a valid fd2 and the return flag
-	  DK_CXLFLASH_APP_CLOSE_ADAP_FD is present, the application _must_
-	  close fd2 in the following circumstances:
-
-		+ Following a successful detach of the last user of the context
-		+ Following a successful recovery on the context's original fd2
-		+ In the child process of a fork(), following a clone ioctl,
-		  on the fd2 associated with the source context
-
-        - At any time, a close on fd2 will invalidate the tokens. Applications
-	  should exercise caution to only close fd2 when appropriate (outlined
-	  in the previous bullet) to avoid premature loss of I/O.
-
-DK_CXLFLASH_USER_DIRECT
------------------------
-    This ioctl is responsible for transitioning the LUN to direct
-    (physical) mode access and configuring the AFU for direct access from
-    user space on a per-context basis. Additionally, the block size and
-    last logical block address (LBA) are returned to the user.
-
-    As mentioned previously, when operating in user space access mode,
-    LUNs may be accessed in whole or in part. Only one mode is allowed
-    at a time and if one mode is active (outstanding references exist),
-    requests to use the LUN in a different mode are denied.
-
-    The AFU is configured for direct access from user space by adding an
-    entry to the AFU's resource handle table. The index of the entry is
-    treated as a resource handle that is returned to the user. The user
-    is then able to use the handle to reference the LUN during I/O.
-
-DK_CXLFLASH_USER_VIRTUAL
-------------------------
-    This ioctl is responsible for transitioning the LUN to virtual mode
-    of access and configuring the AFU for virtual access from user space
-    on a per-context basis. Additionally, the block size and last logical
-    block address (LBA) are returned to the user.
-
-    As mentioned previously, when operating in user space access mode,
-    LUNs may be accessed in whole or in part. Only one mode is allowed
-    at a time and if one mode is active (outstanding references exist),
-    requests to use the LUN in a different mode are denied.
-
-    The AFU is configured for virtual access from user space by adding
-    an entry to the AFU's resource handle table. The index of the entry
-    is treated as a resource handle that is returned to the user. The
-    user is then able to use the handle to reference the LUN during I/O.
-
-    By default, the virtual LUN is created with a size of 0. The user
-    would need to use the DK_CXLFLASH_VLUN_RESIZE ioctl to adjust the grow
-    the virtual LUN to a desired size. To avoid having to perform this
-    resize for the initial creation of the virtual LUN, the user has the
-    option of specifying a size as part of the DK_CXLFLASH_USER_VIRTUAL
-    ioctl, such that when success is returned to the user, the
-    resource handle that is provided is already referencing provisioned
-    storage. This is reflected by the last LBA being a non-zero value.
-
-    When a LUN is accessible from more than one port, this ioctl will
-    return with the DK_CXLFLASH_ALL_PORTS_ACTIVE return flag set. This
-    provides the user with a hint that I/O can be retried in the event
-    of an I/O error as the LUN can be reached over multiple paths.
-
-DK_CXLFLASH_VLUN_RESIZE
------------------------
-    This ioctl is responsible for resizing a previously created virtual
-    LUN and will fail if invoked upon a LUN that is not in virtual
-    mode. Upon success, an updated last LBA is returned to the user
-    indicating the new size of the virtual LUN associated with the
-    resource handle.
-
-    The partitioning of virtual LUNs is jointly mediated by the cxlflash
-    driver and the AFU. An allocation table is kept for each LUN that is
-    operating in the virtual mode and used to program a LUN translation
-    table that the AFU references when provided with a resource handle.
-
-    This ioctl can return -EAGAIN if an AFU sync operation takes too long.
-    In addition to returning a failure to user, cxlflash will also schedule
-    an asynchronous AFU reset. Should the user choose to retry the operation,
-    it is expected to succeed. If this ioctl fails with -EAGAIN, the user
-    can either retry the operation or treat it as a failure.
-
-DK_CXLFLASH_RELEASE
--------------------
-    This ioctl is responsible for releasing a previously obtained
-    reference to either a physical or virtual LUN. This can be
-    thought of as the inverse of the DK_CXLFLASH_USER_DIRECT or
-    DK_CXLFLASH_USER_VIRTUAL ioctls. Upon success, the resource handle
-    is no longer valid and the entry in the resource handle table is
-    made available to be used again.
-
-    As part of the release process for virtual LUNs, the virtual LUN
-    is first resized to 0 to clear out and free the translation tables
-    associated with the virtual LUN reference.
-
-DK_CXLFLASH_DETACH
-------------------
-    This ioctl is responsible for unregistering a context with the
-    cxlflash driver and release outstanding resources that were
-    not explicitly released via the DK_CXLFLASH_RELEASE ioctl. Upon
-    success, all "tokens" which had been provided to the user from the
-    DK_CXLFLASH_ATTACH onward are no longer valid.
-
-    When the DK_CXLFLASH_APP_CLOSE_ADAP_FD flag was returned on a successful
-    attach, the application _must_ close the fd2 associated with the context
-    following the detach of the final user of the context.
-
-DK_CXLFLASH_VLUN_CLONE
-----------------------
-    This ioctl is responsible for cloning a previously created
-    context to a more recently created context. It exists solely to
-    support maintaining user space access to storage after a process
-    forks. Upon success, the child process (which invoked the ioctl)
-    will have access to the same LUNs via the same resource handle(s)
-    as the parent, but under a different context.
-
-    Context sharing across processes is not supported with CXL and
-    therefore each fork must be met with establishing a new context
-    for the child process. This ioctl simplifies the state management
-    and playback required by a user in such a scenario. When a process
-    forks, child process can clone the parents context by first creating
-    a context (via DK_CXLFLASH_ATTACH) and then using this ioctl to
-    perform the clone from the parent to the child.
-
-    The clone itself is fairly simple. The resource handle and lun
-    translation tables are copied from the parent context to the child's
-    and then synced with the AFU.
-
-    When the DK_CXLFLASH_APP_CLOSE_ADAP_FD flag was returned on a successful
-    attach, the application _must_ close the fd2 associated with the source
-    context (still resident/accessible in the parent process) following the
-    clone. This is to avoid a stale entry in the file descriptor table of the
-    child process.
-
-    This ioctl can return -EAGAIN if an AFU sync operation takes too long.
-    In addition to returning a failure to user, cxlflash will also schedule
-    an asynchronous AFU reset. Should the user choose to retry the operation,
-    it is expected to succeed. If this ioctl fails with -EAGAIN, the user
-    can either retry the operation or treat it as a failure.
-
-DK_CXLFLASH_VERIFY
-------------------
-    This ioctl is used to detect various changes such as the capacity of
-    the disk changing, the number of LUNs visible changing, etc. In cases
-    where the changes affect the application (such as a LUN resize), the
-    cxlflash driver will report the changed state to the application.
-
-    The user calls in when they want to validate that a LUN hasn't been
-    changed in response to a check condition. As the user is operating out
-    of band from the kernel, they will see these types of events without
-    the kernel's knowledge. When encountered, the user's architected
-    behavior is to call in to this ioctl, indicating what they want to
-    verify and passing along any appropriate information. For now, only
-    verifying a LUN change (ie: size different) with sense data is
-    supported.
-
-DK_CXLFLASH_RECOVER_AFU
------------------------
-    This ioctl is used to drive recovery (if such an action is warranted)
-    of a specified user context. Any state associated with the user context
-    is re-established upon successful recovery.
-
-    User contexts are put into an error condition when the device needs to
-    be reset or is terminating. Users are notified of this error condition
-    by seeing all 0xF's on an MMIO read. Upon encountering this, the
-    architected behavior for a user is to call into this ioctl to recover
-    their context. A user may also call into this ioctl at any time to
-    check if the device is operating normally. If a failure is returned
-    from this ioctl, the user is expected to gracefully clean up their
-    context via release/detach ioctls. Until they do, the context they
-    hold is not relinquished. The user may also optionally exit the process
-    at which time the context/resources they held will be freed as part of
-    the release fop.
-
-    When the DK_CXLFLASH_APP_CLOSE_ADAP_FD flag was returned on a successful
-    attach, the application _must_ unmap and close the fd2 associated with the
-    original context following this ioctl returning success and indicating that
-    the context was recovered (DK_CXLFLASH_RECOVER_AFU_CONTEXT_RESET).
-
-DK_CXLFLASH_MANAGE_LUN
-----------------------
-    This ioctl is used to switch a LUN from a mode where it is available
-    for file-system access (legacy), to a mode where it is set aside for
-    exclusive user space access (superpipe). In case a LUN is visible
-    across multiple ports and adapters, this ioctl is used to uniquely
-    identify each LUN by its World Wide Node Name (WWNN).
-
-
-CXL Flash Driver Host IOCTLs
-============================
-
-    Each host adapter instance that is supported by the cxlflash driver
-    has a special character device associated with it to enable a set of
-    host management function. These character devices are hosted in a
-    class dedicated for cxlflash and can be accessed via /dev/cxlflash/*.
-
-    Applications can be written to perform various functions using the
-    host ioctl APIs below.
-
-    The structure definitions for these IOCTLs are available in:
-    uapi/scsi/cxlflash_ioctl.h
-
-HT_CXLFLASH_LUN_PROVISION
--------------------------
-    This ioctl is used to create and delete persistent LUNs on cxlflash
-    devices that lack an external LUN management interface. It is only
-    valid when used with AFUs that support the LUN provision capability.
-
-    When sufficient space is available, LUNs can be created by specifying
-    the target port to host the LUN and a desired size in 4K blocks. Upon
-    success, the LUN ID and WWID of the created LUN will be returned and
-    the SCSI bus can be scanned to detect the change in LUN topology. Note
-    that partial allocations are not supported. Should a creation fail due
-    to a space issue, the target port can be queried for its current LUN
-    geometry.
-
-    To remove a LUN, the device must first be disassociated from the Linux
-    SCSI subsystem. The LUN deletion can then be initiated by specifying a
-    target port and LUN ID. Upon success, the LUN geometry associated with
-    the port will be updated to reflect new number of provisioned LUNs and
-    available capacity.
-
-    To query the LUN geometry of a port, the target port is specified and
-    upon success, the following information is presented:
-
-        - Maximum number of provisioned LUNs allowed for the port
-        - Current number of provisioned LUNs for the port
-        - Maximum total capacity of provisioned LUNs for the port (4K blocks)
-        - Current total capacity of provisioned LUNs for the port (4K blocks)
-
-    With this information, the number of available LUNs and capacity can be
-    can be calculated.
-
-HT_CXLFLASH_AFU_DEBUG
----------------------
-    This ioctl is used to debug AFUs by supporting a command pass-through
-    interface. It is only valid when used with AFUs that support the AFU
-    debug capability.
-
-    With exception of buffer management, AFU debug commands are opaque to
-    cxlflash and treated as pass-through. For debug commands that do require
-    data transfer, the user supplies an adequately sized data buffer and must
-    specify the data transfer direction with respect to the host. There is a
-    maximum transfer size of 256K imposed. Note that partial read completions
-    are not supported - when errors are experienced with a host read data
-    transfer, the data buffer is not copied back to the user.
diff --git a/Documentation/powerpc/dawr-power9.rst b/Documentation/powerpc/dawr-power9.rst
new file mode 100644
index 000000000000..c96ab6befd9c
--- /dev/null
+++ b/Documentation/powerpc/dawr-power9.rst
@@ -0,0 +1,93 @@
+=====================
+DAWR issues on POWER9
+=====================
+
+On POWER9 the Data Address Watchpoint Register (DAWR) can cause a checkstop
+if it points to cache inhibited (CI) memory. Currently Linux has no way to
+disinguish CI memory when configuring the DAWR, so (for now) the DAWR is
+disabled by this commit::
+
+    commit 9654153158d3e0684a1bdb76dbababdb7111d5a0
+    Author: Michael Neuling <mikey@neuling.org>
+    Date:   Tue Mar 27 15:37:24 2018 +1100
+    powerpc: Disable DAWR in the base POWER9 CPU features
+
+Technical Details:
+==================
+
+DAWR has 6 different ways of being set.
+1) ptrace
+2) h_set_mode(DAWR)
+3) h_set_dabr()
+4) kvmppc_set_one_reg()
+5) xmon
+
+For ptrace, we now advertise zero breakpoints on POWER9 via the
+PPC_PTRACE_GETHWDBGINFO call. This results in GDB falling back to
+software emulation of the watchpoint (which is slow).
+
+h_set_mode(DAWR) and h_set_dabr() will now return an error to the
+guest on a POWER9 host. Current Linux guests ignore this error, so
+they will silently not get the DAWR.
+
+kvmppc_set_one_reg() will store the value in the vcpu but won't
+actually set it on POWER9 hardware. This is done so we don't break
+migration from POWER8 to POWER9, at the cost of silently losing the
+DAWR on the migration.
+
+For xmon, the 'bd' command will return an error on P9.
+
+Consequences for users
+======================
+
+For GDB watchpoints (ie 'watch' command) on POWER9 bare metal , GDB
+will accept the command. Unfortunately since there is no hardware
+support for the watchpoint, GDB will software emulate the watchpoint
+making it run very slowly.
+
+The same will also be true for any guests started on a POWER9
+host. The watchpoint will fail and GDB will fall back to software
+emulation.
+
+If a guest is started on a POWER8 host, GDB will accept the watchpoint
+and configure the hardware to use the DAWR. This will run at full
+speed since it can use the hardware emulation. Unfortunately if this
+guest is migrated to a POWER9 host, the watchpoint will be lost on the
+POWER9. Loads and stores to the watchpoint locations will not be
+trapped in GDB. The watchpoint is remembered, so if the guest is
+migrated back to the POWER8 host, it will start working again.
+
+Force enabling the DAWR
+=======================
+Kernels (since ~v5.2) have an option to force enable the DAWR via::
+
+  echo Y > /sys/kernel/debug/powerpc/dawr_enable_dangerous
+
+This enables the DAWR even on POWER9.
+
+This is a dangerous setting, USE AT YOUR OWN RISK.
+
+Some users may not care about a bad user crashing their box
+(ie. single user/desktop systems) and really want the DAWR.  This
+allows them to force enable DAWR.
+
+This flag can also be used to disable DAWR access. Once this is
+cleared, all DAWR access should be cleared immediately and your
+machine once again safe from crashing.
+
+Userspace may get confused by toggling this. If DAWR is force
+enabled/disabled between getting the number of breakpoints (via
+PTRACE_GETHWDBGINFO) and setting the breakpoint, userspace will get an
+inconsistent view of what's available. Similarly for guests.
+
+For the DAWR to be enabled in a KVM guest, the DAWR needs to be force
+enabled in the host AND the guest. For this reason, this won't work on
+POWERVM as it doesn't allow the HCALL to work. Writes of 'Y' to the
+dawr_enable_dangerous file will fail if the hypervisor doesn't support
+writing the DAWR.
+
+To double check the DAWR is working, run this kernel selftest:
+
+  tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
+
+Any errors/failures/skips mean something is wrong.
diff --git a/Documentation/powerpc/dscr.rst b/Documentation/powerpc/dscr.rst
new file mode 100644
index 000000000000..2ab99006014c
--- /dev/null
+++ b/Documentation/powerpc/dscr.rst
@@ -0,0 +1,87 @@
+===================================
+DSCR (Data Stream Control Register)
+===================================
+
+DSCR register in powerpc allows user to have some control of prefetch of data
+stream in the processor. Please refer to the ISA documents or related manual
+for more detailed information regarding how to use this DSCR to attain this
+control of the prefetches . This document here provides an overview of kernel
+support for DSCR, related kernel objects, it's functionalities and exported
+user interface.
+
+(A) Data Structures:
+
+	(1) thread_struct::
+
+		dscr		/* Thread DSCR value */
+		dscr_inherit	/* Thread has changed default DSCR */
+
+	(2) PACA::
+
+		dscr_default	/* per-CPU DSCR default value */
+
+	(3) sysfs.c::
+
+		dscr_default	/* System DSCR default value */
+
+(B) Scheduler Changes:
+
+	Scheduler will write the per-CPU DSCR default which is stored in the
+	CPU's PACA value into the register if the thread has dscr_inherit value
+	cleared which means that it has not changed the default DSCR till now.
+	If the dscr_inherit value is set which means that it has changed the
+	default DSCR value, scheduler will write the changed value which will
+	now be contained in thread struct's dscr into the register instead of
+	the per-CPU default PACA based DSCR value.
+
+	NOTE: Please note here that the system wide global DSCR value never
+	gets used directly in the scheduler process context switch at all.
+
+(C) SYSFS Interface:
+
+	- Global DSCR default:		/sys/devices/system/cpu/dscr_default
+	- CPU specific DSCR default:	/sys/devices/system/cpu/cpuN/dscr
+
+	Changing the global DSCR default in the sysfs will change all the CPU
+	specific DSCR defaults immediately in their PACA structures. Again if
+	the current process has the dscr_inherit clear, it also writes the new
+	value into every CPU's DSCR register right away and updates the current
+	thread's DSCR value as well.
+
+	Changing the CPU specific DSCR default value in the sysfs does exactly
+	the same thing as above but unlike the global one above, it just changes
+	stuff for that particular CPU instead for all the CPUs on the system.
+
+(D) User Space Instructions:
+
+	The DSCR register can be accessed in the user space using any of these
+	two SPR numbers available for that purpose.
+
+	(1) Problem state SPR:		0x03	(Un-privileged, POWER8 only)
+	(2) Privileged state SPR:	0x11	(Privileged)
+
+	Accessing DSCR through privileged SPR number (0x11) from user space
+	works, as it is emulated following an illegal instruction exception
+	inside the kernel. Both mfspr and mtspr instructions are emulated.
+
+	Accessing DSCR through user level SPR (0x03) from user space will first
+	create a facility unavailable exception. Inside this exception handler
+	all mfspr instruction based read attempts will get emulated and returned
+	where as the first mtspr instruction based write attempts will enable
+	the DSCR facility for the next time around (both for read and write) by
+	setting DSCR facility in the FSCR register.
+
+(E) Specifics about 'dscr_inherit':
+
+	The thread struct element 'dscr_inherit' represents whether the thread
+	in question has attempted and changed the DSCR itself using any of the
+	following methods. This element signifies whether the thread wants to
+	use the CPU default DSCR value or its own changed DSCR value in the
+	kernel.
+
+		(1) mtspr instruction	(SPR number 0x03)
+		(2) mtspr instruction	(SPR number 0x11)
+		(3) ptrace interface	(Explicitly set user DSCR value)
+
+	Any child of the process created after this event in the process inherits
+	this same behaviour as well.
diff --git a/Documentation/powerpc/dscr.txt b/Documentation/powerpc/dscr.txt
deleted file mode 100644
index ece300c64f76..000000000000
--- a/Documentation/powerpc/dscr.txt
+++ /dev/null
@@ -1,83 +0,0 @@
-			DSCR (Data Stream Control Register)
-		================================================
-
-DSCR register in powerpc allows user to have some control of prefetch of data
-stream in the processor. Please refer to the ISA documents or related manual
-for more detailed information regarding how to use this DSCR to attain this
-control of the prefetches . This document here provides an overview of kernel
-support for DSCR, related kernel objects, it's functionalities and exported
-user interface.
-
-(A) Data Structures:
-
-	(1) thread_struct:
-		dscr		/* Thread DSCR value */
-		dscr_inherit	/* Thread has changed default DSCR */
-
-	(2) PACA:
-		dscr_default	/* per-CPU DSCR default value */
-
-	(3) sysfs.c:
-		dscr_default	/* System DSCR default value */
-
-(B) Scheduler Changes:
-
-	Scheduler will write the per-CPU DSCR default which is stored in the
-	CPU's PACA value into the register if the thread has dscr_inherit value
-	cleared which means that it has not changed the default DSCR till now.
-	If the dscr_inherit value is set which means that it has changed the
-	default DSCR value, scheduler will write the changed value which will
-	now be contained in thread struct's dscr into the register instead of
-	the per-CPU default PACA based DSCR value.
-
-	NOTE: Please note here that the system wide global DSCR value never
-	gets used directly in the scheduler process context switch at all.
-
-(C) SYSFS Interface:
-
-	Global DSCR default:		/sys/devices/system/cpu/dscr_default
-	CPU specific DSCR default:	/sys/devices/system/cpu/cpuN/dscr
-
-	Changing the global DSCR default in the sysfs will change all the CPU
-	specific DSCR defaults immediately in their PACA structures. Again if
-	the current process has the dscr_inherit clear, it also writes the new
-	value into every CPU's DSCR register right away and updates the current
-	thread's DSCR value as well.
-
-	Changing the CPU specific DSCR default value in the sysfs does exactly
-	the same thing as above but unlike the global one above, it just changes
-	stuff for that particular CPU instead for all the CPUs on the system.
-
-(D) User Space Instructions:
-
-	The DSCR register can be accessed in the user space using any of these
-	two SPR numbers available for that purpose.
-
-	(1) Problem state SPR:		0x03	(Un-privileged, POWER8 only)
-	(2) Privileged state SPR:	0x11	(Privileged)
-
-	Accessing DSCR through privileged SPR number (0x11) from user space
-	works, as it is emulated following an illegal instruction exception
-	inside the kernel. Both mfspr and mtspr instructions are emulated.
-
-	Accessing DSCR through user level SPR (0x03) from user space will first
-	create a facility unavailable exception. Inside this exception handler
-	all mfspr instruction based read attempts will get emulated and returned
-	where as the first mtspr instruction based write attempts will enable
-	the DSCR facility for the next time around (both for read and write) by
-	setting DSCR facility in the FSCR register.
-
-(E) Specifics about 'dscr_inherit':
-
-	The thread struct element 'dscr_inherit' represents whether the thread
-	in question has attempted and changed the DSCR itself using any of the
-	following methods. This element signifies whether the thread wants to
-	use the CPU default DSCR value or its own changed DSCR value in the
-	kernel.
-
-		(1) mtspr instruction	(SPR number 0x03)
-		(2) mtspr instruction	(SPR number 0x11)
-		(3) ptrace interface	(Explicitly set user DSCR value)
-
-	Any child of the process created after this event in the process inherits
-	this same behaviour as well.
diff --git a/Documentation/powerpc/eeh-pci-error-recovery.rst b/Documentation/powerpc/eeh-pci-error-recovery.rst
new file mode 100644
index 000000000000..438a87ebc095
--- /dev/null
+++ b/Documentation/powerpc/eeh-pci-error-recovery.rst
@@ -0,0 +1,336 @@
+==========================
+PCI Bus EEH Error Recovery
+==========================
+
+Linas Vepstas <linas@austin.ibm.com>
+
+12 January 2005
+
+
+Overview:
+---------
+The IBM POWER-based pSeries and iSeries computers include PCI bus
+controller chips that have extended capabilities for detecting and
+reporting a large variety of PCI bus error conditions.  These features
+go under the name of "EEH", for "Enhanced Error Handling".  The EEH
+hardware features allow PCI bus errors to be cleared and a PCI
+card to be "rebooted", without also having to reboot the operating
+system.
+
+This is in contrast to traditional PCI error handling, where the
+PCI chip is wired directly to the CPU, and an error would cause
+a CPU machine-check/check-stop condition, halting the CPU entirely.
+Another "traditional" technique is to ignore such errors, which
+can lead to data corruption, both of user data or of kernel data,
+hung/unresponsive adapters, or system crashes/lockups.  Thus,
+the idea behind EEH is that the operating system can become more
+reliable and robust by protecting it from PCI errors, and giving
+the OS the ability to "reboot"/recover individual PCI devices.
+
+Future systems from other vendors, based on the PCI-E specification,
+may contain similar features.
+
+
+Causes of EEH Errors
+--------------------
+EEH was originally designed to guard against hardware failure, such
+as PCI cards dying from heat, humidity, dust, vibration and bad
+electrical connections. The vast majority of EEH errors seen in
+"real life" are due to either poorly seated PCI cards, or,
+unfortunately quite commonly, due to device driver bugs, device firmware
+bugs, and sometimes PCI card hardware bugs.
+
+The most common software bug, is one that causes the device to
+attempt to DMA to a location in system memory that has not been
+reserved for DMA access for that card.  This is a powerful feature,
+as it prevents what; otherwise, would have been silent memory
+corruption caused by the bad DMA.  A number of device driver
+bugs have been found and fixed in this way over the past few
+years.  Other possible causes of EEH errors include data or
+address line parity errors (for example, due to poor electrical
+connectivity due to a poorly seated card), and PCI-X split-completion
+errors (due to software, device firmware, or device PCI hardware bugs).
+The vast majority of "true hardware failures" can be cured by
+physically removing and re-seating the PCI card.
+
+
+Detection and Recovery
+----------------------
+In the following discussion, a generic overview of how to detect
+and recover from EEH errors will be presented. This is followed
+by an overview of how the current implementation in the Linux
+kernel does it.  The actual implementation is subject to change,
+and some of the finer points are still being debated.  These
+may in turn be swayed if or when other architectures implement
+similar functionality.
+
+When a PCI Host Bridge (PHB, the bus controller connecting the
+PCI bus to the system CPU electronics complex) detects a PCI error
+condition, it will "isolate" the affected PCI card.  Isolation
+will block all writes (either to the card from the system, or
+from the card to the system), and it will cause all reads to
+return all-ff's (0xff, 0xffff, 0xffffffff for 8/16/32-bit reads).
+This value was chosen because it is the same value you would
+get if the device was physically unplugged from the slot.
+This includes access to PCI memory, I/O space, and PCI config
+space.  Interrupts; however, will continued to be delivered.
+
+Detection and recovery are performed with the aid of ppc64
+firmware.  The programming interfaces in the Linux kernel
+into the firmware are referred to as RTAS (Run-Time Abstraction
+Services).  The Linux kernel does not (should not) access
+the EEH function in the PCI chipsets directly, primarily because
+there are a number of different chipsets out there, each with
+different interfaces and quirks. The firmware provides a
+uniform abstraction layer that will work with all pSeries
+and iSeries hardware (and be forwards-compatible).
+
+If the OS or device driver suspects that a PCI slot has been
+EEH-isolated, there is a firmware call it can make to determine if
+this is the case. If so, then the device driver should put itself
+into a consistent state (given that it won't be able to complete any
+pending work) and start recovery of the card.  Recovery normally
+would consist of resetting the PCI device (holding the PCI #RST
+line high for two seconds), followed by setting up the device
+config space (the base address registers (BAR's), latency timer,
+cache line size, interrupt line, and so on).  This is followed by a
+reinitialization of the device driver.  In a worst-case scenario,
+the power to the card can be toggled, at least on hot-plug-capable
+slots.  In principle, layers far above the device driver probably
+do not need to know that the PCI card has been "rebooted" in this
+way; ideally, there should be at most a pause in Ethernet/disk/USB
+I/O while the card is being reset.
+
+If the card cannot be recovered after three or four resets, the
+kernel/device driver should assume the worst-case scenario, that the
+card has died completely, and report this error to the sysadmin.
+In addition, error messages are reported through RTAS and also through
+syslogd (/var/log/messages) to alert the sysadmin of PCI resets.
+The correct way to deal with failed adapters is to use the standard
+PCI hotplug tools to remove and replace the dead card.
+
+
+Current PPC64 Linux EEH Implementation
+--------------------------------------
+At this time, a generic EEH recovery mechanism has been implemented,
+so that individual device drivers do not need to be modified to support
+EEH recovery.  This generic mechanism piggy-backs on the PCI hotplug
+infrastructure,  and percolates events up through the userspace/udev
+infrastructure.  Following is a detailed description of how this is
+accomplished.
+
+EEH must be enabled in the PHB's very early during the boot process,
+and if a PCI slot is hot-plugged. The former is performed by
+eeh_init() in arch/powerpc/platforms/pseries/eeh.c, and the later by
+drivers/pci/hotplug/pSeries_pci.c calling in to the eeh.c code.
+EEH must be enabled before a PCI scan of the device can proceed.
+Current Power5 hardware will not work unless EEH is enabled;
+although older Power4 can run with it disabled.  Effectively,
+EEH can no longer be turned off.  PCI devices *must* be
+registered with the EEH code; the EEH code needs to know about
+the I/O address ranges of the PCI device in order to detect an
+error.  Given an arbitrary address, the routine
+pci_get_device_by_addr() will find the pci device associated
+with that address (if any).
+
+The default arch/powerpc/include/asm/io.h macros readb(), inb(), insb(),
+etc. include a check to see if the i/o read returned all-0xff's.
+If so, these make a call to eeh_dn_check_failure(), which in turn
+asks the firmware if the all-ff's value is the sign of a true EEH
+error.  If it is not, processing continues as normal.  The grand
+total number of these false alarms or "false positives" can be
+seen in /proc/ppc64/eeh (subject to change).  Normally, almost
+all of these occur during boot, when the PCI bus is scanned, where
+a large number of 0xff reads are part of the bus scan procedure.
+
+If a frozen slot is detected, code in
+arch/powerpc/platforms/pseries/eeh.c will print a stack trace to
+syslog (/var/log/messages).  This stack trace has proven to be very
+useful to device-driver authors for finding out at what point the EEH
+error was detected, as the error itself usually occurs slightly
+beforehand.
+
+Next, it uses the Linux kernel notifier chain/work queue mechanism to
+allow any interested parties to find out about the failure.  Device
+drivers, or other parts of the kernel, can use
+`eeh_register_notifier(struct notifier_block *)` to find out about EEH
+events.  The event will include a pointer to the pci device, the
+device node and some state info.  Receivers of the event can "do as
+they wish"; the default handler will be described further in this
+section.
+
+To assist in the recovery of the device, eeh.c exports the
+following functions:
+
+rtas_set_slot_reset()
+   assert the  PCI #RST line for 1/8th of a second
+rtas_configure_bridge()
+   ask firmware to configure any PCI bridges
+   located topologically under the pci slot.
+eeh_save_bars() and eeh_restore_bars():
+   save and restore the PCI
+   config-space info for a device and any devices under it.
+
+
+A handler for the EEH notifier_block events is implemented in
+drivers/pci/hotplug/pSeries_pci.c, called handle_eeh_events().
+It saves the device BAR's and then calls rpaphp_unconfig_pci_adapter().
+This last call causes the device driver for the card to be stopped,
+which causes uevents to go out to user space. This triggers
+user-space scripts that might issue commands such as "ifdown eth0"
+for ethernet cards, and so on.  This handler then sleeps for 5 seconds,
+hoping to give the user-space scripts enough time to complete.
+It then resets the PCI card, reconfigures the device BAR's, and
+any bridges underneath. It then calls rpaphp_enable_pci_slot(),
+which restarts the device driver and triggers more user-space
+events (for example, calling "ifup eth0" for ethernet cards).
+
+
+Device Shutdown and User-Space Events
+-------------------------------------
+This section documents what happens when a pci slot is unconfigured,
+focusing on how the device driver gets shut down, and on how the
+events get delivered to user-space scripts.
+
+Following is an example sequence of events that cause a device driver
+close function to be called during the first phase of an EEH reset.
+The following sequence is an example of the pcnet32 device driver::
+
+    rpa_php_unconfig_pci_adapter (struct slot *)  // in rpaphp_pci.c
+    {
+      calls
+      pci_remove_bus_device (struct pci_dev *) // in /drivers/pci/remove.c
+      {
+        calls
+        pci_destroy_dev (struct pci_dev *)
+        {
+          calls
+          device_unregister (&dev->dev) // in /drivers/base/core.c
+          {
+            calls
+            device_del (struct device *)
+            {
+              calls
+              bus_remove_device() // in /drivers/base/bus.c
+              {
+                calls
+                device_release_driver()
+                {
+                  calls
+                  struct device_driver->remove() which is just
+                  pci_device_remove()  // in /drivers/pci/pci_driver.c
+                  {
+                    calls
+                    struct pci_driver->remove() which is just
+                    pcnet32_remove_one() // in /drivers/net/pcnet32.c
+                    {
+                      calls
+                      unregister_netdev() // in /net/core/dev.c
+                      {
+                        calls
+                        dev_close()  // in /net/core/dev.c
+                        {
+                           calls dev->stop();
+                           which is just pcnet32_close() // in pcnet32.c
+                           {
+                             which does what you wanted
+                             to stop the device
+                           }
+                        }
+                     }
+                   which
+                   frees pcnet32 device driver memory
+                }
+     }}}}}}
+
+
+in drivers/pci/pci_driver.c,
+struct device_driver->remove() is just pci_device_remove()
+which calls struct pci_driver->remove() which is pcnet32_remove_one()
+which calls unregister_netdev()  (in net/core/dev.c)
+which calls dev_close()  (in net/core/dev.c)
+which calls dev->stop() which is pcnet32_close()
+which then does the appropriate shutdown.
+
+---
+
+Following is the analogous stack trace for events sent to user-space
+when the pci device is unconfigured::
+
+  rpa_php_unconfig_pci_adapter() {             // in rpaphp_pci.c
+    calls
+    pci_remove_bus_device (struct pci_dev *) { // in /drivers/pci/remove.c
+      calls
+      pci_destroy_dev (struct pci_dev *) {
+        calls
+        device_unregister (&dev->dev) {        // in /drivers/base/core.c
+          calls
+          device_del(struct device * dev) {    // in /drivers/base/core.c
+            calls
+            kobject_del() {                    //in /libs/kobject.c
+              calls
+              kobject_uevent() {               // in /libs/kobject.c
+                calls
+                kset_uevent() {                // in /lib/kobject.c
+                  calls
+                  kset->uevent_ops->uevent()   // which is really just
+                  a call to
+                  dev_uevent() {               // in /drivers/base/core.c
+                    calls
+                    dev->bus->uevent() which is really just a call to
+                    pci_uevent () {            // in drivers/pci/hotplug.c
+                      which prints device name, etc....
+                   }
+                 }
+                 then kobject_uevent() sends a netlink uevent to userspace
+                 --> userspace uevent
+                 (during early boot, nobody listens to netlink events and
+                 kobject_uevent() executes uevent_helper[], which runs the
+                 event process /sbin/hotplug)
+             }
+           }
+           kobject_del() then calls sysfs_remove_dir(), which would
+           trigger any user-space daemon that was watching /sysfs,
+           and notice the delete event.
+
+
+Pro's and Con's of the Current Design
+-------------------------------------
+There are several issues with the current EEH software recovery design,
+which may be addressed in future revisions.  But first, note that the
+big plus of the current design is that no changes need to be made to
+individual device drivers, so that the current design throws a wide net.
+The biggest negative of the design is that it potentially disturbs
+network daemons and file systems that didn't need to be disturbed.
+
+-  A minor complaint is that resetting the network card causes
+   user-space back-to-back ifdown/ifup burps that potentially disturb
+   network daemons, that didn't need to even know that the pci
+   card was being rebooted.
+
+-  A more serious concern is that the same reset, for SCSI devices,
+   causes havoc to mounted file systems.  Scripts cannot post-facto
+   unmount a file system without flushing pending buffers, but this
+   is impossible, because I/O has already been stopped.  Thus,
+   ideally, the reset should happen at or below the block layer,
+   so that the file systems are not disturbed.
+
+   Reiserfs does not tolerate errors returned from the block device.
+   Ext3fs seems to be tolerant, retrying reads/writes until it does
+   succeed. Both have been only lightly tested in this scenario.
+
+   The SCSI-generic subsystem already has built-in code for performing
+   SCSI device resets, SCSI bus resets, and SCSI host-bus-adapter
+   (HBA) resets.  These are cascaded into a chain of attempted
+   resets if a SCSI command fails. These are completely hidden
+   from the block layer.  It would be very natural to add an EEH
+   reset into this chain of events.
+
+-  If a SCSI error occurs for the root device, all is lost unless
+   the sysadmin had the foresight to run /bin, /sbin, /etc, /var
+   and so on, out of ramdisk/tmpfs.
+
+
+Conclusions
+-----------
+There's forward progress ...
diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt
deleted file mode 100644
index 678189280bb4..000000000000
--- a/Documentation/powerpc/eeh-pci-error-recovery.txt
+++ /dev/null
@@ -1,334 +0,0 @@
-
-
-                      PCI Bus EEH Error Recovery
-                      --------------------------
-                           Linas Vepstas
-                       <linas@austin.ibm.com>
-                          12 January 2005
-
-
-Overview:
----------
-The IBM POWER-based pSeries and iSeries computers include PCI bus
-controller chips that have extended capabilities for detecting and
-reporting a large variety of PCI bus error conditions.  These features
-go under the name of "EEH", for "Enhanced Error Handling".  The EEH
-hardware features allow PCI bus errors to be cleared and a PCI
-card to be "rebooted", without also having to reboot the operating
-system.
-
-This is in contrast to traditional PCI error handling, where the
-PCI chip is wired directly to the CPU, and an error would cause
-a CPU machine-check/check-stop condition, halting the CPU entirely.
-Another "traditional" technique is to ignore such errors, which
-can lead to data corruption, both of user data or of kernel data,
-hung/unresponsive adapters, or system crashes/lockups.  Thus,
-the idea behind EEH is that the operating system can become more
-reliable and robust by protecting it from PCI errors, and giving
-the OS the ability to "reboot"/recover individual PCI devices.
-
-Future systems from other vendors, based on the PCI-E specification,
-may contain similar features.
-
-
-Causes of EEH Errors
---------------------
-EEH was originally designed to guard against hardware failure, such
-as PCI cards dying from heat, humidity, dust, vibration and bad
-electrical connections. The vast majority of EEH errors seen in
-"real life" are due to either poorly seated PCI cards, or,
-unfortunately quite commonly, due to device driver bugs, device firmware
-bugs, and sometimes PCI card hardware bugs.
-
-The most common software bug, is one that causes the device to
-attempt to DMA to a location in system memory that has not been
-reserved for DMA access for that card.  This is a powerful feature,
-as it prevents what; otherwise, would have been silent memory
-corruption caused by the bad DMA.  A number of device driver
-bugs have been found and fixed in this way over the past few
-years.  Other possible causes of EEH errors include data or
-address line parity errors (for example, due to poor electrical
-connectivity due to a poorly seated card), and PCI-X split-completion
-errors (due to software, device firmware, or device PCI hardware bugs).
-The vast majority of "true hardware failures" can be cured by
-physically removing and re-seating the PCI card.
-
-
-Detection and Recovery
-----------------------
-In the following discussion, a generic overview of how to detect
-and recover from EEH errors will be presented. This is followed
-by an overview of how the current implementation in the Linux
-kernel does it.  The actual implementation is subject to change,
-and some of the finer points are still being debated.  These
-may in turn be swayed if or when other architectures implement
-similar functionality.
-
-When a PCI Host Bridge (PHB, the bus controller connecting the
-PCI bus to the system CPU electronics complex) detects a PCI error
-condition, it will "isolate" the affected PCI card.  Isolation
-will block all writes (either to the card from the system, or
-from the card to the system), and it will cause all reads to
-return all-ff's (0xff, 0xffff, 0xffffffff for 8/16/32-bit reads).
-This value was chosen because it is the same value you would
-get if the device was physically unplugged from the slot.
-This includes access to PCI memory, I/O space, and PCI config
-space.  Interrupts; however, will continued to be delivered.
-
-Detection and recovery are performed with the aid of ppc64
-firmware.  The programming interfaces in the Linux kernel
-into the firmware are referred to as RTAS (Run-Time Abstraction
-Services).  The Linux kernel does not (should not) access
-the EEH function in the PCI chipsets directly, primarily because
-there are a number of different chipsets out there, each with
-different interfaces and quirks. The firmware provides a
-uniform abstraction layer that will work with all pSeries
-and iSeries hardware (and be forwards-compatible).
-
-If the OS or device driver suspects that a PCI slot has been
-EEH-isolated, there is a firmware call it can make to determine if
-this is the case. If so, then the device driver should put itself
-into a consistent state (given that it won't be able to complete any
-pending work) and start recovery of the card.  Recovery normally
-would consist of resetting the PCI device (holding the PCI #RST
-line high for two seconds), followed by setting up the device
-config space (the base address registers (BAR's), latency timer,
-cache line size, interrupt line, and so on).  This is followed by a
-reinitialization of the device driver.  In a worst-case scenario,
-the power to the card can be toggled, at least on hot-plug-capable
-slots.  In principle, layers far above the device driver probably
-do not need to know that the PCI card has been "rebooted" in this
-way; ideally, there should be at most a pause in Ethernet/disk/USB
-I/O while the card is being reset.
-
-If the card cannot be recovered after three or four resets, the
-kernel/device driver should assume the worst-case scenario, that the
-card has died completely, and report this error to the sysadmin.
-In addition, error messages are reported through RTAS and also through
-syslogd (/var/log/messages) to alert the sysadmin of PCI resets.
-The correct way to deal with failed adapters is to use the standard
-PCI hotplug tools to remove and replace the dead card.
-
-
-Current PPC64 Linux EEH Implementation
---------------------------------------
-At this time, a generic EEH recovery mechanism has been implemented,
-so that individual device drivers do not need to be modified to support
-EEH recovery.  This generic mechanism piggy-backs on the PCI hotplug
-infrastructure,  and percolates events up through the userspace/udev
-infrastructure.  Following is a detailed description of how this is
-accomplished.
-
-EEH must be enabled in the PHB's very early during the boot process,
-and if a PCI slot is hot-plugged. The former is performed by
-eeh_init() in arch/powerpc/platforms/pseries/eeh.c, and the later by
-drivers/pci/hotplug/pSeries_pci.c calling in to the eeh.c code.
-EEH must be enabled before a PCI scan of the device can proceed.
-Current Power5 hardware will not work unless EEH is enabled;
-although older Power4 can run with it disabled.  Effectively,
-EEH can no longer be turned off.  PCI devices *must* be
-registered with the EEH code; the EEH code needs to know about
-the I/O address ranges of the PCI device in order to detect an
-error.  Given an arbitrary address, the routine
-pci_get_device_by_addr() will find the pci device associated
-with that address (if any).
-
-The default arch/powerpc/include/asm/io.h macros readb(), inb(), insb(),
-etc. include a check to see if the i/o read returned all-0xff's.
-If so, these make a call to eeh_dn_check_failure(), which in turn
-asks the firmware if the all-ff's value is the sign of a true EEH
-error.  If it is not, processing continues as normal.  The grand
-total number of these false alarms or "false positives" can be
-seen in /proc/ppc64/eeh (subject to change).  Normally, almost
-all of these occur during boot, when the PCI bus is scanned, where
-a large number of 0xff reads are part of the bus scan procedure.
-
-If a frozen slot is detected, code in 
-arch/powerpc/platforms/pseries/eeh.c will print a stack trace to 
-syslog (/var/log/messages).  This stack trace has proven to be very 
-useful to device-driver authors for finding out at what point the EEH 
-error was detected, as the error itself usually occurs slightly 
-beforehand.
-
-Next, it uses the Linux kernel notifier chain/work queue mechanism to
-allow any interested parties to find out about the failure.  Device
-drivers, or other parts of the kernel, can use
-eeh_register_notifier(struct notifier_block *) to find out about EEH
-events.  The event will include a pointer to the pci device, the
-device node and some state info.  Receivers of the event can "do as
-they wish"; the default handler will be described further in this
-section.
-
-To assist in the recovery of the device, eeh.c exports the
-following functions:
-
-rtas_set_slot_reset() -- assert the  PCI #RST line for 1/8th of a second
-rtas_configure_bridge() -- ask firmware to configure any PCI bridges
-   located topologically under the pci slot.
-eeh_save_bars() and eeh_restore_bars(): save and restore the PCI
-   config-space info for a device and any devices under it.
-
-
-A handler for the EEH notifier_block events is implemented in
-drivers/pci/hotplug/pSeries_pci.c, called handle_eeh_events().
-It saves the device BAR's and then calls rpaphp_unconfig_pci_adapter().
-This last call causes the device driver for the card to be stopped,
-which causes uevents to go out to user space. This triggers
-user-space scripts that might issue commands such as "ifdown eth0"
-for ethernet cards, and so on.  This handler then sleeps for 5 seconds,
-hoping to give the user-space scripts enough time to complete.
-It then resets the PCI card, reconfigures the device BAR's, and
-any bridges underneath. It then calls rpaphp_enable_pci_slot(),
-which restarts the device driver and triggers more user-space
-events (for example, calling "ifup eth0" for ethernet cards).
-
-
-Device Shutdown and User-Space Events
--------------------------------------
-This section documents what happens when a pci slot is unconfigured,
-focusing on how the device driver gets shut down, and on how the
-events get delivered to user-space scripts.
-
-Following is an example sequence of events that cause a device driver
-close function to be called during the first phase of an EEH reset.
-The following sequence is an example of the pcnet32 device driver.
-
-    rpa_php_unconfig_pci_adapter (struct slot *)  // in rpaphp_pci.c
-    {
-      calls
-      pci_remove_bus_device (struct pci_dev *) // in /drivers/pci/remove.c
-      {
-        calls
-        pci_destroy_dev (struct pci_dev *)
-        {
-          calls
-          device_unregister (&dev->dev) // in /drivers/base/core.c
-          {
-            calls
-            device_del (struct device *)
-            {
-              calls
-              bus_remove_device() // in /drivers/base/bus.c
-              {
-                calls
-                device_release_driver()
-                {
-                  calls
-                  struct device_driver->remove() which is just
-                  pci_device_remove()  // in /drivers/pci/pci_driver.c
-                  {
-                    calls
-                    struct pci_driver->remove() which is just
-                    pcnet32_remove_one() // in /drivers/net/pcnet32.c
-                    {
-                      calls
-                      unregister_netdev() // in /net/core/dev.c
-                      {
-                        calls
-                        dev_close()  // in /net/core/dev.c
-                        {
-                           calls dev->stop();
-                           which is just pcnet32_close() // in pcnet32.c
-                           {
-                             which does what you wanted
-                             to stop the device
-                           }
-                        }
-                     }
-                   which
-                   frees pcnet32 device driver memory
-                }
-     }}}}}}
-
-
-    in drivers/pci/pci_driver.c,
-    struct device_driver->remove() is just pci_device_remove()
-    which calls struct pci_driver->remove() which is pcnet32_remove_one()
-    which calls unregister_netdev()  (in net/core/dev.c)
-    which calls dev_close()  (in net/core/dev.c)
-    which calls dev->stop() which is pcnet32_close()
-    which then does the appropriate shutdown.
-
----
-Following is the analogous stack trace for events sent to user-space
-when the pci device is unconfigured.
-
-rpa_php_unconfig_pci_adapter() {             // in rpaphp_pci.c
-  calls
-  pci_remove_bus_device (struct pci_dev *) { // in /drivers/pci/remove.c
-    calls
-    pci_destroy_dev (struct pci_dev *) {
-      calls
-      device_unregister (&dev->dev) {        // in /drivers/base/core.c
-        calls
-        device_del(struct device * dev) {    // in /drivers/base/core.c
-          calls
-          kobject_del() {                    //in /libs/kobject.c
-            calls
-            kobject_uevent() {               // in /libs/kobject.c
-              calls
-              kset_uevent() {                // in /lib/kobject.c
-                calls
-                kset->uevent_ops->uevent()   // which is really just
-                a call to
-                dev_uevent() {               // in /drivers/base/core.c
-                  calls
-                  dev->bus->uevent() which is really just a call to
-                  pci_uevent () {            // in drivers/pci/hotplug.c
-                    which prints device name, etc....
-                 }
-               }
-               then kobject_uevent() sends a netlink uevent to userspace
-               --> userspace uevent
-               (during early boot, nobody listens to netlink events and
-               kobject_uevent() executes uevent_helper[], which runs the
-               event process /sbin/hotplug)
-           }
-         }
-         kobject_del() then calls sysfs_remove_dir(), which would
-         trigger any user-space daemon that was watching /sysfs,
-         and notice the delete event.
-
-
-Pro's and Con's of the Current Design
--------------------------------------
-There are several issues with the current EEH software recovery design,
-which may be addressed in future revisions.  But first, note that the
-big plus of the current design is that no changes need to be made to
-individual device drivers, so that the current design throws a wide net.
-The biggest negative of the design is that it potentially disturbs
-network daemons and file systems that didn't need to be disturbed.
-
--- A minor complaint is that resetting the network card causes
-   user-space back-to-back ifdown/ifup burps that potentially disturb
-   network daemons, that didn't need to even know that the pci
-   card was being rebooted.
-
--- A more serious concern is that the same reset, for SCSI devices,
-   causes havoc to mounted file systems.  Scripts cannot post-facto
-   unmount a file system without flushing pending buffers, but this
-   is impossible, because I/O has already been stopped.  Thus,
-   ideally, the reset should happen at or below the block layer,
-   so that the file systems are not disturbed.
-
-   Reiserfs does not tolerate errors returned from the block device.
-   Ext3fs seems to be tolerant, retrying reads/writes until it does
-   succeed. Both have been only lightly tested in this scenario.
-
-   The SCSI-generic subsystem already has built-in code for performing
-   SCSI device resets, SCSI bus resets, and SCSI host-bus-adapter
-   (HBA) resets.  These are cascaded into a chain of attempted
-   resets if a SCSI command fails. These are completely hidden
-   from the block layer.  It would be very natural to add an EEH
-   reset into this chain of events.
-
--- If a SCSI error occurs for the root device, all is lost unless
-   the sysadmin had the foresight to run /bin, /sbin, /etc, /var
-   and so on, out of ramdisk/tmpfs.
-
-
-Conclusions
------------
-There's forward progress ...
-
-
diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst
new file mode 100644
index 000000000000..9ca12830a48e
--- /dev/null
+++ b/Documentation/powerpc/firmware-assisted-dump.rst
@@ -0,0 +1,301 @@
+======================
+Firmware-Assisted Dump
+======================
+
+July 2011
+
+The goal of firmware-assisted dump is to enable the dump of
+a crashed system, and to do so from a fully-reset system, and
+to minimize the total elapsed time until the system is back
+in production use.
+
+- Firmware assisted dump (fadump) infrastructure is intended to replace
+  the existing phyp assisted dump.
+- Fadump uses the same firmware interfaces and memory reservation model
+  as phyp assisted dump.
+- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore
+  in the ELF format in the same way as kdump. This helps us reuse the
+  kdump infrastructure for dump capture and filtering.
+- Unlike phyp dump, userspace tool does not need to refer any sysfs
+  interface while reading /proc/vmcore.
+- Unlike phyp dump, fadump allows user to release all the memory reserved
+  for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem.
+- Once enabled through kernel boot parameter, fadump can be
+  started/stopped through /sys/kernel/fadump_registered interface (see
+  sysfs files section below) and can be easily integrated with kdump
+  service start/stop init scripts.
+
+Comparing with kdump or other strategies, firmware-assisted
+dump offers several strong, practical advantages:
+
+-  Unlike kdump, the system has been reset, and loaded
+   with a fresh copy of the kernel.  In particular,
+   PCI and I/O devices have been reinitialized and are
+   in a clean, consistent state.
+-  Once the dump is copied out, the memory that held the dump
+   is immediately available to the running kernel. And therefore,
+   unlike kdump, fadump doesn't need a 2nd reboot to get back
+   the system to the production configuration.
+
+The above can only be accomplished by coordination with,
+and assistance from the Power firmware. The procedure is
+as follows:
+
+-  The first kernel registers the sections of memory with the
+   Power firmware for dump preservation during OS initialization.
+   These registered sections of memory are reserved by the first
+   kernel during early boot.
+
+-  When a system crashes, the Power firmware will save
+   the low memory (boot memory of size larger of 5% of system RAM
+   or 256MB) of RAM to the previous registered region. It will
+   also save system registers, and hardware PTE's.
+
+   NOTE:
+         The term 'boot memory' means size of the low memory chunk
+         that is required for a kernel to boot successfully when
+         booted with restricted memory. By default, the boot memory
+         size will be the larger of 5% of system RAM or 256MB.
+         Alternatively, user can also specify boot memory size
+         through boot parameter 'crashkernel=' which will override
+         the default calculated size. Use this option if default
+         boot memory size is not sufficient for second kernel to
+         boot successfully. For syntax of crashkernel= parameter,
+         refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is
+         provided in crashkernel= parameter, it will be ignored
+         as fadump uses a predefined offset to reserve memory
+         for boot memory dump preservation in case of a crash.
+
+-  After the low memory (boot memory) area has been saved, the
+   firmware will reset PCI and other hardware state.  It will
+   *not* clear the RAM. It will then launch the bootloader, as
+   normal.
+
+-  The freshly booted kernel will notice that there is a new
+   node (ibm,dump-kernel) in the device tree, indicating that
+   there is crash data available from a previous boot. During
+   the early boot OS will reserve rest of the memory above
+   boot memory size effectively booting with restricted memory
+   size. This will make sure that the second kernel will not
+   touch any of the dump memory area.
+
+-  User-space tools will read /proc/vmcore to obtain the contents
+   of memory, which holds the previous crashed kernel dump in ELF
+   format. The userspace tools may copy this info to disk, or
+   network, nas, san, iscsi, etc. as desired.
+
+-  Once the userspace tool is done saving dump, it will echo
+   '1' to /sys/kernel/fadump_release_mem to release the reserved
+   memory back to general use, except the memory required for
+   next firmware-assisted dump registration.
+
+   e.g.::
+
+     # echo 1 > /sys/kernel/fadump_release_mem
+
+Please note that the firmware-assisted dump feature
+is only available on Power6 and above systems with recent
+firmware versions.
+
+Implementation details:
+-----------------------
+
+During boot, a check is made to see if firmware supports
+this feature on that particular machine. If it does, then
+we check to see if an active dump is waiting for us. If yes
+then everything but boot memory size of RAM is reserved during
+early boot (See Fig. 2). This area is released once we finish
+collecting the dump from user land scripts (e.g. kdump scripts)
+that are run. If there is dump data, then the
+/sys/kernel/fadump_release_mem file is created, and the reserved
+memory is held.
+
+If there is no waiting dump data, then only the memory required
+to hold CPU state, HPTE region, boot memory dump and elfcore
+header, is usually reserved at an offset greater than boot memory
+size (see Fig. 1). This area is *not* released: this region will
+be kept permanently reserved, so that it can act as a receptacle
+for a copy of the boot memory content in addition to CPU state
+and HPTE region, in the case a crash does occur. Since this reserved
+memory area is used only after the system crash, there is no point in
+blocking this significant chunk of memory from production kernel.
+Hence, the implementation uses the Linux kernel's Contiguous Memory
+Allocator (CMA) for memory reservation if CMA is configured for kernel.
+With CMA reservation this memory will be available for applications to
+use it, while kernel is prevented from using it. With this fadump will
+still be able to capture all of the kernel memory and most of the user
+space memory except the user pages that were present in CMA region::
+
+  o Memory Reservation during first kernel
+
+  Low memory                                         Top of memory
+  0      boot memory size                                       |
+  |           |                |<--Reserved dump area -->|      |
+  V           V                |   Permanent Reservation |      V
+  +-----------+----------/ /---+---+----+-----------+----+------+
+  |           |                |CPU|HPTE|  DUMP     |ELF |      |
+  +-----------+----------/ /---+---+----+-----------+----+------+
+        |                                           ^
+        |                                           |
+        \                                           /
+         -------------------------------------------
+          Boot memory content gets transferred to
+          reserved area by firmware at the time of
+          crash
+                   Fig. 1
+
+  o Memory Reservation during second kernel after crash
+
+  Low memory                                        Top of memory
+  0      boot memory size                                       |
+  |           |<------------- Reserved dump area ----------- -->|
+  V           V                                                 V
+  +-----------+----------/ /---+---+----+-----------+----+------+
+  |           |                |CPU|HPTE|  DUMP     |ELF |      |
+  +-----------+----------/ /---+---+----+-----------+----+------+
+        |                                              |
+        V                                              V
+   Used by second                                /proc/vmcore
+   kernel to boot
+                   Fig. 2
+
+Currently the dump will be copied from /proc/vmcore to a
+a new file upon user intervention. The dump data available through
+/proc/vmcore will be in ELF format. Hence the existing kdump
+infrastructure (kdump scripts) to save the dump works fine with
+minor modifications.
+
+The tools to examine the dump will be same as the ones
+used for kdump.
+
+How to enable firmware-assisted dump (fadump):
+----------------------------------------------
+
+1. Set config option CONFIG_FA_DUMP=y and build kernel.
+2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
+   By default, fadump reserved memory will be initialized as CMA area.
+   Alternatively, user can boot linux kernel with 'fadump=nocma' to
+   prevent fadump to use CMA.
+3. Optionally, user can also set 'crashkernel=' kernel cmdline
+   to specify size of the memory to reserve for boot memory dump
+   preservation.
+
+NOTE:
+     1. 'fadump_reserve_mem=' parameter has been deprecated. Instead
+        use 'crashkernel=' to specify size of the memory to reserve
+        for boot memory dump preservation.
+     2. If firmware-assisted dump fails to reserve memory then it
+        will fallback to existing kdump mechanism if 'crashkernel='
+        option is set at kernel cmdline.
+     3. if user wants to capture all of user space memory and ok with
+        reserved memory not available to production system, then
+        'fadump=nocma' kernel parameter can be used to fallback to
+        old behaviour.
+
+Sysfs/debugfs files:
+--------------------
+
+Firmware-assisted dump feature uses sysfs file system to hold
+the control files and debugfs file to display memory reserved region.
+
+Here is the list of files under kernel sysfs:
+
+ /sys/kernel/fadump_enabled
+    This is used to display the fadump status.
+
+    - 0 = fadump is disabled
+    - 1 = fadump is enabled
+
+    This interface can be used by kdump init scripts to identify if
+    fadump is enabled in the kernel and act accordingly.
+
+ /sys/kernel/fadump_registered
+    This is used to display the fadump registration status as well
+    as to control (start/stop) the fadump registration.
+
+    - 0 = fadump is not registered.
+    - 1 = fadump is registered and ready to handle system crash.
+
+    To register fadump echo 1 > /sys/kernel/fadump_registered and
+    echo 0 > /sys/kernel/fadump_registered for un-register and stop the
+    fadump. Once the fadump is un-registered, the system crash will not
+    be handled and vmcore will not be captured. This interface can be
+    easily integrated with kdump service start/stop.
+
+ /sys/kernel/fadump_release_mem
+    This file is available only when fadump is active during
+    second kernel. This is used to release the reserved memory
+    region that are held for saving crash dump. To release the
+    reserved memory echo 1 to it::
+
+	echo 1  > /sys/kernel/fadump_release_mem
+
+    After echo 1, the content of the /sys/kernel/debug/powerpc/fadump_region
+    file will change to reflect the new memory reservations.
+
+    The existing userspace tools (kdump infrastructure) can be easily
+    enhanced to use this interface to release the memory reserved for
+    dump and continue without 2nd reboot.
+
+Here is the list of files under powerpc debugfs:
+(Assuming debugfs is mounted on /sys/kernel/debug directory.)
+
+ /sys/kernel/debug/powerpc/fadump_region
+    This file shows the reserved memory regions if fadump is
+    enabled otherwise this file is empty. The output format
+    is::
+
+      <region>: [<start>-<end>] <reserved-size> bytes, Dumped: <dump-size>
+
+    e.g.
+    Contents when fadump is registered during first kernel::
+
+      # cat /sys/kernel/debug/powerpc/fadump_region
+      CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x0
+      HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x0
+      DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x0
+
+    Contents when fadump is active during second kernel::
+
+      # cat /sys/kernel/debug/powerpc/fadump_region
+      CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x40020
+      HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x1000
+      DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x10000000
+          : [0x00000010000000-0x0000006ffaffff] 0x5ffb0000 bytes, Dumped: 0x5ffb0000
+
+NOTE:
+      Please refer to Documentation/filesystems/debugfs.txt on
+      how to mount the debugfs filesystem.
+
+
+TODO:
+-----
+ - Need to come up with the better approach to find out more
+   accurate boot memory size that is required for a kernel to
+   boot successfully when booted with restricted memory.
+ - The fadump implementation introduces a fadump crash info structure
+   in the scratch area before the ELF core header. The idea of introducing
+   this structure is to pass some important crash info data to the second
+   kernel which will help second kernel to populate ELF core header with
+   correct data before it gets exported through /proc/vmcore. The current
+   design implementation does not address a possibility of introducing
+   additional fields (in future) to this structure without affecting
+   compatibility. Need to come up with the better approach to address this.
+
+   The possible approaches are:
+
+	1. Introduce version field for version tracking, bump up the version
+	whenever a new field is added to the structure in future. The version
+	field can be used to find out what fields are valid for the current
+	version of the structure.
+	2. Reserve the area of predefined size (say PAGE_SIZE) for this
+	structure and have unused area as reserved (initialized to zero)
+	for future field additions.
+
+   The advantage of approach 1 over 2 is we don't need to reserve extra space.
+
+Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+
+This document is based on the original documentation written for phyp
+
+assisted dump by Linas Vepstas and Manish Ahuja.
diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt
deleted file mode 100644
index 18c5feef2577..000000000000
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ /dev/null
@@ -1,292 +0,0 @@
-
-                   Firmware-Assisted Dump
-                   ------------------------
-                       July 2011
-
-The goal of firmware-assisted dump is to enable the dump of
-a crashed system, and to do so from a fully-reset system, and
-to minimize the total elapsed time until the system is back
-in production use.
-
-- Firmware assisted dump (fadump) infrastructure is intended to replace
-  the existing phyp assisted dump.
-- Fadump uses the same firmware interfaces and memory reservation model
-  as phyp assisted dump.
-- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore
-  in the ELF format in the same way as kdump. This helps us reuse the
-  kdump infrastructure for dump capture and filtering.
-- Unlike phyp dump, userspace tool does not need to refer any sysfs
-  interface while reading /proc/vmcore.
-- Unlike phyp dump, fadump allows user to release all the memory reserved
-  for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem.
-- Once enabled through kernel boot parameter, fadump can be
-  started/stopped through /sys/kernel/fadump_registered interface (see
-  sysfs files section below) and can be easily integrated with kdump
-  service start/stop init scripts.
-
-Comparing with kdump or other strategies, firmware-assisted
-dump offers several strong, practical advantages:
-
--- Unlike kdump, the system has been reset, and loaded
-   with a fresh copy of the kernel.  In particular,
-   PCI and I/O devices have been reinitialized and are
-   in a clean, consistent state.
--- Once the dump is copied out, the memory that held the dump
-   is immediately available to the running kernel. And therefore,
-   unlike kdump, fadump doesn't need a 2nd reboot to get back
-   the system to the production configuration.
-
-The above can only be accomplished by coordination with,
-and assistance from the Power firmware. The procedure is
-as follows:
-
--- The first kernel registers the sections of memory with the
-   Power firmware for dump preservation during OS initialization.
-   These registered sections of memory are reserved by the first
-   kernel during early boot.
-
--- When a system crashes, the Power firmware will save
-   the low memory (boot memory of size larger of 5% of system RAM
-   or 256MB) of RAM to the previous registered region. It will
-   also save system registers, and hardware PTE's.
-
-   NOTE: The term 'boot memory' means size of the low memory chunk
-         that is required for a kernel to boot successfully when
-         booted with restricted memory. By default, the boot memory
-         size will be the larger of 5% of system RAM or 256MB.
-         Alternatively, user can also specify boot memory size
-         through boot parameter 'crashkernel=' which will override
-         the default calculated size. Use this option if default
-         boot memory size is not sufficient for second kernel to
-         boot successfully. For syntax of crashkernel= parameter,
-         refer to Documentation/kdump/kdump.txt. If any offset is
-         provided in crashkernel= parameter, it will be ignored
-         as fadump uses a predefined offset to reserve memory
-         for boot memory dump preservation in case of a crash.
-
--- After the low memory (boot memory) area has been saved, the
-   firmware will reset PCI and other hardware state.  It will
-   *not* clear the RAM. It will then launch the bootloader, as
-   normal.
-
--- The freshly booted kernel will notice that there is a new
-   node (ibm,dump-kernel) in the device tree, indicating that
-   there is crash data available from a previous boot. During
-   the early boot OS will reserve rest of the memory above
-   boot memory size effectively booting with restricted memory
-   size. This will make sure that the second kernel will not
-   touch any of the dump memory area.
-
--- User-space tools will read /proc/vmcore to obtain the contents
-   of memory, which holds the previous crashed kernel dump in ELF
-   format. The userspace tools may copy this info to disk, or
-   network, nas, san, iscsi, etc. as desired.
-
--- Once the userspace tool is done saving dump, it will echo
-   '1' to /sys/kernel/fadump_release_mem to release the reserved
-   memory back to general use, except the memory required for
-   next firmware-assisted dump registration.
-
-   e.g.
-     # echo 1 > /sys/kernel/fadump_release_mem
-
-Please note that the firmware-assisted dump feature
-is only available on Power6 and above systems with recent
-firmware versions.
-
-Implementation details:
-----------------------
-
-During boot, a check is made to see if firmware supports
-this feature on that particular machine. If it does, then
-we check to see if an active dump is waiting for us. If yes
-then everything but boot memory size of RAM is reserved during
-early boot (See Fig. 2). This area is released once we finish
-collecting the dump from user land scripts (e.g. kdump scripts)
-that are run. If there is dump data, then the
-/sys/kernel/fadump_release_mem file is created, and the reserved
-memory is held.
-
-If there is no waiting dump data, then only the memory required
-to hold CPU state, HPTE region, boot memory dump and elfcore
-header, is usually reserved at an offset greater than boot memory
-size (see Fig. 1). This area is *not* released: this region will
-be kept permanently reserved, so that it can act as a receptacle
-for a copy of the boot memory content in addition to CPU state
-and HPTE region, in the case a crash does occur. Since this reserved
-memory area is used only after the system crash, there is no point in
-blocking this significant chunk of memory from production kernel.
-Hence, the implementation uses the Linux kernel's Contiguous Memory
-Allocator (CMA) for memory reservation if CMA is configured for kernel.
-With CMA reservation this memory will be available for applications to
-use it, while kernel is prevented from using it. With this fadump will
-still be able to capture all of the kernel memory and most of the user
-space memory except the user pages that were present in CMA region.
-
-  o Memory Reservation during first kernel
-
-  Low memory                                         Top of memory
-  0      boot memory size                                       |
-  |           |                |<--Reserved dump area -->|      |
-  V           V                |   Permanent Reservation |      V
-  +-----------+----------/ /---+---+----+-----------+----+------+
-  |           |                |CPU|HPTE|  DUMP     |ELF |      |
-  +-----------+----------/ /---+---+----+-----------+----+------+
-        |                                           ^
-        |                                           |
-        \                                           /
-         -------------------------------------------
-          Boot memory content gets transferred to
-          reserved area by firmware at the time of
-          crash
-                   Fig. 1
-
-  o Memory Reservation during second kernel after crash
-
-  Low memory                                        Top of memory
-  0      boot memory size                                       |
-  |           |<------------- Reserved dump area ----------- -->|
-  V           V                                                 V
-  +-----------+----------/ /---+---+----+-----------+----+------+
-  |           |                |CPU|HPTE|  DUMP     |ELF |      |
-  +-----------+----------/ /---+---+----+-----------+----+------+
-        |                                              |
-        V                                              V
-   Used by second                                /proc/vmcore
-   kernel to boot
-                   Fig. 2
-
-Currently the dump will be copied from /proc/vmcore to a
-a new file upon user intervention. The dump data available through
-/proc/vmcore will be in ELF format. Hence the existing kdump
-infrastructure (kdump scripts) to save the dump works fine with
-minor modifications.
-
-The tools to examine the dump will be same as the ones
-used for kdump.
-
-How to enable firmware-assisted dump (fadump):
--------------------------------------
-
-1. Set config option CONFIG_FA_DUMP=y and build kernel.
-2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
-   By default, fadump reserved memory will be initialized as CMA area.
-   Alternatively, user can boot linux kernel with 'fadump=nocma' to
-   prevent fadump to use CMA.
-3. Optionally, user can also set 'crashkernel=' kernel cmdline
-   to specify size of the memory to reserve for boot memory dump
-   preservation.
-
-NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead
-         use 'crashkernel=' to specify size of the memory to reserve
-         for boot memory dump preservation.
-      2. If firmware-assisted dump fails to reserve memory then it
-         will fallback to existing kdump mechanism if 'crashkernel='
-         option is set at kernel cmdline.
-      3. if user wants to capture all of user space memory and ok with
-         reserved memory not available to production system, then
-         'fadump=nocma' kernel parameter can be used to fallback to
-         old behaviour.
-
-Sysfs/debugfs files:
-------------
-
-Firmware-assisted dump feature uses sysfs file system to hold
-the control files and debugfs file to display memory reserved region.
-
-Here is the list of files under kernel sysfs:
-
- /sys/kernel/fadump_enabled
-
-    This is used to display the fadump status.
-    0 = fadump is disabled
-    1 = fadump is enabled
-
-    This interface can be used by kdump init scripts to identify if
-    fadump is enabled in the kernel and act accordingly.
-
- /sys/kernel/fadump_registered
-
-    This is used to display the fadump registration status as well
-    as to control (start/stop) the fadump registration.
-    0 = fadump is not registered.
-    1 = fadump is registered and ready to handle system crash.
-
-    To register fadump echo 1 > /sys/kernel/fadump_registered and
-    echo 0 > /sys/kernel/fadump_registered for un-register and stop the
-    fadump. Once the fadump is un-registered, the system crash will not
-    be handled and vmcore will not be captured. This interface can be
-    easily integrated with kdump service start/stop.
-
- /sys/kernel/fadump_release_mem
-
-    This file is available only when fadump is active during
-    second kernel. This is used to release the reserved memory
-    region that are held for saving crash dump. To release the
-    reserved memory echo 1 to it:
-
-    echo 1  > /sys/kernel/fadump_release_mem
-
-    After echo 1, the content of the /sys/kernel/debug/powerpc/fadump_region
-    file will change to reflect the new memory reservations.
-
-    The existing userspace tools (kdump infrastructure) can be easily
-    enhanced to use this interface to release the memory reserved for
-    dump and continue without 2nd reboot.
-
-Here is the list of files under powerpc debugfs:
-(Assuming debugfs is mounted on /sys/kernel/debug directory.)
-
- /sys/kernel/debug/powerpc/fadump_region
-
-    This file shows the reserved memory regions if fadump is
-    enabled otherwise this file is empty. The output format
-    is:
-    <region>: [<start>-<end>] <reserved-size> bytes, Dumped: <dump-size>
-
-    e.g.
-    Contents when fadump is registered during first kernel
-
-    # cat /sys/kernel/debug/powerpc/fadump_region
-    CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x0
-    HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x0
-    DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x0
-
-    Contents when fadump is active during second kernel
-
-    # cat /sys/kernel/debug/powerpc/fadump_region
-    CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x40020
-    HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x1000
-    DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x10000000
-        : [0x00000010000000-0x0000006ffaffff] 0x5ffb0000 bytes, Dumped: 0x5ffb0000
-
-NOTE: Please refer to Documentation/filesystems/debugfs.txt on
-      how to mount the debugfs filesystem.
-
-
-TODO:
------
- o Need to come up with the better approach to find out more
-   accurate boot memory size that is required for a kernel to
-   boot successfully when booted with restricted memory.
- o The fadump implementation introduces a fadump crash info structure
-   in the scratch area before the ELF core header. The idea of introducing
-   this structure is to pass some important crash info data to the second
-   kernel which will help second kernel to populate ELF core header with
-   correct data before it gets exported through /proc/vmcore. The current
-   design implementation does not address a possibility of introducing
-   additional fields (in future) to this structure without affecting
-   compatibility. Need to come up with the better approach to address this.
-   The possible approaches are:
-	1. Introduce version field for version tracking, bump up the version
-	whenever a new field is added to the structure in future. The version
-	field can be used to find out what fields are valid for the current
-	version of the structure.
-	2. Reserve the area of predefined size (say PAGE_SIZE) for this
-	structure and have unused area as reserved (initialized to zero)
-	for future field additions.
-   The advantage of approach 1 over 2 is we don't need to reserve extra space.
----
-Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
-This document is based on the original documentation written for phyp
-assisted dump by Linas Vepstas and Manish Ahuja.
diff --git a/Documentation/powerpc/hvcs.rst b/Documentation/powerpc/hvcs.rst
new file mode 100644
index 000000000000..6808acde672f
--- /dev/null
+++ b/Documentation/powerpc/hvcs.rst
@@ -0,0 +1,581 @@
+===============================================================
+HVCS IBM "Hypervisor Virtual Console Server" Installation Guide
+===============================================================
+
+for Linux Kernel 2.6.4+
+
+Copyright (C) 2004 IBM Corporation
+
+.. ===========================================================================
+.. NOTE:Eight space tabs are the optimum editor setting for reading this file.
+.. ===========================================================================
+
+
+Author(s): Ryan S. Arnold <rsa@us.ibm.com>
+
+Date Created: March, 02, 2004
+Last Changed: August, 24, 2004
+
+.. Table of contents:
+
+	1.  Driver Introduction:
+	2.  System Requirements
+	3.  Build Options:
+		3.1  Built-in:
+		3.2  Module:
+	4.  Installation:
+	5.  Connection:
+	6.  Disconnection:
+	7.  Configuration:
+	8.  Questions & Answers:
+	9.  Reporting Bugs:
+
+1. Driver Introduction:
+=======================
+
+This is the device driver for the IBM Hypervisor Virtual Console Server,
+"hvcs".  The IBM hvcs provides a tty driver interface to allow Linux user
+space applications access to the system consoles of logically partitioned
+operating systems (Linux and AIX) running on the same partitioned Power5
+ppc64 system.  Physical hardware consoles per partition are not practical
+on this hardware so system consoles are accessed by this driver using
+firmware interfaces to virtual terminal devices.
+
+2. System Requirements:
+=======================
+
+This device driver was written using 2.6.4 Linux kernel APIs and will only
+build and run on kernels of this version or later.
+
+This driver was written to operate solely on IBM Power5 ppc64 hardware
+though some care was taken to abstract the architecture dependent firmware
+calls from the driver code.
+
+Sysfs must be mounted on the system so that the user can determine which
+major and minor numbers are associated with each vty-server.  Directions
+for sysfs mounting are outside the scope of this document.
+
+3. Build Options:
+=================
+
+The hvcs driver registers itself as a tty driver.  The tty layer
+dynamically allocates a block of major and minor numbers in a quantity
+requested by the registering driver.  The hvcs driver asks the tty layer
+for 64 of these major/minor numbers by default to use for hvcs device node
+entries.
+
+If the default number of device entries is adequate then this driver can be
+built into the kernel.  If not, the default can be over-ridden by inserting
+the driver as a module with insmod parameters.
+
+3.1 Built-in:
+-------------
+
+The following menuconfig example demonstrates selecting to build this
+driver into the kernel::
+
+	Device Drivers  --->
+		Character devices  --->
+			<*> IBM Hypervisor Virtual Console Server Support
+
+Begin the kernel make process.
+
+3.2 Module:
+-----------
+
+The following menuconfig example demonstrates selecting to build this
+driver as a kernel module::
+
+	Device Drivers  --->
+		Character devices  --->
+			<M> IBM Hypervisor Virtual Console Server Support
+
+The make process will build the following kernel modules:
+
+	- hvcs.ko
+	- hvcserver.ko
+
+To insert the module with the default allocation execute the following
+commands in the order they appear::
+
+	insmod hvcserver.ko
+	insmod hvcs.ko
+
+The hvcserver module contains architecture specific firmware calls and must
+be inserted first, otherwise the hvcs module will not find some of the
+symbols it expects.
+
+To override the default use an insmod parameter as follows (requesting 4
+tty devices as an example)::
+
+	insmod hvcs.ko hvcs_parm_num_devs=4
+
+There is a maximum number of dev entries that can be specified on insmod.
+We think that 1024 is currently a decent maximum number of server adapters
+to allow.  This can always be changed by modifying the constant in the
+source file before building.
+
+NOTE: The length of time it takes to insmod the driver seems to be related
+to the number of tty interfaces the registering driver requests.
+
+In order to remove the driver module execute the following command::
+
+	rmmod hvcs.ko
+
+The recommended method for installing hvcs as a module is to use depmod to
+build a current modules.dep file in /lib/modules/`uname -r` and then
+execute::
+
+	modprobe hvcs hvcs_parm_num_devs=4
+
+The modules.dep file indicates that hvcserver.ko needs to be inserted
+before hvcs.ko and modprobe uses this file to smartly insert the modules in
+the proper order.
+
+The following modprobe command is used to remove hvcs and hvcserver in the
+proper order::
+
+	modprobe -r hvcs
+
+4. Installation:
+================
+
+The tty layer creates sysfs entries which contain the major and minor
+numbers allocated for the hvcs driver.  The following snippet of "tree"
+output of the sysfs directory shows where these numbers are presented::
+
+	sys/
+	|-- *other sysfs base dirs*
+	|
+	|-- class
+	|   |-- *other classes of devices*
+	|   |
+	|   `-- tty
+	|       |-- *other tty devices*
+	|       |
+	|       |-- hvcs0
+	|       |   `-- dev
+	|       |-- hvcs1
+	|       |   `-- dev
+	|       |-- hvcs2
+	|       |   `-- dev
+	|       |-- hvcs3
+	|       |   `-- dev
+	|       |
+	|       |-- *other tty devices*
+	|
+	|-- *other sysfs base dirs*
+
+For the above examples the following output is a result of cat'ing the
+"dev" entry in the hvcs directory::
+
+	Pow5:/sys/class/tty/hvcs0/ # cat dev
+	254:0
+
+	Pow5:/sys/class/tty/hvcs1/ # cat dev
+	254:1
+
+	Pow5:/sys/class/tty/hvcs2/ # cat dev
+	254:2
+
+	Pow5:/sys/class/tty/hvcs3/ # cat dev
+	254:3
+
+The output from reading the "dev" attribute is the char device major and
+minor numbers that the tty layer has allocated for this driver's use.  Most
+systems running hvcs will already have the device entries created or udev
+will do it automatically.
+
+Given the example output above, to manually create a /dev/hvcs* node entry
+mknod can be used as follows::
+
+	mknod /dev/hvcs0 c 254 0
+	mknod /dev/hvcs1 c 254 1
+	mknod /dev/hvcs2 c 254 2
+	mknod /dev/hvcs3 c 254 3
+
+Using mknod to manually create the device entries makes these device nodes
+persistent.  Once created they will exist prior to the driver insmod.
+
+Attempting to connect an application to /dev/hvcs* prior to insertion of
+the hvcs module will result in an error message similar to the following::
+
+	"/dev/hvcs*: No such device".
+
+NOTE: Just because there is a device node present doesn't mean that there
+is a vty-server device configured for that node.
+
+5. Connection
+=============
+
+Since this driver controls devices that provide a tty interface a user can
+interact with the device node entries using any standard tty-interactive
+method (e.g. "cat", "dd", "echo").  The intent of this driver however, is
+to provide real time console interaction with a Linux partition's console,
+which requires the use of applications that provide bi-directional,
+interactive I/O with a tty device.
+
+Applications (e.g. "minicom" and "screen") that act as terminal emulators
+or perform terminal type control sequence conversion on the data being
+passed through them are NOT acceptable for providing interactive console
+I/O.  These programs often emulate antiquated terminal types (vt100 and
+ANSI) and expect inbound data to take the form of one of these supported
+terminal types but they either do not convert, or do not _adequately_
+convert, outbound data into the terminal type of the terminal which invoked
+them (though screen makes an attempt and can apparently be configured with
+much termcap wrestling.)
+
+For this reason kermit and cu are two of the recommended applications for
+interacting with a Linux console via an hvcs device.  These programs simply
+act as a conduit for data transfer to and from the tty device.  They do not
+require inbound data to take the form of a particular terminal type, nor do
+they cook outbound data to a particular terminal type.
+
+In order to ensure proper functioning of console applications one must make
+sure that once connected to a /dev/hvcs console that the console's $TERM
+env variable is set to the exact terminal type of the terminal emulator
+used to launch the interactive I/O application.  If one is using xterm and
+kermit to connect to /dev/hvcs0 when the console prompt becomes available
+one should "export TERM=xterm" on the console.  This tells ncurses
+applications that are invoked from the console that they should output
+control sequences that xterm can understand.
+
+As a precautionary measure an hvcs user should always "exit" from their
+session before disconnecting an application such as kermit from the device
+node.  If this is not done, the next user to connect to the console will
+continue using the previous user's logged in session which includes
+using the $TERM variable that the previous user supplied.
+
+Hotplug add and remove of vty-server adapters affects which /dev/hvcs* node
+is used to connect to each vty-server adapter.  In order to determine which
+vty-server adapter is associated with which /dev/hvcs* node a special sysfs
+attribute has been added to each vty-server sysfs entry.  This entry is
+called "index" and showing it reveals an integer that refers to the
+/dev/hvcs* entry to use to connect to that device.  For instance cating the
+index attribute of vty-server adapter 30000004 shows the following::
+
+	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat index
+	2
+
+This index of '2' means that in order to connect to vty-server adapter
+30000004 the user should interact with /dev/hvcs2.
+
+It should be noted that due to the system hotplug I/O capabilities of a
+system the /dev/hvcs* entry that interacts with a particular vty-server
+adapter is not guaranteed to remain the same across system reboots.  Look
+in the Q & A section for more on this issue.
+
+6. Disconnection
+================
+
+As a security feature to prevent the delivery of stale data to an
+unintended target the Power5 system firmware disables the fetching of data
+and discards that data when a connection between a vty-server and a vty has
+been severed.  As an example, when a vty-server is immediately disconnected
+from a vty following output of data to the vty the vty adapter may not have
+enough time between when it received the data interrupt and when the
+connection was severed to fetch the data from firmware before the fetch is
+disabled by firmware.
+
+When hvcs is being used to serve consoles this behavior is not a huge issue
+because the adapter stays connected for large amounts of time following
+almost all data writes.  When hvcs is being used as a tty conduit to tunnel
+data between two partitions [see Q & A below] this is a huge problem
+because the standard Linux behavior when cat'ing or dd'ing data to a device
+is to open the tty, send the data, and then close the tty.  If this driver
+manually terminated vty-server connections on tty close this would close
+the vty-server and vty connection before the target vty has had a chance to
+fetch the data.
+
+Additionally, disconnecting a vty-server and vty only on module removal or
+adapter removal is impractical because other vty-servers in other
+partitions may require the usage of the target vty at any time.
+
+Due to this behavioral restriction disconnection of vty-servers from the
+connected vty is a manual procedure using a write to a sysfs attribute
+outlined below, on the other hand the initial vty-server connection to a
+vty is established automatically by this driver.  Manual vty-server
+connection is never required.
+
+In order to terminate the connection between a vty-server and vty the
+"vterm_state" sysfs attribute within each vty-server's sysfs entry is used.
+Reading this attribute reveals the current connection state of the
+vty-server adapter.  A zero means that the vty-server is not connected to a
+vty.  A one indicates that a connection is active.
+
+Writing a '0' (zero) to the vterm_state attribute will disconnect the VTERM
+connection between the vty-server and target vty ONLY if the vterm_state
+previously read '1'.  The write directive is ignored if the vterm_state
+read '0' or if any value other than '0' was written to the vterm_state
+attribute.  The following example will show the method used for verifying
+the vty-server connection status and disconnecting a vty-server connection::
+
+	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat vterm_state
+	1
+
+	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # echo 0 > vterm_state
+
+	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat vterm_state
+	0
+
+All vty-server connections are automatically terminated when the device is
+hotplug removed and when the module is removed.
+
+7. Configuration
+================
+
+Each vty-server has a sysfs entry in the /sys/devices/vio directory, which
+is symlinked in several other sysfs tree directories, notably under the
+hvcs driver entry, which looks like the following example::
+
+	Pow5:/sys/bus/vio/drivers/hvcs # ls
+	.  ..  30000003  30000004  rescan
+
+By design, firmware notifies the hvcs driver of vty-server lifetimes and
+partner vty removals but not the addition of partner vtys.  Since an HMC
+Super Admin can add partner info dynamically we have provided the hvcs
+driver sysfs directory with the "rescan" update attribute which will query
+firmware and update the partner info for all the vty-servers that this
+driver manages.  Writing a '1' to the attribute triggers the update.  An
+explicit example follows:
+
+	Pow5:/sys/bus/vio/drivers/hvcs # echo 1 > rescan
+
+Reading the attribute will indicate a state of '1' or '0'.  A one indicates
+that an update is in process.  A zero indicates that an update has
+completed or was never executed.
+
+Vty-server entries in this directory are a 32 bit partition unique unit
+address that is created by firmware.  An example vty-server sysfs entry
+looks like the following::
+
+	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # ls
+	.   current_vty   devspec       name          partner_vtys
+	..  index         partner_clcs  vterm_state
+
+Each entry is provided, by default with a "name" attribute.  Reading the
+"name" attribute will reveal the device type as shown in the following
+example::
+
+	Pow5:/sys/bus/vio/drivers/hvcs/30000003 # cat name
+	vty-server
+
+Each entry is also provided, by default, with a "devspec" attribute which
+reveals the full device specification when read, as shown in the following
+example::
+
+	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat devspec
+	/vdevice/vty-server@30000004
+
+Each vty-server sysfs dir is provided with two read-only attributes that
+provide lists of easily parsed partner vty data: "partner_vtys" and
+"partner_clcs"::
+
+	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat partner_vtys
+	30000000
+	30000001
+	30000002
+	30000000
+	30000000
+
+	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat partner_clcs
+	U5112.428.103048A-V3-C0
+	U5112.428.103048A-V3-C2
+	U5112.428.103048A-V3-C3
+	U5112.428.103048A-V4-C0
+	U5112.428.103048A-V5-C0
+
+Reading partner_vtys returns a list of partner vtys.  Vty unit address
+numbering is only per-partition-unique so entries will frequently repeat.
+
+Reading partner_clcs returns a list of "converged location codes" which are
+composed of a system serial number followed by "-V*", where the '*' is the
+target partition number, and "-C*", where the '*' is the slot of the
+adapter.  The first vty partner corresponds to the first clc item, the
+second vty partner to the second clc item, etc.
+
+A vty-server can only be connected to a single vty at a time.  The entry,
+"current_vty" prints the clc of the currently selected partner vty when
+read.
+
+The current_vty can be changed by writing a valid partner clc to the entry
+as in the following example::
+
+	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # echo U5112.428.10304
+	8A-V4-C0 > current_vty
+
+Changing the current_vty when a vty-server is already connected to a vty
+does not affect the current connection.  The change takes effect when the
+currently open connection is freed.
+
+Information on the "vterm_state" attribute was covered earlier on the
+chapter entitled "disconnection".
+
+8. Questions & Answers:
+=======================
+
+Q: What are the security concerns involving hvcs?
+
+A: There are three main security concerns:
+
+	1. The creator of the /dev/hvcs* nodes has the ability to restrict
+	the access of the device entries to certain users or groups.  It
+	may be best to create a special hvcs group privilege for providing
+	access to system consoles.
+
+	2. To provide network security when grabbing the console it is
+	suggested that the user connect to the console hosting partition
+	using a secure method, such as SSH or sit at a hardware console.
+
+	3. Make sure to exit the user session when done with a console or
+	the next vty-server connection (which may be from another
+	partition) will experience the previously logged in session.
+
+---------------------------------------------------------------------------
+
+Q: How do I multiplex a console that I grab through hvcs so that other
+people can see it:
+
+A: You can use "screen" to directly connect to the /dev/hvcs* device and
+setup a session on your machine with the console group privileges.  As
+pointed out earlier by default screen doesn't provide the termcap settings
+for most terminal emulators to provide adequate character conversion from
+term type "screen" to others.  This means that curses based programs may
+not display properly in screen sessions.
+
+---------------------------------------------------------------------------
+
+Q: Why are the colors all messed up?
+Q: Why are the control characters acting strange or not working?
+Q: Why is the console output all strange and unintelligible?
+
+A: Please see the preceding section on "Connection" for a discussion of how
+applications can affect the display of character control sequences.
+Additionally, just because you logged into the console using and xterm
+doesn't mean someone else didn't log into the console with the HMC console
+(vt320) before you and leave the session logged in.  The best thing to do
+is to export TERM to the terminal type of your terminal emulator when you
+get the console.  Additionally make sure to "exit" the console before you
+disconnect from the console.  This will ensure that the next user gets
+their own TERM type set when they login.
+
+---------------------------------------------------------------------------
+
+Q: When I try to CONNECT kermit to an hvcs device I get:
+"Sorry, can't open connection: /dev/hvcs*"What is happening?
+
+A: Some other Power5 console mechanism has a connection to the vty and
+isn't giving it up.  You can try to force disconnect the consoles from the
+HMC by right clicking on the partition and then selecting "close terminal".
+Otherwise you have to hunt down the people who have console authority.  It
+is possible that you already have the console open using another kermit
+session and just forgot about it.  Please review the console options for
+Power5 systems to determine the many ways a system console can be held.
+
+OR
+
+A: Another user may not have a connectivity method currently attached to a
+/dev/hvcs device but the vterm_state may reveal that they still have the
+vty-server connection established.  They need to free this using the method
+outlined in the section on "Disconnection" in order for others to connect
+to the target vty.
+
+OR
+
+A: The user profile you are using to execute kermit probably doesn't have
+permissions to use the /dev/hvcs* device.
+
+OR
+
+A: You probably haven't inserted the hvcs.ko module yet but the /dev/hvcs*
+entry still exists (on systems without udev).
+
+OR
+
+A: There is not a corresponding vty-server device that maps to an existing
+/dev/hvcs* entry.
+
+---------------------------------------------------------------------------
+
+Q: When I try to CONNECT kermit to an hvcs device I get:
+"Sorry, write access to UUCP lockfile directory denied."
+
+A: The /dev/hvcs* entry you have specified doesn't exist where you said it
+does?  Maybe you haven't inserted the module (on systems with udev).
+
+---------------------------------------------------------------------------
+
+Q: If I already have one Linux partition installed can I use hvcs on said
+partition to provide the console for the install of a second Linux
+partition?
+
+A: Yes granted that your are connected to the /dev/hvcs* device using
+kermit or cu or some other program that doesn't provide terminal emulation.
+
+---------------------------------------------------------------------------
+
+Q: Can I connect to more than one partition's console at a time using this
+driver?
+
+A: Yes.  Of course this means that there must be more than one vty-server
+configured for this partition and each must point to a disconnected vty.
+
+---------------------------------------------------------------------------
+
+Q: Does the hvcs driver support dynamic (hotplug) addition of devices?
+
+A: Yes, if you have dlpar and hotplug enabled for your system and it has
+been built into the kernel the hvcs drivers is configured to dynamically
+handle additions of new devices and removals of unused devices.
+
+---------------------------------------------------------------------------
+
+Q: For some reason /dev/hvcs* doesn't map to the same vty-server adapter
+after a reboot.  What happened?
+
+A: Assignment of vty-server adapters to /dev/hvcs* entries is always done
+in the order that the adapters are exposed.  Due to hotplug capabilities of
+this driver assignment of hotplug added vty-servers may be in a different
+order than how they would be exposed on module load.  Rebooting or
+reloading the module after dynamic addition may result in the /dev/hvcs*
+and vty-server coupling changing if a vty-server adapter was added in a
+slot between two other vty-server adapters.  Refer to the section above
+on how to determine which vty-server goes with which /dev/hvcs* node.
+Hint; look at the sysfs "index" attribute for the vty-server.
+
+---------------------------------------------------------------------------
+
+Q: Can I use /dev/hvcs* as a conduit to another partition and use a tty
+device on that partition as the other end of the pipe?
+
+A: Yes, on Power5 platforms the hvc_console driver provides a tty interface
+for extra /dev/hvc* devices (where /dev/hvc0 is most likely the console).
+In order to get a tty conduit working between the two partitions the HMC
+Super Admin must create an additional "serial server" for the target
+partition with the HMC gui which will show up as /dev/hvc* when the target
+partition is rebooted.
+
+The HMC Super Admin then creates an additional "serial client" for the
+current partition and points this at the target partition's newly created
+"serial server" adapter (remember the slot).  This shows up as an
+additional /dev/hvcs* device.
+
+Now a program on the target system can be configured to read or write to
+/dev/hvc* and another program on the current partition can be configured to
+read or write to /dev/hvcs*.  Now you have a tty conduit between two
+partitions.
+
+---------------------------------------------------------------------------
+
+9. Reporting Bugs:
+==================
+
+The proper channel for reporting bugs is either through the Linux OS
+distribution company that provided your OS or by posting issues to the
+PowerPC development mailing list at:
+
+linuxppc-dev@lists.ozlabs.org
+
+This request is to provide a documented and searchable public exchange
+of the problems and solutions surrounding this driver for the benefit of
+all users.
diff --git a/Documentation/powerpc/hvcs.txt b/Documentation/powerpc/hvcs.txt
deleted file mode 100644
index a730ca5a07f8..000000000000
--- a/Documentation/powerpc/hvcs.txt
+++ /dev/null
@@ -1,567 +0,0 @@
-===========================================================================
-				   HVCS
-	IBM "Hypervisor Virtual Console Server" Installation Guide
-			  for Linux Kernel 2.6.4+
-		    Copyright (C) 2004 IBM Corporation
-
-===========================================================================
-NOTE:Eight space tabs are the optimum editor setting for reading this file.
-===========================================================================
-
-	       Author(s) :  Ryan S. Arnold <rsa@us.ibm.com>
-		       Date Created: March, 02, 2004
-		       Last Changed: August, 24, 2004
-
----------------------------------------------------------------------------
-Table of contents:
-
-	1.  Driver Introduction:
-	2.  System Requirements
-	3.  Build Options:
-		3.1  Built-in:
-		3.2  Module:
-	4.  Installation:
-	5.  Connection:
-	6.  Disconnection:
-	7.  Configuration:
-	8.  Questions & Answers:
-	9.  Reporting Bugs:
-
----------------------------------------------------------------------------
-1. Driver Introduction:
-
-This is the device driver for the IBM Hypervisor Virtual Console Server,
-"hvcs".  The IBM hvcs provides a tty driver interface to allow Linux user
-space applications access to the system consoles of logically partitioned
-operating systems (Linux and AIX) running on the same partitioned Power5
-ppc64 system.  Physical hardware consoles per partition are not practical
-on this hardware so system consoles are accessed by this driver using
-firmware interfaces to virtual terminal devices.
-
----------------------------------------------------------------------------
-2. System Requirements:
-
-This device driver was written using 2.6.4 Linux kernel APIs and will only
-build and run on kernels of this version or later.
-
-This driver was written to operate solely on IBM Power5 ppc64 hardware
-though some care was taken to abstract the architecture dependent firmware
-calls from the driver code.
-
-Sysfs must be mounted on the system so that the user can determine which
-major and minor numbers are associated with each vty-server.  Directions
-for sysfs mounting are outside the scope of this document.
-
----------------------------------------------------------------------------
-3. Build Options:
-
-The hvcs driver registers itself as a tty driver.  The tty layer
-dynamically allocates a block of major and minor numbers in a quantity
-requested by the registering driver.  The hvcs driver asks the tty layer
-for 64 of these major/minor numbers by default to use for hvcs device node
-entries.
-
-If the default number of device entries is adequate then this driver can be
-built into the kernel.  If not, the default can be over-ridden by inserting
-the driver as a module with insmod parameters.
-
----------------------------------------------------------------------------
-3.1 Built-in:
-
-The following menuconfig example demonstrates selecting to build this
-driver into the kernel.
-
-	Device Drivers  --->
-		Character devices  --->
-			<*> IBM Hypervisor Virtual Console Server Support
-
-Begin the kernel make process.
-
----------------------------------------------------------------------------
-3.2 Module:
-
-The following menuconfig example demonstrates selecting to build this
-driver as a kernel module.
-
-	Device Drivers  --->
-		Character devices  --->
-			<M> IBM Hypervisor Virtual Console Server Support
-
-The make process will build the following kernel modules:
-
-	hvcs.ko
-	hvcserver.ko
-
-To insert the module with the default allocation execute the following
-commands in the order they appear:
-
-	insmod hvcserver.ko
-	insmod hvcs.ko
-
-The hvcserver module contains architecture specific firmware calls and must
-be inserted first, otherwise the hvcs module will not find some of the
-symbols it expects.
-
-To override the default use an insmod parameter as follows (requesting 4
-tty devices as an example):
-
-	insmod hvcs.ko hvcs_parm_num_devs=4
-
-There is a maximum number of dev entries that can be specified on insmod.
-We think that 1024 is currently a decent maximum number of server adapters
-to allow.  This can always be changed by modifying the constant in the
-source file before building.
-
-NOTE: The length of time it takes to insmod the driver seems to be related
-to the number of tty interfaces the registering driver requests.
-
-In order to remove the driver module execute the following command:
-
-	rmmod hvcs.ko
-
-The recommended method for installing hvcs as a module is to use depmod to
-build a current modules.dep file in /lib/modules/`uname -r` and then
-execute:
-
-modprobe hvcs hvcs_parm_num_devs=4
-
-The modules.dep file indicates that hvcserver.ko needs to be inserted
-before hvcs.ko and modprobe uses this file to smartly insert the modules in
-the proper order.
-
-The following modprobe command is used to remove hvcs and hvcserver in the
-proper order:
-
-modprobe -r hvcs
-
----------------------------------------------------------------------------
-4. Installation:
-
-The tty layer creates sysfs entries which contain the major and minor
-numbers allocated for the hvcs driver.  The following snippet of "tree"
-output of the sysfs directory shows where these numbers are presented:
-
-	sys/
-	|-- *other sysfs base dirs*
-	|
-	|-- class
-	|   |-- *other classes of devices*
-	|   |
-	|   `-- tty
-	|       |-- *other tty devices*
-	|       |
-	|       |-- hvcs0
-	|       |   `-- dev
-	|       |-- hvcs1
-	|       |   `-- dev
-	|       |-- hvcs2
-	|       |   `-- dev
-	|       |-- hvcs3
-	|       |   `-- dev
-	|       |
-	|       |-- *other tty devices*
-	|
-	|-- *other sysfs base dirs*
-
-For the above examples the following output is a result of cat'ing the
-"dev" entry in the hvcs directory:
-
-	Pow5:/sys/class/tty/hvcs0/ # cat dev
-	254:0
-
-	Pow5:/sys/class/tty/hvcs1/ # cat dev
-	254:1
-
-	Pow5:/sys/class/tty/hvcs2/ # cat dev
-	254:2
-
-	Pow5:/sys/class/tty/hvcs3/ # cat dev
-	254:3
-
-The output from reading the "dev" attribute is the char device major and
-minor numbers that the tty layer has allocated for this driver's use.  Most
-systems running hvcs will already have the device entries created or udev
-will do it automatically.
-
-Given the example output above, to manually create a /dev/hvcs* node entry
-mknod can be used as follows:
-
-	mknod /dev/hvcs0 c 254 0
-	mknod /dev/hvcs1 c 254 1
-	mknod /dev/hvcs2 c 254 2
-	mknod /dev/hvcs3 c 254 3
-
-Using mknod to manually create the device entries makes these device nodes
-persistent.  Once created they will exist prior to the driver insmod.
-
-Attempting to connect an application to /dev/hvcs* prior to insertion of
-the hvcs module will result in an error message similar to the following:
-
-	"/dev/hvcs*: No such device".
-
-NOTE: Just because there is a device node present doesn't mean that there
-is a vty-server device configured for that node.
-
----------------------------------------------------------------------------
-5. Connection
-
-Since this driver controls devices that provide a tty interface a user can
-interact with the device node entries using any standard tty-interactive
-method (e.g. "cat", "dd", "echo").  The intent of this driver however, is
-to provide real time console interaction with a Linux partition's console,
-which requires the use of applications that provide bi-directional,
-interactive I/O with a tty device.
-
-Applications (e.g. "minicom" and "screen") that act as terminal emulators
-or perform terminal type control sequence conversion on the data being
-passed through them are NOT acceptable for providing interactive console
-I/O.  These programs often emulate antiquated terminal types (vt100 and
-ANSI) and expect inbound data to take the form of one of these supported
-terminal types but they either do not convert, or do not _adequately_
-convert, outbound data into the terminal type of the terminal which invoked
-them (though screen makes an attempt and can apparently be configured with
-much termcap wrestling.)
-
-For this reason kermit and cu are two of the recommended applications for
-interacting with a Linux console via an hvcs device.  These programs simply
-act as a conduit for data transfer to and from the tty device.  They do not
-require inbound data to take the form of a particular terminal type, nor do
-they cook outbound data to a particular terminal type.
-
-In order to ensure proper functioning of console applications one must make
-sure that once connected to a /dev/hvcs console that the console's $TERM
-env variable is set to the exact terminal type of the terminal emulator
-used to launch the interactive I/O application.  If one is using xterm and
-kermit to connect to /dev/hvcs0 when the console prompt becomes available
-one should "export TERM=xterm" on the console.  This tells ncurses
-applications that are invoked from the console that they should output
-control sequences that xterm can understand.
-
-As a precautionary measure an hvcs user should always "exit" from their
-session before disconnecting an application such as kermit from the device
-node.  If this is not done, the next user to connect to the console will
-continue using the previous user's logged in session which includes
-using the $TERM variable that the previous user supplied.
-
-Hotplug add and remove of vty-server adapters affects which /dev/hvcs* node
-is used to connect to each vty-server adapter.  In order to determine which
-vty-server adapter is associated with which /dev/hvcs* node a special sysfs
-attribute has been added to each vty-server sysfs entry.  This entry is
-called "index" and showing it reveals an integer that refers to the
-/dev/hvcs* entry to use to connect to that device.  For instance cating the
-index attribute of vty-server adapter 30000004 shows the following.
-
-	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat index
-	2
-
-This index of '2' means that in order to connect to vty-server adapter
-30000004 the user should interact with /dev/hvcs2.
-
-It should be noted that due to the system hotplug I/O capabilities of a
-system the /dev/hvcs* entry that interacts with a particular vty-server
-adapter is not guaranteed to remain the same across system reboots.  Look
-in the Q & A section for more on this issue.
-
----------------------------------------------------------------------------
-6. Disconnection
-
-As a security feature to prevent the delivery of stale data to an
-unintended target the Power5 system firmware disables the fetching of data
-and discards that data when a connection between a vty-server and a vty has
-been severed.  As an example, when a vty-server is immediately disconnected
-from a vty following output of data to the vty the vty adapter may not have
-enough time between when it received the data interrupt and when the
-connection was severed to fetch the data from firmware before the fetch is
-disabled by firmware.
-
-When hvcs is being used to serve consoles this behavior is not a huge issue
-because the adapter stays connected for large amounts of time following
-almost all data writes.  When hvcs is being used as a tty conduit to tunnel
-data between two partitions [see Q & A below] this is a huge problem
-because the standard Linux behavior when cat'ing or dd'ing data to a device
-is to open the tty, send the data, and then close the tty.  If this driver
-manually terminated vty-server connections on tty close this would close
-the vty-server and vty connection before the target vty has had a chance to
-fetch the data.
-
-Additionally, disconnecting a vty-server and vty only on module removal or
-adapter removal is impractical because other vty-servers in other
-partitions may require the usage of the target vty at any time.
-
-Due to this behavioral restriction disconnection of vty-servers from the
-connected vty is a manual procedure using a write to a sysfs attribute
-outlined below, on the other hand the initial vty-server connection to a
-vty is established automatically by this driver.  Manual vty-server
-connection is never required.
-
-In order to terminate the connection between a vty-server and vty the
-"vterm_state" sysfs attribute within each vty-server's sysfs entry is used.
-Reading this attribute reveals the current connection state of the
-vty-server adapter.  A zero means that the vty-server is not connected to a
-vty.  A one indicates that a connection is active.
-
-Writing a '0' (zero) to the vterm_state attribute will disconnect the VTERM
-connection between the vty-server and target vty ONLY if the vterm_state
-previously read '1'.  The write directive is ignored if the vterm_state
-read '0' or if any value other than '0' was written to the vterm_state
-attribute.  The following example will show the method used for verifying
-the vty-server connection status and disconnecting a vty-server connection.
-
-	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat vterm_state
-	1
-
-	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # echo 0 > vterm_state
-
-	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat vterm_state
-	0
-
-All vty-server connections are automatically terminated when the device is
-hotplug removed and when the module is removed.
-
----------------------------------------------------------------------------
-7. Configuration
-
-Each vty-server has a sysfs entry in the /sys/devices/vio directory, which
-is symlinked in several other sysfs tree directories, notably under the
-hvcs driver entry, which looks like the following example:
-
-	Pow5:/sys/bus/vio/drivers/hvcs # ls
-	.  ..  30000003  30000004  rescan
-
-By design, firmware notifies the hvcs driver of vty-server lifetimes and
-partner vty removals but not the addition of partner vtys.  Since an HMC
-Super Admin can add partner info dynamically we have provided the hvcs
-driver sysfs directory with the "rescan" update attribute which will query
-firmware and update the partner info for all the vty-servers that this
-driver manages.  Writing a '1' to the attribute triggers the update.  An
-explicit example follows:
-
-	Pow5:/sys/bus/vio/drivers/hvcs # echo 1 > rescan
-
-Reading the attribute will indicate a state of '1' or '0'.  A one indicates
-that an update is in process.  A zero indicates that an update has
-completed or was never executed.
-
-Vty-server entries in this directory are a 32 bit partition unique unit
-address that is created by firmware.  An example vty-server sysfs entry
-looks like the following:
-
-	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # ls
-	.   current_vty   devspec       name          partner_vtys
-	..  index         partner_clcs  vterm_state
-
-Each entry is provided, by default with a "name" attribute.  Reading the
-"name" attribute will reveal the device type as shown in the following
-example:
-
-	Pow5:/sys/bus/vio/drivers/hvcs/30000003 # cat name
-	vty-server
-
-Each entry is also provided, by default, with a "devspec" attribute which
-reveals the full device specification when read, as shown in the following
-example:
-
-	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat devspec
-	/vdevice/vty-server@30000004
-
-Each vty-server sysfs dir is provided with two read-only attributes that
-provide lists of easily parsed partner vty data: "partner_vtys" and
-"partner_clcs".
-
-	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat partner_vtys
-	30000000
-	30000001
-	30000002
-	30000000
-	30000000
-
-	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat partner_clcs
-	U5112.428.103048A-V3-C0
-	U5112.428.103048A-V3-C2
-	U5112.428.103048A-V3-C3
-	U5112.428.103048A-V4-C0
-	U5112.428.103048A-V5-C0
-
-Reading partner_vtys returns a list of partner vtys.  Vty unit address
-numbering is only per-partition-unique so entries will frequently repeat.
-
-Reading partner_clcs returns a list of "converged location codes" which are
-composed of a system serial number followed by "-V*", where the '*' is the
-target partition number, and "-C*", where the '*' is the slot of the
-adapter.  The first vty partner corresponds to the first clc item, the
-second vty partner to the second clc item, etc.
-
-A vty-server can only be connected to a single vty at a time.  The entry,
-"current_vty" prints the clc of the currently selected partner vty when
-read.
-
-The current_vty can be changed by writing a valid partner clc to the entry
-as in the following example:
-
-	Pow5:/sys/bus/vio/drivers/hvcs/30000004 # echo U5112.428.10304
-	8A-V4-C0 > current_vty
-
-Changing the current_vty when a vty-server is already connected to a vty
-does not affect the current connection.  The change takes effect when the
-currently open connection is freed.
-
-Information on the "vterm_state" attribute was covered earlier on the
-chapter entitled "disconnection".
-
----------------------------------------------------------------------------
-8. Questions & Answers:
-===========================================================================
-Q: What are the security concerns involving hvcs?
-
-A: There are three main security concerns:
-
-	1. The creator of the /dev/hvcs* nodes has the ability to restrict
-	the access of the device entries to certain users or groups.  It
-	may be best to create a special hvcs group privilege for providing
-	access to system consoles.
-
-	2. To provide network security when grabbing the console it is
-	suggested that the user connect to the console hosting partition
-	using a secure method, such as SSH or sit at a hardware console.
-
-	3. Make sure to exit the user session when done with a console or
-	the next vty-server connection (which may be from another
-	partition) will experience the previously logged in session.
-
----------------------------------------------------------------------------
-Q: How do I multiplex a console that I grab through hvcs so that other
-people can see it:
-
-A: You can use "screen" to directly connect to the /dev/hvcs* device and
-setup a session on your machine with the console group privileges.  As
-pointed out earlier by default screen doesn't provide the termcap settings
-for most terminal emulators to provide adequate character conversion from
-term type "screen" to others.  This means that curses based programs may
-not display properly in screen sessions.
-
----------------------------------------------------------------------------
-Q: Why are the colors all messed up?
-Q: Why are the control characters acting strange or not working?
-Q: Why is the console output all strange and unintelligible?
-
-A: Please see the preceding section on "Connection" for a discussion of how
-applications can affect the display of character control sequences.
-Additionally, just because you logged into the console using and xterm
-doesn't mean someone else didn't log into the console with the HMC console
-(vt320) before you and leave the session logged in.  The best thing to do
-is to export TERM to the terminal type of your terminal emulator when you
-get the console.  Additionally make sure to "exit" the console before you
-disconnect from the console.  This will ensure that the next user gets
-their own TERM type set when they login.
-
----------------------------------------------------------------------------
-Q: When I try to CONNECT kermit to an hvcs device I get:
-"Sorry, can't open connection: /dev/hvcs*"What is happening?
-
-A: Some other Power5 console mechanism has a connection to the vty and
-isn't giving it up.  You can try to force disconnect the consoles from the
-HMC by right clicking on the partition and then selecting "close terminal".
-Otherwise you have to hunt down the people who have console authority.  It
-is possible that you already have the console open using another kermit
-session and just forgot about it.  Please review the console options for
-Power5 systems to determine the many ways a system console can be held.
-
-OR
-
-A: Another user may not have a connectivity method currently attached to a
-/dev/hvcs device but the vterm_state may reveal that they still have the
-vty-server connection established.  They need to free this using the method
-outlined in the section on "Disconnection" in order for others to connect
-to the target vty.
-
-OR
-
-A: The user profile you are using to execute kermit probably doesn't have
-permissions to use the /dev/hvcs* device.
-
-OR
-
-A: You probably haven't inserted the hvcs.ko module yet but the /dev/hvcs*
-entry still exists (on systems without udev).
-
-OR
-
-A: There is not a corresponding vty-server device that maps to an existing
-/dev/hvcs* entry.
-
----------------------------------------------------------------------------
-Q: When I try to CONNECT kermit to an hvcs device I get:
-"Sorry, write access to UUCP lockfile directory denied."
-
-A: The /dev/hvcs* entry you have specified doesn't exist where you said it
-does?  Maybe you haven't inserted the module (on systems with udev).
-
----------------------------------------------------------------------------
-Q: If I already have one Linux partition installed can I use hvcs on said
-partition to provide the console for the install of a second Linux
-partition?
-
-A: Yes granted that your are connected to the /dev/hvcs* device using
-kermit or cu or some other program that doesn't provide terminal emulation.
-
----------------------------------------------------------------------------
-Q: Can I connect to more than one partition's console at a time using this
-driver?
-
-A: Yes.  Of course this means that there must be more than one vty-server
-configured for this partition and each must point to a disconnected vty.
-
----------------------------------------------------------------------------
-Q: Does the hvcs driver support dynamic (hotplug) addition of devices?
-
-A: Yes, if you have dlpar and hotplug enabled for your system and it has
-been built into the kernel the hvcs drivers is configured to dynamically
-handle additions of new devices and removals of unused devices.
-
----------------------------------------------------------------------------
-Q: For some reason /dev/hvcs* doesn't map to the same vty-server adapter
-after a reboot.  What happened?
-
-A: Assignment of vty-server adapters to /dev/hvcs* entries is always done
-in the order that the adapters are exposed.  Due to hotplug capabilities of
-this driver assignment of hotplug added vty-servers may be in a different
-order than how they would be exposed on module load.  Rebooting or
-reloading the module after dynamic addition may result in the /dev/hvcs*
-and vty-server coupling changing if a vty-server adapter was added in a
-slot between two other vty-server adapters.  Refer to the section above
-on how to determine which vty-server goes with which /dev/hvcs* node.
-Hint; look at the sysfs "index" attribute for the vty-server.
-
----------------------------------------------------------------------------
-Q: Can I use /dev/hvcs* as a conduit to another partition and use a tty
-device on that partition as the other end of the pipe?
-
-A: Yes, on Power5 platforms the hvc_console driver provides a tty interface
-for extra /dev/hvc* devices (where /dev/hvc0 is most likely the console).
-In order to get a tty conduit working between the two partitions the HMC
-Super Admin must create an additional "serial server" for the target
-partition with the HMC gui which will show up as /dev/hvc* when the target
-partition is rebooted.
-
-The HMC Super Admin then creates an additional "serial client" for the
-current partition and points this at the target partition's newly created
-"serial server" adapter (remember the slot).  This shows up as an
-additional /dev/hvcs* device.
-
-Now a program on the target system can be configured to read or write to
-/dev/hvc* and another program on the current partition can be configured to
-read or write to /dev/hvcs*.  Now you have a tty conduit between two
-partitions.
-
----------------------------------------------------------------------------
-9. Reporting Bugs:
-
-The proper channel for reporting bugs is either through the Linux OS
-distribution company that provided your OS or by posting issues to the
-PowerPC development mailing list at:
-
-linuxppc-dev@lists.ozlabs.org
-
-This request is to provide a documented and searchable public exchange
-of the problems and solutions surrounding this driver for the benefit of
-all users.
diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst
new file mode 100644
index 000000000000..549b1cdd77ae
--- /dev/null
+++ b/Documentation/powerpc/index.rst
@@ -0,0 +1,34 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======
+powerpc
+=======
+
+.. toctree::
+    :maxdepth: 1
+
+    bootwrapper
+    cpu_families
+    cpu_features
+    cxl
+    cxlflash
+    dawr-power9
+    dscr
+    eeh-pci-error-recovery
+    firmware-assisted-dump
+    hvcs
+    isa-versions
+    mpc52xx
+    pci_iov_resource_on_powernv
+    pmu-ebb
+    ptrace
+    qe_firmware
+    syscall64-abi
+    transactional_memory
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/powerpc/isa-versions.rst b/Documentation/powerpc/isa-versions.rst
index 812e20cc898c..a363d8c1603c 100644
--- a/Documentation/powerpc/isa-versions.rst
+++ b/Documentation/powerpc/isa-versions.rst
@@ -1,11 +1,12 @@
+==========================
 CPU to ISA Version Mapping
 ==========================
 
 Mapping of some CPU versions to relevant ISA versions.
 
-========= ====================
+========= ====================================================================
 CPU       Architecture version
-========= ====================
+========= ====================================================================
 Power9    Power ISA v3.0B
 Power8    Power ISA v2.07
 Power7    Power ISA v2.06
@@ -22,7 +23,7 @@ PPC970    - PowerPC User Instruction Set Architecture Book I v2.01
           - PowerPC Virtual Environment Architecture Book II v2.01
           - PowerPC Operating Environment Architecture Book III v2.01
           - Plus Altivec/VMX ~= 2.03
-========= ====================
+========= ====================================================================
 
 
 Key Features
@@ -58,9 +59,9 @@ Power5     No
 PPC970     No
 ========== ====
 
-========== ====================
+========== ====================================
 CPU        Transactional Memory
-========== ====================
+========== ====================================
 Power9     Yes (* see transactional_memory.txt)
 Power8     Yes
 Power7     No
@@ -71,4 +72,4 @@ Power5++   No
 Power5+    No
 Power5     No
 PPC970     No
-========== ====================
+========== ====================================
diff --git a/Documentation/powerpc/mpc52xx.rst b/Documentation/powerpc/mpc52xx.rst
new file mode 100644
index 000000000000..8676ac63e077
--- /dev/null
+++ b/Documentation/powerpc/mpc52xx.rst
@@ -0,0 +1,43 @@
+=============================
+Linux 2.6.x on MPC52xx family
+=============================
+
+For the latest info, go to http://www.246tNt.com/mpc52xx/
+
+To compile/use :
+
+  - U-Boot::
+
+     # <edit Makefile to set ARCH=ppc & CROSS_COMPILE=... ( also EXTRAVERSION
+        if you wish to ).
+     # make lite5200_defconfig
+     # make uImage
+
+     then, on U-boot:
+     => tftpboot 200000 uImage
+     => tftpboot 400000 pRamdisk
+     => bootm 200000 400000
+
+  - DBug::
+
+     # <edit Makefile to set ARCH=ppc & CROSS_COMPILE=... ( also EXTRAVERSION
+        if you wish to ).
+     # make lite5200_defconfig
+     # cp your_initrd.gz arch/ppc/boot/images/ramdisk.image.gz
+     # make zImage.initrd
+     # make
+
+     then in DBug:
+     DBug> dn -i zImage.initrd.lite5200
+
+
+Some remarks:
+
+ - The port is named mpc52xxx, and config options are PPC_MPC52xx. The MGT5100
+   is not supported, and I'm not sure anyone is interesting in working on it
+   so. I didn't took 5xxx because there's apparently a lot of 5xxx that have
+   nothing to do with the MPC5200. I also included the 'MPC' for the same
+   reason.
+ - Of course, I inspired myself from the 2.4 port. If you think I forgot to
+   mention you/your company in the copyright of some code, I'll correct it
+   ASAP.
diff --git a/Documentation/powerpc/mpc52xx.txt b/Documentation/powerpc/mpc52xx.txt
deleted file mode 100644
index 0d540a31ea1a..000000000000
--- a/Documentation/powerpc/mpc52xx.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-Linux 2.6.x on MPC52xx family
------------------------------
-
-For the latest info, go to http://www.246tNt.com/mpc52xx/
-
-To compile/use :
-
-  - U-Boot:
-     # <edit Makefile to set ARCH=ppc & CROSS_COMPILE=... ( also EXTRAVERSION
-        if you wish to ).
-     # make lite5200_defconfig
-     # make uImage
-
-     then, on U-boot:
-     => tftpboot 200000 uImage
-     => tftpboot 400000 pRamdisk
-     => bootm 200000 400000
-
-  - DBug:
-     # <edit Makefile to set ARCH=ppc & CROSS_COMPILE=... ( also EXTRAVERSION
-        if you wish to ).
-     # make lite5200_defconfig
-     # cp your_initrd.gz arch/ppc/boot/images/ramdisk.image.gz
-     # make zImage.initrd
-     # make
-
-     then in DBug:
-     DBug> dn -i zImage.initrd.lite5200
-
-
-Some remarks :
- - The port is named mpc52xxx, and config options are PPC_MPC52xx. The MGT5100
-   is not supported, and I'm not sure anyone is interesting in working on it
-   so. I didn't took 5xxx because there's apparently a lot of 5xxx that have
-   nothing to do with the MPC5200. I also included the 'MPC' for the same
-   reason.
- - Of course, I inspired myself from the 2.4 port. If you think I forgot to
-   mention you/your company in the copyright of some code, I'll correct it
-   ASAP.
diff --git a/Documentation/powerpc/pci_iov_resource_on_powernv.rst b/Documentation/powerpc/pci_iov_resource_on_powernv.rst
new file mode 100644
index 000000000000..f5a5793e1613
--- /dev/null
+++ b/Documentation/powerpc/pci_iov_resource_on_powernv.rst
@@ -0,0 +1,312 @@
+===================================================
+PCI Express I/O Virtualization Resource on Powerenv
+===================================================
+
+Wei Yang <weiyang@linux.vnet.ibm.com>
+
+Benjamin Herrenschmidt <benh@au1.ibm.com>
+
+Bjorn Helgaas <bhelgaas@google.com>
+
+26 Aug 2014
+
+This document describes the requirement from hardware for PCI MMIO resource
+sizing and assignment on PowerKVM and how generic PCI code handles this
+requirement. The first two sections describe the concepts of Partitionable
+Endpoints and the implementation on P8 (IODA2). The next two sections talks
+about considerations on enabling SRIOV on IODA2.
+
+1. Introduction to Partitionable Endpoints
+==========================================
+
+A Partitionable Endpoint (PE) is a way to group the various resources
+associated with a device or a set of devices to provide isolation between
+partitions (i.e., filtering of DMA, MSIs etc.) and to provide a mechanism
+to freeze a device that is causing errors in order to limit the possibility
+of propagation of bad data.
+
+There is thus, in HW, a table of PE states that contains a pair of "frozen"
+state bits (one for MMIO and one for DMA, they get set together but can be
+cleared independently) for each PE.
+
+When a PE is frozen, all stores in any direction are dropped and all loads
+return all 1's value. MSIs are also blocked. There's a bit more state that
+captures things like the details of the error that caused the freeze etc., but
+that's not critical.
+
+The interesting part is how the various PCIe transactions (MMIO, DMA, ...)
+are matched to their corresponding PEs.
+
+The following section provides a rough description of what we have on P8
+(IODA2).  Keep in mind that this is all per PHB (PCI host bridge).  Each PHB
+is a completely separate HW entity that replicates the entire logic, so has
+its own set of PEs, etc.
+
+2. Implementation of Partitionable Endpoints on P8 (IODA2)
+==========================================================
+
+P8 supports up to 256 Partitionable Endpoints per PHB.
+
+  * Inbound
+
+    For DMA, MSIs and inbound PCIe error messages, we have a table (in
+    memory but accessed in HW by the chip) that provides a direct
+    correspondence between a PCIe RID (bus/dev/fn) with a PE number.
+    We call this the RTT.
+
+    - For DMA we then provide an entire address space for each PE that can
+      contain two "windows", depending on the value of PCI address bit 59.
+      Each window can be configured to be remapped via a "TCE table" (IOMMU
+      translation table), which has various configurable characteristics
+      not described here.
+
+    - For MSIs, we have two windows in the address space (one at the top of
+      the 32-bit space and one much higher) which, via a combination of the
+      address and MSI value, will result in one of the 2048 interrupts per
+      bridge being triggered.  There's a PE# in the interrupt controller
+      descriptor table as well which is compared with the PE# obtained from
+      the RTT to "authorize" the device to emit that specific interrupt.
+
+    - Error messages just use the RTT.
+
+  * Outbound.  That's where the tricky part is.
+
+    Like other PCI host bridges, the Power8 IODA2 PHB supports "windows"
+    from the CPU address space to the PCI address space.  There is one M32
+    window and sixteen M64 windows.  They have different characteristics.
+    First what they have in common: they forward a configurable portion of
+    the CPU address space to the PCIe bus and must be naturally aligned
+    power of two in size.  The rest is different:
+
+    - The M32 window:
+
+      * Is limited to 4GB in size.
+
+      * Drops the top bits of the address (above the size) and replaces
+	them with a configurable value.  This is typically used to generate
+	32-bit PCIe accesses.  We configure that window at boot from FW and
+	don't touch it from Linux; it's usually set to forward a 2GB
+	portion of address space from the CPU to PCIe
+	0x8000_0000..0xffff_ffff.  (Note: The top 64KB are actually
+	reserved for MSIs but this is not a problem at this point; we just
+	need to ensure Linux doesn't assign anything there, the M32 logic
+	ignores that however and will forward in that space if we try).
+
+      * It is divided into 256 segments of equal size.  A table in the chip
+	maps each segment to a PE#.  That allows portions of the MMIO space
+	to be assigned to PEs on a segment granularity.  For a 2GB window,
+	the segment granularity is 2GB/256 = 8MB.
+
+    Now, this is the "main" window we use in Linux today (excluding
+    SR-IOV).  We basically use the trick of forcing the bridge MMIO windows
+    onto a segment alignment/granularity so that the space behind a bridge
+    can be assigned to a PE.
+
+    Ideally we would like to be able to have individual functions in PEs
+    but that would mean using a completely different address allocation
+    scheme where individual function BARs can be "grouped" to fit in one or
+    more segments.
+
+    - The M64 windows:
+
+      * Must be at least 256MB in size.
+
+      * Do not translate addresses (the address on PCIe is the same as the
+	address on the PowerBus).  There is a way to also set the top 14
+	bits which are not conveyed by PowerBus but we don't use this.
+
+      * Can be configured to be segmented.  When not segmented, we can
+	specify the PE# for the entire window.  When segmented, a window
+	has 256 segments; however, there is no table for mapping a segment
+	to a PE#.  The segment number *is* the PE#.
+
+      * Support overlaps.  If an address is covered by multiple windows,
+	there's a defined ordering for which window applies.
+
+    We have code (fairly new compared to the M32 stuff) that exploits that
+    for large BARs in 64-bit space:
+
+    We configure an M64 window to cover the entire region of address space
+    that has been assigned by FW for the PHB (about 64GB, ignore the space
+    for the M32, it comes out of a different "reserve").  We configure it
+    as segmented.
+
+    Then we do the same thing as with M32, using the bridge alignment
+    trick, to match to those giant segments.
+
+    Since we cannot remap, we have two additional constraints:
+
+    - We do the PE# allocation *after* the 64-bit space has been assigned
+      because the addresses we use directly determine the PE#.  We then
+      update the M32 PE# for the devices that use both 32-bit and 64-bit
+      spaces or assign the remaining PE# to 32-bit only devices.
+
+    - We cannot "group" segments in HW, so if a device ends up using more
+      than one segment, we end up with more than one PE#.  There is a HW
+      mechanism to make the freeze state cascade to "companion" PEs but
+      that only works for PCIe error messages (typically used so that if
+      you freeze a switch, it freezes all its children).  So we do it in
+      SW.  We lose a bit of effectiveness of EEH in that case, but that's
+      the best we found.  So when any of the PEs freezes, we freeze the
+      other ones for that "domain".  We thus introduce the concept of
+      "master PE" which is the one used for DMA, MSIs, etc., and "secondary
+      PEs" that are used for the remaining M64 segments.
+
+    We would like to investigate using additional M64 windows in "single
+    PE" mode to overlay over specific BARs to work around some of that, for
+    example for devices with very large BARs, e.g., GPUs.  It would make
+    sense, but we haven't done it yet.
+
+3. Considerations for SR-IOV on PowerKVM
+========================================
+
+  * SR-IOV Background
+
+    The PCIe SR-IOV feature allows a single Physical Function (PF) to
+    support several Virtual Functions (VFs).  Registers in the PF's SR-IOV
+    Capability control the number of VFs and whether they are enabled.
+
+    When VFs are enabled, they appear in Configuration Space like normal
+    PCI devices, but the BARs in VF config space headers are unusual.  For
+    a non-VF device, software uses BARs in the config space header to
+    discover the BAR sizes and assign addresses for them.  For VF devices,
+    software uses VF BAR registers in the *PF* SR-IOV Capability to
+    discover sizes and assign addresses.  The BARs in the VF's config space
+    header are read-only zeros.
+
+    When a VF BAR in the PF SR-IOV Capability is programmed, it sets the
+    base address for all the corresponding VF(n) BARs.  For example, if the
+    PF SR-IOV Capability is programmed to enable eight VFs, and it has a
+    1MB VF BAR0, the address in that VF BAR sets the base of an 8MB region.
+    This region is divided into eight contiguous 1MB regions, each of which
+    is a BAR0 for one of the VFs.  Note that even though the VF BAR
+    describes an 8MB region, the alignment requirement is for a single VF,
+    i.e., 1MB in this example.
+
+  There are several strategies for isolating VFs in PEs:
+
+  - M32 window: There's one M32 window, and it is split into 256
+    equally-sized segments.  The finest granularity possible is a 256MB
+    window with 1MB segments.  VF BARs that are 1MB or larger could be
+    mapped to separate PEs in this window.  Each segment can be
+    individually mapped to a PE via the lookup table, so this is quite
+    flexible, but it works best when all the VF BARs are the same size.  If
+    they are different sizes, the entire window has to be small enough that
+    the segment size matches the smallest VF BAR, which means larger VF
+    BARs span several segments.
+
+  - Non-segmented M64 window: A non-segmented M64 window is mapped entirely
+    to a single PE, so it could only isolate one VF.
+
+  - Single segmented M64 windows: A segmented M64 window could be used just
+    like the M32 window, but the segments can't be individually mapped to
+    PEs (the segment number is the PE#), so there isn't as much
+    flexibility.  A VF with multiple BARs would have to be in a "domain" of
+    multiple PEs, which is not as well isolated as a single PE.
+
+  - Multiple segmented M64 windows: As usual, each window is split into 256
+    equally-sized segments, and the segment number is the PE#.  But if we
+    use several M64 windows, they can be set to different base addresses
+    and different segment sizes.  If we have VFs that each have a 1MB BAR
+    and a 32MB BAR, we could use one M64 window to assign 1MB segments and
+    another M64 window to assign 32MB segments.
+
+  Finally, the plan to use M64 windows for SR-IOV, which will be described
+  more in the next two sections.  For a given VF BAR, we need to
+  effectively reserve the entire 256 segments (256 * VF BAR size) and
+  position the VF BAR to start at the beginning of a free range of
+  segments/PEs inside that M64 window.
+
+  The goal is of course to be able to give a separate PE for each VF.
+
+  The IODA2 platform has 16 M64 windows, which are used to map MMIO
+  range to PE#.  Each M64 window defines one MMIO range and this range is
+  divided into 256 segments, with each segment corresponding to one PE.
+
+  We decide to leverage this M64 window to map VFs to individual PEs, since
+  SR-IOV VF BARs are all the same size.
+
+  But doing so introduces another problem: total_VFs is usually smaller
+  than the number of M64 window segments, so if we map one VF BAR directly
+  to one M64 window, some part of the M64 window will map to another
+  device's MMIO range.
+
+  IODA supports 256 PEs, so segmented windows contain 256 segments, so if
+  total_VFs is less than 256, we have the situation in Figure 1.0, where
+  segments [total_VFs, 255] of the M64 window may map to some MMIO range on
+  other devices::
+
+     0      1                     total_VFs - 1
+     +------+------+-     -+------+------+
+     |      |      |  ...  |      |      |
+     +------+------+-     -+------+------+
+
+                           VF(n) BAR space
+
+     0      1                     total_VFs - 1                255
+     +------+------+-     -+------+------+-      -+------+------+
+     |      |      |  ...  |      |      |   ...  |      |      |
+     +------+------+-     -+------+------+-      -+------+------+
+
+                           M64 window
+
+		Figure 1.0 Direct map VF(n) BAR space
+
+  Our current solution is to allocate 256 segments even if the VF(n) BAR
+  space doesn't need that much, as shown in Figure 1.1::
+
+     0      1                     total_VFs - 1                255
+     +------+------+-     -+------+------+-      -+------+------+
+     |      |      |  ...  |      |      |   ...  |      |      |
+     +------+------+-     -+------+------+-      -+------+------+
+
+                           VF(n) BAR space + extra
+
+     0      1                     total_VFs - 1                255
+     +------+------+-     -+------+------+-      -+------+------+
+     |      |      |  ...  |      |      |   ...  |      |      |
+     +------+------+-     -+------+------+-      -+------+------+
+
+			   M64 window
+
+		Figure 1.1 Map VF(n) BAR space + extra
+
+  Allocating the extra space ensures that the entire M64 window will be
+  assigned to this one SR-IOV device and none of the space will be
+  available for other devices.  Note that this only expands the space
+  reserved in software; there are still only total_VFs VFs, and they only
+  respond to segments [0, total_VFs - 1].  There's nothing in hardware that
+  responds to segments [total_VFs, 255].
+
+4. Implications for the Generic PCI Code
+========================================
+
+The PCIe SR-IOV spec requires that the base of the VF(n) BAR space be
+aligned to the size of an individual VF BAR.
+
+In IODA2, the MMIO address determines the PE#.  If the address is in an M32
+window, we can set the PE# by updating the table that translates segments
+to PE#s.  Similarly, if the address is in an unsegmented M64 window, we can
+set the PE# for the window.  But if it's in a segmented M64 window, the
+segment number is the PE#.
+
+Therefore, the only way to control the PE# for a VF is to change the base
+of the VF(n) BAR space in the VF BAR.  If the PCI core allocates the exact
+amount of space required for the VF(n) BAR space, the VF BAR value is fixed
+and cannot be changed.
+
+On the other hand, if the PCI core allocates additional space, the VF BAR
+value can be changed as long as the entire VF(n) BAR space remains inside
+the space allocated by the core.
+
+Ideally the segment size will be the same as an individual VF BAR size.
+Then each VF will be in its own PE.  The VF BARs (and therefore the PE#s)
+are contiguous.  If VF0 is in PE(x), then VF(n) is in PE(x+n).  If we
+allocate 256 segments, there are (256 - numVFs) choices for the PE# of VF0.
+
+If the segment size is smaller than the VF BAR size, it will take several
+segments to cover a VF BAR, and a VF will be in several PEs.  This is
+possible, but the isolation isn't as good, and it reduces the number of PE#
+choices because instead of consuming only numVFs segments, the VF(n) BAR
+space will consume (numVFs * n) segments.  That means there aren't as many
+available segments for adjusting base of the VF(n) BAR space.
diff --git a/Documentation/powerpc/pci_iov_resource_on_powernv.txt b/Documentation/powerpc/pci_iov_resource_on_powernv.txt
deleted file mode 100644
index b55c5cd83f8d..000000000000
--- a/Documentation/powerpc/pci_iov_resource_on_powernv.txt
+++ /dev/null
@@ -1,301 +0,0 @@
-Wei Yang <weiyang@linux.vnet.ibm.com>
-Benjamin Herrenschmidt <benh@au1.ibm.com>
-Bjorn Helgaas <bhelgaas@google.com>
-26 Aug 2014
-
-This document describes the requirement from hardware for PCI MMIO resource
-sizing and assignment on PowerKVM and how generic PCI code handles this
-requirement. The first two sections describe the concepts of Partitionable
-Endpoints and the implementation on P8 (IODA2). The next two sections talks
-about considerations on enabling SRIOV on IODA2.
-
-1. Introduction to Partitionable Endpoints
-
-A Partitionable Endpoint (PE) is a way to group the various resources
-associated with a device or a set of devices to provide isolation between
-partitions (i.e., filtering of DMA, MSIs etc.) and to provide a mechanism
-to freeze a device that is causing errors in order to limit the possibility
-of propagation of bad data.
-
-There is thus, in HW, a table of PE states that contains a pair of "frozen"
-state bits (one for MMIO and one for DMA, they get set together but can be
-cleared independently) for each PE.
-
-When a PE is frozen, all stores in any direction are dropped and all loads
-return all 1's value. MSIs are also blocked. There's a bit more state that
-captures things like the details of the error that caused the freeze etc., but
-that's not critical.
-
-The interesting part is how the various PCIe transactions (MMIO, DMA, ...)
-are matched to their corresponding PEs.
-
-The following section provides a rough description of what we have on P8
-(IODA2).  Keep in mind that this is all per PHB (PCI host bridge).  Each PHB
-is a completely separate HW entity that replicates the entire logic, so has
-its own set of PEs, etc.
-
-2. Implementation of Partitionable Endpoints on P8 (IODA2)
-
-P8 supports up to 256 Partitionable Endpoints per PHB.
-
-  * Inbound
-
-    For DMA, MSIs and inbound PCIe error messages, we have a table (in
-    memory but accessed in HW by the chip) that provides a direct
-    correspondence between a PCIe RID (bus/dev/fn) with a PE number.
-    We call this the RTT.
-
-    - For DMA we then provide an entire address space for each PE that can
-      contain two "windows", depending on the value of PCI address bit 59.
-      Each window can be configured to be remapped via a "TCE table" (IOMMU
-      translation table), which has various configurable characteristics
-      not described here.
-
-    - For MSIs, we have two windows in the address space (one at the top of
-      the 32-bit space and one much higher) which, via a combination of the
-      address and MSI value, will result in one of the 2048 interrupts per
-      bridge being triggered.  There's a PE# in the interrupt controller
-      descriptor table as well which is compared with the PE# obtained from
-      the RTT to "authorize" the device to emit that specific interrupt.
-
-    - Error messages just use the RTT.
-
-  * Outbound.  That's where the tricky part is.
-
-    Like other PCI host bridges, the Power8 IODA2 PHB supports "windows"
-    from the CPU address space to the PCI address space.  There is one M32
-    window and sixteen M64 windows.  They have different characteristics.
-    First what they have in common: they forward a configurable portion of
-    the CPU address space to the PCIe bus and must be naturally aligned
-    power of two in size.  The rest is different:
-
-    - The M32 window:
-
-      * Is limited to 4GB in size.
-
-      * Drops the top bits of the address (above the size) and replaces
-	them with a configurable value.  This is typically used to generate
-	32-bit PCIe accesses.  We configure that window at boot from FW and
-	don't touch it from Linux; it's usually set to forward a 2GB
-	portion of address space from the CPU to PCIe
-	0x8000_0000..0xffff_ffff.  (Note: The top 64KB are actually
-	reserved for MSIs but this is not a problem at this point; we just
-	need to ensure Linux doesn't assign anything there, the M32 logic
-	ignores that however and will forward in that space if we try).
-
-      * It is divided into 256 segments of equal size.  A table in the chip
-	maps each segment to a PE#.  That allows portions of the MMIO space
-	to be assigned to PEs on a segment granularity.  For a 2GB window,
-	the segment granularity is 2GB/256 = 8MB.
-
-    Now, this is the "main" window we use in Linux today (excluding
-    SR-IOV).  We basically use the trick of forcing the bridge MMIO windows
-    onto a segment alignment/granularity so that the space behind a bridge
-    can be assigned to a PE.
-
-    Ideally we would like to be able to have individual functions in PEs
-    but that would mean using a completely different address allocation
-    scheme where individual function BARs can be "grouped" to fit in one or
-    more segments.
-
-    - The M64 windows:
-
-      * Must be at least 256MB in size.
-
-      * Do not translate addresses (the address on PCIe is the same as the
-	address on the PowerBus).  There is a way to also set the top 14
-	bits which are not conveyed by PowerBus but we don't use this.
-
-      * Can be configured to be segmented.  When not segmented, we can
-	specify the PE# for the entire window.  When segmented, a window
-	has 256 segments; however, there is no table for mapping a segment
-	to a PE#.  The segment number *is* the PE#.
-
-      * Support overlaps.  If an address is covered by multiple windows,
-	there's a defined ordering for which window applies.
-
-    We have code (fairly new compared to the M32 stuff) that exploits that
-    for large BARs in 64-bit space:
-
-    We configure an M64 window to cover the entire region of address space
-    that has been assigned by FW for the PHB (about 64GB, ignore the space
-    for the M32, it comes out of a different "reserve").  We configure it
-    as segmented.
-
-    Then we do the same thing as with M32, using the bridge alignment
-    trick, to match to those giant segments.
-
-    Since we cannot remap, we have two additional constraints:
-
-    - We do the PE# allocation *after* the 64-bit space has been assigned
-      because the addresses we use directly determine the PE#.  We then
-      update the M32 PE# for the devices that use both 32-bit and 64-bit
-      spaces or assign the remaining PE# to 32-bit only devices.
-
-    - We cannot "group" segments in HW, so if a device ends up using more
-      than one segment, we end up with more than one PE#.  There is a HW
-      mechanism to make the freeze state cascade to "companion" PEs but
-      that only works for PCIe error messages (typically used so that if
-      you freeze a switch, it freezes all its children).  So we do it in
-      SW.  We lose a bit of effectiveness of EEH in that case, but that's
-      the best we found.  So when any of the PEs freezes, we freeze the
-      other ones for that "domain".  We thus introduce the concept of
-      "master PE" which is the one used for DMA, MSIs, etc., and "secondary
-      PEs" that are used for the remaining M64 segments.
-
-    We would like to investigate using additional M64 windows in "single
-    PE" mode to overlay over specific BARs to work around some of that, for
-    example for devices with very large BARs, e.g., GPUs.  It would make
-    sense, but we haven't done it yet.
-
-3. Considerations for SR-IOV on PowerKVM
-
-  * SR-IOV Background
-
-    The PCIe SR-IOV feature allows a single Physical Function (PF) to
-    support several Virtual Functions (VFs).  Registers in the PF's SR-IOV
-    Capability control the number of VFs and whether they are enabled.
-
-    When VFs are enabled, they appear in Configuration Space like normal
-    PCI devices, but the BARs in VF config space headers are unusual.  For
-    a non-VF device, software uses BARs in the config space header to
-    discover the BAR sizes and assign addresses for them.  For VF devices,
-    software uses VF BAR registers in the *PF* SR-IOV Capability to
-    discover sizes and assign addresses.  The BARs in the VF's config space
-    header are read-only zeros.
-
-    When a VF BAR in the PF SR-IOV Capability is programmed, it sets the
-    base address for all the corresponding VF(n) BARs.  For example, if the
-    PF SR-IOV Capability is programmed to enable eight VFs, and it has a
-    1MB VF BAR0, the address in that VF BAR sets the base of an 8MB region.
-    This region is divided into eight contiguous 1MB regions, each of which
-    is a BAR0 for one of the VFs.  Note that even though the VF BAR
-    describes an 8MB region, the alignment requirement is for a single VF,
-    i.e., 1MB in this example.
-
-  There are several strategies for isolating VFs in PEs:
-
-  - M32 window: There's one M32 window, and it is split into 256
-    equally-sized segments.  The finest granularity possible is a 256MB
-    window with 1MB segments.  VF BARs that are 1MB or larger could be
-    mapped to separate PEs in this window.  Each segment can be
-    individually mapped to a PE via the lookup table, so this is quite
-    flexible, but it works best when all the VF BARs are the same size.  If
-    they are different sizes, the entire window has to be small enough that
-    the segment size matches the smallest VF BAR, which means larger VF
-    BARs span several segments.
-
-  - Non-segmented M64 window: A non-segmented M64 window is mapped entirely
-    to a single PE, so it could only isolate one VF.
-
-  - Single segmented M64 windows: A segmented M64 window could be used just
-    like the M32 window, but the segments can't be individually mapped to
-    PEs (the segment number is the PE#), so there isn't as much
-    flexibility.  A VF with multiple BARs would have to be in a "domain" of
-    multiple PEs, which is not as well isolated as a single PE.
-
-  - Multiple segmented M64 windows: As usual, each window is split into 256
-    equally-sized segments, and the segment number is the PE#.  But if we
-    use several M64 windows, they can be set to different base addresses
-    and different segment sizes.  If we have VFs that each have a 1MB BAR
-    and a 32MB BAR, we could use one M64 window to assign 1MB segments and
-    another M64 window to assign 32MB segments.
-
-  Finally, the plan to use M64 windows for SR-IOV, which will be described
-  more in the next two sections.  For a given VF BAR, we need to
-  effectively reserve the entire 256 segments (256 * VF BAR size) and
-  position the VF BAR to start at the beginning of a free range of
-  segments/PEs inside that M64 window.
-
-  The goal is of course to be able to give a separate PE for each VF.
-
-  The IODA2 platform has 16 M64 windows, which are used to map MMIO
-  range to PE#.  Each M64 window defines one MMIO range and this range is
-  divided into 256 segments, with each segment corresponding to one PE.
-
-  We decide to leverage this M64 window to map VFs to individual PEs, since
-  SR-IOV VF BARs are all the same size.
-
-  But doing so introduces another problem: total_VFs is usually smaller
-  than the number of M64 window segments, so if we map one VF BAR directly
-  to one M64 window, some part of the M64 window will map to another
-  device's MMIO range.
-
-  IODA supports 256 PEs, so segmented windows contain 256 segments, so if
-  total_VFs is less than 256, we have the situation in Figure 1.0, where
-  segments [total_VFs, 255] of the M64 window may map to some MMIO range on
-  other devices:
-
-     0      1                     total_VFs - 1
-     +------+------+-     -+------+------+
-     |      |      |  ...  |      |      |
-     +------+------+-     -+------+------+
-
-                           VF(n) BAR space
-
-     0      1                     total_VFs - 1                255
-     +------+------+-     -+------+------+-      -+------+------+
-     |      |      |  ...  |      |      |   ...  |      |      |
-     +------+------+-     -+------+------+-      -+------+------+
-
-                           M64 window
-
-		Figure 1.0 Direct map VF(n) BAR space
-
-  Our current solution is to allocate 256 segments even if the VF(n) BAR
-  space doesn't need that much, as shown in Figure 1.1:
-
-     0      1                     total_VFs - 1                255
-     +------+------+-     -+------+------+-      -+------+------+
-     |      |      |  ...  |      |      |   ...  |      |      |
-     +------+------+-     -+------+------+-      -+------+------+
-
-                           VF(n) BAR space + extra
-
-     0      1                     total_VFs - 1                255
-     +------+------+-     -+------+------+-      -+------+------+
-     |      |      |  ...  |      |      |   ...  |      |      |
-     +------+------+-     -+------+------+-      -+------+------+
-
-			   M64 window
-
-		Figure 1.1 Map VF(n) BAR space + extra
-
-  Allocating the extra space ensures that the entire M64 window will be
-  assigned to this one SR-IOV device and none of the space will be
-  available for other devices.  Note that this only expands the space
-  reserved in software; there are still only total_VFs VFs, and they only
-  respond to segments [0, total_VFs - 1].  There's nothing in hardware that
-  responds to segments [total_VFs, 255].
-
-4. Implications for the Generic PCI Code
-
-The PCIe SR-IOV spec requires that the base of the VF(n) BAR space be
-aligned to the size of an individual VF BAR.
-
-In IODA2, the MMIO address determines the PE#.  If the address is in an M32
-window, we can set the PE# by updating the table that translates segments
-to PE#s.  Similarly, if the address is in an unsegmented M64 window, we can
-set the PE# for the window.  But if it's in a segmented M64 window, the
-segment number is the PE#.
-
-Therefore, the only way to control the PE# for a VF is to change the base
-of the VF(n) BAR space in the VF BAR.  If the PCI core allocates the exact
-amount of space required for the VF(n) BAR space, the VF BAR value is fixed
-and cannot be changed.
-
-On the other hand, if the PCI core allocates additional space, the VF BAR
-value can be changed as long as the entire VF(n) BAR space remains inside
-the space allocated by the core.
-
-Ideally the segment size will be the same as an individual VF BAR size.
-Then each VF will be in its own PE.  The VF BARs (and therefore the PE#s)
-are contiguous.  If VF0 is in PE(x), then VF(n) is in PE(x+n).  If we
-allocate 256 segments, there are (256 - numVFs) choices for the PE# of VF0.
-
-If the segment size is smaller than the VF BAR size, it will take several
-segments to cover a VF BAR, and a VF will be in several PEs.  This is
-possible, but the isolation isn't as good, and it reduces the number of PE#
-choices because instead of consuming only numVFs segments, the VF(n) BAR
-space will consume (numVFs * n) segments.  That means there aren't as many
-available segments for adjusting base of the VF(n) BAR space.
diff --git a/Documentation/powerpc/pmu-ebb.rst b/Documentation/powerpc/pmu-ebb.rst
new file mode 100644
index 000000000000..4f474758eb55
--- /dev/null
+++ b/Documentation/powerpc/pmu-ebb.rst
@@ -0,0 +1,138 @@
+========================
+PMU Event Based Branches
+========================
+
+Event Based Branches (EBBs) are a feature which allows the hardware to
+branch directly to a specified user space address when certain events occur.
+
+The full specification is available in Power ISA v2.07:
+
+  https://www.power.org/documentation/power-isa-version-2-07/
+
+One type of event for which EBBs can be configured is PMU exceptions. This
+document describes the API for configuring the Power PMU to generate EBBs,
+using the Linux perf_events API.
+
+
+Terminology
+-----------
+
+Throughout this document we will refer to an "EBB event" or "EBB events". This
+just refers to a struct perf_event which has set the "EBB" flag in its
+attr.config. All events which can be configured on the hardware PMU are
+possible "EBB events".
+
+
+Background
+----------
+
+When a PMU EBB occurs it is delivered to the currently running process. As such
+EBBs can only sensibly be used by programs for self-monitoring.
+
+It is a feature of the perf_events API that events can be created on other
+processes, subject to standard permission checks. This is also true of EBB
+events, however unless the target process enables EBBs (via mtspr(BESCR)) no
+EBBs will ever be delivered.
+
+This makes it possible for a process to enable EBBs for itself, but not
+actually configure any events. At a later time another process can come along
+and attach an EBB event to the process, which will then cause EBBs to be
+delivered to the first process. It's not clear if this is actually useful.
+
+
+When the PMU is configured for EBBs, all PMU interrupts are delivered to the
+user process. This means once an EBB event is scheduled on the PMU, no non-EBB
+events can be configured. This means that EBB events can not be run
+concurrently with regular 'perf' commands, or any other perf events.
+
+It is however safe to run 'perf' commands on a process which is using EBBs. The
+kernel will in general schedule the EBB event, and perf will be notified that
+its events could not run.
+
+The exclusion between EBB events and regular events is implemented using the
+existing "pinned" and "exclusive" attributes of perf_events. This means EBB
+events will be given priority over other events, unless they are also pinned.
+If an EBB event and a regular event are both pinned, then whichever is enabled
+first will be scheduled and the other will be put in error state. See the
+section below titled "Enabling an EBB event" for more information.
+
+
+Creating an EBB event
+---------------------
+
+To request that an event is counted using EBB, the event code should have bit
+63 set.
+
+EBB events must be created with a particular, and restrictive, set of
+attributes - this is so that they interoperate correctly with the rest of the
+perf_events subsystem.
+
+An EBB event must be created with the "pinned" and "exclusive" attributes set.
+Note that if you are creating a group of EBB events, only the leader can have
+these attributes set.
+
+An EBB event must NOT set any of the "inherit", "sample_period", "freq" or
+"enable_on_exec" attributes.
+
+An EBB event must be attached to a task. This is specified to perf_event_open()
+by passing a pid value, typically 0 indicating the current task.
+
+All events in a group must agree on whether they want EBB. That is all events
+must request EBB, or none may request EBB.
+
+EBB events must specify the PMC they are to be counted on. This ensures
+userspace is able to reliably determine which PMC the event is scheduled on.
+
+
+Enabling an EBB event
+---------------------
+
+Once an EBB event has been successfully opened, it must be enabled with the
+perf_events API. This can be achieved either via the ioctl() interface, or the
+prctl() interface.
+
+However, due to the design of the perf_events API, enabling an event does not
+guarantee that it has been scheduled on the PMU. To ensure that the EBB event
+has been scheduled on the PMU, you must perform a read() on the event. If the
+read() returns EOF, then the event has not been scheduled and EBBs are not
+enabled.
+
+This behaviour occurs because the EBB event is pinned and exclusive. When the
+EBB event is enabled it will force all other non-pinned events off the PMU. In
+this case the enable will be successful. However if there is already an event
+pinned on the PMU then the enable will not be successful.
+
+
+Reading an EBB event
+--------------------
+
+It is possible to read() from an EBB event. However the results are
+meaningless. Because interrupts are being delivered to the user process the
+kernel is not able to count the event, and so will return a junk value.
+
+
+Closing an EBB event
+--------------------
+
+When an EBB event is finished with, you can close it using close() as for any
+regular event. If this is the last EBB event the PMU will be deconfigured and
+no further PMU EBBs will be delivered.
+
+
+EBB Handler
+-----------
+
+The EBB handler is just regular userspace code, however it must be written in
+the style of an interrupt handler. When the handler is entered all registers
+are live (possibly) and so must be saved somehow before the handler can invoke
+other code.
+
+It's up to the program how to handle this. For C programs a relatively simple
+option is to create an interrupt frame on the stack and save registers there.
+
+Fork
+----
+
+EBB events are not inherited across fork. If the child process wishes to use
+EBBs it should open a new event for itself. Similarly the EBB state in
+BESCR/EBBHR/EBBRR is cleared across fork().
diff --git a/Documentation/powerpc/pmu-ebb.txt b/Documentation/powerpc/pmu-ebb.txt
deleted file mode 100644
index 73cd163dbfb8..000000000000
--- a/Documentation/powerpc/pmu-ebb.txt
+++ /dev/null
@@ -1,137 +0,0 @@
-PMU Event Based Branches
-========================
-
-Event Based Branches (EBBs) are a feature which allows the hardware to
-branch directly to a specified user space address when certain events occur.
-
-The full specification is available in Power ISA v2.07:
-
-  https://www.power.org/documentation/power-isa-version-2-07/
-
-One type of event for which EBBs can be configured is PMU exceptions. This
-document describes the API for configuring the Power PMU to generate EBBs,
-using the Linux perf_events API.
-
-
-Terminology
------------
-
-Throughout this document we will refer to an "EBB event" or "EBB events". This
-just refers to a struct perf_event which has set the "EBB" flag in its
-attr.config. All events which can be configured on the hardware PMU are
-possible "EBB events".
-
-
-Background
-----------
-
-When a PMU EBB occurs it is delivered to the currently running process. As such
-EBBs can only sensibly be used by programs for self-monitoring.
-
-It is a feature of the perf_events API that events can be created on other
-processes, subject to standard permission checks. This is also true of EBB
-events, however unless the target process enables EBBs (via mtspr(BESCR)) no
-EBBs will ever be delivered.
-
-This makes it possible for a process to enable EBBs for itself, but not
-actually configure any events. At a later time another process can come along
-and attach an EBB event to the process, which will then cause EBBs to be
-delivered to the first process. It's not clear if this is actually useful.
-
-
-When the PMU is configured for EBBs, all PMU interrupts are delivered to the
-user process. This means once an EBB event is scheduled on the PMU, no non-EBB
-events can be configured. This means that EBB events can not be run
-concurrently with regular 'perf' commands, or any other perf events.
-
-It is however safe to run 'perf' commands on a process which is using EBBs. The
-kernel will in general schedule the EBB event, and perf will be notified that
-its events could not run.
-
-The exclusion between EBB events and regular events is implemented using the
-existing "pinned" and "exclusive" attributes of perf_events. This means EBB
-events will be given priority over other events, unless they are also pinned.
-If an EBB event and a regular event are both pinned, then whichever is enabled
-first will be scheduled and the other will be put in error state. See the
-section below titled "Enabling an EBB event" for more information.
-
-
-Creating an EBB event
----------------------
-
-To request that an event is counted using EBB, the event code should have bit
-63 set.
-
-EBB events must be created with a particular, and restrictive, set of
-attributes - this is so that they interoperate correctly with the rest of the
-perf_events subsystem.
-
-An EBB event must be created with the "pinned" and "exclusive" attributes set.
-Note that if you are creating a group of EBB events, only the leader can have
-these attributes set.
-
-An EBB event must NOT set any of the "inherit", "sample_period", "freq" or
-"enable_on_exec" attributes.
-
-An EBB event must be attached to a task. This is specified to perf_event_open()
-by passing a pid value, typically 0 indicating the current task.
-
-All events in a group must agree on whether they want EBB. That is all events
-must request EBB, or none may request EBB.
-
-EBB events must specify the PMC they are to be counted on. This ensures
-userspace is able to reliably determine which PMC the event is scheduled on.
-
-
-Enabling an EBB event
----------------------
-
-Once an EBB event has been successfully opened, it must be enabled with the
-perf_events API. This can be achieved either via the ioctl() interface, or the
-prctl() interface.
-
-However, due to the design of the perf_events API, enabling an event does not
-guarantee that it has been scheduled on the PMU. To ensure that the EBB event
-has been scheduled on the PMU, you must perform a read() on the event. If the
-read() returns EOF, then the event has not been scheduled and EBBs are not
-enabled.
-
-This behaviour occurs because the EBB event is pinned and exclusive. When the
-EBB event is enabled it will force all other non-pinned events off the PMU. In
-this case the enable will be successful. However if there is already an event
-pinned on the PMU then the enable will not be successful.
-
-
-Reading an EBB event
---------------------
-
-It is possible to read() from an EBB event. However the results are
-meaningless. Because interrupts are being delivered to the user process the
-kernel is not able to count the event, and so will return a junk value.
-
-
-Closing an EBB event
---------------------
-
-When an EBB event is finished with, you can close it using close() as for any
-regular event. If this is the last EBB event the PMU will be deconfigured and
-no further PMU EBBs will be delivered.
-
-
-EBB Handler
------------
-
-The EBB handler is just regular userspace code, however it must be written in
-the style of an interrupt handler. When the handler is entered all registers
-are live (possibly) and so must be saved somehow before the handler can invoke
-other code.
-
-It's up to the program how to handle this. For C programs a relatively simple
-option is to create an interrupt frame on the stack and save registers there.
-
-Fork
-----
-
-EBB events are not inherited across fork. If the child process wishes to use
-EBBs it should open a new event for itself. Similarly the EBB state in
-BESCR/EBBHR/EBBRR is cleared across fork().
diff --git a/Documentation/powerpc/ptrace.rst b/Documentation/powerpc/ptrace.rst
new file mode 100644
index 000000000000..864d4b6dddd1
--- /dev/null
+++ b/Documentation/powerpc/ptrace.rst
@@ -0,0 +1,156 @@
+======
+Ptrace
+======
+
+GDB intends to support the following hardware debug features of BookE
+processors:
+
+4 hardware breakpoints (IAC)
+2 hardware watchpoints (read, write and read-write) (DAC)
+2 value conditions for the hardware watchpoints (DVC)
+
+For that, we need to extend ptrace so that GDB can query and set these
+resources. Since we're extending, we're trying to create an interface
+that's extendable and that covers both BookE and server processors, so
+that GDB doesn't need to special-case each of them. We added the
+following 3 new ptrace requests.
+
+1. PTRACE_PPC_GETHWDEBUGINFO
+============================
+
+Query for GDB to discover the hardware debug features. The main info to
+be returned here is the minimum alignment for the hardware watchpoints.
+BookE processors don't have restrictions here, but server processors have
+an 8-byte alignment restriction for hardware watchpoints. We'd like to avoid
+adding special cases to GDB based on what it sees in AUXV.
+
+Since we're at it, we added other useful info that the kernel can return to
+GDB: this query will return the number of hardware breakpoints, hardware
+watchpoints and whether it supports a range of addresses and a condition.
+The query will fill the following structure provided by the requesting process::
+
+  struct ppc_debug_info {
+       unit32_t version;
+       unit32_t num_instruction_bps;
+       unit32_t num_data_bps;
+       unit32_t num_condition_regs;
+       unit32_t data_bp_alignment;
+       unit32_t sizeof_condition; /* size of the DVC register */
+       uint64_t features; /* bitmask of the individual flags */
+  };
+
+features will have bits indicating whether there is support for::
+
+  #define PPC_DEBUG_FEATURE_INSN_BP_RANGE		0x1
+  #define PPC_DEBUG_FEATURE_INSN_BP_MASK		0x2
+  #define PPC_DEBUG_FEATURE_DATA_BP_RANGE		0x4
+  #define PPC_DEBUG_FEATURE_DATA_BP_MASK		0x8
+  #define PPC_DEBUG_FEATURE_DATA_BP_DAWR		0x10
+
+2. PTRACE_SETHWDEBUG
+
+Sets a hardware breakpoint or watchpoint, according to the provided structure::
+
+  struct ppc_hw_breakpoint {
+        uint32_t version;
+  #define PPC_BREAKPOINT_TRIGGER_EXECUTE  0x1
+  #define PPC_BREAKPOINT_TRIGGER_READ     0x2
+ #define PPC_BREAKPOINT_TRIGGER_WRITE    0x4
+        uint32_t trigger_type;       /* only some combinations allowed */
+  #define PPC_BREAKPOINT_MODE_EXACT               0x0
+  #define PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE     0x1
+  #define PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE     0x2
+  #define PPC_BREAKPOINT_MODE_MASK                0x3
+        uint32_t addr_mode;          /* address match mode */
+
+  #define PPC_BREAKPOINT_CONDITION_MODE   0x3
+  #define PPC_BREAKPOINT_CONDITION_NONE   0x0
+  #define PPC_BREAKPOINT_CONDITION_AND    0x1
+  #define PPC_BREAKPOINT_CONDITION_EXACT  0x1	/* different name for the same thing as above */
+  #define PPC_BREAKPOINT_CONDITION_OR     0x2
+  #define PPC_BREAKPOINT_CONDITION_AND_OR 0x3
+  #define PPC_BREAKPOINT_CONDITION_BE_ALL 0x00ff0000	/* byte enable bits */
+  #define PPC_BREAKPOINT_CONDITION_BE(n)  (1<<((n)+16))
+        uint32_t condition_mode;     /* break/watchpoint condition flags */
+
+        uint64_t addr;
+        uint64_t addr2;
+        uint64_t condition_value;
+  };
+
+A request specifies one event, not necessarily just one register to be set.
+For instance, if the request is for a watchpoint with a condition, both the
+DAC and DVC registers will be set in the same request.
+
+With this GDB can ask for all kinds of hardware breakpoints and watchpoints
+that the BookE supports. COMEFROM breakpoints available in server processors
+are not contemplated, but that is out of the scope of this work.
+
+ptrace will return an integer (handle) uniquely identifying the breakpoint or
+watchpoint just created. This integer will be used in the PTRACE_DELHWDEBUG
+request to ask for its removal. Return -ENOSPC if the requested breakpoint
+can't be allocated on the registers.
+
+Some examples of using the structure to:
+
+- set a breakpoint in the first breakpoint register::
+
+    p.version         = PPC_DEBUG_CURRENT_VERSION;
+    p.trigger_type    = PPC_BREAKPOINT_TRIGGER_EXECUTE;
+    p.addr_mode       = PPC_BREAKPOINT_MODE_EXACT;
+    p.condition_mode  = PPC_BREAKPOINT_CONDITION_NONE;
+    p.addr            = (uint64_t) address;
+    p.addr2           = 0;
+    p.condition_value = 0;
+
+- set a watchpoint which triggers on reads in the second watchpoint register::
+
+    p.version         = PPC_DEBUG_CURRENT_VERSION;
+    p.trigger_type    = PPC_BREAKPOINT_TRIGGER_READ;
+    p.addr_mode       = PPC_BREAKPOINT_MODE_EXACT;
+    p.condition_mode  = PPC_BREAKPOINT_CONDITION_NONE;
+    p.addr            = (uint64_t) address;
+    p.addr2           = 0;
+    p.condition_value = 0;
+
+- set a watchpoint which triggers only with a specific value::
+
+    p.version         = PPC_DEBUG_CURRENT_VERSION;
+    p.trigger_type    = PPC_BREAKPOINT_TRIGGER_READ;
+    p.addr_mode       = PPC_BREAKPOINT_MODE_EXACT;
+    p.condition_mode  = PPC_BREAKPOINT_CONDITION_AND | PPC_BREAKPOINT_CONDITION_BE_ALL;
+    p.addr            = (uint64_t) address;
+    p.addr2           = 0;
+    p.condition_value = (uint64_t) condition;
+
+- set a ranged hardware breakpoint::
+
+    p.version         = PPC_DEBUG_CURRENT_VERSION;
+    p.trigger_type    = PPC_BREAKPOINT_TRIGGER_EXECUTE;
+    p.addr_mode       = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE;
+    p.condition_mode  = PPC_BREAKPOINT_CONDITION_NONE;
+    p.addr            = (uint64_t) begin_range;
+    p.addr2           = (uint64_t) end_range;
+    p.condition_value = 0;
+
+- set a watchpoint in server processors (BookS)::
+
+    p.version         = 1;
+    p.trigger_type    = PPC_BREAKPOINT_TRIGGER_RW;
+    p.addr_mode       = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE;
+    or
+    p.addr_mode       = PPC_BREAKPOINT_MODE_EXACT;
+
+    p.condition_mode  = PPC_BREAKPOINT_CONDITION_NONE;
+    p.addr            = (uint64_t) begin_range;
+    /* For PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE addr2 needs to be specified, where
+     * addr2 - addr <= 8 Bytes.
+     */
+    p.addr2           = (uint64_t) end_range;
+    p.condition_value = 0;
+
+3. PTRACE_DELHWDEBUG
+
+Takes an integer which identifies an existing breakpoint or watchpoint
+(i.e., the value returned from PTRACE_SETHWDEBUG), and deletes the
+corresponding breakpoint or watchpoint..
diff --git a/Documentation/powerpc/ptrace.txt b/Documentation/powerpc/ptrace.txt
deleted file mode 100644
index 99c5ce88d0fe..000000000000
--- a/Documentation/powerpc/ptrace.txt
+++ /dev/null
@@ -1,151 +0,0 @@
-GDB intends to support the following hardware debug features of BookE
-processors:
-
-4 hardware breakpoints (IAC)
-2 hardware watchpoints (read, write and read-write) (DAC)
-2 value conditions for the hardware watchpoints (DVC)
-
-For that, we need to extend ptrace so that GDB can query and set these
-resources. Since we're extending, we're trying to create an interface
-that's extendable and that covers both BookE and server processors, so
-that GDB doesn't need to special-case each of them. We added the
-following 3 new ptrace requests.
-
-1. PTRACE_PPC_GETHWDEBUGINFO
-
-Query for GDB to discover the hardware debug features. The main info to
-be returned here is the minimum alignment for the hardware watchpoints.
-BookE processors don't have restrictions here, but server processors have
-an 8-byte alignment restriction for hardware watchpoints. We'd like to avoid
-adding special cases to GDB based on what it sees in AUXV.
-
-Since we're at it, we added other useful info that the kernel can return to
-GDB: this query will return the number of hardware breakpoints, hardware
-watchpoints and whether it supports a range of addresses and a condition.
-The query will fill the following structure provided by the requesting process:
-
-struct ppc_debug_info {
-       unit32_t version;
-       unit32_t num_instruction_bps;
-       unit32_t num_data_bps;
-       unit32_t num_condition_regs;
-       unit32_t data_bp_alignment;
-       unit32_t sizeof_condition; /* size of the DVC register */
-       uint64_t features; /* bitmask of the individual flags */
-};
-
-features will have bits indicating whether there is support for:
-
-#define PPC_DEBUG_FEATURE_INSN_BP_RANGE		0x1
-#define PPC_DEBUG_FEATURE_INSN_BP_MASK		0x2
-#define PPC_DEBUG_FEATURE_DATA_BP_RANGE		0x4
-#define PPC_DEBUG_FEATURE_DATA_BP_MASK		0x8
-#define PPC_DEBUG_FEATURE_DATA_BP_DAWR		0x10
-
-2. PTRACE_SETHWDEBUG
-
-Sets a hardware breakpoint or watchpoint, according to the provided structure:
-
-struct ppc_hw_breakpoint {
-        uint32_t version;
-#define PPC_BREAKPOINT_TRIGGER_EXECUTE  0x1
-#define PPC_BREAKPOINT_TRIGGER_READ     0x2
-#define PPC_BREAKPOINT_TRIGGER_WRITE    0x4
-        uint32_t trigger_type;       /* only some combinations allowed */
-#define PPC_BREAKPOINT_MODE_EXACT               0x0
-#define PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE     0x1
-#define PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE     0x2
-#define PPC_BREAKPOINT_MODE_MASK                0x3
-        uint32_t addr_mode;          /* address match mode */
-
-#define PPC_BREAKPOINT_CONDITION_MODE   0x3
-#define PPC_BREAKPOINT_CONDITION_NONE   0x0
-#define PPC_BREAKPOINT_CONDITION_AND    0x1
-#define PPC_BREAKPOINT_CONDITION_EXACT  0x1	/* different name for the same thing as above */
-#define PPC_BREAKPOINT_CONDITION_OR     0x2
-#define PPC_BREAKPOINT_CONDITION_AND_OR 0x3
-#define PPC_BREAKPOINT_CONDITION_BE_ALL 0x00ff0000	/* byte enable bits */
-#define PPC_BREAKPOINT_CONDITION_BE(n)  (1<<((n)+16))
-        uint32_t condition_mode;     /* break/watchpoint condition flags */
-
-        uint64_t addr;
-        uint64_t addr2;
-        uint64_t condition_value;
-};
-
-A request specifies one event, not necessarily just one register to be set.
-For instance, if the request is for a watchpoint with a condition, both the
-DAC and DVC registers will be set in the same request.
-
-With this GDB can ask for all kinds of hardware breakpoints and watchpoints
-that the BookE supports. COMEFROM breakpoints available in server processors
-are not contemplated, but that is out of the scope of this work.
-
-ptrace will return an integer (handle) uniquely identifying the breakpoint or
-watchpoint just created. This integer will be used in the PTRACE_DELHWDEBUG
-request to ask for its removal. Return -ENOSPC if the requested breakpoint
-can't be allocated on the registers.
-
-Some examples of using the structure to:
-
-- set a breakpoint in the first breakpoint register
-
-  p.version         = PPC_DEBUG_CURRENT_VERSION;
-  p.trigger_type    = PPC_BREAKPOINT_TRIGGER_EXECUTE;
-  p.addr_mode       = PPC_BREAKPOINT_MODE_EXACT;
-  p.condition_mode  = PPC_BREAKPOINT_CONDITION_NONE;
-  p.addr            = (uint64_t) address;
-  p.addr2           = 0;
-  p.condition_value = 0;
-
-- set a watchpoint which triggers on reads in the second watchpoint register
-
-  p.version         = PPC_DEBUG_CURRENT_VERSION;
-  p.trigger_type    = PPC_BREAKPOINT_TRIGGER_READ;
-  p.addr_mode       = PPC_BREAKPOINT_MODE_EXACT;
-  p.condition_mode  = PPC_BREAKPOINT_CONDITION_NONE;
-  p.addr            = (uint64_t) address;
-  p.addr2           = 0;
-  p.condition_value = 0;
-
-- set a watchpoint which triggers only with a specific value
-
-  p.version         = PPC_DEBUG_CURRENT_VERSION;
-  p.trigger_type    = PPC_BREAKPOINT_TRIGGER_READ;
-  p.addr_mode       = PPC_BREAKPOINT_MODE_EXACT;
-  p.condition_mode  = PPC_BREAKPOINT_CONDITION_AND | PPC_BREAKPOINT_CONDITION_BE_ALL;
-  p.addr            = (uint64_t) address;
-  p.addr2           = 0;
-  p.condition_value = (uint64_t) condition;
-
-- set a ranged hardware breakpoint
-
-  p.version         = PPC_DEBUG_CURRENT_VERSION;
-  p.trigger_type    = PPC_BREAKPOINT_TRIGGER_EXECUTE;
-  p.addr_mode       = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE;
-  p.condition_mode  = PPC_BREAKPOINT_CONDITION_NONE;
-  p.addr            = (uint64_t) begin_range;
-  p.addr2           = (uint64_t) end_range;
-  p.condition_value = 0;
-
-- set a watchpoint in server processors (BookS)
-
-  p.version         = 1;
-  p.trigger_type    = PPC_BREAKPOINT_TRIGGER_RW;
-  p.addr_mode       = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE;
-  or
-  p.addr_mode       = PPC_BREAKPOINT_MODE_EXACT;
-
-  p.condition_mode  = PPC_BREAKPOINT_CONDITION_NONE;
-  p.addr            = (uint64_t) begin_range;
-  /* For PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE addr2 needs to be specified, where
-   * addr2 - addr <= 8 Bytes.
-   */
-  p.addr2           = (uint64_t) end_range;
-  p.condition_value = 0;
-
-3. PTRACE_DELHWDEBUG
-
-Takes an integer which identifies an existing breakpoint or watchpoint
-(i.e., the value returned from PTRACE_SETHWDEBUG), and deletes the
-corresponding breakpoint or watchpoint..
diff --git a/Documentation/powerpc/qe_firmware.rst b/Documentation/powerpc/qe_firmware.rst
new file mode 100644
index 000000000000..42f5103140c9
--- /dev/null
+++ b/Documentation/powerpc/qe_firmware.rst
@@ -0,0 +1,296 @@
+=========================================
+Freescale QUICC Engine Firmware Uploading
+=========================================
+
+(c) 2007 Timur Tabi <timur at freescale.com>,
+    Freescale Semiconductor
+
+.. Table of Contents
+
+   I - Software License for Firmware
+
+   II - Microcode Availability
+
+   III - Description and Terminology
+
+   IV - Microcode Programming Details
+
+   V - Firmware Structure Layout
+
+   VI - Sample Code for Creating Firmware Files
+
+Revision Information
+====================
+
+November 30, 2007: Rev 1.0 - Initial version
+
+I - Software License for Firmware
+=================================
+
+Each firmware file comes with its own software license.  For information on
+the particular license, please see the license text that is distributed with
+the firmware.
+
+II - Microcode Availability
+===========================
+
+Firmware files are distributed through various channels.  Some are available on
+http://opensource.freescale.com.  For other firmware files, please contact
+your Freescale representative or your operating system vendor.
+
+III - Description and Terminology
+=================================
+
+In this document, the term 'microcode' refers to the sequence of 32-bit
+integers that compose the actual QE microcode.
+
+The term 'firmware' refers to a binary blob that contains the microcode as
+well as other data that
+
+	1) describes the microcode's purpose
+	2) describes how and where to upload the microcode
+	3) specifies the values of various registers
+	4) includes additional data for use by specific device drivers
+
+Firmware files are binary files that contain only a firmware.
+
+IV - Microcode Programming Details
+===================================
+
+The QE architecture allows for only one microcode present in I-RAM for each
+RISC processor.  To replace any current microcode, a full QE reset (which
+disables the microcode) must be performed first.
+
+QE microcode is uploaded using the following procedure:
+
+1) The microcode is placed into I-RAM at a specific location, using the
+   IRAM.IADD and IRAM.IDATA registers.
+
+2) The CERCR.CIR bit is set to 0 or 1, depending on whether the firmware
+   needs split I-RAM.  Split I-RAM is only meaningful for SOCs that have
+   QEs with multiple RISC processors, such as the 8360.  Splitting the I-RAM
+   allows each processor to run a different microcode, effectively creating an
+   asymmetric multiprocessing (AMP) system.
+
+3) The TIBCR trap registers are loaded with the addresses of the trap handlers
+   in the microcode.
+
+4) The RSP.ECCR register is programmed with the value provided.
+
+5) If necessary, device drivers that need the virtual traps and extended mode
+   data will use them.
+
+Virtual Microcode Traps
+
+These virtual traps are conditional branches in the microcode.  These are
+"soft" provisional introduced in the ROMcode in order to enable higher
+flexibility and save h/w traps If new features are activated or an issue is
+being fixed in the RAM package utilizing they should be activated.  This data
+structure signals the microcode which of these virtual traps is active.
+
+This structure contains 6 words that the application should copy to some
+specific been defined.  This table describes the structure::
+
+	---------------------------------------------------------------
+	| Offset in |                  | Destination Offset | Size of |
+	|   array   |     Protocol     |   within PRAM      | Operand |
+	--------------------------------------------------------------|
+	|     0     | Ethernet         |      0xF8          | 4 bytes |
+	|           | interworking     |                    |         |
+	---------------------------------------------------------------
+	|     4     | ATM              |      0xF8          | 4 bytes |
+	|           | interworking     |                    |         |
+	---------------------------------------------------------------
+	|     8     | PPP              |      0xF8          | 4 bytes |
+	|           | interworking     |                    |         |
+	---------------------------------------------------------------
+	|     12    | Ethernet RX      |      0x22          | 1 byte  |
+	|           | Distributor Page |                    |         |
+	---------------------------------------------------------------
+	|     16    | ATM Globtal      |      0x28          | 1 byte  |
+	|           | Params Table     |                    |         |
+	---------------------------------------------------------------
+	|     20    | Insert Frame     |      0xF8          | 4 bytes |
+	---------------------------------------------------------------
+
+
+Extended Modes
+
+This is a double word bit array (64 bits) that defines special functionality
+which has an impact on the software drivers.  Each bit has its own impact
+and has special instructions for the s/w associated with it.  This structure is
+described in this table::
+
+	-----------------------------------------------------------------------
+	| Bit #  |     Name     |   Description                               |
+	-----------------------------------------------------------------------
+	|   0    | General      | Indicates that prior to each host command   |
+	|        | push command | given by the application, the software must |
+	|        |              | assert a special host command (push command)|
+	|        |              | CECDR = 0x00800000.                         |
+	|        |              | CECR = 0x01c1000f.                          |
+	-----------------------------------------------------------------------
+	|   1    | UCC ATM      | Indicates that after issuing ATM RX INIT    |
+	|        | RX INIT      | command, the host must issue another special|
+	|        | push command | command (push command) and immediately      |
+	|        |              | following that re-issue the ATM RX INIT     |
+	|        |              | command. (This makes the sequence of        |
+	|        |              | initializing the ATM receiver a sequence of |
+	|        |              | three host commands)                        |
+	|        |              | CECDR = 0x00800000.                         |
+	|        |              | CECR = 0x01c1000f.                          |
+	-----------------------------------------------------------------------
+	|   2    | Add/remove   | Indicates that following the specific host  |
+	|        | command      | command: "Add/Remove entry in Hash Lookup   |
+	|        | validation   | Table" used in Interworking setup, the user |
+	|        |              | must issue another command.                 |
+	|        |              | CECDR = 0xce000003.                         |
+	|        |              | CECR = 0x01c10f58.                          |
+	-----------------------------------------------------------------------
+	|   3    | General push | Indicates that the s/w has to initialize    |
+	|        | command      | some pointers in the Ethernet thread pages  |
+	|        |              | which are used when Header Compression is   |
+	|        |              | activated.  The full details of these       |
+	|        |              | pointers is located in the software drivers.|
+	-----------------------------------------------------------------------
+	|   4    | General push | Indicates that after issuing Ethernet TX    |
+	|        | command      | INIT command, user must issue this command  |
+	|        |              | for each SNUM of Ethernet TX thread.        |
+	|        |              | CECDR = 0x00800003.                         |
+	|        |              | CECR = 0x7'b{0}, 8'b{Enet TX thread SNUM},  |
+	|        |              |        1'b{1}, 12'b{0}, 4'b{1}              |
+	-----------------------------------------------------------------------
+	| 5 - 31 |     N/A      | Reserved, set to zero.                      |
+	-----------------------------------------------------------------------
+
+V - Firmware Structure Layout
+==============================
+
+QE microcode from Freescale is typically provided as a header file.  This
+header file contains macros that define the microcode binary itself as well as
+some other data used in uploading that microcode.  The format of these files
+do not lend themselves to simple inclusion into other code.  Hence,
+the need for a more portable format.  This section defines that format.
+
+Instead of distributing a header file, the microcode and related data are
+embedded into a binary blob.  This blob is passed to the qe_upload_firmware()
+function, which parses the blob and performs everything necessary to upload
+the microcode.
+
+All integers are big-endian.  See the comments for function
+qe_upload_firmware() for up-to-date implementation information.
+
+This structure supports versioning, where the version of the structure is
+embedded into the structure itself.  To ensure forward and backwards
+compatibility, all versions of the structure must use the same 'qe_header'
+structure at the beginning.
+
+'header' (type: struct qe_header):
+	The 'length' field is the size, in bytes, of the entire structure,
+	including all the microcode embedded in it, as well as the CRC (if
+	present).
+
+	The 'magic' field is an array of three bytes that contains the letters
+	'Q', 'E', and 'F'.  This is an identifier that indicates that this
+	structure is a QE Firmware structure.
+
+	The 'version' field is a single byte that indicates the version of this
+	structure.  If the layout of the structure should ever need to be
+	changed to add support for additional types of microcode, then the
+	version number should also be changed.
+
+The 'id' field is a null-terminated string(suitable for printing) that
+identifies the firmware.
+
+The 'count' field indicates the number of 'microcode' structures.  There
+must be one and only one 'microcode' structure for each RISC processor.
+Therefore, this field also represents the number of RISC processors for this
+SOC.
+
+The 'soc' structure contains the SOC numbers and revisions used to match
+the microcode to the SOC itself.  Normally, the microcode loader should
+check the data in this structure with the SOC number and revisions, and
+only upload the microcode if there's a match.  However, this check is not
+made on all platforms.
+
+Although it is not recommended, you can specify '0' in the soc.model
+field to skip matching SOCs altogether.
+
+The 'model' field is a 16-bit number that matches the actual SOC. The
+'major' and 'minor' fields are the major and minor revision numbers,
+respectively, of the SOC.
+
+For example, to match the 8323, revision 1.0::
+
+     soc.model = 8323
+     soc.major = 1
+     soc.minor = 0
+
+'padding' is necessary for structure alignment.  This field ensures that the
+'extended_modes' field is aligned on a 64-bit boundary.
+
+'extended_modes' is a bitfield that defines special functionality which has an
+impact on the device drivers.  Each bit has its own impact and has special
+instructions for the driver associated with it.  This field is stored in
+the QE library and available to any driver that calles qe_get_firmware_info().
+
+'vtraps' is an array of 8 words that contain virtual trap values for each
+virtual traps.  As with 'extended_modes', this field is stored in the QE
+library and available to any driver that calles qe_get_firmware_info().
+
+'microcode' (type: struct qe_microcode):
+	For each RISC processor there is one 'microcode' structure.  The first
+	'microcode' structure is for the first RISC, and so on.
+
+	The 'id' field is a null-terminated string suitable for printing that
+	identifies this particular microcode.
+
+	'traps' is an array of 16 words that contain hardware trap values
+	for each of the 16 traps.  If trap[i] is 0, then this particular
+	trap is to be ignored (i.e. not written to TIBCR[i]).  The entire value
+	is written as-is to the TIBCR[i] register, so be sure to set the EN
+	and T_IBP bits if necessary.
+
+	'eccr' is the value to program into the ECCR register.
+
+	'iram_offset' is the offset into IRAM to start writing the
+	microcode.
+
+	'count' is the number of 32-bit words in the microcode.
+
+	'code_offset' is the offset, in bytes, from the beginning of this
+	structure where the microcode itself can be found.  The first
+	microcode binary should be located immediately after the 'microcode'
+	array.
+
+	'major', 'minor', and 'revision' are the major, minor, and revision
+	version numbers, respectively, of the microcode.  If all values are 0,
+	then these fields are ignored.
+
+	'reserved' is necessary for structure alignment.  Since 'microcode'
+	is an array, the 64-bit 'extended_modes' field needs to be aligned
+	on a 64-bit boundary, and this can only happen if the size of
+	'microcode' is a multiple of 8 bytes.  To ensure that, we add
+	'reserved'.
+
+After the last microcode is a 32-bit CRC.  It can be calculated using
+this algorithm::
+
+  u32 crc32(const u8 *p, unsigned int len)
+  {
+	unsigned int i;
+	u32 crc = 0;
+
+	while (len--) {
+	   crc ^= *p++;
+	   for (i = 0; i < 8; i++)
+		   crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320 : 0);
+	}
+	return crc;
+  }
+
+VI - Sample Code for Creating Firmware Files
+============================================
+
+A Python program that creates firmware binaries from the header files normally
+distributed by Freescale can be found on http://opensource.freescale.com.
diff --git a/Documentation/powerpc/qe_firmware.txt b/Documentation/powerpc/qe_firmware.txt
deleted file mode 100644
index e7ac24aec4ff..000000000000
--- a/Documentation/powerpc/qe_firmware.txt
+++ /dev/null
@@ -1,295 +0,0 @@
-	   Freescale QUICC Engine Firmware Uploading
-	   -----------------------------------------
-
-(c) 2007 Timur Tabi <timur at freescale.com>,
-    Freescale Semiconductor
-
-Table of Contents
-=================
-
-  I - Software License for Firmware
-
-  II - Microcode Availability
-
-  III - Description and Terminology
-
-  IV - Microcode Programming Details
-
-  V - Firmware Structure Layout
-
-  VI - Sample Code for Creating Firmware Files
-
-Revision Information
-====================
-
-November 30, 2007: Rev 1.0 - Initial version
-
-I - Software License for Firmware
-=================================
-
-Each firmware file comes with its own software license.  For information on
-the particular license, please see the license text that is distributed with
-the firmware.
-
-II - Microcode Availability
-===========================
-
-Firmware files are distributed through various channels.  Some are available on
-http://opensource.freescale.com.  For other firmware files, please contact
-your Freescale representative or your operating system vendor.
-
-III - Description and Terminology
-================================
-
-In this document, the term 'microcode' refers to the sequence of 32-bit
-integers that compose the actual QE microcode.
-
-The term 'firmware' refers to a binary blob that contains the microcode as
-well as other data that
-
-	1) describes the microcode's purpose
-	2) describes how and where to upload the microcode
-	3) specifies the values of various registers
-	4) includes additional data for use by specific device drivers
-
-Firmware files are binary files that contain only a firmware.
-
-IV - Microcode Programming Details
-===================================
-
-The QE architecture allows for only one microcode present in I-RAM for each
-RISC processor.  To replace any current microcode, a full QE reset (which
-disables the microcode) must be performed first.
-
-QE microcode is uploaded using the following procedure:
-
-1) The microcode is placed into I-RAM at a specific location, using the
-   IRAM.IADD and IRAM.IDATA registers.
-
-2) The CERCR.CIR bit is set to 0 or 1, depending on whether the firmware
-   needs split I-RAM.  Split I-RAM is only meaningful for SOCs that have
-   QEs with multiple RISC processors, such as the 8360.  Splitting the I-RAM
-   allows each processor to run a different microcode, effectively creating an
-   asymmetric multiprocessing (AMP) system.
-
-3) The TIBCR trap registers are loaded with the addresses of the trap handlers
-   in the microcode.
-
-4) The RSP.ECCR register is programmed with the value provided.
-
-5) If necessary, device drivers that need the virtual traps and extended mode
-   data will use them.
-
-Virtual Microcode Traps
-
-These virtual traps are conditional branches in the microcode.  These are
-"soft" provisional introduced in the ROMcode in order to enable higher
-flexibility and save h/w traps If new features are activated or an issue is
-being fixed in the RAM package utilizing they should be activated.  This data
-structure signals the microcode which of these virtual traps is active.
-
-This structure contains 6 words that the application should copy to some
-specific been defined.  This table describes the structure.
-
-	---------------------------------------------------------------
-	| Offset in |                  | Destination Offset | Size of |
-	|   array   |     Protocol     |   within PRAM      | Operand |
-	--------------------------------------------------------------|
-	|     0     | Ethernet         |      0xF8          | 4 bytes |
-	|           | interworking     |                    |         |
-	---------------------------------------------------------------
-	|     4     | ATM              |      0xF8          | 4 bytes |
-	|           | interworking     |                    |         |
-	---------------------------------------------------------------
-	|     8     | PPP              |      0xF8          | 4 bytes |
-	|           | interworking     |                    |         |
-	---------------------------------------------------------------
-	|     12    | Ethernet RX      |      0x22          | 1 byte  |
-	|           | Distributor Page |                    |         |
-	---------------------------------------------------------------
-	|     16    | ATM Globtal      |      0x28          | 1 byte  |
-	|           | Params Table     |                    |         |
-	---------------------------------------------------------------
-	|     20    | Insert Frame     |      0xF8          | 4 bytes |
-	---------------------------------------------------------------
-
-
-Extended Modes
-
-This is a double word bit array (64 bits) that defines special functionality
-which has an impact on the software drivers.  Each bit has its own impact
-and has special instructions for the s/w associated with it.  This structure is
-described in this table:
-
-	-----------------------------------------------------------------------
-	| Bit #  |     Name     |   Description                               |
-	-----------------------------------------------------------------------
-	|   0    | General      | Indicates that prior to each host command   |
-	|        | push command | given by the application, the software must |
-	|        |              | assert a special host command (push command)|
-	|        |              | CECDR = 0x00800000.                         |
-	|        |              | CECR = 0x01c1000f.                          |
-	-----------------------------------------------------------------------
-	|   1    | UCC ATM      | Indicates that after issuing ATM RX INIT    |
-	|        | RX INIT      | command, the host must issue another special|
-	|        | push command | command (push command) and immediately      |
-	|        |              | following that re-issue the ATM RX INIT     |
-	|        |              | command. (This makes the sequence of        |
-	|        |              | initializing the ATM receiver a sequence of |
-	|        |              | three host commands)                        |
-	|        |              | CECDR = 0x00800000.                         |
-	|        |              | CECR = 0x01c1000f.                          |
-	-----------------------------------------------------------------------
-	|   2    | Add/remove   | Indicates that following the specific host  |
-	|        | command      | command: "Add/Remove entry in Hash Lookup   |
-	|        | validation   | Table" used in Interworking setup, the user |
-	|        |              | must issue another command.                 |
-	|        |              | CECDR = 0xce000003.                         |
-	|        |              | CECR = 0x01c10f58.                          |
-	-----------------------------------------------------------------------
-	|   3    | General push | Indicates that the s/w has to initialize    |
-	|        | command      | some pointers in the Ethernet thread pages  |
-	|        |              | which are used when Header Compression is   |
-	|        |              | activated.  The full details of these       |
-	|        |              | pointers is located in the software drivers.|
-	-----------------------------------------------------------------------
-	|   4    | General push | Indicates that after issuing Ethernet TX    |
-	|        | command      | INIT command, user must issue this command  |
-	|        |              | for each SNUM of Ethernet TX thread.        |
-	|        |              | CECDR = 0x00800003.                         |
-	|        |              | CECR = 0x7'b{0}, 8'b{Enet TX thread SNUM},  |
-	|        |              |        1'b{1}, 12'b{0}, 4'b{1}              |
-	-----------------------------------------------------------------------
-	| 5 - 31 |     N/A      | Reserved, set to zero.                      |
-	-----------------------------------------------------------------------
-
-V - Firmware Structure Layout
-==============================
-
-QE microcode from Freescale is typically provided as a header file.  This
-header file contains macros that define the microcode binary itself as well as
-some other data used in uploading that microcode.  The format of these files
-do not lend themselves to simple inclusion into other code.  Hence,
-the need for a more portable format.  This section defines that format.
-
-Instead of distributing a header file, the microcode and related data are
-embedded into a binary blob.  This blob is passed to the qe_upload_firmware()
-function, which parses the blob and performs everything necessary to upload
-the microcode.
-
-All integers are big-endian.  See the comments for function
-qe_upload_firmware() for up-to-date implementation information.
-
-This structure supports versioning, where the version of the structure is
-embedded into the structure itself.  To ensure forward and backwards
-compatibility, all versions of the structure must use the same 'qe_header'
-structure at the beginning.
-
-'header' (type: struct qe_header):
-	The 'length' field is the size, in bytes, of the entire structure,
-	including all the microcode embedded in it, as well as the CRC (if
-	present).
-
-	The 'magic' field is an array of three bytes that contains the letters
-	'Q', 'E', and 'F'.  This is an identifier that indicates that this
-	structure is a QE Firmware structure.
-
-	The 'version' field is a single byte that indicates the version of this
-	structure.  If the layout of the structure should ever need to be
-	changed to add support for additional types of microcode, then the
-	version number should also be changed.
-
-The 'id' field is a null-terminated string(suitable for printing) that
-identifies the firmware.
-
-The 'count' field indicates the number of 'microcode' structures.  There
-must be one and only one 'microcode' structure for each RISC processor.
-Therefore, this field also represents the number of RISC processors for this
-SOC.
-
-The 'soc' structure contains the SOC numbers and revisions used to match
-the microcode to the SOC itself.  Normally, the microcode loader should
-check the data in this structure with the SOC number and revisions, and
-only upload the microcode if there's a match.  However, this check is not
-made on all platforms.
-
-Although it is not recommended, you can specify '0' in the soc.model
-field to skip matching SOCs altogether.
-
-The 'model' field is a 16-bit number that matches the actual SOC. The
-'major' and 'minor' fields are the major and minor revision numbers,
-respectively, of the SOC.
-
-For example, to match the 8323, revision 1.0:
-     soc.model = 8323
-     soc.major = 1
-     soc.minor = 0
-
-'padding' is necessary for structure alignment.  This field ensures that the
-'extended_modes' field is aligned on a 64-bit boundary.
-
-'extended_modes' is a bitfield that defines special functionality which has an
-impact on the device drivers.  Each bit has its own impact and has special
-instructions for the driver associated with it.  This field is stored in
-the QE library and available to any driver that calles qe_get_firmware_info().
-
-'vtraps' is an array of 8 words that contain virtual trap values for each
-virtual traps.  As with 'extended_modes', this field is stored in the QE
-library and available to any driver that calles qe_get_firmware_info().
-
-'microcode' (type: struct qe_microcode):
-	For each RISC processor there is one 'microcode' structure.  The first
-	'microcode' structure is for the first RISC, and so on.
-
-	The 'id' field is a null-terminated string suitable for printing that
-	identifies this particular microcode.
-
-	'traps' is an array of 16 words that contain hardware trap values
-	for each of the 16 traps.  If trap[i] is 0, then this particular
-	trap is to be ignored (i.e. not written to TIBCR[i]).  The entire value
-	is written as-is to the TIBCR[i] register, so be sure to set the EN
-	and T_IBP bits if necessary.
-
-	'eccr' is the value to program into the ECCR register.
-
-	'iram_offset' is the offset into IRAM to start writing the
-	microcode.
-
-	'count' is the number of 32-bit words in the microcode.
-
-	'code_offset' is the offset, in bytes, from the beginning of this
-	structure where the microcode itself can be found.  The first
-	microcode binary should be located immediately after the 'microcode'
-	array.
-
-	'major', 'minor', and 'revision' are the major, minor, and revision
-	version numbers, respectively, of the microcode.  If all values are 0,
-	then these fields are ignored.
-
-	'reserved' is necessary for structure alignment.  Since 'microcode'
-	is an array, the 64-bit 'extended_modes' field needs to be aligned
-	on a 64-bit boundary, and this can only happen if the size of
-	'microcode' is a multiple of 8 bytes.  To ensure that, we add
-	'reserved'.
-
-After the last microcode is a 32-bit CRC.  It can be calculated using
-this algorithm:
-
-u32 crc32(const u8 *p, unsigned int len)
-{
-	unsigned int i;
-	u32 crc = 0;
-
-	while (len--) {
-	   crc ^= *p++;
-	   for (i = 0; i < 8; i++)
-		   crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320 : 0);
-	}
-	return crc;
-}
-
-VI - Sample Code for Creating Firmware Files
-============================================
-
-A Python program that creates firmware binaries from the header files normally
-distributed by Freescale can be found on http://opensource.freescale.com.
diff --git a/Documentation/powerpc/syscall64-abi.rst b/Documentation/powerpc/syscall64-abi.rst
new file mode 100644
index 000000000000..e49f69f941b9
--- /dev/null
+++ b/Documentation/powerpc/syscall64-abi.rst
@@ -0,0 +1,110 @@
+===============================================
+Power Architecture 64-bit Linux system call ABI
+===============================================
+
+syscall
+=======
+
+syscall calling sequence\ [1]_ matches the Power Architecture 64-bit ELF ABI
+specification C function calling sequence, including register preservation
+rules, with the following differences.
+
+.. [1] Some syscalls (typically low-level management functions) may have
+       different calling sequences (e.g., rt_sigreturn).
+
+Parameters and return value
+---------------------------
+The system call number is specified in r0.
+
+There is a maximum of 6 integer parameters to a syscall, passed in r3-r8.
+
+Both a return value and a return error code are returned. cr0.SO is the return
+error code, and r3 is the return value or error code. When cr0.SO is clear,
+the syscall succeeded and r3 is the return value. When cr0.SO is set, the
+syscall failed and r3 is the error code that generally corresponds to errno.
+
+Stack
+-----
+System calls do not modify the caller's stack frame. For example, the caller's
+stack frame LR and CR save fields are not used.
+
+Register preservation rules
+---------------------------
+Register preservation rules match the ELF ABI calling sequence with the
+following differences:
+
+=========== ============= ========================================
+r0          Volatile      (System call number.)
+r3          Volatile      (Parameter 1, and return value.)
+r4-r8       Volatile      (Parameters 2-6.)
+cr0         Volatile      (cr0.SO is the return error condition)
+cr1, cr5-7  Nonvolatile
+lr          Nonvolatile
+=========== ============= ========================================
+
+All floating point and vector data registers as well as control and status
+registers are nonvolatile.
+
+Invocation
+----------
+The syscall is performed with the sc instruction, and returns with execution
+continuing at the instruction following the sc instruction.
+
+Transactional Memory
+--------------------
+Syscall behavior can change if the processor is in transactional or suspended
+transaction state, and the syscall can affect the behavior of the transaction.
+
+If the processor is in suspended state when a syscall is made, the syscall
+will be performed as normal, and will return as normal. The syscall will be
+performed in suspended state, so its side effects will be persistent according
+to the usual transactional memory semantics. A syscall may or may not result
+in the transaction being doomed by hardware.
+
+If the processor is in transactional state when a syscall is made, then the
+behavior depends on the presence of PPC_FEATURE2_HTM_NOSC in the AT_HWCAP2 ELF
+auxiliary vector.
+
+- If present, which is the case for newer kernels, then the syscall will not
+  be performed and the transaction will be doomed by the kernel with the
+  failure code TM_CAUSE_SYSCALL | TM_CAUSE_PERSISTENT in the TEXASR SPR.
+
+- If not present (older kernels), then the kernel will suspend the
+  transactional state and the syscall will proceed as in the case of a
+  suspended state syscall, and will resume the transactional state before
+  returning to the caller. This case is not well defined or supported, so this
+  behavior should not be relied upon.
+
+
+vsyscall
+========
+
+vsyscall calling sequence matches the syscall calling sequence, with the
+following differences. Some vsyscalls may have different calling sequences.
+
+Parameters and return value
+---------------------------
+r0 is not used as an input. The vsyscall is selected by its address.
+
+Stack
+-----
+The vsyscall may or may not use the caller's stack frame save areas.
+
+Register preservation rules
+---------------------------
+
+=========== ========
+r0          Volatile
+cr1, cr5-7  Volatile
+lr          Volatile
+=========== ========
+
+Invocation
+----------
+The vsyscall is performed with a branch-with-link instruction to the vsyscall
+function address.
+
+Transactional Memory
+--------------------
+vsyscalls will run in the same transactional state as the caller. A vsyscall
+may or may not result in the transaction being doomed by hardware.
diff --git a/Documentation/powerpc/syscall64-abi.txt b/Documentation/powerpc/syscall64-abi.txt
deleted file mode 100644
index fa716a0d88bd..000000000000
--- a/Documentation/powerpc/syscall64-abi.txt
+++ /dev/null
@@ -1,105 +0,0 @@
-===============================================
-Power Architecture 64-bit Linux system call ABI
-===============================================
-
-syscall
-=======
-
-syscall calling sequence[*] matches the Power Architecture 64-bit ELF ABI
-specification C function calling sequence, including register preservation
-rules, with the following differences.
-
-[*] Some syscalls (typically low-level management functions) may have
-    different calling sequences (e.g., rt_sigreturn).
-
-Parameters and return value
----------------------------
-The system call number is specified in r0.
-
-There is a maximum of 6 integer parameters to a syscall, passed in r3-r8.
-
-Both a return value and a return error code are returned. cr0.SO is the return
-error code, and r3 is the return value or error code. When cr0.SO is clear,
-the syscall succeeded and r3 is the return value. When cr0.SO is set, the
-syscall failed and r3 is the error code that generally corresponds to errno.
-
-Stack
------
-System calls do not modify the caller's stack frame. For example, the caller's
-stack frame LR and CR save fields are not used.
-
-Register preservation rules
----------------------------
-Register preservation rules match the ELF ABI calling sequence with the
-following differences:
-
-r0:         Volatile.   (System call number.)
-r3:         Volatile.   (Parameter 1, and return value.)
-r4-r8:      Volatile.   (Parameters 2-6.)
-cr0:        Volatile    (cr0.SO is the return error condition)
-cr1, cr5-7: Nonvolatile.
-lr:         Nonvolatile.
-
-All floating point and vector data registers as well as control and status
-registers are nonvolatile.
-
-Invocation
-----------
-The syscall is performed with the sc instruction, and returns with execution
-continuing at the instruction following the sc instruction.
-
-Transactional Memory
---------------------
-Syscall behavior can change if the processor is in transactional or suspended
-transaction state, and the syscall can affect the behavior of the transaction.
-
-If the processor is in suspended state when a syscall is made, the syscall
-will be performed as normal, and will return as normal. The syscall will be
-performed in suspended state, so its side effects will be persistent according
-to the usual transactional memory semantics. A syscall may or may not result
-in the transaction being doomed by hardware.
-
-If the processor is in transactional state when a syscall is made, then the
-behavior depends on the presence of PPC_FEATURE2_HTM_NOSC in the AT_HWCAP2 ELF
-auxiliary vector.
-
-- If present, which is the case for newer kernels, then the syscall will not
-  be performed and the transaction will be doomed by the kernel with the
-  failure code TM_CAUSE_SYSCALL | TM_CAUSE_PERSISTENT in the TEXASR SPR.
-
-- If not present (older kernels), then the kernel will suspend the
-  transactional state and the syscall will proceed as in the case of a
-  suspended state syscall, and will resume the transactional state before
-  returning to the caller. This case is not well defined or supported, so this
-  behavior should not be relied upon.
-
-
-vsyscall
-========
-
-vsyscall calling sequence matches the syscall calling sequence, with the
-following differences. Some vsyscalls may have different calling sequences.
-
-Parameters and return value
----------------------------
-r0 is not used as an input. The vsyscall is selected by its address.
-
-Stack
------
-The vsyscall may or may not use the caller's stack frame save areas.
-
-Register preservation rules
----------------------------
-r0: Volatile.
-cr1, cr5-7: Volatile.
-lr: Volatile.
-
-Invocation
-----------
-The vsyscall is performed with a branch-with-link instruction to the vsyscall
-function address.
-
-Transactional Memory
---------------------
-vsyscalls will run in the same transactional state as the caller. A vsyscall
-may or may not result in the transaction being doomed by hardware.
diff --git a/Documentation/powerpc/transactional_memory.rst b/Documentation/powerpc/transactional_memory.rst
new file mode 100644
index 000000000000..09955103acb4
--- /dev/null
+++ b/Documentation/powerpc/transactional_memory.rst
@@ -0,0 +1,247 @@
+============================
+Transactional Memory support
+============================
+
+POWER kernel support for this feature is currently limited to supporting
+its use by user programs.  It is not currently used by the kernel itself.
+
+This file aims to sum up how it is supported by Linux and what behaviour you
+can expect from your user programs.
+
+
+Basic overview
+==============
+
+Hardware Transactional Memory is supported on POWER8 processors, and is a
+feature that enables a different form of atomic memory access.  Several new
+instructions are presented to delimit transactions; transactions are
+guaranteed to either complete atomically or roll back and undo any partial
+changes.
+
+A simple transaction looks like this::
+
+  begin_move_money:
+    tbegin
+    beq   abort_handler
+
+    ld    r4, SAVINGS_ACCT(r3)
+    ld    r5, CURRENT_ACCT(r3)
+    subi  r5, r5, 1
+    addi  r4, r4, 1
+    std   r4, SAVINGS_ACCT(r3)
+    std   r5, CURRENT_ACCT(r3)
+
+    tend
+
+    b     continue
+
+  abort_handler:
+    ... test for odd failures ...
+
+    /* Retry the transaction if it failed because it conflicted with
+     * someone else: */
+    b     begin_move_money
+
+
+The 'tbegin' instruction denotes the start point, and 'tend' the end point.
+Between these points the processor is in 'Transactional' state; any memory
+references will complete in one go if there are no conflicts with other
+transactional or non-transactional accesses within the system.  In this
+example, the transaction completes as though it were normal straight-line code
+IF no other processor has touched SAVINGS_ACCT(r3) or CURRENT_ACCT(r3); an
+atomic move of money from the current account to the savings account has been
+performed.  Even though the normal ld/std instructions are used (note no
+lwarx/stwcx), either *both* SAVINGS_ACCT(r3) and CURRENT_ACCT(r3) will be
+updated, or neither will be updated.
+
+If, in the meantime, there is a conflict with the locations accessed by the
+transaction, the transaction will be aborted by the CPU.  Register and memory
+state will roll back to that at the 'tbegin', and control will continue from
+'tbegin+4'.  The branch to abort_handler will be taken this second time; the
+abort handler can check the cause of the failure, and retry.
+
+Checkpointed registers include all GPRs, FPRs, VRs/VSRs, LR, CCR/CR, CTR, FPCSR
+and a few other status/flag regs; see the ISA for details.
+
+Causes of transaction aborts
+============================
+
+- Conflicts with cache lines used by other processors
+- Signals
+- Context switches
+- See the ISA for full documentation of everything that will abort transactions.
+
+
+Syscalls
+========
+
+Syscalls made from within an active transaction will not be performed and the
+transaction will be doomed by the kernel with the failure code TM_CAUSE_SYSCALL
+| TM_CAUSE_PERSISTENT.
+
+Syscalls made from within a suspended transaction are performed as normal and
+the transaction is not explicitly doomed by the kernel.  However, what the
+kernel does to perform the syscall may result in the transaction being doomed
+by the hardware.  The syscall is performed in suspended mode so any side
+effects will be persistent, independent of transaction success or failure.  No
+guarantees are provided by the kernel about which syscalls will affect
+transaction success.
+
+Care must be taken when relying on syscalls to abort during active transactions
+if the calls are made via a library.  Libraries may cache values (which may
+give the appearance of success) or perform operations that cause transaction
+failure before entering the kernel (which may produce different failure codes).
+Examples are glibc's getpid() and lazy symbol resolution.
+
+
+Signals
+=======
+
+Delivery of signals (both sync and async) during transactions provides a second
+thread state (ucontext/mcontext) to represent the second transactional register
+state.  Signal delivery 'treclaim's to capture both register states, so signals
+abort transactions.  The usual ucontext_t passed to the signal handler
+represents the checkpointed/original register state; the signal appears to have
+arisen at 'tbegin+4'.
+
+If the sighandler ucontext has uc_link set, a second ucontext has been
+delivered.  For future compatibility the MSR.TS field should be checked to
+determine the transactional state -- if so, the second ucontext in uc->uc_link
+represents the active transactional registers at the point of the signal.
+
+For 64-bit processes, uc->uc_mcontext.regs->msr is a full 64-bit MSR and its TS
+field shows the transactional mode.
+
+For 32-bit processes, the mcontext's MSR register is only 32 bits; the top 32
+bits are stored in the MSR of the second ucontext, i.e. in
+uc->uc_link->uc_mcontext.regs->msr.  The top word contains the transactional
+state TS.
+
+However, basic signal handlers don't need to be aware of transactions
+and simply returning from the handler will deal with things correctly:
+
+Transaction-aware signal handlers can read the transactional register state
+from the second ucontext.  This will be necessary for crash handlers to
+determine, for example, the address of the instruction causing the SIGSEGV.
+
+Example signal handler::
+
+    void crash_handler(int sig, siginfo_t *si, void *uc)
+    {
+      ucontext_t *ucp = uc;
+      ucontext_t *transactional_ucp = ucp->uc_link;
+
+      if (ucp_link) {
+        u64 msr = ucp->uc_mcontext.regs->msr;
+        /* May have transactional ucontext! */
+  #ifndef __powerpc64__
+        msr |= ((u64)transactional_ucp->uc_mcontext.regs->msr) << 32;
+  #endif
+        if (MSR_TM_ACTIVE(msr)) {
+           /* Yes, we crashed during a transaction.  Oops. */
+   fprintf(stderr, "Transaction to be restarted at 0x%llx, but "
+                           "crashy instruction was at 0x%llx\n",
+                           ucp->uc_mcontext.regs->nip,
+                           transactional_ucp->uc_mcontext.regs->nip);
+        }
+      }
+
+      fix_the_problem(ucp->dar);
+    }
+
+When in an active transaction that takes a signal, we need to be careful with
+the stack.  It's possible that the stack has moved back up after the tbegin.
+The obvious case here is when the tbegin is called inside a function that
+returns before a tend.  In this case, the stack is part of the checkpointed
+transactional memory state.  If we write over this non transactionally or in
+suspend, we are in trouble because if we get a tm abort, the program counter and
+stack pointer will be back at the tbegin but our in memory stack won't be valid
+anymore.
+
+To avoid this, when taking a signal in an active transaction, we need to use
+the stack pointer from the checkpointed state, rather than the speculated
+state.  This ensures that the signal context (written tm suspended) will be
+written below the stack required for the rollback.  The transaction is aborted
+because of the treclaim, so any memory written between the tbegin and the
+signal will be rolled back anyway.
+
+For signals taken in non-TM or suspended mode, we use the
+normal/non-checkpointed stack pointer.
+
+Any transaction initiated inside a sighandler and suspended on return
+from the sighandler to the kernel will get reclaimed and discarded.
+
+Failure cause codes used by kernel
+==================================
+
+These are defined in <asm/reg.h>, and distinguish different reasons why the
+kernel aborted a transaction:
+
+ ====================== ================================
+ TM_CAUSE_RESCHED       Thread was rescheduled.
+ TM_CAUSE_TLBI          Software TLB invalid.
+ TM_CAUSE_FAC_UNAV      FP/VEC/VSX unavailable trap.
+ TM_CAUSE_SYSCALL       Syscall from active transaction.
+ TM_CAUSE_SIGNAL        Signal delivered.
+ TM_CAUSE_MISC          Currently unused.
+ TM_CAUSE_ALIGNMENT     Alignment fault.
+ TM_CAUSE_EMULATE       Emulation that touched memory.
+ ====================== ================================
+
+These can be checked by the user program's abort handler as TEXASR[0:7].  If
+bit 7 is set, it indicates that the error is consider persistent.  For example
+a TM_CAUSE_ALIGNMENT will be persistent while a TM_CAUSE_RESCHED will not.
+
+GDB
+===
+
+GDB and ptrace are not currently TM-aware.  If one stops during a transaction,
+it looks like the transaction has just started (the checkpointed state is
+presented).  The transaction cannot then be continued and will take the failure
+handler route.  Furthermore, the transactional 2nd register state will be
+inaccessible.  GDB can currently be used on programs using TM, but not sensibly
+in parts within transactions.
+
+POWER9
+======
+
+TM on POWER9 has issues with storing the complete register state. This
+is described in this commit::
+
+    commit 4bb3c7a0208fc13ca70598efd109901a7cd45ae7
+    Author: Paul Mackerras <paulus@ozlabs.org>
+    Date:   Wed Mar 21 21:32:01 2018 +1100
+    KVM: PPC: Book3S HV: Work around transactional memory bugs in POWER9
+
+To account for this different POWER9 chips have TM enabled in
+different ways.
+
+On POWER9N DD2.01 and below, TM is disabled. ie
+HWCAP2[PPC_FEATURE2_HTM] is not set.
+
+On POWER9N DD2.1 TM is configured by firmware to always abort a
+transaction when tm suspend occurs. So tsuspend will cause a
+transaction to be aborted and rolled back. Kernel exceptions will also
+cause the transaction to be aborted and rolled back and the exception
+will not occur. If userspace constructs a sigcontext that enables TM
+suspend, the sigcontext will be rejected by the kernel. This mode is
+advertised to users with HWCAP2[PPC_FEATURE2_HTM_NO_SUSPEND] set.
+HWCAP2[PPC_FEATURE2_HTM] is not set in this mode.
+
+On POWER9N DD2.2 and above, KVM and POWERVM emulate TM for guests (as
+described in commit 4bb3c7a0208f), hence TM is enabled for guests
+ie. HWCAP2[PPC_FEATURE2_HTM] is set for guest userspace. Guests that
+makes heavy use of TM suspend (tsuspend or kernel suspend) will result
+in traps into the hypervisor and hence will suffer a performance
+degradation. Host userspace has TM disabled
+ie. HWCAP2[PPC_FEATURE2_HTM] is not set. (although we make enable it
+at some point in the future if we bring the emulation into host
+userspace context switching).
+
+POWER9C DD1.2 and above are only available with POWERVM and hence
+Linux only runs as a guest. On these systems TM is emulated like on
+POWER9N DD2.2.
+
+Guest migration from POWER8 to POWER9 will work with POWER9N DD2.2 and
+POWER9C DD1.2. Since earlier POWER9 processors don't support TM
+emulation, migration from POWER8 to POWER9 is not supported there.
diff --git a/Documentation/powerpc/transactional_memory.txt b/Documentation/powerpc/transactional_memory.txt
deleted file mode 100644
index 52c023e14f26..000000000000
--- a/Documentation/powerpc/transactional_memory.txt
+++ /dev/null
@@ -1,244 +0,0 @@
-Transactional Memory support
-============================
-
-POWER kernel support for this feature is currently limited to supporting
-its use by user programs.  It is not currently used by the kernel itself.
-
-This file aims to sum up how it is supported by Linux and what behaviour you
-can expect from your user programs.
-
-
-Basic overview
-==============
-
-Hardware Transactional Memory is supported on POWER8 processors, and is a
-feature that enables a different form of atomic memory access.  Several new
-instructions are presented to delimit transactions; transactions are
-guaranteed to either complete atomically or roll back and undo any partial
-changes.
-
-A simple transaction looks like this:
-
-begin_move_money:
-  tbegin
-  beq   abort_handler
-
-  ld    r4, SAVINGS_ACCT(r3)
-  ld    r5, CURRENT_ACCT(r3)
-  subi  r5, r5, 1
-  addi  r4, r4, 1
-  std   r4, SAVINGS_ACCT(r3)
-  std   r5, CURRENT_ACCT(r3)
-
-  tend
-
-  b     continue
-
-abort_handler:
-  ... test for odd failures ...
-
-  /* Retry the transaction if it failed because it conflicted with
-   * someone else: */
-  b     begin_move_money
-
-
-The 'tbegin' instruction denotes the start point, and 'tend' the end point.
-Between these points the processor is in 'Transactional' state; any memory
-references will complete in one go if there are no conflicts with other
-transactional or non-transactional accesses within the system.  In this
-example, the transaction completes as though it were normal straight-line code
-IF no other processor has touched SAVINGS_ACCT(r3) or CURRENT_ACCT(r3); an
-atomic move of money from the current account to the savings account has been
-performed.  Even though the normal ld/std instructions are used (note no
-lwarx/stwcx), either *both* SAVINGS_ACCT(r3) and CURRENT_ACCT(r3) will be
-updated, or neither will be updated.
-
-If, in the meantime, there is a conflict with the locations accessed by the
-transaction, the transaction will be aborted by the CPU.  Register and memory
-state will roll back to that at the 'tbegin', and control will continue from
-'tbegin+4'.  The branch to abort_handler will be taken this second time; the
-abort handler can check the cause of the failure, and retry.
-
-Checkpointed registers include all GPRs, FPRs, VRs/VSRs, LR, CCR/CR, CTR, FPCSR
-and a few other status/flag regs; see the ISA for details.
-
-Causes of transaction aborts
-============================
-
-- Conflicts with cache lines used by other processors
-- Signals
-- Context switches
-- See the ISA for full documentation of everything that will abort transactions.
-
-
-Syscalls
-========
-
-Syscalls made from within an active transaction will not be performed and the
-transaction will be doomed by the kernel with the failure code TM_CAUSE_SYSCALL
-| TM_CAUSE_PERSISTENT.
-
-Syscalls made from within a suspended transaction are performed as normal and
-the transaction is not explicitly doomed by the kernel.  However, what the
-kernel does to perform the syscall may result in the transaction being doomed
-by the hardware.  The syscall is performed in suspended mode so any side
-effects will be persistent, independent of transaction success or failure.  No
-guarantees are provided by the kernel about which syscalls will affect
-transaction success.
-
-Care must be taken when relying on syscalls to abort during active transactions
-if the calls are made via a library.  Libraries may cache values (which may
-give the appearance of success) or perform operations that cause transaction
-failure before entering the kernel (which may produce different failure codes).
-Examples are glibc's getpid() and lazy symbol resolution.
-
-
-Signals
-=======
-
-Delivery of signals (both sync and async) during transactions provides a second
-thread state (ucontext/mcontext) to represent the second transactional register
-state.  Signal delivery 'treclaim's to capture both register states, so signals
-abort transactions.  The usual ucontext_t passed to the signal handler
-represents the checkpointed/original register state; the signal appears to have
-arisen at 'tbegin+4'.
-
-If the sighandler ucontext has uc_link set, a second ucontext has been
-delivered.  For future compatibility the MSR.TS field should be checked to
-determine the transactional state -- if so, the second ucontext in uc->uc_link
-represents the active transactional registers at the point of the signal.
-
-For 64-bit processes, uc->uc_mcontext.regs->msr is a full 64-bit MSR and its TS
-field shows the transactional mode.
-
-For 32-bit processes, the mcontext's MSR register is only 32 bits; the top 32
-bits are stored in the MSR of the second ucontext, i.e. in
-uc->uc_link->uc_mcontext.regs->msr.  The top word contains the transactional
-state TS.
-
-However, basic signal handlers don't need to be aware of transactions
-and simply returning from the handler will deal with things correctly:
-
-Transaction-aware signal handlers can read the transactional register state
-from the second ucontext.  This will be necessary for crash handlers to
-determine, for example, the address of the instruction causing the SIGSEGV.
-
-Example signal handler:
-
-    void crash_handler(int sig, siginfo_t *si, void *uc)
-    {
-      ucontext_t *ucp = uc;
-      ucontext_t *transactional_ucp = ucp->uc_link;
-
-      if (ucp_link) {
-        u64 msr = ucp->uc_mcontext.regs->msr;
-        /* May have transactional ucontext! */
-#ifndef __powerpc64__
-        msr |= ((u64)transactional_ucp->uc_mcontext.regs->msr) << 32;
-#endif
-        if (MSR_TM_ACTIVE(msr)) {
-           /* Yes, we crashed during a transaction.  Oops. */
-   fprintf(stderr, "Transaction to be restarted at 0x%llx, but "
-                           "crashy instruction was at 0x%llx\n",
-                           ucp->uc_mcontext.regs->nip,
-                           transactional_ucp->uc_mcontext.regs->nip);
-        }
-      }
-
-      fix_the_problem(ucp->dar);
-    }
-
-When in an active transaction that takes a signal, we need to be careful with
-the stack.  It's possible that the stack has moved back up after the tbegin.
-The obvious case here is when the tbegin is called inside a function that
-returns before a tend.  In this case, the stack is part of the checkpointed
-transactional memory state.  If we write over this non transactionally or in
-suspend, we are in trouble because if we get a tm abort, the program counter and
-stack pointer will be back at the tbegin but our in memory stack won't be valid
-anymore.
-
-To avoid this, when taking a signal in an active transaction, we need to use
-the stack pointer from the checkpointed state, rather than the speculated
-state.  This ensures that the signal context (written tm suspended) will be
-written below the stack required for the rollback.  The transaction is aborted
-because of the treclaim, so any memory written between the tbegin and the
-signal will be rolled back anyway.
-
-For signals taken in non-TM or suspended mode, we use the
-normal/non-checkpointed stack pointer.
-
-Any transaction initiated inside a sighandler and suspended on return
-from the sighandler to the kernel will get reclaimed and discarded.
-
-Failure cause codes used by kernel
-==================================
-
-These are defined in <asm/reg.h>, and distinguish different reasons why the
-kernel aborted a transaction:
-
- TM_CAUSE_RESCHED       Thread was rescheduled.
- TM_CAUSE_TLBI          Software TLB invalid.
- TM_CAUSE_FAC_UNAV      FP/VEC/VSX unavailable trap.
- TM_CAUSE_SYSCALL       Syscall from active transaction.
- TM_CAUSE_SIGNAL        Signal delivered.
- TM_CAUSE_MISC          Currently unused.
- TM_CAUSE_ALIGNMENT     Alignment fault.
- TM_CAUSE_EMULATE       Emulation that touched memory.
-
-These can be checked by the user program's abort handler as TEXASR[0:7].  If
-bit 7 is set, it indicates that the error is consider persistent.  For example
-a TM_CAUSE_ALIGNMENT will be persistent while a TM_CAUSE_RESCHED will not.
-
-GDB
-===
-
-GDB and ptrace are not currently TM-aware.  If one stops during a transaction,
-it looks like the transaction has just started (the checkpointed state is
-presented).  The transaction cannot then be continued and will take the failure
-handler route.  Furthermore, the transactional 2nd register state will be
-inaccessible.  GDB can currently be used on programs using TM, but not sensibly
-in parts within transactions.
-
-POWER9
-======
-
-TM on POWER9 has issues with storing the complete register state. This
-is described in this commit:
-
-    commit 4bb3c7a0208fc13ca70598efd109901a7cd45ae7
-    Author: Paul Mackerras <paulus@ozlabs.org>
-    Date:   Wed Mar 21 21:32:01 2018 +1100
-    KVM: PPC: Book3S HV: Work around transactional memory bugs in POWER9
-
-To account for this different POWER9 chips have TM enabled in
-different ways.
-
-On POWER9N DD2.01 and below, TM is disabled. ie
-HWCAP2[PPC_FEATURE2_HTM] is not set.
-
-On POWER9N DD2.1 TM is configured by firmware to always abort a
-transaction when tm suspend occurs. So tsuspend will cause a
-transaction to be aborted and rolled back. Kernel exceptions will also
-cause the transaction to be aborted and rolled back and the exception
-will not occur. If userspace constructs a sigcontext that enables TM
-suspend, the sigcontext will be rejected by the kernel. This mode is
-advertised to users with HWCAP2[PPC_FEATURE2_HTM_NO_SUSPEND] set.
-HWCAP2[PPC_FEATURE2_HTM] is not set in this mode.
-
-On POWER9N DD2.2 and above, KVM and POWERVM emulate TM for guests (as
-described in commit 4bb3c7a0208f), hence TM is enabled for guests
-ie. HWCAP2[PPC_FEATURE2_HTM] is set for guest userspace. Guests that
-makes heavy use of TM suspend (tsuspend or kernel suspend) will result
-in traps into the hypervisor and hence will suffer a performance
-degradation. Host userspace has TM disabled
-ie. HWCAP2[PPC_FEATURE2_HTM] is not set. (although we make enable it
-at some point in the future if we bring the emulation into host
-userspace context switching).
-
-POWER9C DD1.2 and above are only available with POWERVM and hence
-Linux only runs as a guest. On these systems TM is emulated like on
-POWER9N DD2.2.
-
-Guest migration from POWER8 to POWER9 will work with POWER9N DD2.2 and
-POWER9C DD1.2. Since earlier POWER9 processors don't support TM
-emulation, migration from POWER8 to POWER9 is not supported there.
diff --git a/Documentation/powerpc/vcpudispatch_stats.txt b/Documentation/powerpc/vcpudispatch_stats.txt
new file mode 100644
index 000000000000..e21476bfd78c
--- /dev/null
+++ b/Documentation/powerpc/vcpudispatch_stats.txt
@@ -0,0 +1,68 @@
+VCPU Dispatch Statistics:
+=========================
+
+For Shared Processor LPARs, the POWER Hypervisor maintains a relatively
+static mapping of the LPAR processors (vcpus) to physical processor
+chips (representing the "home" node) and tries to always dispatch vcpus
+on their associated physical processor chip. However, under certain
+scenarios, vcpus may be dispatched on a different processor chip (away
+from its home node).
+
+/proc/powerpc/vcpudispatch_stats can be used to obtain statistics
+related to the vcpu dispatch behavior. Writing '1' to this file enables
+collecting the statistics, while writing '0' disables the statistics.
+By default, the DTLB log for each vcpu is processed 50 times a second so
+as not to miss any entries. This processing frequency can be changed
+through /proc/powerpc/vcpudispatch_stats_freq.
+
+The statistics themselves are available by reading the procfs file
+/proc/powerpc/vcpudispatch_stats. Each line in the output corresponds to
+a vcpu as represented by the first field, followed by 8 numbers.
+
+The first number corresponds to:
+1. total vcpu dispatches since the beginning of statistics collection
+
+The next 4 numbers represent vcpu dispatch dispersions:
+2. number of times this vcpu was dispatched on the same processor as last
+   time
+3. number of times this vcpu was dispatched on a different processor core
+   as last time, but within the same chip
+4. number of times this vcpu was dispatched on a different chip
+5. number of times this vcpu was dispatches on a different socket/drawer
+(next numa boundary)
+
+The final 3 numbers represent statistics in relation to the home node of
+the vcpu:
+6. number of times this vcpu was dispatched in its home node (chip)
+7. number of times this vcpu was dispatched in a different node
+8. number of times this vcpu was dispatched in a node further away (numa
+distance)
+
+An example output:
+    $ sudo cat /proc/powerpc/vcpudispatch_stats
+    cpu0 6839 4126 2683 30 0 6821 18 0
+    cpu1 2515 1274 1229 12 0 2509 6 0
+    cpu2 2317 1198 1109 10 0 2312 5 0
+    cpu3 2259 1165 1088 6 0 2256 3 0
+    cpu4 2205 1143 1056 6 0 2202 3 0
+    cpu5 2165 1121 1038 6 0 2162 3 0
+    cpu6 2183 1127 1050 6 0 2180 3 0
+    cpu7 2193 1133 1052 8 0 2187 6 0
+    cpu8 2165 1115 1032 18 0 2156 9 0
+    cpu9 2301 1252 1033 16 0 2293 8 0
+    cpu10 2197 1138 1041 18 0 2187 10 0
+    cpu11 2273 1185 1062 26 0 2260 13 0
+    cpu12 2186 1125 1043 18 0 2177 9 0
+    cpu13 2161 1115 1030 16 0 2153 8 0
+    cpu14 2206 1153 1033 20 0 2196 10 0
+    cpu15 2163 1115 1032 16 0 2155 8 0
+
+In the output above, for vcpu0, there have been 6839 dispatches since
+statistics were enabled. 4126 of those dispatches were on the same
+physical cpu as the last time. 2683 were on a different core, but within
+the same chip, while 30 dispatches were on a different chip compared to
+its last dispatch.
+
+Also, out of the total of 6839 dispatches, we see that there have been
+6821 dispatches on the vcpu's home node, while 18 dispatches were
+outside its home node, on a neighbouring chip.
diff --git a/Documentation/pps/pps.txt b/Documentation/pps/pps.txt
deleted file mode 100644
index 99f5d8c4c652..000000000000
--- a/Documentation/pps/pps.txt
+++ /dev/null
@@ -1,239 +0,0 @@
-
-			PPS - Pulse Per Second
-			----------------------
-
-(C) Copyright 2007 Rodolfo Giometti <giometti@enneenne.com>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-
-
-Overview
---------
-
-LinuxPPS provides a programming interface (API) to define in the
-system several PPS sources.
-
-PPS means "pulse per second" and a PPS source is just a device which
-provides a high precision signal each second so that an application
-can use it to adjust system clock time.
-
-A PPS source can be connected to a serial port (usually to the Data
-Carrier Detect pin) or to a parallel port (ACK-pin) or to a special
-CPU's GPIOs (this is the common case in embedded systems) but in each
-case when a new pulse arrives the system must apply to it a timestamp
-and record it for userland.
-
-Common use is the combination of the NTPD as userland program, with a
-GPS receiver as PPS source, to obtain a wallclock-time with
-sub-millisecond synchronisation to UTC.
-
-
-RFC considerations
-------------------
-
-While implementing a PPS API as RFC 2783 defines and using an embedded
-CPU GPIO-Pin as physical link to the signal, I encountered a deeper
-problem:
-
-   At startup it needs a file descriptor as argument for the function
-   time_pps_create().
-
-This implies that the source has a /dev/... entry. This assumption is
-OK for the serial and parallel port, where you can do something
-useful besides(!) the gathering of timestamps as it is the central
-task for a PPS API. But this assumption does not work for a single
-purpose GPIO line. In this case even basic file-related functionality
-(like read() and write()) makes no sense at all and should not be a
-precondition for the use of a PPS API.
-
-The problem can be simply solved if you consider that a PPS source is
-not always connected with a GPS data source.
-
-So your programs should check if the GPS data source (the serial port
-for instance) is a PPS source too, and if not they should provide the
-possibility to open another device as PPS source.
-
-In LinuxPPS the PPS sources are simply char devices usually mapped
-into files /dev/pps0, /dev/pps1, etc.
-
-
-PPS with USB to serial devices
-------------------------------
-
-It is possible to grab the PPS from an USB to serial device. However,
-you should take into account the latencies and jitter introduced by
-the USB stack. Users have reported clock instability around +-1ms when
-synchronized with PPS through USB. With USB 2.0, jitter may decrease
-down to the order of 125 microseconds.
-
-This may be suitable for time server synchronization with NTP because
-of its undersampling and algorithms.
-
-If your device doesn't report PPS, you can check that the feature is
-supported by its driver. Most of the time, you only need to add a call
-to usb_serial_handle_dcd_change after checking the DCD status (see
-ch341 and pl2303 examples).
-
-
-Coding example
---------------
-
-To register a PPS source into the kernel you should define a struct
-pps_source_info as follows:
-
-    static struct pps_source_info pps_ktimer_info = {
-	    .name         = "ktimer",
-	    .path         = "",
-	    .mode         = PPS_CAPTUREASSERT | PPS_OFFSETASSERT |
-			    PPS_ECHOASSERT |
-			    PPS_CANWAIT | PPS_TSFMT_TSPEC,
-	    .echo         = pps_ktimer_echo,
-	    .owner        = THIS_MODULE,
-    };
-
-and then calling the function pps_register_source() in your
-initialization routine as follows:
-
-    source = pps_register_source(&pps_ktimer_info,
-			PPS_CAPTUREASSERT | PPS_OFFSETASSERT);
-
-The pps_register_source() prototype is:
-
-  int pps_register_source(struct pps_source_info *info, int default_params)
-
-where "info" is a pointer to a structure that describes a particular
-PPS source, "default_params" tells the system what the initial default
-parameters for the device should be (it is obvious that these parameters
-must be a subset of ones defined in the struct
-pps_source_info which describe the capabilities of the driver).
-
-Once you have registered a new PPS source into the system you can
-signal an assert event (for example in the interrupt handler routine)
-just using:
-
-    pps_event(source, &ts, PPS_CAPTUREASSERT, ptr)
-
-where "ts" is the event's timestamp.
-
-The same function may also run the defined echo function
-(pps_ktimer_echo(), passing to it the "ptr" pointer) if the user
-asked for that... etc..
-
-Please see the file drivers/pps/clients/pps-ktimer.c for example code.
-
-
-SYSFS support
--------------
-
-If the SYSFS filesystem is enabled in the kernel it provides a new class:
-
-   $ ls /sys/class/pps/
-   pps0/  pps1/  pps2/
-
-Every directory is the ID of a PPS sources defined in the system and
-inside you find several files:
-
-   $ ls -F /sys/class/pps/pps0/
-   assert     dev        mode       path       subsystem@
-   clear      echo       name       power/     uevent
-
-
-Inside each "assert" and "clear" file you can find the timestamp and a
-sequence number:
-
-   $ cat /sys/class/pps/pps0/assert
-   1170026870.983207967#8
-
-Where before the "#" is the timestamp in seconds; after it is the
-sequence number. Other files are:
-
- * echo: reports if the PPS source has an echo function or not;
-
- * mode: reports available PPS functioning modes;
-
- * name: reports the PPS source's name;
-
- * path: reports the PPS source's device path, that is the device the
-   PPS source is connected to (if it exists).
-
-
-Testing the PPS support
------------------------
-
-In order to test the PPS support even without specific hardware you can use
-the pps-ktimer driver (see the client subsection in the PPS configuration menu)
-and the userland tools available in your distribution's pps-tools package,
-http://linuxpps.org , or https://github.com/redlab-i/pps-tools.
-
-Once you have enabled the compilation of pps-ktimer just modprobe it (if
-not statically compiled):
-
-   # modprobe pps-ktimer
-
-and the run ppstest as follow:
-
-   $ ./ppstest /dev/pps1
-   trying PPS source "/dev/pps1"
-   found PPS source "/dev/pps1"
-   ok, found 1 source(s), now start fetching data...
-   source 0 - assert 1186592699.388832443, sequence: 364 - clear  0.000000000, sequence: 0
-   source 0 - assert 1186592700.388931295, sequence: 365 - clear  0.000000000, sequence: 0
-   source 0 - assert 1186592701.389032765, sequence: 366 - clear  0.000000000, sequence: 0
-
-Please note that to compile userland programs, you need the file timepps.h.
-This is available in the pps-tools repository mentioned above.
-
-
-Generators
-----------
-
-Sometimes one needs to be able not only to catch PPS signals but to produce
-them also. For example, running a distributed simulation, which requires
-computers' clock to be synchronized very tightly. One way to do this is to
-invent some complicated hardware solutions but it may be neither necessary
-nor affordable. The cheap way is to load a PPS generator on one of the
-computers (master) and PPS clients on others (slaves), and use very simple
-cables to deliver signals using parallel ports, for example.
-
-Parallel port cable pinout:
-pin	name	master      slave
-1	STROBE	  *------     *
-2	D0	  *     |     *
-3	D1	  *     |     *
-4	D2	  *     |     *
-5	D3	  *     |     *
-6	D4	  *     |     *
-7	D5	  *     |     *
-8	D6	  *     |     *
-9	D7	  *     |     *
-10	ACK	  *     ------*
-11	BUSY	  *           *
-12	PE	  *           *
-13	SEL	  *           *
-14	AUTOFD	  *           *
-15	ERROR	  *           *
-16	INIT	  *           *
-17	SELIN	  *           *
-18-25	GND	  *-----------*
-
-Please note that parallel port interrupt occurs only on high->low transition,
-so it is used for PPS assert edge. PPS clear edge can be determined only
-using polling in the interrupt handler which actually can be done way more
-precisely because interrupt handling delays can be quite big and random. So
-current parport PPS generator implementation (pps_gen_parport module) is
-geared towards using the clear edge for time synchronization.
-
-Clear edge polling is done with disabled interrupts so it's better to select
-delay between assert and clear edge as small as possible to reduce system
-latencies. But if it is too small slave won't be able to capture clear edge
-transition. The default of 30us should be good enough in most situations.
-The delay can be selected using 'delay' pps_gen_parport module parameter.
diff --git a/Documentation/process/4.Coding.rst b/Documentation/process/4.Coding.rst
index 4b7a5ab3cec1..13dd893c9f88 100644
--- a/Documentation/process/4.Coding.rst
+++ b/Documentation/process/4.Coding.rst
@@ -298,7 +298,7 @@ enabled, a configurable percentage of memory allocations will be made to
 fail; these failures can be restricted to a specific range of code.
 Running with fault injection enabled allows the programmer to see how the
 code responds when things go badly.  See
-Documentation/fault-injection/fault-injection.txt for more information on
+Documentation/fault-injection/fault-injection.rst for more information on
 how to use this facility.
 
 Other kinds of errors can be found with the "sparse" static analysis tool.
diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index 18735dc460a0..2284f2221f02 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -23,15 +23,15 @@ running, the suggested command should tell you.
 
 Again, keep in mind that this list assumes you are already functionally
 running a Linux kernel.  Also, not all tools are necessary on all
-systems; obviously, if you don't have any ISDN hardware, for example,
-you probably needn't concern yourself with isdn4k-utils.
+systems; obviously, if you don't have any PC Card hardware, for example,
+you probably needn't concern yourself with pcmciautils.
 
 ====================== ===============  ========================================
         Program        Minimal version       Command to check the version
 ====================== ===============  ========================================
 GNU C                  4.6              gcc --version
 GNU make               3.81             make --version
-binutils               2.20             ld -v
+binutils               2.21             ld -v
 flex                   2.5.35           flex --version
 bison                  2.0              bison --version
 util-linux             2.10o            fdformat --version
@@ -45,7 +45,6 @@ btrfs-progs            0.18             btrfsck
 pcmciautils            004              pccardctl -V
 quota-tools            3.09             quota -V
 PPP                    2.4.0            pppd --version
-isdn4k-utils           3.1pre1          isdnctrl 2>&1|grep version
 nfs-utils              1.0.5            showmount --version
 procps                 3.2.0            ps --version
 oprofile               0.9              oprofiled --version
@@ -77,9 +76,7 @@ You will need GNU make 3.81 or later to build the kernel.
 Binutils
 --------
 
-The build system has, as of 4.13, switched to using thin archives (`ar T`)
-rather than incremental linking (`ld -r`) for built-in.a intermediate steps.
-This requires binutils 2.20 or newer.
+Binutils 2.21 or newer is needed to build the kernel.
 
 pkg-config
 ----------
@@ -279,12 +276,6 @@ which can be made by::
 
 as root.
 
-Isdn4k-utils
-------------
-
-Due to changes in the length of the phone number field, isdn4k-utils
-needs to be recompiled or (preferably) upgraded.
-
 NFS-utils
 ---------
 
@@ -448,11 +439,6 @@ PPP
 
 - <ftp://ftp.samba.org/pub/ppp/>
 
-Isdn4k-utils
-------------
-
-- <ftp://ftp.isdn4linux.de/pub/isdn4linux/utils/>
-
 NFS-utils
 ---------
 
diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst
index fa864a51e6ea..f4a2198187f9 100644
--- a/Documentation/process/coding-style.rst
+++ b/Documentation/process/coding-style.rst
@@ -686,7 +686,7 @@ filesystems) should advertise this prominently in their prompt string::
 	...
 
 For full documentation on the configuration files, see the file
-Documentation/kbuild/kconfig-language.txt.
+Documentation/kbuild/kconfig-language.rst.
 
 
 11) Data structures
diff --git a/Documentation/process/conf.py b/Documentation/process/conf.py
deleted file mode 100644
index 1b01a80ad9ce..000000000000
--- a/Documentation/process/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = 'Linux Kernel Development Documentation'
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'process.tex', 'Linux Kernel Development Documentation',
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/process/maintainer-pgp-guide.rst b/Documentation/process/maintainer-pgp-guide.rst
index 4bab7464ff8c..17db11b7ed48 100644
--- a/Documentation/process/maintainer-pgp-guide.rst
+++ b/Documentation/process/maintainer-pgp-guide.rst
@@ -238,7 +238,10 @@ your new subkey::
     work.
 
     If for some reason you prefer to stay with RSA subkeys, just replace
-    "ed25519" with "rsa2048" in the above command.
+    "ed25519" with "rsa2048" in the above command. Additionally, if you
+    plan to use a hardware device that does not support ED25519 ECC
+    keys, like Nitrokey Pro or a Yubikey, then you should use
+    "nistp256" instead or "ed25519."
 
 
 Back up your master key for disaster recovery
@@ -432,23 +435,23 @@ Available smartcard devices
 
 Unless all your laptops and workstations have smartcard readers, the
 easiest is to get a specialized USB device that implements smartcard
-functionality.  There are several options available:
+functionality. There are several options available:
 
 - `Nitrokey Start`_: Open hardware and Free Software, based on FSI
-  Japan's `Gnuk`_. Offers support for ECC keys, but fewest security
-  features (such as resistance to tampering or some side-channel
-  attacks).
-- `Nitrokey Pro`_: Similar to the Nitrokey Start, but more
-  tamper-resistant and offers more security features, but no ECC
-  support.
-- `Yubikey 4`_: proprietary hardware and software, but cheaper than
+  Japan's `Gnuk`_. One of the few available commercial devices that
+  support ED25519 ECC keys, but offer fewest security features (such as
+  resistance to tampering or some side-channel attacks).
+- `Nitrokey Pro 2`_: Similar to the Nitrokey Start, but more
+  tamper-resistant and offers more security features. Pro 2 supports ECC
+  cryptography (NISTP).
+- `Yubikey 5`_: proprietary hardware and software, but cheaper than
   Nitrokey Pro and comes available in the USB-C form that is more useful
   with newer laptops. Offers additional security features such as FIDO
-  U2F, but no ECC.
+  U2F, among others, and now finally supports ECC keys (NISTP).
 
 `LWN has a good review`_ of some of the above models, as well as several
-others. If you want to use ECC keys, your best bet among commercially
-available devices is the Nitrokey Start.
+others. Your choice will depend on cost, shipping availability in your
+geographical region, and open/proprietary hardware considerations.
 
 .. note::
 
@@ -457,8 +460,8 @@ available devices is the Nitrokey Start.
     Foundation.
 
 .. _`Nitrokey Start`: https://shop.nitrokey.com/shop/product/nitrokey-start-6
-.. _`Nitrokey Pro`: https://shop.nitrokey.com/shop/product/nitrokey-pro-3
-.. _`Yubikey 4`: https://www.yubico.com/product/yubikey-4-series/
+.. _`Nitrokey Pro 2`: https://shop.nitrokey.com/shop/product/nitrokey-pro-2-3
+.. _`Yubikey 5`: https://www.yubico.com/products/yubikey-5-overview/
 .. _Gnuk: http://www.fsij.org/doc-gnuk/
 .. _`LWN has a good review`: https://lwn.net/Articles/736231/
 .. _`qualify for a free Nitrokey Start`: https://www.kernel.org/nitrokey-digital-tokens-for-kernel-developers.html
diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst
index c88867b173d9..8e56337d422d 100644
--- a/Documentation/process/submit-checklist.rst
+++ b/Documentation/process/submit-checklist.rst
@@ -39,7 +39,7 @@ and elsewhere regarding submitting Linux kernel patches.
 
 6) Any new or modified ``CONFIG`` options do not muck up the config menu and
    default to off unless they meet the exception criteria documented in
-   ``Documentation/kbuild/kconfig-language.txt`` Menu attributes: default value.
+   ``Documentation/kbuild/kconfig-language.rst`` Menu attributes: default value.
 
 7) All new ``Kconfig`` options have help text.
 
@@ -107,7 +107,7 @@ and elsewhere regarding submitting Linux kernel patches.
     and why.
 
 26) If any ioctl's are added by the patch, then also update
-    ``Documentation/ioctl/ioctl-number.txt``.
+    ``Documentation/ioctl/ioctl-number.rst``.
 
 27) If your modified source code depends on or uses any of the kernel
     APIs or features that are related to the following ``Kconfig`` symbols,
diff --git a/Documentation/process/submitting-drivers.rst b/Documentation/process/submitting-drivers.rst
index 58bc047e7b95..1acaa14903d6 100644
--- a/Documentation/process/submitting-drivers.rst
+++ b/Documentation/process/submitting-drivers.rst
@@ -117,7 +117,7 @@ PM support:
 		implemented") error.  You should also try to make sure that your
 		driver uses as little power as possible when it's not doing
 		anything.  For the driver testing instructions see
-		Documentation/power/drivers-testing.txt and for a relatively
+		Documentation/power/drivers-testing.rst and for a relatively
 		complete overview of the power management issues related to
 		drivers see :ref:`Documentation/driver-api/pm/devices.rst <driverapi_pm_devices>`.
 
diff --git a/Documentation/pti/pti_intel_mid.txt b/Documentation/pti/pti_intel_mid.txt
deleted file mode 100644
index e7a5b6d1f7a9..000000000000
--- a/Documentation/pti/pti_intel_mid.txt
+++ /dev/null
@@ -1,99 +0,0 @@
-The Intel MID PTI project is HW implemented in Intel Atom
-system-on-a-chip designs based on the Parallel Trace
-Interface for MIPI P1149.7 cJTAG standard.  The kernel solution
-for this platform involves the following files:
-
-./include/linux/pti.h
-./drivers/.../n_tracesink.h
-./drivers/.../n_tracerouter.c
-./drivers/.../n_tracesink.c
-./drivers/.../pti.c
-
-pti.c is the driver that enables various debugging features
-popular on platforms from certain mobile manufacturers.
-n_tracerouter.c and n_tracesink.c allow extra system information to
-be collected and routed to the pti driver, such as trace
-debugging data from a modem.  Although n_tracerouter
-and n_tracesink are a part of the complete PTI solution,
-these two line disciplines can work separately from
-pti.c and route any data stream from one /dev/tty node
-to another /dev/tty node via kernel-space.  This provides
-a stable, reliable connection that will not break unless
-the user-space application shuts down (plus avoids
-kernel->user->kernel context switch overheads of routing
-data).
-
-An example debugging usage for this driver system:
-   *Hook /dev/ttyPTI0 to syslogd.  Opening this port will also start
-    a console device to further capture debugging messages to PTI.
-   *Hook /dev/ttyPTI1 to modem debugging data to write to PTI HW.
-    This is where n_tracerouter and n_tracesink are used.
-   *Hook /dev/pti to a user-level debugging application for writing
-    to PTI HW.
-   *Use mipi_* Kernel Driver API in other device drivers for
-    debugging to PTI by first requesting a PTI write address via
-    mipi_request_masterchannel(1).
-
-Below is example pseudo-code on how a 'privileged' application
-can hook up n_tracerouter and n_tracesink to any tty on
-a system.  'Privileged' means the application has enough
-privileges to successfully manipulate the ldisc drivers
-but is not just blindly executing as 'root'. Keep in mind
-the use of ioctl(,TIOCSETD,) is not specific to the n_tracerouter
-and n_tracesink line discpline drivers but is a generic
-operation for a program to use a line discpline driver
-on a tty port other than the default n_tty.
-
-/////////// To hook up n_tracerouter and n_tracesink /////////
-
-// Note that n_tracerouter depends on n_tracesink.
-#include <errno.h>
-#define ONE_TTY "/dev/ttyOne"
-#define TWO_TTY "/dev/ttyTwo"
-
-// needed global to hand onto ldisc connection
-static int g_fd_source = -1;
-static int g_fd_sink  = -1;
-
-// these two vars used to grab LDISC values from loaded ldisc drivers
-// in OS.  Look at /proc/tty/ldiscs to get the right numbers from
-// the ldiscs loaded in the system.
-int source_ldisc_num, sink_ldisc_num = -1;
-int retval;
-
-g_fd_source = open(ONE_TTY, O_RDWR); // must be R/W
-g_fd_sink   = open(TWO_TTY, O_RDWR); // must be R/W
-
-if (g_fd_source <= 0) || (g_fd_sink <= 0) {
-   // doubt you'll want to use these exact error lines of code
-   printf("Error on open(). errno: %d\n",errno);
-   return errno;
-}
-
-retval = ioctl(g_fd_sink, TIOCSETD, &sink_ldisc_num);
-if (retval < 0) {
-   printf("Error on ioctl().  errno: %d\n", errno);
-   return errno;
-}
-
-retval = ioctl(g_fd_source, TIOCSETD, &source_ldisc_num);
-if (retval < 0) {
-   printf("Error on ioctl().  errno: %d\n", errno);
-   return errno;
-}
-
-/////////// To disconnect n_tracerouter and n_tracesink ////////
-
-// First make sure data through the ldiscs has stopped.
-
-// Second, disconnect ldiscs.  This provides a
-// little cleaner shutdown on tty stack.
-sink_ldisc_num = 0;
-source_ldisc_num = 0;
-ioctl(g_fd_uart, TIOCSETD, &sink_ldisc_num);
-ioctl(g_fd_gadget, TIOCSETD, &source_ldisc_num);
-
-// Three, program closes connection, and cleanup:
-close(g_fd_uart);
-close(g_fd_gadget);
-g_fd_uart = g_fd_gadget = NULL;
diff --git a/Documentation/ptp/ptp.txt b/Documentation/ptp/ptp.txt
deleted file mode 100644
index 11e904ee073f..000000000000
--- a/Documentation/ptp/ptp.txt
+++ /dev/null
@@ -1,86 +0,0 @@
-
-* PTP hardware clock infrastructure for Linux
-
-  This patch set introduces support for IEEE 1588 PTP clocks in
-  Linux. Together with the SO_TIMESTAMPING socket options, this
-  presents a standardized method for developing PTP user space
-  programs, synchronizing Linux with external clocks, and using the
-  ancillary features of PTP hardware clocks.
-
-  A new class driver exports a kernel interface for specific clock
-  drivers and a user space interface. The infrastructure supports a
-  complete set of PTP hardware clock functionality.
-
-  + Basic clock operations
-    - Set time
-    - Get time
-    - Shift the clock by a given offset atomically
-    - Adjust clock frequency
-
-  + Ancillary clock features
-    - Time stamp external events
-    - Period output signals configurable from user space
-    - Synchronization of the Linux system time via the PPS subsystem
-
-** PTP hardware clock kernel API
-
-   A PTP clock driver registers itself with the class driver. The
-   class driver handles all of the dealings with user space. The
-   author of a clock driver need only implement the details of
-   programming the clock hardware. The clock driver notifies the class
-   driver of asynchronous events (alarms and external time stamps) via
-   a simple message passing interface.
-
-   The class driver supports multiple PTP clock drivers. In normal use
-   cases, only one PTP clock is needed. However, for testing and
-   development, it can be useful to have more than one clock in a
-   single system, in order to allow performance comparisons.
-
-** PTP hardware clock user space API
-
-   The class driver also creates a character device for each
-   registered clock. User space can use an open file descriptor from
-   the character device as a POSIX clock id and may call
-   clock_gettime, clock_settime, and clock_adjtime.  These calls
-   implement the basic clock operations.
-
-   User space programs may control the clock using standardized
-   ioctls. A program may query, enable, configure, and disable the
-   ancillary clock features. User space can receive time stamped
-   events via blocking read() and poll().
-
-** Writing clock drivers
-
-   Clock drivers include include/linux/ptp_clock_kernel.h and register
-   themselves by presenting a 'struct ptp_clock_info' to the
-   registration method. Clock drivers must implement all of the
-   functions in the interface. If a clock does not offer a particular
-   ancillary feature, then the driver should just return -EOPNOTSUPP
-   from those functions.
-
-   Drivers must ensure that all of the methods in interface are
-   reentrant. Since most hardware implementations treat the time value
-   as a 64 bit integer accessed as two 32 bit registers, drivers
-   should use spin_lock_irqsave/spin_unlock_irqrestore to protect
-   against concurrent access. This locking cannot be accomplished in
-   class driver, since the lock may also be needed by the clock
-   driver's interrupt service routine.
-
-** Supported hardware
-
-   + Freescale eTSEC gianfar
-     - 2 Time stamp external triggers, programmable polarity (opt. interrupt)
-     - 2 Alarm registers (optional interrupt)
-     - 3 Periodic signals (optional interrupt)
-
-   + National DP83640
-     - 6 GPIOs programmable as inputs or outputs
-     - 6 GPIOs with dedicated functions (LED/JTAG/clock) can also be
-       used as general inputs or outputs
-     - GPIO inputs can time stamp external triggers
-     - GPIO outputs can produce periodic signals
-     - 1 interrupt pin
-
-   + Intel IXP465
-     - Auxiliary Slave/Master Mode Snapshot (optional interrupt)
-     - Target Time (optional interrupt)
diff --git a/Documentation/pwm.txt b/Documentation/pwm.txt
deleted file mode 100644
index 8fbf0aa3ba2d..000000000000
--- a/Documentation/pwm.txt
+++ /dev/null
@@ -1,158 +0,0 @@
-======================================
-Pulse Width Modulation (PWM) interface
-======================================
-
-This provides an overview about the Linux PWM interface
-
-PWMs are commonly used for controlling LEDs, fans or vibrators in
-cell phones. PWMs with a fixed purpose have no need implementing
-the Linux PWM API (although they could). However, PWMs are often
-found as discrete devices on SoCs which have no fixed purpose. It's
-up to the board designer to connect them to LEDs or fans. To provide
-this kind of flexibility the generic PWM API exists.
-
-Identifying PWMs
-----------------
-
-Users of the legacy PWM API use unique IDs to refer to PWM devices.
-
-Instead of referring to a PWM device via its unique ID, board setup code
-should instead register a static mapping that can be used to match PWM
-consumers to providers, as given in the following example::
-
-	static struct pwm_lookup board_pwm_lookup[] = {
-		PWM_LOOKUP("tegra-pwm", 0, "pwm-backlight", NULL,
-			   50000, PWM_POLARITY_NORMAL),
-	};
-
-	static void __init board_init(void)
-	{
-		...
-		pwm_add_table(board_pwm_lookup, ARRAY_SIZE(board_pwm_lookup));
-		...
-	}
-
-Using PWMs
-----------
-
-Legacy users can request a PWM device using pwm_request() and free it
-after usage with pwm_free().
-
-New users should use the pwm_get() function and pass to it the consumer
-device or a consumer name. pwm_put() is used to free the PWM device. Managed
-variants of these functions, devm_pwm_get() and devm_pwm_put(), also exist.
-
-After being requested, a PWM has to be configured using::
-
-	int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state);
-
-This API controls both the PWM period/duty_cycle config and the
-enable/disable state.
-
-The pwm_config(), pwm_enable() and pwm_disable() functions are just wrappers
-around pwm_apply_state() and should not be used if the user wants to change
-several parameter at once. For example, if you see pwm_config() and
-pwm_{enable,disable}() calls in the same function, this probably means you
-should switch to pwm_apply_state().
-
-The PWM user API also allows one to query the PWM state with pwm_get_state().
-
-In addition to the PWM state, the PWM API also exposes PWM arguments, which
-are the reference PWM config one should use on this PWM.
-PWM arguments are usually platform-specific and allows the PWM user to only
-care about dutycycle relatively to the full period (like, duty = 50% of the
-period). struct pwm_args contains 2 fields (period and polarity) and should
-be used to set the initial PWM config (usually done in the probe function
-of the PWM user). PWM arguments are retrieved with pwm_get_args().
-
-Using PWMs with the sysfs interface
------------------------------------
-
-If CONFIG_SYSFS is enabled in your kernel configuration a simple sysfs
-interface is provided to use the PWMs from userspace. It is exposed at
-/sys/class/pwm/. Each probed PWM controller/chip will be exported as
-pwmchipN, where N is the base of the PWM chip. Inside the directory you
-will find:
-
-  npwm
-    The number of PWM channels this chip supports (read-only).
-
-  export
-    Exports a PWM channel for use with sysfs (write-only).
-
-  unexport
-   Unexports a PWM channel from sysfs (write-only).
-
-The PWM channels are numbered using a per-chip index from 0 to npwm-1.
-
-When a PWM channel is exported a pwmX directory will be created in the
-pwmchipN directory it is associated with, where X is the number of the
-channel that was exported. The following properties will then be available:
-
-  period
-    The total period of the PWM signal (read/write).
-    Value is in nanoseconds and is the sum of the active and inactive
-    time of the PWM.
-
-  duty_cycle
-    The active time of the PWM signal (read/write).
-    Value is in nanoseconds and must be less than the period.
-
-  polarity
-    Changes the polarity of the PWM signal (read/write).
-    Writes to this property only work if the PWM chip supports changing
-    the polarity. The polarity can only be changed if the PWM is not
-    enabled. Value is the string "normal" or "inversed".
-
-  enable
-    Enable/disable the PWM signal (read/write).
-
-	- 0 - disabled
-	- 1 - enabled
-
-Implementing a PWM driver
--------------------------
-
-Currently there are two ways to implement pwm drivers. Traditionally
-there only has been the barebone API meaning that each driver has
-to implement the pwm_*() functions itself. This means that it's impossible
-to have multiple PWM drivers in the system. For this reason it's mandatory
-for new drivers to use the generic PWM framework.
-
-A new PWM controller/chip can be added using pwmchip_add() and removed
-again with pwmchip_remove(). pwmchip_add() takes a filled in struct
-pwm_chip as argument which provides a description of the PWM chip, the
-number of PWM devices provided by the chip and the chip-specific
-implementation of the supported PWM operations to the framework.
-
-When implementing polarity support in a PWM driver, make sure to respect the
-signal conventions in the PWM framework. By definition, normal polarity
-characterizes a signal starts high for the duration of the duty cycle and
-goes low for the remainder of the period. Conversely, a signal with inversed
-polarity starts low for the duration of the duty cycle and goes high for the
-remainder of the period.
-
-Drivers are encouraged to implement ->apply() instead of the legacy
-->enable(), ->disable() and ->config() methods. Doing that should provide
-atomicity in the PWM config workflow, which is required when the PWM controls
-a critical device (like a regulator).
-
-The implementation of ->get_state() (a method used to retrieve initial PWM
-state) is also encouraged for the same reason: letting the PWM user know
-about the current PWM state would allow him to avoid glitches.
-
-Locking
--------
-
-The PWM core list manipulations are protected by a mutex, so pwm_request()
-and pwm_free() may not be called from an atomic context. Currently the
-PWM core does not enforce any locking to pwm_enable(), pwm_disable() and
-pwm_config(), so the calling context is currently driver specific. This
-is an issue derived from the former barebone API and should be fixed soon.
-
-Helpers
--------
-
-Currently a PWM can only be configured with period_ns and duty_ns. For several
-use cases freq_hz and duty_percent might be better. Instead of calculating
-this in your driver please consider adding appropriate helpers to the framework.
diff --git a/Documentation/rapidio/mport_cdev.txt b/Documentation/rapidio/mport_cdev.txt
deleted file mode 100644
index a53f786ee2e9..000000000000
--- a/Documentation/rapidio/mport_cdev.txt
+++ /dev/null
@@ -1,107 +0,0 @@
-RapidIO subsystem mport character device driver (rio_mport_cdev.c)
-==================================================================
-
-Version History:
-----------------
-  1.0.0 - Initial driver release.
-
-==================================================================
-
-I. Overview
-
-This device driver is the result of collaboration within the RapidIO.org
-Software Task Group (STG) between Texas Instruments, Freescale,
-Prodrive Technologies, Nokia Networks, BAE and IDT.  Additional input was
-received from other members of RapidIO.org. The objective was to create a
-character mode driver interface which exposes the capabilities of RapidIO
-devices directly to applications, in a manner that allows the numerous and
-varied RapidIO implementations to interoperate.
-
-This driver (MPORT_CDEV) provides access to basic RapidIO subsystem operations
-for user-space applications. Most of RapidIO operations are supported through
-'ioctl' system calls.
-
-When loaded this device driver creates filesystem nodes named rio_mportX in /dev
-directory for each registered RapidIO mport device. 'X' in the node name matches
-to unique port ID assigned to each local mport device.
-
-Using available set of ioctl commands user-space applications can perform
-following RapidIO bus and subsystem operations:
-
-- Reads and writes from/to configuration registers of mport devices
-    (RIO_MPORT_MAINT_READ_LOCAL/RIO_MPORT_MAINT_WRITE_LOCAL)
-- Reads and writes from/to configuration registers of remote RapidIO devices.
-  This operations are defined as RapidIO Maintenance reads/writes in RIO spec.
-    (RIO_MPORT_MAINT_READ_REMOTE/RIO_MPORT_MAINT_WRITE_REMOTE)
-- Set RapidIO Destination ID for mport devices (RIO_MPORT_MAINT_HDID_SET)
-- Set RapidIO Component Tag for mport devices (RIO_MPORT_MAINT_COMPTAG_SET)
-- Query logical index of mport devices (RIO_MPORT_MAINT_PORT_IDX_GET)
-- Query capabilities and RapidIO link configuration of mport devices
-    (RIO_MPORT_GET_PROPERTIES)
-- Enable/Disable reporting of RapidIO doorbell events to user-space applications
-    (RIO_ENABLE_DOORBELL_RANGE/RIO_DISABLE_DOORBELL_RANGE)
-- Enable/Disable reporting of RIO port-write events to user-space applications
-    (RIO_ENABLE_PORTWRITE_RANGE/RIO_DISABLE_PORTWRITE_RANGE)
-- Query/Control type of events reported through this driver: doorbells,
-  port-writes or both (RIO_SET_EVENT_MASK/RIO_GET_EVENT_MASK)
-- Configure/Map mport's outbound requests window(s) for specific size,
-  RapidIO destination ID, hopcount and request type
-    (RIO_MAP_OUTBOUND/RIO_UNMAP_OUTBOUND)
-- Configure/Map mport's inbound requests window(s) for specific size,
-  RapidIO base address and local memory base address
-    (RIO_MAP_INBOUND/RIO_UNMAP_INBOUND)
-- Allocate/Free contiguous DMA coherent memory buffer for DMA data transfers
-  to/from remote RapidIO devices (RIO_ALLOC_DMA/RIO_FREE_DMA)
-- Initiate DMA data transfers to/from remote RapidIO devices (RIO_TRANSFER).
-  Supports blocking, asynchronous and posted (a.k.a 'fire-and-forget') data
-  transfer modes.
-- Check/Wait for completion of asynchronous DMA data transfer
-    (RIO_WAIT_FOR_ASYNC)
-- Manage device objects supported by RapidIO subsystem (RIO_DEV_ADD/RIO_DEV_DEL).
-  This allows implementation of various RapidIO fabric enumeration algorithms
-  as user-space applications while using remaining functionality provided by
-  kernel RapidIO subsystem.
-
-II. Hardware Compatibility
-
-This device driver uses standard interfaces defined by kernel RapidIO subsystem
-and therefore it can be used with any mport device driver registered by RapidIO
-subsystem with limitations set by available mport implementation.
-
-At this moment the most common limitation is availability of RapidIO-specific
-DMA engine framework for specific mport device. Users should verify available
-functionality of their platform when planning to use this driver:
-
-- IDT Tsi721 PCIe-to-RapidIO bridge device and its mport device driver are fully
-  compatible with this driver.
-- Freescale SoCs 'fsl_rio' mport driver does not have implementation for RapidIO
-  specific DMA engine support and therefore DMA data transfers mport_cdev driver
-  are not available.
-
-III. Module parameters
-
-- 'dma_timeout' - DMA transfer completion timeout (in msec, default value 3000).
-        This parameter set a maximum completion wait time for SYNC mode DMA
-        transfer requests and for RIO_WAIT_FOR_ASYNC ioctl requests.
-
-- 'dbg_level' - This parameter allows to control amount of debug information
-        generated by this device driver. This parameter is formed by set of
-        bit masks that correspond to the specific functional blocks.
-        For mask definitions see 'drivers/rapidio/devices/rio_mport_cdev.c'
-        This parameter can be changed dynamically.
-        Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
-
-IV. Known problems
-
-  None.
-
-V. User-space Applications and API
-
-API library and applications that use this device driver are available from
-RapidIO.org.
-
-VI. TODO List
-
-- Add support for sending/receiving "raw" RapidIO messaging packets.
-- Add memory mapped DMA data transfers as an option when RapidIO-specific DMA
-  is not available.
diff --git a/Documentation/rapidio/rapidio.txt b/Documentation/rapidio/rapidio.txt
deleted file mode 100644
index 28fbd877f85a..000000000000
--- a/Documentation/rapidio/rapidio.txt
+++ /dev/null
@@ -1,351 +0,0 @@
-                          The Linux RapidIO Subsystem
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The RapidIO standard is a packet-based fabric interconnect standard designed for
-use in embedded systems. Development of the RapidIO standard is directed by the
-RapidIO Trade Association (RTA). The current version of the RapidIO specification
-is publicly available for download from the RTA web-site [1].
-
-This document describes the basics of the Linux RapidIO subsystem and provides
-information on its major components.
-
-1 Overview
-----------
-
-Because the RapidIO subsystem follows the Linux device model it is integrated
-into the kernel similarly to other buses by defining RapidIO-specific device and
-bus types and registering them within the device model.
-
-The Linux RapidIO subsystem is architecture independent and therefore defines
-architecture-specific interfaces that provide support for common RapidIO
-subsystem operations.
-
-2. Core Components
-------------------
-
-A typical RapidIO network is a combination of endpoints and switches.
-Each of these components is represented in the subsystem by an associated data
-structure. The core logical components of the RapidIO subsystem are defined
-in include/linux/rio.h file.
-
-2.1 Master Port
-
-A master port (or mport) is a RapidIO interface controller that is local to the
-processor executing the Linux code. A master port generates and receives RapidIO
-packets (transactions). In the RapidIO subsystem each master port is represented
-by a rio_mport data structure. This structure contains master port specific
-resources such as mailboxes and doorbells. The rio_mport also includes a unique
-host device ID that is valid when a master port is configured as an enumerating
-host.
-
-RapidIO master ports are serviced by subsystem specific mport device drivers
-that provide functionality defined for this subsystem. To provide a hardware
-independent interface for RapidIO subsystem operations, rio_mport structure
-includes rio_ops data structure which contains pointers to hardware specific
-implementations of RapidIO functions.
-
-2.2 Device
-
-A RapidIO device is any endpoint (other than mport) or switch in the network.
-All devices are presented in the RapidIO subsystem by corresponding rio_dev data
-structure. Devices form one global device list and per-network device lists
-(depending on number of available mports and networks).
-
-2.3 Switch
-
-A RapidIO switch is a special class of device that routes packets between its
-ports towards their final destination. The packet destination port within a
-switch is defined by an internal routing table. A switch is presented in the
-RapidIO subsystem by rio_dev data structure expanded by additional rio_switch
-data structure, which contains switch specific information such as copy of the
-routing table and pointers to switch specific functions.
-
-The RapidIO subsystem defines the format and initialization method for subsystem
-specific switch drivers that are designed to provide hardware-specific
-implementation of common switch management routines.
-
-2.4 Network
-
-A RapidIO network is a combination of interconnected endpoint and switch devices.
-Each RapidIO network known to the system is represented by corresponding rio_net
-data structure. This structure includes lists of all devices and local master
-ports that form the same network. It also contains a pointer to the default
-master port that is used to communicate with devices within the network.
-
-2.5 Device Drivers
-
-RapidIO device-specific drivers follow Linux Kernel Driver Model and are
-intended to support specific RapidIO devices attached to the RapidIO network.
-
-2.6 Subsystem Interfaces
-
-RapidIO interconnect specification defines features that may be used to provide
-one or more common service layers for all participating RapidIO devices. These
-common services may act separately from device-specific drivers or be used by
-device-specific drivers. Example of such service provider is the RIONET driver
-which implements Ethernet-over-RapidIO interface. Because only one driver can be
-registered for a device, all common RapidIO services have to be registered as
-subsystem interfaces. This allows to have multiple common services attached to
-the same device without blocking attachment of a device-specific driver.
-
-3. Subsystem Initialization
----------------------------
-
-In order to initialize the RapidIO subsystem, a platform must initialize and
-register at least one master port within the RapidIO network. To register mport
-within the subsystem controller driver's initialization code calls function
-rio_register_mport() for each available master port.
-
-After all active master ports are registered with a RapidIO subsystem,
-an enumeration and/or discovery routine may be called automatically or
-by user-space command.
-
-RapidIO subsystem can be configured to be built as a statically linked or
-modular component of the kernel (see details below).
-
-4. Enumeration and Discovery
-----------------------------
-
-4.1 Overview
-------------
-
-RapidIO subsystem configuration options allow users to build enumeration and
-discovery methods as statically linked components or loadable modules.
-An enumeration/discovery method implementation and available input parameters
-define how any given method can be attached to available RapidIO mports:
-simply to all available mports OR individually to the specified mport device.
-
-Depending on selected enumeration/discovery build configuration, there are
-several methods to initiate an enumeration and/or discovery process:
-
-  (a) Statically linked enumeration and discovery process can be started
-  automatically during kernel initialization time using corresponding module
-  parameters. This was the original method used since introduction of RapidIO
-  subsystem. Now this method relies on enumerator module parameter which is
-  'rio-scan.scan' for existing basic enumeration/discovery method.
-  When automatic start of enumeration/discovery is used a user has to ensure
-  that all discovering endpoints are started before the enumerating endpoint
-  and are waiting for enumeration to be completed.
-  Configuration option CONFIG_RAPIDIO_DISC_TIMEOUT defines time that discovering
-  endpoint waits for enumeration to be completed. If the specified timeout
-  expires the discovery process is terminated without obtaining RapidIO network
-  information. NOTE: a timed out discovery process may be restarted later using
-  a user-space command as it is described below (if the given endpoint was
-  enumerated successfully).
-
-  (b) Statically linked enumeration and discovery process can be started by
-  a command from user space. This initiation method provides more flexibility
-  for a system startup compared to the option (a) above. After all participating
-  endpoints have been successfully booted, an enumeration process shall be
-  started first by issuing a user-space command, after an enumeration is
-  completed a discovery process can be started on all remaining endpoints.
-
-  (c) Modular enumeration and discovery process can be started by a command from
-  user space. After an enumeration/discovery module is loaded, a network scan
-  process can be started by issuing a user-space command.
-  Similar to the option (b) above, an enumerator has to be started first.
-
-  (d) Modular enumeration and discovery process can be started by a module
-  initialization routine. In this case an enumerating module shall be loaded
-  first.
-
-When a network scan process is started it calls an enumeration or discovery
-routine depending on the configured role of a master port: host or agent.
-
-Enumeration is performed by a master port if it is configured as a host port by
-assigning a host destination ID greater than or equal to zero. The host
-destination ID can be assigned to a master port using various methods depending
-on RapidIO subsystem build configuration:
-
-  (a) For a statically linked RapidIO subsystem core use command line parameter
-  "rapidio.hdid=" with a list of destination ID assignments in order of mport
-  device registration. For example, in a system with two RapidIO controllers
-  the command line parameter "rapidio.hdid=-1,7" will result in assignment of
-  the host destination ID=7 to the second RapidIO controller, while the first
-  one will be assigned destination ID=-1.
-
-  (b) If the RapidIO subsystem core is built as a loadable module, in addition
-  to the method shown above, the host destination ID(s) can be specified using
-  traditional methods of passing module parameter "hdid=" during its loading:
-  - from command line: "modprobe rapidio hdid=-1,7", or
-  - from modprobe configuration file using configuration command "options",
-    like in this example: "options rapidio hdid=-1,7". An example of modprobe
-    configuration file is provided in the section below.
-
-  NOTES:
-  (i) if "hdid=" parameter is omitted all available mport will be assigned
-  destination ID = -1;
-  (ii) the "hdid=" parameter in systems with multiple mports can have
-  destination ID assignments omitted from the end of list (default = -1).
-
-If the host device ID for a specific master port is set to -1, the discovery
-process will be performed for it.
-
-The enumeration and discovery routines use RapidIO maintenance transactions
-to access the configuration space of devices.
-
-NOTE: If RapidIO switch-specific device drivers are built as loadable modules
-they must be loaded before enumeration/discovery process starts.
-This requirement is cased by the fact that enumeration/discovery methods invoke
-vendor-specific callbacks on early stages.
-
-4.2 Automatic Start of Enumeration and Discovery
-------------------------------------------------
-
-Automatic enumeration/discovery start method is applicable only to built-in
-enumeration/discovery RapidIO configuration selection. To enable automatic
-enumeration/discovery start by existing basic enumerator method set use boot
-command line parameter "rio-scan.scan=1".
-
-This configuration requires synchronized start of all RapidIO endpoints that
-form a network which will be enumerated/discovered. Discovering endpoints have
-to be started before an enumeration starts to ensure that all RapidIO
-controllers have been initialized and are ready to be discovered. Configuration
-parameter CONFIG_RAPIDIO_DISC_TIMEOUT defines time (in seconds) which
-a discovering endpoint will wait for enumeration to be completed.
-
-When automatic enumeration/discovery start is selected, basic method's
-initialization routine calls rio_init_mports() to perform enumeration or
-discovery for all known mport devices.
-
-Depending on RapidIO network size and configuration this automatic
-enumeration/discovery start method may be difficult to use due to the
-requirement for synchronized start of all endpoints.
-
-4.3 User-space Start of Enumeration and Discovery
--------------------------------------------------
-
-User-space start of enumeration and discovery can be used with built-in and
-modular build configurations. For user-space controlled start RapidIO subsystem
-creates the sysfs write-only attribute file '/sys/bus/rapidio/scan'. To initiate
-an enumeration or discovery process on specific mport device, a user needs to
-write mport_ID (not RapidIO destination ID) into that file. The mport_ID is a
-sequential number (0 ... RIO_MAX_MPORTS) assigned during mport device
-registration. For example for machine with single RapidIO controller, mport_ID
-for that controller always will be 0.
-
-To initiate RapidIO enumeration/discovery on all available mports a user may
-write '-1' (or RIO_MPORT_ANY) into the scan attribute file.
-
-4.4 Basic Enumeration Method
-----------------------------
-
-This is an original enumeration/discovery method which is available since
-first release of RapidIO subsystem code. The enumeration process is
-implemented according to the enumeration algorithm outlined in the RapidIO
-Interconnect Specification: Annex I [1].
-
-This method can be configured as statically linked or loadable module.
-The method's single parameter "scan" allows to trigger the enumeration/discovery
-process from module initialization routine.
-
-This enumeration/discovery method can be started only once and does not support
-unloading if it is built as a module.
-
-The enumeration process traverses the network using a recursive depth-first
-algorithm. When a new device is found, the enumerator takes ownership of that
-device by writing into the Host Device ID Lock CSR. It does this to ensure that
-the enumerator has exclusive right to enumerate the device. If device ownership
-is successfully acquired, the enumerator allocates a new rio_dev structure and
-initializes it according to device capabilities.
-
-If the device is an endpoint, a unique device ID is assigned to it and its value
-is written into the device's Base Device ID CSR.
-
-If the device is a switch, the enumerator allocates an additional rio_switch
-structure to store switch specific information. Then the switch's vendor ID and
-device ID are queried against a table of known RapidIO switches. Each switch
-table entry contains a pointer to a switch-specific initialization routine that
-initializes pointers to the rest of switch specific operations, and performs
-hardware initialization if necessary. A RapidIO switch does not have a unique
-device ID; it relies on hopcount and routing for device ID of an attached
-endpoint if access to its configuration registers is required. If a switch (or
-chain of switches) does not have any endpoint (except enumerator) attached to
-it, a fake device ID will be assigned to configure a route to that switch.
-In the case of a chain of switches without endpoint, one fake device ID is used
-to configure a route through the entire chain and switches are differentiated by
-their hopcount value.
-
-For both endpoints and switches the enumerator writes a unique component tag
-into device's Component Tag CSR. That unique value is used by the error
-management notification mechanism to identify a device that is reporting an
-error management event.
-
-Enumeration beyond a switch is completed by iterating over each active egress
-port of that switch. For each active link, a route to a default device ID
-(0xFF for 8-bit systems and 0xFFFF for 16-bit systems) is temporarily written
-into the routing table. The algorithm recurs by calling itself with hopcount + 1
-and the default device ID in order to access the device on the active port.
-
-After the host has completed enumeration of the entire network it releases
-devices by clearing device ID locks (calls rio_clear_locks()). For each endpoint
-in the system, it sets the Discovered bit in the Port General Control CSR
-to indicate that enumeration is completed and agents are allowed to execute
-passive discovery of the network.
-
-The discovery process is performed by agents and is similar to the enumeration
-process that is described above. However, the discovery process is performed
-without changes to the existing routing because agents only gather information
-about RapidIO network structure and are building an internal map of discovered
-devices. This way each Linux-based component of the RapidIO subsystem has
-a complete view of the network. The discovery process can be performed
-simultaneously by several agents. After initializing its RapidIO master port
-each agent waits for enumeration completion by the host for the configured wait
-time period. If this wait time period expires before enumeration is completed,
-an agent skips RapidIO discovery and continues with remaining kernel
-initialization.
-
-4.5 Adding New Enumeration/Discovery Method
--------------------------------------------
-
-RapidIO subsystem code organization allows addition of new enumeration/discovery
-methods as new configuration options without significant impact to the core
-RapidIO code.
-
-A new enumeration/discovery method has to be attached to one or more mport
-devices before an enumeration/discovery process can be started. Normally,
-method's module initialization routine calls rio_register_scan() to attach
-an enumerator to a specified mport device (or devices). The basic enumerator
-implementation demonstrates this process.
-
-4.6 Using Loadable RapidIO Switch Drivers
------------------------------------------
-
-In the case when RapidIO switch drivers are built as loadable modules a user
-must ensure that they are loaded before the enumeration/discovery starts.
-This process can be automated by specifying pre- or post- dependencies in the
-RapidIO-specific modprobe configuration file as shown in the example below.
-
-  File /etc/modprobe.d/rapidio.conf:
-  ----------------------------------
-
-  # Configure RapidIO subsystem modules
-
-  # Set enumerator host destination ID (overrides kernel command line option)
-  options rapidio hdid=-1,2
-
-  # Load RapidIO switch drivers immediately after rapidio core module was loaded
-  softdep rapidio post: idt_gen2 idtcps tsi57x
-
-  # OR :
-
-  # Load RapidIO switch drivers just before rio-scan enumerator module is loaded
-  softdep rio-scan pre: idt_gen2 idtcps tsi57x
-
-  --------------------------
-
-NOTE: In the example above, one of "softdep" commands must be removed or
-commented out to keep required module loading sequence.
-
-A. References
--------------
-
-[1] RapidIO Trade Association. RapidIO Interconnect Specifications.
-    http://www.rapidio.org.
-[2] Rapidio TA. Technology Comparisons.
-    http://www.rapidio.org/education/technology_comparisons/
-[3] RapidIO support for Linux.
-    http://lwn.net/Articles/139118/
-[4] Matt Porter. RapidIO for Linux. Ottawa Linux Symposium, 2005
-    http://www.kernel.org/doc/ols/2005/ols2005v2-pages-43-56.pdf
diff --git a/Documentation/rapidio/rio_cm.txt b/Documentation/rapidio/rio_cm.txt
deleted file mode 100644
index 27aa401f1126..000000000000
--- a/Documentation/rapidio/rio_cm.txt
+++ /dev/null
@@ -1,119 +0,0 @@
-RapidIO subsystem Channelized Messaging character device driver (rio_cm.c)
-==========================================================================
-
-Version History:
-----------------
-  1.0.0 - Initial driver release.
-
-==========================================================================
-
-I. Overview
-
-This device driver is the result of collaboration within the RapidIO.org
-Software Task Group (STG) between Texas Instruments, Prodrive Technologies,
-Nokia Networks, BAE and IDT.  Additional input was received from other members
-of RapidIO.org.
-
-The objective was to create a character mode driver interface which exposes
-messaging capabilities of RapidIO endpoint devices (mports) directly
-to applications, in a manner that allows the numerous and varied RapidIO
-implementations to interoperate.
-
-This driver (RIO_CM) provides to user-space applications shared access to
-RapidIO mailbox messaging resources.
-
-RapidIO specification (Part 2) defines that endpoint devices may have up to four
-messaging mailboxes in case of multi-packet message (up to 4KB) and
-up to 64 mailboxes if single-packet messages (up to 256 B) are used. In addition
-to protocol definition limitations, a particular hardware implementation can
-have reduced number of messaging mailboxes.  RapidIO aware applications must
-therefore share the messaging resources of a RapidIO endpoint.
-
-Main purpose of this device driver is to provide RapidIO mailbox messaging
-capability to large number of user-space processes by introducing socket-like
-operations using a single messaging mailbox.  This allows applications to
-use the limited RapidIO messaging hardware resources efficiently.
-
-Most of device driver's operations are supported through 'ioctl' system calls.
-
-When loaded this device driver creates a single file system node named rio_cm
-in /dev directory common for all registered RapidIO mport devices.
-
-Following ioctl commands are available to user-space applications:
-
-- RIO_CM_MPORT_GET_LIST : Returns to caller list of local mport devices that
-    support messaging operations (number of entries up to RIO_MAX_MPORTS).
-    Each list entry is combination of mport's index in the system and RapidIO
-    destination ID assigned to the port.
-- RIO_CM_EP_GET_LIST_SIZE : Returns number of messaging capable remote endpoints
-    in a RapidIO network associated with the specified mport device.
-- RIO_CM_EP_GET_LIST : Returns list of RapidIO destination IDs for messaging
-    capable remote endpoints (peers) available in a RapidIO network associated
-    with the specified mport device.
-- RIO_CM_CHAN_CREATE : Creates RapidIO message exchange channel data structure
-    with channel ID assigned automatically or as requested by a caller.
-- RIO_CM_CHAN_BIND : Binds the specified channel data structure to the specified
-    mport device.
-- RIO_CM_CHAN_LISTEN : Enables listening for connection requests on the specified
-    channel.
-- RIO_CM_CHAN_ACCEPT : Accepts a connection request from peer on the specified
-    channel. If wait timeout for this request is specified by a caller it is
-    a blocking call. If timeout set to 0 this is non-blocking call - ioctl
-    handler checks for a pending connection request and if one is not available
-    exits with -EGAIN error status immediately.
-- RIO_CM_CHAN_CONNECT : Sends a connection request to a remote peer/channel.
-- RIO_CM_CHAN_SEND : Sends a data message through the specified channel.
-    The handler for this request assumes that message buffer specified by
-    a caller includes the reserved space for a packet header required by
-    this driver.
-- RIO_CM_CHAN_RECEIVE : Receives a data message through a connected channel.
-    If the channel does not have an incoming message ready to return this ioctl
-    handler will wait for new message until timeout specified by a caller
-    expires. If timeout value is set to 0, ioctl handler uses a default value
-    defined by MAX_SCHEDULE_TIMEOUT.
-- RIO_CM_CHAN_CLOSE : Closes a specified channel and frees associated buffers.
-    If the specified channel is in the CONNECTED state, sends close notification
-    to the remote peer.
-
-The ioctl command codes and corresponding data structures intended for use by
-user-space applications are defined in 'include/uapi/linux/rio_cm_cdev.h'.
-
-II. Hardware Compatibility
-
-This device driver uses standard interfaces defined by kernel RapidIO subsystem
-and therefore it can be used with any mport device driver registered by RapidIO
-subsystem with limitations set by available mport HW implementation of messaging
-mailboxes.
-
-III. Module parameters
-
-- 'dbg_level' - This parameter allows to control amount of debug information
-        generated by this device driver. This parameter is formed by set of
-        bit masks that correspond to the specific functional block.
-        For mask definitions see 'drivers/rapidio/devices/rio_cm.c'
-        This parameter can be changed dynamically.
-        Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
-
-- 'cmbox' - Number of RapidIO mailbox to use (default value is 1).
-        This parameter allows to set messaging mailbox number that will be used
-        within entire RapidIO network. It can be used when default mailbox is
-        used by other device drivers or is not supported by some nodes in the
-        RapidIO network.
-
-- 'chstart' - Start channel number for dynamic assignment. Default value - 256.
-        Allows to exclude channel numbers below this parameter from dynamic
-        allocation to avoid conflicts with software components that use
-        reserved predefined channel numbers.
-
-IV. Known problems
-
-  None.
-
-V. User-space Applications and API Library
-
-Messaging API library and applications that use this device driver are available
-from RapidIO.org.
-
-VI. TODO List
-
-- Add support for system notification messages (reserved channel 0).
diff --git a/Documentation/rapidio/sysfs.txt b/Documentation/rapidio/sysfs.txt
deleted file mode 100644
index a1adac888e6e..000000000000
--- a/Documentation/rapidio/sysfs.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-The RapidIO sysfs files have moved to:
-Documentation/ABI/testing/sysfs-bus-rapidio and
-Documentation/ABI/testing/sysfs-class-rapidio
diff --git a/Documentation/rapidio/tsi721.txt b/Documentation/rapidio/tsi721.txt
deleted file mode 100644
index cd2a2935d51d..000000000000
--- a/Documentation/rapidio/tsi721.txt
+++ /dev/null
@@ -1,97 +0,0 @@
-RapidIO subsystem mport driver for IDT Tsi721 PCI Express-to-SRIO bridge.
-=========================================================================
-
-I. Overview
-
-This driver implements all currently defined RapidIO mport callback functions.
-It supports maintenance read and write operations, inbound and outbound RapidIO
-doorbells, inbound maintenance port-writes and RapidIO messaging.
-
-To generate SRIO maintenance transactions this driver uses one of Tsi721 DMA
-channels. This mechanism provides access to larger range of hop counts and
-destination IDs without need for changes in outbound window translation.
-
-RapidIO messaging support uses dedicated messaging channels for each mailbox.
-For inbound messages this driver uses destination ID matching to forward messages
-into the corresponding message queue. Messaging callbacks are implemented to be
-fully compatible with RIONET driver (Ethernet over RapidIO messaging services).
-
-1. Module parameters:
-- 'dbg_level' - This parameter allows to control amount of debug information
-        generated by this device driver. This parameter is formed by set of
-        This parameter can be changed bit masks that correspond to the specific
-        functional block.
-        For mask definitions see 'drivers/rapidio/devices/tsi721.h'
-        This parameter can be changed dynamically.
-        Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
-
-- 'dma_desc_per_channel' - This parameter defines number of hardware buffer
-        descriptors allocated for each registered Tsi721 DMA channel.
-        Its default value is 128.
-
-- 'dma_txqueue_sz' - DMA transactions queue size. Defines number of pending
-        transaction requests that can be accepted by each DMA channel.
-        Default value is 16.
-
-- 'dma_sel' - DMA channel selection mask. Bitmask that defines which hardware
-        DMA channels (0 ... 6) will be registered with DmaEngine core.
-        If bit is set to 1, the corresponding DMA channel will be registered.
-        DMA channels not selected by this mask will not be used by this device
-        driver. Default value is 0x7f (use all channels).
-
-- 'pcie_mrrs' - override value for PCIe Maximum Read Request Size (MRRS).
-        This parameter gives an ability to override MRRS value set during PCIe
-        configuration process. Tsi721 supports read request sizes up to 4096B.
-        Value for this parameter must be set as defined by PCIe specification:
-        0 = 128B, 1 = 256B, 2 = 512B, 3 = 1024B, 4 = 2048B and 5 = 4096B.
-        Default value is '-1' (= keep platform setting).
-
-- 'mbox_sel' - RIO messaging MBOX selection mask. This is a bitmask that defines
-        messaging MBOXes are managed by this device driver. Mask bits 0 - 3
-        correspond to MBOX0 - MBOX3. MBOX is under driver's control if the
-        corresponding bit is set to '1'. Default value is 0x0f (= all).
-
-II. Known problems
-
-  None.
-
-III. DMA Engine Support
-
-Tsi721 mport driver supports DMA data transfers between local system memory and
-remote RapidIO devices. This functionality is implemented according to SLAVE
-mode API defined by common Linux kernel DMA Engine framework.
-
-Depending on system requirements RapidIO DMA operations can be included/excluded
-by setting CONFIG_RAPIDIO_DMA_ENGINE option. Tsi721 miniport driver uses seven
-out of eight available BDMA channels to support DMA data transfers.
-One BDMA channel is reserved for generation of maintenance read/write requests.
-
-If Tsi721 mport driver have been built with RAPIDIO_DMA_ENGINE support included,
-this driver will accept DMA-specific module parameter:
-  "dma_desc_per_channel" - defines number of hardware buffer descriptors used by
-                           each BDMA channel of Tsi721 (by default - 128).
-
-IV. Version History
-
-  1.1.0 - DMA operations re-worked to support data scatter/gather lists larger
-          than hardware buffer descriptors ring.
-  1.0.0 - Initial driver release.
-
-V.  License
------------------------------------------------
-
-  Copyright(c) 2011 Integrated Device Technology, Inc. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms of the GNU General Public License as published by the Free
-  Software Foundation; either version 2 of the License, or (at your option)
-  any later version.
-
-  This program is distributed in the hope that it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index c42a21b99046..523d54b60087 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -204,21 +204,21 @@ potentially expensive tree iterations. This is done at negligible runtime
 overhead for maintanence; albeit larger memory footprint.
 
 Similar to the rb_root structure, cached rbtrees are initialized to be
-empty via:
+empty via::
 
   struct rb_root_cached mytree = RB_ROOT_CACHED;
 
 Cached rbtree is simply a regular rb_root with an extra pointer to cache the
 leftmost node. This allows rb_root_cached to exist wherever rb_root does,
 which permits augmented trees to be supported as well as only a few extra
-interfaces:
+interfaces::
 
   struct rb_node *rb_first_cached(struct rb_root_cached *tree);
   void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool);
   void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
 
 Both insert and erase calls have their respective counterpart of augmented
-trees:
+trees::
 
   void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *,
 				  bool, struct rb_augment_callbacks *);
diff --git a/Documentation/remoteproc.txt b/Documentation/remoteproc.txt
index 77fb03acdbb4..03c3d2e568b0 100644
--- a/Documentation/remoteproc.txt
+++ b/Documentation/remoteproc.txt
@@ -314,6 +314,8 @@ Here are the various resource types that are currently supported::
    * @RSC_VDEV:       declare support for a virtio device, and serve as its
    *		    virtio header.
    * @RSC_LAST:       just keep this one at the end
+   * @RSC_VENDOR_START:	start of the vendor specific resource types range
+   * @RSC_VENDOR_END:	end of the vendor specific resource types range
    *
    * Please note that these values are used as indices to the rproc_handle_rsc
    * lookup table, so please keep them sane. Moreover, @RSC_LAST is used to
@@ -321,11 +323,13 @@ Here are the various resource types that are currently supported::
    * please update it as needed.
    */
   enum fw_resource_type {
-	RSC_CARVEOUT	= 0,
-	RSC_DEVMEM	= 1,
-	RSC_TRACE	= 2,
-	RSC_VDEV	= 3,
-	RSC_LAST	= 4,
+	RSC_CARVEOUT		= 0,
+	RSC_DEVMEM		= 1,
+	RSC_TRACE		= 2,
+	RSC_VDEV		= 3,
+	RSC_LAST		= 4,
+	RSC_VENDOR_START	= 128,
+	RSC_VENDOR_END		= 512,
   };
 
 For more details regarding a specific resource type, please see its
diff --git a/Documentation/riscv/boot-image-header.txt b/Documentation/riscv/boot-image-header.txt
new file mode 100644
index 000000000000..1b73fea23b39
--- /dev/null
+++ b/Documentation/riscv/boot-image-header.txt
@@ -0,0 +1,50 @@
+				Boot image header in RISC-V Linux
+			=============================================
+
+Author: Atish Patra <atish.patra@wdc.com>
+Date  : 20 May 2019
+
+This document only describes the boot image header details for RISC-V Linux.
+The complete booting guide will be available at Documentation/riscv/booting.txt.
+
+The following 64-byte header is present in decompressed Linux kernel image.
+
+	u32 code0;		  /* Executable code */
+	u32 code1; 		  /* Executable code */
+	u64 text_offset;	  /* Image load offset, little endian */
+	u64 image_size;		  /* Effective Image size, little endian */
+	u64 flags;		  /* kernel flags, little endian */
+	u32 version;		  /* Version of this header */
+	u32 res1  = 0;		  /* Reserved */
+	u64 res2  = 0;    	  /* Reserved */
+	u64 magic = 0x5643534952; /* Magic number, little endian, "RISCV" */
+	u32 res3;		  /* Reserved for additional RISC-V specific header */
+	u32 res4;		  /* Reserved for PE COFF offset */
+
+This header format is compliant with PE/COFF header and largely inspired from
+ARM64 header. Thus, both ARM64 & RISC-V header can be combined into one common
+header in future.
+
+Notes:
+- This header can also be reused to support EFI stub for RISC-V in future. EFI
+  specification needs PE/COFF image header in the beginning of the kernel image
+  in order to load it as an EFI application. In order to support EFI stub,
+  code0 should be replaced with "MZ" magic string and res5(at offset 0x3c) should
+  point to the rest of the PE/COFF header.
+
+- version field indicate header version number.
+	Bits 0:15  - Minor version
+	Bits 16:31 - Major version
+
+  This preserves compatibility across newer and older version of the header.
+  The current version is defined as 0.1.
+
+- res3 is reserved for offset to any other additional fields. This makes the
+  header extendible in future. One example would be to accommodate ISA
+  extension for RISC-V in future. For current version, it is set to be zero.
+
+- In current header, the flag field has only one field.
+	Bit 0: Kernel endianness. 1 if BE, 0 if LE.
+
+- Image size is mandatory for boot loader to load kernel image. Booting will
+  fail otherwise.
diff --git a/Documentation/riscv/index.rst b/Documentation/riscv/index.rst
new file mode 100644
index 000000000000..e3ca0922a8c2
--- /dev/null
+++ b/Documentation/riscv/index.rst
@@ -0,0 +1,15 @@
+===================
+RISC-V architecture
+===================
+
+.. toctree::
+    :maxdepth: 1
+
+    pmu
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/riscv/pmu.rst b/Documentation/riscv/pmu.rst
new file mode 100644
index 000000000000..acb216b99c26
--- /dev/null
+++ b/Documentation/riscv/pmu.rst
@@ -0,0 +1,255 @@
+===================================
+Supporting PMUs on RISC-V platforms
+===================================
+
+Alan Kao <alankao@andestech.com>, Mar 2018
+
+Introduction
+------------
+
+As of this writing, perf_event-related features mentioned in The RISC-V ISA
+Privileged Version 1.10 are as follows:
+(please check the manual for more details)
+
+* [m|s]counteren
+* mcycle[h], cycle[h]
+* minstret[h], instret[h]
+* mhpeventx, mhpcounterx[h]
+
+With such function set only, porting perf would require a lot of work, due to
+the lack of the following general architectural performance monitoring features:
+
+* Enabling/Disabling counters
+  Counters are just free-running all the time in our case.
+* Interrupt caused by counter overflow
+  No such feature in the spec.
+* Interrupt indicator
+  It is not possible to have many interrupt ports for all counters, so an
+  interrupt indicator is required for software to tell which counter has
+  just overflowed.
+* Writing to counters
+  There will be an SBI to support this since the kernel cannot modify the
+  counters [1].  Alternatively, some vendor considers to implement
+  hardware-extension for M-S-U model machines to write counters directly.
+
+This document aims to provide developers a quick guide on supporting their
+PMUs in the kernel.  The following sections briefly explain perf' mechanism
+and todos.
+
+You may check previous discussions here [1][2].  Also, it might be helpful
+to check the appendix for related kernel structures.
+
+
+1. Initialization
+-----------------
+
+*riscv_pmu* is a global pointer of type *struct riscv_pmu*, which contains
+various methods according to perf's internal convention and PMU-specific
+parameters.  One should declare such instance to represent the PMU.  By default,
+*riscv_pmu* points to a constant structure *riscv_base_pmu*, which has very
+basic support to a baseline QEMU model.
+
+Then he/she can either assign the instance's pointer to *riscv_pmu* so that
+the minimal and already-implemented logic can be leveraged, or invent his/her
+own *riscv_init_platform_pmu* implementation.
+
+In other words, existing sources of *riscv_base_pmu* merely provide a
+reference implementation.  Developers can flexibly decide how many parts they
+can leverage, and in the most extreme case, they can customize every function
+according to their needs.
+
+
+2. Event Initialization
+-----------------------
+
+When a user launches a perf command to monitor some events, it is first
+interpreted by the userspace perf tool into multiple *perf_event_open*
+system calls, and then each of them calls to the body of *event_init*
+member function that was assigned in the previous step.  In *riscv_base_pmu*'s
+case, it is *riscv_event_init*.
+
+The main purpose of this function is to translate the event provided by user
+into bitmap, so that HW-related control registers or counters can directly be
+manipulated.  The translation is based on the mappings and methods provided in
+*riscv_pmu*.
+
+Note that some features can be done in this stage as well:
+
+(1) interrupt setting, which is stated in the next section;
+(2) privilege level setting (user space only, kernel space only, both);
+(3) destructor setting.  Normally it is sufficient to apply *riscv_destroy_event*;
+(4) tweaks for non-sampling events, which will be utilized by functions such as
+    *perf_adjust_period*, usually something like the follows::
+
+      if (!is_sampling_event(event)) {
+              hwc->sample_period = x86_pmu.max_period;
+              hwc->last_period = hwc->sample_period;
+              local64_set(&hwc->period_left, hwc->sample_period);
+      }
+
+In the case of *riscv_base_pmu*, only (3) is provided for now.
+
+
+3. Interrupt
+------------
+
+3.1. Interrupt Initialization
+
+This often occurs at the beginning of the *event_init* method. In common
+practice, this should be a code segment like::
+
+  int x86_reserve_hardware(void)
+  {
+        int err = 0;
+
+        if (!atomic_inc_not_zero(&pmc_refcount)) {
+                mutex_lock(&pmc_reserve_mutex);
+                if (atomic_read(&pmc_refcount) == 0) {
+                        if (!reserve_pmc_hardware())
+                                err = -EBUSY;
+                        else
+                                reserve_ds_buffers();
+                }
+                if (!err)
+                        atomic_inc(&pmc_refcount);
+                mutex_unlock(&pmc_reserve_mutex);
+        }
+
+        return err;
+  }
+
+And the magic is in *reserve_pmc_hardware*, which usually does atomic
+operations to make implemented IRQ accessible from some global function pointer.
+*release_pmc_hardware* serves the opposite purpose, and it is used in event
+destructors mentioned in previous section.
+
+(Note: From the implementations in all the architectures, the *reserve/release*
+pair are always IRQ settings, so the *pmc_hardware* seems somehow misleading.
+It does NOT deal with the binding between an event and a physical counter,
+which will be introduced in the next section.)
+
+3.2. IRQ Structure
+
+Basically, a IRQ runs the following pseudo code::
+
+  for each hardware counter that triggered this overflow
+
+      get the event of this counter
+
+      // following two steps are defined as *read()*,
+      // check the section Reading/Writing Counters for details.
+      count the delta value since previous interrupt
+      update the event->count (# event occurs) by adding delta, and
+                 event->hw.period_left by subtracting delta
+
+      if the event overflows
+          sample data
+          set the counter appropriately for the next overflow
+
+          if the event overflows again
+              too frequently, throttle this event
+          fi
+      fi
+
+  end for
+
+However as of this writing, none of the RISC-V implementations have designed an
+interrupt for perf, so the details are to be completed in the future.
+
+4. Reading/Writing Counters
+---------------------------
+
+They seem symmetric but perf treats them quite differently.  For reading, there
+is a *read* interface in *struct pmu*, but it serves more than just reading.
+According to the context, the *read* function not only reads the content of the
+counter (event->count), but also updates the left period to the next interrupt
+(event->hw.period_left).
+
+But the core of perf does not need direct write to counters.  Writing counters
+is hidden behind the abstraction of 1) *pmu->start*, literally start counting so one
+has to set the counter to a good value for the next interrupt; 2) inside the IRQ
+it should set the counter to the same resonable value.
+
+Reading is not a problem in RISC-V but writing would need some effort, since
+counters are not allowed to be written by S-mode.
+
+
+5. add()/del()/start()/stop()
+-----------------------------
+
+Basic idea: add()/del() adds/deletes events to/from a PMU, and start()/stop()
+starts/stop the counter of some event in the PMU.  All of them take the same
+arguments: *struct perf_event *event* and *int flag*.
+
+Consider perf as a state machine, then you will find that these functions serve
+as the state transition process between those states.
+Three states (event->hw.state) are defined:
+
+* PERF_HES_STOPPED:	the counter is stopped
+* PERF_HES_UPTODATE:	the event->count is up-to-date
+* PERF_HES_ARCH:	arch-dependent usage ... we don't need this for now
+
+A normal flow of these state transitions are as follows:
+
+* A user launches a perf event, resulting in calling to *event_init*.
+* When being context-switched in, *add* is called by the perf core, with a flag
+  PERF_EF_START, which means that the event should be started after it is added.
+  At this stage, a general event is bound to a physical counter, if any.
+  The state changes to PERF_HES_STOPPED and PERF_HES_UPTODATE, because it is now
+  stopped, and the (software) event count does not need updating.
+
+  - *start* is then called, and the counter is enabled.
+    With flag PERF_EF_RELOAD, it writes an appropriate value to the counter (check
+    previous section for detail).
+    Nothing is written if the flag does not contain PERF_EF_RELOAD.
+    The state now is reset to none, because it is neither stopped nor updated
+    (the counting already started)
+
+* When being context-switched out, *del* is called.  It then checks out all the
+  events in the PMU and calls *stop* to update their counts.
+
+  - *stop* is called by *del*
+    and the perf core with flag PERF_EF_UPDATE, and it often shares the same
+    subroutine as *read* with the same logic.
+    The state changes to PERF_HES_STOPPED and PERF_HES_UPTODATE, again.
+
+  - Life cycle of these two pairs: *add* and *del* are called repeatedly as
+    tasks switch in-and-out; *start* and *stop* is also called when the perf core
+    needs a quick stop-and-start, for instance, when the interrupt period is being
+    adjusted.
+
+Current implementation is sufficient for now and can be easily extended to
+features in the future.
+
+A. Related Structures
+---------------------
+
+* struct pmu: include/linux/perf_event.h
+* struct riscv_pmu: arch/riscv/include/asm/perf_event.h
+
+  Both structures are designed to be read-only.
+
+  *struct pmu* defines some function pointer interfaces, and most of them take
+  *struct perf_event* as a main argument, dealing with perf events according to
+  perf's internal state machine (check kernel/events/core.c for details).
+
+  *struct riscv_pmu* defines PMU-specific parameters.  The naming follows the
+  convention of all other architectures.
+
+* struct perf_event: include/linux/perf_event.h
+* struct hw_perf_event
+
+  The generic structure that represents perf events, and the hardware-related
+  details.
+
+* struct riscv_hw_events: arch/riscv/include/asm/perf_event.h
+
+  The structure that holds the status of events, has two fixed members:
+  the number of events and the array of the events.
+
+References
+----------
+
+[1] https://github.com/riscv/riscv-linux/pull/124
+
+[2] https://groups.google.com/a/groups.riscv.org/forum/#!topic/sw-dev/f19TmCNP6yA
diff --git a/Documentation/riscv/pmu.txt b/Documentation/riscv/pmu.txt
deleted file mode 100644
index b29f03a6d82f..000000000000
--- a/Documentation/riscv/pmu.txt
+++ /dev/null
@@ -1,249 +0,0 @@
-Supporting PMUs on RISC-V platforms
-==========================================
-Alan Kao <alankao@andestech.com>, Mar 2018
-
-Introduction
-------------
-
-As of this writing, perf_event-related features mentioned in The RISC-V ISA
-Privileged Version 1.10 are as follows:
-(please check the manual for more details)
-
-* [m|s]counteren
-* mcycle[h], cycle[h]
-* minstret[h], instret[h]
-* mhpeventx, mhpcounterx[h]
-
-With such function set only, porting perf would require a lot of work, due to
-the lack of the following general architectural performance monitoring features:
-
-* Enabling/Disabling counters
-  Counters are just free-running all the time in our case.
-* Interrupt caused by counter overflow
-  No such feature in the spec.
-* Interrupt indicator
-  It is not possible to have many interrupt ports for all counters, so an
-  interrupt indicator is required for software to tell which counter has
-  just overflowed.
-* Writing to counters
-  There will be an SBI to support this since the kernel cannot modify the
-  counters [1].  Alternatively, some vendor considers to implement
-  hardware-extension for M-S-U model machines to write counters directly.
-
-This document aims to provide developers a quick guide on supporting their
-PMUs in the kernel.  The following sections briefly explain perf' mechanism
-and todos.
-
-You may check previous discussions here [1][2].  Also, it might be helpful
-to check the appendix for related kernel structures.
-
-
-1. Initialization
------------------
-
-*riscv_pmu* is a global pointer of type *struct riscv_pmu*, which contains
-various methods according to perf's internal convention and PMU-specific
-parameters.  One should declare such instance to represent the PMU.  By default,
-*riscv_pmu* points to a constant structure *riscv_base_pmu*, which has very
-basic support to a baseline QEMU model.
-
-Then he/she can either assign the instance's pointer to *riscv_pmu* so that
-the minimal and already-implemented logic can be leveraged, or invent his/her
-own *riscv_init_platform_pmu* implementation.
-
-In other words, existing sources of *riscv_base_pmu* merely provide a
-reference implementation.  Developers can flexibly decide how many parts they
-can leverage, and in the most extreme case, they can customize every function
-according to their needs.
-
-
-2. Event Initialization
------------------------
-
-When a user launches a perf command to monitor some events, it is first
-interpreted by the userspace perf tool into multiple *perf_event_open*
-system calls, and then each of them calls to the body of *event_init*
-member function that was assigned in the previous step.  In *riscv_base_pmu*'s
-case, it is *riscv_event_init*.
-
-The main purpose of this function is to translate the event provided by user
-into bitmap, so that HW-related control registers or counters can directly be
-manipulated.  The translation is based on the mappings and methods provided in
-*riscv_pmu*.
-
-Note that some features can be done in this stage as well:
-
-(1) interrupt setting, which is stated in the next section;
-(2) privilege level setting (user space only, kernel space only, both);
-(3) destructor setting.  Normally it is sufficient to apply *riscv_destroy_event*;
-(4) tweaks for non-sampling events, which will be utilized by functions such as
-*perf_adjust_period*, usually something like the follows:
-
-if (!is_sampling_event(event)) {
-        hwc->sample_period = x86_pmu.max_period;
-        hwc->last_period = hwc->sample_period;
-        local64_set(&hwc->period_left, hwc->sample_period);
-}
-
-In the case of *riscv_base_pmu*, only (3) is provided for now.
-
-
-3. Interrupt
-------------
-
-3.1. Interrupt Initialization
-
-This often occurs at the beginning of the *event_init* method. In common
-practice, this should be a code segment like
-
-int x86_reserve_hardware(void)
-{
-        int err = 0;
-
-        if (!atomic_inc_not_zero(&pmc_refcount)) {
-                mutex_lock(&pmc_reserve_mutex);
-                if (atomic_read(&pmc_refcount) == 0) {
-                        if (!reserve_pmc_hardware())
-                                err = -EBUSY;
-                        else
-                                reserve_ds_buffers();
-                }
-                if (!err)
-                        atomic_inc(&pmc_refcount);
-                mutex_unlock(&pmc_reserve_mutex);
-        }
-
-        return err;
-}
-
-And the magic is in *reserve_pmc_hardware*, which usually does atomic
-operations to make implemented IRQ accessible from some global function pointer.
-*release_pmc_hardware* serves the opposite purpose, and it is used in event
-destructors mentioned in previous section.
-
-(Note: From the implementations in all the architectures, the *reserve/release*
-pair are always IRQ settings, so the *pmc_hardware* seems somehow misleading.
-It does NOT deal with the binding between an event and a physical counter,
-which will be introduced in the next section.)
-
-3.2. IRQ Structure
-
-Basically, a IRQ runs the following pseudo code:
-
-for each hardware counter that triggered this overflow
-
-    get the event of this counter
-
-    // following two steps are defined as *read()*,
-    // check the section Reading/Writing Counters for details.
-    count the delta value since previous interrupt
-    update the event->count (# event occurs) by adding delta, and
-               event->hw.period_left by subtracting delta
-
-    if the event overflows
-        sample data
-        set the counter appropriately for the next overflow
-
-        if the event overflows again
-            too frequently, throttle this event
-        fi
-    fi
-
-end for
-
-However as of this writing, none of the RISC-V implementations have designed an
-interrupt for perf, so the details are to be completed in the future.
-
-4. Reading/Writing Counters
----------------------------
-
-They seem symmetric but perf treats them quite differently.  For reading, there
-is a *read* interface in *struct pmu*, but it serves more than just reading.
-According to the context, the *read* function not only reads the content of the
-counter (event->count), but also updates the left period to the next interrupt
-(event->hw.period_left).
-
-But the core of perf does not need direct write to counters.  Writing counters
-is hidden behind the abstraction of 1) *pmu->start*, literally start counting so one
-has to set the counter to a good value for the next interrupt; 2) inside the IRQ
-it should set the counter to the same resonable value.
-
-Reading is not a problem in RISC-V but writing would need some effort, since
-counters are not allowed to be written by S-mode.
-
-
-5. add()/del()/start()/stop()
------------------------------
-
-Basic idea: add()/del() adds/deletes events to/from a PMU, and start()/stop()
-starts/stop the counter of some event in the PMU.  All of them take the same
-arguments: *struct perf_event *event* and *int flag*.
-
-Consider perf as a state machine, then you will find that these functions serve
-as the state transition process between those states.
-Three states (event->hw.state) are defined:
-
-* PERF_HES_STOPPED:	the counter is stopped
-* PERF_HES_UPTODATE:	the event->count is up-to-date
-* PERF_HES_ARCH:	arch-dependent usage ... we don't need this for now
-
-A normal flow of these state transitions are as follows:
-
-* A user launches a perf event, resulting in calling to *event_init*.
-* When being context-switched in, *add* is called by the perf core, with a flag
-  PERF_EF_START, which means that the event should be started after it is added.
-  At this stage, a general event is bound to a physical counter, if any.
-  The state changes to PERF_HES_STOPPED and PERF_HES_UPTODATE, because it is now
-  stopped, and the (software) event count does not need updating.
-** *start* is then called, and the counter is enabled.
-   With flag PERF_EF_RELOAD, it writes an appropriate value to the counter (check
-   previous section for detail).
-   Nothing is written if the flag does not contain PERF_EF_RELOAD.
-   The state now is reset to none, because it is neither stopped nor updated
-   (the counting already started)
-* When being context-switched out, *del* is called.  It then checks out all the
-  events in the PMU and calls *stop* to update their counts.
-** *stop* is called by *del*
-   and the perf core with flag PERF_EF_UPDATE, and it often shares the same
-   subroutine as *read* with the same logic.
-   The state changes to PERF_HES_STOPPED and PERF_HES_UPTODATE, again.
-
-** Life cycle of these two pairs: *add* and *del* are called repeatedly as
-  tasks switch in-and-out; *start* and *stop* is also called when the perf core
-  needs a quick stop-and-start, for instance, when the interrupt period is being
-  adjusted.
-
-Current implementation is sufficient for now and can be easily extended to
-features in the future.
-
-A. Related Structures
----------------------
-
-* struct pmu: include/linux/perf_event.h
-* struct riscv_pmu: arch/riscv/include/asm/perf_event.h
-
-  Both structures are designed to be read-only.
-
-  *struct pmu* defines some function pointer interfaces, and most of them take
-*struct perf_event* as a main argument, dealing with perf events according to
-perf's internal state machine (check kernel/events/core.c for details).
-
-  *struct riscv_pmu* defines PMU-specific parameters.  The naming follows the
-convention of all other architectures.
-
-* struct perf_event: include/linux/perf_event.h
-* struct hw_perf_event
-
-  The generic structure that represents perf events, and the hardware-related
-details.
-
-* struct riscv_hw_events: arch/riscv/include/asm/perf_event.h
-
-  The structure that holds the status of events, has two fixed members:
-the number of events and the array of the events.
-
-References
-----------
-
-[1] https://github.com/riscv/riscv-linux/pull/124
-[2] https://groups.google.com/a/groups.riscv.org/forum/#!topic/sw-dev/f19TmCNP6yA
diff --git a/Documentation/s390/3270.rst b/Documentation/s390/3270.rst
new file mode 100644
index 000000000000..e09e77954238
--- /dev/null
+++ b/Documentation/s390/3270.rst
@@ -0,0 +1,298 @@
+===============================
+IBM 3270 Display System support
+===============================
+
+This file describes the driver that supports local channel attachment
+of IBM 3270 devices.  It consists of three sections:
+
+	* Introduction
+	* Installation
+	* Operation
+
+
+Introduction
+============
+
+This paper describes installing and operating 3270 devices under
+Linux/390.  A 3270 device is a block-mode rows-and-columns terminal of
+which I'm sure hundreds of millions were sold by IBM and clonemakers
+twenty and thirty years ago.
+
+You may have 3270s in-house and not know it.  If you're using the
+VM-ESA operating system, define a 3270 to your virtual machine by using
+the command "DEF GRAF <hex-address>"  This paper presumes you will be
+defining four 3270s with the CP/CMS commands:
+
+	- DEF GRAF 620
+	- DEF GRAF 621
+	- DEF GRAF 622
+	- DEF GRAF 623
+
+Your network connection from VM-ESA allows you to use x3270, tn3270, or
+another 3270 emulator, started from an xterm window on your PC or
+workstation.  With the DEF GRAF command, an application such as xterm,
+and this Linux-390 3270 driver, you have another way of talking to your
+Linux box.
+
+This paper covers installation of the driver and operation of a
+dialed-in x3270.
+
+
+Installation
+============
+
+You install the driver by installing a patch, doing a kernel build, and
+running the configuration script (config3270.sh, in this directory).
+
+WARNING:  If you are using 3270 console support, you must rerun the
+configuration script every time you change the console's address (perhaps
+by using the condev= parameter in silo's /boot/parmfile).  More precisely,
+you should rerun the configuration script every time your set of 3270s,
+including the console 3270, changes subchannel identifier relative to
+one another.  ReIPL as soon as possible after running the configuration
+script and the resulting /tmp/mkdev3270.
+
+If you have chosen to make tub3270 a module, you add a line to a
+configuration file under /etc/modprobe.d/.  If you are working on a VM
+virtual machine, you can use DEF GRAF to define virtual 3270 devices.
+
+You may generate both 3270 and 3215 console support, or one or the
+other, or neither.  If you generate both, the console type under VM is
+not changed.  Use #CP Q TERM to see what the current console type is.
+Use #CP TERM CONMODE 3270 to change it to 3270.  If you generate only
+3270 console support, then the driver automatically converts your console
+at boot time to a 3270 if it is a 3215.
+
+In brief, these are the steps:
+
+	1. Install the tub3270 patch
+	2. (If a module) add a line to a file in `/etc/modprobe.d/*.conf`
+	3. (If VM) define devices with DEF GRAF
+	4. Reboot
+	5. Configure
+
+To test that everything works, assuming VM and x3270,
+
+	1. Bring up an x3270 window.
+	2. Use the DIAL command in that window.
+	3. You should immediately see a Linux login screen.
+
+Here are the installation steps in detail:
+
+	1.  The 3270 driver is a part of the official Linux kernel
+	source.  Build a tree with the kernel source and any necessary
+	patches.  Then do::
+
+		make oldconfig
+		(If you wish to disable 3215 console support, edit
+		.config; change CONFIG_TN3215's value to "n";
+		and rerun "make oldconfig".)
+		make image
+		make modules
+		make modules_install
+
+	2. (Perform this step only if you have configured tub3270 as a
+	module.)  Add a line to a file `/etc/modprobe.d/*.conf` to automatically
+	load the driver when it's needed.  With this line added, you will see
+	login prompts appear on your 3270s as soon as boot is complete (or
+	with emulated 3270s, as soon as you dial into your vm guest using the
+	command "DIAL <vmguestname>").  Since the line-mode major number is
+	227, the line to add should be::
+
+		alias char-major-227 tub3270
+
+	3. Define graphic devices to your vm guest machine, if you
+	haven't already.  Define them before you reboot (reipl):
+
+		- DEFINE GRAF 620
+		- DEFINE GRAF 621
+		- DEFINE GRAF 622
+		- DEFINE GRAF 623
+
+	4. Reboot.  The reboot process scans hardware devices, including
+	3270s, and this enables the tub3270 driver once loaded to respond
+	correctly to the configuration requests of the next step.  If
+	you have chosen 3270 console support, your console now behaves
+	as a 3270, not a 3215.
+
+	5. Run the 3270 configuration script config3270.  It is
+	distributed in this same directory, Documentation/s390, as
+	config3270.sh.  Inspect the output script it produces,
+	/tmp/mkdev3270, and then run that script.  This will create the
+	necessary character special device files and make the necessary
+	changes to /etc/inittab.
+
+	Then notify /sbin/init that /etc/inittab has changed, by issuing
+	the telinit command with the q operand::
+
+		cd Documentation/s390
+		sh config3270.sh
+		sh /tmp/mkdev3270
+		telinit q
+
+	This should be sufficient for your first time.  If your 3270
+	configuration has changed and you're reusing config3270, you
+	should follow these steps::
+
+		Change 3270 configuration
+		Reboot
+		Run config3270 and /tmp/mkdev3270
+		Reboot
+
+Here are the testing steps in detail:
+
+	1. Bring up an x3270 window, or use an actual hardware 3278 or
+	3279, or use the 3270 emulator of your choice.  You would be
+	running the emulator on your PC or workstation.  You would use
+	the command, for example::
+
+		x3270 vm-esa-domain-name &
+
+	if you wanted a 3278 Model 4 with 43 rows of 80 columns, the
+	default model number.  The driver does not take advantage of
+	extended attributes.
+
+	The screen you should now see contains a VM logo with input
+	lines near the bottom.  Use TAB to move to the bottom line,
+	probably labeled "COMMAND  ===>".
+
+	2. Use the DIAL command instead of the LOGIN command to connect
+	to one of the virtual 3270s you defined with the DEF GRAF
+	commands::
+
+		dial my-vm-guest-name
+
+	3. You should immediately see a login prompt from your
+	Linux-390 operating system.  If that does not happen, you would
+	see instead the line "DIALED TO my-vm-guest-name   0620".
+
+	To troubleshoot:  do these things.
+
+	A. Is the driver loaded?  Use the lsmod command (no operands)
+	to find out.  Probably it isn't.  Try loading it manually, with
+	the command "insmod tub3270".  Does that command give error
+	messages?  Ha!  There's your problem.
+
+	B. Is the /etc/inittab file modified as in installation step 3
+	above?  Use the grep command to find out; for instance, issue
+	"grep 3270 /etc/inittab".  Nothing found?  There's your
+	problem!
+
+	C. Are the device special files created, as in installation
+	step 2 above?  Use the ls -l command to find out; for instance,
+	issue "ls -l /dev/3270/tty620".  The output should start with the
+	letter "c" meaning character device and should contain "227, 1"
+	just to the left of the device name.  No such file?  no "c"?
+	Wrong major number?  Wrong minor number?  There's your
+	problem!
+
+	D. Do you get the message::
+
+		 "HCPDIA047E my-vm-guest-name 0620 does not exist"?
+
+	If so, you must issue the command "DEF GRAF 620" from your VM
+	3215 console and then reboot the system.
+
+
+
+OPERATION.
+==========
+
+The driver defines three areas on the 3270 screen:  the log area, the
+input area, and the status area.
+
+The log area takes up all but the bottom two lines of the screen.  The
+driver writes terminal output to it, starting at the top line and going
+down.  When it fills, the status area changes from "Linux Running" to
+"Linux More...".  After a scrolling timeout of (default) 5 sec, the
+screen clears and more output is written, from the top down.
+
+The input area extends from the beginning of the second-to-last screen
+line to the start of the status area.  You type commands in this area
+and hit ENTER to execute them.
+
+The status area initializes to "Linux Running" to give you a warm
+fuzzy feeling.  When the log area fills up and output awaits, it
+changes to "Linux More...".  At this time you can do several things or
+nothing.  If you do nothing, the screen will clear in (default) 5 sec
+and more output will appear.  You may hit ENTER with nothing typed in
+the input area to toggle between "Linux More..." and "Linux Holding",
+which indicates no scrolling will occur.  (If you hit ENTER with "Linux
+Running" and nothing typed, the application receives a newline.)
+
+You may change the scrolling timeout value.  For example, the following
+command line::
+
+	echo scrolltime=60 > /proc/tty/driver/tty3270
+
+changes the scrolling timeout value to 60 sec.  Set scrolltime to 0 if
+you wish to prevent scrolling entirely.
+
+Other things you may do when the log area fills up are:  hit PA2 to
+clear the log area and write more output to it, or hit CLEAR to clear
+the log area and the input area and write more output to the log area.
+
+Some of the Program Function (PF) and Program Attention (PA) keys are
+preassigned special functions.  The ones that are not yield an alarm
+when pressed.
+
+PA1 causes a SIGINT to the currently running application.  You may do
+the same thing from the input area, by typing "^C" and hitting ENTER.
+
+PA2 causes the log area to be cleared.  If output awaits, it is then
+written to the log area.
+
+PF3 causes an EOF to be received as input by the application.  You may
+cause an EOF also by typing "^D" and hitting ENTER.
+
+No PF key is preassigned to cause a job suspension, but you may cause a
+job suspension by typing "^Z" and hitting ENTER.  You may wish to
+assign this function to a PF key.  To make PF7 cause job suspension,
+execute the command::
+
+	echo pf7=^z > /proc/tty/driver/tty3270
+
+If the input you type does not end with the two characters "^n", the
+driver appends a newline character and sends it to the tty driver;
+otherwise the driver strips the "^n" and does not append a newline.
+The IBM 3215 driver behaves similarly.
+
+Pf10 causes the most recent command to be retrieved from the tube's
+command stack (default depth 20) and displayed in the input area.  You
+may hit PF10 again for the next-most-recent command, and so on.  A
+command is entered into the stack only when the input area is not made
+invisible (such as for password entry) and it is not identical to the
+current top entry.  PF10 rotates backward through the command stack;
+PF11 rotates forward.  You may assign the backward function to any PF
+key (or PA key, for that matter), say, PA3, with the command::
+
+	echo -e pa3=\\033k > /proc/tty/driver/tty3270
+
+This assigns the string ESC-k to PA3.  Similarly, the string ESC-j
+performs the forward function.  (Rationale:  In bash with vi-mode line
+editing, ESC-k and ESC-j retrieve backward and forward history.
+Suggestions welcome.)
+
+Is a stack size of twenty commands not to your liking?  Change it on
+the fly.  To change to saving the last 100 commands, execute the
+command::
+
+	echo recallsize=100 > /proc/tty/driver/tty3270
+
+Have a command you issue frequently?  Assign it to a PF or PA key!  Use
+the command::
+
+	echo pf24="mkdir foobar; cd foobar" > /proc/tty/driver/tty3270
+
+to execute the commands mkdir foobar and cd foobar immediately when you
+hit PF24.  Want to see the command line first, before you execute it?
+Use the -n option of the echo command::
+
+	echo -n pf24="mkdir foo; cd foo" > /proc/tty/driver/tty3270
+
+
+
+Happy testing!  I welcome any and all comments about this document, the
+driver, etc etc.
+
+Dick Hitt <rbh00@utsglobal.com>
diff --git a/Documentation/s390/3270.txt b/Documentation/s390/3270.txt
deleted file mode 100644
index 7c715de99774..000000000000
--- a/Documentation/s390/3270.txt
+++ /dev/null
@@ -1,271 +0,0 @@
-IBM 3270 Display System support
-
-This file describes the driver that supports local channel attachment
-of IBM 3270 devices.  It consists of three sections:
-	* Introduction
-	* Installation
-	* Operation
-
-
-INTRODUCTION.
-
-This paper describes installing and operating 3270 devices under
-Linux/390.  A 3270 device is a block-mode rows-and-columns terminal of
-which I'm sure hundreds of millions were sold by IBM and clonemakers
-twenty and thirty years ago.
-
-You may have 3270s in-house and not know it.  If you're using the
-VM-ESA operating system, define a 3270 to your virtual machine by using
-the command "DEF GRAF <hex-address>"  This paper presumes you will be
-defining four 3270s with the CP/CMS commands
-
-	DEF GRAF 620
-	DEF GRAF 621
-	DEF GRAF 622
-	DEF GRAF 623
-
-Your network connection from VM-ESA allows you to use x3270, tn3270, or
-another 3270 emulator, started from an xterm window on your PC or
-workstation.  With the DEF GRAF command, an application such as xterm,
-and this Linux-390 3270 driver, you have another way of talking to your
-Linux box.
-
-This paper covers installation of the driver and operation of a
-dialed-in x3270.
-
-
-INSTALLATION.
-
-You install the driver by installing a patch, doing a kernel build, and
-running the configuration script (config3270.sh, in this directory).
-
-WARNING:  If you are using 3270 console support, you must rerun the
-configuration script every time you change the console's address (perhaps
-by using the condev= parameter in silo's /boot/parmfile).  More precisely,
-you should rerun the configuration script every time your set of 3270s,
-including the console 3270, changes subchannel identifier relative to
-one another.  ReIPL as soon as possible after running the configuration
-script and the resulting /tmp/mkdev3270.
-
-If you have chosen to make tub3270 a module, you add a line to a
-configuration file under /etc/modprobe.d/.  If you are working on a VM
-virtual machine, you can use DEF GRAF to define virtual 3270 devices.
-
-You may generate both 3270 and 3215 console support, or one or the
-other, or neither.  If you generate both, the console type under VM is
-not changed.  Use #CP Q TERM to see what the current console type is.
-Use #CP TERM CONMODE 3270 to change it to 3270.  If you generate only
-3270 console support, then the driver automatically converts your console
-at boot time to a 3270 if it is a 3215.
-
-In brief, these are the steps:
-	1. Install the tub3270 patch
-	2. (If a module) add a line to a file in /etc/modprobe.d/*.conf
-	3. (If VM) define devices with DEF GRAF
-	4. Reboot
-	5. Configure
-
-To test that everything works, assuming VM and x3270,
-	1. Bring up an x3270 window.
-	2. Use the DIAL command in that window.
-	3. You should immediately see a Linux login screen.
-
-Here are the installation steps in detail:
-
-	1.  The 3270 driver is a part of the official Linux kernel
-	source.  Build a tree with the kernel source and any necessary
-	patches.  Then do
-		make oldconfig
-		(If you wish to disable 3215 console support, edit
-		.config; change CONFIG_TN3215's value to "n";
-		and rerun "make oldconfig".)
-		make image
-		make modules
-		make modules_install
-
-	2. (Perform this step only if you have configured tub3270 as a
-	module.)  Add a line to a file /etc/modprobe.d/*.conf to automatically
-	load the driver when it's needed.  With this line added, you will see
-	login prompts appear on your 3270s as soon as boot is complete (or
-	with emulated 3270s, as soon as you dial into your vm guest using the
-	command "DIAL <vmguestname>").  Since the line-mode major number is
-	227, the line to add should be:
-		alias char-major-227 tub3270
-
-	3. Define graphic devices to your vm guest machine, if you
-	haven't already.  Define them before you reboot (reipl):
-		DEFINE GRAF 620
-		DEFINE GRAF 621
-		DEFINE GRAF 622
-		DEFINE GRAF 623
-
-	4. Reboot.  The reboot process scans hardware devices, including
-	3270s, and this enables the tub3270 driver once loaded to respond
-	correctly to the configuration requests of the next step.  If
-	you have chosen 3270 console support, your console now behaves
-	as a 3270, not a 3215.
-
-	5. Run the 3270 configuration script config3270.  It is
-	distributed in this same directory, Documentation/s390, as
-	config3270.sh.	Inspect the output script it produces,
-	/tmp/mkdev3270, and then run that script.  This will create the
-	necessary character special device files and make the necessary
-	changes to /etc/inittab.
-
-	Then notify /sbin/init that /etc/inittab has changed, by issuing
-	the telinit command with the q operand:
-		cd Documentation/s390
-		sh config3270.sh
-		sh /tmp/mkdev3270
-		telinit q
-
-	This should be sufficient for your first time.	If your 3270
-	configuration has changed and you're reusing config3270, you
-	should follow these steps:
-		Change 3270 configuration
-		Reboot
-		Run config3270 and /tmp/mkdev3270
-		Reboot
-
-Here are the testing steps in detail:
-
-	1. Bring up an x3270 window, or use an actual hardware 3278 or
-	3279, or use the 3270 emulator of your choice.  You would be
-	running the emulator on your PC or workstation.  You would use
-	the command, for example,
-		x3270 vm-esa-domain-name &
-	if you wanted a 3278 Model 4 with 43 rows of 80 columns, the
-	default model number.  The driver does not take advantage of
-	extended attributes.
-
-	The screen you should now see contains a VM logo with input
-	lines near the bottom.  Use TAB to move to the bottom line,
-	probably labeled "COMMAND  ===>".
-
-	2. Use the DIAL command instead of the LOGIN command to connect
-	to one of the virtual 3270s you defined with the DEF GRAF
-	commands:
-		dial my-vm-guest-name
-
-	3. You should immediately see a login prompt from your
-	Linux-390 operating system.  If that does not happen, you would
-	see instead the line "DIALED TO my-vm-guest-name   0620".
-
-	To troubleshoot:  do these things.
-
-	A. Is the driver loaded?  Use the lsmod command (no operands)
-	to find out.  Probably it isn't.  Try loading it manually, with
-	the command "insmod tub3270".  Does that command give error
-	messages?  Ha!  There's your problem.
-
-	B. Is the /etc/inittab file modified as in installation step 3
-	above?  Use the grep command to find out; for instance, issue
-	"grep 3270 /etc/inittab".  Nothing found?  There's your
-	problem!
-
-	C. Are the device special files created, as in installation
-	step 2 above?  Use the ls -l command to find out; for instance,
-	issue "ls -l /dev/3270/tty620".  The output should start with the
-	letter "c" meaning character device and should contain "227, 1"
-	just to the left of the device name.  No such file?  no "c"?
-	Wrong major number?  Wrong minor number?  There's your
-	problem!
-
-	D. Do you get the message
-		 "HCPDIA047E my-vm-guest-name 0620 does not exist"?
-	If so, you must issue the command "DEF GRAF 620" from your VM
-	3215 console and then reboot the system.
-
-
-
-OPERATION.
-
-The driver defines three areas on the 3270 screen:  the log area, the
-input area, and the status area.
-
-The log area takes up all but the bottom two lines of the screen.  The
-driver writes terminal output to it, starting at the top line and going
-down.  When it fills, the status area changes from "Linux Running" to
-"Linux More...".  After a scrolling timeout of (default) 5 sec, the
-screen clears and more output is written, from the top down.
-
-The input area extends from the beginning of the second-to-last screen
-line to the start of the status area.  You type commands in this area
-and hit ENTER to execute them.
-
-The status area initializes to "Linux Running" to give you a warm
-fuzzy feeling.  When the log area fills up and output awaits, it
-changes to "Linux More...".  At this time you can do several things or
-nothing.  If you do nothing, the screen will clear in (default) 5 sec
-and more output will appear.  You may hit ENTER with nothing typed in
-the input area to toggle between "Linux More..." and "Linux Holding",
-which indicates no scrolling will occur.  (If you hit ENTER with "Linux
-Running" and nothing typed, the application receives a newline.)
-
-You may change the scrolling timeout value.  For example, the following
-command line:
-	echo scrolltime=60 > /proc/tty/driver/tty3270
-changes the scrolling timeout value to 60 sec.  Set scrolltime to 0 if
-you wish to prevent scrolling entirely.
-
-Other things you may do when the log area fills up are:  hit PA2 to
-clear the log area and write more output to it, or hit CLEAR to clear
-the log area and the input area and write more output to the log area.
-
-Some of the Program Function (PF) and Program Attention (PA) keys are
-preassigned special functions.  The ones that are not yield an alarm
-when pressed.
-
-PA1 causes a SIGINT to the currently running application.  You may do
-the same thing from the input area, by typing "^C" and hitting ENTER.
-
-PA2 causes the log area to be cleared.  If output awaits, it is then
-written to the log area.
-
-PF3 causes an EOF to be received as input by the application.  You may
-cause an EOF also by typing "^D" and hitting ENTER.
-
-No PF key is preassigned to cause a job suspension, but you may cause a
-job suspension by typing "^Z" and hitting ENTER.  You may wish to
-assign this function to a PF key.  To make PF7 cause job suspension,
-execute the command:
-	echo pf7=^z > /proc/tty/driver/tty3270
-
-If the input you type does not end with the two characters "^n", the
-driver appends a newline character and sends it to the tty driver;
-otherwise the driver strips the "^n" and does not append a newline.
-The IBM 3215 driver behaves similarly.
-
-Pf10 causes the most recent command to be retrieved from the tube's
-command stack (default depth 20) and displayed in the input area.  You
-may hit PF10 again for the next-most-recent command, and so on.  A
-command is entered into the stack only when the input area is not made
-invisible (such as for password entry) and it is not identical to the
-current top entry.  PF10 rotates backward through the command stack;
-PF11 rotates forward.  You may assign the backward function to any PF
-key (or PA key, for that matter), say, PA3, with the command:
-	echo -e pa3=\\033k > /proc/tty/driver/tty3270
-This assigns the string ESC-k to PA3.  Similarly, the string ESC-j
-performs the forward function.  (Rationale:  In bash with vi-mode line
-editing, ESC-k and ESC-j retrieve backward and forward history.
-Suggestions welcome.)
-
-Is a stack size of twenty commands not to your liking?  Change it on
-the fly.  To change to saving the last 100 commands, execute the
-command:
-	echo recallsize=100 > /proc/tty/driver/tty3270
-
-Have a command you issue frequently?  Assign it to a PF or PA key!  Use
-the command
-	echo pf24="mkdir foobar; cd foobar" > /proc/tty/driver/tty3270 
-to execute the commands mkdir foobar and cd foobar immediately when you
-hit PF24.  Want to see the command line first, before you execute it?
-Use the -n option of the echo command:
-	echo -n pf24="mkdir foo; cd foo" > /proc/tty/driver/tty3270
-
-
-
-Happy testing!  I welcome any and all comments about this document, the
-driver, etc etc.
-
-Dick Hitt <rbh00@utsglobal.com>
diff --git a/Documentation/s390/CommonIO b/Documentation/s390/CommonIO
deleted file mode 100644
index 6e0f63f343b4..000000000000
--- a/Documentation/s390/CommonIO
+++ /dev/null
@@ -1,125 +0,0 @@
-S/390 common I/O-Layer - command line parameters, procfs and debugfs entries
-============================================================================
-
-Command line parameters
------------------------
-
-* ccw_timeout_log
-
-  Enable logging of debug information in case of ccw device timeouts.
-
-* cio_ignore = device[,device[,..]]
-
-	device := {all | [!]ipldev | [!]condev | [!]<devno> | [!]<devno>-<devno>}
-
-  The given devices will be ignored by the common I/O-layer; no detection
-  and device sensing will be done on any of those devices. The subchannel to 
-  which the device in question is attached will be treated as if no device was
-  attached.
-
-  An ignored device can be un-ignored later; see the "/proc entries"-section for
-  details.
-
-  The devices must be given either as bus ids (0.x.abcd) or as hexadecimal
-  device numbers (0xabcd or abcd, for 2.4 backward compatibility). If you
-  give a device number 0xabcd, it will be interpreted as 0.0.abcd.
-
-  You can use the 'all' keyword to ignore all devices. The 'ipldev' and 'condev'
-  keywords can be used to refer to the CCW based boot device and CCW console
-  device respectively (these are probably useful only when combined with the '!'
-  operator). The '!' operator will cause the I/O-layer to _not_ ignore a device.
-  The command line is parsed from left to right.
-
-  For example, 
-	cio_ignore=0.0.0023-0.0.0042,0.0.4711
-  will ignore all devices ranging from 0.0.0023 to 0.0.0042 and the device
-  0.0.4711, if detected.
-  As another example,
-	cio_ignore=all,!0.0.4711,!0.0.fd00-0.0.fd02
-  will ignore all devices but 0.0.4711, 0.0.fd00, 0.0.fd01, 0.0.fd02.
-
-  By default, no devices are ignored.
-
-
-/proc entries
--------------
-
-* /proc/cio_ignore
-
-  Lists the ranges of devices (by bus id) which are ignored by common I/O.
-
-  You can un-ignore certain or all devices by piping to /proc/cio_ignore. 
-  "free all" will un-ignore all ignored devices, 
-  "free <device range>, <device range>, ..." will un-ignore the specified
-  devices.
-
-  For example, if devices 0.0.0023 to 0.0.0042 and 0.0.4711 are ignored,
-  - echo free 0.0.0030-0.0.0032 > /proc/cio_ignore
-    will un-ignore devices 0.0.0030 to 0.0.0032 and will leave devices 0.0.0023
-    to 0.0.002f, 0.0.0033 to 0.0.0042 and 0.0.4711 ignored;
-  - echo free 0.0.0041 > /proc/cio_ignore will furthermore un-ignore device
-    0.0.0041;
-  - echo free all > /proc/cio_ignore will un-ignore all remaining ignored 
-    devices.
-
-  When a device is un-ignored, device recognition and sensing is performed and 
-  the device driver will be notified if possible, so the device will become
-  available to the system. Note that un-ignoring is performed asynchronously.
-
-  You can also add ranges of devices to be ignored by piping to 
-  /proc/cio_ignore; "add <device range>, <device range>, ..." will ignore the
-  specified devices.
-
-  Note: While already known devices can be added to the list of devices to be
-        ignored, there will be no effect on then. However, if such a device
-	disappears and then reappears, it will then be ignored. To make
-	known devices go away, you need the "purge" command (see below).
-
-  For example,
-	"echo add 0.0.a000-0.0.accc, 0.0.af00-0.0.afff > /proc/cio_ignore"
-  will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored
-  devices.
-
-  You can remove already known but now ignored devices via
-	"echo purge > /proc/cio_ignore"
-  All devices ignored but still registered and not online (= not in use)
-  will be deregistered and thus removed from the system.
-
-  The devices can be specified either by bus id (0.x.abcd) or, for 2.4 backward
-  compatibility, by the device number in hexadecimal (0xabcd or abcd). Device
-  numbers given as 0xabcd will be interpreted as 0.0.abcd.
-
-* /proc/cio_settle
-
-  A write request to this file is blocked until all queued cio actions are
-  handled. This will allow userspace to wait for pending work affecting
-  device availability after changing cio_ignore or the hardware configuration.
-
-* For some of the information present in the /proc filesystem in 2.4 (namely,
-  /proc/subchannels and /proc/chpids), see driver-model.txt.
-  Information formerly in /proc/irq_count is now in /proc/interrupts.
-
-
-debugfs entries
----------------
-
-* /sys/kernel/debug/s390dbf/cio_*/ (S/390 debug feature)
-
-  Some views generated by the debug feature to hold various debug outputs.
-
-  - /sys/kernel/debug/s390dbf/cio_crw/sprintf
-    Messages from the processing of pending channel report words (machine check
-    handling).
-
-  - /sys/kernel/debug/s390dbf/cio_msg/sprintf
-    Various debug messages from the common I/O-layer.
-
-  - /sys/kernel/debug/s390dbf/cio_trace/hex_ascii
-    Logs the calling of functions in the common I/O-layer and, if applicable, 
-    which subchannel they were called for, as well as dumps of some data
-    structures (like irb in an error case).
-
-  The level of logging can be changed to be more or less verbose by piping to 
-  /sys/kernel/debug/s390dbf/cio_*/level a number between 0 and 6; see the
-  documentation on the S/390 debug feature (Documentation/s390/s390dbf.txt)
-  for details.
diff --git a/Documentation/s390/DASD b/Documentation/s390/DASD
deleted file mode 100644
index 9963f1e9c98a..000000000000
--- a/Documentation/s390/DASD
+++ /dev/null
@@ -1,73 +0,0 @@
-DASD device driver
-
-S/390's disk devices (DASDs) are managed by Linux via the DASD device
-driver. It is valid for all types of DASDs and represents them to
-Linux as block devices, namely "dd". Currently the DASD driver uses a
-single major number (254) and 4 minor numbers per volume (1 for the
-physical volume and 3 for partitions). With respect to partitions see
-below. Thus you may have up to 64 DASD devices in your system.
-
-The kernel parameter 'dasd=from-to,...' may be issued arbitrary times
-in the kernel's parameter line or not at all. The 'from' and 'to'
-parameters are to be given in hexadecimal notation without a leading
-0x.
-If you supply kernel parameters the different instances are processed
-in order of appearance and a minor number is reserved for any device
-covered by the supplied range up to 64 volumes. Additional DASDs are
-ignored. If you do not supply the 'dasd=' kernel parameter at all, the 
-DASD driver registers all supported DASDs of your system to a minor
-number in ascending order of the subchannel number.
-
-The driver currently supports ECKD-devices and there are stubs for
-support of the FBA and CKD architectures. For the FBA architecture
-only some smart data structures are missing to make the support
-complete. 
-We performed our testing on 3380 and 3390 type disks of different
-sizes, under VM and on the bare hardware (LPAR), using internal disks
-of the multiprise as well as a RAMAC virtual array. Disks exported by
-an Enterprise Storage Server (Seascape) should work fine as well.
-
-We currently implement one partition per volume, which is the whole
-volume, skipping the first blocks up to the volume label. These are
-reserved for IPL records and IBM's volume label to assure
-accessibility of the DASD from other OSs. In a later stage we will
-provide support of partitions, maybe VTOC oriented or using a kind of
-partition table in the label record.
-
-USAGE
-
--Low-level format (?CKD only)
-For using an ECKD-DASD as a Linux harddisk you have to low-level
-format the tracks by issuing the BLKDASDFORMAT-ioctl on that
-device. This will erase any data on that volume including IBM volume
-labels, VTOCs etc. The ioctl may take a 'struct format_data *' or
-'NULL' as an argument.  
-typedef struct {
-	int start_unit;
-	int stop_unit;
-	int blksize;
-} format_data_t;
-When a NULL argument is passed to the BLKDASDFORMAT ioctl the whole
-disk is formatted to a blocksize of 1024 bytes. Otherwise start_unit
-and stop_unit are the first and last track to be formatted. If
-stop_unit is -1 it implies that the DASD is formatted from start_unit
-up to the last track. blksize can be any power of two between 512 and
-4096. We recommend no blksize lower than 1024 because the ext2fs uses
-1kB blocks anyway and you gain approx. 50% of capacity increasing your
-blksize from 512 byte to 1kB.
-
--Make a filesystem
-Then you can mk??fs the filesystem of your choice on that volume or
-partition. For reasons of sanity you should build your filesystem on
-the partition /dev/dd?1 instead of the whole volume. You only lose 3kB	
-but may be sure that you can reuse your data after introduction of a
-real partition table.
-
-BUGS:
-- Performance sometimes is rather low because we don't fully exploit clustering
-
-TODO-List:
-- Add IBM'S Disk layout to genhd
-- Enhance driver to use more than one major number
-- Enable usage as a module
-- Support Cache fast write and DASD fast write (ECKD)
diff --git a/Documentation/s390/Debugging390.txt b/Documentation/s390/Debugging390.txt
deleted file mode 100644
index 5ae7f868a007..000000000000
--- a/Documentation/s390/Debugging390.txt
+++ /dev/null
@@ -1,2142 +0,0 @@
-
-		  Debugging on Linux for s/390 & z/Architecture
-				       by
-	  Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com)
-    Copyright (C) 2000-2001 IBM Deutschland Entwicklung GmbH, IBM Corporation
-			Best viewed with fixed width fonts
-
-Overview of Document:
-=====================
-This document is intended to give a good overview of how to debug Linux for
-s/390 and z/Architecture. It is not intended as a complete reference and not a
-tutorial on the fundamentals of C & assembly. It doesn't go into
-390 IO in any detail. It is intended to complement the documents in the
-reference section below & any other worthwhile references you get.
-
-It is intended like the Enterprise Systems Architecture/390 Reference Summary
-to be printed out & used as a quick cheat sheet self help style reference when
-problems occur.
-
-Contents
-========
-Register Set
-Address Spaces on Intel Linux
-Address Spaces on Linux for s/390 & z/Architecture
-The Linux for s/390 & z/Architecture Kernel Task Structure
-Register Usage & Stackframes on Linux for s/390 & z/Architecture
-A sample program with comments
-Compiling programs for debugging on Linux for s/390 & z/Architecture
-Debugging under VM
-s/390 & z/Architecture IO Overview
-Debugging IO on s/390 & z/Architecture under VM
-GDB on s/390 & z/Architecture
-Stack chaining in gdb by hand
-Examining core dumps
-ldd
-Debugging modules
-The proc file system
-SysRq
-References
-Special Thanks
-
-Register Set
-============
-The current architectures have the following registers.
- 
-16 General propose registers, 32 bit on s/390 and 64 bit on z/Architecture,
-r0-r15 (or gpr0-gpr15), used for arithmetic and addressing.
-
-16 Control registers, 32 bit on s/390 and 64 bit on z/Architecture, cr0-cr15,
-kernel usage only, used for memory management, interrupt control, debugging
-control etc.
-
-16 Access registers (ar0-ar15), 32 bit on both s/390 and z/Architecture,
-normally not used by normal programs but potentially could be used as
-temporary storage. These registers have a 1:1 association with general
-purpose registers and are designed to be used in the so-called access
-register mode to select different address spaces.
-Access register 0 (and access register 1 on z/Architecture, which needs a
-64 bit pointer) is currently used by the pthread library as a pointer to
-the current running threads private area.
-
-16 64 bit floating point registers (fp0-fp15 ) IEEE & HFP floating 
-point format compliant on G5 upwards & a Floating point control reg (FPC) 
-4  64 bit registers (fp0,fp2,fp4 & fp6) HFP only on older machines.
-Note:
-Linux (currently) always uses IEEE & emulates G5 IEEE format on older machines,
-( provided the kernel is configured for this ).
-
-
-The PSW is the most important register on the machine it
-is 64 bit on s/390 & 128 bit on z/Architecture & serves the roles of 
-a program counter (pc), condition code register,memory space designator.
-In IBM standard notation I am counting bit 0 as the MSB.
-It has several advantages over a normal program counter
-in that you can change address translation & program counter 
-in a single instruction. To change address translation,
-e.g. switching address translation off requires that you
-have a logical=physical mapping for the address you are
-currently running at.
-
-      Bit           Value
-s/390 z/Architecture
-0       0     Reserved ( must be 0 ) otherwise specification exception occurs.
-
-1       1     Program Event Recording 1 PER enabled, 
-	      PER is used to facilitate debugging e.g. single stepping.
-
-2-4    2-4    Reserved ( must be 0 ). 
-
-5       5     Dynamic address translation 1=DAT on.
-
-6       6     Input/Output interrupt Mask
-
-7	7     External interrupt Mask used primarily for interprocessor
-	      signalling and clock interrupts.
-
-8-11  8-11    PSW Key used for complex memory protection mechanism
-	      (not used under linux)
-
-12      12    1 on s/390 0 on z/Architecture
-
-13      13    Machine Check Mask 1=enable machine check interrupts
-
-14	14    Wait State. Set this to 1 to stop the processor except for
-	      interrupts and give  time to other LPARS. Used in CPU idle in
-	      the kernel to increase overall usage of processor resources.
-
-15      15    Problem state ( if set to 1 certain instructions are disabled )
-	      all linux user programs run with this bit 1 
-	      ( useful info for debugging under VM ).
-
-16-17 16-17   Address Space Control
-
-	      00 Primary Space Mode:
-	      The register CR1 contains the primary address-space control ele-
-	      ment (PASCE), which points to the primary space region/segment
-	      table origin.
-
-	      01 Access register mode
-
-	      10 Secondary Space Mode:
-	      The register CR7 contains the secondary address-space control
-	      element (SASCE), which points to the secondary space region or
-	      segment table origin.
-
-	      11 Home Space Mode:
-	      The register CR13 contains the home space address-space control
-	      element (HASCE), which points to the home space region/segment
-	      table origin.
-
-	      See "Address Spaces on Linux for s/390 & z/Architecture" below
-	      for more information about address space usage in Linux.
-
-18-19 18-19   Condition codes (CC)
-
-20    20      Fixed point overflow mask if 1=FPU exceptions for this event 
-              occur ( normally 0 ) 
-
-21    21      Decimal overflow mask if 1=FPU exceptions for this event occur 
-              ( normally 0 )
-
-22    22      Exponent underflow mask if 1=FPU exceptions for this event occur 
-              ( normally 0 )
-
-23    23      Significance Mask if 1=FPU exceptions for this event occur 
-              ( normally 0 )
-
-24-31 24-30   Reserved Must be 0.
-
-      31      Extended Addressing Mode
-      32      Basic Addressing Mode
-              Used to set addressing mode
-	      PSW 31   PSW 32
-                0         0        24 bit
-                0         1        31 bit
-                1         1        64 bit
-
-32             1=31 bit addressing mode 0=24 bit addressing mode (for backward 
-               compatibility), linux always runs with this bit set to 1
-
-33-64          Instruction address.
-      33-63    Reserved must be 0
-      64-127   Address
-               In 24 bits mode bits 64-103=0 bits 104-127 Address 
-               In 31 bits mode bits 64-96=0 bits 97-127 Address
-               Note: unlike 31 bit mode on s/390 bit 96 must be zero
-	       when loading the address with LPSWE otherwise a 
-               specification exception occurs, LPSW is fully backward
-               compatible.
-
-
-Prefix Page(s)
---------------
-This per cpu memory area is too intimately tied to the processor not to mention.
-It exists between the real addresses 0-4096 on s/390 and between 0-8192 on
-z/Architecture and is exchanged with one page on s/390 or two pages on
-z/Architecture in absolute storage by the set prefix instruction during Linux
-startup.
-This page is mapped to a different prefix for each processor in an SMP
-configuration (assuming the OS designer is sane of course).
-Bytes 0-512 (200 hex) on s/390 and 0-512, 4096-4544, 4604-5119 currently on
-z/Architecture are used by the processor itself for holding such information
-as exception indications and entry points for exceptions.
-Bytes after 0xc00 hex are used by linux for per processor globals on s/390 and
-z/Architecture (there is a gap on z/Architecture currently between 0xc00 and
-0x1000, too, which is used by Linux).
-The closest thing to this on traditional architectures is the interrupt
-vector table. This is a good thing & does simplify some of the kernel coding
-however it means that we now cannot catch stray NULL pointers in the
-kernel without hard coded checks.
-
-
-
-Address Spaces on Intel Linux
-=============================
-
-The traditional Intel Linux is approximately mapped as follows forgive
-the ascii art.
-0xFFFFFFFF 4GB Himem		*****************
-				*		*
-				* Kernel Space	*
-				*		*
-				*****************	  ****************
-User Space Himem		*  User Stack	*	  *		 *
-(typically 0xC0000000 3GB )	*****************	  *		 *
-				*  Shared Libs	*	  * Next Process *
-				*****************	  *	to	 *
-				*		*   <==   *	Run	 *  <==
-				*  User Program *	  *		 *
-				*   Data BSS	*	  *		 *
-				*    Text	*	  *		 *
-				*   Sections	*	  *		 *
-0x00000000			*****************	  ****************
-
-Now it is easy to see that on Intel it is quite easy to recognise a kernel
-address as being one greater than user space himem (in this case 0xC0000000),
-and addresses of less than this are the ones in the current running program on
-this processor (if an smp box).
-If using the virtual machine ( VM ) as a debugger it is quite difficult to
-know which user process is running as the address space you are looking at
-could be from any process in the run queue.
-
-The limitation of Intels addressing technique is that the linux
-kernel uses a very simple real address to virtual addressing technique
-of Real Address=Virtual Address-User Space Himem.
-This means that on Intel the kernel linux can typically only address
-Himem=0xFFFFFFFF-0xC0000000=1GB & this is all the RAM these machines
-can typically use.
-They can lower User Himem to 2GB or lower & thus be
-able to use 2GB of RAM however this shrinks the maximum size
-of User Space from 3GB to 2GB they have a no win limit of 4GB unless
-they go to 64 Bit.
-
-
-On 390 our limitations & strengths make us slightly different.
-For backward compatibility we are only allowed use 31 bits (2GB)
-of our 32 bit addresses, however, we use entirely separate address 
-spaces for the user & kernel.
-
-This means we can support 2GB of non Extended RAM on s/390, & more
-with the Extended memory management swap device & 
-currently 4TB of physical memory currently on z/Architecture.
-
-
-Address Spaces on Linux for s/390 & z/Architecture
-==================================================
-
-Our addressing scheme is basically as follows:
-
-				   Primary Space	       Home Space
-Himem 0x7fffffff 2GB on s/390    *****************          ****************
-currently 0x3ffffffffff (2^42)-1 *  User Stack   *          *              *
-on z/Architecture.		 *****************          *              *
-				 *  Shared Libs  *	    *		   *
-				 *****************	    *		   *
-			         *               *          *    Kernel    *  
-		                 *  User Program *          *              *
-		                 *   Data BSS    *          *              *
-                                 *    Text       *          *              *
-            			 *   Sections    *          *              *
-0x00000000                       *****************          ****************
-
-This also means that we need to look at the PSW problem state bit and the
-addressing mode to decide whether we are looking at user or kernel space.
-
-User space runs in primary address mode (or access register mode within
-the vdso code).
-
-The kernel usually also runs in home space mode, however when accessing
-user space the kernel switches to primary or secondary address mode if
-the mvcos instruction is not available or if a compare-and-swap (futex)
-instruction on a user space address is performed.
-
-When also looking at the ASCE control registers, this means:
-
-User space:
-- runs in primary or access register mode
-- cr1 contains the user asce
-- cr7 contains the user asce
-- cr13 contains the kernel asce
-
-Kernel space:
-- runs in home space mode
-- cr1 contains the user or kernel asce
-  -> the kernel asce is loaded when a uaccess requires primary or
-     secondary address mode
-- cr7 contains the user or kernel asce, (changed with set_fs())
-- cr13 contains the kernel asce
-
-In case of uaccess the kernel changes to:
-- primary space mode in case of a uaccess (copy_to_user) and uses
-  e.g. the mvcp instruction to access user space. However the kernel
-  will stay in home space mode if the mvcos instruction is available
-- secondary space mode in case of futex atomic operations, so that the
-  instructions come from primary address space and data from secondary
-  space
-
-In case of KVM, the kernel runs in home space mode, but cr1 gets switched
-to contain the gmap asce before the SIE instruction gets executed. When
-the SIE instruction is finished, cr1 will be switched back to contain the
-user asce.
-
-
-Virtual Addresses on s/390 & z/Architecture
-===========================================
-
-A virtual address on s/390 is made up of 3 parts
-The SX (segment index, roughly corresponding to the PGD & PMD in Linux
-terminology) being bits 1-11.
-The PX (page index, corresponding to the page table entry (pte) in Linux
-terminology) being bits 12-19.
-The remaining bits BX (the byte index are the offset in the page )
-i.e. bits 20 to 31.
-
-On z/Architecture in linux we currently make up an address from 4 parts.
-The region index bits (RX) 0-32 we currently use bits 22-32
-The segment index (SX) being bits 33-43
-The page index (PX) being bits  44-51
-The byte index (BX) being bits  52-63
-
-Notes:
-1) s/390 has no PMD so the PMD is really the PGD also.
-A lot of this stuff is defined in pgtable.h.
-
-2) Also seeing as s/390's page indexes are only 1k  in size 
-(bits 12-19 x 4 bytes per pte ) we use 1 ( page 4k )
-to make the best use of memory by updating 4 segment indices 
-entries each time we mess with a PMD & use offsets 
-0,1024,2048 & 3072 in this page as for our segment indexes.
-On z/Architecture our page indexes are now 2k in size
-( bits 12-19 x 8 bytes per pte ) we do a similar trick
-but only mess with 2 segment indices each time we mess with
-a PMD.
-
-3) As z/Architecture supports up to a massive 5-level page table lookup we
-can only use 3 currently on Linux ( as this is all the generic kernel
-currently supports ) however this may change in future
-this allows us to access ( according to my sums )
-4TB of virtual storage per process i.e.
-4096*512(PTES)*1024(PMDS)*2048(PGD) = 4398046511104 bytes,
-enough for another 2 or 3 of years I think :-).
-to do this we use a region-third-table designation type in
-our address space control registers.
- 
-
-The Linux for s/390 & z/Architecture Kernel Task Structure
-==========================================================
-Each process/thread under Linux for S390 has its own kernel task_struct
-defined in linux/include/linux/sched.h
-The S390 on initialisation & resuming of a process on a cpu sets
-the __LC_KERNEL_STACK variable in the spare prefix area for this cpu
-(which we use for per-processor globals).
-
-The kernel stack pointer is intimately tied with the task structure for
-each processor as follows.
-
-                      s/390
-            ************************
-            *  1 page kernel stack *
-	    *        ( 4K )        *
-            ************************
-            *   1 page task_struct *        
-            *        ( 4K )        *
-8K aligned  ************************ 
-
-                 z/Architecture
-            ************************
-            *  2 page kernel stack *
-	    *        ( 8K )        *
-            ************************
-            *  2 page task_struct  *        
-            *        ( 8K )        *
-16K aligned ************************ 
-
-What this means is that we don't need to dedicate any register or global
-variable to point to the current running process & can retrieve it with the
-following very simple construct for s/390 & one very similar for z/Architecture.
-
-static inline struct task_struct * get_current(void)
-{
-        struct task_struct *current;
-        __asm__("lhi   %0,-8192\n\t"
-                "nr    %0,15"
-                : "=r" (current) );
-        return current;
-}
-
-i.e. just anding the current kernel stack pointer with the mask -8192.
-Thankfully because Linux doesn't have support for nested IO interrupts
-& our devices have large buffers can survive interrupts being shut for 
-short amounts of time we don't need a separate stack for interrupts.
-
-
-
-
-Register Usage & Stackframes on Linux for s/390 & z/Architecture
-=================================================================
-Overview:
----------
-This is the code that gcc produces at the top & the bottom of
-each function. It usually is fairly consistent & similar from 
-function to function & if you know its layout you can probably
-make some headway in finding the ultimate cause of a problem
-after a crash without a source level debugger.
-
-Note: To follow stackframes requires a knowledge of C or Pascal &
-limited knowledge of one assembly language.
-
-It should be noted that there are some differences between the
-s/390 and z/Architecture stack layouts as the z/Architecture stack layout
-didn't have to maintain compatibility with older linkage formats.
-
-Glossary:
----------
-alloca:
-This is a built in compiler function for runtime allocation
-of extra space on the callers stack which is obviously freed
-up on function exit ( e.g. the caller may choose to allocate nothing
-of a buffer of 4k if required for temporary purposes ), it generates 
-very efficient code ( a few cycles  ) when compared to alternatives 
-like malloc.
-
-automatics: These are local variables on the stack,
-i.e they aren't in registers & they aren't static.
-
-back-chain:
-This is a pointer to the stack pointer before entering a
-framed functions ( see frameless function ) prologue got by 
-dereferencing the address of the current stack pointer,
- i.e. got by accessing the 32 bit value at the stack pointers
-current location.
-
-base-pointer:
-This is a pointer to the back of the literal pool which
-is an area just behind each procedure used to store constants
-in each function.
-
-call-clobbered: The caller probably needs to save these registers if there 
-is something of value in them, on the stack or elsewhere before making a 
-call to another procedure so that it can restore it later.
-
-epilogue:
-The code generated by the compiler to return to the caller.
-
-frameless-function
-A frameless function in Linux for s390 & z/Architecture is one which doesn't 
-need more than the register save area (96 bytes on s/390, 160 on z/Architecture)
-given to it by the caller.
-A frameless function never:
-1) Sets up a back chain.
-2) Calls alloca.
-3) Calls other normal functions
-4) Has automatics.
-
-GOT-pointer:
-This is a pointer to the global-offset-table in ELF
-( Executable Linkable Format, Linux'es most common executable format ),
-all globals & shared library objects are found using this pointer.
-
-lazy-binding
-ELF shared libraries are typically only loaded when routines in the shared
-library are actually first called at runtime. This is lazy binding.
-
-procedure-linkage-table
-This is a table found from the GOT which contains pointers to routines
-in other shared libraries which can't be called to by easier means.
-
-prologue:
-The code generated by the compiler to set up the stack frame.
-
-outgoing-args:
-This is extra area allocated on the stack of the calling function if the
-parameters for the callee's cannot all be put in registers, the same
-area can be reused by each function the caller calls.
-
-routine-descriptor:
-A COFF  executable format based concept of a procedure reference 
-actually being 8 bytes or more as opposed to a simple pointer to the routine.
-This is typically defined as follows
-Routine Descriptor offset 0=Pointer to Function
-Routine Descriptor offset 4=Pointer to Table of Contents
-The table of contents/TOC is roughly equivalent to a GOT pointer.
-& it means that shared libraries etc. can be shared between several
-environments each with their own TOC.
-
- 
-static-chain: This is used in nested functions a concept adopted from pascal 
-by gcc not used in ansi C or C++ ( although quite useful ), basically it
-is a pointer used to reference local variables of enclosing functions.
-You might come across this stuff once or twice in your lifetime.
-
-e.g.
-The function below should return 11 though gcc may get upset & toss warnings 
-about unused variables.
-int FunctionA(int a)
-{
-	int b;
-	FunctionC(int c)
-	{
-		b=c+1;
-	}
-	FunctionC(10);
-	return(b);
-}
-
-
-s/390 & z/Architecture Register usage
-=====================================
-r0       used by syscalls/assembly                  call-clobbered
-r1	 used by syscalls/assembly                  call-clobbered
-r2       argument 0 / return value 0                call-clobbered
-r3       argument 1 / return value 1 (if long long) call-clobbered
-r4       argument 2                                 call-clobbered
-r5       argument 3                                 call-clobbered
-r6	 argument 4				    saved
-r7       pointer-to arguments 5 to ...              saved      
-r8       this & that                                saved
-r9       this & that                                saved
-r10      static-chain ( if nested function )        saved
-r11      frame-pointer ( if function used alloca )  saved
-r12      got-pointer                                saved
-r13      base-pointer                               saved
-r14      return-address                             saved
-r15      stack-pointer                              saved
-
-f0       argument 0 / return value ( float/double ) call-clobbered
-f2       argument 1                                 call-clobbered
-f4       z/Architecture argument 2                  saved
-f6       z/Architecture argument 3                  saved
-The remaining floating points
-f1,f3,f5 f7-f15 are call-clobbered.
-
-Notes:
-------
-1) The only requirement is that registers which are used
-by the callee are saved, e.g. the compiler is perfectly
-capable of using r11 for purposes other than a frame a
-frame pointer if a frame pointer is not needed.
-2) In functions with variable arguments e.g. printf the calling procedure 
-is identical to one without variable arguments & the same number of 
-parameters. However, the prologue of this function is somewhat more
-hairy owing to it having to move these parameters to the stack to
-get va_start, va_arg & va_end to work.
-3) Access registers are currently unused by gcc but are used in
-the kernel. Possibilities exist to use them at the moment for
-temporary storage but it isn't recommended.
-4) Only 4 of the floating point registers are used for
-parameter passing as older machines such as G3 only have only 4
-& it keeps the stack frame compatible with other compilers.
-However with IEEE floating point emulation under linux on the
-older machines you are free to use the other 12.
-5) A long long or double parameter cannot be have the 
-first 4 bytes in a register & the second four bytes in the 
-outgoing args area. It must be purely in the outgoing args
-area if crossing this boundary.
-6) Floating point parameters are mixed with outgoing args
-on the outgoing args area in the order the are passed in as parameters.
-7) Floating point arguments 2 & 3 are saved in the outgoing args area for 
-z/Architecture
-
-
-Stack Frame Layout
-------------------
-s/390     z/Architecture
-0         0             back chain ( a 0 here signifies end of back chain )
-4         8             eos ( end of stack, not used on Linux for S390 used in other linkage formats )
-8         16            glue used in other s/390 linkage formats for saved routine descriptors etc.
-12        24            glue used in other s/390 linkage formats for saved routine descriptors etc.
-16        32            scratch area
-20        40            scratch area
-24        48            saved r6 of caller function
-28        56            saved r7 of caller function
-32        64            saved r8 of caller function
-36        72            saved r9 of caller function
-40        80            saved r10 of caller function
-44        88            saved r11 of caller function
-48        96            saved r12 of caller function
-52        104           saved r13 of caller function
-56        112           saved r14 of caller function
-60        120           saved r15 of caller function
-64        128           saved f4 of caller function
-72        132           saved f6 of caller function
-80                      undefined
-96        160           outgoing args passed from caller to callee
-96+x      160+x         possible stack alignment ( 8 bytes desirable )
-96+x+y    160+x+y       alloca space of caller ( if used )
-96+x+y+z  160+x+y+z     automatics of caller ( if used )
-0                       back-chain
-
-A sample program with comments.
-===============================
-
-Comments on the function test
------------------------------
-1) It didn't need to set up a pointer to the constant pool gpr13 as it is not
-used ( :-( ).
-2) This is a frameless function & no stack is bought.
-3) The compiler was clever enough to recognise that it could return the
-value in r2 as well as use it for the passed in parameter ( :-) ).
-4) The basr ( branch relative & save ) trick works as follows the instruction 
-has a special case with r0,r0 with some instruction operands is understood as 
-the literal value 0, some risc architectures also do this ). So now
-we are branching to the next address & the address new program counter is
-in r13,so now we subtract the size of the function prologue we have executed
-+ the size of the literal pool to get to the top of the literal pool
-0040037c int test(int b)
-{                                                          # Function prologue below
-  40037c:	90 de f0 34 	stm	%r13,%r14,52(%r15) # Save registers r13 & r14
-  400380:	0d d0       	basr	%r13,%r0           # Set up pointer to constant pool using
-  400382:	a7 da ff fa 	ahi	%r13,-6            # basr trick
-	return(5+b);
-	                                                   # Huge main program
-  400386:	a7 2a 00 05 	ahi	%r2,5              # add 5 to r2
-
-                                                           # Function epilogue below 
-  40038a:	98 de f0 34 	lm	%r13,%r14,52(%r15) # restore registers r13 & 14
-  40038e:	07 fe       	br	%r14               # return
-}
-
-Comments on the function main
------------------------------
-1) The compiler did this function optimally ( 8-) )
-
-Literal pool for main.
-400390:	ff ff ff ec 	.long 0xffffffec
-main(int argc,char *argv[])
-{                                                          # Function prologue below
-  400394:	90 bf f0 2c 	stm	%r11,%r15,44(%r15) # Save necessary registers
-  400398:	18 0f       	lr	%r0,%r15           # copy stack pointer to r0
-  40039a:	a7 fa ff a0 	ahi	%r15,-96           # Make area for callee saving 
-  40039e:	0d d0       	basr	%r13,%r0           # Set up r13 to point to
-  4003a0:	a7 da ff f0 	ahi	%r13,-16           # literal pool
-  4003a4:	50 00 f0 00 	st	%r0,0(%r15)        # Save backchain
-
-	return(test(5));                                   # Main Program Below
-  4003a8:	58 e0 d0 00 	l	%r14,0(%r13)       # load relative address of test from
-						           # literal pool
-  4003ac:	a7 28 00 05 	lhi	%r2,5              # Set first parameter to 5
-  4003b0:	4d ee d0 00 	bas	%r14,0(%r14,%r13)  # jump to test setting r14 as return
-							   # address using branch & save instruction.
-
-							   # Function Epilogue below
-  4003b4:	98 bf f0 8c 	lm	%r11,%r15,140(%r15)# Restore necessary registers.
-  4003b8:	07 fe       	br	%r14               # return to do program exit 
-}
-
-
-Compiler updates
-----------------
-
-main(int argc,char *argv[])
-{
-  4004fc:	90 7f f0 1c       	stm	%r7,%r15,28(%r15)
-  400500:	a7 d5 00 04       	bras	%r13,400508 <main+0xc>
-  400504:	00 40 04 f4       	.long	0x004004f4 
-  # compiler now puts constant pool in code to so it saves an instruction 
-  400508:	18 0f             	lr	%r0,%r15
-  40050a:	a7 fa ff a0       	ahi	%r15,-96
-  40050e:	50 00 f0 00       	st	%r0,0(%r15)
-	return(test(5));
-  400512:	58 10 d0 00       	l	%r1,0(%r13)
-  400516:	a7 28 00 05       	lhi	%r2,5
-  40051a:	0d e1             	basr	%r14,%r1
-  # compiler adds 1 extra instruction to epilogue this is done to
-  # avoid processor pipeline stalls owing to data dependencies on g5 &
-  # above as register 14 in the old code was needed directly after being loaded 
-  # by the lm	%r11,%r15,140(%r15) for the br %14.
-  40051c:	58 40 f0 98       	l	%r4,152(%r15)
-  400520:	98 7f f0 7c       	lm	%r7,%r15,124(%r15)
-  400524:	07 f4             	br	%r4
-}
-
-
-Hartmut ( our compiler developer ) also has been threatening to take out the
-stack backchain in optimised code as this also causes pipeline stalls, you
-have been warned.
-
-64 bit z/Architecture code disassembly
---------------------------------------
-
-If you understand the stuff above you'll understand the stuff
-below too so I'll avoid repeating myself & just say that 
-some of the instructions have g's on the end of them to indicate
-they are 64 bit & the stack offsets are a bigger, 
-the only other difference you'll find between 32 & 64 bit is that
-we now use f4 & f6 for floating point arguments on 64 bit.
-00000000800005b0 <test>:
-int test(int b)
-{
-	return(5+b);
-    800005b0:	a7 2a 00 05       	ahi	%r2,5
-    800005b4:	b9 14 00 22       	lgfr	%r2,%r2 # downcast to integer
-    800005b8:	07 fe             	br	%r14
-    800005ba:	07 07             	bcr	0,%r7
-
-
-}
-
-00000000800005bc <main>:
-main(int argc,char *argv[])
-{ 
-    800005bc:	eb bf f0 58 00 24 	stmg	%r11,%r15,88(%r15)
-    800005c2:	b9 04 00 1f       	lgr	%r1,%r15
-    800005c6:	a7 fb ff 60       	aghi	%r15,-160
-    800005ca:	e3 10 f0 00 00 24 	stg	%r1,0(%r15)
-	return(test(5));
-    800005d0:	a7 29 00 05       	lghi	%r2,5
-    # brasl allows jumps > 64k & is overkill here bras would do fune
-    800005d4:	c0 e5 ff ff ff ee 	brasl	%r14,800005b0 <test> 
-    800005da:	e3 40 f1 10 00 04 	lg	%r4,272(%r15)
-    800005e0:	eb bf f0 f8 00 04 	lmg	%r11,%r15,248(%r15)
-    800005e6:	07 f4             	br	%r4
-}
-
-
-
-Compiling programs for debugging on Linux for s/390 & z/Architecture
-====================================================================
--gdwarf-2 now works it should be considered the default debugging
-format for s/390 & z/Architecture as it is more reliable for debugging
-shared libraries,  normal -g debugging works much better now
-Thanks to the IBM java compiler developers bug reports. 
-
-This is typically done adding/appending the flags -g or -gdwarf-2 to the 
-CFLAGS & LDFLAGS variables Makefile of the program concerned.
-
-If using gdb & you would like accurate displays of registers &
- stack traces compile without optimisation i.e make sure
-that there is no -O2 or similar on the CFLAGS line of the Makefile &
-the emitted gcc commands, obviously this will produce worse code 
-( not advisable for shipment ) but it is an  aid to the debugging process.
-
-This aids debugging because the compiler will copy parameters passed in
-in registers onto the stack so backtracing & looking at passed in
-parameters will work, however some larger programs which use inline functions
-will not compile without optimisation.
-
-Debugging with optimisation has since much improved after fixing
-some bugs, please make sure you are using gdb-5.0 or later developed 
-after Nov'2000.
-
-
-
-Debugging under VM
-==================
-
-Notes
------
-Addresses & values in the VM debugger are always hex never decimal
-Address ranges are of the format <HexValue1>-<HexValue2> or
-<HexValue1>.<HexValue2>
-For example, the address range	0x2000 to 0x3000 can be described as 2000-3000
-or 2000.1000
-
-The VM Debugger is case insensitive.
-
-VM's strengths are usually other debuggers weaknesses you can get at any
-resource no matter how sensitive e.g. memory management resources, change
-address translation in the PSW. For kernel hacking you will reap dividends if
-you get good at it.
-
-The VM Debugger displays operators but not operands, and also the debugger
-displays useful information on the same line as the author of the code probably
-felt that it was a good idea not to go over the 80 columns on the screen.
-This isn't as unintuitive as it may seem as the s/390 instructions are easy to
-decode mentally and you can make a good guess at a lot of them as all the
-operands are nibble (half byte aligned).
-So if you have an objdump listing by hand, it is quite easy to follow, and if
-you don't have an objdump listing keep a copy of the s/390 Reference Summary
-or alternatively the s/390 principles of operation next to you.
-e.g. even I can guess that 
-0001AFF8' LR    180F        CC 0
-is a ( load register ) lr r0,r15 
-
-Also it is very easy to tell the length of a 390 instruction from the 2 most
-significant bits in the instruction (not that this info is really useful except
-if you are trying to make sense of a hexdump of code).
-Here is a table
-Bits                    Instruction Length
-------------------------------------------
-00                          2 Bytes
-01                          4 Bytes
-10                          4 Bytes
-11                          6 Bytes
-
-The debugger also displays other useful info on the same line such as the
-addresses being operated on destination addresses of branches & condition codes.
-e.g.  
-00019736' AHI   A7DAFF0E    CC 1
-000198BA' BRC   A7840004 -> 000198C2'   CC 0
-000198CE' STM   900EF068 >> 0FA95E78    CC 2
-
-
-
-Useful VM debugger commands
----------------------------
-
-I suppose I'd better mention this before I start
-to list the current active traces do 
-Q TR
-there can be a maximum of 255 of these per set
-( more about trace sets later ).
-To stop traces issue a
-TR END.
-To delete a particular breakpoint issue
-TR DEL <breakpoint number>
-
-The PA1 key drops to CP mode so you can issue debugger commands,
-Doing alt c (on my 3270 console at least ) clears the screen. 
-hitting b <enter> comes back to the running operating system
-from cp mode ( in our case linux ).
-It is typically useful to add shortcuts to your profile.exec file
-if you have one ( this is roughly equivalent to autoexec.bat in DOS ).
-file here are a few from mine.
-/* this gives me command history on issuing f12 */
-set pf12 retrieve 
-/* this continues */
-set pf8 imm b
-/* goes to trace set a */
-set pf1 imm tr goto a
-/* goes to trace set b */
-set pf2 imm tr goto b
-/* goes to trace set c */
-set pf3 imm tr goto c
-
-
-
-Instruction Tracing
--------------------
-Setting a simple breakpoint
-TR I PSWA <address>
-To debug a particular function try
-TR I R <function address range>
-TR I on its own will single step.
-TR I DATA <MNEMONIC> <OPTIONAL RANGE> will trace for particular mnemonics
-e.g.
-TR I DATA 4D R 0197BC.4000
-will trace for BAS'es ( opcode 4D ) in the range 0197BC.4000
-if you were inclined you could add traces for all branch instructions &
-suffix them with the run prefix so you would have a backtrace on screen 
-when a program crashes.
-TR BR <INTO OR FROM> will trace branches into or out of an address.
-e.g.
-TR BR INTO 0 is often quite useful if a program is getting awkward & deciding
-to branch to 0 & crashing as this will stop at the address before in jumps to 0.
-TR I R <address range> RUN cmd d g
-single steps a range of addresses but stays running &
-displays the gprs on each step.
-
-
-
-Displaying & modifying Registers
---------------------------------
-D G will display all the gprs
-Adding a extra G to all the commands is necessary to access the full 64 bit 
-content in VM on z/Architecture. Obviously this isn't required for access
-registers as these are still 32 bit.
-e.g. DGG instead of DG 
-D X will display all the control registers
-D AR will display all the access registers
-D AR4-7 will display access registers 4 to 7
-CPU ALL D G will display the GRPS of all CPUS in the configuration
-D PSW will display the current PSW
-st PSW 2000 will put the value 2000 into the PSW &
-cause crash your machine.
-D PREFIX displays the prefix offset
-
-
-Displaying Memory
------------------
-To display memory mapped using the current PSW's mapping try
-D <range>
-To make VM display a message each time it hits a particular address and
-continue try
-D I<range> will disassemble/display a range of instructions.
-ST addr 32 bit word will store a 32 bit aligned address
-D T<range> will display the EBCDIC in an address (if you are that way inclined)
-D R<range> will display real addresses ( without DAT ) but with prefixing.
-There are other complex options to display if you need to get at say home space
-but are in primary space the easiest thing to do is to temporarily
-modify the PSW to the other addressing mode, display the stuff & then
-restore it.
-
-
- 
-Hints
------
-If you want to issue a debugger command without halting your virtual machine
-with the PA1 key try prefixing the command with #CP e.g.
-#cp tr i pswa 2000
-also suffixing most debugger commands with RUN will cause them not
-to stop just display the mnemonic at the current instruction on the console.
-If you have several breakpoints you want to put into your program &
-you get fed up of cross referencing with System.map
-you can do the following trick for several symbols.
-grep do_signal System.map 
-which emits the following among other things
-0001f4e0 T do_signal 
-now you can do
-
-TR I PSWA 0001f4e0 cmd msg * do_signal
-This sends a message to your own console each time do_signal is entered.
-( As an aside I wrote a perl script once which automatically generated a REXX
-script with breakpoints on every kernel procedure, this isn't a good idea
-because there are thousands of these routines & VM can only set 255 breakpoints
-at a time so you nearly had to spend as long pruning the file down as you would 
-entering the msgs by hand), however, the trick might be useful for a single
-object file. In the 3270 terminal emulator x3270 there is a very useful option
-in the file menu called "Save Screen In File" - this is very good for keeping a
-copy of traces.
-
-From CMS help <command name> will give you online help on a particular command. 
-e.g. 
-HELP DISPLAY
-
-Also CP has a file called profile.exec which automatically gets called
-on startup of CMS ( like autoexec.bat ), keeping on a DOS analogy session
-CP has a feature similar to doskey, it may be useful for you to
-use profile.exec to define some keystrokes. 
-e.g.
-SET PF9 IMM B
-This does a single step in VM on pressing F8. 
-SET PF10  ^
-This sets up the ^ key.
-which can be used for ^c (ctrl-c),^z (ctrl-z) which can't be typed directly
-into some 3270 consoles.
-SET PF11 ^-
-This types the starting keystrokes for a sysrq see SysRq below.
-SET PF12 RETRIEVE
-This retrieves command history on pressing F12.
-
-
-Sometimes in VM the display is set up to scroll automatically this
-can be very annoying if there are messages you wish to look at
-to stop this do
-TERM MORE 255 255
-This will nearly stop automatic screen updates, however it will
-cause a denial of service if lots of messages go to the 3270 console,
-so it would be foolish to use this as the default on a production machine.
- 
-
-Tracing particular processes
-----------------------------
-The kernel's text segment is intentionally at an address in memory that it will
-very seldom collide with text segments of user programs ( thanks Martin ),
-this simplifies debugging the kernel.
-However it is quite common for user processes to have addresses which collide
-this can make debugging a particular process under VM painful under normal
-circumstances as the process may change when doing a 
-TR I R <address range>.
-Thankfully after reading VM's online help I figured out how to debug
-I particular process.
-
-Your first problem is to find the STD ( segment table designation )
-of the program you wish to debug.
-There are several ways you can do this here are a few
-1) objdump --syms <program to be debugged> | grep main
-To get the address of main in the program.
-tr i pswa <address of main>
-Start the program, if VM drops to CP on what looks like the entry
-point of the main function this is most likely the process you wish to debug.
-Now do a D X13 or D XG13 on z/Architecture.
-On 31 bit the STD is bits 1-19 ( the STO segment table origin ) 
-& 25-31 ( the STL segment table length ) of CR13.
-now type
-TR I R STD <CR13's value> 0.7fffffff
-e.g.
-TR I R STD 8F32E1FF 0.7fffffff
-Another very useful variation is
-TR STORE INTO STD <CR13's value> <address range>
-for finding out when a particular variable changes.
-
-An alternative way of finding the STD of a currently running process 
-is to do the following, ( this method is more complex but
-could be quite convenient if you aren't updating the kernel much &
-so your kernel structures will stay constant for a reasonable period of
-time ).
-
-grep task /proc/<pid>/status
-from this you should see something like
-task: 0f160000 ksp: 0f161de8 pt_regs: 0f161f68
-This now gives you a pointer to the task structure.
-Now make CC:="s390-gcc -g" kernel/sched.s
-To get the task_struct stabinfo.
-( task_struct is defined in include/linux/sched.h ).
-Now we want to look at
-task->active_mm->pgd
-on my machine the active_mm in the task structure stab is
-active_mm:(4,12),672,32
-its offset is 672/8=84=0x54
-the pgd member in the mm_struct stab is
-pgd:(4,6)=*(29,5),96,32
-so its offset is 96/8=12=0xc
-
-so we'll
-hexdump -s 0xf160054 /dev/mem | more
-i.e. task_struct+active_mm offset
-to look at the active_mm member
-f160054 0fee cc60 0019 e334 0000 0000 0000 0011
-hexdump -s 0x0feecc6c /dev/mem | more
-i.e. active_mm+pgd offset
-feecc6c 0f2c 0000 0000 0001 0000 0001 0000 0010
-we get something like
-now do 
-TR I R STD <pgd|0x7f> 0.7fffffff
-i.e. the 0x7f is added because the pgd only
-gives the page table origin & we need to set the low bits
-to the maximum possible segment table length.
-TR I R STD 0f2c007f 0.7fffffff
-on z/Architecture you'll probably need to do
-TR I R STD <pgd|0x7> 0.ffffffffffffffff
-to set the TableType to 0x1 & the Table length to 3.
-
-
-
-Tracing Program Exceptions
---------------------------
-If you get a crash which says something like
-illegal operation or specification exception followed by a register dump
-You can restart linux & trace these using the tr prog <range or value> trace
-option.
-
-
-The most common ones you will normally be tracing for is
-1=operation exception
-2=privileged operation exception
-4=protection exception
-5=addressing exception
-6=specification exception
-10=segment translation exception
-11=page translation exception
-
-The full list of these is on page 22 of the current s/390 Reference Summary.
-e.g.
-tr prog 10 will trace segment translation exceptions.
-tr prog on its own will trace all program interruption codes.
-
-Trace Sets
-----------
-On starting VM you are initially in the INITIAL trace set.
-You can do a Q TR to verify this.
-If you have a complex tracing situation where you wish to wait for instance 
-till a driver is open before you start tracing IO, but know in your
-heart that you are going to have to make several runs through the code till you
-have a clue whats going on. 
-
-What you can do is
-TR I PSWA <Driver open address>
-hit b to continue till breakpoint
-reach the breakpoint
-now do your
-TR GOTO B 
-TR IO 7c08-7c09 inst int run 
-or whatever the IO channels you wish to trace are & hit b
-
-To got back to the initial trace set do
-TR GOTO INITIAL
-& the TR I PSWA <Driver open address> will be the only active breakpoint again.
-
-
-Tracing linux syscalls under VM
--------------------------------
-Syscalls are implemented on Linux for S390 by the Supervisor call instruction
-(SVC). There 256 possibilities of these as the instruction is made up of a 0xA
-opcode and the second byte being the syscall number. They are traced using the
-simple command:
-TR SVC  <Optional value or range>
-the syscalls are defined in linux/arch/s390/include/asm/unistd.h
-e.g. to trace all file opens just do
-TR SVC 5 ( as this is the syscall number of open )
-
-
-SMP Specific commands
----------------------
-To find out how many cpus you have
-Q CPUS displays all the CPU's available to your virtual machine
-To find the cpu that the current cpu VM debugger commands are being directed at
-do Q CPU to change the current cpu VM debugger commands are being directed at do
-CPU <desired cpu no>
-
-On a SMP guest issue a command to all CPUs try prefixing the command with cpu
-all. To issue a command to a particular cpu try cpu <cpu number> e.g.
-CPU 01 TR I R 2000.3000
-If you are running on a guest with several cpus & you have a IO related problem
-& cannot follow the flow of code but you know it isn't smp related.
-from the bash prompt issue
-shutdown -h now or halt.
-do a Q CPUS to find out how many cpus you have
-detach each one of them from cp except cpu 0 
-by issuing a 
-DETACH CPU 01-(number of cpus in configuration)
-& boot linux again.
-TR SIGP will trace inter processor signal processor instructions.
-DEFINE CPU 01-(number in configuration) 
-will get your guests cpus back.
-
-
-Help for displaying ascii textstrings
--------------------------------------
-On the very latest VM Nucleus'es VM can now display ascii
-( thanks Neale for the hint ) by doing
-D TX<lowaddr>.<len>
-e.g.
-D TX0.100
-
-Alternatively
-=============
-Under older VM debuggers (I love EBDIC too) you can use following little
-program which converts a command line of hex digits to ascii text. It can be
-compiled under linux and you can copy the hex digits from your x3270 terminal
-to your xterm if you are debugging from a linuxbox.
-
-This is quite useful when looking at a parameter passed in as a text string
-under VM ( unless you are good at decoding ASCII in your head ).
-
-e.g. consider tracing an open syscall
-TR SVC 5
-We have stopped at a breakpoint
-000151B0' SVC   0A05     -> 0001909A'   CC 0
-
-D 20.8 to check the SVC old psw in the prefix area and see was it from userspace
-(for the layout of the prefix area consult the "Fixed Storage Locations"
-chapter of the s/390 Reference Summary if you have it available).
-V00000020  070C2000 800151B2
-The problem state bit wasn't set &  it's also too early in the boot sequence
-for it to be a userspace SVC if it was we would have to temporarily switch the 
-psw to user space addressing so we could get at the first parameter of the open
-in gpr2.
-Next do a 
-D G2
-GPR  2 =  00014CB4
-Now display what gpr2 is pointing to
-D 00014CB4.20
-V00014CB4  2F646576 2F636F6E 736F6C65 00001BF5
-V00014CC4  FC00014C B4001001 E0001000 B8070707
-Now copy the text till the first 00 hex ( which is the end of the string
-to an xterm & do hex2ascii on it.
-hex2ascii 2F646576 2F636F6E 736F6C65 00 
-outputs
-Decoded Hex:=/ d e v / c o n s o l e 0x00 
-We were opening the console device,
-
-You can compile the code below yourself for practice :-),
-/*
- *    hex2ascii.c
- *    a useful little tool for converting a hexadecimal command line to ascii
- *
- *    Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com)
- *    (C) 2000 IBM Deutschland Entwicklung GmbH, IBM Corporation.
- */   
-#include <stdio.h>
-
-int main(int argc,char *argv[])
-{
-  int cnt1,cnt2,len,toggle=0;
-  int startcnt=1;
-  unsigned char c,hex;
-  
-  if(argc>1&&(strcmp(argv[1],"-a")==0))
-     startcnt=2;
-  printf("Decoded Hex:=");
-  for(cnt1=startcnt;cnt1<argc;cnt1++)
-  {
-    len=strlen(argv[cnt1]);
-    for(cnt2=0;cnt2<len;cnt2++)
-    {
-       c=argv[cnt1][cnt2];
-       if(c>='0'&&c<='9')
-	  c=c-'0';
-       if(c>='A'&&c<='F')
-	  c=c-'A'+10;
-       if(c>='a'&&c<='f')
-	  c=c-'a'+10;
-       switch(toggle)
-       {
-	  case 0:
-	     hex=c<<4;
-	     toggle=1;
-	  break;
-	  case 1:
-	     hex+=c;
-	     if(hex<32||hex>127)
-	     {
-		if(startcnt==1)
-		   printf("0x%02X ",(int)hex);
-		else
-		   printf(".");
-	     }
-	     else
-	     {
-	       printf("%c",hex);
-	       if(startcnt==1)
-		  printf(" ");
-	     }
-	     toggle=0;
-	  break;
-       }
-    }
-  }
-  printf("\n");
-}
-
-
-
-
-Stack tracing under VM
-----------------------
-A basic backtrace
------------------
-
-Here are the tricks I use 9 out of 10 times it works pretty well,
-
-When your backchain reaches a dead end
---------------------------------------
-This can happen when an exception happens in the kernel and the kernel is
-entered twice. If you reach the NULL pointer at the end of the back chain you
-should be able to sniff further back if you follow the following tricks.
-1) A kernel address should be easy to recognise since it is in
-primary space & the problem state bit isn't set & also
-The Hi bit of the address is set.
-2) Another backchain should also be easy to recognise since it is an 
-address pointing to another address approximately 100 bytes or 0x70 hex
-behind the current stackpointer.
-
-
-Here is some practice.
-boot the kernel & hit PA1 at some random time
-d g to display the gprs, this should display something like
-GPR  0 =  00000001  00156018  0014359C  00000000
-GPR  4 =  00000001  001B8888  000003E0  00000000
-GPR  8 =  00100080  00100084  00000000  000FE000
-GPR 12 =  00010400  8001B2DC  8001B36A  000FFED8
-Note that GPR14 is a return address but as we are real men we are going to
-trace the stack.
-display 0x40 bytes after the stack pointer.
-
-V000FFED8  000FFF38 8001B838 80014C8E 000FFF38
-V000FFEE8  00000000 00000000 000003E0 00000000
-V000FFEF8  00100080 00100084 00000000 000FE000
-V000FFF08  00010400 8001B2DC 8001B36A 000FFED8
-
-
-Ah now look at whats in sp+56 (sp+0x38) this is 8001B36A our saved r14 if
-you look above at our stackframe & also agrees with GPR14.
-
-now backchain 
-d 000FFF38.40
-we now are taking the contents of SP to get our first backchain.
-
-V000FFF38  000FFFA0 00000000 00014995 00147094
-V000FFF48  00147090 001470A0 000003E0 00000000
-V000FFF58  00100080 00100084 00000000 001BF1D0
-V000FFF68  00010400 800149BA 80014CA6 000FFF38
-
-This displays a 2nd return address of 80014CA6
-
-now do d 000FFFA0.40 for our 3rd backchain
-
-V000FFFA0  04B52002 0001107F 00000000 00000000
-V000FFFB0  00000000 00000000 FF000000 0001107F
-V000FFFC0  00000000 00000000 00000000 00000000
-V000FFFD0  00010400 80010802 8001085A 000FFFA0
-
-
-our 3rd return address is 8001085A
-
-as the 04B52002 looks suspiciously like rubbish it is fair to assume that the
-kernel entry routines for the sake of optimisation don't set up a backchain.
-
-now look at System.map to see if the addresses make any sense.
-
-grep -i 0001b3 System.map
-outputs among other things
-0001b304 T cpu_idle 
-so 8001B36A
-is cpu_idle+0x66 ( quiet the cpu is asleep, don't wake it )
-
-
-grep -i 00014 System.map 
-produces among other things
-00014a78 T start_kernel  
-so 0014CA6 is start_kernel+some hex number I can't add in my head.
-
-grep -i 00108 System.map 
-this produces
-00010800 T _stext
-so   8001085A is _stext+0x5a
-
-Congrats you've done your first backchain.
-
-
-
-s/390 & z/Architecture IO Overview
-==================================
-
-I am not going to give a course in 390 IO architecture as this would take me
-quite a while and I'm no expert. Instead I'll give a 390 IO architecture
-summary for Dummies. If you have the s/390 principles of operation available
-read this instead. If nothing else you may find a few useful keywords in here
-and be able to use them on a web search engine to find more useful information.
-
-Unlike other bus architectures modern 390 systems do their IO using mostly
-fibre optics and devices such as tapes and disks can be shared between several
-mainframes. Also S390 can support up to 65536 devices while a high end PC based
-system might be choking with around 64.
-
-Here is some of the common IO terminology:
-
-Subchannel:
-This is the logical number most IO commands use to talk to an IO device. There
-can be up to 0x10000 (65536) of these in a configuration, typically there are a
-few hundred. Under VM for simplicity they are allocated contiguously, however
-on the native hardware they are not. They typically stay consistent between
-boots provided no new hardware is inserted or removed.
-Under Linux for s390 we use these as IRQ's and also when issuing an IO command
-(CLEAR SUBCHANNEL, HALT SUBCHANNEL, MODIFY SUBCHANNEL, RESUME SUBCHANNEL,
-START SUBCHANNEL, STORE SUBCHANNEL and TEST SUBCHANNEL). We use this as the ID
-of the device we wish to talk to. The most important of these instructions are
-START SUBCHANNEL (to start IO), TEST SUBCHANNEL (to check whether the IO
-completed successfully) and HALT SUBCHANNEL (to kill IO). A subchannel can have
-up to 8 channel paths to a device, this offers redundancy if one is not
-available.
-
-Device Number:
-This number remains static and is closely tied to the hardware. There are 65536
-of these, made up of a CHPID (Channel Path ID, the most significant 8 bits) and
-another lsb 8 bits. These remain static even if more devices are inserted or
-removed from the hardware. There is a 1 to 1 mapping between subchannels and
-device numbers, provided devices aren't inserted or removed.
-
-Channel Control Words:
-CCWs are linked lists of instructions initially pointed to by an operation
-request block (ORB), which is initially given to Start Subchannel (SSCH)
-command along with the subchannel number for the IO subsystem to process
-while the CPU continues executing normal code.
-CCWs come in two flavours, Format 0 (24 bit for backward compatibility) and
-Format 1 (31 bit). These are typically used to issue read and write (and many
-other) instructions. They consist of a length field and an absolute address
-field.
-Each IO typically gets 1 or 2 interrupts, one for channel end (primary status)
-when the channel is idle, and the second for device end (secondary status).
-Sometimes you get both concurrently. You check how the IO went on by issuing a
-TEST SUBCHANNEL at each interrupt, from which you receive an Interruption
-response block (IRB). If you get channel and device end status in the IRB
-without channel checks etc. your IO probably went okay. If you didn't you
-probably need to examine the IRB, extended status word etc.
-If an error occurs, more sophisticated control units have a facility known as
-concurrent sense. This means that if an error occurs Extended sense information
-will be presented in the Extended status word in the IRB. If not you have to
-issue a subsequent SENSE CCW command after the test subchannel.
-
-
-TPI (Test pending interrupt) can also be used for polled IO, but in
-multitasking multiprocessor systems it isn't recommended except for
-checking special cases (i.e. non looping checks for pending IO etc.).
-
-Store Subchannel and Modify Subchannel can be used to examine and modify
-operating characteristics of a subchannel (e.g. channel paths).
-
-Other IO related Terms:
-Sysplex: S390's Clustering Technology
-QDIO: S390's new high speed IO architecture to support devices such as gigabit
-ethernet, this architecture is also designed to be forward compatible with
-upcoming 64 bit machines.
-
-
-General Concepts 
-
-Input Output Processors (IOP's) are responsible for communicating between
-the mainframe CPU's & the channel & relieve the mainframe CPU's from the
-burden of communicating with IO devices directly, this allows the CPU's to 
-concentrate on data processing. 
-
-IOP's can use one or more links ( known as channel paths ) to talk to each 
-IO device. It first checks for path availability & chooses an available one,
-then starts ( & sometimes terminates IO ).
-There are two types of channel path: ESCON & the Parallel IO interface.
-
-IO devices are attached to control units, control units provide the
-logic to interface the channel paths & channel path IO protocols to 
-the IO devices, they can be integrated with the devices or housed separately
-& often talk to several similar devices ( typical examples would be raid 
-controllers or a control unit which connects to 1000 3270 terminals ).
-
-
-    +---------------------------------------------------------------+
-    | +-----+ +-----+ +-----+ +-----+  +----------+  +----------+   |
-    | | CPU | | CPU | | CPU | | CPU |  |  Main    |  | Expanded |   |
-    | |     | |     | |     | |     |  |  Memory  |  |  Storage |   |
-    | +-----+ +-----+ +-----+ +-----+  +----------+  +----------+   | 
-    |---------------------------------------------------------------+
-    |   IOP        |      IOP      |       IOP                      |
-    |---------------------------------------------------------------
-    | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | 
-    ----------------------------------------------------------------
-         ||                                              ||
-         ||  Bus & Tag Channel Path                      || ESCON
-         ||  ======================                      || Channel
-         ||  ||                  ||                      || Path
-    +----------+               +----------+         +----------+
-    |          |               |          |         |          |
-    |    CU    |               |    CU    |         |    CU    |
-    |          |               |          |         |          |
-    +----------+               +----------+         +----------+
-       |      |                     |                |       |
-+----------+ +----------+      +----------+   +----------+ +----------+
-|I/O Device| |I/O Device|      |I/O Device|   |I/O Device| |I/O Device|
-+----------+ +----------+      +----------+   +----------+ +----------+
-  CPU = Central Processing Unit    
-  C = Channel                      
-  IOP = IP Processor               
-  CU = Control Unit
-
-The 390 IO systems come in 2 flavours the current 390 machines support both
-
-The Older 360 & 370 Interface,sometimes called the Parallel I/O interface,
-sometimes called Bus-and Tag & sometimes Original Equipment Manufacturers
-Interface (OEMI).
-
-This byte wide Parallel channel path/bus has parity & data on the "Bus" cable 
-and control lines on the "Tag" cable. These can operate in byte multiplex mode
-for sharing between several slow devices or burst mode and monopolize the
-channel for the whole burst. Up to 256 devices can be addressed on one of these
-cables. These cables are about one inch in diameter. The maximum unextended
-length supported by these cables is 125 Meters but this can be extended up to
-2km with a fibre optic channel extended such as a 3044. The maximum burst speed
-supported is 4.5 megabytes per second. However, some really old processors
-support only transfer rates of 3.0, 2.0 & 1.0 MB/sec.
-One of these paths can be daisy chained to up to 8 control units.
-
-
-ESCON if fibre optic it is also called FICON 
-Was introduced by IBM in 1990. Has 2 fibre optic cables and uses either leds or
-lasers for communication at a signaling rate of up to 200 megabits/sec. As
-10bits are transferred for every 8 bits info this drops to 160 megabits/sec
-and to 18.6 Megabytes/sec once control info and CRC are added. ESCON only
-operates in burst mode.
- 
-ESCONs typical max cable length is 3km for the led version and 20km for the
-laser version known as XDF (extended distance facility). This can be further
-extended by using an ESCON director which triples the above mentioned ranges.
-Unlike Bus & Tag as ESCON is serial it uses a packet switching architecture,
-the standard Bus & Tag control protocol is however present within the packets.
-Up to 256 devices can be attached to each control unit that uses one of these
-interfaces.
-
-Common 390 Devices include:
-Network adapters typically OSA2,3172's,2116's & OSA-E gigabit ethernet adapters,
-Consoles 3270 & 3215 (a teletype emulated under linux for a line mode console).
-DASD's direct access storage devices ( otherwise known as hard disks ).
-Tape Drives.
-CTC ( Channel to Channel Adapters ),
-ESCON or Parallel Cables used as a very high speed serial link
-between 2 machines.
-
-
-Debugging IO on s/390 & z/Architecture under VM
-===============================================
-
-Now we are ready to go on with IO tracing commands under VM
-
-A few self explanatory queries:
-Q OSA
-Q CTC
-Q DISK ( This command is CMS specific )
-Q DASD
-
-
-
-
-
-
-Q OSA on my machine returns
-OSA  7C08 ON OSA   7C08 SUBCHANNEL = 0000
-OSA  7C09 ON OSA   7C09 SUBCHANNEL = 0001
-OSA  7C14 ON OSA   7C14 SUBCHANNEL = 0002
-OSA  7C15 ON OSA   7C15 SUBCHANNEL = 0003
-
-If you have a guest with certain privileges you may be able to see devices
-which don't belong to you. To avoid this, add the option V.
-e.g.
-Q V OSA
-
-Now using the device numbers returned by this command we will
-Trace the io starting up on the first device 7c08 & 7c09
-In our simplest case we can trace the 
-start subchannels
-like TR SSCH 7C08-7C09
-or the halt subchannels
-or TR HSCH 7C08-7C09
-MSCH's ,STSCH's I think you can guess the rest
-
-A good trick is tracing all the IO's and CCWS and spooling them into the reader
-of another VM guest so he can ftp the logfile back to his own machine. I'll do
-a small bit of this and give you a look at the output.
-
-1) Spool stdout to VM reader
-SP PRT TO (another vm guest ) or * for the local vm guest
-2) Fill the reader with the trace
-TR IO 7c08-7c09 INST INT CCW PRT RUN
-3) Start up linux 
-i 00c  
-4) Finish the trace
-TR END
-5) close the reader
-C PRT
-6) list reader contents
-RDRLIST
-7) copy it to linux4's minidisk 
-RECEIVE / LOG TXT A1 ( replace
-8)
-filel & press F11 to look at it
-You should see something like:
-
-00020942' SSCH  B2334000    0048813C    CC 0    SCH 0000    DEV 7C08
-          CPA 000FFDF0   PARM 00E2C9C4    KEY 0  FPI C0  LPM 80
-          CCW    000FFDF0  E4200100 00487FE8   0000  E4240100 ........
-          IDAL                                      43D8AFE8
-          IDAL                                      0FB76000
-00020B0A'   I/O DEV 7C08 -> 000197BC'   SCH 0000   PARM 00E2C9C4
-00021628' TSCH  B2354000 >> 00488164    CC 0    SCH 0000    DEV 7C08
-          CCWA 000FFDF8   DEV STS 0C  SCH STS 00  CNT 00EC
-           KEY 0   FPI C0  CC 0   CTLS 4007
-00022238' STSCH B2344000 >> 00488108    CC 0    SCH 0000    DEV 7C08
-
-If you don't like messing up your readed ( because you possibly booted from it )
-you can alternatively spool it to another readers guest.
-
-
-Other common VM device related commands
----------------------------------------------
-These commands are listed only because they have
-been of use to me in the past & may be of use to
-you too. For more complete info on each of the commands
-use type HELP <command> from CMS.
-detaching devices
-DET <devno range>
-ATT <devno range> <guest> 
-attach a device to guest * for your own guest
-READY <devno> cause VM to issue a fake interrupt.
-
-The VARY command is normally only available to VM administrators.
-VARY ON PATH <path> TO <devno range>
-VARY OFF PATH <PATH> FROM <devno range>
-This is used to switch on or off channel paths to devices.
-
-Q CHPID <channel path ID>
-This displays state of devices using this channel path
-D SCHIB <subchannel>
-This displays the subchannel information SCHIB block for the device.
-this I believe is also only available to administrators.
-DEFINE CTC <devno>
-defines a virtual CTC channel to channel connection
-2 need to be defined on each guest for the CTC driver to use.
-COUPLE  devno userid remote devno
-Joins a local virtual device to a remote virtual device
-( commonly used for the CTC driver ).
-
-Building a VM ramdisk under CMS which linux can use
-def vfb-<blocksize> <subchannel> <number blocks>
-blocksize is commonly 4096 for linux.
-Formatting it
-format <subchannel> <driver letter e.g. x> (blksize <blocksize>
-
-Sharing a disk between multiple guests
-LINK userid devno1 devno2 mode password
-
-
-
-GDB on S390
-===========
-N.B. if compiling for debugging gdb works better without optimisation 
-( see Compiling programs for debugging )
-
-invocation
-----------
-gdb <victim program> <optional corefile>
-
-Online help
------------
-help: gives help on commands
-e.g.
-help
-help display
-Note gdb's online help is very good use it.
-
-
-Assembly
---------
-info registers: displays registers other than floating point.
-info all-registers: displays floating points as well.
-disassemble: disassembles
-e.g.
-disassemble without parameters will disassemble the current function
-disassemble $pc $pc+10 
-
-Viewing & modifying variables
------------------------------
-print or p: displays variable or register
-e.g. p/x $sp will display the stack pointer
-
-display: prints variable or register each time program stops
-e.g.
-display/x $pc will display the program counter
-display argc
-
-undisplay : undo's display's
-
-info breakpoints: shows all current breakpoints
-
-info stack: shows stack back trace (if this doesn't work too well, I'll show
-you the stacktrace by hand below).
-
-info locals: displays local variables.
-
-info args: display current procedure arguments.
-
-set args: will set argc & argv each time the victim program is invoked.
-
-set <variable>=value
-set argc=100
-set $pc=0
-
-
-
-Modifying execution
--------------------
-step: steps n lines of sourcecode
-step steps 1 line.
-step 100 steps 100 lines of code.
-
-next: like step except this will not step into subroutines
-
-stepi: steps a single machine code instruction.
-e.g. stepi 100
-
-nexti: steps a single machine code instruction but will not step into
-subroutines.
-
-finish: will run until exit of the current routine
-
-run: (re)starts a program
-
-cont: continues a program
-
-quit: exits gdb.
-
-
-breakpoints
-------------
-
-break
-sets a breakpoint
-e.g.
-
-break main
-
-break *$pc
-
-break *0x400618
-
-Here's a really useful one for large programs
-rbr
-Set a breakpoint for all functions matching REGEXP
-e.g.
-rbr 390
-will set a breakpoint with all functions with 390 in their name.
-
-info breakpoints
-lists all breakpoints
-
-delete: delete breakpoint by number or delete them all
-e.g.
-delete 1 will delete the first breakpoint
-delete will delete them all
-
-watch: This will set a watchpoint ( usually hardware assisted ),
-This will watch a variable till it changes
-e.g.
-watch cnt, will watch the variable cnt till it changes.
-As an aside unfortunately gdb's, architecture independent watchpoint code
-is inconsistent & not very good, watchpoints usually work but not always.
-
-info watchpoints: Display currently active watchpoints
-
-condition: ( another useful one )
-Specify breakpoint number N to break only if COND is true.
-Usage is `condition N COND', where N is an integer and COND is an
-expression to be evaluated whenever breakpoint N is reached.
-
-
-
-User defined functions/macros
------------------------------
-define: ( Note this is very very useful,simple & powerful )
-usage define <name> <list of commands> end
-
-examples which you should consider putting into .gdbinit in your home directory
-define d
-stepi
-disassemble $pc $pc+10
-end
-
-define e
-nexti
-disassemble $pc $pc+10
-end
-
-
-Other hard to classify stuff
-----------------------------
-signal n:
-sends the victim program a signal.
-e.g. signal 3 will send a SIGQUIT.
-
-info signals:
-what gdb does when the victim receives certain signals.
-
-list:
-e.g.
-list lists current function source
-list 1,10 list first 10 lines of current file.
-list test.c:1,10
-
-
-directory:
-Adds directories to be searched for source if gdb cannot find the source.
-(note it is a bit sensitive about slashes)
-e.g. To add the root of the filesystem to the searchpath do
-directory //
-
-
-call <function>
-This calls a function in the victim program, this is pretty powerful
-e.g.
-(gdb) call printf("hello world")
-outputs:
-$1 = 11 
-
-You might now be thinking that the line above didn't work, something extra had
-to be done.
-(gdb) call fflush(stdout)
-hello world$2 = 0
-As an aside the debugger also calls malloc & free under the hood 
-to make space for the "hello world" string.
-
-
-
-hints
------
-1) command completion works just like bash 
-( if you are a bad typist like me this really helps )
-e.g. hit br <TAB> & cursor up & down :-).
-
-2) if you have a debugging problem that takes a few steps to recreate
-put the steps into a file called .gdbinit in your current working directory
-if you have defined a few extra useful user defined commands put these in 
-your home directory & they will be read each time gdb is launched.
-
-A typical .gdbinit file might be.
-break main
-run
-break runtime_exception
-cont 
-
-
-stack chaining in gdb by hand
------------------------------
-This is done using a the same trick described for VM 
-p/x (*($sp+56))&0x7fffffff get the first backchain.
-
-For z/Architecture
-Replace 56 with 112 & ignore the &0x7fffffff
-in the macros below & do nasty casts to longs like the following
-as gdb unfortunately deals with printed arguments as ints which
-messes up everything.
-i.e. here is a 3rd backchain dereference
-p/x *(long *)(***(long ***)$sp+112)
-
-
-this outputs 
-$5 = 0x528f18 
-on my machine.
-Now you can use 
-info symbol (*($sp+56))&0x7fffffff 
-you might see something like.
-rl_getc + 36 in section .text  telling you what is located at address 0x528f18
-Now do.
-p/x (*(*$sp+56))&0x7fffffff 
-This outputs
-$6 = 0x528ed0
-Now do.
-info symbol (*(*$sp+56))&0x7fffffff
-rl_read_key + 180 in section .text
-now do
-p/x (*(**$sp+56))&0x7fffffff
-& so on.
-
-Disassembling instructions without debug info
----------------------------------------------
-gdb typically complains if there is a lack of debugging
-symbols in the disassemble command with 
-"No function contains specified address." To get around
-this do 
-x/<number lines to disassemble>xi <address>
-e.g.
-x/20xi 0x400730
-
-
-
-Note: Remember gdb has history just like bash you don't need to retype the
-whole line just use the up & down arrows.
-
-
-
-For more info
--------------
-From your linuxbox do 
-man gdb or info gdb.
-
-core dumps
-----------
-What a core dump ?,
-A core dump is a file generated by the kernel (if allowed) which contains the
-registers and all active pages of the program which has crashed.
-From this file gdb will allow you to look at the registers, stack trace and
-memory of the program as if it just crashed on your system. It is usually
-called core and created in the current working directory.
-This is very useful in that a customer can mail a core dump to a technical
-support department and the technical support department can reconstruct what
-happened. Provided they have an identical copy of this program with debugging
-symbols compiled in and the source base of this build is available.
-In short it is far more useful than something like a crash log could ever hope
-to be.
-
-Why have I never seen one ?.
-Probably because you haven't used the command 
-ulimit -c unlimited in bash
-to allow core dumps, now do 
-ulimit -a 
-to verify that the limit was accepted.
-
-A sample core dump
-To create this I'm going to do
-ulimit -c unlimited
-gdb 
-to launch gdb (my victim app. ) now be bad & do the following from another 
-telnet/xterm session to the same machine
-ps -aux | grep gdb
-kill -SIGSEGV <gdb's pid>
-or alternatively use killall -SIGSEGV gdb if you have the killall command.
-Now look at the core dump.
-./gdb core
-Displays the following
-GNU gdb 4.18
-Copyright 1998 Free Software Foundation, Inc.
-GDB is free software, covered by the GNU General Public License, and you are
-welcome to change it and/or distribute copies of it under certain conditions.
-Type "show copying" to see the conditions.
-There is absolutely no warranty for GDB.  Type "show warranty" for details.
-This GDB was configured as "s390-ibm-linux"...
-Core was generated by `./gdb'.
-Program terminated with signal 11, Segmentation fault.
-Reading symbols from /usr/lib/libncurses.so.4...done.
-Reading symbols from /lib/libm.so.6...done.
-Reading symbols from /lib/libc.so.6...done.
-Reading symbols from /lib/ld-linux.so.2...done.
-#0  0x40126d1a in read () from /lib/libc.so.6
-Setting up the environment for debugging gdb.
-Breakpoint 1 at 0x4dc6f8: file utils.c, line 471.
-Breakpoint 2 at 0x4d87a4: file top.c, line 2609.
-(top-gdb) info stack
-#0  0x40126d1a in read () from /lib/libc.so.6
-#1  0x528f26 in rl_getc (stream=0x7ffffde8) at input.c:402
-#2  0x528ed0 in rl_read_key () at input.c:381
-#3  0x5167e6 in readline_internal_char () at readline.c:454
-#4  0x5168ee in readline_internal_charloop () at readline.c:507
-#5  0x51692c in readline_internal () at readline.c:521
-#6  0x5164fe in readline (prompt=0x7ffff810)
-    at readline.c:349
-#7  0x4d7a8a in command_line_input (prompt=0x564420 "(gdb) ", repeat=1,
-    annotation_suffix=0x4d6b44 "prompt") at top.c:2091
-#8  0x4d6cf0 in command_loop () at top.c:1345
-#9  0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635
-
-
-LDD
-===
-This is a program which lists the shared libraries which a library needs,
-Note you also get the relocations of the shared library text segments which
-help when using objdump --source.
-e.g.
- ldd ./gdb
-outputs
-libncurses.so.4 => /usr/lib/libncurses.so.4 (0x40018000)
-libm.so.6 => /lib/libm.so.6 (0x4005e000)
-libc.so.6 => /lib/libc.so.6 (0x40084000)
-/lib/ld-linux.so.2 => /lib/ld-linux.so.2 (0x40000000)
-
-
-Debugging shared libraries
-==========================
-Most programs use shared libraries, however it can be very painful
-when you single step instruction into a function like printf for the 
-first time & you end up in functions like _dl_runtime_resolve this is
-the ld.so doing lazy binding, lazy binding is a concept in ELF where 
-shared library functions are not loaded into memory unless they are 
-actually used, great for saving memory but a pain to debug.
-To get around this either relink the program -static or exit gdb type 
-export LD_BIND_NOW=true this will stop lazy binding & restart the gdb'ing 
-the program in question.
- 
-
-
-Debugging modules
-=================
-As modules are dynamically loaded into the kernel their address can be
-anywhere to get around this use the -m option with insmod to emit a load
-map which can be piped into a file if required.
-
-The proc file system
-====================
-What is it ?.
-It is a filesystem created by the kernel with files which are created on demand
-by the kernel if read, or can be used to modify kernel parameters,
-it is a powerful concept.
-
-e.g.
-
-cat /proc/sys/net/ipv4/ip_forward 
-On my machine outputs 
-0 
-telling me ip_forwarding is not on to switch it on I can do
-echo 1 >  /proc/sys/net/ipv4/ip_forward
-cat it again
-cat /proc/sys/net/ipv4/ip_forward 
-On my machine now outputs
-1
-IP forwarding is on.
-There is a lot of useful info in here best found by going in and having a look
-around, so I'll take you through some entries I consider important.
-
-All the processes running on the machine have their own entry defined by
-/proc/<pid>
-So lets have a look at the init process
-cd /proc/1
-
-cat cmdline
-emits
-init [2]
-
-cd /proc/1/fd
-This contains numerical entries of all the open files,
-some of these you can cat e.g. stdout (2)
-
-cat /proc/29/maps
-on my machine emits
-
-00400000-00478000 r-xp 00000000 5f:00 4103       /bin/bash
-00478000-0047e000 rw-p 00077000 5f:00 4103       /bin/bash
-0047e000-00492000 rwxp 00000000 00:00 0
-40000000-40015000 r-xp 00000000 5f:00 14382      /lib/ld-2.1.2.so
-40015000-40016000 rw-p 00014000 5f:00 14382      /lib/ld-2.1.2.so
-40016000-40017000 rwxp 00000000 00:00 0
-40017000-40018000 rw-p 00000000 00:00 0
-40018000-4001b000 r-xp 00000000 5f:00 14435      /lib/libtermcap.so.2.0.8
-4001b000-4001c000 rw-p 00002000 5f:00 14435      /lib/libtermcap.so.2.0.8
-4001c000-4010d000 r-xp 00000000 5f:00 14387      /lib/libc-2.1.2.so
-4010d000-40111000 rw-p 000f0000 5f:00 14387      /lib/libc-2.1.2.so
-40111000-40114000 rw-p 00000000 00:00 0
-40114000-4011e000 r-xp 00000000 5f:00 14408      /lib/libnss_files-2.1.2.so
-4011e000-4011f000 rw-p 00009000 5f:00 14408      /lib/libnss_files-2.1.2.so
-7fffd000-80000000 rwxp ffffe000 00:00 0
-
-
-Showing us the shared libraries init uses where they are in memory
-& memory access permissions for each virtual memory area.
-
-/proc/1/cwd is a softlink to the current working directory.
-/proc/1/root is the root of the filesystem for this process. 
-
-/proc/1/mem is the current running processes memory which you
-can read & write to like a file.
-strace uses this sometimes as it is a bit faster than the
-rather inefficient ptrace interface for peeking at DATA.
-
-
-cat status 
-
-Name:   init
-State:  S (sleeping)
-Pid:    1
-PPid:   0
-Uid:    0       0       0       0
-Gid:    0       0       0       0
-Groups:
-VmSize:      408 kB
-VmLck:         0 kB
-VmRSS:       208 kB
-VmData:       24 kB
-VmStk:         8 kB
-VmExe:       368 kB
-VmLib:         0 kB
-SigPnd: 0000000000000000
-SigBlk: 0000000000000000
-SigIgn: 7fffffffd7f0d8fc
-SigCgt: 00000000280b2603
-CapInh: 00000000fffffeff
-CapPrm: 00000000ffffffff
-CapEff: 00000000fffffeff
-
-User PSW:    070de000 80414146
-task: 004b6000 tss: 004b62d8 ksp: 004b7ca8 pt_regs: 004b7f68
-User GPRS:
-00000400  00000000  0000000b  7ffffa90
-00000000  00000000  00000000  0045d9f4
-0045cafc  7ffffa90  7fffff18  0045cb08
-00010400  804039e8  80403af8  7ffff8b0
-User ACRS:
-00000000  00000000  00000000  00000000
-00000001  00000000  00000000  00000000
-00000000  00000000  00000000  00000000
-00000000  00000000  00000000  00000000
-Kernel BackChain  CallChain    BackChain  CallChain
-       004b7ca8   8002bd0c     004b7d18   8002b92c
-       004b7db8   8005cd50     004b7e38   8005d12a
-       004b7f08   80019114                     
-Showing among other things memory usage & status of some signals &
-the processes'es registers from the kernel task_structure
-as well as a backchain which may be useful if a process crashes
-in the kernel for some unknown reason.
-
-Some driver debugging techniques
-================================
-debug feature
--------------
-Some of our drivers now support a "debug feature" in
-/proc/s390dbf see s390dbf.txt in the linux/Documentation directory
-for more info.
-e.g. 
-to switch on the lcs "debug feature"
-echo 5 > /proc/s390dbf/lcs/level
-& then after the error occurred.
-cat /proc/s390dbf/lcs/sprintf >/logfile
-the logfile now contains some information which may help
-tech support resolve a problem in the field.
-
-
-
-high level debugging network drivers
-------------------------------------
-ifconfig is a quite useful command
-it gives the current state of network drivers.
-
-If you suspect your network device driver is dead
-one way to check is type 
-ifconfig <network device> 
-e.g. tr0
-You should see something like
-tr0       Link encap:16/4 Mbps Token Ring (New)  HWaddr 00:04:AC:20:8E:48
-          inet addr:9.164.185.132  Bcast:9.164.191.255  Mask:255.255.224.0
-          UP BROADCAST RUNNING MULTICAST  MTU:2000  Metric:1
-          RX packets:246134 errors:0 dropped:0 overruns:0 frame:0
-          TX packets:5 errors:0 dropped:0 overruns:0 carrier:0
-          collisions:0 txqueuelen:100
-
-if the device doesn't say up
-try
-/etc/rc.d/init.d/network start 
-( this starts the network stack & hopefully calls ifconfig tr0 up ).
-ifconfig looks at the output of /proc/net/dev and presents it in a more
-presentable form.
-Now ping the device from a machine in the same subnet.
-if the RX packets count & TX packets counts don't increment you probably
-have problems.
-next 
-cat /proc/net/arp
-Do you see any hardware addresses in the cache if not you may have problems.
-Next try
-ping -c 5 <broadcast_addr> i.e. the Bcast field above in the output of
-ifconfig. Do you see any replies from machines other than the local machine
-if not you may have problems. also if the TX packets count in ifconfig
-hasn't incremented either you have serious problems in your driver 
-(e.g. the txbusy field of the network device being stuck on ) 
-or you may have multiple network devices connected.
-
-
-chandev
--------
-There is a new device layer for channel devices, some
-drivers e.g. lcs are registered with this layer.
-If the device uses the channel device layer you'll be
-able to find what interrupts it uses & the current state 
-of the device.
-See the manpage chandev.8 &type cat /proc/chandev for more info.
-
-
-SysRq
-=====
-This is now supported by linux for s/390 & z/Architecture.
-To enable it do compile the kernel with 
-Kernel Hacking -> Magic SysRq Key Enabled
-echo "1" > /proc/sys/kernel/sysrq
-also type
-echo "8" >/proc/sys/kernel/printk
-To make printk output go to console.
-On 390 all commands are prefixed with
-^-
-e.g.
-^-t will show tasks.
-^-? or some unknown command will display help.
-The sysrq key reading is very picky ( I have to type the keys in an
- xterm session & paste them  into the x3270 console )
-& it may be wise to predefine the keys as described in the VM hints above
-
-This is particularly useful for syncing disks unmounting & rebooting
-if the machine gets partially hung.
-
-Read Documentation/admin-guide/sysrq.rst for more info
-
-References:
-===========
-Enterprise Systems Architecture Reference Summary
-Enterprise Systems Architecture Principles of Operation
-Hartmut Penners s390 stack frame sheet.
-IBM Mainframe Channel Attachment a technology brief from a CISCO webpage
-Various bits of man & info pages of Linux.
-Linux & GDB source.
-Various info & man pages.
-CMS Help on tracing commands.
-Linux for s/390 Elf Application Binary Interface
-Linux for z/Series Elf Application Binary Interface ( Both Highly Recommended )
-z/Architecture Principles of Operation SA22-7832-00
-Enterprise Systems Architecture/390 Reference Summary SA22-7209-01 & the
-Enterprise Systems Architecture/390 Principles of Operation SA22-7201-05
-
-Special Thanks
-==============
-Special thanks to Neale Ferguson who maintains a much
-prettier HTML version of this page at
-http://linuxvm.org/penguinvm/
-Bob Grainger Stefan Bader & others for reporting bugs
diff --git a/Documentation/s390/cds.rst b/Documentation/s390/cds.rst
new file mode 100644
index 000000000000..7006d8209d2e
--- /dev/null
+++ b/Documentation/s390/cds.rst
@@ -0,0 +1,530 @@
+===========================
+Linux for S/390 and zSeries
+===========================
+
+Common Device Support (CDS)
+Device Driver I/O Support Routines
+
+Authors:
+	- Ingo Adlung
+	- Cornelia Huck
+
+Copyright, IBM Corp. 1999-2002
+
+Introduction
+============
+
+This document describes the common device support routines for Linux/390.
+Different than other hardware architectures, ESA/390 has defined a unified
+I/O access method. This gives relief to the device drivers as they don't
+have to deal with different bus types, polling versus interrupt
+processing, shared versus non-shared interrupt processing, DMA versus port
+I/O (PIO), and other hardware features more. However, this implies that
+either every single device driver needs to implement the hardware I/O
+attachment functionality itself, or the operating system provides for a
+unified method to access the hardware, providing all the functionality that
+every single device driver would have to provide itself.
+
+The document does not intend to explain the ESA/390 hardware architecture in
+every detail.This information can be obtained from the ESA/390 Principles of
+Operation manual (IBM Form. No. SA22-7201).
+
+In order to build common device support for ESA/390 I/O interfaces, a
+functional layer was introduced that provides generic I/O access methods to
+the hardware.
+
+The common device support layer comprises the I/O support routines defined
+below. Some of them implement common Linux device driver interfaces, while
+some of them are ESA/390 platform specific.
+
+Note:
+  In order to write a driver for S/390, you also need to look into the interface
+  described in Documentation/s390/driver-model.rst.
+
+Note for porting drivers from 2.4:
+
+The major changes are:
+
+* The functions use a ccw_device instead of an irq (subchannel).
+* All drivers must define a ccw_driver (see driver-model.txt) and the associated
+  functions.
+* request_irq() and free_irq() are no longer done by the driver.
+* The oper_handler is (kindof) replaced by the probe() and set_online() functions
+  of the ccw_driver.
+* The not_oper_handler is (kindof) replaced by the remove() and set_offline()
+  functions of the ccw_driver.
+* The channel device layer is gone.
+* The interrupt handlers must be adapted to use a ccw_device as argument.
+  Moreover, they don't return a devstat, but an irb.
+* Before initiating an io, the options must be set via ccw_device_set_options().
+* Instead of calling read_dev_chars()/read_conf_data(), the driver issues
+  the channel program and handles the interrupt itself.
+
+ccw_device_get_ciw()
+   get commands from extended sense data.
+
+ccw_device_start(), ccw_device_start_timeout(), ccw_device_start_key(), ccw_device_start_key_timeout()
+   initiate an I/O request.
+
+ccw_device_resume()
+   resume channel program execution.
+
+ccw_device_halt()
+   terminate the current I/O request processed on the device.
+
+do_IRQ()
+   generic interrupt routine. This function is called by the interrupt entry
+   routine whenever an I/O interrupt is presented to the system. The do_IRQ()
+   routine determines the interrupt status and calls the device specific
+   interrupt handler according to the rules (flags) defined during I/O request
+   initiation with do_IO().
+
+The next chapters describe the functions other than do_IRQ() in more details.
+The do_IRQ() interface is not described, as it is called from the Linux/390
+first level interrupt handler only and does not comprise a device driver
+callable interface. Instead, the functional description of do_IO() also
+describes the input to the device specific interrupt handler.
+
+Note:
+	All explanations apply also to the 64 bit architecture s390x.
+
+
+Common Device Support (CDS) for Linux/390 Device Drivers
+========================================================
+
+General Information
+-------------------
+
+The following chapters describe the I/O related interface routines the
+Linux/390 common device support (CDS) provides to allow for device specific
+driver implementations on the IBM ESA/390 hardware platform. Those interfaces
+intend to provide the functionality required by every device driver
+implementation to allow to drive a specific hardware device on the ESA/390
+platform. Some of the interface routines are specific to Linux/390 and some
+of them can be found on other Linux platforms implementations too.
+Miscellaneous function prototypes, data declarations, and macro definitions
+can be found in the architecture specific C header file
+linux/arch/s390/include/asm/irq.h.
+
+Overview of CDS interface concepts
+----------------------------------
+
+Different to other hardware platforms, the ESA/390 architecture doesn't define
+interrupt lines managed by a specific interrupt controller and bus systems
+that may or may not allow for shared interrupts, DMA processing, etc.. Instead,
+the ESA/390 architecture has implemented a so called channel subsystem, that
+provides a unified view of the devices physically attached to the systems.
+Though the ESA/390 hardware platform knows about a huge variety of different
+peripheral attachments like disk devices (aka. DASDs), tapes, communication
+controllers, etc. they can all be accessed by a well defined access method and
+they are presenting I/O completion a unified way : I/O interruptions. Every
+single device is uniquely identified to the system by a so called subchannel,
+where the ESA/390 architecture allows for 64k devices be attached.
+
+Linux, however, was first built on the Intel PC architecture, with its two
+cascaded 8259 programmable interrupt controllers (PICs), that allow for a
+maximum of 15 different interrupt lines. All devices attached to such a system
+share those 15 interrupt levels. Devices attached to the ISA bus system must
+not share interrupt levels (aka. IRQs), as the ISA bus bases on edge triggered
+interrupts. MCA, EISA, PCI and other bus systems base on level triggered
+interrupts, and therewith allow for shared IRQs. However, if multiple devices
+present their hardware status by the same (shared) IRQ, the operating system
+has to call every single device driver registered on this IRQ in order to
+determine the device driver owning the device that raised the interrupt.
+
+Up to kernel 2.4, Linux/390 used to provide interfaces via the IRQ (subchannel).
+For internal use of the common I/O layer, these are still there. However,
+device drivers should use the new calling interface via the ccw_device only.
+
+During its startup the Linux/390 system checks for peripheral devices. Each
+of those devices is uniquely defined by a so called subchannel by the ESA/390
+channel subsystem. While the subchannel numbers are system generated, each
+subchannel also takes a user defined attribute, the so called device number.
+Both subchannel number and device number cannot exceed 65535. During sysfs
+initialisation, the information about control unit type and device types that
+imply specific I/O commands (channel command words - CCWs) in order to operate
+the device are gathered. Device drivers can retrieve this set of hardware
+information during their initialization step to recognize the devices they
+support using the information saved in the struct ccw_device given to them.
+This methods implies that Linux/390 doesn't require to probe for free (not
+armed) interrupt request lines (IRQs) to drive its devices with. Where
+applicable, the device drivers can use issue the READ DEVICE CHARACTERISTICS
+ccw to retrieve device characteristics in its online routine.
+
+In order to allow for easy I/O initiation the CDS layer provides a
+ccw_device_start() interface that takes a device specific channel program (one
+or more CCWs) as input sets up the required architecture specific control blocks
+and initiates an I/O request on behalf of the device driver. The
+ccw_device_start() routine allows to specify whether it expects the CDS layer
+to notify the device driver for every interrupt it observes, or with final status
+only. See ccw_device_start() for more details. A device driver must never issue
+ESA/390 I/O commands itself, but must use the Linux/390 CDS interfaces instead.
+
+For long running I/O request to be canceled, the CDS layer provides the
+ccw_device_halt() function. Some devices require to initially issue a HALT
+SUBCHANNEL (HSCH) command without having pending I/O requests. This function is
+also covered by ccw_device_halt().
+
+
+get_ciw() - get command information word
+
+This call enables a device driver to get information about supported commands
+from the extended SenseID data.
+
+::
+
+  struct ciw *
+  ccw_device_get_ciw(struct ccw_device *cdev, __u32 cmd);
+
+====  ========================================================
+cdev  The ccw_device for which the command is to be retrieved.
+cmd   The command type to be retrieved.
+====  ========================================================
+
+ccw_device_get_ciw() returns:
+
+=====  ================================================================
+ NULL  No extended data available, invalid device or command not found.
+!NULL  The command requested.
+=====  ================================================================
+
+::
+
+  ccw_device_start() - Initiate I/O Request
+
+The ccw_device_start() routines is the I/O request front-end processor. All
+device driver I/O requests must be issued using this routine. A device driver
+must not issue ESA/390 I/O commands itself. Instead the ccw_device_start()
+routine provides all interfaces required to drive arbitrary devices.
+
+This description also covers the status information passed to the device
+driver's interrupt handler as this is related to the rules (flags) defined
+with the associated I/O request when calling ccw_device_start().
+
+::
+
+  int ccw_device_start(struct ccw_device *cdev,
+		       struct ccw1 *cpa,
+		       unsigned long intparm,
+		       __u8 lpm,
+		       unsigned long flags);
+  int ccw_device_start_timeout(struct ccw_device *cdev,
+			       struct ccw1 *cpa,
+			       unsigned long intparm,
+			       __u8 lpm,
+			       unsigned long flags,
+			       int expires);
+  int ccw_device_start_key(struct ccw_device *cdev,
+			   struct ccw1 *cpa,
+			   unsigned long intparm,
+			   __u8 lpm,
+			   __u8 key,
+			   unsigned long flags);
+  int ccw_device_start_key_timeout(struct ccw_device *cdev,
+				   struct ccw1 *cpa,
+				   unsigned long intparm,
+				   __u8 lpm,
+				   __u8 key,
+				   unsigned long flags,
+				   int expires);
+
+============= =============================================================
+cdev          ccw_device the I/O is destined for
+cpa           logical start address of channel program
+user_intparm  user specific interrupt information; will be presented
+	      back to the device driver's interrupt handler. Allows a
+	      device driver to associate the interrupt with a
+	      particular I/O request.
+lpm           defines the channel path to be used for a specific I/O
+	      request. A value of 0 will make cio use the opm.
+key           the storage key to use for the I/O (useful for operating on a
+	      storage with a storage key != default key)
+flag          defines the action to be performed for I/O processing
+expires       timeout value in jiffies. The common I/O layer will terminate
+	      the running program after this and call the interrupt handler
+	      with ERR_PTR(-ETIMEDOUT) as irb.
+============= =============================================================
+
+Possible flag values are:
+
+========================= =============================================
+DOIO_ALLOW_SUSPEND        channel program may become suspended
+DOIO_DENY_PREFETCH        don't allow for CCW prefetch; usually
+			  this implies the channel program might
+			  become modified
+DOIO_SUPPRESS_INTER       don't call the handler on intermediate status
+========================= =============================================
+
+The cpa parameter points to the first format 1 CCW of a channel program::
+
+  struct ccw1 {
+	__u8  cmd_code;/* command code */
+	__u8  flags;   /* flags, like IDA addressing, etc. */
+	__u16 count;   /* byte count */
+	__u32 cda;     /* data address */
+  } __attribute__ ((packed,aligned(8)));
+
+with the following CCW flags values defined:
+
+=================== =========================
+CCW_FLAG_DC         data chaining
+CCW_FLAG_CC         command chaining
+CCW_FLAG_SLI        suppress incorrect length
+CCW_FLAG_SKIP       skip
+CCW_FLAG_PCI        PCI
+CCW_FLAG_IDA        indirect addressing
+CCW_FLAG_SUSPEND    suspend
+=================== =========================
+
+
+Via ccw_device_set_options(), the device driver may specify the following
+options for the device:
+
+========================= ======================================
+DOIO_EARLY_NOTIFICATION   allow for early interrupt notification
+DOIO_REPORT_ALL           report all interrupt conditions
+========================= ======================================
+
+
+The ccw_device_start() function returns:
+
+======== ======================================================================
+      0  successful completion or request successfully initiated
+ -EBUSY  The device is currently processing a previous I/O request, or there is
+	 a status pending at the device.
+-ENODEV  cdev is invalid, the device is not operational or the ccw_device is
+	 not online.
+======== ======================================================================
+
+When the I/O request completes, the CDS first level interrupt handler will
+accumulate the status in a struct irb and then call the device interrupt handler.
+The intparm field will contain the value the device driver has associated with a
+particular I/O request. If a pending device status was recognized,
+intparm will be set to 0 (zero). This may happen during I/O initiation or delayed
+by an alert status notification. In any case this status is not related to the
+current (last) I/O request. In case of a delayed status notification no special
+interrupt will be presented to indicate I/O completion as the I/O request was
+never started, even though ccw_device_start() returned with successful completion.
+
+The irb may contain an error value, and the device driver should check for this
+first:
+
+========== =================================================================
+-ETIMEDOUT the common I/O layer terminated the request after the specified
+	   timeout value
+-EIO       the common I/O layer terminated the request due to an error state
+========== =================================================================
+
+If the concurrent sense flag in the extended status word (esw) in the irb is
+set, the field erw.scnt in the esw describes the number of device specific
+sense bytes available in the extended control word irb->scsw.ecw[]. No device
+sensing by the device driver itself is required.
+
+The device interrupt handler can use the following definitions to investigate
+the primary unit check source coded in sense byte 0 :
+
+======================= ====
+SNS0_CMD_REJECT         0x80
+SNS0_INTERVENTION_REQ   0x40
+SNS0_BUS_OUT_CHECK      0x20
+SNS0_EQUIPMENT_CHECK    0x10
+SNS0_DATA_CHECK         0x08
+SNS0_OVERRUN            0x04
+SNS0_INCOMPL_DOMAIN     0x01
+======================= ====
+
+Depending on the device status, multiple of those values may be set together.
+Please refer to the device specific documentation for details.
+
+The irb->scsw.cstat field provides the (accumulated) subchannel status :
+
+========================= ============================
+SCHN_STAT_PCI             program controlled interrupt
+SCHN_STAT_INCORR_LEN      incorrect length
+SCHN_STAT_PROG_CHECK      program check
+SCHN_STAT_PROT_CHECK      protection check
+SCHN_STAT_CHN_DATA_CHK    channel data check
+SCHN_STAT_CHN_CTRL_CHK    channel control check
+SCHN_STAT_INTF_CTRL_CHK   interface control check
+SCHN_STAT_CHAIN_CHECK     chaining check
+========================= ============================
+
+The irb->scsw.dstat field provides the (accumulated) device status :
+
+===================== =================
+DEV_STAT_ATTENTION    attention
+DEV_STAT_STAT_MOD     status modifier
+DEV_STAT_CU_END       control unit end
+DEV_STAT_BUSY         busy
+DEV_STAT_CHN_END      channel end
+DEV_STAT_DEV_END      device end
+DEV_STAT_UNIT_CHECK   unit check
+DEV_STAT_UNIT_EXCEP   unit exception
+===================== =================
+
+Please see the ESA/390 Principles of Operation manual for details on the
+individual flag meanings.
+
+Usage Notes:
+
+ccw_device_start() must be called disabled and with the ccw device lock held.
+
+The device driver is allowed to issue the next ccw_device_start() call from
+within its interrupt handler already. It is not required to schedule a
+bottom-half, unless a non deterministically long running error recovery procedure
+or similar needs to be scheduled. During I/O processing the Linux/390 generic
+I/O device driver support has already obtained the IRQ lock, i.e. the handler
+must not try to obtain it again when calling ccw_device_start() or we end in a
+deadlock situation!
+
+If a device driver relies on an I/O request to be completed prior to start the
+next it can reduce I/O processing overhead by chaining a NoOp I/O command
+CCW_CMD_NOOP to the end of the submitted CCW chain. This will force Channel-End
+and Device-End status to be presented together, with a single interrupt.
+However, this should be used with care as it implies the channel will remain
+busy, not being able to process I/O requests for other devices on the same
+channel. Therefore e.g. read commands should never use this technique, as the
+result will be presented by a single interrupt anyway.
+
+In order to minimize I/O overhead, a device driver should use the
+DOIO_REPORT_ALL  only if the device can report intermediate interrupt
+information prior to device-end the device driver urgently relies on. In this
+case all I/O interruptions are presented to the device driver until final
+status is recognized.
+
+If a device is able to recover from asynchronously presented I/O errors, it can
+perform overlapping I/O using the DOIO_EARLY_NOTIFICATION flag. While some
+devices always report channel-end and device-end together, with a single
+interrupt, others present primary status (channel-end) when the channel is
+ready for the next I/O request and secondary status (device-end) when the data
+transmission has been completed at the device.
+
+Above flag allows to exploit this feature, e.g. for communication devices that
+can handle lost data on the network to allow for enhanced I/O processing.
+
+Unless the channel subsystem at any time presents a secondary status interrupt,
+exploiting this feature will cause only primary status interrupts to be
+presented to the device driver while overlapping I/O is performed. When a
+secondary status without error (alert status) is presented, this indicates
+successful completion for all overlapping ccw_device_start() requests that have
+been issued since the last secondary (final) status.
+
+Channel programs that intend to set the suspend flag on a channel command word
+(CCW)  must start the I/O operation with the DOIO_ALLOW_SUSPEND option or the
+suspend flag will cause a channel program check. At the time the channel program
+becomes suspended an intermediate interrupt will be generated by the channel
+subsystem.
+
+ccw_device_resume() - Resume Channel Program Execution
+
+If a device driver chooses to suspend the current channel program execution by
+setting the CCW suspend flag on a particular CCW, the channel program execution
+is suspended. In order to resume channel program execution the CIO layer
+provides the ccw_device_resume() routine.
+
+::
+
+  int ccw_device_resume(struct ccw_device *cdev);
+
+====  ================================================
+cdev  ccw_device the resume operation is requested for
+====  ================================================
+
+The ccw_device_resume() function returns:
+
+=========   ==============================================
+	0   suspended channel program is resumed
+   -EBUSY   status pending
+  -ENODEV   cdev invalid or not-operational subchannel
+  -EINVAL   resume function not applicable
+-ENOTCONN   there is no I/O request pending for completion
+=========   ==============================================
+
+Usage Notes:
+
+Please have a look at the ccw_device_start() usage notes for more details on
+suspended channel programs.
+
+ccw_device_halt() - Halt I/O Request Processing
+
+Sometimes a device driver might need a possibility to stop the processing of
+a long-running channel program or the device might require to initially issue
+a halt subchannel (HSCH) I/O command. For those purposes the ccw_device_halt()
+command is provided.
+
+ccw_device_halt() must be called disabled and with the ccw device lock held.
+
+::
+
+  int ccw_device_halt(struct ccw_device *cdev,
+		      unsigned long intparm);
+
+=======  =====================================================
+cdev     ccw_device the halt operation is requested for
+intparm  interruption parameter; value is only used if no I/O
+	 is outstanding, otherwise the intparm associated with
+	 the I/O request is returned
+=======  =====================================================
+
+The ccw_device_halt() function returns:
+
+=======  ==============================================================
+      0  request successfully initiated
+-EBUSY   the device is currently busy, or status pending.
+-ENODEV  cdev invalid.
+-EINVAL  The device is not operational or the ccw device is not online.
+=======  ==============================================================
+
+Usage Notes:
+
+A device driver may write a never-ending channel program by writing a channel
+program that at its end loops back to its beginning by means of a transfer in
+channel (TIC)   command (CCW_CMD_TIC). Usually this is performed by network
+device drivers by setting the PCI CCW flag (CCW_FLAG_PCI). Once this CCW is
+executed a program controlled interrupt (PCI) is generated. The device driver
+can then perform an appropriate action. Prior to interrupt of an outstanding
+read to a network device (with or without PCI flag) a ccw_device_halt()
+is required to end the pending operation.
+
+::
+
+  ccw_device_clear() - Terminage I/O Request Processing
+
+In order to terminate all I/O processing at the subchannel, the clear subchannel
+(CSCH) command is used. It can be issued via ccw_device_clear().
+
+ccw_device_clear() must be called disabled and with the ccw device lock held.
+
+::
+
+  int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm);
+
+======= ===============================================
+cdev    ccw_device the clear operation is requested for
+intparm interruption parameter (see ccw_device_halt())
+======= ===============================================
+
+The ccw_device_clear() function returns:
+
+=======  ==============================================================
+      0  request successfully initiated
+-ENODEV  cdev invalid
+-EINVAL  The device is not operational or the ccw device is not online.
+=======  ==============================================================
+
+Miscellaneous Support Routines
+------------------------------
+
+This chapter describes various routines to be used in a Linux/390 device
+driver programming environment.
+
+get_ccwdev_lock()
+
+Get the address of the device specific lock. This is then used in
+spin_lock() / spin_unlock() calls.
+
+::
+
+  __u8 ccw_device_get_path_mask(struct ccw_device *cdev);
+
+Get the mask of the path currently available for cdev.
diff --git a/Documentation/s390/cds.txt b/Documentation/s390/cds.txt
deleted file mode 100644
index 480a78ef5a1e..000000000000
--- a/Documentation/s390/cds.txt
+++ /dev/null
@@ -1,472 +0,0 @@
-Linux for S/390 and zSeries
-
-Common Device Support (CDS)
-Device Driver I/O Support Routines
-
-Authors : Ingo Adlung
-	  Cornelia Huck
-
-Copyright, IBM Corp. 1999-2002
-
-Introduction
-
-This document describes the common device support routines for Linux/390.
-Different than other hardware architectures, ESA/390 has defined a unified
-I/O access method. This gives relief to the device drivers as they don't
-have to deal with different bus types, polling versus interrupt
-processing, shared versus non-shared interrupt processing, DMA versus port
-I/O (PIO), and other hardware features more. However, this implies that
-either every single device driver needs to implement the hardware I/O
-attachment functionality itself, or the operating system provides for a
-unified method to access the hardware, providing all the functionality that
-every single device driver would have to provide itself.
-
-The document does not intend to explain the ESA/390 hardware architecture in
-every detail.This information can be obtained from the ESA/390 Principles of
-Operation manual (IBM Form. No. SA22-7201).
-
-In order to build common device support for ESA/390 I/O interfaces, a
-functional layer was introduced that provides generic I/O access methods to
-the hardware. 
-
-The common device support layer comprises the I/O support routines defined 
-below. Some of them implement common Linux device driver interfaces, while 
-some of them are ESA/390 platform specific.
-
-Note:
-In order to write a driver for S/390, you also need to look into the interface
-described in Documentation/s390/driver-model.txt.
-
-Note for porting drivers from 2.4:
-The major changes are:
-* The functions use a ccw_device instead of an irq (subchannel).
-* All drivers must define a ccw_driver (see driver-model.txt) and the associated
-  functions.
-* request_irq() and free_irq() are no longer done by the driver.
-* The oper_handler is (kindof) replaced by the probe() and set_online() functions
-  of the ccw_driver.
-* The not_oper_handler is (kindof) replaced by the remove() and set_offline()
-  functions of the ccw_driver.
-* The channel device layer is gone.
-* The interrupt handlers must be adapted to use a ccw_device as argument.
-  Moreover, they don't return a devstat, but an irb.
-* Before initiating an io, the options must be set via ccw_device_set_options().
-* Instead of calling read_dev_chars()/read_conf_data(), the driver issues
-  the channel program and handles the interrupt itself.
-
-ccw_device_get_ciw()
-   get commands from extended sense data.
-
-ccw_device_start()	
-ccw_device_start_timeout()
-ccw_device_start_key()
-ccw_device_start_key_timeout()
-   initiate an I/O request.
-
-ccw_device_resume()
-   resume channel program execution.
-
-ccw_device_halt()	
-   terminate the current I/O request processed on the device.
-
-do_IRQ()	
-   generic interrupt routine. This function is called by the interrupt entry
-   routine whenever an I/O interrupt is presented to the system. The do_IRQ()
-   routine determines the interrupt status and calls the device specific
-   interrupt handler according to the rules (flags) defined during I/O request
-   initiation with do_IO().
-
-The next chapters describe the functions other than do_IRQ() in more details.
-The do_IRQ() interface is not described, as it is called from the Linux/390
-first level interrupt handler only and does not comprise a device driver
-callable interface. Instead, the functional description of do_IO() also
-describes the input to the device specific interrupt handler.
-
-Note: All explanations apply also to the 64 bit architecture s390x.
-
-
-Common Device Support (CDS) for Linux/390 Device Drivers
-
-General Information
-
-The following chapters describe the I/O related interface routines the
-Linux/390 common device support (CDS) provides to allow for device specific
-driver implementations on the IBM ESA/390 hardware platform. Those interfaces
-intend to provide the functionality required by every device driver
-implementation to allow to drive a specific hardware device on the ESA/390
-platform. Some of the interface routines are specific to Linux/390 and some
-of them can be found on other Linux platforms implementations too.
-Miscellaneous function prototypes, data declarations, and macro definitions
-can be found in the architecture specific C header file
-linux/arch/s390/include/asm/irq.h.
-
-Overview of CDS interface concepts
-
-Different to other hardware platforms, the ESA/390 architecture doesn't define
-interrupt lines managed by a specific interrupt controller and bus systems
-that may or may not allow for shared interrupts, DMA processing, etc.. Instead,
-the ESA/390 architecture has implemented a so called channel subsystem, that
-provides a unified view of the devices physically attached to the systems.
-Though the ESA/390 hardware platform knows about a huge variety of different
-peripheral attachments like disk devices (aka. DASDs), tapes, communication
-controllers, etc. they can all be accessed by a well defined access method and
-they are presenting I/O completion a unified way : I/O interruptions. Every
-single device is uniquely identified to the system by a so called subchannel,
-where the ESA/390 architecture allows for 64k devices be attached.
-
-Linux, however, was first built on the Intel PC architecture, with its two
-cascaded 8259 programmable interrupt controllers (PICs), that allow for a
-maximum of 15 different interrupt lines. All devices attached to such a system
-share those 15 interrupt levels. Devices attached to the ISA bus system must
-not share interrupt levels (aka. IRQs), as the ISA bus bases on edge triggered
-interrupts. MCA, EISA, PCI and other bus systems base on level triggered
-interrupts, and therewith allow for shared IRQs. However, if multiple devices
-present their hardware status by the same (shared) IRQ, the operating system
-has to call every single device driver registered on this IRQ in order to
-determine the device driver owning the device that raised the interrupt.
-
-Up to kernel 2.4, Linux/390 used to provide interfaces via the IRQ (subchannel).
-For internal use of the common I/O layer, these are still there. However, 
-device drivers should use the new calling interface via the ccw_device only.
-
-During its startup the Linux/390 system checks for peripheral devices. Each
-of those devices is uniquely defined by a so called subchannel by the ESA/390
-channel subsystem. While the subchannel numbers are system generated, each
-subchannel also takes a user defined attribute, the so called device number.
-Both subchannel number and device number cannot exceed 65535. During sysfs
-initialisation, the information about control unit type and device types that 
-imply specific I/O commands (channel command words - CCWs) in order to operate
-the device are gathered. Device drivers can retrieve this set of hardware
-information during their initialization step to recognize the devices they
-support using the information saved in the struct ccw_device given to them.
-This methods implies that Linux/390 doesn't require to probe for free (not
-armed) interrupt request lines (IRQs) to drive its devices with. Where
-applicable, the device drivers can use issue the READ DEVICE CHARACTERISTICS
-ccw to retrieve device characteristics in its online routine.
-
-In order to allow for easy I/O initiation the CDS layer provides a
-ccw_device_start() interface that takes a device specific channel program (one
-or more CCWs) as input sets up the required architecture specific control blocks
-and initiates an I/O request on behalf of the device driver. The
-ccw_device_start() routine allows to specify whether it expects the CDS layer
-to notify the device driver for every interrupt it observes, or with final status
-only. See ccw_device_start() for more details. A device driver must never issue
-ESA/390 I/O commands itself, but must use the Linux/390 CDS interfaces instead.
-
-For long running I/O request to be canceled, the CDS layer provides the
-ccw_device_halt() function. Some devices require to initially issue a HALT
-SUBCHANNEL (HSCH) command without having pending I/O requests. This function is
-also covered by ccw_device_halt().
-
-
-get_ciw() - get command information word
-
-This call enables a device driver to get information about supported commands
-from the extended SenseID data.
-
-struct ciw *
-ccw_device_get_ciw(struct ccw_device *cdev, __u32 cmd);
-
-cdev - The ccw_device for which the command is to be retrieved.
-cmd  - The command type to be retrieved.
-
-ccw_device_get_ciw() returns:
-NULL    - No extended data available, invalid device or command not found.
-!NULL   - The command requested.
-
-
-ccw_device_start() - Initiate I/O Request
-
-The ccw_device_start() routines is the I/O request front-end processor. All
-device driver I/O requests must be issued using this routine. A device driver
-must not issue ESA/390 I/O commands itself. Instead the ccw_device_start()
-routine provides all interfaces required to drive arbitrary devices.
-
-This description also covers the status information passed to the device
-driver's interrupt handler as this is related to the rules (flags) defined
-with the associated I/O request when calling ccw_device_start().
-
-int ccw_device_start(struct ccw_device *cdev,
-		     struct ccw1 *cpa,
-		     unsigned long intparm,
-		     __u8 lpm,
-		     unsigned long flags);
-int ccw_device_start_timeout(struct ccw_device *cdev,
-			     struct ccw1 *cpa,
-			     unsigned long intparm,
-			     __u8 lpm,
-			     unsigned long flags,
-			     int expires);
-int ccw_device_start_key(struct ccw_device *cdev,
-			 struct ccw1 *cpa,
-			 unsigned long intparm,
-			 __u8 lpm,
-			 __u8 key,
-			 unsigned long flags);
-int ccw_device_start_key_timeout(struct ccw_device *cdev,
-				 struct ccw1 *cpa,
-				 unsigned long intparm,
-				 __u8 lpm,
-				 __u8 key,
-				 unsigned long flags,
-				 int expires);
-
-cdev         : ccw_device the I/O is destined for
-cpa          : logical start address of channel program
-user_intparm : user specific interrupt information; will be presented
-	       back to the device driver's interrupt handler. Allows a
-               device driver to associate the interrupt with a
-               particular I/O request.
-lpm          : defines the channel path to be used for a specific I/O
-               request. A value of 0 will make cio use the opm.
-key	     : the storage key to use for the I/O (useful for operating on a
-	       storage with a storage key != default key)
-flag         : defines the action to be performed for I/O processing
-expires      : timeout value in jiffies. The common I/O layer will terminate
-	       the running program after this and call the interrupt handler
-	       with ERR_PTR(-ETIMEDOUT) as irb.
-
-Possible flag values are :
-
-DOIO_ALLOW_SUSPEND       - channel program may become suspended
-DOIO_DENY_PREFETCH       - don't allow for CCW prefetch; usually
-                           this implies the channel program might
-                           become modified
-DOIO_SUPPRESS_INTER     - don't call the handler on intermediate status
-
-The cpa parameter points to the first format 1 CCW of a channel program :
-
-struct ccw1 {
-      __u8  cmd_code;/* command code */
-      __u8  flags;   /* flags, like IDA addressing, etc. */
-      __u16 count;   /* byte count */
-      __u32 cda;     /* data address */
-} __attribute__ ((packed,aligned(8)));
-
-with the following CCW flags values defined :
-
-CCW_FLAG_DC        - data chaining
-CCW_FLAG_CC        - command chaining
-CCW_FLAG_SLI       - suppress incorrect length
-CCW_FLAG_SKIP      - skip
-CCW_FLAG_PCI       - PCI
-CCW_FLAG_IDA       - indirect addressing
-CCW_FLAG_SUSPEND   - suspend
-
-
-Via ccw_device_set_options(), the device driver may specify the following
-options for the device:
-
-DOIO_EARLY_NOTIFICATION  - allow for early interrupt notification
-DOIO_REPORT_ALL          - report all interrupt conditions
-
-
-The ccw_device_start() function returns :
-
-      0 - successful completion or request successfully initiated
--EBUSY	- The device is currently processing a previous I/O request, or there is
-          a status pending at the device.
--ENODEV - cdev is invalid, the device is not operational or the ccw_device is
-          not online.
-
-When the I/O request completes, the CDS first level interrupt handler will
-accumulate the status in a struct irb and then call the device interrupt handler.
-The intparm field will contain the value the device driver has associated with a 
-particular I/O request. If a pending device status was recognized, 
-intparm will be set to 0 (zero). This may happen during I/O initiation or delayed
-by an alert status notification. In any case this status is not related to the
-current (last) I/O request. In case of a delayed status notification no special
-interrupt will be presented to indicate I/O completion as the I/O request was
-never started, even though ccw_device_start() returned with successful completion.
-
-The irb may contain an error value, and the device driver should check for this
-first:
-
--ETIMEDOUT: the common I/O layer terminated the request after the specified
-            timeout value
--EIO:       the common I/O layer terminated the request due to an error state
-
-If the concurrent sense flag in the extended status word (esw) in the irb is
-set, the field erw.scnt in the esw describes the number of device specific
-sense bytes available in the extended control word irb->scsw.ecw[]. No device
-sensing by the device driver itself is required.
-
-The device interrupt handler can use the following definitions to investigate
-the primary unit check source coded in sense byte 0 :
-
-SNS0_CMD_REJECT         0x80
-SNS0_INTERVENTION_REQ   0x40
-SNS0_BUS_OUT_CHECK      0x20
-SNS0_EQUIPMENT_CHECK    0x10
-SNS0_DATA_CHECK         0x08
-SNS0_OVERRUN            0x04
-SNS0_INCOMPL_DOMAIN     0x01
-
-Depending on the device status, multiple of those values may be set together.
-Please refer to the device specific documentation for details.
-
-The irb->scsw.cstat field provides the (accumulated) subchannel status :
-
-SCHN_STAT_PCI            - program controlled interrupt
-SCHN_STAT_INCORR_LEN     - incorrect length
-SCHN_STAT_PROG_CHECK     - program check
-SCHN_STAT_PROT_CHECK     - protection check
-SCHN_STAT_CHN_DATA_CHK   - channel data check
-SCHN_STAT_CHN_CTRL_CHK   - channel control check
-SCHN_STAT_INTF_CTRL_CHK  - interface control check
-SCHN_STAT_CHAIN_CHECK    - chaining check
-
-The irb->scsw.dstat field provides the (accumulated) device status :
-
-DEV_STAT_ATTENTION   - attention
-DEV_STAT_STAT_MOD    - status modifier
-DEV_STAT_CU_END      - control unit end
-DEV_STAT_BUSY        - busy
-DEV_STAT_CHN_END     - channel end
-DEV_STAT_DEV_END     - device end
-DEV_STAT_UNIT_CHECK  - unit check
-DEV_STAT_UNIT_EXCEP  - unit exception
-
-Please see the ESA/390 Principles of Operation manual for details on the
-individual flag meanings.
-
-Usage Notes :
-
-ccw_device_start() must be called disabled and with the ccw device lock held.
-
-The device driver is allowed to issue the next ccw_device_start() call from
-within its interrupt handler already. It is not required to schedule a
-bottom-half, unless a non deterministically long running error recovery procedure
-or similar needs to be scheduled. During I/O processing the Linux/390 generic
-I/O device driver support has already obtained the IRQ lock, i.e. the handler
-must not try to obtain it again when calling ccw_device_start() or we end in a
-deadlock situation!
-
-If a device driver relies on an I/O request to be completed prior to start the
-next it can reduce I/O processing overhead by chaining a NoOp I/O command
-CCW_CMD_NOOP to the end of the submitted CCW chain. This will force Channel-End
-and Device-End status to be presented together, with a single interrupt.
-However, this should be used with care as it implies the channel will remain
-busy, not being able to process I/O requests for other devices on the same
-channel. Therefore e.g. read commands should never use this technique, as the
-result will be presented by a single interrupt anyway.
-
-In order to minimize I/O overhead, a device driver should use the
-DOIO_REPORT_ALL  only if the device can report intermediate interrupt
-information prior to device-end the device driver urgently relies on. In this
-case all I/O interruptions are presented to the device driver until final
-status is recognized.
-
-If a device is able to recover from asynchronously presented I/O errors, it can
-perform overlapping I/O using the DOIO_EARLY_NOTIFICATION flag. While some
-devices always report channel-end and device-end together, with a single
-interrupt, others present primary status (channel-end) when the channel is
-ready for the next I/O request and secondary status (device-end) when the data
-transmission has been completed at the device.
-
-Above flag allows to exploit this feature, e.g. for communication devices that
-can handle lost data on the network to allow for enhanced I/O processing.
-
-Unless the channel subsystem at any time presents a secondary status interrupt,
-exploiting this feature will cause only primary status interrupts to be
-presented to the device driver while overlapping I/O is performed. When a
-secondary status without error (alert status) is presented, this indicates
-successful completion for all overlapping ccw_device_start() requests that have
-been issued since the last secondary (final) status.
-
-Channel programs that intend to set the suspend flag on a channel command word 
-(CCW)  must start the I/O operation with the DOIO_ALLOW_SUSPEND option or the 
-suspend flag will cause a channel program check. At the time the channel program 
-becomes suspended an intermediate interrupt will be generated by the channel 
-subsystem.
-
-ccw_device_resume() - Resume Channel Program Execution 
-
-If a device driver chooses to suspend the current channel program execution by 
-setting the CCW suspend flag on a particular CCW, the channel program execution 
-is suspended. In order to resume channel program execution the CIO layer 
-provides the ccw_device_resume() routine. 
-
-int ccw_device_resume(struct ccw_device *cdev);
-
-cdev - ccw_device the resume operation is requested for
-
-The ccw_device_resume() function returns:
-
-        0  - suspended channel program is resumed
--EBUSY     - status pending
--ENODEV    - cdev invalid or not-operational subchannel 
--EINVAL    - resume function not applicable  
--ENOTCONN  - there is no I/O request pending for completion 
-
-Usage Notes:
-Please have a look at the ccw_device_start() usage notes for more details on
-suspended channel programs.
-
-ccw_device_halt() - Halt I/O Request Processing
-
-Sometimes a device driver might need a possibility to stop the processing of
-a long-running channel program or the device might require to initially issue
-a halt subchannel (HSCH) I/O command. For those purposes the ccw_device_halt()
-command is provided.
-
-ccw_device_halt() must be called disabled and with the ccw device lock held.
-
-int ccw_device_halt(struct ccw_device *cdev,
-                    unsigned long intparm);
-
-cdev    : ccw_device the halt operation is requested for
-intparm : interruption parameter; value is only used if no I/O
-          is outstanding, otherwise the intparm associated with
-          the I/O request is returned
-
-The ccw_device_halt() function returns :
-
-      0 - request successfully initiated
--EBUSY  - the device is currently busy, or status pending.
--ENODEV - cdev invalid.
--EINVAL - The device is not operational or the ccw device is not online.
-
-Usage Notes :
-
-A device driver may write a never-ending channel program by writing a channel
-program that at its end loops back to its beginning by means of a transfer in
-channel (TIC)   command (CCW_CMD_TIC). Usually this is performed by network
-device drivers by setting the PCI CCW flag (CCW_FLAG_PCI). Once this CCW is
-executed a program controlled interrupt (PCI) is generated. The device driver
-can then perform an appropriate action. Prior to interrupt of an outstanding
-read to a network device (with or without PCI flag) a ccw_device_halt()
-is required to end the pending operation.
-
-ccw_device_clear() - Terminage I/O Request Processing
-
-In order to terminate all I/O processing at the subchannel, the clear subchannel
-(CSCH) command is used. It can be issued via ccw_device_clear().
-
-ccw_device_clear() must be called disabled and with the ccw device lock held.
-
-int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm);
-
-cdev:	 ccw_device the clear operation is requested for
-intparm: interruption parameter (see ccw_device_halt())
-
-The ccw_device_clear() function returns:
-
-      0 - request successfully initiated
--ENODEV - cdev invalid
--EINVAL - The device is not operational or the ccw device is not online.
-
-Miscellaneous Support Routines
-
-This chapter describes various routines to be used in a Linux/390 device
-driver programming environment.
-
-get_ccwdev_lock()
-
-Get the address of the device specific lock. This is then used in
-spin_lock() / spin_unlock() calls.
-
-
-__u8 ccw_device_get_path_mask(struct ccw_device *cdev);
-
-Get the mask of the path currently available for cdev.
diff --git a/Documentation/s390/common_io.rst b/Documentation/s390/common_io.rst
new file mode 100644
index 000000000000..846485681ce7
--- /dev/null
+++ b/Documentation/s390/common_io.rst
@@ -0,0 +1,140 @@
+======================
+S/390 common I/O-Layer
+======================
+
+command line parameters, procfs and debugfs entries
+===================================================
+
+Command line parameters
+-----------------------
+
+* ccw_timeout_log
+
+  Enable logging of debug information in case of ccw device timeouts.
+
+* cio_ignore = device[,device[,..]]
+
+	device := {all | [!]ipldev | [!]condev | [!]<devno> | [!]<devno>-<devno>}
+
+  The given devices will be ignored by the common I/O-layer; no detection
+  and device sensing will be done on any of those devices. The subchannel to
+  which the device in question is attached will be treated as if no device was
+  attached.
+
+  An ignored device can be un-ignored later; see the "/proc entries"-section for
+  details.
+
+  The devices must be given either as bus ids (0.x.abcd) or as hexadecimal
+  device numbers (0xabcd or abcd, for 2.4 backward compatibility). If you
+  give a device number 0xabcd, it will be interpreted as 0.0.abcd.
+
+  You can use the 'all' keyword to ignore all devices. The 'ipldev' and 'condev'
+  keywords can be used to refer to the CCW based boot device and CCW console
+  device respectively (these are probably useful only when combined with the '!'
+  operator). The '!' operator will cause the I/O-layer to _not_ ignore a device.
+  The command line
+  is parsed from left to right.
+
+  For example::
+
+	cio_ignore=0.0.0023-0.0.0042,0.0.4711
+
+  will ignore all devices ranging from 0.0.0023 to 0.0.0042 and the device
+  0.0.4711, if detected.
+
+  As another example::
+
+	cio_ignore=all,!0.0.4711,!0.0.fd00-0.0.fd02
+
+  will ignore all devices but 0.0.4711, 0.0.fd00, 0.0.fd01, 0.0.fd02.
+
+  By default, no devices are ignored.
+
+
+/proc entries
+-------------
+
+* /proc/cio_ignore
+
+  Lists the ranges of devices (by bus id) which are ignored by common I/O.
+
+  You can un-ignore certain or all devices by piping to /proc/cio_ignore.
+  "free all" will un-ignore all ignored devices,
+  "free <device range>, <device range>, ..." will un-ignore the specified
+  devices.
+
+  For example, if devices 0.0.0023 to 0.0.0042 and 0.0.4711 are ignored,
+
+  - echo free 0.0.0030-0.0.0032 > /proc/cio_ignore
+    will un-ignore devices 0.0.0030 to 0.0.0032 and will leave devices 0.0.0023
+    to 0.0.002f, 0.0.0033 to 0.0.0042 and 0.0.4711 ignored;
+  - echo free 0.0.0041 > /proc/cio_ignore will furthermore un-ignore device
+    0.0.0041;
+  - echo free all > /proc/cio_ignore will un-ignore all remaining ignored
+    devices.
+
+  When a device is un-ignored, device recognition and sensing is performed and
+  the device driver will be notified if possible, so the device will become
+  available to the system. Note that un-ignoring is performed asynchronously.
+
+  You can also add ranges of devices to be ignored by piping to
+  /proc/cio_ignore; "add <device range>, <device range>, ..." will ignore the
+  specified devices.
+
+  Note: While already known devices can be added to the list of devices to be
+	ignored, there will be no effect on then. However, if such a device
+	disappears and then reappears, it will then be ignored. To make
+	known devices go away, you need the "purge" command (see below).
+
+  For example::
+
+	"echo add 0.0.a000-0.0.accc, 0.0.af00-0.0.afff > /proc/cio_ignore"
+
+  will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored
+  devices.
+
+  You can remove already known but now ignored devices via::
+
+	"echo purge > /proc/cio_ignore"
+
+  All devices ignored but still registered and not online (= not in use)
+  will be deregistered and thus removed from the system.
+
+  The devices can be specified either by bus id (0.x.abcd) or, for 2.4 backward
+  compatibility, by the device number in hexadecimal (0xabcd or abcd). Device
+  numbers given as 0xabcd will be interpreted as 0.0.abcd.
+
+* /proc/cio_settle
+
+  A write request to this file is blocked until all queued cio actions are
+  handled. This will allow userspace to wait for pending work affecting
+  device availability after changing cio_ignore or the hardware configuration.
+
+* For some of the information present in the /proc filesystem in 2.4 (namely,
+  /proc/subchannels and /proc/chpids), see driver-model.txt.
+  Information formerly in /proc/irq_count is now in /proc/interrupts.
+
+
+debugfs entries
+---------------
+
+* /sys/kernel/debug/s390dbf/cio_*/ (S/390 debug feature)
+
+  Some views generated by the debug feature to hold various debug outputs.
+
+  - /sys/kernel/debug/s390dbf/cio_crw/sprintf
+    Messages from the processing of pending channel report words (machine check
+    handling).
+
+  - /sys/kernel/debug/s390dbf/cio_msg/sprintf
+    Various debug messages from the common I/O-layer.
+
+  - /sys/kernel/debug/s390dbf/cio_trace/hex_ascii
+    Logs the calling of functions in the common I/O-layer and, if applicable,
+    which subchannel they were called for, as well as dumps of some data
+    structures (like irb in an error case).
+
+  The level of logging can be changed to be more or less verbose by piping to
+  /sys/kernel/debug/s390dbf/cio_*/level a number between 0 and 6; see the
+  documentation on the S/390 debug feature (Documentation/s390/s390dbf.rst)
+  for details.
diff --git a/Documentation/s390/dasd.rst b/Documentation/s390/dasd.rst
new file mode 100644
index 000000000000..9e22247285c8
--- /dev/null
+++ b/Documentation/s390/dasd.rst
@@ -0,0 +1,84 @@
+==================
+DASD device driver
+==================
+
+S/390's disk devices (DASDs) are managed by Linux via the DASD device
+driver. It is valid for all types of DASDs and represents them to
+Linux as block devices, namely "dd". Currently the DASD driver uses a
+single major number (254) and 4 minor numbers per volume (1 for the
+physical volume and 3 for partitions). With respect to partitions see
+below. Thus you may have up to 64 DASD devices in your system.
+
+The kernel parameter 'dasd=from-to,...' may be issued arbitrary times
+in the kernel's parameter line or not at all. The 'from' and 'to'
+parameters are to be given in hexadecimal notation without a leading
+0x.
+If you supply kernel parameters the different instances are processed
+in order of appearance and a minor number is reserved for any device
+covered by the supplied range up to 64 volumes. Additional DASDs are
+ignored. If you do not supply the 'dasd=' kernel parameter at all, the
+DASD driver registers all supported DASDs of your system to a minor
+number in ascending order of the subchannel number.
+
+The driver currently supports ECKD-devices and there are stubs for
+support of the FBA and CKD architectures. For the FBA architecture
+only some smart data structures are missing to make the support
+complete.
+We performed our testing on 3380 and 3390 type disks of different
+sizes, under VM and on the bare hardware (LPAR), using internal disks
+of the multiprise as well as a RAMAC virtual array. Disks exported by
+an Enterprise Storage Server (Seascape) should work fine as well.
+
+We currently implement one partition per volume, which is the whole
+volume, skipping the first blocks up to the volume label. These are
+reserved for IPL records and IBM's volume label to assure
+accessibility of the DASD from other OSs. In a later stage we will
+provide support of partitions, maybe VTOC oriented or using a kind of
+partition table in the label record.
+
+Usage
+=====
+
+-Low-level format (?CKD only)
+For using an ECKD-DASD as a Linux harddisk you have to low-level
+format the tracks by issuing the BLKDASDFORMAT-ioctl on that
+device. This will erase any data on that volume including IBM volume
+labels, VTOCs etc. The ioctl may take a `struct format_data *` or
+'NULL' as an argument::
+
+  typedef struct {
+	int start_unit;
+	int stop_unit;
+	int blksize;
+  } format_data_t;
+
+When a NULL argument is passed to the BLKDASDFORMAT ioctl the whole
+disk is formatted to a blocksize of 1024 bytes. Otherwise start_unit
+and stop_unit are the first and last track to be formatted. If
+stop_unit is -1 it implies that the DASD is formatted from start_unit
+up to the last track. blksize can be any power of two between 512 and
+4096. We recommend no blksize lower than 1024 because the ext2fs uses
+1kB blocks anyway and you gain approx. 50% of capacity increasing your
+blksize from 512 byte to 1kB.
+
+Make a filesystem
+=================
+
+Then you can mk??fs the filesystem of your choice on that volume or
+partition. For reasons of sanity you should build your filesystem on
+the partition /dev/dd?1 instead of the whole volume. You only lose 3kB
+but may be sure that you can reuse your data after introduction of a
+real partition table.
+
+Bugs
+====
+
+- Performance sometimes is rather low because we don't fully exploit clustering
+
+TODO-List
+=========
+
+- Add IBM'S Disk layout to genhd
+- Enhance driver to use more than one major number
+- Enable usage as a module
+- Support Cache fast write and DASD fast write (ECKD)
diff --git a/Documentation/s390/debugging390.rst b/Documentation/s390/debugging390.rst
new file mode 100644
index 000000000000..73ad0b06c666
--- /dev/null
+++ b/Documentation/s390/debugging390.rst
@@ -0,0 +1,2613 @@
+=============================================
+Debugging on Linux for s/390 & z/Architecture
+=============================================
+
+Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com)
+
+Copyright (C) 2000-2001 IBM Deutschland Entwicklung GmbH, IBM Corporation
+
+.. Best viewed with fixed width fonts
+
+Overview of Document:
+=====================
+This document is intended to give a good overview of how to debug Linux for
+s/390 and z/Architecture. It is not intended as a complete reference and not a
+tutorial on the fundamentals of C & assembly. It doesn't go into
+390 IO in any detail. It is intended to complement the documents in the
+reference section below & any other worthwhile references you get.
+
+It is intended like the Enterprise Systems Architecture/390 Reference Summary
+to be printed out & used as a quick cheat sheet self help style reference when
+problems occur.
+
+.. Contents
+   ========
+   Register Set
+   Address Spaces on Intel Linux
+   Address Spaces on Linux for s/390 & z/Architecture
+   The Linux for s/390 & z/Architecture Kernel Task Structure
+   Register Usage & Stackframes on Linux for s/390 & z/Architecture
+   A sample program with comments
+   Compiling programs for debugging on Linux for s/390 & z/Architecture
+   Debugging under VM
+   s/390 & z/Architecture IO Overview
+   Debugging IO on s/390 & z/Architecture under VM
+   GDB on s/390 & z/Architecture
+   Stack chaining in gdb by hand
+   Examining core dumps
+   ldd
+   Debugging modules
+   The proc file system
+   SysRq
+   References
+   Special Thanks
+
+Register Set
+============
+The current architectures have the following registers.
+
+16 General propose registers, 32 bit on s/390 and 64 bit on z/Architecture,
+r0-r15 (or gpr0-gpr15), used for arithmetic and addressing.
+
+16 Control registers, 32 bit on s/390 and 64 bit on z/Architecture, cr0-cr15,
+kernel usage only, used for memory management, interrupt control, debugging
+control etc.
+
+16 Access registers (ar0-ar15), 32 bit on both s/390 and z/Architecture,
+normally not used by normal programs but potentially could be used as
+temporary storage. These registers have a 1:1 association with general
+purpose registers and are designed to be used in the so-called access
+register mode to select different address spaces.
+Access register 0 (and access register 1 on z/Architecture, which needs a
+64 bit pointer) is currently used by the pthread library as a pointer to
+the current running threads private area.
+
+16 64-bit floating point registers (fp0-fp15 ) IEEE & HFP floating
+point format compliant on G5 upwards & a Floating point control reg (FPC)
+
+4  64-bit registers (fp0,fp2,fp4 & fp6) HFP only on older machines.
+
+Note:
+   Linux (currently) always uses IEEE & emulates G5 IEEE format on older
+   machines, ( provided the kernel is configured for this ).
+
+
+The PSW is the most important register on the machine it
+is 64 bit on s/390 & 128 bit on z/Architecture & serves the roles of
+a program counter (pc), condition code register,memory space designator.
+In IBM standard notation I am counting bit 0 as the MSB.
+It has several advantages over a normal program counter
+in that you can change address translation & program counter
+in a single instruction. To change address translation,
+e.g. switching address translation off requires that you
+have a logical=physical mapping for the address you are
+currently running at.
+
++-------------------------+-------------------------------------------------+
+|          Bit            |                                                 |
++--------+----------------+                     Value                       |
+| s/390  | z/Architecture |                                                 |
++========+================+=================================================+
+| 0      |     0          | Reserved (must be 0) otherwise specification    |
+|        |                | exception occurs.                               |
++--------+----------------+-------------------------------------------------+
+| 1      |     1          | Program Event Recording 1 PER enabled,          |
+|        |                | PER is used to facilitate debugging e.g.        |
+|        |                | single stepping.                                |
++--------+----------------+-------------------------------------------------+
+| 2-4    |    2-4         | Reserved (must be 0).                           |
++--------+----------------+-------------------------------------------------+
+| 5      |     5          | Dynamic address translation 1=DAT on.           |
++--------+----------------+-------------------------------------------------+
+| 6      |     6          | Input/Output interrupt Mask                     |
++--------+----------------+-------------------------------------------------+
+| 7      |     7          | External interrupt Mask used primarily for      |
+|        |                | interprocessor signalling and clock interrupts. |
++--------+----------------+-------------------------------------------------+
+| 8-11   |   8-11         | PSW Key used for complex memory protection      |
+|        |                | mechanism (not used under linux)                |
++--------+----------------+-------------------------------------------------+
+| 12     |     12         | 1 on s/390 0 on z/Architecture                  |
++--------+----------------+-------------------------------------------------+
+| 13     |     13         | Machine Check Mask 1=enable machine check       |
+|        |                | interrupts                                      |
++--------+----------------+-------------------------------------------------+
+| 14     |     14         | Wait State. Set this to 1 to stop the processor |
+|        |                | except for interrupts and give  time to other   |
+|        |                | LPARS. Used in CPU idle in the kernel to        |
+|        |                | increase overall usage of processor resources.  |
++--------+----------------+-------------------------------------------------+
+| 15     |     15         | Problem state (if set to 1 certain instructions |
+|        |                | are disabled). All linux user programs run with |
+|        |                | this bit 1 (useful info for debugging under VM).|
++--------+----------------+-------------------------------------------------+
+| 16-17  |    16-17       | Address Space Control                           |
+|        |                |                                                 |
+|        |                | 00 Primary Space Mode:                          |
+|        |                |                                                 |
+|        |                | The register CR1 contains the primary           |
+|        |                | address-space control element (PASCE), which    |
+|        |                | points to the primary space region/segment      |
+|        |                | table origin.                                   |
+|        |                |                                                 |
+|        |                | 01 Access register mode                         |
+|        |                |                                                 |
+|        |                | 10 Secondary Space Mode:                        |
+|        |                |                                                 |
+|        |                | The register CR7 contains the secondary         |
+|        |                | address-space control element (SASCE), which    |
+|        |                | points to the secondary space region or         |
+|        |                | segment table origin.                           |
+|        |                |                                                 |
+|        |                | 11 Home Space Mode:                             |
+|        |                |                                                 |
+|        |                | The register CR13 contains the home space       |
+|        |                | address-space control element (HASCE), which    |
+|        |                | points to the home space region/segment         |
+|        |                | table origin.                                   |
+|        |                |                                                 |
+|        |                | See "Address Spaces on Linux for s/390 &        |
+|        |                | z/Architecture" below for more information      |
+|        |                | about address space usage in Linux.             |
++--------+----------------+-------------------------------------------------+
+| 18-19  |    18-19       | Condition codes (CC)                            |
++--------+----------------+-------------------------------------------------+
+| 20     |    20          | Fixed point overflow mask if 1=FPU exceptions   |
+|        |                | for this event occur (normally 0)               |
++--------+----------------+-------------------------------------------------+
+| 21     |    21          | Decimal overflow mask if 1=FPU exceptions for   |
+|        |                | this event occur (normally 0)                   |
++--------+----------------+-------------------------------------------------+
+| 22     |    22          | Exponent underflow mask if 1=FPU exceptions     |
+|        |                | for this event occur (normally 0)               |
++--------+----------------+-------------------------------------------------+
+| 23     |    23          | Significance Mask if 1=FPU exceptions for this  |
+|        |                | event occur (normally 0)                        |
++--------+----------------+-------------------------------------------------+
+| 24-31  |    24-30       | Reserved Must be 0.                             |
+|        +----------------+-------------------------------------------------+
+|        |    31          | Extended Addressing Mode                        |
+|        +----------------+-------------------------------------------------+
+|        |    32          | Basic Addressing Mode                           |
+|        |                |                                                 |
+|        |                | Used to set addressing mode::                   |
+|        |                |                                                 |
+|        |                |    +---------+----------+----------+            |
+|        |                |    | PSW 31  | PSW 32   |          |            |
+|        |                |    +---------+----------+----------+            |
+|        |                |    |   0     |    0     |  24 bit  |            |
+|        |                |    +---------+----------+----------+            |
+|        |                |    |   0     |    1     |  31 bit  |            |
+|        |                |    +---------+----------+----------+            |
+|        |                |    |   1     |    1     |  64 bit  |            |
+|        |                |    +---------+----------+----------+            |
++--------+----------------+-------------------------------------------------+
+| 32     |                | 1=31 bit addressing mode 0=24 bit addressing    |
+|        |                | mode (for backward compatibility), linux        |
+|        |                | always runs with this bit set to 1              |
++--------+----------------+-------------------------------------------------+
+| 33-64  |                | Instruction address.                            |
+|        +----------------+-------------------------------------------------+
+|        |    33-63       | Reserved must be 0                              |
+|        +----------------+-------------------------------------------------+
+|        |    64-127      | Address                                         |
+|        |                |                                                 |
+|        |                |   - In 24 bits mode bits 64-103=0 bits 104-127  |
+|        |                |     Address                                     |
+|        |                |   - In 31 bits mode bits 64-96=0 bits 97-127    |
+|        |                |     Address                                     |
+|        |                |                                                 |
+|        |                | Note:                                           |
+|        |                |     unlike 31 bit mode on s/390 bit 96 must be  |
+|        |                |     zero when loading the address with LPSWE    |
+|        |                |     otherwise a specification exception occurs, |
+|        |                |     LPSW is fully backward compatible.          |
++--------+----------------+-------------------------------------------------+
+
+Prefix Page(s)
+--------------
+This per cpu memory area is too intimately tied to the processor not to mention.
+It exists between the real addresses 0-4096 on s/390 and between 0-8192 on
+z/Architecture and is exchanged with one page on s/390 or two pages on
+z/Architecture in absolute storage by the set prefix instruction during Linux
+startup.
+
+This page is mapped to a different prefix for each processor in an SMP
+configuration (assuming the OS designer is sane of course).
+
+Bytes 0-512 (200 hex) on s/390 and 0-512, 4096-4544, 4604-5119 currently on
+z/Architecture are used by the processor itself for holding such information
+as exception indications and entry points for exceptions.
+
+Bytes after 0xc00 hex are used by linux for per processor globals on s/390 and
+z/Architecture (there is a gap on z/Architecture currently between 0xc00 and
+0x1000, too, which is used by Linux).
+
+The closest thing to this on traditional architectures is the interrupt
+vector table. This is a good thing & does simplify some of the kernel coding
+however it means that we now cannot catch stray NULL pointers in the
+kernel without hard coded checks.
+
+
+
+Address Spaces on Intel Linux
+=============================
+
+The traditional Intel Linux is approximately mapped as follows forgive
+the ascii art::
+
+  0xFFFFFFFF 4GB Himem          *****************
+				*               *
+				* Kernel Space  *
+				*               *
+				*****************         ****************
+  User Space Himem              *  User Stack   *         *              *
+  (typically 0xC0000000 3GB )   *****************         *              *
+				*  Shared Libs  *         * Next Process *
+				*****************         *     to       *
+				*               *   <==   *     Run      *  <==
+				*  User Program *         *              *
+				*   Data BSS    *         *              *
+				*    Text       *         *              *
+				*   Sections    *         *              *
+  0x00000000                    *****************         ****************
+
+Now it is easy to see that on Intel it is quite easy to recognise a kernel
+address as being one greater than user space himem (in this case 0xC0000000),
+and addresses of less than this are the ones in the current running program on
+this processor (if an smp box).
+
+If using the virtual machine ( VM ) as a debugger it is quite difficult to
+know which user process is running as the address space you are looking at
+could be from any process in the run queue.
+
+The limitation of Intels addressing technique is that the linux
+kernel uses a very simple real address to virtual addressing technique
+of Real Address=Virtual Address-User Space Himem.
+This means that on Intel the kernel linux can typically only address
+Himem=0xFFFFFFFF-0xC0000000=1GB & this is all the RAM these machines
+can typically use.
+
+They can lower User Himem to 2GB or lower & thus be
+able to use 2GB of RAM however this shrinks the maximum size
+of User Space from 3GB to 2GB they have a no win limit of 4GB unless
+they go to 64 Bit.
+
+
+On 390 our limitations & strengths make us slightly different.
+For backward compatibility we are only allowed use 31 bits (2GB)
+of our 32 bit addresses, however, we use entirely separate address
+spaces for the user & kernel.
+
+This means we can support 2GB of non Extended RAM on s/390, & more
+with the Extended memory management swap device &
+currently 4TB of physical memory currently on z/Architecture.
+
+
+Address Spaces on Linux for s/390 & z/Architecture
+==================================================
+
+Our addressing scheme is basically as follows::
+
+				   Primary Space               Home Space
+  Himem 0x7fffffff 2GB on s/390    *****************          ****************
+  currently 0x3ffffffffff (2^42)-1 *  User Stack   *          *              *
+  on z/Architecture.               *****************          *              *
+				   *  Shared Libs  *          *              *
+				   *****************          *              *
+				   *               *          *    Kernel    *
+				   *  User Program *          *              *
+				   *   Data BSS    *          *              *
+				   *    Text       *          *              *
+				   *   Sections    *          *              *
+  0x00000000                       *****************          ****************
+
+This also means that we need to look at the PSW problem state bit and the
+addressing mode to decide whether we are looking at user or kernel space.
+
+User space runs in primary address mode (or access register mode within
+the vdso code).
+
+The kernel usually also runs in home space mode, however when accessing
+user space the kernel switches to primary or secondary address mode if
+the mvcos instruction is not available or if a compare-and-swap (futex)
+instruction on a user space address is performed.
+
+When also looking at the ASCE control registers, this means:
+
+User space:
+
+- runs in primary or access register mode
+- cr1 contains the user asce
+- cr7 contains the user asce
+- cr13 contains the kernel asce
+
+Kernel space:
+
+- runs in home space mode
+- cr1 contains the user or kernel asce
+
+  - the kernel asce is loaded when a uaccess requires primary or
+    secondary address mode
+
+- cr7 contains the user or kernel asce, (changed with set_fs())
+- cr13 contains the kernel asce
+
+In case of uaccess the kernel changes to:
+
+- primary space mode in case of a uaccess (copy_to_user) and uses
+  e.g. the mvcp instruction to access user space. However the kernel
+  will stay in home space mode if the mvcos instruction is available
+- secondary space mode in case of futex atomic operations, so that the
+  instructions come from primary address space and data from secondary
+  space
+
+In case of KVM, the kernel runs in home space mode, but cr1 gets switched
+to contain the gmap asce before the SIE instruction gets executed. When
+the SIE instruction is finished, cr1 will be switched back to contain the
+user asce.
+
+
+Virtual Addresses on s/390 & z/Architecture
+===========================================
+
+A virtual address on s/390 is made up of 3 parts
+The SX (segment index, roughly corresponding to the PGD & PMD in Linux
+terminology) being bits 1-11.
+
+The PX (page index, corresponding to the page table entry (pte) in Linux
+terminology) being bits 12-19.
+
+The remaining bits BX (the byte index are the offset in the page )
+i.e. bits 20 to 31.
+
+On z/Architecture in linux we currently make up an address from 4 parts.
+
+- The region index bits (RX) 0-32 we currently use bits 22-32
+- The segment index (SX) being bits 33-43
+- The page index (PX) being bits  44-51
+- The byte index (BX) being bits  52-63
+
+Notes:
+  1) s/390 has no PMD so the PMD is really the PGD also.
+     A lot of this stuff is defined in pgtable.h.
+
+  2) Also seeing as s/390's page indexes are only 1k  in size
+     (bits 12-19 x 4 bytes per pte ) we use 1 ( page 4k )
+     to make the best use of memory by updating 4 segment indices
+     entries each time we mess with a PMD & use offsets
+     0,1024,2048 & 3072 in this page as for our segment indexes.
+     On z/Architecture our page indexes are now 2k in size
+     ( bits 12-19 x 8 bytes per pte ) we do a similar trick
+     but only mess with 2 segment indices each time we mess with
+     a PMD.
+
+  3) As z/Architecture supports up to a massive 5-level page table lookup we
+     can only use 3 currently on Linux ( as this is all the generic kernel
+     currently supports ) however this may change in future
+     this allows us to access ( according to my sums )
+     4TB of virtual storage per process i.e.
+     4096*512(PTES)*1024(PMDS)*2048(PGD) = 4398046511104 bytes,
+     enough for another 2 or 3 of years I think :-).
+     to do this we use a region-third-table designation type in
+     our address space control registers.
+
+
+The Linux for s/390 & z/Architecture Kernel Task Structure
+==========================================================
+Each process/thread under Linux for S390 has its own kernel task_struct
+defined in linux/include/linux/sched.h
+The S390 on initialisation & resuming of a process on a cpu sets
+the __LC_KERNEL_STACK variable in the spare prefix area for this cpu
+(which we use for per-processor globals).
+
+The kernel stack pointer is intimately tied with the task structure for
+each processor as follows::
+
+			s/390
+	      ************************
+	      *  1 page kernel stack *
+	      *        ( 4K )        *
+	      ************************
+	      *   1 page task_struct *
+	      *        ( 4K )        *
+  8K aligned  ************************
+
+		   z/Architecture
+	      ************************
+	      *  2 page kernel stack *
+	      *        ( 8K )        *
+	      ************************
+	      *  2 page task_struct  *
+	      *        ( 8K )        *
+  16K aligned ************************
+
+What this means is that we don't need to dedicate any register or global
+variable to point to the current running process & can retrieve it with the
+following very simple construct for s/390 & one very similar for
+z/Architecture::
+
+  static inline struct task_struct * get_current(void)
+  {
+	struct task_struct *current;
+	__asm__("lhi   %0,-8192\n\t"
+		"nr    %0,15"
+		: "=r" (current) );
+	return current;
+  }
+
+i.e. just anding the current kernel stack pointer with the mask -8192.
+Thankfully because Linux doesn't have support for nested IO interrupts
+& our devices have large buffers can survive interrupts being shut for
+short amounts of time we don't need a separate stack for interrupts.
+
+
+
+
+Register Usage & Stackframes on Linux for s/390 & z/Architecture
+=================================================================
+Overview:
+---------
+This is the code that gcc produces at the top & the bottom of
+each function. It usually is fairly consistent & similar from
+function to function & if you know its layout you can probably
+make some headway in finding the ultimate cause of a problem
+after a crash without a source level debugger.
+
+Note: To follow stackframes requires a knowledge of C or Pascal &
+limited knowledge of one assembly language.
+
+It should be noted that there are some differences between the
+s/390 and z/Architecture stack layouts as the z/Architecture stack layout
+didn't have to maintain compatibility with older linkage formats.
+
+Glossary:
+---------
+alloca:
+  This is a built in compiler function for runtime allocation
+  of extra space on the callers stack which is obviously freed
+  up on function exit ( e.g. the caller may choose to allocate nothing
+  of a buffer of 4k if required for temporary purposes ), it generates
+  very efficient code ( a few cycles  ) when compared to alternatives
+  like malloc.
+
+automatics:
+  These are local variables on the stack, i.e they aren't in registers &
+  they aren't static.
+
+back-chain:
+  This is a pointer to the stack pointer before entering a
+  framed functions ( see frameless function ) prologue got by
+  dereferencing the address of the current stack pointer,
+  i.e. got by accessing the 32 bit value at the stack pointers
+  current location.
+
+base-pointer:
+  This is a pointer to the back of the literal pool which
+  is an area just behind each procedure used to store constants
+  in each function.
+
+call-clobbered:
+  The caller probably needs to save these registers if there
+  is something of value in them, on the stack or elsewhere before making a
+  call to another procedure so that it can restore it later.
+
+epilogue:
+  The code generated by the compiler to return to the caller.
+
+frameless-function:
+  A frameless function in Linux for s390 & z/Architecture is one which doesn't
+  need more than the register save area (96 bytes on s/390, 160 on z/Architecture)
+  given to it by the caller.
+
+  A frameless function never:
+
+  1) Sets up a back chain.
+  2) Calls alloca.
+  3) Calls other normal functions
+  4) Has automatics.
+
+GOT-pointer:
+  This is a pointer to the global-offset-table in ELF
+  ( Executable Linkable Format, Linux'es most common executable format ),
+  all globals & shared library objects are found using this pointer.
+
+lazy-binding
+  ELF shared libraries are typically only loaded when routines in the shared
+  library are actually first called at runtime. This is lazy binding.
+
+procedure-linkage-table
+  This is a table found from the GOT which contains pointers to routines
+  in other shared libraries which can't be called to by easier means.
+
+prologue:
+  The code generated by the compiler to set up the stack frame.
+
+outgoing-args:
+  This is extra area allocated on the stack of the calling function if the
+  parameters for the callee's cannot all be put in registers, the same
+  area can be reused by each function the caller calls.
+
+routine-descriptor:
+  A COFF  executable format based concept of a procedure reference
+  actually being 8 bytes or more as opposed to a simple pointer to the routine.
+  This is typically defined as follows:
+
+  - Routine Descriptor offset 0=Pointer to Function
+  - Routine Descriptor offset 4=Pointer to Table of Contents
+
+  The table of contents/TOC is roughly equivalent to a GOT pointer.
+  & it means that shared libraries etc. can be shared between several
+  environments each with their own TOC.
+
+static-chain:
+  This is used in nested functions a concept adopted from pascal
+  by gcc not used in ansi C or C++ ( although quite useful ), basically it
+  is a pointer used to reference local variables of enclosing functions.
+  You might come across this stuff once or twice in your lifetime.
+
+  e.g.
+
+  The function below should return 11 though gcc may get upset & toss warnings
+  about unused variables::
+
+    int FunctionA(int a)
+    {
+	int b;
+	FunctionC(int c)
+	{
+		b=c+1;
+	}
+	FunctionC(10);
+	return(b);
+    }
+
+
+s/390 & z/Architecture Register usage
+=====================================
+
+======== ========================================== ===============
+r0       used by syscalls/assembly                  call-clobbered
+r1       used by syscalls/assembly                  call-clobbered
+r2       argument 0 / return value 0                call-clobbered
+r3       argument 1 / return value 1 (if long long) call-clobbered
+r4       argument 2                                 call-clobbered
+r5       argument 3                                 call-clobbered
+r6       argument 4                                 saved
+r7       pointer-to arguments 5 to ...              saved
+r8       this & that                                saved
+r9       this & that                                saved
+r10      static-chain ( if nested function )        saved
+r11      frame-pointer ( if function used alloca )  saved
+r12      got-pointer                                saved
+r13      base-pointer                               saved
+r14      return-address                             saved
+r15      stack-pointer                              saved
+
+f0       argument 0 / return value ( float/double ) call-clobbered
+f2       argument 1                                 call-clobbered
+f4       z/Architecture argument 2                  saved
+f6       z/Architecture argument 3                  saved
+======== ========================================== ===============
+
+The remaining floating points
+f1,f3,f5 f7-f15 are call-clobbered.
+
+Notes:
+------
+1) The only requirement is that registers which are used
+   by the callee are saved, e.g. the compiler is perfectly
+   capable of using r11 for purposes other than a frame a
+   frame pointer if a frame pointer is not needed.
+2) In functions with variable arguments e.g. printf the calling procedure
+   is identical to one without variable arguments & the same number of
+   parameters. However, the prologue of this function is somewhat more
+   hairy owing to it having to move these parameters to the stack to
+   get va_start, va_arg & va_end to work.
+3) Access registers are currently unused by gcc but are used in
+   the kernel. Possibilities exist to use them at the moment for
+   temporary storage but it isn't recommended.
+4) Only 4 of the floating point registers are used for
+   parameter passing as older machines such as G3 only have only 4
+   & it keeps the stack frame compatible with other compilers.
+   However with IEEE floating point emulation under linux on the
+   older machines you are free to use the other 12.
+5) A long long or double parameter cannot be have the
+   first 4 bytes in a register & the second four bytes in the
+   outgoing args area. It must be purely in the outgoing args
+   area if crossing this boundary.
+6) Floating point parameters are mixed with outgoing args
+   on the outgoing args area in the order the are passed in as parameters.
+7) Floating point arguments 2 & 3 are saved in the outgoing args area for
+   z/Architecture
+
+
+Stack Frame Layout
+------------------
+
+========= ============== ======================================================
+s/390     z/Architecture
+========= ============== ======================================================
+0         0              back chain ( a 0 here signifies end of back chain )
+4         8              eos ( end of stack, not used on Linux for S390 used
+			 in other linkage formats )
+8         16             glue used in other s/390 linkage formats for saved
+			 routine descriptors etc.
+12        24             glue used in other s/390 linkage formats for saved
+			 routine descriptors etc.
+16        32             scratch area
+20        40             scratch area
+24        48             saved r6 of caller function
+28        56             saved r7 of caller function
+32        64             saved r8 of caller function
+36        72             saved r9 of caller function
+40        80             saved r10 of caller function
+44        88             saved r11 of caller function
+48        96             saved r12 of caller function
+52        104            saved r13 of caller function
+56        112            saved r14 of caller function
+60        120            saved r15 of caller function
+64        128            saved f4 of caller function
+72        132            saved f6 of caller function
+80                       undefined
+96        160            outgoing args passed from caller to callee
+96+x      160+x          possible stack alignment ( 8 bytes desirable )
+96+x+y    160+x+y        alloca space of caller ( if used )
+96+x+y+z  160+x+y+z      automatics of caller ( if used )
+0                        back-chain
+========= ============== ======================================================
+
+A sample program with comments.
+===============================
+
+Comments on the function test
+-----------------------------
+1) It didn't need to set up a pointer to the constant pool gpr13 as it is not
+   used ( :-( ).
+2) This is a frameless function & no stack is bought.
+3) The compiler was clever enough to recognise that it could return the
+   value in r2 as well as use it for the passed in parameter ( :-) ).
+4) The basr ( branch relative & save ) trick works as follows the instruction
+   has a special case with r0,r0 with some instruction operands is understood as
+   the literal value 0, some risc architectures also do this ). So now
+   we are branching to the next address & the address new program counter is
+   in r13,so now we subtract the size of the function prologue we have executed
+   the size of the literal pool to get to the top of the literal pool::
+
+
+     0040037c int test(int b)
+     {                                                     # Function prologue below
+       40037c:  90 de f0 34     stm     %r13,%r14,52(%r15) # Save registers r13 & r14
+       400380:  0d d0           basr    %r13,%r0           # Set up pointer to constant pool using
+       400382:  a7 da ff fa     ahi     %r13,-6            # basr trick
+	return(5+b);
+							   # Huge main program
+       400386:  a7 2a 00 05     ahi     %r2,5              # add 5 to r2
+
+							   # Function epilogue below
+       40038a:  98 de f0 34     lm      %r13,%r14,52(%r15) # restore registers r13 & 14
+       40038e:  07 fe           br      %r14               # return
+     }
+
+Comments on the function main
+-----------------------------
+1) The compiler did this function optimally ( 8-) )::
+
+     Literal pool for main.
+     400390:    ff ff ff ec     .long 0xffffffec
+     main(int argc,char *argv[])
+     {                                                     # Function prologue below
+       400394:  90 bf f0 2c     stm     %r11,%r15,44(%r15) # Save necessary registers
+       400398:  18 0f           lr      %r0,%r15           # copy stack pointer to r0
+       40039a:  a7 fa ff a0     ahi     %r15,-96           # Make area for callee saving
+       40039e:  0d d0           basr    %r13,%r0           # Set up r13 to point to
+       4003a0:  a7 da ff f0     ahi     %r13,-16           # literal pool
+       4003a4:  50 00 f0 00     st      %r0,0(%r15)        # Save backchain
+
+	return(test(5));                                   # Main Program Below
+       4003a8:  58 e0 d0 00     l       %r14,0(%r13)       # load relative address of test from
+							   # literal pool
+       4003ac:  a7 28 00 05     lhi     %r2,5              # Set first parameter to 5
+       4003b0:  4d ee d0 00     bas     %r14,0(%r14,%r13)  # jump to test setting r14 as return
+							   # address using branch & save instruction.
+
+							   # Function Epilogue below
+       4003b4:  98 bf f0 8c     lm      %r11,%r15,140(%r15)# Restore necessary registers.
+       4003b8:  07 fe           br      %r14               # return to do program exit
+     }
+
+
+Compiler updates
+----------------
+
+::
+
+  main(int argc,char *argv[])
+  {
+    4004fc:     90 7f f0 1c             stm     %r7,%r15,28(%r15)
+    400500:     a7 d5 00 04             bras    %r13,400508 <main+0xc>
+    400504:     00 40 04 f4             .long   0x004004f4
+    # compiler now puts constant pool in code to so it saves an instruction
+    400508:     18 0f                   lr      %r0,%r15
+    40050a:     a7 fa ff a0             ahi     %r15,-96
+    40050e:     50 00 f0 00             st      %r0,0(%r15)
+	return(test(5));
+    400512:     58 10 d0 00             l       %r1,0(%r13)
+    400516:     a7 28 00 05             lhi     %r2,5
+    40051a:     0d e1                   basr    %r14,%r1
+    # compiler adds 1 extra instruction to epilogue this is done to
+    # avoid processor pipeline stalls owing to data dependencies on g5 &
+    # above as register 14 in the old code was needed directly after being loaded
+    # by the lm %r11,%r15,140(%r15) for the br %14.
+    40051c:     58 40 f0 98             l       %r4,152(%r15)
+    400520:     98 7f f0 7c             lm      %r7,%r15,124(%r15)
+    400524:     07 f4                   br      %r4
+  }
+
+
+Hartmut ( our compiler developer ) also has been threatening to take out the
+stack backchain in optimised code as this also causes pipeline stalls, you
+have been warned.
+
+64 bit z/Architecture code disassembly
+--------------------------------------
+
+If you understand the stuff above you'll understand the stuff
+below too so I'll avoid repeating myself & just say that
+some of the instructions have g's on the end of them to indicate
+they are 64 bit & the stack offsets are a bigger,
+the only other difference you'll find between 32 & 64 bit is that
+we now use f4 & f6 for floating point arguments on 64 bit::
+
+  00000000800005b0 <test>:
+  int test(int b)
+  {
+	return(5+b);
+      800005b0: a7 2a 00 05             ahi     %r2,5
+      800005b4: b9 14 00 22             lgfr    %r2,%r2 # downcast to integer
+      800005b8: 07 fe                   br      %r14
+      800005ba: 07 07                   bcr     0,%r7
+
+
+  }
+
+  00000000800005bc <main>:
+  main(int argc,char *argv[])
+  {
+      800005bc: eb bf f0 58 00 24       stmg    %r11,%r15,88(%r15)
+      800005c2: b9 04 00 1f             lgr     %r1,%r15
+      800005c6: a7 fb ff 60             aghi    %r15,-160
+      800005ca: e3 10 f0 00 00 24       stg     %r1,0(%r15)
+	return(test(5));
+      800005d0: a7 29 00 05             lghi    %r2,5
+      # brasl allows jumps > 64k & is overkill here bras would do fune
+      800005d4: c0 e5 ff ff ff ee       brasl   %r14,800005b0 <test>
+      800005da: e3 40 f1 10 00 04       lg      %r4,272(%r15)
+      800005e0: eb bf f0 f8 00 04       lmg     %r11,%r15,248(%r15)
+      800005e6: 07 f4                   br      %r4
+  }
+
+
+
+Compiling programs for debugging on Linux for s/390 & z/Architecture
+====================================================================
+-gdwarf-2 now works it should be considered the default debugging
+format for s/390 & z/Architecture as it is more reliable for debugging
+shared libraries,  normal -g debugging works much better now
+Thanks to the IBM java compiler developers bug reports.
+
+This is typically done adding/appending the flags -g or -gdwarf-2 to the
+CFLAGS & LDFLAGS variables Makefile of the program concerned.
+
+If using gdb & you would like accurate displays of registers &
+stack traces compile without optimisation i.e make sure
+that there is no -O2 or similar on the CFLAGS line of the Makefile &
+the emitted gcc commands, obviously this will produce worse code
+( not advisable for shipment ) but it is an  aid to the debugging process.
+
+This aids debugging because the compiler will copy parameters passed in
+in registers onto the stack so backtracing & looking at passed in
+parameters will work, however some larger programs which use inline functions
+will not compile without optimisation.
+
+Debugging with optimisation has since much improved after fixing
+some bugs, please make sure you are using gdb-5.0 or later developed
+after Nov'2000.
+
+
+
+Debugging under VM
+==================
+
+Notes
+-----
+Addresses & values in the VM debugger are always hex never decimal
+Address ranges are of the format <HexValue1>-<HexValue2> or
+<HexValue1>.<HexValue2>
+For example, the address range  0x2000 to 0x3000 can be described as 2000-3000
+or 2000.1000
+
+The VM Debugger is case insensitive.
+
+VM's strengths are usually other debuggers weaknesses you can get at any
+resource no matter how sensitive e.g. memory management resources, change
+address translation in the PSW. For kernel hacking you will reap dividends if
+you get good at it.
+
+The VM Debugger displays operators but not operands, and also the debugger
+displays useful information on the same line as the author of the code probably
+felt that it was a good idea not to go over the 80 columns on the screen.
+This isn't as unintuitive as it may seem as the s/390 instructions are easy to
+decode mentally and you can make a good guess at a lot of them as all the
+operands are nibble (half byte aligned).
+So if you have an objdump listing by hand, it is quite easy to follow, and if
+you don't have an objdump listing keep a copy of the s/390 Reference Summary
+or alternatively the s/390 principles of operation next to you.
+e.g. even I can guess that
+0001AFF8' LR    180F        CC 0
+is a ( load register ) lr r0,r15
+
+Also it is very easy to tell the length of a 390 instruction from the 2 most
+significant bits in the instruction (not that this info is really useful except
+if you are trying to make sense of a hexdump of code).
+Here is a table
+
+======================= ==================
+Bits                    Instruction Length
+======================= ==================
+00                          2 Bytes
+01                          4 Bytes
+10                          4 Bytes
+11                          6 Bytes
+======================= ==================
+
+The debugger also displays other useful info on the same line such as the
+addresses being operated on destination addresses of branches & condition codes.
+e.g.::
+
+  00019736' AHI   A7DAFF0E    CC 1
+  000198BA' BRC   A7840004 -> 000198C2'   CC 0
+  000198CE' STM   900EF068 >> 0FA95E78    CC 2
+
+
+
+Useful VM debugger commands
+---------------------------
+
+I suppose I'd better mention this before I start
+to list the current active traces do::
+
+	Q TR
+
+there can be a maximum of 255 of these per set
+( more about trace sets later ).
+
+To stop traces issue a::
+
+	TR END.
+
+To delete a particular breakpoint issue::
+
+	TR DEL <breakpoint number>
+
+The PA1 key drops to CP mode so you can issue debugger commands,
+Doing alt c (on my 3270 console at least ) clears the screen.
+
+hitting b <enter> comes back to the running operating system
+from cp mode ( in our case linux ).
+
+It is typically useful to add shortcuts to your profile.exec file
+if you have one ( this is roughly equivalent to autoexec.bat in DOS ).
+file here are a few from mine::
+
+  /* this gives me command history on issuing f12 */
+  set pf12 retrieve
+  /* this continues */
+  set pf8 imm b
+  /* goes to trace set a */
+  set pf1 imm tr goto a
+  /* goes to trace set b */
+  set pf2 imm tr goto b
+  /* goes to trace set c */
+  set pf3 imm tr goto c
+
+
+
+Instruction Tracing
+-------------------
+Setting a simple breakpoint::
+
+	TR I PSWA <address>
+
+To debug a particular function try::
+
+  TR I R <function address range>
+  TR I on its own will single step.
+  TR I DATA <MNEMONIC> <OPTIONAL RANGE> will trace for particular mnemonics
+
+e.g.::
+
+  TR I DATA 4D R 0197BC.4000
+
+will trace for BAS'es ( opcode 4D ) in the range 0197BC.4000
+
+if you were inclined you could add traces for all branch instructions &
+suffix them with the run prefix so you would have a backtrace on screen
+when a program crashes::
+
+	TR BR <INTO OR FROM> will trace branches into or out of an address.
+
+e.g.::
+
+	TR BR INTO 0
+
+is often quite useful if a program is getting awkward & deciding
+to branch to 0 & crashing as this will stop at the address before in jumps to 0.
+
+::
+
+	TR I R <address range> RUN cmd d g
+
+single steps a range of addresses but stays running &
+displays the gprs on each step.
+
+
+
+Displaying & modifying Registers
+--------------------------------
+D G
+	will display all the gprs
+
+Adding a extra G to all the commands is necessary to access the full 64 bit
+content in VM on z/Architecture. Obviously this isn't required for access
+registers as these are still 32 bit.
+
+e.g.
+
+DGG
+	instead of DG
+
+D X
+	will display all the control registers
+D AR
+	will display all the access registers
+D AR4-7
+	will display access registers 4 to 7
+CPU ALL D G
+	will display the GRPS of all CPUS in the configuration
+D PSW
+	will display the current PSW
+st PSW 2000
+	will put the value 2000 into the PSW & cause crash your machine.
+D PREFIX
+	displays the prefix offset
+
+
+Displaying Memory
+-----------------
+To display memory mapped using the current PSW's mapping try::
+
+   D <range>
+
+To make VM display a message each time it hits a particular address and
+continue try:
+
+D I<range>
+	will disassemble/display a range of instructions.
+
+ST addr 32 bit word
+	will store a 32 bit aligned address
+D T<range>
+	will display the EBCDIC in an address (if you are that way inclined)
+D R<range>
+	will display real addresses ( without DAT ) but with prefixing.
+
+There are other complex options to display if you need to get at say home space
+but are in primary space the easiest thing to do is to temporarily
+modify the PSW to the other addressing mode, display the stuff & then
+restore it.
+
+
+
+Hints
+-----
+If you want to issue a debugger command without halting your virtual machine
+with the PA1 key try prefixing the command with #CP e.g.::
+
+	#cp tr i pswa 2000
+
+also suffixing most debugger commands with RUN will cause them not
+to stop just display the mnemonic at the current instruction on the console.
+
+If you have several breakpoints you want to put into your program &
+you get fed up of cross referencing with System.map
+you can do the following trick for several symbols.
+
+::
+
+	grep do_signal System.map
+
+which emits the following among other things::
+
+	0001f4e0 T do_signal
+
+now you can do::
+
+	TR I PSWA 0001f4e0 cmd msg * do_signal
+
+This sends a message to your own console each time do_signal is entered.
+( As an aside I wrote a perl script once which automatically generated a REXX
+script with breakpoints on every kernel procedure, this isn't a good idea
+because there are thousands of these routines & VM can only set 255 breakpoints
+at a time so you nearly had to spend as long pruning the file down as you would
+entering the msgs by hand), however, the trick might be useful for a single
+object file. In the 3270 terminal emulator x3270 there is a very useful option
+in the file menu called "Save Screen In File" - this is very good for keeping a
+copy of traces.
+
+From CMS help <command name> will give you online help on a particular command.
+e.g.::
+
+	HELP DISPLAY
+
+Also CP has a file called profile.exec which automatically gets called
+on startup of CMS ( like autoexec.bat ), keeping on a DOS analogy session
+CP has a feature similar to doskey, it may be useful for you to
+use profile.exec to define some keystrokes.
+
+SET PF9 IMM B
+	This does a single step in VM on pressing F8.
+
+SET PF10  ^
+	This sets up the ^ key.
+	which can be used for ^c (ctrl-c),^z (ctrl-z) which can't be typed
+	directly into some 3270 consoles.
+
+SET PF11 ^-
+	This types the starting keystrokes for a sysrq see SysRq below.
+SET PF12 RETRIEVE
+	This retrieves command history on pressing F12.
+
+
+Sometimes in VM the display is set up to scroll automatically this
+can be very annoying if there are messages you wish to look at
+to stop this do
+
+TERM MORE 255 255
+  This will nearly stop automatic screen updates, however it will
+  cause a denial of service if lots of messages go to the 3270 console,
+  so it would be foolish to use this as the default on a production machine.
+
+
+Tracing particular processes
+----------------------------
+The kernel's text segment is intentionally at an address in memory that it will
+very seldom collide with text segments of user programs ( thanks Martin ),
+this simplifies debugging the kernel.
+However it is quite common for user processes to have addresses which collide
+this can make debugging a particular process under VM painful under normal
+circumstances as the process may change when doing a::
+
+	TR I R <address range>.
+
+Thankfully after reading VM's online help I figured out how to debug
+I particular process.
+
+Your first problem is to find the STD ( segment table designation )
+of the program you wish to debug.
+There are several ways you can do this here are a few
+
+Run::
+
+	objdump --syms <program to be debugged> | grep main
+
+To get the address of main in the program. Then::
+
+	tr i pswa <address of main>
+
+Start the program, if VM drops to CP on what looks like the entry
+point of the main function this is most likely the process you wish to debug.
+Now do a D X13 or D XG13 on z/Architecture.
+
+On 31 bit the STD is bits 1-19 ( the STO segment table origin )
+& 25-31 ( the STL segment table length ) of CR13.
+
+now type::
+
+	TR I R STD <CR13's value> 0.7fffffff
+
+e.g.::
+
+	TR I R STD 8F32E1FF 0.7fffffff
+
+Another very useful variation is::
+
+	TR STORE INTO STD <CR13's value> <address range>
+
+for finding out when a particular variable changes.
+
+An alternative way of finding the STD of a currently running process
+is to do the following, ( this method is more complex but
+could be quite convenient if you aren't updating the kernel much &
+so your kernel structures will stay constant for a reasonable period of
+time ).
+
+::
+
+	grep task /proc/<pid>/status
+
+from this you should see something like::
+
+	task: 0f160000 ksp: 0f161de8 pt_regs: 0f161f68
+
+This now gives you a pointer to the task structure.
+
+Now make::
+
+	CC:="s390-gcc -g" kernel/sched.s
+
+To get the task_struct stabinfo.
+
+( task_struct is defined in include/linux/sched.h ).
+
+Now we want to look at
+task->active_mm->pgd
+
+on my machine the active_mm in the task structure stab is
+active_mm:(4,12),672,32
+
+its offset is 672/8=84=0x54
+
+the pgd member in the mm_struct stab is
+pgd:(4,6)=*(29,5),96,32
+so its offset is 96/8=12=0xc
+
+so we'll::
+
+	hexdump -s 0xf160054 /dev/mem | more
+
+i.e. task_struct+active_mm offset
+to look at the active_mm member::
+
+	f160054 0fee cc60 0019 e334 0000 0000 0000 0011
+
+::
+
+	hexdump -s 0x0feecc6c /dev/mem | more
+
+i.e. active_mm+pgd offset::
+
+	feecc6c 0f2c 0000 0000 0001 0000 0001 0000 0010
+
+we get something like
+now do::
+
+	TR I R STD <pgd|0x7f> 0.7fffffff
+
+i.e. the 0x7f is added because the pgd only
+gives the page table origin & we need to set the low bits
+to the maximum possible segment table length.
+
+::
+
+	TR I R STD 0f2c007f 0.7fffffff
+
+on z/Architecture you'll probably need to do::
+
+	TR I R STD <pgd|0x7> 0.ffffffffffffffff
+
+to set the TableType to 0x1 & the Table length to 3.
+
+
+
+Tracing Program Exceptions
+--------------------------
+If you get a crash which says something like
+illegal operation or specification exception followed by a register dump
+You can restart linux & trace these using the tr prog <range or value> trace
+option.
+
+
+The most common ones you will normally be tracing for is:
+
+- 1=operation exception
+- 2=privileged operation exception
+- 4=protection exception
+- 5=addressing exception
+- 6=specification exception
+- 10=segment translation exception
+- 11=page translation exception
+
+The full list of these is on page 22 of the current s/390 Reference Summary.
+e.g.
+
+tr prog 10 will trace segment translation exceptions.
+
+tr prog on its own will trace all program interruption codes.
+
+Trace Sets
+----------
+On starting VM you are initially in the INITIAL trace set.
+You can do a Q TR to verify this.
+If you have a complex tracing situation where you wish to wait for instance
+till a driver is open before you start tracing IO, but know in your
+heart that you are going to have to make several runs through the code till you
+have a clue whats going on.
+
+What you can do is::
+
+	TR I PSWA <Driver open address>
+
+hit b to continue till breakpoint
+
+reach the breakpoint
+
+now do your::
+
+	TR GOTO B
+	TR IO 7c08-7c09 inst int run
+
+or whatever the IO channels you wish to trace are & hit b
+
+To got back to the initial trace set do::
+
+	TR GOTO INITIAL
+
+& the TR I PSWA <Driver open address> will be the only active breakpoint again.
+
+
+Tracing linux syscalls under VM
+-------------------------------
+Syscalls are implemented on Linux for S390 by the Supervisor call instruction
+(SVC). There 256 possibilities of these as the instruction is made up of a 0xA
+opcode and the second byte being the syscall number. They are traced using the
+simple command::
+
+	TR SVC  <Optional value or range>
+
+the syscalls are defined in linux/arch/s390/include/asm/unistd.h
+e.g. to trace all file opens just do::
+
+	TR SVC 5 ( as this is the syscall number of open )
+
+
+SMP Specific commands
+---------------------
+To find out how many cpus you have
+Q CPUS displays all the CPU's available to your virtual machine
+To find the cpu that the current cpu VM debugger commands are being directed at
+do Q CPU to change the current cpu VM debugger commands are being directed at
+do::
+
+	CPU <desired cpu no>
+
+On a SMP guest issue a command to all CPUs try prefixing the command with cpu
+all. To issue a command to a particular cpu try cpu <cpu number> e.g.::
+
+	CPU 01 TR I R 2000.3000
+
+If you are running on a guest with several cpus & you have a IO related problem
+& cannot follow the flow of code but you know it isn't smp related.
+
+from the bash prompt issue::
+
+	shutdown -h now or halt.
+
+do a::
+
+	Q CPUS
+
+to find out how many cpus you have detach each one of them from cp except
+cpu 0 by issuing a::
+
+	DETACH CPU 01-(number of cpus in configuration)
+
+& boot linux again.
+
+TR SIGP
+	will trace inter processor signal processor instructions.
+
+DEFINE CPU 01-(number in configuration)
+	will get your guests cpus back.
+
+
+Help for displaying ascii textstrings
+-------------------------------------
+On the very latest VM Nucleus'es VM can now display ascii
+( thanks Neale for the hint ) by doing::
+
+	D TX<lowaddr>.<len>
+
+e.g.::
+
+	D TX0.100
+
+Alternatively
+=============
+Under older VM debuggers (I love EBDIC too) you can use following little
+program which converts a command line of hex digits to ascii text. It can be
+compiled under linux and you can copy the hex digits from your x3270 terminal
+to your xterm if you are debugging from a linuxbox.
+
+This is quite useful when looking at a parameter passed in as a text string
+under VM ( unless you are good at decoding ASCII in your head ).
+
+e.g. consider tracing an open syscall::
+
+	TR SVC 5
+
+We have stopped at a breakpoint::
+
+	000151B0' SVC   0A05     -> 0001909A'   CC 0
+
+D 20.8 to check the SVC old psw in the prefix area and see was it from userspace
+(for the layout of the prefix area consult the "Fixed Storage Locations"
+chapter of the s/390 Reference Summary if you have it available).
+
+::
+
+  V00000020  070C2000 800151B2
+
+The problem state bit wasn't set &  it's also too early in the boot sequence
+for it to be a userspace SVC if it was we would have to temporarily switch the
+psw to user space addressing so we could get at the first parameter of the open
+in gpr2.
+
+Next do a::
+
+	D G2
+	GPR  2 =  00014CB4
+
+Now display what gpr2 is pointing to::
+
+	D 00014CB4.20
+	V00014CB4  2F646576 2F636F6E 736F6C65 00001BF5
+	V00014CC4  FC00014C B4001001 E0001000 B8070707
+
+Now copy the text till the first 00 hex ( which is the end of the string
+to an xterm & do hex2ascii on it::
+
+	hex2ascii 2F646576 2F636F6E 736F6C65 00
+
+outputs::
+
+	Decoded Hex:=/ d e v / c o n s o l e 0x00
+
+We were opening the console device,
+
+You can compile the code below yourself for practice :-),
+
+::
+
+  /*
+   *    hex2ascii.c
+   *    a useful little tool for converting a hexadecimal command line to ascii
+   *
+   *    Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com)
+   *    (C) 2000 IBM Deutschland Entwicklung GmbH, IBM Corporation.
+   */
+  #include <stdio.h>
+
+  int main(int argc,char *argv[])
+  {
+    int cnt1,cnt2,len,toggle=0;
+    int startcnt=1;
+    unsigned char c,hex;
+
+    if(argc>1&&(strcmp(argv[1],"-a")==0))
+       startcnt=2;
+    printf("Decoded Hex:=");
+    for(cnt1=startcnt;cnt1<argc;cnt1++)
+    {
+      len=strlen(argv[cnt1]);
+      for(cnt2=0;cnt2<len;cnt2++)
+      {
+	 c=argv[cnt1][cnt2];
+	 if(c>='0'&&c<='9')
+	  c=c-'0';
+	 if(c>='A'&&c<='F')
+	  c=c-'A'+10;
+	 if(c>='a'&&c<='f')
+	  c=c-'a'+10;
+	 switch(toggle)
+	 {
+	  case 0:
+	     hex=c<<4;
+	     toggle=1;
+	  break;
+	  case 1:
+	     hex+=c;
+	     if(hex<32||hex>127)
+	     {
+		if(startcnt==1)
+		   printf("0x%02X ",(int)hex);
+		else
+		   printf(".");
+	     }
+	     else
+	     {
+	       printf("%c",hex);
+	       if(startcnt==1)
+		  printf(" ");
+	     }
+	     toggle=0;
+	  break;
+	 }
+      }
+    }
+    printf("\n");
+  }
+
+
+
+
+Stack tracing under VM
+----------------------
+A basic backtrace
+-----------------
+
+Here are the tricks I use 9 out of 10 times it works pretty well,
+
+When your backchain reaches a dead end
+--------------------------------------
+This can happen when an exception happens in the kernel and the kernel is
+entered twice. If you reach the NULL pointer at the end of the back chain you
+should be able to sniff further back if you follow the following tricks.
+1) A kernel address should be easy to recognise since it is in
+primary space & the problem state bit isn't set & also
+The Hi bit of the address is set.
+2) Another backchain should also be easy to recognise since it is an
+address pointing to another address approximately 100 bytes or 0x70 hex
+behind the current stackpointer.
+
+
+Here is some practice.
+
+boot the kernel & hit PA1 at some random time
+
+d g to display the gprs, this should display something like::
+
+  GPR  0 =  00000001  00156018  0014359C  00000000
+  GPR  4 =  00000001  001B8888  000003E0  00000000
+  GPR  8 =  00100080  00100084  00000000  000FE000
+  GPR 12 =  00010400  8001B2DC  8001B36A  000FFED8
+
+Note that GPR14 is a return address but as we are real men we are going to
+trace the stack.
+display 0x40 bytes after the stack pointer::
+
+  V000FFED8  000FFF38 8001B838 80014C8E 000FFF38
+  V000FFEE8  00000000 00000000 000003E0 00000000
+  V000FFEF8  00100080 00100084 00000000 000FE000
+  V000FFF08  00010400 8001B2DC 8001B36A 000FFED8
+
+
+Ah now look at whats in sp+56 (sp+0x38) this is 8001B36A our saved r14 if
+you look above at our stackframe & also agrees with GPR14.
+
+now backchain::
+
+	d 000FFF38.40
+
+we now are taking the contents of SP to get our first backchain::
+
+  V000FFF38  000FFFA0 00000000 00014995 00147094
+  V000FFF48  00147090 001470A0 000003E0 00000000
+  V000FFF58  00100080 00100084 00000000 001BF1D0
+  V000FFF68  00010400 800149BA 80014CA6 000FFF38
+
+This displays a 2nd return address of 80014CA6
+
+now do::
+
+	d 000FFFA0.40
+
+for our 3rd backchain::
+
+  V000FFFA0  04B52002 0001107F 00000000 00000000
+  V000FFFB0  00000000 00000000 FF000000 0001107F
+  V000FFFC0  00000000 00000000 00000000 00000000
+  V000FFFD0  00010400 80010802 8001085A 000FFFA0
+
+
+our 3rd return address is 8001085A
+
+as the 04B52002 looks suspiciously like rubbish it is fair to assume that the
+kernel entry routines for the sake of optimisation don't set up a backchain.
+
+now look at System.map to see if the addresses make any sense::
+
+	grep -i 0001b3 System.map
+
+outputs among other things::
+
+	0001b304 T cpu_idle
+
+so 8001B36A
+is cpu_idle+0x66 ( quiet the cpu is asleep, don't wake it )
+
+::
+
+	grep -i 00014 System.map
+
+produces among other things::
+
+	00014a78 T start_kernel
+
+so 0014CA6 is start_kernel+some hex number I can't add in my head.
+
+::
+
+	grep -i 00108 System.map
+
+this produces::
+
+	00010800 T _stext
+
+so   8001085A is _stext+0x5a
+
+Congrats you've done your first backchain.
+
+
+
+s/390 & z/Architecture IO Overview
+==================================
+
+I am not going to give a course in 390 IO architecture as this would take me
+quite a while and I'm no expert. Instead I'll give a 390 IO architecture
+summary for Dummies. If you have the s/390 principles of operation available
+read this instead. If nothing else you may find a few useful keywords in here
+and be able to use them on a web search engine to find more useful information.
+
+Unlike other bus architectures modern 390 systems do their IO using mostly
+fibre optics and devices such as tapes and disks can be shared between several
+mainframes. Also S390 can support up to 65536 devices while a high end PC based
+system might be choking with around 64.
+
+Here is some of the common IO terminology:
+
+Subchannel:
+  This is the logical number most IO commands use to talk to an IO device. There
+  can be up to 0x10000 (65536) of these in a configuration, typically there are a
+  few hundred. Under VM for simplicity they are allocated contiguously, however
+  on the native hardware they are not. They typically stay consistent between
+  boots provided no new hardware is inserted or removed.
+
+  Under Linux for s390 we use these as IRQ's and also when issuing an IO command
+  (CLEAR SUBCHANNEL, HALT SUBCHANNEL, MODIFY SUBCHANNEL, RESUME SUBCHANNEL,
+  START SUBCHANNEL, STORE SUBCHANNEL and TEST SUBCHANNEL). We use this as the ID
+  of the device we wish to talk to. The most important of these instructions are
+  START SUBCHANNEL (to start IO), TEST SUBCHANNEL (to check whether the IO
+  completed successfully) and HALT SUBCHANNEL (to kill IO). A subchannel can have
+  up to 8 channel paths to a device, this offers redundancy if one is not
+  available.
+
+Device Number:
+  This number remains static and is closely tied to the hardware. There are 65536
+  of these, made up of a CHPID (Channel Path ID, the most significant 8 bits) and
+  another lsb 8 bits. These remain static even if more devices are inserted or
+  removed from the hardware. There is a 1 to 1 mapping between subchannels and
+  device numbers, provided devices aren't inserted or removed.
+
+Channel Control Words:
+  CCWs are linked lists of instructions initially pointed to by an operation
+  request block (ORB), which is initially given to Start Subchannel (SSCH)
+  command along with the subchannel number for the IO subsystem to process
+  while the CPU continues executing normal code.
+  CCWs come in two flavours, Format 0 (24 bit for backward compatibility) and
+  Format 1 (31 bit). These are typically used to issue read and write (and many
+  other) instructions. They consist of a length field and an absolute address
+  field.
+
+  Each IO typically gets 1 or 2 interrupts, one for channel end (primary status)
+  when the channel is idle, and the second for device end (secondary status).
+  Sometimes you get both concurrently. You check how the IO went on by issuing a
+  TEST SUBCHANNEL at each interrupt, from which you receive an Interruption
+  response block (IRB). If you get channel and device end status in the IRB
+  without channel checks etc. your IO probably went okay. If you didn't you
+  probably need to examine the IRB, extended status word etc.
+  If an error occurs, more sophisticated control units have a facility known as
+  concurrent sense. This means that if an error occurs Extended sense information
+  will be presented in the Extended status word in the IRB. If not you have to
+  issue a subsequent SENSE CCW command after the test subchannel.
+
+
+TPI (Test pending interrupt) can also be used for polled IO, but in
+multitasking multiprocessor systems it isn't recommended except for
+checking special cases (i.e. non looping checks for pending IO etc.).
+
+Store Subchannel and Modify Subchannel can be used to examine and modify
+operating characteristics of a subchannel (e.g. channel paths).
+
+Other IO related Terms:
+
+Sysplex:
+  S390's Clustering Technology
+QDIO:
+  S390's new high speed IO architecture to support devices such as gigabit
+  ethernet, this architecture is also designed to be forward compatible with
+  upcoming 64 bit machines.
+
+
+General Concepts
+----------------
+
+Input Output Processors (IOP's) are responsible for communicating between
+the mainframe CPU's & the channel & relieve the mainframe CPU's from the
+burden of communicating with IO devices directly, this allows the CPU's to
+concentrate on data processing.
+
+IOP's can use one or more links ( known as channel paths ) to talk to each
+IO device. It first checks for path availability & chooses an available one,
+then starts ( & sometimes terminates IO ).
+There are two types of channel path: ESCON & the Parallel IO interface.
+
+IO devices are attached to control units, control units provide the
+logic to interface the channel paths & channel path IO protocols to
+the IO devices, they can be integrated with the devices or housed separately
+& often talk to several similar devices ( typical examples would be raid
+controllers or a control unit which connects to 1000 3270 terminals )::
+
+
+      +---------------------------------------------------------------+
+      | +-----+ +-----+ +-----+ +-----+  +----------+  +----------+   |
+      | | CPU | | CPU | | CPU | | CPU |  |  Main    |  | Expanded |   |
+      | |     | |     | |     | |     |  |  Memory  |  |  Storage |   |
+      | +-----+ +-----+ +-----+ +-----+  +----------+  +----------+   |
+      |---------------------------------------------------------------+
+      |   IOP        |      IOP      |       IOP                      |
+      |---------------------------------------------------------------
+      | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C |
+      ----------------------------------------------------------------
+	   ||                                              ||
+	   ||  Bus & Tag Channel Path                      || ESCON
+	   ||  ======================                      || Channel
+	   ||  ||                  ||                      || Path
+      +----------+               +----------+         +----------+
+      |          |               |          |         |          |
+      |    CU    |               |    CU    |         |    CU    |
+      |          |               |          |         |          |
+      +----------+               +----------+         +----------+
+	 |      |                     |                |       |
+  +----------+ +----------+      +----------+   +----------+ +----------+
+  |I/O Device| |I/O Device|      |I/O Device|   |I/O Device| |I/O Device|
+  +----------+ +----------+      +----------+   +----------+ +----------+
+    CPU = Central Processing Unit
+    C = Channel
+    IOP = IP Processor
+    CU = Control Unit
+
+The 390 IO systems come in 2 flavours the current 390 machines support both
+
+The Older 360 & 370 Interface,sometimes called the Parallel I/O interface,
+sometimes called Bus-and Tag & sometimes Original Equipment Manufacturers
+Interface (OEMI).
+
+This byte wide Parallel channel path/bus has parity & data on the "Bus" cable
+and control lines on the "Tag" cable. These can operate in byte multiplex mode
+for sharing between several slow devices or burst mode and monopolize the
+channel for the whole burst. Up to 256 devices can be addressed on one of these
+cables. These cables are about one inch in diameter. The maximum unextended
+length supported by these cables is 125 Meters but this can be extended up to
+2km with a fibre optic channel extended such as a 3044. The maximum burst speed
+supported is 4.5 megabytes per second. However, some really old processors
+support only transfer rates of 3.0, 2.0 & 1.0 MB/sec.
+One of these paths can be daisy chained to up to 8 control units.
+
+
+ESCON if fibre optic it is also called FICON
+Was introduced by IBM in 1990. Has 2 fibre optic cables and uses either leds or
+lasers for communication at a signaling rate of up to 200 megabits/sec. As
+10bits are transferred for every 8 bits info this drops to 160 megabits/sec
+and to 18.6 Megabytes/sec once control info and CRC are added. ESCON only
+operates in burst mode.
+
+ESCONs typical max cable length is 3km for the led version and 20km for the
+laser version known as XDF (extended distance facility). This can be further
+extended by using an ESCON director which triples the above mentioned ranges.
+Unlike Bus & Tag as ESCON is serial it uses a packet switching architecture,
+the standard Bus & Tag control protocol is however present within the packets.
+Up to 256 devices can be attached to each control unit that uses one of these
+interfaces.
+
+Common 390 Devices include:
+Network adapters typically OSA2,3172's,2116's & OSA-E gigabit ethernet adapters,
+Consoles 3270 & 3215 (a teletype emulated under linux for a line mode console).
+DASD's direct access storage devices ( otherwise known as hard disks ).
+Tape Drives.
+CTC ( Channel to Channel Adapters ),
+ESCON or Parallel Cables used as a very high speed serial link
+between 2 machines.
+
+
+Debugging IO on s/390 & z/Architecture under VM
+===============================================
+
+Now we are ready to go on with IO tracing commands under VM
+
+A few self explanatory queries::
+
+	Q OSA
+	Q CTC
+	Q DISK ( This command is CMS specific )
+	Q DASD
+
+Q OSA on my machine returns::
+
+	OSA  7C08 ON OSA   7C08 SUBCHANNEL = 0000
+	OSA  7C09 ON OSA   7C09 SUBCHANNEL = 0001
+	OSA  7C14 ON OSA   7C14 SUBCHANNEL = 0002
+	OSA  7C15 ON OSA   7C15 SUBCHANNEL = 0003
+
+If you have a guest with certain privileges you may be able to see devices
+which don't belong to you. To avoid this, add the option V.
+e.g.::
+
+	Q V OSA
+
+Now using the device numbers returned by this command we will
+Trace the io starting up on the first device 7c08 & 7c09
+In our simplest case we can trace the
+start subchannels
+like TR SSCH 7C08-7C09
+or the halt subchannels
+or TR HSCH 7C08-7C09
+MSCH's ,STSCH's I think you can guess the rest
+
+A good trick is tracing all the IO's and CCWS and spooling them into the reader
+of another VM guest so he can ftp the logfile back to his own machine. I'll do
+a small bit of this and give you a look at the output.
+
+1) Spool stdout to VM reader::
+
+	SP PRT TO (another vm guest ) or * for the local vm guest
+
+2) Fill the reader with the trace::
+
+	TR IO 7c08-7c09 INST INT CCW PRT RUN
+
+3) Start up linux::
+
+	i 00c
+4) Finish the trace::
+
+	TR END
+
+5) close the reader::
+
+	C PRT
+
+6) list reader contents::
+
+	RDRLIST
+
+7) copy it to linux4's minidisk::
+
+	RECEIVE / LOG TXT A1 ( replace
+
+8)
+filel & press F11 to look at it
+You should see something like::
+
+  00020942' SSCH  B2334000    0048813C    CC 0    SCH 0000    DEV 7C08
+	    CPA 000FFDF0   PARM 00E2C9C4    KEY 0  FPI C0  LPM 80
+	    CCW    000FFDF0  E4200100 00487FE8   0000  E4240100 ........
+	    IDAL                                      43D8AFE8
+	    IDAL                                      0FB76000
+  00020B0A'   I/O DEV 7C08 -> 000197BC'   SCH 0000   PARM 00E2C9C4
+  00021628' TSCH  B2354000 >> 00488164    CC 0    SCH 0000    DEV 7C08
+	    CCWA 000FFDF8   DEV STS 0C  SCH STS 00  CNT 00EC
+	     KEY 0   FPI C0  CC 0   CTLS 4007
+  00022238' STSCH B2344000 >> 00488108    CC 0    SCH 0000    DEV 7C08
+
+If you don't like messing up your readed ( because you possibly booted from it )
+you can alternatively spool it to another readers guest.
+
+
+Other common VM device related commands
+---------------------------------------------
+These commands are listed only because they have
+been of use to me in the past & may be of use to
+you too. For more complete info on each of the commands
+use type HELP <command> from CMS.
+
+detaching devices::
+
+	DET <devno range>
+	ATT <devno range> <guest>
+
+attach a device to guest * for your own guest
+
+READY <devno>
+	cause VM to issue a fake interrupt.
+
+The VARY command is normally only available to VM administrators::
+
+	VARY ON PATH <path> TO <devno range>
+	VARY OFF PATH <PATH> FROM <devno range>
+
+This is used to switch on or off channel paths to devices.
+
+Q CHPID <channel path ID>
+   This displays state of devices using this channel path
+
+D SCHIB <subchannel>
+   This displays the subchannel information SCHIB block for the device.
+   this I believe is also only available to administrators.
+
+DEFINE CTC <devno>
+  defines a virtual CTC channel to channel connection
+  2 need to be defined on each guest for the CTC driver to use.
+
+COUPLE  devno userid remote devno
+  Joins a local virtual device to a remote virtual device
+  ( commonly used for the CTC driver ).
+
+Building a VM ramdisk under CMS which linux can use::
+
+	def vfb-<blocksize> <subchannel> <number blocks>
+
+blocksize is commonly 4096 for linux.
+
+Formatting it::
+
+	format <subchannel> <driver letter e.g. x> (blksize <blocksize>
+
+Sharing a disk between multiple guests::
+
+	LINK userid devno1 devno2 mode password
+
+
+
+GDB on S390
+===========
+N.B. if compiling for debugging gdb works better without optimisation
+( see Compiling programs for debugging )
+
+invocation
+----------
+gdb <victim program> <optional corefile>
+
+Online help
+-----------
+help: gives help on commands
+
+e.g.::
+
+	help
+	help display
+
+Note gdb's online help is very good use it.
+
+
+Assembly
+--------
+info registers:
+  displays registers other than floating point.
+
+info all-registers:
+  displays floating points as well.
+
+disassemble:
+  disassembles
+
+e.g.::
+
+	disassemble without parameters will disassemble the current function
+	disassemble $pc $pc+10
+
+Viewing & modifying variables
+-----------------------------
+print or p:
+  displays variable or register
+
+e.g. p/x $sp will display the stack pointer
+
+display:
+  prints variable or register each time program stops
+
+e.g.::
+
+	display/x $pc will display the program counter
+	display argc
+
+undisplay:
+  undo's display's
+
+info breakpoints:
+  shows all current breakpoints
+
+info stack:
+  shows stack back trace (if this doesn't work too well, I'll show
+  you the stacktrace by hand below).
+
+info locals:
+  displays local variables.
+
+info args:
+  display current procedure arguments.
+
+set args:
+  will set argc & argv each time the victim program is invoked
+
+e.g.::
+
+	set <variable>=value
+	set argc=100
+	set $pc=0
+
+
+
+Modifying execution
+-------------------
+step:
+  steps n lines of sourcecode
+
+step
+  steps 1 line.
+
+step 100
+  steps 100 lines of code.
+
+next:
+	like step except this will not step into subroutines
+
+stepi:
+	steps a single machine code instruction.
+
+e.g.::
+
+	stepi 100
+
+nexti:
+	steps a single machine code instruction but will not step into
+	subroutines.
+
+finish:
+	will run until exit of the current routine
+
+run:
+	(re)starts a program
+
+cont:
+	continues a program
+
+quit:
+	exits gdb.
+
+
+breakpoints
+------------
+
+break
+  sets a breakpoint
+
+e.g.::
+
+	break main
+	break *$pc
+	break *0x400618
+
+Here's a really useful one for large programs
+
+rbr
+	Set a breakpoint for all functions matching REGEXP
+
+e.g.::
+
+	rbr 390
+
+will set a breakpoint with all functions with 390 in their name.
+
+info breakpoints
+	lists all breakpoints
+
+delete:
+	delete breakpoint by number or delete them all
+
+e.g.
+
+delete 1
+	will delete the first breakpoint
+
+
+delete
+	will delete them all
+
+watch:
+	This will set a watchpoint ( usually hardware assisted ),
+
+This will watch a variable till it changes
+
+e.g.
+
+watch cnt
+	will watch the variable cnt till it changes.
+
+As an aside unfortunately gdb's, architecture independent watchpoint code
+is inconsistent & not very good, watchpoints usually work but not always.
+
+info watchpoints:
+	Display currently active watchpoints
+
+condition: ( another useful one )
+	Specify breakpoint number N to break only if COND is true.
+
+Usage is `condition N COND`, where N is an integer and COND is an
+expression to be evaluated whenever breakpoint N is reached.
+
+
+
+User defined functions/macros
+-----------------------------
+define: ( Note this is very very useful,simple & powerful )
+
+usage define <name> <list of commands> end
+
+examples which you should consider putting into .gdbinit in your home
+directory::
+
+	define d
+	stepi
+	disassemble $pc $pc+10
+	end
+	define e
+	nexti
+	disassemble $pc $pc+10
+	end
+
+
+Other hard to classify stuff
+----------------------------
+signal n:
+   sends the victim program a signal.
+
+e.g. `signal 3` will send a SIGQUIT.
+
+info signals:
+	what gdb does when the victim receives certain signals.
+
+list:
+
+e.g.:
+
+list
+	lists current function source
+list 1,10
+	list first 10 lines of current file.
+
+list test.c:1,10
+
+
+directory:
+  Adds directories to be searched for source if gdb cannot find the source.
+  (note it is a bit sensitive about slashes)
+
+e.g. To add the root of the filesystem to the searchpath do::
+
+	directory //
+
+
+call <function>
+This calls a function in the victim program, this is pretty powerful
+e.g.
+(gdb) call printf("hello world")
+outputs:
+$1 = 11
+
+You might now be thinking that the line above didn't work, something extra had
+to be done.
+(gdb) call fflush(stdout)
+hello world$2 = 0
+As an aside the debugger also calls malloc & free under the hood
+to make space for the "hello world" string.
+
+
+
+hints
+-----
+1) command completion works just like bash
+   ( if you are a bad typist like me this really helps )
+
+e.g. hit br <TAB> & cursor up & down :-).
+
+2) if you have a debugging problem that takes a few steps to recreate
+put the steps into a file called .gdbinit in your current working directory
+if you have defined a few extra useful user defined commands put these in
+your home directory & they will be read each time gdb is launched.
+
+A typical .gdbinit file might be.::
+
+	break main
+	run
+	break runtime_exception
+	cont
+
+
+stack chaining in gdb by hand
+-----------------------------
+This is done using a the same trick described for VM::
+
+	p/x (*($sp+56))&0x7fffffff
+
+get the first backchain.
+
+For z/Architecture
+Replace 56 with 112 & ignore the &0x7fffffff
+in the macros below & do nasty casts to longs like the following
+as gdb unfortunately deals with printed arguments as ints which
+messes up everything.
+
+i.e. here is a 3rd backchain dereference::
+
+	p/x *(long *)(***(long ***)$sp+112)
+
+
+this outputs::
+
+	$5 = 0x528f18
+
+on my machine.
+
+Now you can use::
+
+	info symbol (*($sp+56))&0x7fffffff
+
+you might see something like::
+
+	rl_getc + 36 in section .text
+
+telling you what is located at address 0x528f18
+Now do::
+
+	p/x (*(*$sp+56))&0x7fffffff
+
+This outputs::
+
+	$6 = 0x528ed0
+
+Now do::
+
+	info symbol (*(*$sp+56))&0x7fffffff
+	rl_read_key + 180 in section .text
+
+now do::
+
+	p/x (*(**$sp+56))&0x7fffffff
+
+& so on.
+
+Disassembling instructions without debug info
+---------------------------------------------
+gdb typically complains if there is a lack of debugging
+symbols in the disassemble command with
+"No function contains specified address." To get around
+this do::
+
+	x/<number lines to disassemble>xi <address>
+
+e.g.::
+
+	x/20xi 0x400730
+
+
+
+Note:
+  Remember gdb has history just like bash you don't need to retype the
+  whole line just use the up & down arrows.
+
+
+
+For more info
+-------------
+From your linuxbox do::
+
+	man gdb
+
+or::
+
+	info gdb.
+
+core dumps
+----------
+
+What a core dump ?
+^^^^^^^^^^^^^^^^^^
+
+A core dump is a file generated by the kernel (if allowed) which contains the
+registers and all active pages of the program which has crashed.
+
+From this file gdb will allow you to look at the registers, stack trace and
+memory of the program as if it just crashed on your system. It is usually
+called core and created in the current working directory.
+
+This is very useful in that a customer can mail a core dump to a technical
+support department and the technical support department can reconstruct what
+happened. Provided they have an identical copy of this program with debugging
+symbols compiled in and the source base of this build is available.
+
+In short it is far more useful than something like a crash log could ever hope
+to be.
+
+Why have I never seen one ?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Probably because you haven't used the command::
+
+	ulimit -c unlimited in bash
+
+to allow core dumps, now do::
+
+	ulimit -a
+
+to verify that the limit was accepted.
+
+A sample core dump
+   To create this I'm going to do::
+
+	ulimit -c unlimited
+	gdb
+
+to launch gdb (my victim app. ) now be bad & do the following from another
+telnet/xterm session to the same machine::
+
+	ps -aux | grep gdb
+	kill -SIGSEGV <gdb's pid>
+
+or alternatively use `killall -SIGSEGV gdb` if you have the killall command.
+
+Now look at the core dump::
+
+	./gdb core
+
+Displays the following::
+
+  GNU gdb 4.18
+  Copyright 1998 Free Software Foundation, Inc.
+  GDB is free software, covered by the GNU General Public License, and you are
+  welcome to change it and/or distribute copies of it under certain conditions.
+  Type "show copying" to see the conditions.
+  There is absolutely no warranty for GDB.  Type "show warranty" for details.
+  This GDB was configured as "s390-ibm-linux"...
+  Core was generated by `./gdb'.
+  Program terminated with signal 11, Segmentation fault.
+  Reading symbols from /usr/lib/libncurses.so.4...done.
+  Reading symbols from /lib/libm.so.6...done.
+  Reading symbols from /lib/libc.so.6...done.
+  Reading symbols from /lib/ld-linux.so.2...done.
+  #0  0x40126d1a in read () from /lib/libc.so.6
+  Setting up the environment for debugging gdb.
+  Breakpoint 1 at 0x4dc6f8: file utils.c, line 471.
+  Breakpoint 2 at 0x4d87a4: file top.c, line 2609.
+  (top-gdb) info stack
+  #0  0x40126d1a in read () from /lib/libc.so.6
+  #1  0x528f26 in rl_getc (stream=0x7ffffde8) at input.c:402
+  #2  0x528ed0 in rl_read_key () at input.c:381
+  #3  0x5167e6 in readline_internal_char () at readline.c:454
+  #4  0x5168ee in readline_internal_charloop () at readline.c:507
+  #5  0x51692c in readline_internal () at readline.c:521
+  #6  0x5164fe in readline (prompt=0x7ffff810)
+      at readline.c:349
+  #7  0x4d7a8a in command_line_input (prompt=0x564420 "(gdb) ", repeat=1,
+      annotation_suffix=0x4d6b44 "prompt") at top.c:2091
+  #8  0x4d6cf0 in command_loop () at top.c:1345
+  #9  0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635
+
+
+LDD
+===
+This is a program which lists the shared libraries which a library needs,
+Note you also get the relocations of the shared library text segments which
+help when using objdump --source.
+
+e.g.::
+
+	ldd ./gdb
+
+outputs::
+
+  libncurses.so.4 => /usr/lib/libncurses.so.4 (0x40018000)
+  libm.so.6 => /lib/libm.so.6 (0x4005e000)
+  libc.so.6 => /lib/libc.so.6 (0x40084000)
+  /lib/ld-linux.so.2 => /lib/ld-linux.so.2 (0x40000000)
+
+
+Debugging shared libraries
+==========================
+Most programs use shared libraries, however it can be very painful
+when you single step instruction into a function like printf for the
+first time & you end up in functions like _dl_runtime_resolve this is
+the ld.so doing lazy binding, lazy binding is a concept in ELF where
+shared library functions are not loaded into memory unless they are
+actually used, great for saving memory but a pain to debug.
+
+To get around this either relink the program -static or exit gdb type
+export LD_BIND_NOW=true this will stop lazy binding & restart the gdb'ing
+the program in question.
+
+
+
+Debugging modules
+=================
+As modules are dynamically loaded into the kernel their address can be
+anywhere to get around this use the -m option with insmod to emit a load
+map which can be piped into a file if required.
+
+The proc file system
+====================
+What is it ?.
+It is a filesystem created by the kernel with files which are created on demand
+by the kernel if read, or can be used to modify kernel parameters,
+it is a powerful concept.
+
+e.g.::
+
+	cat /proc/sys/net/ipv4/ip_forward
+
+On my machine outputs::
+
+	0
+
+telling me ip_forwarding is not on to switch it on I can do::
+
+	echo 1 >  /proc/sys/net/ipv4/ip_forward
+
+cat it again::
+
+	cat /proc/sys/net/ipv4/ip_forward
+
+On my machine now outputs::
+
+	1
+
+IP forwarding is on.
+
+There is a lot of useful info in here best found by going in and having a look
+around, so I'll take you through some entries I consider important.
+
+All the processes running on the machine have their own entry defined by
+/proc/<pid>
+
+So lets have a look at the init process::
+
+	cd /proc/1
+	cat cmdline
+
+emits::
+
+	init [2]
+
+::
+
+	cd /proc/1/fd
+
+This contains numerical entries of all the open files,
+some of these you can cat e.g. stdout (2)::
+
+	cat /proc/29/maps
+
+on my machine emits::
+
+  00400000-00478000 r-xp 00000000 5f:00 4103       /bin/bash
+  00478000-0047e000 rw-p 00077000 5f:00 4103       /bin/bash
+  0047e000-00492000 rwxp 00000000 00:00 0
+  40000000-40015000 r-xp 00000000 5f:00 14382      /lib/ld-2.1.2.so
+  40015000-40016000 rw-p 00014000 5f:00 14382      /lib/ld-2.1.2.so
+  40016000-40017000 rwxp 00000000 00:00 0
+  40017000-40018000 rw-p 00000000 00:00 0
+  40018000-4001b000 r-xp 00000000 5f:00 14435      /lib/libtermcap.so.2.0.8
+  4001b000-4001c000 rw-p 00002000 5f:00 14435      /lib/libtermcap.so.2.0.8
+  4001c000-4010d000 r-xp 00000000 5f:00 14387      /lib/libc-2.1.2.so
+  4010d000-40111000 rw-p 000f0000 5f:00 14387      /lib/libc-2.1.2.so
+  40111000-40114000 rw-p 00000000 00:00 0
+  40114000-4011e000 r-xp 00000000 5f:00 14408      /lib/libnss_files-2.1.2.so
+  4011e000-4011f000 rw-p 00009000 5f:00 14408      /lib/libnss_files-2.1.2.so
+  7fffd000-80000000 rwxp ffffe000 00:00 0
+
+
+Showing us the shared libraries init uses where they are in memory
+& memory access permissions for each virtual memory area.
+
+/proc/1/cwd is a softlink to the current working directory.
+
+/proc/1/root is the root of the filesystem for this process.
+
+/proc/1/mem is the current running processes memory which you
+can read & write to like a file.
+
+strace uses this sometimes as it is a bit faster than the
+rather inefficient ptrace interface for peeking at DATA.
+
+::
+
+  cat status
+
+  Name:   init
+  State:  S (sleeping)
+  Pid:    1
+  PPid:   0
+  Uid:    0       0       0       0
+  Gid:    0       0       0       0
+  Groups:
+  VmSize:      408 kB
+  VmLck:         0 kB
+  VmRSS:       208 kB
+  VmData:       24 kB
+  VmStk:         8 kB
+  VmExe:       368 kB
+  VmLib:         0 kB
+  SigPnd: 0000000000000000
+  SigBlk: 0000000000000000
+  SigIgn: 7fffffffd7f0d8fc
+  SigCgt: 00000000280b2603
+  CapInh: 00000000fffffeff
+  CapPrm: 00000000ffffffff
+  CapEff: 00000000fffffeff
+
+  User PSW:    070de000 80414146
+  task: 004b6000 tss: 004b62d8 ksp: 004b7ca8 pt_regs: 004b7f68
+  User GPRS:
+  00000400  00000000  0000000b  7ffffa90
+  00000000  00000000  00000000  0045d9f4
+  0045cafc  7ffffa90  7fffff18  0045cb08
+  00010400  804039e8  80403af8  7ffff8b0
+  User ACRS:
+  00000000  00000000  00000000  00000000
+  00000001  00000000  00000000  00000000
+  00000000  00000000  00000000  00000000
+  00000000  00000000  00000000  00000000
+  Kernel BackChain  CallChain    BackChain  CallChain
+	 004b7ca8   8002bd0c     004b7d18   8002b92c
+	 004b7db8   8005cd50     004b7e38   8005d12a
+	 004b7f08   80019114
+
+Showing among other things memory usage & status of some signals &
+the processes'es registers from the kernel task_structure
+as well as a backchain which may be useful if a process crashes
+in the kernel for some unknown reason.
+
+Some driver debugging techniques
+================================
+debug feature
+-------------
+Some of our drivers now support a "debug feature" in
+/proc/s390dbf see s390dbf.txt in the linux/Documentation directory
+for more info.
+
+e.g.
+to switch on the lcs "debug feature"::
+
+	echo 5 > /proc/s390dbf/lcs/level
+
+& then after the error occurred::
+
+	cat /proc/s390dbf/lcs/sprintf >/logfile
+
+the logfile now contains some information which may help
+tech support resolve a problem in the field.
+
+
+
+high level debugging network drivers
+------------------------------------
+ifconfig is a quite useful command
+it gives the current state of network drivers.
+
+If you suspect your network device driver is dead
+one way to check is type::
+
+	ifconfig <network device>
+
+e.g. tr0
+
+You should see something like::
+
+	ifconfig tr0
+	tr0      Link encap:16/4 Mbps Token Ring (New)  HWaddr 00:04:AC:20:8E:48
+		inet addr:9.164.185.132  Bcast:9.164.191.255  Mask:255.255.224.0
+		UP BROADCAST RUNNING MULTICAST  MTU:2000  Metric:1
+		RX packets:246134 errors:0 dropped:0 overruns:0 frame:0
+		TX packets:5 errors:0 dropped:0 overruns:0 carrier:0
+		collisions:0 txqueuelen:100
+
+if the device doesn't say up
+try::
+
+	/etc/rc.d/init.d/network start
+
+( this starts the network stack & hopefully calls ifconfig tr0 up ).
+ifconfig looks at the output of /proc/net/dev and presents it in a more
+presentable form.
+
+Now ping the device from a machine in the same subnet.
+
+if the RX packets count & TX packets counts don't increment you probably
+have problems.
+
+next::
+
+	cat /proc/net/arp
+
+Do you see any hardware addresses in the cache if not you may have problems.
+Next try::
+
+	ping -c 5 <broadcast_addr>
+
+i.e. the Bcast field above in the output of
+ifconfig. Do you see any replies from machines other than the local machine
+if not you may have problems. also if the TX packets count in ifconfig
+hasn't incremented either you have serious problems in your driver
+(e.g. the txbusy field of the network device being stuck on )
+or you may have multiple network devices connected.
+
+
+chandev
+-------
+There is a new device layer for channel devices, some
+drivers e.g. lcs are registered with this layer.
+
+If the device uses the channel device layer you'll be
+able to find what interrupts it uses & the current state
+of the device.
+
+See the manpage chandev.8 &type cat /proc/chandev for more info.
+
+
+SysRq
+=====
+This is now supported by linux for s/390 & z/Architecture.
+
+To enable it do compile the kernel with::
+
+	Kernel Hacking -> Magic SysRq Key Enabled
+
+Then::
+
+	echo "1" > /proc/sys/kernel/sysrq
+
+also type::
+
+	echo "8" >/proc/sys/kernel/printk
+
+To make printk output go to console.
+
+On 390 all commands are prefixed with::
+
+	^-
+
+e.g.::
+
+	^-t will show tasks.
+	^-? or some unknown command will display help.
+
+The sysrq key reading is very picky ( I have to type the keys in an
+xterm session & paste them  into the x3270 console )
+& it may be wise to predefine the keys as described in the VM hints above
+
+This is particularly useful for syncing disks unmounting & rebooting
+if the machine gets partially hung.
+
+Read Documentation/admin-guide/sysrq.rst for more info
+
+References:
+===========
+- Enterprise Systems Architecture Reference Summary
+- Enterprise Systems Architecture Principles of Operation
+- Hartmut Penners s390 stack frame sheet.
+- IBM Mainframe Channel Attachment a technology brief from a CISCO webpage
+- Various bits of man & info pages of Linux.
+- Linux & GDB source.
+- Various info & man pages.
+- CMS Help on tracing commands.
+- Linux for s/390 Elf Application Binary Interface
+- Linux for z/Series Elf Application Binary Interface ( Both Highly Recommended )
+- z/Architecture Principles of Operation SA22-7832-00
+- Enterprise Systems Architecture/390 Reference Summary SA22-7209-01 & the
+- Enterprise Systems Architecture/390 Principles of Operation SA22-7201-05
+
+Special Thanks
+==============
+Special thanks to Neale Ferguson who maintains a much
+prettier HTML version of this page at
+http://linuxvm.org/penguinvm/
+Bob Grainger Stefan Bader & others for reporting bugs
diff --git a/Documentation/s390/driver-model.rst b/Documentation/s390/driver-model.rst
new file mode 100644
index 000000000000..ad4bc2dbea43
--- /dev/null
+++ b/Documentation/s390/driver-model.rst
@@ -0,0 +1,328 @@
+=============================
+S/390 driver model interfaces
+=============================
+
+1. CCW devices
+--------------
+
+All devices which can be addressed by means of ccws are called 'CCW devices' -
+even if they aren't actually driven by ccws.
+
+All ccw devices are accessed via a subchannel, this is reflected in the
+structures under devices/::
+
+  devices/
+     - system/
+     - css0/
+	   - 0.0.0000/0.0.0815/
+	   - 0.0.0001/0.0.4711/
+	   - 0.0.0002/
+	   - 0.1.0000/0.1.1234/
+	   ...
+	   - defunct/
+
+In this example, device 0815 is accessed via subchannel 0 in subchannel set 0,
+device 4711 via subchannel 1 in subchannel set 0, and subchannel 2 is a non-I/O
+subchannel. Device 1234 is accessed via subchannel 0 in subchannel set 1.
+
+The subchannel named 'defunct' does not represent any real subchannel on the
+system; it is a pseudo subchannel where disconnected ccw devices are moved to
+if they are displaced by another ccw device becoming operational on their
+former subchannel. The ccw devices will be moved again to a proper subchannel
+if they become operational again on that subchannel.
+
+You should address a ccw device via its bus id (e.g. 0.0.4711); the device can
+be found under bus/ccw/devices/.
+
+All ccw devices export some data via sysfs.
+
+cutype:
+	The control unit type / model.
+
+devtype:
+	The device type / model, if applicable.
+
+availability:
+	      Can be 'good' or 'boxed'; 'no path' or 'no device' for
+	      disconnected devices.
+
+online:
+	    An interface to set the device online and offline.
+	    In the special case of the device being disconnected (see the
+	    notify function under 1.2), piping 0 to online will forcibly delete
+	    the device.
+
+The device drivers can add entries to export per-device data and interfaces.
+
+There is also some data exported on a per-subchannel basis (see under
+bus/css/devices/):
+
+chpids:
+	Via which chpids the device is connected.
+
+pimpampom:
+	The path installed, path available and path operational masks.
+
+There also might be additional data, for example for block devices.
+
+
+1.1 Bringing up a ccw device
+----------------------------
+
+This is done in several steps.
+
+a. Each driver can provide one or more parameter interfaces where parameters can
+   be specified. These interfaces are also in the driver's responsibility.
+b. After a. has been performed, if necessary, the device is finally brought up
+   via the 'online' interface.
+
+
+1.2 Writing a driver for ccw devices
+------------------------------------
+
+The basic struct ccw_device and struct ccw_driver data structures can be found
+under include/asm/ccwdev.h::
+
+  struct ccw_device {
+	spinlock_t *ccwlock;
+	struct ccw_device_private *private;
+	struct ccw_device_id id;
+
+	struct ccw_driver *drv;
+	struct device dev;
+	int online;
+
+	void (*handler) (struct ccw_device *dev, unsigned long intparm,
+			 struct irb *irb);
+  };
+
+  struct ccw_driver {
+	struct module *owner;
+	struct ccw_device_id *ids;
+	int (*probe) (struct ccw_device *);
+	int (*remove) (struct ccw_device *);
+	int (*set_online) (struct ccw_device *);
+	int (*set_offline) (struct ccw_device *);
+	int (*notify) (struct ccw_device *, int);
+	struct device_driver driver;
+	char *name;
+  };
+
+The 'private' field contains data needed for internal i/o operation only, and
+is not available to the device driver.
+
+Each driver should declare in a MODULE_DEVICE_TABLE into which CU types/models
+and/or device types/models it is interested. This information can later be found
+in the struct ccw_device_id fields::
+
+  struct ccw_device_id {
+	__u16   match_flags;
+
+	__u16   cu_type;
+	__u16   dev_type;
+	__u8    cu_model;
+	__u8    dev_model;
+
+	unsigned long driver_info;
+  };
+
+The functions in ccw_driver should be used in the following way:
+
+probe:
+	 This function is called by the device layer for each device the driver
+	 is interested in. The driver should only allocate private structures
+	 to put in dev->driver_data and create attributes (if needed). Also,
+	 the interrupt handler (see below) should be set here.
+
+::
+
+  int (*probe) (struct ccw_device *cdev);
+
+Parameters:
+		cdev
+			- the device to be probed.
+
+
+remove:
+	 This function is called by the device layer upon removal of the driver,
+	 the device or the module. The driver should perform cleanups here.
+
+::
+
+  int (*remove) (struct ccw_device *cdev);
+
+Parameters:
+		cdev
+			- the device to be removed.
+
+
+set_online:
+	    This function is called by the common I/O layer when the device is
+	    activated via the 'online' attribute. The driver should finally
+	    setup and activate the device here.
+
+::
+
+  int (*set_online) (struct ccw_device *);
+
+Parameters:
+		cdev
+			- the device to be activated. The common layer has
+			  verified that the device is not already online.
+
+
+set_offline: This function is called by the common I/O layer when the device is
+	     de-activated via the 'online' attribute. The driver should shut
+	     down the device, but not de-allocate its private data.
+
+::
+
+  int (*set_offline) (struct ccw_device *);
+
+Parameters:
+		cdev
+			- the device to be deactivated. The common layer has
+			   verified that the device is online.
+
+
+notify:
+	This function is called by the common I/O layer for some state changes
+	of the device.
+
+	Signalled to the driver are:
+
+	* In online state, device detached (CIO_GONE) or last path gone
+	  (CIO_NO_PATH). The driver must return !0 to keep the device; for
+	  return code 0, the device will be deleted as usual (also when no
+	  notify function is registered). If the driver wants to keep the
+	  device, it is moved into disconnected state.
+	* In disconnected state, device operational again (CIO_OPER). The
+	  common I/O layer performs some sanity checks on device number and
+	  Device / CU to be reasonably sure if it is still the same device.
+	  If not, the old device is removed and a new one registered. By the
+	  return code of the notify function the device driver signals if it
+	  wants the device back: !0 for keeping, 0 to make the device being
+	  removed and re-registered.
+
+::
+
+  int (*notify) (struct ccw_device *, int);
+
+Parameters:
+		cdev
+			- the device whose state changed.
+
+		event
+			- the event that happened. This can be one of CIO_GONE,
+			  CIO_NO_PATH or CIO_OPER.
+
+The handler field of the struct ccw_device is meant to be set to the interrupt
+handler for the device. In order to accommodate drivers which use several
+distinct handlers (e.g. multi subchannel devices), this is a member of ccw_device
+instead of ccw_driver.
+The handler is registered with the common layer during set_online() processing
+before the driver is called, and is deregistered during set_offline() after the
+driver has been called. Also, after registering / before deregistering, path
+grouping resp. disbanding of the path group (if applicable) are performed.
+
+::
+
+  void (*handler) (struct ccw_device *dev, unsigned long intparm, struct irb *irb);
+
+Parameters:     dev     - the device the handler is called for
+		intparm - the intparm which allows the device driver to identify
+			  the i/o the interrupt is associated with, or to recognize
+			  the interrupt as unsolicited.
+		irb     - interruption response block which contains the accumulated
+			  status.
+
+The device driver is called from the common ccw_device layer and can retrieve
+information about the interrupt from the irb parameter.
+
+
+1.3 ccwgroup devices
+--------------------
+
+The ccwgroup mechanism is designed to handle devices consisting of multiple ccw
+devices, like lcs or ctc.
+
+The ccw driver provides a 'group' attribute. Piping bus ids of ccw devices to
+this attributes creates a ccwgroup device consisting of these ccw devices (if
+possible). This ccwgroup device can be set online or offline just like a normal
+ccw device.
+
+Each ccwgroup device also provides an 'ungroup' attribute to destroy the device
+again (only when offline). This is a generic ccwgroup mechanism (the driver does
+not need to implement anything beyond normal removal routines).
+
+A ccw device which is a member of a ccwgroup device carries a pointer to the
+ccwgroup device in the driver_data of its device struct. This field must not be
+touched by the driver - it should use the ccwgroup device's driver_data for its
+private data.
+
+To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h. Keep in
+mind that most drivers will need to implement both a ccwgroup and a ccw
+driver.
+
+
+2. Channel paths
+-----------------
+
+Channel paths show up, like subchannels, under the channel subsystem root (css0)
+and are called 'chp0.<chpid>'. They have no driver and do not belong to any bus.
+Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect
+only the logical state and not the physical state, since we cannot track the
+latter consistently due to lacking machine support (we don't need to be aware
+of it anyway).
+
+status
+       - Can be 'online' or 'offline'.
+	 Piping 'on' or 'off' sets the chpid logically online/offline.
+	 Piping 'on' to an online chpid triggers path reprobing for all devices
+	 the chpid connects to. This can be used to force the kernel to re-use
+	 a channel path the user knows to be online, but the machine hasn't
+	 created a machine check for.
+
+type
+       - The physical type of the channel path.
+
+shared
+       - Whether the channel path is shared.
+
+cmg
+       - The channel measurement group.
+
+3. System devices
+-----------------
+
+3.1 xpram
+---------
+
+xpram shows up under devices/system/ as 'xpram'.
+
+3.2 cpus
+--------
+
+For each cpu, a directory is created under devices/system/cpu/. Each cpu has an
+attribute 'online' which can be 0 or 1.
+
+
+4. Other devices
+----------------
+
+4.1 Netiucv
+-----------
+
+The netiucv driver creates an attribute 'connection' under
+bus/iucv/drivers/netiucv. Piping to this attribute creates a new netiucv
+connection to the specified host.
+
+Netiucv connections show up under devices/iucv/ as "netiucv<ifnum>". The interface
+number is assigned sequentially to the connections defined via the 'connection'
+attribute.
+
+user
+    - shows the connection partner.
+
+buffer
+    - maximum buffer size. Pipe to it to change buffer size.
diff --git a/Documentation/s390/driver-model.txt b/Documentation/s390/driver-model.txt
deleted file mode 100644
index ed265cf54cde..000000000000
--- a/Documentation/s390/driver-model.txt
+++ /dev/null
@@ -1,287 +0,0 @@
-S/390 driver model interfaces
------------------------------
-
-1. CCW devices
---------------
-
-All devices which can be addressed by means of ccws are called 'CCW devices' -
-even if they aren't actually driven by ccws.
-
-All ccw devices are accessed via a subchannel, this is reflected in the 
-structures under devices/:
-
-devices/
-     - system/
-     - css0/
-           - 0.0.0000/0.0.0815/
-	   - 0.0.0001/0.0.4711/
-	   - 0.0.0002/
-	   - 0.1.0000/0.1.1234/
-	   ...
-	   - defunct/
-
-In this example, device 0815 is accessed via subchannel 0 in subchannel set 0,
-device 4711 via subchannel 1 in subchannel set 0, and subchannel 2 is a non-I/O
-subchannel. Device 1234 is accessed via subchannel 0 in subchannel set 1.
-
-The subchannel named 'defunct' does not represent any real subchannel on the
-system; it is a pseudo subchannel where disconnected ccw devices are moved to
-if they are displaced by another ccw device becoming operational on their
-former subchannel. The ccw devices will be moved again to a proper subchannel
-if they become operational again on that subchannel.
-
-You should address a ccw device via its bus id (e.g. 0.0.4711); the device can
-be found under bus/ccw/devices/.
-
-All ccw devices export some data via sysfs.
-
-cutype:	    The control unit type / model.
-
-devtype:    The device type / model, if applicable.
-
-availability: Can be 'good' or 'boxed'; 'no path' or 'no device' for
-	      disconnected devices.
-
-online:     An interface to set the device online and offline.
-	    In the special case of the device being disconnected (see the
-	    notify function under 1.2), piping 0 to online will forcibly delete
-	    the device.
-
-The device drivers can add entries to export per-device data and interfaces.
-
-There is also some data exported on a per-subchannel basis (see under
-bus/css/devices/):
-
-chpids:	    Via which chpids the device is connected.
-
-pimpampom:  The path installed, path available and path operational masks.
-
-There also might be additional data, for example for block devices.
-
-
-1.1 Bringing up a ccw device
-----------------------------
-
-This is done in several steps.
-
-a. Each driver can provide one or more parameter interfaces where parameters can
-   be specified. These interfaces are also in the driver's responsibility.
-b. After a. has been performed, if necessary, the device is finally brought up
-   via the 'online' interface.
-
-
-1.2 Writing a driver for ccw devices
-------------------------------------
-
-The basic struct ccw_device and struct ccw_driver data structures can be found
-under include/asm/ccwdev.h.
-
-struct ccw_device {
-        spinlock_t *ccwlock;
-        struct ccw_device_private *private;
-	struct ccw_device_id id;	
-
-	struct ccw_driver *drv;		
-	struct device dev;		
-	int online;
-
-	void (*handler) (struct ccw_device *dev, unsigned long intparm,
-                         struct irb *irb);
-};
-
-struct ccw_driver {
-	struct module *owner;		
-	struct ccw_device_id *ids;	
-	int (*probe) (struct ccw_device *); 
-	int (*remove) (struct ccw_device *);
-	int (*set_online) (struct ccw_device *);
-	int (*set_offline) (struct ccw_device *);
-	int (*notify) (struct ccw_device *, int);
-	struct device_driver driver;
-	char *name;
-};
-
-The 'private' field contains data needed for internal i/o operation only, and
-is not available to the device driver.
-
-Each driver should declare in a MODULE_DEVICE_TABLE into which CU types/models
-and/or device types/models it is interested. This information can later be found
-in the struct ccw_device_id fields:
-
-struct ccw_device_id {
-	__u16	match_flags;	
-
-	__u16	cu_type;	
-	__u16	dev_type;	
-	__u8	cu_model;	
-	__u8	dev_model;	
-
-	unsigned long driver_info;
-};
-
-The functions in ccw_driver should be used in the following way:
-probe:   This function is called by the device layer for each device the driver
-	 is interested in. The driver should only allocate private structures
-	 to put in dev->driver_data and create attributes (if needed). Also,
-	 the interrupt handler (see below) should be set here.
-
-int (*probe) (struct ccw_device *cdev); 
-
-Parameters:  cdev     - the device to be probed.
-
-
-remove:  This function is called by the device layer upon removal of the driver,
-	 the device or the module. The driver should perform cleanups here.
-
-int (*remove) (struct ccw_device *cdev);
-
-Parameters:   cdev    - the device to be removed.
-
-
-set_online: This function is called by the common I/O layer when the device is
-	    activated via the 'online' attribute. The driver should finally
-	    setup and activate the device here.
-
-int (*set_online) (struct ccw_device *);
-
-Parameters:   cdev	- the device to be activated. The common layer has
-			  verified that the device is not already online.
-
-
-set_offline: This function is called by the common I/O layer when the device is
-	     de-activated via the 'online' attribute. The driver should shut
-	     down the device, but not de-allocate its private data.
-
-int (*set_offline) (struct ccw_device *);
-
-Parameters:   cdev       - the device to be deactivated. The common layer has
-			   verified that the device is online.
-
-
-notify: This function is called by the common I/O layer for some state changes
-	of the device.
-	Signalled to the driver are:
-	* In online state, device detached (CIO_GONE) or last path gone
-	  (CIO_NO_PATH). The driver must return !0 to keep the device; for
-	  return code 0, the device will be deleted as usual (also when no
-	  notify function is registered). If the driver wants to keep the
-	  device, it is moved into disconnected state.
-	* In disconnected state, device operational again (CIO_OPER). The
-	  common I/O layer performs some sanity checks on device number and
-	  Device / CU to be reasonably sure if it is still the same device.
-	  If not, the old device is removed and a new one registered. By the
-	  return code of the notify function the device driver signals if it
-	  wants the device back: !0 for keeping, 0 to make the device being
-	  removed and re-registered.
-	
-int (*notify) (struct ccw_device *, int);
-
-Parameters:   cdev    - the device whose state changed.
-	      event   - the event that happened. This can be one of CIO_GONE,
-		        CIO_NO_PATH or CIO_OPER.
-
-The handler field of the struct ccw_device is meant to be set to the interrupt
-handler for the device. In order to accommodate drivers which use several 
-distinct handlers (e.g. multi subchannel devices), this is a member of ccw_device
-instead of ccw_driver.
-The handler is registered with the common layer during set_online() processing
-before the driver is called, and is deregistered during set_offline() after the
-driver has been called. Also, after registering / before deregistering, path 
-grouping resp. disbanding of the path group (if applicable) are performed.
-
-void (*handler) (struct ccw_device *dev, unsigned long intparm, struct irb *irb);
-
-Parameters:	dev	- the device the handler is called for
-		intparm - the intparm which allows the device driver to identify
-                          the i/o the interrupt is associated with, or to recognize
-                          the interrupt as unsolicited.
-                irb     - interruption response block which contains the accumulated
-                          status.
-
-The device driver is called from the common ccw_device layer and can retrieve 
-information about the interrupt from the irb parameter.
-
-
-1.3 ccwgroup devices
---------------------
-
-The ccwgroup mechanism is designed to handle devices consisting of multiple ccw
-devices, like lcs or ctc.
-
-The ccw driver provides a 'group' attribute. Piping bus ids of ccw devices to
-this attributes creates a ccwgroup device consisting of these ccw devices (if
-possible). This ccwgroup device can be set online or offline just like a normal
-ccw device.
-
-Each ccwgroup device also provides an 'ungroup' attribute to destroy the device
-again (only when offline). This is a generic ccwgroup mechanism (the driver does
-not need to implement anything beyond normal removal routines).
-
-A ccw device which is a member of a ccwgroup device carries a pointer to the
-ccwgroup device in the driver_data of its device struct. This field must not be
-touched by the driver - it should use the ccwgroup device's driver_data for its
-private data.
-
-To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h. Keep in
-mind that most drivers will need to implement both a ccwgroup and a ccw
-driver.
-
-
-2. Channel paths
------------------
-
-Channel paths show up, like subchannels, under the channel subsystem root (css0)
-and are called 'chp0.<chpid>'. They have no driver and do not belong to any bus.
-Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect
-only the logical state and not the physical state, since we cannot track the
-latter consistently due to lacking machine support (we don't need to be aware
-of it anyway).
-
-status - Can be 'online' or 'offline'.
-	 Piping 'on' or 'off' sets the chpid logically online/offline.
-	 Piping 'on' to an online chpid triggers path reprobing for all devices
-	 the chpid connects to. This can be used to force the kernel to re-use
-	 a channel path the user knows to be online, but the machine hasn't
-	 created a machine check for.
-
-type - The physical type of the channel path.
-
-shared - Whether the channel path is shared.
-
-cmg - The channel measurement group.
-
-3. System devices
------------------
-
-3.1 xpram 
----------
-
-xpram shows up under devices/system/ as 'xpram'.
-
-3.2 cpus
---------
-
-For each cpu, a directory is created under devices/system/cpu/. Each cpu has an
-attribute 'online' which can be 0 or 1.
-
-
-4. Other devices
-----------------
-
-4.1 Netiucv
------------
-
-The netiucv driver creates an attribute 'connection' under
-bus/iucv/drivers/netiucv. Piping to this attribute creates a new netiucv
-connection to the specified host.
-
-Netiucv connections show up under devices/iucv/ as "netiucv<ifnum>". The interface
-number is assigned sequentially to the connections defined via the 'connection'
-attribute.
-
-user			  - shows the connection partner.
-
-buffer			  - maximum buffer size.
-			    Pipe to it to change buffer size.
-
-
diff --git a/Documentation/s390/index.rst b/Documentation/s390/index.rst
new file mode 100644
index 000000000000..4602312909d3
--- /dev/null
+++ b/Documentation/s390/index.rst
@@ -0,0 +1,28 @@
+=================
+s390 Architecture
+=================
+
+.. toctree::
+    :maxdepth: 1
+
+    cds
+    3270
+    debugging390
+    driver-model
+    monreader
+    qeth
+    s390dbf
+    vfio-ap
+    vfio-ccw
+    zfcpdump
+    dasd
+    common_io
+
+    text_files
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/s390/monreader.rst b/Documentation/s390/monreader.rst
new file mode 100644
index 000000000000..1e857575c113
--- /dev/null
+++ b/Documentation/s390/monreader.rst
@@ -0,0 +1,212 @@
+=================================================
+Linux API for read access to z/VM Monitor Records
+=================================================
+
+Date  : 2004-Nov-26
+
+Author: Gerald Schaefer (geraldsc@de.ibm.com)
+
+
+
+
+Description
+===========
+This item delivers a new Linux API in the form of a misc char device that is
+usable from user space and allows read access to the z/VM Monitor Records
+collected by the `*MONITOR` System Service of z/VM.
+
+
+User Requirements
+=================
+The z/VM guest on which you want to access this API needs to be configured in
+order to allow IUCV connections to the `*MONITOR` service, i.e. it needs the
+IUCV `*MONITOR` statement in its user entry. If the monitor DCSS to be used is
+restricted (likely), you also need the NAMESAVE <DCSS NAME> statement.
+This item will use the IUCV device driver to access the z/VM services, so you
+need a kernel with IUCV support. You also need z/VM version 4.4 or 5.1.
+
+There are two options for being able to load the monitor DCSS (examples assume
+that the monitor DCSS begins at 144 MB and ends at 152 MB). You can query the
+location of the monitor DCSS with the Class E privileged CP command Q NSS MAP
+(the values BEGPAG and ENDPAG are given in units of 4K pages).
+
+See also "CP Command and Utility Reference" (SC24-6081-00) for more information
+on the DEF STOR and Q NSS MAP commands, as well as "Saved Segments Planning
+and Administration" (SC24-6116-00) for more information on DCSSes.
+
+1st option:
+-----------
+You can use the CP command DEF STOR CONFIG to define a "memory hole" in your
+guest virtual storage around the address range of the DCSS.
+
+Example: DEF STOR CONFIG 0.140M 200M.200M
+
+This defines two blocks of storage, the first is 140MB in size an begins at
+address 0MB, the second is 200MB in size and begins at address 200MB,
+resulting in a total storage of 340MB. Note that the first block should
+always start at 0 and be at least 64MB in size.
+
+2nd option:
+-----------
+Your guest virtual storage has to end below the starting address of the DCSS
+and you have to specify the "mem=" kernel parameter in your parmfile with a
+value greater than the ending address of the DCSS.
+
+Example::
+
+	DEF STOR 140M
+
+This defines 140MB storage size for your guest, the parameter "mem=160M" is
+added to the parmfile.
+
+
+User Interface
+==============
+The char device is implemented as a kernel module named "monreader",
+which can be loaded via the modprobe command, or it can be compiled into the
+kernel instead. There is one optional module (or kernel) parameter, "mondcss",
+to specify the name of the monitor DCSS. If the module is compiled into the
+kernel, the kernel parameter "monreader.mondcss=<DCSS NAME>" can be specified
+in the parmfile.
+
+The default name for the DCSS is "MONDCSS" if none is specified. In case that
+there are other users already connected to the `*MONITOR` service (e.g.
+Performance Toolkit), the monitor DCSS is already defined and you have to use
+the same DCSS. The CP command Q MONITOR (Class E privileged) shows the name
+of the monitor DCSS, if already defined, and the users connected to the
+`*MONITOR` service.
+Refer to the "z/VM Performance" book (SC24-6109-00) on how to create a monitor
+DCSS if your z/VM doesn't have one already, you need Class E privileges to
+define and save a DCSS.
+
+Example:
+--------
+
+::
+
+	modprobe monreader mondcss=MYDCSS
+
+This loads the module and sets the DCSS name to "MYDCSS".
+
+NOTE:
+-----
+This API provides no interface to control the `*MONITOR` service, e.g. specify
+which data should be collected. This can be done by the CP command MONITOR
+(Class E privileged), see "CP Command and Utility Reference".
+
+Device nodes with udev:
+-----------------------
+After loading the module, a char device will be created along with the device
+node /<udev directory>/monreader.
+
+Device nodes without udev:
+--------------------------
+If your distribution does not support udev, a device node will not be created
+automatically and you have to create it manually after loading the module.
+Therefore you need to know the major and minor numbers of the device. These
+numbers can be found in /sys/class/misc/monreader/dev.
+
+Typing cat /sys/class/misc/monreader/dev will give an output of the form
+<major>:<minor>. The device node can be created via the mknod command, enter
+mknod <name> c <major> <minor>, where <name> is the name of the device node
+to be created.
+
+Example:
+--------
+
+::
+
+	# modprobe monreader
+	# cat /sys/class/misc/monreader/dev
+	10:63
+	# mknod /dev/monreader c 10 63
+
+This loads the module with the default monitor DCSS (MONDCSS) and creates a
+device node.
+
+File operations:
+----------------
+The following file operations are supported: open, release, read, poll.
+There are two alternative methods for reading: either non-blocking read in
+conjunction with polling, or blocking read without polling. IOCTLs are not
+supported.
+
+Read:
+-----
+Reading from the device provides a 12 Byte monitor control element (MCE),
+followed by a set of one or more contiguous monitor records (similar to the
+output of the CMS utility MONWRITE without the 4K control blocks). The MCE
+contains information on the type of the following record set (sample/event
+data), the monitor domains contained within it and the start and end address
+of the record set in the monitor DCSS. The start and end address can be used
+to determine the size of the record set, the end address is the address of the
+last byte of data. The start address is needed to handle "end-of-frame" records
+correctly (domain 1, record 13), i.e. it can be used to determine the record
+start offset relative to a 4K page (frame) boundary.
+
+See "Appendix A: `*MONITOR`" in the "z/VM Performance" document for a description
+of the monitor control element layout. The layout of the monitor records can
+be found here (z/VM 5.1): http://www.vm.ibm.com/pubs/mon510/index.html
+
+The layout of the data stream provided by the monreader device is as follows::
+
+	...
+	<0 byte read>
+	<first MCE>              \
+	<first set of records>    |
+	...                       |- data set
+	<last MCE>                |
+	<last set of records>    /
+	<0 byte read>
+	...
+
+There may be more than one combination of MCE and corresponding record set
+within one data set and the end of each data set is indicated by a successful
+read with a return value of 0 (0 byte read).
+Any received data must be considered invalid until a complete set was
+read successfully, including the closing 0 byte read. Therefore you should
+always read the complete set into a buffer before processing the data.
+
+The maximum size of a data set can be as large as the size of the
+monitor DCSS, so design the buffer adequately or use dynamic memory allocation.
+The size of the monitor DCSS will be printed into syslog after loading the
+module. You can also use the (Class E privileged) CP command Q NSS MAP to
+list all available segments and information about them.
+
+As with most char devices, error conditions are indicated by returning a
+negative value for the number of bytes read. In this case, the errno variable
+indicates the error condition:
+
+EIO:
+     reply failed, read data is invalid and the application
+     should discard the data read since the last successful read with 0 size.
+EFAULT:
+	copy_to_user failed, read data is invalid and the application should
+	discard the data read since the last successful read with 0 size.
+EAGAIN:
+	occurs on a non-blocking read if there is no data available at the
+	moment. There is no data missing or corrupted, just try again or rather
+	use polling for non-blocking reads.
+EOVERFLOW:
+	   message limit reached, the data read since the last successful
+	   read with 0 size is valid but subsequent records may be missing.
+
+In the last case (EOVERFLOW) there may be missing data, in the first two cases
+(EIO, EFAULT) there will be missing data. It's up to the application if it will
+continue reading subsequent data or rather exit.
+
+Open:
+-----
+Only one user is allowed to open the char device. If it is already in use, the
+open function will fail (return a negative value) and set errno to EBUSY.
+The open function may also fail if an IUCV connection to the `*MONITOR` service
+cannot be established. In this case errno will be set to EIO and an error
+message with an IPUSER SEVER code will be printed into syslog. The IPUSER SEVER
+codes are described in the "z/VM Performance" book, Appendix A.
+
+NOTE:
+-----
+As soon as the device is opened, incoming messages will be accepted and they
+will account for the message limit, i.e. opening the device without reading
+from it will provoke the "message limit reached" error (EOVERFLOW error code)
+eventually.
diff --git a/Documentation/s390/monreader.txt b/Documentation/s390/monreader.txt
deleted file mode 100644
index d3729585fdb0..000000000000
--- a/Documentation/s390/monreader.txt
+++ /dev/null
@@ -1,197 +0,0 @@
-
-Date  : 2004-Nov-26
-Author: Gerald Schaefer (geraldsc@de.ibm.com)
-
-
-             Linux API for read access to z/VM Monitor Records
-             =================================================
-
-
-Description
-===========
-This item delivers a new Linux API in the form of a misc char device that is
-usable from user space and allows read access to the z/VM Monitor Records
-collected by the *MONITOR System Service of z/VM.
-
-
-User Requirements
-=================
-The z/VM guest on which you want to access this API needs to be configured in
-order to allow IUCV connections to the *MONITOR service, i.e. it needs the
-IUCV *MONITOR statement in its user entry. If the monitor DCSS to be used is
-restricted (likely), you also need the NAMESAVE <DCSS NAME> statement.
-This item will use the IUCV device driver to access the z/VM services, so you
-need a kernel with IUCV support. You also need z/VM version 4.4 or 5.1.
-
-There are two options for being able to load the monitor DCSS (examples assume
-that the monitor DCSS begins at 144 MB and ends at 152 MB). You can query the
-location of the monitor DCSS with the Class E privileged CP command Q NSS MAP
-(the values BEGPAG and ENDPAG are given in units of 4K pages).
-
-See also "CP Command and Utility Reference" (SC24-6081-00) for more information
-on the DEF STOR and Q NSS MAP commands, as well as "Saved Segments Planning
-and Administration" (SC24-6116-00) for more information on DCSSes.
-
-1st option:
------------
-You can use the CP command DEF STOR CONFIG to define a "memory hole" in your
-guest virtual storage around the address range of the DCSS.
-
-Example: DEF STOR CONFIG 0.140M 200M.200M
-
-This defines two blocks of storage, the first is 140MB in size an begins at
-address 0MB, the second is 200MB in size and begins at address 200MB,
-resulting in a total storage of 340MB. Note that the first block should
-always start at 0 and be at least 64MB in size.
-
-2nd option:
------------
-Your guest virtual storage has to end below the starting address of the DCSS
-and you have to specify the "mem=" kernel parameter in your parmfile with a
-value greater than the ending address of the DCSS.
-
-Example: DEF STOR 140M
-
-This defines 140MB storage size for your guest, the parameter "mem=160M" is
-added to the parmfile.
-
-
-User Interface
-==============
-The char device is implemented as a kernel module named "monreader",
-which can be loaded via the modprobe command, or it can be compiled into the
-kernel instead. There is one optional module (or kernel) parameter, "mondcss",
-to specify the name of the monitor DCSS. If the module is compiled into the
-kernel, the kernel parameter "monreader.mondcss=<DCSS NAME>" can be specified
-in the parmfile.
-
-The default name for the DCSS is "MONDCSS" if none is specified. In case that
-there are other users already connected to the *MONITOR service (e.g.
-Performance Toolkit), the monitor DCSS is already defined and you have to use
-the same DCSS. The CP command Q MONITOR (Class E privileged) shows the name
-of the monitor DCSS, if already defined, and the users connected to the
-*MONITOR service.
-Refer to the "z/VM Performance" book (SC24-6109-00) on how to create a monitor
-DCSS if your z/VM doesn't have one already, you need Class E privileges to
-define and save a DCSS.
-
-Example:
---------
-modprobe monreader mondcss=MYDCSS
-
-This loads the module and sets the DCSS name to "MYDCSS".
-
-NOTE:
------
-This API provides no interface to control the *MONITOR service, e.g. specify
-which data should be collected. This can be done by the CP command MONITOR
-(Class E privileged), see "CP Command and Utility Reference".
-
-Device nodes with udev:
------------------------
-After loading the module, a char device will be created along with the device
-node /<udev directory>/monreader.
-
-Device nodes without udev:
---------------------------
-If your distribution does not support udev, a device node will not be created
-automatically and you have to create it manually after loading the module.
-Therefore you need to know the major and minor numbers of the device. These
-numbers can be found in /sys/class/misc/monreader/dev.
-Typing cat /sys/class/misc/monreader/dev will give an output of the form
-<major>:<minor>. The device node can be created via the mknod command, enter
-mknod <name> c <major> <minor>, where <name> is the name of the device node
-to be created.
-
-Example:
---------
-# modprobe monreader
-# cat /sys/class/misc/monreader/dev
-10:63
-# mknod /dev/monreader c 10 63
-
-This loads the module with the default monitor DCSS (MONDCSS) and creates a
-device node.
-
-File operations:
-----------------
-The following file operations are supported: open, release, read, poll.
-There are two alternative methods for reading: either non-blocking read in
-conjunction with polling, or blocking read without polling. IOCTLs are not
-supported.
-
-Read:
------
-Reading from the device provides a 12 Byte monitor control element (MCE),
-followed by a set of one or more contiguous monitor records (similar to the
-output of the CMS utility MONWRITE without the 4K control blocks). The MCE
-contains information on the type of the following record set (sample/event
-data), the monitor domains contained within it and the start and end address
-of the record set in the monitor DCSS. The start and end address can be used
-to determine the size of the record set, the end address is the address of the
-last byte of data. The start address is needed to handle "end-of-frame" records
-correctly (domain 1, record 13), i.e. it can be used to determine the record
-start offset relative to a 4K page (frame) boundary.
-
-See "Appendix A: *MONITOR" in the "z/VM Performance" document for a description
-of the monitor control element layout. The layout of the monitor records can
-be found here (z/VM 5.1): http://www.vm.ibm.com/pubs/mon510/index.html
-
-The layout of the data stream provided by the monreader device is as follows:
-...
-<0 byte read>
-<first MCE>              \
-<first set of records>    |
-...                       |- data set
-<last MCE>                |
-<last set of records>    /
-<0 byte read>
-...
-
-There may be more than one combination of MCE and corresponding record set
-within one data set and the end of each data set is indicated by a successful
-read with a return value of 0 (0 byte read).
-Any received data must be considered invalid until a complete set was
-read successfully, including the closing 0 byte read. Therefore you should
-always read the complete set into a buffer before processing the data.
-
-The maximum size of a data set can be as large as the size of the
-monitor DCSS, so design the buffer adequately or use dynamic memory allocation.
-The size of the monitor DCSS will be printed into syslog after loading the
-module. You can also use the (Class E privileged) CP command Q NSS MAP to
-list all available segments and information about them.
-
-As with most char devices, error conditions are indicated by returning a
-negative value for the number of bytes read. In this case, the errno variable
-indicates the error condition:
-
-EIO: reply failed, read data is invalid and the application
-     should discard the data read since the last successful read with 0 size.
-EFAULT: copy_to_user failed, read data is invalid and the application should
-        discard the data read since the last successful read with 0 size.
-EAGAIN: occurs on a non-blocking read if there is no data available at the
-        moment. There is no data missing or corrupted, just try again or rather
-        use polling for non-blocking reads.
-EOVERFLOW: message limit reached, the data read since the last successful
-           read with 0 size is valid but subsequent records may be missing.
-
-In the last case (EOVERFLOW) there may be missing data, in the first two cases
-(EIO, EFAULT) there will be missing data. It's up to the application if it will
-continue reading subsequent data or rather exit.
-
-Open:
------
-Only one user is allowed to open the char device. If it is already in use, the
-open function will fail (return a negative value) and set errno to EBUSY.
-The open function may also fail if an IUCV connection to the *MONITOR service
-cannot be established. In this case errno will be set to EIO and an error
-message with an IPUSER SEVER code will be printed into syslog. The IPUSER SEVER
-codes are described in the "z/VM Performance" book, Appendix A.
-
-NOTE:
------
-As soon as the device is opened, incoming messages will be accepted and they
-will account for the message limit, i.e. opening the device without reading
-from it will provoke the "message limit reached" error (EOVERFLOW error code)
-eventually.
-
diff --git a/Documentation/s390/qeth.rst b/Documentation/s390/qeth.rst
new file mode 100644
index 000000000000..f02fdaa68de0
--- /dev/null
+++ b/Documentation/s390/qeth.rst
@@ -0,0 +1,64 @@
+=============================
+IBM s390 QDIO Ethernet Driver
+=============================
+
+OSA and HiperSockets Bridge Port Support
+========================================
+
+Uevents
+-------
+
+To generate the events the device must be assigned a role of either
+a primary or a secondary Bridge Port. For more information, see
+"z/VM Connectivity, SC24-6174".
+
+When run on an OSA or HiperSockets Bridge Capable Port hardware, and the state
+of some configured Bridge Port device on the channel changes, a udev
+event with ACTION=CHANGE is emitted on behalf of the corresponding
+ccwgroup device. The event has the following attributes:
+
+BRIDGEPORT=statechange
+  indicates that the Bridge Port device changed
+  its state.
+
+ROLE={primary|secondary|none}
+  the role assigned to the port.
+
+STATE={active|standby|inactive}
+  the newly assumed state of the port.
+
+When run on HiperSockets Bridge Capable Port hardware with host address
+notifications enabled, a udev event with ACTION=CHANGE is emitted.
+It is emitted on behalf of the corresponding ccwgroup device when a host
+or a VLAN is registered or unregistered on the network served by the device.
+The event has the following attributes:
+
+BRIDGEDHOST={reset|register|deregister|abort}
+  host address
+  notifications are started afresh, a new host or VLAN is registered or
+  deregistered on the Bridge Port HiperSockets channel, or address
+  notifications are aborted.
+
+VLAN=numeric-vlan-id
+  VLAN ID on which the event occurred. Not included
+  if no VLAN is involved in the event.
+
+MAC=xx:xx:xx:xx:xx:xx
+  MAC address of the host that is being registered
+  or deregistered from the HiperSockets channel. Not reported if the
+  event reports the creation or destruction of a VLAN.
+
+NTOK_BUSID=x.y.zzzz
+  device bus ID (CSSID, SSID and device number).
+
+NTOK_IID=xx
+  device IID.
+
+NTOK_CHPID=xx
+  device CHPID.
+
+NTOK_CHID=xxxx
+  device channel ID.
+
+Note that the `NTOK_*` attributes refer to devices other than  the one
+connected to the system on which the OS is running.
diff --git a/Documentation/s390/qeth.txt b/Documentation/s390/qeth.txt
deleted file mode 100644
index aa06fcf5f8c2..000000000000
--- a/Documentation/s390/qeth.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-IBM s390 QDIO Ethernet Driver
-
-OSA and HiperSockets Bridge Port Support
-
-Uevents
-
-To generate the events the device must be assigned a role of either
-a primary or a secondary Bridge Port. For more information, see
-"z/VM Connectivity, SC24-6174".
-
-When run on an OSA or HiperSockets Bridge Capable Port hardware, and the state
-of some configured Bridge Port device on the channel changes, a udev
-event with ACTION=CHANGE is emitted on behalf of the corresponding
-ccwgroup device. The event has the following attributes:
-
-BRIDGEPORT=statechange -  indicates that the Bridge Port device changed
-  its state.
-
-ROLE={primary|secondary|none} - the role assigned to the port.
-
-STATE={active|standby|inactive} - the newly assumed state of the port.
-
-When run on HiperSockets Bridge Capable Port hardware with host address
-notifications enabled, a udev event with ACTION=CHANGE is emitted.
-It is emitted on behalf of the corresponding ccwgroup device when a host
-or a VLAN is registered or unregistered on the network served by the device.
-The event has the following attributes:
-
-BRIDGEDHOST={reset|register|deregister|abort} - host address
-  notifications are started afresh, a new host or VLAN is registered or
-  deregistered on the Bridge Port HiperSockets channel, or address
-  notifications are aborted.
-
-VLAN=numeric-vlan-id - VLAN ID on which the event occurred. Not included
-  if no VLAN is involved in the event.
-
-MAC=xx:xx:xx:xx:xx:xx - MAC address of the host that is being registered
-  or deregistered from the HiperSockets channel. Not reported if the
-  event reports the creation or destruction of a VLAN.
-
-NTOK_BUSID=x.y.zzzz - device bus ID (CSSID, SSID and device number).
-
-NTOK_IID=xx - device IID.
-
-NTOK_CHPID=xx - device CHPID.
-
-NTOK_CHID=xxxx - device channel ID.
-
-Note that the NTOK_* attributes refer to devices other than  the one
-connected to the system on which the OS is running.
diff --git a/Documentation/s390/s390dbf.rst b/Documentation/s390/s390dbf.rst
new file mode 100644
index 000000000000..cdb36842b898
--- /dev/null
+++ b/Documentation/s390/s390dbf.rst
@@ -0,0 +1,487 @@
+==================
+S390 Debug Feature
+==================
+
+files:
+      - arch/s390/kernel/debug.c
+      - arch/s390/include/asm/debug.h
+
+Description:
+------------
+The goal of this feature is to provide a kernel debug logging API
+where log records can be stored efficiently in memory, where each component
+(e.g. device drivers) can have one separate debug log.
+One purpose of this is to inspect the debug logs after a production system crash
+in order to analyze the reason for the crash.
+
+If the system still runs but only a subcomponent which uses dbf fails,
+it is possible to look at the debug logs on a live system via the Linux
+debugfs filesystem.
+
+The debug feature may also very useful for kernel and driver development.
+
+Design:
+-------
+Kernel components (e.g. device drivers) can register themselves at the debug
+feature with the function call :c:func:`debug_register()`.
+This function initializes a
+debug log for the caller. For each debug log exists a number of debug areas
+where exactly one is active at one time.  Each debug area consists of contiguous
+pages in memory. In the debug areas there are stored debug entries (log records)
+which are written by event- and exception-calls.
+
+An event-call writes the specified debug entry to the active debug
+area and updates the log pointer for the active area. If the end
+of the active debug area is reached, a wrap around is done (ring buffer)
+and the next debug entry will be written at the beginning of the active
+debug area.
+
+An exception-call writes the specified debug entry to the log and
+switches to the next debug area. This is done in order to be sure
+that the records which describe the origin of the exception are not
+overwritten when a wrap around for the current area occurs.
+
+The debug areas themselves are also ordered in form of a ring buffer.
+When an exception is thrown in the last debug area, the following debug
+entries are then written again in the very first area.
+
+There are four versions for the event- and exception-calls: One for
+logging raw data, one for text, one for numbers (unsigned int and long),
+and one for sprintf-like formatted strings.
+
+Each debug entry contains the following data:
+
+- Timestamp
+- Cpu-Number of calling task
+- Level of debug entry (0...6)
+- Return Address to caller
+- Flag, if entry is an exception or not
+
+The debug logs can be inspected in a live system through entries in
+the debugfs-filesystem. Under the toplevel directory "``s390dbf``" there is
+a directory for each registered component, which is named like the
+corresponding component. The debugfs normally should be mounted to
+``/sys/kernel/debug`` therefore the debug feature can be accessed under
+``/sys/kernel/debug/s390dbf``.
+
+The content of the directories are files which represent different views
+to the debug log. Each component can decide which views should be
+used through registering them with the function :c:func:`debug_register_view()`.
+Predefined views for hex/ascii, sprintf and raw binary data are provided.
+It is also possible to define other views. The content of
+a view can be inspected simply by reading the corresponding debugfs file.
+
+All debug logs have an actual debug level (range from 0 to 6).
+The default level is 3. Event and Exception functions have a :c:data:`level`
+parameter. Only debug entries with a level that is lower or equal
+than the actual level are written to the log. This means, when
+writing events, high priority log entries should have a low level
+value whereas low priority entries should have a high one.
+The actual debug level can be changed with the help of the debugfs-filesystem
+through writing a number string "x" to the ``level`` debugfs file which is
+provided for every debug log. Debugging can be switched off completely
+by using "-" on the ``level`` debugfs file.
+
+Example::
+
+	> echo "-" > /sys/kernel/debug/s390dbf/dasd/level
+
+It is also possible to deactivate the debug feature globally for every
+debug log. You can change the behavior using  2 sysctl parameters in
+``/proc/sys/s390dbf``:
+
+There are currently 2 possible triggers, which stop the debug feature
+globally. The first possibility is to use the ``debug_active`` sysctl. If
+set to 1 the debug feature is running. If ``debug_active`` is set to 0 the
+debug feature is turned off.
+
+The second trigger which stops the debug feature is a kernel oops.
+That prevents the debug feature from overwriting debug information that
+happened before the oops. After an oops you can reactivate the debug feature
+by piping 1 to ``/proc/sys/s390dbf/debug_active``. Nevertheless, it's not
+suggested to use an oopsed kernel in a production environment.
+
+If you want to disallow the deactivation of the debug feature, you can use
+the ``debug_stoppable`` sysctl. If you set ``debug_stoppable`` to 0 the debug
+feature cannot be stopped. If the debug feature is already stopped, it
+will stay deactivated.
+
+Kernel Interfaces:
+------------------
+
+.. kernel-doc:: arch/s390/kernel/debug.c
+.. kernel-doc:: arch/s390/include/asm/debug.h
+
+Predefined views:
+-----------------
+
+.. code-block:: c
+
+  extern struct debug_view debug_hex_ascii_view;
+
+  extern struct debug_view debug_raw_view;
+
+  extern struct debug_view debug_sprintf_view;
+
+Examples
+--------
+
+.. code-block:: c
+
+  /*
+   * hex_ascii- + raw-view Example
+   */
+
+  #include <linux/init.h>
+  #include <asm/debug.h>
+
+  static debug_info_t *debug_info;
+
+  static int init(void)
+  {
+      /* register 4 debug areas with one page each and 4 byte data field */
+
+      debug_info = debug_register("test", 1, 4, 4 );
+      debug_register_view(debug_info, &debug_hex_ascii_view);
+      debug_register_view(debug_info, &debug_raw_view);
+
+      debug_text_event(debug_info, 4 , "one ");
+      debug_int_exception(debug_info, 4, 4711);
+      debug_event(debug_info, 3, &debug_info, 4);
+
+      return 0;
+  }
+
+  static void cleanup(void)
+  {
+      debug_unregister(debug_info);
+  }
+
+  module_init(init);
+  module_exit(cleanup);
+
+.. code-block:: c
+
+  /*
+   * sprintf-view Example
+   */
+
+  #include <linux/init.h>
+  #include <asm/debug.h>
+
+  static debug_info_t *debug_info;
+
+  static int init(void)
+  {
+      /* register 4 debug areas with one page each and data field for */
+      /* format string pointer + 2 varargs (= 3 * sizeof(long))       */
+
+      debug_info = debug_register("test", 1, 4, sizeof(long) * 3);
+      debug_register_view(debug_info, &debug_sprintf_view);
+
+      debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__);
+      debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info);
+
+      return 0;
+  }
+
+  static void cleanup(void)
+  {
+      debug_unregister(debug_info);
+  }
+
+  module_init(init);
+  module_exit(cleanup);
+
+Debugfs Interface
+-----------------
+Views to the debug logs can be investigated through reading the corresponding
+debugfs-files:
+
+Example::
+
+  > ls /sys/kernel/debug/s390dbf/dasd
+  flush  hex_ascii  level pages raw
+  > cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s
+  00 00974733272:680099 2 - 02 0006ad7e  07 ea 4a 90 | ....
+  00 00974733272:682210 2 - 02 0006ade6  46 52 45 45 | FREE
+  00 00974733272:682213 2 - 02 0006adf6  07 ea 4a 90 | ....
+  00 00974733272:682281 1 * 02 0006ab08  41 4c 4c 43 | EXCP
+  01 00974733272:682284 2 - 02 0006ab16  45 43 4b 44 | ECKD
+  01 00974733272:682287 2 - 02 0006ab28  00 00 00 04 | ....
+  01 00974733272:682289 2 - 02 0006ab3e  00 00 00 20 | ...
+  01 00974733272:682297 2 - 02 0006ad7e  07 ea 4a 90 | ....
+  01 00974733272:684384 2 - 00 0006ade6  46 52 45 45 | FREE
+  01 00974733272:684388 2 - 00 0006adf6  07 ea 4a 90 | ....
+
+See section about predefined views for explanation of the above output!
+
+Changing the debug level
+------------------------
+
+Example::
+
+
+  > cat /sys/kernel/debug/s390dbf/dasd/level
+  3
+  > echo "5" > /sys/kernel/debug/s390dbf/dasd/level
+  > cat /sys/kernel/debug/s390dbf/dasd/level
+  5
+
+Flushing debug areas
+--------------------
+Debug areas can be flushed with piping the number of the desired
+area (0...n) to the debugfs file "flush". When using "-" all debug areas
+are flushed.
+
+Examples:
+
+1. Flush debug area 0::
+
+     > echo "0" > /sys/kernel/debug/s390dbf/dasd/flush
+
+2. Flush all debug areas::
+
+     > echo "-" > /sys/kernel/debug/s390dbf/dasd/flush
+
+Changing the size of debug areas
+------------------------------------
+It is possible the change the size of debug areas through piping
+the number of pages to the debugfs file "pages". The resize request will
+also flush the debug areas.
+
+Example:
+
+Define 4 pages for the debug areas of debug feature "dasd"::
+
+  > echo "4" > /sys/kernel/debug/s390dbf/dasd/pages
+
+Stopping the debug feature
+--------------------------
+Example:
+
+1. Check if stopping is allowed::
+
+     > cat /proc/sys/s390dbf/debug_stoppable
+
+2. Stop debug feature::
+
+     > echo 0 > /proc/sys/s390dbf/debug_active
+
+crash Interface
+----------------
+The ``crash`` tool since v5.1.0 has a built-in command
+``s390dbf`` to display all the debug logs or export them to the file system.
+With this tool it is possible
+to investigate the debug logs on a live system and with a memory dump after
+a system crash.
+
+Investigating raw memory
+------------------------
+One last possibility to investigate the debug logs at a live
+system and after a system crash is to look at the raw memory
+under VM or at the Service Element.
+It is possible to find the anchor of the debug-logs through
+the ``debug_area_first`` symbol in the System map. Then one has
+to follow the correct pointers of the data-structures defined
+in debug.h and find the debug-areas in memory.
+Normally modules which use the debug feature will also have
+a global variable with the pointer to the debug-logs. Following
+this pointer it will also be possible to find the debug logs in
+memory.
+
+For this method it is recommended to use '16 * x + 4' byte (x = 0..n)
+for the length of the data field in :c:func:`debug_register()` in
+order to see the debug entries well formatted.
+
+
+Predefined Views
+----------------
+
+There are three predefined views: hex_ascii, raw and sprintf.
+The hex_ascii view shows the data field in hex and ascii representation
+(e.g. ``45 43 4b 44 | ECKD``).
+The raw view returns a bytestream as the debug areas are stored in memory.
+
+The sprintf view formats the debug entries in the same way as the sprintf
+function would do. The sprintf event/exception functions write to the
+debug entry a pointer to the format string (size = sizeof(long))
+and for each vararg a long value. So e.g. for a debug entry with a format
+string plus two varargs one would need to allocate a (3 * sizeof(long))
+byte data area in the debug_register() function.
+
+IMPORTANT:
+  Using "%s" in sprintf event functions is dangerous. You can only
+  use "%s" in the sprintf event functions, if the memory for the passed string
+  is available as long as the debug feature exists. The reason behind this is
+  that due to performance considerations only a pointer to the string is stored
+  in  the debug feature. If you log a string that is freed afterwards, you will
+  get an OOPS when inspecting the debug feature, because then the debug feature
+  will access the already freed memory.
+
+NOTE:
+  If using the sprintf view do NOT use other event/exception functions
+  than the sprintf-event and -exception functions.
+
+The format of the hex_ascii and sprintf view is as follows:
+
+- Number of area
+- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated
+  Universal Time (UTC), January 1, 1970)
+- level of debug entry
+- Exception flag (* = Exception)
+- Cpu-Number of calling task
+- Return Address to caller
+- data field
+
+The format of the raw view is:
+
+- Header as described in debug.h
+- datafield
+
+A typical line of the hex_ascii view will look like the following (first line
+is only for explanation and will not be displayed when 'cating' the view)::
+
+  area  time           level exception cpu caller    data (hex + ascii)
+  --------------------------------------------------------------------------
+  00    00964419409:440690 1 -         00  88023fe
+
+
+Defining views
+--------------
+
+Views are specified with the 'debug_view' structure. There are defined
+callback functions which are used for reading and writing the debugfs files:
+
+.. code-block:: c
+
+  struct debug_view {
+	char name[DEBUG_MAX_PROCF_LEN];
+	debug_prolog_proc_t* prolog_proc;
+	debug_header_proc_t* header_proc;
+	debug_format_proc_t* format_proc;
+	debug_input_proc_t*  input_proc;
+	void*                private_data;
+  };
+
+where:
+
+.. code-block:: c
+
+  typedef int (debug_header_proc_t) (debug_info_t* id,
+				     struct debug_view* view,
+				     int area,
+				     debug_entry_t* entry,
+				     char* out_buf);
+
+  typedef int (debug_format_proc_t) (debug_info_t* id,
+				     struct debug_view* view, char* out_buf,
+				     const char* in_buf);
+  typedef int (debug_prolog_proc_t) (debug_info_t* id,
+				     struct debug_view* view,
+				     char* out_buf);
+  typedef int (debug_input_proc_t) (debug_info_t* id,
+				    struct debug_view* view,
+				    struct file* file, const char* user_buf,
+				    size_t in_buf_size, loff_t* offset);
+
+
+The "private_data" member can be used as pointer to view specific data.
+It is not used by the debug feature itself.
+
+The output when reading a debugfs file is structured like this::
+
+  "prolog_proc output"
+
+  "header_proc output 1"  "format_proc output 1"
+  "header_proc output 2"  "format_proc output 2"
+  "header_proc output 3"  "format_proc output 3"
+  ...
+
+When a view is read from the debugfs, the Debug Feature calls the
+'prolog_proc' once for writing the prolog.
+Then 'header_proc' and 'format_proc' are called for each
+existing debug entry.
+
+The input_proc can be used to implement functionality when it is written to
+the view (e.g. like with ``echo "0" > /sys/kernel/debug/s390dbf/dasd/level``).
+
+For header_proc there can be used the default function
+:c:func:`debug_dflt_header_fn()` which is defined in debug.h.
+and which produces the same header output as the predefined views.
+E.g::
+
+  00 00964419409:440761 2 - 00 88023ec
+
+In order to see how to use the callback functions check the implementation
+of the default views!
+
+Example:
+
+.. code-block:: c
+
+  #include <asm/debug.h>
+
+  #define UNKNOWNSTR "data: %08x"
+
+  const char* messages[] =
+  {"This error...........\n",
+   "That error...........\n",
+   "Problem..............\n",
+   "Something went wrong.\n",
+   "Everything ok........\n",
+   NULL
+  };
+
+  static int debug_test_format_fn(
+     debug_info_t *id, struct debug_view *view,
+     char *out_buf, const char *in_buf
+  )
+  {
+    int i, rc = 0;
+
+    if (id->buf_size >= 4) {
+       int msg_nr = *((int*)in_buf);
+       if (msg_nr < sizeof(messages) / sizeof(char*) - 1)
+	  rc += sprintf(out_buf, "%s", messages[msg_nr]);
+       else
+	  rc += sprintf(out_buf, UNKNOWNSTR, msg_nr);
+    }
+    return rc;
+  }
+
+  struct debug_view debug_test_view = {
+    "myview",                 /* name of view */
+    NULL,                     /* no prolog */
+    &debug_dflt_header_fn,    /* default header for each entry */
+    &debug_test_format_fn,    /* our own format function */
+    NULL,                     /* no input function */
+    NULL                      /* no private data */
+  };
+
+test:
+=====
+
+.. code-block:: c
+
+  debug_info_t *debug_info;
+  int i;
+  ...
+  debug_info = debug_register("test", 0, 4, 4);
+  debug_register_view(debug_info, &debug_test_view);
+  for (i = 0; i < 10; i ++)
+    debug_int_event(debug_info, 1, i);
+
+::
+
+  > cat /sys/kernel/debug/s390dbf/test/myview
+  00 00964419734:611402 1 - 00 88042ca   This error...........
+  00 00964419734:611405 1 - 00 88042ca   That error...........
+  00 00964419734:611408 1 - 00 88042ca   Problem..............
+  00 00964419734:611411 1 - 00 88042ca   Something went wrong.
+  00 00964419734:611414 1 - 00 88042ca   Everything ok........
+  00 00964419734:611417 1 - 00 88042ca   data: 00000005
+  00 00964419734:611419 1 - 00 88042ca   data: 00000006
+  00 00964419734:611422 1 - 00 88042ca   data: 00000007
+  00 00964419734:611425 1 - 00 88042ca   data: 00000008
+  00 00964419734:611428 1 - 00 88042ca   data: 00000009
diff --git a/Documentation/s390/s390dbf.txt b/Documentation/s390/s390dbf.txt
deleted file mode 100644
index 61329fd62e89..000000000000
--- a/Documentation/s390/s390dbf.txt
+++ /dev/null
@@ -1,667 +0,0 @@
-S390 Debug Feature
-==================
-
-files: arch/s390/kernel/debug.c
-       arch/s390/include/asm/debug.h
-
-Description:
-------------
-The goal of this feature is to provide a kernel debug logging API 
-where log records can be stored efficiently in memory, where each component 
-(e.g. device drivers) can have one separate debug log.
-One purpose of this is to inspect the debug logs after a production system crash
-in order to analyze the reason for the crash.
-If the system still runs but only a subcomponent which uses dbf fails,
-it is possible to look at the debug logs on a live system via the Linux
-debugfs filesystem.
-The debug feature may also very useful for kernel and driver development.
-
-Design:
--------
-Kernel components (e.g. device drivers) can register themselves at the debug 
-feature with the function call debug_register(). This function initializes a 
-debug log for the caller. For each debug log exists a number of debug areas 
-where exactly one is active at one time.  Each debug area consists of contiguous
-pages in memory. In the debug areas there are stored debug entries (log records)
-which are written by event- and exception-calls. 
-
-An event-call writes the specified debug entry to the active debug
-area and updates the log pointer for the active area. If the end 
-of the active debug area is reached, a wrap around is done (ring buffer) 
-and the next debug entry will be written at the beginning of the active 
-debug area.
-
-An exception-call writes the specified debug entry to the log and
-switches to the next debug area. This is done in order to be sure
-that the records which describe the origin of the exception are not
-overwritten when a wrap around for the current area occurs.
-
-The debug areas themselves are also ordered in form of a ring buffer.
-When an exception is thrown in the last debug area, the following debug 
-entries are then written again in the very first area.
-
-There are three versions for the event- and exception-calls: One for
-logging raw data, one for text and one for numbers.
-
-Each debug entry contains the following data:
-
-- Timestamp
-- Cpu-Number of calling task
-- Level of debug entry (0...6)
-- Return Address to caller
-- Flag, if entry is an exception or not
-
-The debug logs can be inspected in a live system through entries in
-the debugfs-filesystem. Under the toplevel directory "s390dbf" there is
-a directory for each registered component, which is named like the
-corresponding component. The debugfs normally should be mounted to
-/sys/kernel/debug therefore the debug feature can be accessed under
-/sys/kernel/debug/s390dbf.
-
-The content of the directories are files which represent different views
-to the debug log. Each component can decide which views should be
-used through registering them with the function debug_register_view().
-Predefined views for hex/ascii, sprintf and raw binary data are provided.
-It is also possible to define other views. The content of
-a view can be inspected simply by reading the corresponding debugfs file.
-
-All debug logs have an actual debug level (range from 0 to 6).
-The default level is 3. Event and Exception functions have a 'level'
-parameter. Only debug entries with a level that is lower or equal
-than the actual level are written to the log. This means, when
-writing events, high priority log entries should have a low level
-value whereas low priority entries should have a high one.
-The actual debug level can be changed with the help of the debugfs-filesystem
-through writing a number string "x" to the 'level' debugfs file which is
-provided for every debug log. Debugging can be switched off completely
-by using "-" on the 'level' debugfs file.
-
-Example:
-
-> echo "-" > /sys/kernel/debug/s390dbf/dasd/level
-
-It is also possible to deactivate the debug feature globally for every
-debug log. You can change the behavior using  2 sysctl parameters in
-/proc/sys/s390dbf:
-There are currently 2 possible triggers, which stop the debug feature
-globally. The first possibility is to use the "debug_active" sysctl. If
-set to 1 the debug feature is running. If "debug_active" is set to 0 the
-debug feature is turned off.
-The second trigger which stops the debug feature is a kernel oops.
-That prevents the debug feature from overwriting debug information that
-happened before the oops. After an oops you can reactivate the debug feature
-by piping 1 to /proc/sys/s390dbf/debug_active. Nevertheless, its not
-suggested to use an oopsed kernel in a production environment.
-If you want to disallow the deactivation of the debug feature, you can use
-the "debug_stoppable" sysctl. If you set "debug_stoppable" to 0 the debug
-feature cannot be stopped. If the debug feature is already stopped, it
-will stay deactivated.
-
-Kernel Interfaces:
-------------------
-
-----------------------------------------------------------------------------
-debug_info_t *debug_register(char *name, int pages, int nr_areas,
-                             int buf_size);
-
-Parameter:    name:        Name of debug log (e.g. used for debugfs entry)
-              pages:       number of pages, which will be allocated per area
-              nr_areas:    number of debug areas
-              buf_size:    size of data area in each debug entry
-
-Return Value: Handle for generated debug area   
-              NULL if register failed 
-
-Description:  Allocates memory for a debug log     
-              Must not be called within an interrupt handler 
-
-----------------------------------------------------------------------------
-debug_info_t *debug_register_mode(char *name, int pages, int nr_areas,
-				  int buf_size, mode_t mode, uid_t uid,
-				  gid_t gid);
-
-Parameter:    name:	   Name of debug log (e.g. used for debugfs entry)
-	      pages:	   Number of pages, which will be allocated per area
-	      nr_areas:    Number of debug areas
-	      buf_size:    Size of data area in each debug entry
-	      mode:	   File mode for debugfs files. E.g. S_IRWXUGO
-	      uid:	   User ID for debugfs files. Currently only 0 is
-			   supported.
-	      gid:	   Group ID for debugfs files. Currently only 0 is
-			   supported.
-
-Return Value: Handle for generated debug area
-	      NULL if register failed
-
-Description:  Allocates memory for a debug log
-	      Must not be called within an interrupt handler
-
----------------------------------------------------------------------------
-void debug_unregister (debug_info_t * id);
-
-Parameter:     id:   handle for debug log  
-
-Return Value:  none 
-
-Description:   frees memory for a debug log and removes all registered debug
-	       views.
-               Must not be called within an interrupt handler 
-
----------------------------------------------------------------------------
-void debug_set_level (debug_info_t * id, int new_level);
-
-Parameter:     id:        handle for debug log  
-               new_level: new debug level 
-
-Return Value:  none 
-
-Description:   Sets new actual debug level if new_level is valid. 
-
----------------------------------------------------------------------------
-bool debug_level_enabled (debug_info_t * id, int level);
-
-Parameter:    id:	  handle for debug log
-	      level:	  debug level
-
-Return Value: True if level is less or equal to the current debug level.
-
-Description:  Returns true if debug events for the specified level would be
-	      logged. Otherwise returns false.
----------------------------------------------------------------------------
-void debug_stop_all(void);
-
-Parameter:     none
-
-Return Value:  none
-
-Description:   stops the debug feature if stopping is allowed. Currently
-               used in case of a kernel oops.
-
----------------------------------------------------------------------------
-debug_entry_t* debug_event (debug_info_t* id, int level, void* data, 
-                            int length);
-
-Parameter:     id:     handle for debug log  
-               level:  debug level           
-               data:   pointer to data for debug entry  
-               length: length of data in bytes       
-
-Return Value:  Address of written debug entry 
-
-Description:   writes debug entry to active debug area (if level <= actual 
-               debug level)    
-
----------------------------------------------------------------------------
-debug_entry_t* debug_int_event (debug_info_t * id, int level, 
-                                unsigned int data);
-debug_entry_t* debug_long_event(debug_info_t * id, int level,
-                                unsigned long data);
-
-Parameter:     id:     handle for debug log  
-               level:  debug level           
-               data:   integer value for debug entry           
-
-Return Value:  Address of written debug entry 
-
-Description:   writes debug entry to active debug area (if level <= actual 
-               debug level)    
-
----------------------------------------------------------------------------
-debug_entry_t* debug_text_event (debug_info_t * id, int level, 
-                                 const char* data);
-
-Parameter:     id:     handle for debug log  
-               level:  debug level           
-               data:   string for debug entry  
-
-Return Value:  Address of written debug entry 
-
-Description:   writes debug entry in ascii format to active debug area 
-               (if level <= actual debug level)     
-
----------------------------------------------------------------------------
-debug_entry_t* debug_sprintf_event (debug_info_t * id, int level, 
-                                    char* string,...);
-
-Parameter:     id:    handle for debug log 
-               level: debug level
-               string: format string for debug entry 
-               ...: varargs used as in sprintf()
-
-Return Value:  Address of written debug entry
-
-Description:   writes debug entry with format string and varargs (longs) to 
-               active debug area (if level $<=$ actual debug level). 
-               floats and long long datatypes cannot be used as varargs.
-
----------------------------------------------------------------------------
-
-debug_entry_t* debug_exception (debug_info_t* id, int level, void* data, 
-                                int length);
-
-Parameter:     id:     handle for debug log  
-               level:  debug level           
-               data:   pointer to data for debug entry  
-               length: length of data in bytes       
-
-Return Value:  Address of written debug entry 
-
-Description:   writes debug entry to active debug area (if level <= actual 
-               debug level) and switches to next debug area  
-
----------------------------------------------------------------------------
-debug_entry_t* debug_int_exception (debug_info_t * id, int level, 
-                                    unsigned int data);
-debug_entry_t* debug_long_exception(debug_info_t * id, int level,
-                                    unsigned long data);
-
-Parameter:     id:     handle for debug log  
-               level:  debug level           
-               data:   integer value for debug entry           
-
-Return Value:  Address of written debug entry 
-
-Description:   writes debug entry to active debug area (if level <= actual 
-               debug level) and switches to next debug area  
-
----------------------------------------------------------------------------
-debug_entry_t* debug_text_exception (debug_info_t * id, int level, 
-                                     const char* data);
-
-Parameter:     id:     handle for debug log  
-               level:  debug level           
-               data:   string for debug entry  
-
-Return Value:  Address of written debug entry 
-
-Description:   writes debug entry in ascii format to active debug area 
-               (if level <= actual debug level) and switches to next debug 
-               area  
-
----------------------------------------------------------------------------
-debug_entry_t* debug_sprintf_exception (debug_info_t * id, int level,
-                                        char* string,...);
-
-Parameter:     id:    handle for debug log  
-               level: debug level  
-               string: format string for debug entry  
-               ...: varargs used as in sprintf()
-
-Return Value:  Address of written debug entry 
-
-Description:   writes debug entry with format string and varargs (longs) to 
-               active debug area (if level $<=$ actual debug level) and
-               switches to next debug area. 
-               floats and long long datatypes cannot be used as varargs.
-
----------------------------------------------------------------------------
-
-int debug_register_view (debug_info_t * id, struct debug_view *view);
-
-Parameter:     id:    handle for debug log  
-               view:  pointer to debug view struct 
-
-Return Value:  0  : ok 
-               < 0: Error 
-
-Description:   registers new debug view and creates debugfs dir entry
-
----------------------------------------------------------------------------
-int debug_unregister_view (debug_info_t * id, struct debug_view *view); 
-
-Parameter:     id:    handle for debug log  
-               view:  pointer to debug view struct 
-
-Return Value:  0  : ok 
-               < 0: Error 
-
-Description:   unregisters debug view and removes debugfs dir entry
-
-
-
-Predefined views:
------------------
-
-extern struct debug_view debug_hex_ascii_view;
-extern struct debug_view debug_raw_view;
-extern struct debug_view debug_sprintf_view;
-
-Examples
---------
-
-/*
- * hex_ascii- + raw-view Example
- */
-
-#include <linux/init.h>
-#include <asm/debug.h>
-
-static debug_info_t* debug_info;
-
-static int init(void)
-{
-    /* register 4 debug areas with one page each and 4 byte data field */
-
-    debug_info = debug_register ("test", 1, 4, 4 );
-    debug_register_view(debug_info,&debug_hex_ascii_view);
-    debug_register_view(debug_info,&debug_raw_view);
-
-    debug_text_event(debug_info, 4 , "one ");
-    debug_int_exception(debug_info, 4, 4711);
-    debug_event(debug_info, 3, &debug_info, 4);
-
-    return 0;
-}
-
-static void cleanup(void)
-{
-    debug_unregister (debug_info);
-}
-
-module_init(init);
-module_exit(cleanup);
-
----------------------------------------------------------------------------
-
-/*
- * sprintf-view Example
- */
-
-#include <linux/init.h>
-#include <asm/debug.h>
-
-static debug_info_t* debug_info;
-
-static int init(void)
-{
-    /* register 4 debug areas with one page each and data field for */
-    /* format string pointer + 2 varargs (= 3 * sizeof(long))       */
-
-    debug_info = debug_register ("test", 1, 4, sizeof(long) * 3);
-    debug_register_view(debug_info,&debug_sprintf_view);
-
-    debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__);
-    debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info);
-
-    return 0;
-}
-
-static void cleanup(void)
-{
-    debug_unregister (debug_info);
-}
-
-module_init(init);
-module_exit(cleanup);
-
-
-
-Debugfs Interface
-----------------
-Views to the debug logs can be investigated through reading the corresponding 
-debugfs-files:
-
-Example:
-
-> ls /sys/kernel/debug/s390dbf/dasd
-flush  hex_ascii  level pages raw
-> cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s
-00 00974733272:680099 2 - 02 0006ad7e  07 ea 4a 90 | ....
-00 00974733272:682210 2 - 02 0006ade6  46 52 45 45 | FREE
-00 00974733272:682213 2 - 02 0006adf6  07 ea 4a 90 | ....
-00 00974733272:682281 1 * 02 0006ab08  41 4c 4c 43 | EXCP 
-01 00974733272:682284 2 - 02 0006ab16  45 43 4b 44 | ECKD
-01 00974733272:682287 2 - 02 0006ab28  00 00 00 04 | ....
-01 00974733272:682289 2 - 02 0006ab3e  00 00 00 20 | ... 
-01 00974733272:682297 2 - 02 0006ad7e  07 ea 4a 90 | ....
-01 00974733272:684384 2 - 00 0006ade6  46 52 45 45 | FREE
-01 00974733272:684388 2 - 00 0006adf6  07 ea 4a 90 | ....
-
-See section about predefined views for explanation of the above output!
-
-Changing the debug level
-------------------------
-
-Example:
-
-
-> cat /sys/kernel/debug/s390dbf/dasd/level
-3
-> echo "5" > /sys/kernel/debug/s390dbf/dasd/level
-> cat /sys/kernel/debug/s390dbf/dasd/level
-5
-
-Flushing debug areas
---------------------
-Debug areas can be flushed with piping the number of the desired
-area (0...n) to the debugfs file "flush". When using "-" all debug areas
-are flushed.
-
-Examples:
-
-1. Flush debug area 0:
-> echo "0" > /sys/kernel/debug/s390dbf/dasd/flush
-
-2. Flush all debug areas:
-> echo "-" > /sys/kernel/debug/s390dbf/dasd/flush
-
-Changing the size of debug areas
-------------------------------------
-It is possible the change the size of debug areas through piping
-the number of pages to the debugfs file "pages". The resize request will
-also flush the debug areas.
-
-Example:
-
-Define 4 pages for the debug areas of debug feature "dasd":
-> echo "4" > /sys/kernel/debug/s390dbf/dasd/pages
-
-Stooping the debug feature
---------------------------
-Example:
-
-1. Check if stopping is allowed
-> cat /proc/sys/s390dbf/debug_stoppable
-2. Stop debug feature
-> echo 0 > /proc/sys/s390dbf/debug_active
-
-lcrash Interface
-----------------
-It is planned that the dump analysis tool lcrash gets an additional command
-'s390dbf' to display all the debug logs. With this tool it will be possible 
-to investigate the debug logs on a live system and with a memory dump after 
-a system crash.
-
-Investigating raw memory
-------------------------
-One last possibility to investigate the debug logs at a live
-system and after a system crash is to look at the raw memory
-under VM or at the Service Element.
-It is possible to find the anker of the debug-logs through
-the 'debug_area_first' symbol in the System map. Then one has
-to follow the correct pointers of the data-structures defined
-in debug.h and find the debug-areas in memory.
-Normally modules which use the debug feature will also have
-a global variable with the pointer to the debug-logs. Following
-this pointer it will also be possible to find the debug logs in
-memory.
-
-For this method it is recommended to use '16 * x + 4' byte (x = 0..n)
-for the length of the data field in debug_register() in
-order to see the debug entries well formatted.
-
-
-Predefined Views
-----------------
-
-There are three predefined views: hex_ascii, raw and sprintf. 
-The hex_ascii view shows the data field in hex and ascii representation 
-(e.g. '45 43 4b 44 | ECKD'). 
-The raw view returns a bytestream as the debug areas are stored in memory.
-
-The sprintf view formats the debug entries in the same way as the sprintf
-function would do. The sprintf event/exception functions write to the
-debug entry a pointer to the format string (size = sizeof(long)) 
-and for each vararg a long value. So e.g. for a debug entry with a format 
-string plus two varargs one would need to allocate a (3 * sizeof(long)) 
-byte data area in the debug_register() function.
-
-IMPORTANT: Using "%s" in sprintf event functions is dangerous. You can only
-use "%s" in the sprintf event functions, if the memory for the passed string is
-available as long as the debug feature exists. The reason behind this is that
-due to performance considerations only a pointer to the string is stored in
-the debug feature. If you log a string that is freed afterwards, you will get
-an OOPS when inspecting the debug feature, because then the debug feature will
-access the already freed memory.
-
-NOTE: If using the sprintf view do NOT use other event/exception functions
-than the sprintf-event and -exception functions.
-
-The format of the hex_ascii and sprintf view is as follows:
-- Number of area
-- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated 
-  Universal Time (UTC), January 1, 1970)
-- level of debug entry
-- Exception flag (* = Exception)
-- Cpu-Number of calling task
-- Return Address to caller
-- data field
-
-The format of the raw view is:
-- Header as described in debug.h
-- datafield 
-
-A typical line of the hex_ascii view will look like the following (first line 
-is only for explanation and will not be displayed when 'cating' the view):
-
-area  time           level exception cpu caller    data (hex + ascii)
---------------------------------------------------------------------------
-00    00964419409:440690 1 -         00  88023fe   
-
-
-Defining views
---------------
-
-Views are specified with the 'debug_view' structure. There are defined
-callback functions which are used for reading and writing the debugfs files:
-
-struct debug_view {
-        char name[DEBUG_MAX_PROCF_LEN];  
-        debug_prolog_proc_t* prolog_proc; 
-        debug_header_proc_t* header_proc;
-        debug_format_proc_t* format_proc;
-        debug_input_proc_t*  input_proc;
-	void*                private_data;
-};
-
-where
-
-typedef int (debug_header_proc_t) (debug_info_t* id,
-                                   struct debug_view* view,
-                                   int area,
-                                   debug_entry_t* entry,
-                                   char* out_buf);
-
-typedef int (debug_format_proc_t) (debug_info_t* id,
-                                   struct debug_view* view, char* out_buf,
-                                   const char* in_buf);
-typedef int (debug_prolog_proc_t) (debug_info_t* id,
-                                   struct debug_view* view,
-                                   char* out_buf);
-typedef int (debug_input_proc_t) (debug_info_t* id,
-                                  struct debug_view* view,
-                                  struct file* file, const char* user_buf,
-                                  size_t in_buf_size, loff_t* offset);
-
-
-The "private_data" member can be used as pointer to view specific data.
-It is not used by the debug feature itself.
-
-The output when reading a debugfs file is structured like this:
-
-"prolog_proc output"
-
-"header_proc output 1"  "format_proc output 1"
-"header_proc output 2"  "format_proc output 2"
-"header_proc output 3"  "format_proc output 3"
-...
-
-When a view is read from the debugfs, the Debug Feature calls the
-'prolog_proc' once for writing the prolog.
-Then 'header_proc' and 'format_proc' are called for each 
-existing debug entry.
-
-The input_proc can be used to implement functionality when it is written to 
-the view (e.g. like with 'echo "0" > /sys/kernel/debug/s390dbf/dasd/level).
-
-For header_proc there can be used the default function
-debug_dflt_header_fn() which is defined in debug.h.
-and which produces the same header output as the predefined views.
-E.g:
-00 00964419409:440761 2 - 00 88023ec
-
-In order to see how to use the callback functions check the implementation
-of the default views!
-
-Example
-
-#include <asm/debug.h>
-
-#define UNKNOWNSTR "data: %08x"
-
-const char* messages[] =
-{"This error...........\n",
- "That error...........\n",
- "Problem..............\n",
- "Something went wrong.\n",
- "Everything ok........\n",
- NULL
-};
-
-static int debug_test_format_fn(
-   debug_info_t * id, struct debug_view *view, 
-   char *out_buf, const char *in_buf
-)
-{
-  int i, rc = 0;
-
-  if(id->buf_size >= 4) {
-     int msg_nr = *((int*)in_buf);
-     if(msg_nr < sizeof(messages)/sizeof(char*) - 1)
-        rc += sprintf(out_buf, "%s", messages[msg_nr]);	
-     else
-        rc += sprintf(out_buf, UNKNOWNSTR, msg_nr);
-  }
- out:
-   return rc;
-}
-
-struct debug_view debug_test_view = {
-  "myview",                 /* name of view */
-  NULL,                     /* no prolog */
-  &debug_dflt_header_fn,    /* default header for each entry */
-  &debug_test_format_fn,    /* our own format function */
-  NULL,                     /* no input function */
-  NULL                      /* no private data */
-};
-
-=====
-test:
-=====
-debug_info_t *debug_info;
-...
-debug_info = debug_register ("test", 0, 4, 4 ));
-debug_register_view(debug_info, &debug_test_view);
-for(i = 0; i < 10; i ++) debug_int_event(debug_info, 1, i);
-
-> cat /sys/kernel/debug/s390dbf/test/myview
-00 00964419734:611402 1 - 00 88042ca   This error...........
-00 00964419734:611405 1 - 00 88042ca   That error...........
-00 00964419734:611408 1 - 00 88042ca   Problem..............
-00 00964419734:611411 1 - 00 88042ca   Something went wrong.
-00 00964419734:611414 1 - 00 88042ca   Everything ok........
-00 00964419734:611417 1 - 00 88042ca   data: 00000005
-00 00964419734:611419 1 - 00 88042ca   data: 00000006
-00 00964419734:611422 1 - 00 88042ca   data: 00000007
-00 00964419734:611425 1 - 00 88042ca   data: 00000008
-00 00964419734:611428 1 - 00 88042ca   data: 00000009
diff --git a/Documentation/s390/text_files.rst b/Documentation/s390/text_files.rst
new file mode 100644
index 000000000000..c94d05d4fa17
--- /dev/null
+++ b/Documentation/s390/text_files.rst
@@ -0,0 +1,11 @@
+ibm 3270 changelog
+------------------
+
+.. include:: 3270.ChangeLog
+    :literal:
+
+ibm 3270 config3270.sh
+----------------------
+
+.. literalinclude:: config3270.sh
+    :language: shell
diff --git a/Documentation/s390/vfio-ap.rst b/Documentation/s390/vfio-ap.rst
new file mode 100644
index 000000000000..b5c51f7c748d
--- /dev/null
+++ b/Documentation/s390/vfio-ap.rst
@@ -0,0 +1,866 @@
+===============================
+Adjunct Processor (AP) facility
+===============================
+
+
+Introduction
+============
+The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised
+of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards.
+The AP devices provide cryptographic functions to all CPUs assigned to a
+linux system running in an IBM Z system LPAR.
+
+The AP adapter cards are exposed via the AP bus. The motivation for vfio-ap
+is to make AP cards available to KVM guests using the VFIO mediated device
+framework. This implementation relies considerably on the s390 virtualization
+facilities which do most of the hard work of providing direct access to AP
+devices.
+
+AP Architectural Overview
+=========================
+To facilitate the comprehension of the design, let's start with some
+definitions:
+
+* AP adapter
+
+  An AP adapter is an IBM Z adapter card that can perform cryptographic
+  functions. There can be from 0 to 256 adapters assigned to an LPAR. Adapters
+  assigned to the LPAR in which a linux host is running will be available to
+  the linux host. Each adapter is identified by a number from 0 to 255; however,
+  the maximum adapter number is determined by machine model and/or adapter type.
+  When installed, an AP adapter is accessed by AP instructions executed by any
+  CPU.
+
+  The AP adapter cards are assigned to a given LPAR via the system's Activation
+  Profile which can be edited via the HMC. When the linux host system is IPL'd
+  in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and
+  creates a sysfs device for each assigned adapter. For example, if AP adapters
+  4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following
+  sysfs device entries::
+
+    /sys/devices/ap/card04
+    /sys/devices/ap/card0a
+
+  Symbolic links to these devices will also be created in the AP bus devices
+  sub-directory::
+
+    /sys/bus/ap/devices/[card04]
+    /sys/bus/ap/devices/[card04]
+
+* AP domain
+
+  An adapter is partitioned into domains. An adapter can hold up to 256 domains
+  depending upon the adapter type and hardware configuration. A domain is
+  identified by a number from 0 to 255; however, the maximum domain number is
+  determined by machine model and/or adapter type.. A domain can be thought of
+  as a set of hardware registers and memory used for processing AP commands. A
+  domain can be configured with a secure private key used for clear key
+  encryption. A domain is classified in one of two ways depending upon how it
+  may be accessed:
+
+    * Usage domains are domains that are targeted by an AP instruction to
+      process an AP command.
+
+    * Control domains are domains that are changed by an AP command sent to a
+      usage domain; for example, to set the secure private key for the control
+      domain.
+
+  The AP usage and control domains are assigned to a given LPAR via the system's
+  Activation Profile which can be edited via the HMC. When a linux host system
+  is IPL'd in the LPAR, the AP bus module detects the AP usage and control
+  domains assigned to the LPAR. The domain number of each usage domain and
+  adapter number of each AP adapter are combined to create AP queue devices
+  (see AP Queue section below). The domain number of each control domain will be
+  represented in a bitmask and stored in a sysfs file
+  /sys/bus/ap/ap_control_domain_mask. The bits in the mask, from most to least
+  significant bit, correspond to domains 0-255.
+
+* AP Queue
+
+  An AP queue is the means by which an AP command is sent to a usage domain
+  inside a specific adapter. An AP queue is identified by a tuple
+  comprised of an AP adapter ID (APID) and an AP queue index (APQI). The
+  APQI corresponds to a given usage domain number within the adapter. This tuple
+  forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP
+  instructions include a field containing the APQN to identify the AP queue to
+  which the AP command is to be sent for processing.
+
+  The AP bus will create a sysfs device for each APQN that can be derived from
+  the cross product of the AP adapter and usage domain numbers detected when the
+  AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage
+  domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the
+  following sysfs entries::
+
+    /sys/devices/ap/card04/04.0006
+    /sys/devices/ap/card04/04.0047
+    /sys/devices/ap/card0a/0a.0006
+    /sys/devices/ap/card0a/0a.0047
+
+  The following symbolic links to these devices will be created in the AP bus
+  devices subdirectory::
+
+    /sys/bus/ap/devices/[04.0006]
+    /sys/bus/ap/devices/[04.0047]
+    /sys/bus/ap/devices/[0a.0006]
+    /sys/bus/ap/devices/[0a.0047]
+
+* AP Instructions:
+
+  There are three AP instructions:
+
+  * NQAP: to enqueue an AP command-request message to a queue
+  * DQAP: to dequeue an AP command-reply message from a queue
+  * PQAP: to administer the queues
+
+  AP instructions identify the domain that is targeted to process the AP
+  command; this must be one of the usage domains. An AP command may modify a
+  domain that is not one of the usage domains, but the modified domain
+  must be one of the control domains.
+
+AP and SIE
+==========
+Let's now take a look at how AP instructions executed on a guest are interpreted
+by the hardware.
+
+A satellite control block called the Crypto Control Block (CRYCB) is attached to
+our main hardware virtualization control block. The CRYCB contains three fields
+to identify the adapters, usage domains and control domains assigned to the KVM
+guest:
+
+* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned
+  to the KVM guest. Each bit in the mask, from left to right (i.e. from most
+  significant to least significant bit in big endian order), corresponds to
+  an APID from 0-255. If a bit is set, the corresponding adapter is valid for
+  use by the KVM guest.
+
+* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains
+  assigned to the KVM guest. Each bit in the mask, from left to right (i.e. from
+  most significant to least significant bit in big endian order), corresponds to
+  an AP queue index (APQI) from 0-255. If a bit is set, the corresponding queue
+  is valid for use by the KVM guest.
+
+* The AP Domain Mask field is a bit mask that identifies the AP control domains
+  assigned to the KVM guest. The ADM bit mask controls which domains can be
+  changed by an AP command-request message sent to a usage domain from the
+  guest. Each bit in the mask, from left to right (i.e. from most significant to
+  least significant bit in big endian order), corresponds to a domain from
+  0-255. If a bit is set, the corresponding domain can be modified by an AP
+  command-request message sent to a usage domain.
+
+If you recall from the description of an AP Queue, AP instructions include
+an APQN to identify the AP queue to which an AP command-request message is to be
+sent (NQAP and PQAP instructions), or from which a command-reply message is to
+be received (DQAP instruction). The validity of an APQN is defined by the matrix
+calculated from the APM and AQM; it is the cross product of all assigned adapter
+numbers (APM) with all assigned queue indexes (AQM). For example, if adapters 1
+and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs (1,5), (1,6),
+(2,5) and (2,6) will be valid for the guest.
+
+The APQNs can provide secure key functionality - i.e., a private key is stored
+on the adapter card for each of its domains - so each APQN must be assigned to
+at most one guest or to the linux host::
+
+   Example 1: Valid configuration:
+   ------------------------------
+   Guest1: adapters 1,2  domains 5,6
+   Guest2: adapter  1,2  domain 7
+
+   This is valid because both guests have a unique set of APQNs:
+      Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
+      Guest2 has APQNs (1,7), (2,7)
+
+   Example 2: Valid configuration:
+   ------------------------------
+   Guest1: adapters 1,2 domains 5,6
+   Guest2: adapters 3,4 domains 5,6
+
+   This is also valid because both guests have a unique set of APQNs:
+      Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
+      Guest2 has APQNs (3,5), (3,6), (4,5), (4,6)
+
+   Example 3: Invalid configuration:
+   --------------------------------
+   Guest1: adapters 1,2  domains 5,6
+   Guest2: adapter  1    domains 6,7
+
+   This is an invalid configuration because both guests have access to
+   APQN (1,6).
+
+The Design
+==========
+The design introduces three new objects:
+
+1. AP matrix device
+2. VFIO AP device driver (vfio_ap.ko)
+3. VFIO AP mediated matrix pass-through device
+
+The VFIO AP device driver
+-------------------------
+The VFIO AP (vfio_ap) device driver serves the following purposes:
+
+1. Provides the interfaces to secure APQNs for exclusive use of KVM guests.
+
+2. Sets up the VFIO mediated device interfaces to manage a mediated matrix
+   device and creates the sysfs interfaces for assigning adapters, usage
+   domains, and control domains comprising the matrix for a KVM guest.
+
+3. Configures the APM, AQM and ADM in the CRYCB referenced by a KVM guest's
+   SIE state description to grant the guest access to a matrix of AP devices
+
+Reserve APQNs for exclusive use of KVM guests
+---------------------------------------------
+The following block diagram illustrates the mechanism by which APQNs are
+reserved::
+
+				+------------------+
+		 7 remove       |                  |
+	   +--------------------> cex4queue driver |
+	   |                    |                  |
+	   |                    +------------------+
+	   |
+	   |
+	   |                    +------------------+          +----------------+
+	   |  5 register driver |                  | 3 create |                |
+	   |   +---------------->   Device core    +---------->  matrix device |
+	   |   |                |                  |          |                |
+	   |   |                +--------^---------+          +----------------+
+	   |   |                         |
+	   |   |                         +-------------------+
+	   |   | +-----------------------------------+       |
+	   |   | |      4 register AP driver         |       | 2 register device
+	   |   | |                                   |       |
+  +--------+---+-v---+                      +--------+-------+-+
+  |                  |                      |                  |
+  |      ap_bus      +--------------------- >  vfio_ap driver  |
+  |                  |       8 probe        |                  |
+  +--------^---------+                      +--^--^------------+
+  6 edit   |                                   |  |
+    apmask |     +-----------------------------+  | 9 mdev create
+    aqmask |     |           1 modprobe           |
+  +--------+-----+---+           +----------------+-+         +----------------+
+  |                  |           |                  |8 create |     mediated   |
+  |      admin       |           | VFIO device core |--------->     matrix     |
+  |                  +           |                  |         |     device     |
+  +------+-+---------+           +--------^---------+         +--------^-------+
+	 | |                              |                            |
+	 | | 9 create vfio_ap-passthrough |                            |
+	 | +------------------------------+                            |
+	 +-------------------------------------------------------------+
+		     10  assign adapter/domain/control domain
+
+The process for reserving an AP queue for use by a KVM guest is:
+
+1. The administrator loads the vfio_ap device driver
+2. The vfio-ap driver during its initialization will register a single 'matrix'
+   device with the device core. This will serve as the parent device for
+   all mediated matrix devices used to configure an AP matrix for a guest.
+3. The /sys/devices/vfio_ap/matrix device is created by the device core
+4. The vfio_ap device driver will register with the AP bus for AP queue devices
+   of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap
+   driver's probe and remove callback interfaces. Devices older than CEX4 queues
+   are not supported to simplify the implementation by not needlessly
+   complicating the design by supporting older devices that will go out of
+   service in the relatively near future, and for which there are few older
+   systems around on which to test.
+5. The AP bus registers the vfio_ap device driver with the device core
+6. The administrator edits the AP adapter and queue masks to reserve AP queues
+   for use by the vfio_ap device driver.
+7. The AP bus removes the AP queues reserved for the vfio_ap driver from the
+   default zcrypt cex4queue driver.
+8. The AP bus probes the vfio_ap device driver to bind the queues reserved for
+   it.
+9. The administrator creates a passthrough type mediated matrix device to be
+   used by a guest
+10. The administrator assigns the adapters, usage domains and control domains
+    to be exclusively used by a guest.
+
+Set up the VFIO mediated device interfaces
+------------------------------------------
+The VFIO AP device driver utilizes the common interface of the VFIO mediated
+device core driver to:
+
+* Register an AP mediated bus driver to add a mediated matrix device to and
+  remove it from a VFIO group.
+* Create and destroy a mediated matrix device
+* Add a mediated matrix device to and remove it from the AP mediated bus driver
+* Add a mediated matrix device to and remove it from an IOMMU group
+
+The following high-level block diagram shows the main components and interfaces
+of the VFIO AP mediated matrix device driver::
+
+   +-------------+
+   |             |
+   | +---------+ | mdev_register_driver() +--------------+
+   | |  Mdev   | +<-----------------------+              |
+   | |  bus    | |                        | vfio_mdev.ko |
+   | | driver  | +----------------------->+              |<-> VFIO user
+   | +---------+ |    probe()/remove()    +--------------+    APIs
+   |             |
+   |  MDEV CORE  |
+   |   MODULE    |
+   |   mdev.ko   |
+   | +---------+ | mdev_register_device() +--------------+
+   | |Physical | +<-----------------------+              |
+   | | device  | |                        |  vfio_ap.ko  |<-> matrix
+   | |interface| +----------------------->+              |    device
+   | +---------+ |       callback         +--------------+
+   +-------------+
+
+During initialization of the vfio_ap module, the matrix device is registered
+with an 'mdev_parent_ops' structure that provides the sysfs attribute
+structures, mdev functions and callback interfaces for managing the mediated
+matrix device.
+
+* sysfs attribute structures:
+
+  supported_type_groups
+    The VFIO mediated device framework supports creation of user-defined
+    mediated device types. These mediated device types are specified
+    via the 'supported_type_groups' structure when a device is registered
+    with the mediated device framework. The registration process creates the
+    sysfs structures for each mediated device type specified in the
+    'mdev_supported_types' sub-directory of the device being registered. Along
+    with the device type, the sysfs attributes of the mediated device type are
+    provided.
+
+    The VFIO AP device driver will register one mediated device type for
+    passthrough devices:
+
+      /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough
+
+    Only the read-only attributes required by the VFIO mdev framework will
+    be provided::
+
+	... name
+	... device_api
+	... available_instances
+	... device_api
+
+    Where:
+
+	* name:
+	    specifies the name of the mediated device type
+	* device_api:
+	    the mediated device type's API
+	* available_instances:
+	    the number of mediated matrix passthrough devices
+	    that can be created
+	* device_api:
+	    specifies the VFIO API
+  mdev_attr_groups
+    This attribute group identifies the user-defined sysfs attributes of the
+    mediated device. When a device is registered with the VFIO mediated device
+    framework, the sysfs attribute files identified in the 'mdev_attr_groups'
+    structure will be created in the mediated matrix device's directory. The
+    sysfs attributes for a mediated matrix device are:
+
+    assign_adapter / unassign_adapter:
+      Write-only attributes for assigning/unassigning an AP adapter to/from the
+      mediated matrix device. To assign/unassign an adapter, the APID of the
+      adapter is echoed to the respective attribute file.
+    assign_domain / unassign_domain:
+      Write-only attributes for assigning/unassigning an AP usage domain to/from
+      the mediated matrix device. To assign/unassign a domain, the domain
+      number of the the usage domain is echoed to the respective attribute
+      file.
+    matrix:
+      A read-only file for displaying the APQNs derived from the cross product
+      of the adapter and domain numbers assigned to the mediated matrix device.
+    assign_control_domain / unassign_control_domain:
+      Write-only attributes for assigning/unassigning an AP control domain
+      to/from the mediated matrix device. To assign/unassign a control domain,
+      the ID of the domain to be assigned/unassigned is echoed to the respective
+      attribute file.
+    control_domains:
+      A read-only file for displaying the control domain numbers assigned to the
+      mediated matrix device.
+
+* functions:
+
+  create:
+    allocates the ap_matrix_mdev structure used by the vfio_ap driver to:
+
+    * Store the reference to the KVM structure for the guest using the mdev
+    * Store the AP matrix configuration for the adapters, domains, and control
+      domains assigned via the corresponding sysfs attributes files
+
+  remove:
+    deallocates the mediated matrix device's ap_matrix_mdev structure. This will
+    be allowed only if a running guest is not using the mdev.
+
+* callback interfaces
+
+  open:
+    The vfio_ap driver uses this callback to register a
+    VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix
+    device. The open is invoked when QEMU connects the VFIO iommu group
+    for the mdev matrix device to the MDEV bus. Access to the KVM structure used
+    to configure the KVM guest is provided via this callback. The KVM structure,
+    is used to configure the guest's access to the AP matrix defined via the
+    mediated matrix device's sysfs attribute files.
+  release:
+    unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the
+    mdev matrix device and deconfigures the guest's AP matrix.
+
+Configure the APM, AQM and ADM in the CRYCB
+-------------------------------------------
+Configuring the AP matrix for a KVM guest will be performed when the
+VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier
+function is called when QEMU connects to KVM. The guest's AP matrix is
+configured via it's CRYCB by:
+
+* Setting the bits in the APM corresponding to the APIDs assigned to the
+  mediated matrix device via its 'assign_adapter' interface.
+* Setting the bits in the AQM corresponding to the domains assigned to the
+  mediated matrix device via its 'assign_domain' interface.
+* Setting the bits in the ADM corresponding to the domain dIDs assigned to the
+  mediated matrix device via its 'assign_control_domains' interface.
+
+The CPU model features for AP
+-----------------------------
+The AP stack relies on the presence of the AP instructions as well as two
+facilities: The AP Facilities Test (APFT) facility; and the AP Query
+Configuration Information (QCI) facility. These features/facilities are made
+available to a KVM guest via the following CPU model features:
+
+1. ap: Indicates whether the AP instructions are installed on the guest. This
+   feature will be enabled by KVM only if the AP instructions are installed
+   on the host.
+
+2. apft: Indicates the APFT facility is available on the guest. This facility
+   can be made available to the guest only if it is available on the host (i.e.,
+   facility bit 15 is set).
+
+3. apqci: Indicates the AP QCI facility is available on the guest. This facility
+   can be made available to the guest only if it is available on the host (i.e.,
+   facility bit 12 is set).
+
+Note: If the user chooses to specify a CPU model different than the 'host'
+model to QEMU, the CPU model features and facilities need to be turned on
+explicitly; for example::
+
+     /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on
+
+A guest can be precluded from using AP features/facilities by turning them off
+explicitly; for example::
+
+     /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off
+
+Note: If the APFT facility is turned off (apft=off) for the guest, the guest
+will not see any AP devices. The zcrypt device drivers that register for type 10
+and newer AP devices - i.e., the cex4card and cex4queue device drivers - need
+the APFT facility to ascertain the facilities installed on a given AP device. If
+the APFT facility is not installed on the guest, then the probe of device
+drivers will fail since only type 10 and newer devices can be configured for
+guest use.
+
+Example
+=======
+Let's now provide an example to illustrate how KVM guests may be given
+access to AP facilities. For this example, we will show how to configure
+three guests such that executing the lszcrypt command on the guests would
+look like this:
+
+Guest1
+------
+=========== ===== ============
+CARD.DOMAIN TYPE  MODE
+=========== ===== ============
+05          CEX5C CCA-Coproc
+05.0004     CEX5C CCA-Coproc
+05.00ab     CEX5C CCA-Coproc
+06          CEX5A Accelerator
+06.0004     CEX5A Accelerator
+06.00ab     CEX5C CCA-Coproc
+=========== ===== ============
+
+Guest2
+------
+=========== ===== ============
+CARD.DOMAIN TYPE  MODE
+=========== ===== ============
+05          CEX5A Accelerator
+05.0047     CEX5A Accelerator
+05.00ff     CEX5A Accelerator
+=========== ===== ============
+
+Guest2
+------
+=========== ===== ============
+CARD.DOMAIN TYPE  MODE
+=========== ===== ============
+06          CEX5A Accelerator
+06.0047     CEX5A Accelerator
+06.00ff     CEX5A Accelerator
+=========== ===== ============
+
+These are the steps:
+
+1. Install the vfio_ap module on the linux host. The dependency chain for the
+   vfio_ap module is:
+   * iommu
+   * s390
+   * zcrypt
+   * vfio
+   * vfio_mdev
+   * vfio_mdev_device
+   * KVM
+
+   To build the vfio_ap module, the kernel build must be configured with the
+   following Kconfig elements selected:
+   * IOMMU_SUPPORT
+   * S390
+   * ZCRYPT
+   * S390_AP_IOMMU
+   * VFIO
+   * VFIO_MDEV
+   * VFIO_MDEV_DEVICE
+   * KVM
+
+   If using make menuconfig select the following to build the vfio_ap module::
+
+     -> Device Drivers
+	-> IOMMU Hardware Support
+	   select S390 AP IOMMU Support
+	-> VFIO Non-Privileged userspace driver framework
+	   -> Mediated device driver frramework
+	      -> VFIO driver for Mediated devices
+     -> I/O subsystem
+	-> VFIO support for AP devices
+
+2. Secure the AP queues to be used by the three guests so that the host can not
+   access them. To secure them, there are two sysfs files that specify
+   bitmasks marking a subset of the APQN range as 'usable by the default AP
+   queue device drivers' or 'not usable by the default device drivers' and thus
+   available for use by the vfio_ap device driver'. The location of the sysfs
+   files containing the masks are::
+
+     /sys/bus/ap/apmask
+     /sys/bus/ap/aqmask
+
+   The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs
+   (APID). Each bit in the mask, from left to right (i.e., from most significant
+   to least significant bit in big endian order), corresponds to an APID from
+   0-255. If a bit is set, the APID is marked as usable only by the default AP
+   queue device drivers; otherwise, the APID is usable by the vfio_ap
+   device driver.
+
+   The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes
+   (APQI). Each bit in the mask, from left to right (i.e., from most significant
+   to least significant bit in big endian order), corresponds to an APQI from
+   0-255. If a bit is set, the APQI is marked as usable only by the default AP
+   queue device drivers; otherwise, the APQI is usable by the vfio_ap device
+   driver.
+
+   Take, for example, the following mask::
+
+      0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+
+    It indicates:
+
+      1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6
+      belong to the vfio_ap device driver's pool.
+
+   The APQN of each AP queue device assigned to the linux host is checked by the
+   AP bus against the set of APQNs derived from the cross product of APIDs
+   and APQIs marked as usable only by the default AP queue device drivers. If a
+   match is detected,  only the default AP queue device drivers will be probed;
+   otherwise, the vfio_ap device driver will be probed.
+
+   By default, the two masks are set to reserve all APQNs for use by the default
+   AP queue device drivers. There are two ways the default masks can be changed:
+
+   1. The sysfs mask files can be edited by echoing a string into the
+      respective sysfs mask file in one of two formats:
+
+      * An absolute hex string starting with 0x - like "0x12345678" - sets
+	the mask. If the given string is shorter than the mask, it is padded
+	with 0s on the right; for example, specifying a mask value of 0x41 is
+	the same as specifying::
+
+	   0x4100000000000000000000000000000000000000000000000000000000000000
+
+	Keep in mind that the mask reads from left to right (i.e., most
+	significant to least significant bit in big endian order), so the mask
+	above identifies device numbers 1 and 7 (01000001).
+
+	If the string is longer than the mask, the operation is terminated with
+	an error (EINVAL).
+
+      * Individual bits in the mask can be switched on and off by specifying
+	each bit number to be switched in a comma separated list. Each bit
+	number string must be prepended with a ('+') or minus ('-') to indicate
+	the corresponding bit is to be switched on ('+') or off ('-'). Some
+	valid values are:
+
+	   - "+0"    switches bit 0 on
+	   - "-13"   switches bit 13 off
+	   - "+0x41" switches bit 65 on
+	   - "-0xff" switches bit 255 off
+
+	The following example:
+
+	      +0,-6,+0x47,-0xf0
+
+	Switches bits 0 and 71 (0x47) on
+
+	Switches bits 6 and 240 (0xf0) off
+
+	Note that the bits not specified in the list remain as they were before
+	the operation.
+
+   2. The masks can also be changed at boot time via parameters on the kernel
+      command line like this:
+
+	 ap.apmask=0xffff ap.aqmask=0x40
+
+	 This would create the following masks::
+
+	    apmask:
+	    0xffff000000000000000000000000000000000000000000000000000000000000
+
+	    aqmask:
+	    0x4000000000000000000000000000000000000000000000000000000000000000
+
+	 Resulting in these two pools::
+
+	    default drivers pool:    adapter 0-15, domain 1
+	    alternate drivers pool:  adapter 16-255, domains 0, 2-255
+
+Securing the APQNs for our example
+----------------------------------
+   To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047,
+   06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding
+   APQNs can either be removed from the default masks::
+
+      echo -5,-6 > /sys/bus/ap/apmask
+
+      echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask
+
+   Or the masks can be set as follows::
+
+      echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \
+      > apmask
+
+      echo 0xf7fffffffffffffffeffffffffffffffffffffffffeffffffffffffffffffffe \
+      > aqmask
+
+   This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004,
+   06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The
+   sysfs directory for the vfio_ap device driver will now contain symbolic links
+   to the AP queue devices bound to it::
+
+     /sys/bus/ap
+     ... [drivers]
+     ...... [vfio_ap]
+     ......... [05.0004]
+     ......... [05.0047]
+     ......... [05.00ab]
+     ......... [05.00ff]
+     ......... [06.0004]
+     ......... [06.0047]
+     ......... [06.00ab]
+     ......... [06.00ff]
+
+   Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later)
+   can be bound to the vfio_ap device driver. The reason for this is to
+   simplify the implementation by not needlessly complicating the design by
+   supporting older devices that will go out of service in the relatively near
+   future and for which there are few older systems on which to test.
+
+   The administrator, therefore, must take care to secure only AP queues that
+   can be bound to the vfio_ap device driver. The device type for a given AP
+   queue device can be read from the parent card's sysfs directory. For example,
+   to see the hardware type of the queue 05.0004:
+
+     cat /sys/bus/ap/devices/card05/hwtype
+
+   The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the
+   vfio_ap device driver.
+
+3. Create the mediated devices needed to configure the AP matrixes for the
+   three guests and to provide an interface to the vfio_ap driver for
+   use by the guests::
+
+     /sys/devices/vfio_ap/matrix/
+     --- [mdev_supported_types]
+     ------ [vfio_ap-passthrough] (passthrough mediated matrix device type)
+     --------- create
+     --------- [devices]
+
+   To create the mediated devices for the three guests::
+
+	uuidgen > create
+	uuidgen > create
+	uuidgen > create
+
+	or
+
+	echo $uuid1 > create
+	echo $uuid2 > create
+	echo $uuid3 > create
+
+   This will create three mediated devices in the [devices] subdirectory named
+   after the UUID written to the create attribute file. We call them $uuid1,
+   $uuid2 and $uuid3 and this is the sysfs directory structure after creation::
+
+     /sys/devices/vfio_ap/matrix/
+     --- [mdev_supported_types]
+     ------ [vfio_ap-passthrough]
+     --------- [devices]
+     ------------ [$uuid1]
+     --------------- assign_adapter
+     --------------- assign_control_domain
+     --------------- assign_domain
+     --------------- matrix
+     --------------- unassign_adapter
+     --------------- unassign_control_domain
+     --------------- unassign_domain
+
+     ------------ [$uuid2]
+     --------------- assign_adapter
+     --------------- assign_control_domain
+     --------------- assign_domain
+     --------------- matrix
+     --------------- unassign_adapter
+     ----------------unassign_control_domain
+     ----------------unassign_domain
+
+     ------------ [$uuid3]
+     --------------- assign_adapter
+     --------------- assign_control_domain
+     --------------- assign_domain
+     --------------- matrix
+     --------------- unassign_adapter
+     ----------------unassign_control_domain
+     ----------------unassign_domain
+
+4. The administrator now needs to configure the matrixes for the mediated
+   devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3).
+
+   This is how the matrix is configured for Guest1::
+
+      echo 5 > assign_adapter
+      echo 6 > assign_adapter
+      echo 4 > assign_domain
+      echo 0xab > assign_domain
+
+   Control domains can similarly be assigned using the assign_control_domain
+   sysfs file.
+
+   If a mistake is made configuring an adapter, domain or control domain,
+   you can use the unassign_xxx files to unassign the adapter, domain or
+   control domain.
+
+   To display the matrix configuration for Guest1::
+
+	 cat matrix
+
+   This is how the matrix is configured for Guest2::
+
+      echo 5 > assign_adapter
+      echo 0x47 > assign_domain
+      echo 0xff > assign_domain
+
+   This is how the matrix is configured for Guest3::
+
+      echo 6 > assign_adapter
+      echo 0x47 > assign_domain
+      echo 0xff > assign_domain
+
+   In order to successfully assign an adapter:
+
+   * The adapter number specified must represent a value from 0 up to the
+     maximum adapter number configured for the system. If an adapter number
+     higher than the maximum is specified, the operation will terminate with
+     an error (ENODEV).
+
+   * All APQNs that can be derived from the adapter ID and the IDs of
+     the previously assigned domains must be bound to the vfio_ap device
+     driver. If no domains have yet been assigned, then there must be at least
+     one APQN with the specified APID bound to the vfio_ap driver. If no such
+     APQNs are bound to the driver, the operation will terminate with an
+     error (EADDRNOTAVAIL).
+
+     No APQN that can be derived from the adapter ID and the IDs of the
+     previously assigned domains can be assigned to another mediated matrix
+     device. If an APQN is assigned to another mediated matrix device, the
+     operation will terminate with an error (EADDRINUSE).
+
+   In order to successfully assign a domain:
+
+   * The domain number specified must represent a value from 0 up to the
+     maximum domain number configured for the system. If a domain number
+     higher than the maximum is specified, the operation will terminate with
+     an error (ENODEV).
+
+   * All APQNs that can be derived from the domain ID and the IDs of
+     the previously assigned adapters must be bound to the vfio_ap device
+     driver. If no domains have yet been assigned, then there must be at least
+     one APQN with the specified APQI bound to the vfio_ap driver. If no such
+     APQNs are bound to the driver, the operation will terminate with an
+     error (EADDRNOTAVAIL).
+
+     No APQN that can be derived from the domain ID and the IDs of the
+     previously assigned adapters can be assigned to another mediated matrix
+     device. If an APQN is assigned to another mediated matrix device, the
+     operation will terminate with an error (EADDRINUSE).
+
+   In order to successfully assign a control domain, the domain number
+   specified must represent a value from 0 up to the maximum domain number
+   configured for the system. If a control domain number higher than the maximum
+   is specified, the operation will terminate with an error (ENODEV).
+
+5. Start Guest1::
+
+     /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+	-device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ...
+
+7. Start Guest2::
+
+     /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+	-device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ...
+
+7. Start Guest3::
+
+     /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+	-device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ...
+
+When the guest is shut down, the mediated matrix devices may be removed.
+
+Using our example again, to remove the mediated matrix device $uuid1::
+
+   /sys/devices/vfio_ap/matrix/
+      --- [mdev_supported_types]
+      ------ [vfio_ap-passthrough]
+      --------- [devices]
+      ------------ [$uuid1]
+      --------------- remove
+
+::
+
+   echo 1 > remove
+
+This will remove all of the mdev matrix device's sysfs structures including
+the mdev device itself. To recreate and reconfigure the mdev matrix device,
+all of the steps starting with step 3 will have to be performed again. Note
+that the remove will fail if a guest using the mdev is still running.
+
+It is not necessary to remove an mdev matrix device, but one may want to
+remove it if no guest will use it during the remaining lifetime of the linux
+host. If the mdev matrix device is removed, one may want to also reconfigure
+the pool of adapters and queues reserved for use by the default drivers.
+
+Limitations
+===========
+* The KVM/kernel interfaces do not provide a way to prevent restoring an APQN
+  to the default drivers pool of a queue that is still assigned to a mediated
+  device in use by a guest. It is incumbent upon the administrator to
+  ensure there is no mediated device in use by a guest to which the APQN is
+  assigned lest the host be given access to the private data of the AP queue
+  device such as a private key configured specifically for the guest.
+
+* Dynamically modifying the AP matrix for a running guest (which would amount to
+  hot(un)plug of AP devices for the guest) is currently not supported
+
+* Live guest migration is not supported for guests using AP devices.
diff --git a/Documentation/s390/vfio-ap.txt b/Documentation/s390/vfio-ap.txt
deleted file mode 100644
index 65167cfe4485..000000000000
--- a/Documentation/s390/vfio-ap.txt
+++ /dev/null
@@ -1,837 +0,0 @@
-Introduction:
-============
-The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised
-of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards.
-The AP devices provide cryptographic functions to all CPUs assigned to a
-linux system running in an IBM Z system LPAR.
-
-The AP adapter cards are exposed via the AP bus. The motivation for vfio-ap
-is to make AP cards available to KVM guests using the VFIO mediated device
-framework. This implementation relies considerably on the s390 virtualization
-facilities which do most of the hard work of providing direct access to AP
-devices.
-
-AP Architectural Overview:
-=========================
-To facilitate the comprehension of the design, let's start with some
-definitions:
-
-* AP adapter
-
-  An AP adapter is an IBM Z adapter card that can perform cryptographic
-  functions. There can be from 0 to 256 adapters assigned to an LPAR. Adapters
-  assigned to the LPAR in which a linux host is running will be available to
-  the linux host. Each adapter is identified by a number from 0 to 255; however,
-  the maximum adapter number is determined by machine model and/or adapter type.
-  When installed, an AP adapter is accessed by AP instructions executed by any
-  CPU.
-
-  The AP adapter cards are assigned to a given LPAR via the system's Activation
-  Profile which can be edited via the HMC. When the linux host system is IPL'd
-  in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and
-  creates a sysfs device for each assigned adapter. For example, if AP adapters
-  4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following
-  sysfs device entries:
-
-    /sys/devices/ap/card04
-    /sys/devices/ap/card0a
-
-  Symbolic links to these devices will also be created in the AP bus devices
-  sub-directory:
-
-    /sys/bus/ap/devices/[card04]
-    /sys/bus/ap/devices/[card04]
-
-* AP domain
-
-  An adapter is partitioned into domains. An adapter can hold up to 256 domains
-  depending upon the adapter type and hardware configuration. A domain is
-  identified by a number from 0 to 255; however, the maximum domain number is
-  determined by machine model and/or adapter type.. A domain can be thought of
-  as a set of hardware registers and memory used for processing AP commands. A
-  domain can be configured with a secure private key used for clear key
-  encryption. A domain is classified in one of two ways depending upon how it
-  may be accessed:
-
-    * Usage domains are domains that are targeted by an AP instruction to
-      process an AP command.
-
-    * Control domains are domains that are changed by an AP command sent to a
-      usage domain; for example, to set the secure private key for the control
-      domain.
-
-  The AP usage and control domains are assigned to a given LPAR via the system's
-  Activation Profile which can be edited via the HMC. When a linux host system
-  is IPL'd in the LPAR, the AP bus module detects the AP usage and control
-  domains assigned to the LPAR. The domain number of each usage domain and
-  adapter number of each AP adapter are combined to create AP queue devices
-  (see AP Queue section below). The domain number of each control domain will be
-  represented in a bitmask and stored in a sysfs file
-  /sys/bus/ap/ap_control_domain_mask. The bits in the mask, from most to least
-  significant bit, correspond to domains 0-255.
-
-* AP Queue
-
-  An AP queue is the means by which an AP command is sent to a usage domain
-  inside a specific adapter. An AP queue is identified by a tuple
-  comprised of an AP adapter ID (APID) and an AP queue index (APQI). The
-  APQI corresponds to a given usage domain number within the adapter. This tuple
-  forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP
-  instructions include a field containing the APQN to identify the AP queue to
-  which the AP command is to be sent for processing.
-
-  The AP bus will create a sysfs device for each APQN that can be derived from
-  the cross product of the AP adapter and usage domain numbers detected when the
-  AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage
-  domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the
-  following sysfs entries:
-
-    /sys/devices/ap/card04/04.0006
-    /sys/devices/ap/card04/04.0047
-    /sys/devices/ap/card0a/0a.0006
-    /sys/devices/ap/card0a/0a.0047
-
-  The following symbolic links to these devices will be created in the AP bus
-  devices subdirectory:
-
-    /sys/bus/ap/devices/[04.0006]
-    /sys/bus/ap/devices/[04.0047]
-    /sys/bus/ap/devices/[0a.0006]
-    /sys/bus/ap/devices/[0a.0047]
-
-* AP Instructions:
-
-  There are three AP instructions:
-
-  * NQAP: to enqueue an AP command-request message to a queue
-  * DQAP: to dequeue an AP command-reply message from a queue
-  * PQAP: to administer the queues
-
-  AP instructions identify the domain that is targeted to process the AP
-  command; this must be one of the usage domains. An AP command may modify a
-  domain that is not one of the usage domains, but the modified domain
-  must be one of the control domains.
-
-AP and SIE:
-==========
-Let's now take a look at how AP instructions executed on a guest are interpreted
-by the hardware.
-
-A satellite control block called the Crypto Control Block (CRYCB) is attached to
-our main hardware virtualization control block. The CRYCB contains three fields
-to identify the adapters, usage domains and control domains assigned to the KVM
-guest:
-
-* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned
-  to the KVM guest. Each bit in the mask, from left to right (i.e. from most
-  significant to least significant bit in big endian order), corresponds to
-  an APID from 0-255. If a bit is set, the corresponding adapter is valid for
-  use by the KVM guest.
-
-* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains
-  assigned to the KVM guest. Each bit in the mask, from left to right (i.e. from
-  most significant to least significant bit in big endian order), corresponds to
-  an AP queue index (APQI) from 0-255. If a bit is set, the corresponding queue
-  is valid for use by the KVM guest.
-
-* The AP Domain Mask field is a bit mask that identifies the AP control domains
-  assigned to the KVM guest. The ADM bit mask controls which domains can be
-  changed by an AP command-request message sent to a usage domain from the
-  guest. Each bit in the mask, from left to right (i.e. from most significant to
-  least significant bit in big endian order), corresponds to a domain from
-  0-255. If a bit is set, the corresponding domain can be modified by an AP
-  command-request message sent to a usage domain.
-
-If you recall from the description of an AP Queue, AP instructions include
-an APQN to identify the AP queue to which an AP command-request message is to be
-sent (NQAP and PQAP instructions), or from which a command-reply message is to
-be received (DQAP instruction). The validity of an APQN is defined by the matrix
-calculated from the APM and AQM; it is the cross product of all assigned adapter
-numbers (APM) with all assigned queue indexes (AQM). For example, if adapters 1
-and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs (1,5), (1,6),
-(2,5) and (2,6) will be valid for the guest.
-
-The APQNs can provide secure key functionality - i.e., a private key is stored
-on the adapter card for each of its domains - so each APQN must be assigned to
-at most one guest or to the linux host.
-
-   Example 1: Valid configuration:
-   ------------------------------
-   Guest1: adapters 1,2  domains 5,6
-   Guest2: adapter  1,2  domain 7
-
-   This is valid because both guests have a unique set of APQNs:
-      Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
-      Guest2 has APQNs (1,7), (2,7)
-
-   Example 2: Valid configuration:
-   ------------------------------
-   Guest1: adapters 1,2 domains 5,6
-   Guest2: adapters 3,4 domains 5,6
-
-   This is also valid because both guests have a unique set of APQNs:
-      Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
-      Guest2 has APQNs (3,5), (3,6), (4,5), (4,6)
-
-   Example 3: Invalid configuration:
-   --------------------------------
-   Guest1: adapters 1,2  domains 5,6
-   Guest2: adapter  1    domains 6,7
-
-   This is an invalid configuration because both guests have access to
-   APQN (1,6).
-
-The Design:
-===========
-The design introduces three new objects:
-
-1. AP matrix device
-2. VFIO AP device driver (vfio_ap.ko)
-3. VFIO AP mediated matrix pass-through device
-
-The VFIO AP device driver
--------------------------
-The VFIO AP (vfio_ap) device driver serves the following purposes:
-
-1. Provides the interfaces to secure APQNs for exclusive use of KVM guests.
-
-2. Sets up the VFIO mediated device interfaces to manage a mediated matrix
-   device and creates the sysfs interfaces for assigning adapters, usage
-   domains, and control domains comprising the matrix for a KVM guest.
-
-3. Configures the APM, AQM and ADM in the CRYCB referenced by a KVM guest's
-   SIE state description to grant the guest access to a matrix of AP devices
-
-Reserve APQNs for exclusive use of KVM guests
----------------------------------------------
-The following block diagram illustrates the mechanism by which APQNs are
-reserved:
-
-                              +------------------+
-               7 remove       |                  |
-         +--------------------> cex4queue driver |
-         |                    |                  |
-         |                    +------------------+
-         |
-         |
-         |                    +------------------+          +-----------------+
-         |  5 register driver |                  | 3 create |                 |
-         |   +---------------->   Device core    +---------->  matrix device  |
-         |   |                |                  |          |                 |
-         |   |                +--------^---------+          +-----------------+
-         |   |                         |
-         |   |                         +-------------------+
-         |   | +-----------------------------------+       |
-         |   | |      4 register AP driver         |       | 2 register device
-         |   | |                                   |       |
-+--------+---+-v---+                      +--------+-------+-+
-|                  |                      |                  |
-|      ap_bus      +--------------------- >  vfio_ap driver  |
-|                  |       8 probe        |                  |
-+--------^---------+                      +--^--^------------+
-6 edit   |                                   |  |
-  apmask |     +-----------------------------+  | 9 mdev create
-  aqmask |     |           1 modprobe           |
-+--------+-----+---+           +----------------+-+         +------------------+
-|                  |           |                  |8 create |     mediated     |
-|      admin       |           | VFIO device core |--------->     matrix       |
-|                  +           |                  |         |     device       |
-+------+-+---------+           +--------^---------+         +--------^---------+
-       | |                              |                            |
-       | | 9 create vfio_ap-passthrough |                            |
-       | +------------------------------+                            |
-       +-------------------------------------------------------------+
-                   10  assign adapter/domain/control domain
-
-The process for reserving an AP queue for use by a KVM guest is:
-
-1. The administrator loads the vfio_ap device driver
-2. The vfio-ap driver during its initialization will register a single 'matrix'
-   device with the device core. This will serve as the parent device for
-   all mediated matrix devices used to configure an AP matrix for a guest.
-3. The /sys/devices/vfio_ap/matrix device is created by the device core
-4  The vfio_ap device driver will register with the AP bus for AP queue devices
-   of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap
-   driver's probe and remove callback interfaces. Devices older than CEX4 queues
-   are not supported to simplify the implementation by not needlessly
-   complicating the design by supporting older devices that will go out of
-   service in the relatively near future, and for which there are few older
-   systems around on which to test.
-5. The AP bus registers the vfio_ap device driver with the device core
-6. The administrator edits the AP adapter and queue masks to reserve AP queues
-   for use by the vfio_ap device driver.
-7. The AP bus removes the AP queues reserved for the vfio_ap driver from the
-   default zcrypt cex4queue driver.
-8. The AP bus probes the vfio_ap device driver to bind the queues reserved for
-   it.
-9. The administrator creates a passthrough type mediated matrix device to be
-   used by a guest
-10 The administrator assigns the adapters, usage domains and control domains
-   to be exclusively used by a guest.
-
-Set up the VFIO mediated device interfaces
-------------------------------------------
-The VFIO AP device driver utilizes the common interface of the VFIO mediated
-device core driver to:
-* Register an AP mediated bus driver to add a mediated matrix device to and
-  remove it from a VFIO group.
-* Create and destroy a mediated matrix device
-* Add a mediated matrix device to and remove it from the AP mediated bus driver
-* Add a mediated matrix device to and remove it from an IOMMU group
-
-The following high-level block diagram shows the main components and interfaces
-of the VFIO AP mediated matrix device driver:
-
- +-------------+
- |             |
- | +---------+ | mdev_register_driver() +--------------+
- | |  Mdev   | +<-----------------------+              |
- | |  bus    | |                        | vfio_mdev.ko |
- | | driver  | +----------------------->+              |<-> VFIO user
- | +---------+ |    probe()/remove()    +--------------+    APIs
- |             |
- |  MDEV CORE  |
- |   MODULE    |
- |   mdev.ko   |
- | +---------+ | mdev_register_device() +--------------+
- | |Physical | +<-----------------------+              |
- | | device  | |                        |  vfio_ap.ko  |<-> matrix
- | |interface| +----------------------->+              |    device
- | +---------+ |       callback         +--------------+
- +-------------+
-
-During initialization of the vfio_ap module, the matrix device is registered
-with an 'mdev_parent_ops' structure that provides the sysfs attribute
-structures, mdev functions and callback interfaces for managing the mediated
-matrix device.
-
-* sysfs attribute structures:
-  * supported_type_groups
-    The VFIO mediated device framework supports creation of user-defined
-    mediated device types. These mediated device types are specified
-    via the 'supported_type_groups' structure when a device is registered
-    with the mediated device framework. The registration process creates the
-    sysfs structures for each mediated device type specified in the
-    'mdev_supported_types' sub-directory of the device being registered. Along
-    with the device type, the sysfs attributes of the mediated device type are
-    provided.
-
-    The VFIO AP device driver will register one mediated device type for
-    passthrough devices:
-      /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough
-    Only the read-only attributes required by the VFIO mdev framework will
-    be provided:
-        ... name
-        ... device_api
-        ... available_instances
-        ... device_api
-        Where:
-        * name: specifies the name of the mediated device type
-        * device_api: the mediated device type's API
-        * available_instances: the number of mediated matrix passthrough devices
-                               that can be created
-        * device_api: specifies the VFIO API
-  * mdev_attr_groups
-    This attribute group identifies the user-defined sysfs attributes of the
-    mediated device. When a device is registered with the VFIO mediated device
-    framework, the sysfs attribute files identified in the 'mdev_attr_groups'
-    structure will be created in the mediated matrix device's directory. The
-    sysfs attributes for a mediated matrix device are:
-    * assign_adapter:
-    * unassign_adapter:
-      Write-only attributes for assigning/unassigning an AP adapter to/from the
-      mediated matrix device. To assign/unassign an adapter, the APID of the
-      adapter is echoed to the respective attribute file.
-    * assign_domain:
-    * unassign_domain:
-      Write-only attributes for assigning/unassigning an AP usage domain to/from
-      the mediated matrix device. To assign/unassign a domain, the domain
-      number of the the usage domain is echoed to the respective attribute
-      file.
-    * matrix:
-      A read-only file for displaying the APQNs derived from the cross product
-      of the adapter and domain numbers assigned to the mediated matrix device.
-    * assign_control_domain:
-    * unassign_control_domain:
-      Write-only attributes for assigning/unassigning an AP control domain
-      to/from the mediated matrix device. To assign/unassign a control domain,
-      the ID of the domain to be assigned/unassigned is echoed to the respective
-      attribute file.
-    * control_domains:
-      A read-only file for displaying the control domain numbers assigned to the
-      mediated matrix device.
-
-* functions:
-  * create:
-    allocates the ap_matrix_mdev structure used by the vfio_ap driver to:
-    * Store the reference to the KVM structure for the guest using the mdev
-    * Store the AP matrix configuration for the adapters, domains, and control
-      domains assigned via the corresponding sysfs attributes files
-  * remove:
-    deallocates the mediated matrix device's ap_matrix_mdev structure. This will
-    be allowed only if a running guest is not using the mdev.
-
-* callback interfaces
-  * open:
-    The vfio_ap driver uses this callback to register a
-    VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix
-    device. The open is invoked when QEMU connects the VFIO iommu group
-    for the mdev matrix device to the MDEV bus. Access to the KVM structure used
-    to configure the KVM guest is provided via this callback. The KVM structure,
-    is used to configure the guest's access to the AP matrix defined via the
-    mediated matrix device's sysfs attribute files.
-  * release:
-    unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the
-    mdev matrix device and deconfigures the guest's AP matrix.
-
-Configure the APM, AQM and ADM in the CRYCB:
--------------------------------------------
-Configuring the AP matrix for a KVM guest will be performed when the
-VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier
-function is called when QEMU connects to KVM. The guest's AP matrix is
-configured via it's CRYCB by:
-* Setting the bits in the APM corresponding to the APIDs assigned to the
-  mediated matrix device via its 'assign_adapter' interface.
-* Setting the bits in the AQM corresponding to the domains assigned to the
-  mediated matrix device via its 'assign_domain' interface.
-* Setting the bits in the ADM corresponding to the domain dIDs assigned to the
-  mediated matrix device via its 'assign_control_domains' interface.
-
-The CPU model features for AP
------------------------------
-The AP stack relies on the presence of the AP instructions as well as two
-facilities: The AP Facilities Test (APFT) facility; and the AP Query
-Configuration Information (QCI) facility. These features/facilities are made
-available to a KVM guest via the following CPU model features:
-
-1. ap: Indicates whether the AP instructions are installed on the guest. This
-   feature will be enabled by KVM only if the AP instructions are installed
-   on the host.
-
-2. apft: Indicates the APFT facility is available on the guest. This facility
-   can be made available to the guest only if it is available on the host (i.e.,
-   facility bit 15 is set).
-
-3. apqci: Indicates the AP QCI facility is available on the guest. This facility
-   can be made available to the guest only if it is available on the host (i.e.,
-   facility bit 12 is set).
-
-Note: If the user chooses to specify a CPU model different than the 'host'
-model to QEMU, the CPU model features and facilities need to be turned on
-explicitly; for example:
-
-     /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on
-
-A guest can be precluded from using AP features/facilities by turning them off
-explicitly; for example:
-
-     /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off
-
-Note: If the APFT facility is turned off (apft=off) for the guest, the guest
-will not see any AP devices. The zcrypt device drivers that register for type 10
-and newer AP devices - i.e., the cex4card and cex4queue device drivers - need
-the APFT facility to ascertain the facilities installed on a given AP device. If
-the APFT facility is not installed on the guest, then the probe of device
-drivers will fail since only type 10 and newer devices can be configured for
-guest use.
-
-Example:
-=======
-Let's now provide an example to illustrate how KVM guests may be given
-access to AP facilities. For this example, we will show how to configure
-three guests such that executing the lszcrypt command on the guests would
-look like this:
-
-Guest1
-------
-CARD.DOMAIN TYPE  MODE
-------------------------------
-05          CEX5C CCA-Coproc
-05.0004     CEX5C CCA-Coproc
-05.00ab     CEX5C CCA-Coproc
-06          CEX5A Accelerator
-06.0004     CEX5A Accelerator
-06.00ab     CEX5C CCA-Coproc
-
-Guest2
-------
-CARD.DOMAIN TYPE  MODE
-------------------------------
-05          CEX5A Accelerator
-05.0047     CEX5A Accelerator
-05.00ff     CEX5A Accelerator
-
-Guest2
-------
-CARD.DOMAIN TYPE  MODE
-------------------------------
-06          CEX5A Accelerator
-06.0047     CEX5A Accelerator
-06.00ff     CEX5A Accelerator
-
-These are the steps:
-
-1. Install the vfio_ap module on the linux host. The dependency chain for the
-   vfio_ap module is:
-   * iommu
-   * s390
-   * zcrypt
-   * vfio
-   * vfio_mdev
-   * vfio_mdev_device
-   * KVM
-
-   To build the vfio_ap module, the kernel build must be configured with the
-   following Kconfig elements selected:
-   * IOMMU_SUPPORT
-   * S390
-   * ZCRYPT
-   * S390_AP_IOMMU
-   * VFIO
-   * VFIO_MDEV
-   * VFIO_MDEV_DEVICE
-   * KVM
-
-   If using make menuconfig select the following to build the vfio_ap module:
-   -> Device Drivers
-      -> IOMMU Hardware Support
-         select S390 AP IOMMU Support
-      -> VFIO Non-Privileged userspace driver framework
-         -> Mediated device driver frramework
-            -> VFIO driver for Mediated devices
-   -> I/O subsystem
-      -> VFIO support for AP devices
-
-2. Secure the AP queues to be used by the three guests so that the host can not
-   access them. To secure them, there are two sysfs files that specify
-   bitmasks marking a subset of the APQN range as 'usable by the default AP
-   queue device drivers' or 'not usable by the default device drivers' and thus
-   available for use by the vfio_ap device driver'. The location of the sysfs
-   files containing the masks are:
-
-   /sys/bus/ap/apmask
-   /sys/bus/ap/aqmask
-
-   The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs
-   (APID). Each bit in the mask, from left to right (i.e., from most significant
-   to least significant bit in big endian order), corresponds to an APID from
-   0-255. If a bit is set, the APID is marked as usable only by the default AP
-   queue device drivers; otherwise, the APID is usable by the vfio_ap
-   device driver.
-
-   The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes
-   (APQI). Each bit in the mask, from left to right (i.e., from most significant
-   to least significant bit in big endian order), corresponds to an APQI from
-   0-255. If a bit is set, the APQI is marked as usable only by the default AP
-   queue device drivers; otherwise, the APQI is usable by the vfio_ap device
-   driver.
-
-   Take, for example, the following mask:
-
-      0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
-
-    It indicates:
-
-      1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6
-      belong to the vfio_ap device driver's pool.
-
-   The APQN of each AP queue device assigned to the linux host is checked by the
-   AP bus against the set of APQNs derived from the cross product of APIDs
-   and APQIs marked as usable only by the default AP queue device drivers. If a
-   match is detected,  only the default AP queue device drivers will be probed;
-   otherwise, the vfio_ap device driver will be probed.
-
-   By default, the two masks are set to reserve all APQNs for use by the default
-   AP queue device drivers. There are two ways the default masks can be changed:
-
-   1. The sysfs mask files can be edited by echoing a string into the
-      respective sysfs mask file in one of two formats:
-
-      * An absolute hex string starting with 0x - like "0x12345678" - sets
-        the mask. If the given string is shorter than the mask, it is padded
-        with 0s on the right; for example, specifying a mask value of 0x41 is
-        the same as specifying:
-
-           0x4100000000000000000000000000000000000000000000000000000000000000
-
-        Keep in mind that the mask reads from left to right (i.e., most
-        significant to least significant bit in big endian order), so the mask
-        above identifies device numbers 1 and 7 (01000001).
-
-        If the string is longer than the mask, the operation is terminated with
-        an error (EINVAL).
-
-      * Individual bits in the mask can be switched on and off by specifying
-        each bit number to be switched in a comma separated list. Each bit
-        number string must be prepended with a ('+') or minus ('-') to indicate
-        the corresponding bit is to be switched on ('+') or off ('-'). Some
-        valid values are:
-
-           "+0"    switches bit 0 on
-           "-13"   switches bit 13 off
-           "+0x41" switches bit 65 on
-           "-0xff" switches bit 255 off
-
-           The following example:
-              +0,-6,+0x47,-0xf0
-
-              Switches bits 0 and 71 (0x47) on
-              Switches bits 6 and 240 (0xf0) off
-
-        Note that the bits not specified in the list remain as they were before
-        the operation.
-
-   2. The masks can also be changed at boot time via parameters on the kernel
-      command line like this:
-
-         ap.apmask=0xffff ap.aqmask=0x40
-
-         This would create the following masks:
-
-            apmask:
-            0xffff000000000000000000000000000000000000000000000000000000000000
-
-            aqmask:
-            0x4000000000000000000000000000000000000000000000000000000000000000
-
-         Resulting in these two pools:
-
-            default drivers pool:    adapter 0-15, domain 1
-            alternate drivers pool:  adapter 16-255, domains 0, 2-255
-
-   Securing the APQNs for our example:
-   ----------------------------------
-   To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047,
-   06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding
-   APQNs can either be removed from the default masks:
-
-      echo -5,-6 > /sys/bus/ap/apmask
-
-      echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask
-
-   Or the masks can be set as follows:
-
-      echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \
-      > apmask
-
-      echo 0xf7fffffffffffffffeffffffffffffffffffffffffeffffffffffffffffffffe \
-      > aqmask
-
-   This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004,
-   06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The
-   sysfs directory for the vfio_ap device driver will now contain symbolic links
-   to the AP queue devices bound to it:
-
-   /sys/bus/ap
-   ... [drivers]
-   ...... [vfio_ap]
-   ......... [05.0004]
-   ......... [05.0047]
-   ......... [05.00ab]
-   ......... [05.00ff]
-   ......... [06.0004]
-   ......... [06.0047]
-   ......... [06.00ab]
-   ......... [06.00ff]
-
-   Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later)
-   can be bound to the vfio_ap device driver. The reason for this is to
-   simplify the implementation by not needlessly complicating the design by
-   supporting older devices that will go out of service in the relatively near
-   future and for which there are few older systems on which to test.
-
-   The administrator, therefore, must take care to secure only AP queues that
-   can be bound to the vfio_ap device driver. The device type for a given AP
-   queue device can be read from the parent card's sysfs directory. For example,
-   to see the hardware type of the queue 05.0004:
-
-   cat /sys/bus/ap/devices/card05/hwtype
-
-   The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the
-   vfio_ap device driver.
-
-3. Create the mediated devices needed to configure the AP matrixes for the
-   three guests and to provide an interface to the vfio_ap driver for
-   use by the guests:
-
-   /sys/devices/vfio_ap/matrix/
-   --- [mdev_supported_types]
-   ------ [vfio_ap-passthrough] (passthrough mediated matrix device type)
-   --------- create
-   --------- [devices]
-
-   To create the mediated devices for the three guests:
-
-	uuidgen > create
-	uuidgen > create
-	uuidgen > create
-
-        or
-
-        echo $uuid1 > create
-        echo $uuid2 > create
-        echo $uuid3 > create
-
-   This will create three mediated devices in the [devices] subdirectory named
-   after the UUID written to the create attribute file. We call them $uuid1,
-   $uuid2 and $uuid3 and this is the sysfs directory structure after creation:
-
-   /sys/devices/vfio_ap/matrix/
-   --- [mdev_supported_types]
-   ------ [vfio_ap-passthrough]
-   --------- [devices]
-   ------------ [$uuid1]
-   --------------- assign_adapter
-   --------------- assign_control_domain
-   --------------- assign_domain
-   --------------- matrix
-   --------------- unassign_adapter
-   --------------- unassign_control_domain
-   --------------- unassign_domain
-
-   ------------ [$uuid2]
-   --------------- assign_adapter
-   --------------- assign_control_domain
-   --------------- assign_domain
-   --------------- matrix
-   --------------- unassign_adapter
-   ----------------unassign_control_domain
-   ----------------unassign_domain
-
-   ------------ [$uuid3]
-   --------------- assign_adapter
-   --------------- assign_control_domain
-   --------------- assign_domain
-   --------------- matrix
-   --------------- unassign_adapter
-   ----------------unassign_control_domain
-   ----------------unassign_domain
-
-4. The administrator now needs to configure the matrixes for the mediated
-   devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3).
-
-   This is how the matrix is configured for Guest1:
-
-      echo 5 > assign_adapter
-      echo 6 > assign_adapter
-      echo 4 > assign_domain
-      echo 0xab > assign_domain
-
-      Control domains can similarly be assigned using the assign_control_domain
-      sysfs file.
-
-      If a mistake is made configuring an adapter, domain or control domain,
-      you can use the unassign_xxx files to unassign the adapter, domain or
-      control domain.
-
-      To display the matrix configuration for Guest1:
-
-         cat matrix
-
-   This is how the matrix is configured for Guest2:
-
-      echo 5 > assign_adapter
-      echo 0x47 > assign_domain
-      echo 0xff > assign_domain
-
-   This is how the matrix is configured for Guest3:
-
-      echo 6 > assign_adapter
-      echo 0x47 > assign_domain
-      echo 0xff > assign_domain
-
-   In order to successfully assign an adapter:
-
-   * The adapter number specified must represent a value from 0 up to the
-     maximum adapter number configured for the system. If an adapter number
-     higher than the maximum is specified, the operation will terminate with
-     an error (ENODEV).
-
-   * All APQNs that can be derived from the adapter ID and the IDs of
-     the previously assigned domains must be bound to the vfio_ap device
-     driver. If no domains have yet been assigned, then there must be at least
-     one APQN with the specified APID bound to the vfio_ap driver. If no such
-     APQNs are bound to the driver, the operation will terminate with an
-     error (EADDRNOTAVAIL).
-
-     No APQN that can be derived from the adapter ID and the IDs of the
-     previously assigned domains can be assigned to another mediated matrix
-     device. If an APQN is assigned to another mediated matrix device, the
-     operation will terminate with an error (EADDRINUSE).
-
-   In order to successfully assign a domain:
-
-   * The domain number specified must represent a value from 0 up to the
-     maximum domain number configured for the system. If a domain number
-     higher than the maximum is specified, the operation will terminate with
-     an error (ENODEV).
-
-   * All APQNs that can be derived from the domain ID and the IDs of
-     the previously assigned adapters must be bound to the vfio_ap device
-     driver. If no domains have yet been assigned, then there must be at least
-     one APQN with the specified APQI bound to the vfio_ap driver. If no such
-     APQNs are bound to the driver, the operation will terminate with an
-     error (EADDRNOTAVAIL).
-
-     No APQN that can be derived from the domain ID and the IDs of the
-     previously assigned adapters can be assigned to another mediated matrix
-     device. If an APQN is assigned to another mediated matrix device, the
-     operation will terminate with an error (EADDRINUSE).
-
-   In order to successfully assign a control domain, the domain number
-   specified must represent a value from 0 up to the maximum domain number
-   configured for the system. If a control domain number higher than the maximum
-   is specified, the operation will terminate with an error (ENODEV).
-
-5. Start Guest1:
-
-   /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
-      -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ...
-
-7. Start Guest2:
-
-   /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
-      -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ...
-
-7. Start Guest3:
-
-   /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
-      -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ...
-
-When the guest is shut down, the mediated matrix devices may be removed.
-
-Using our example again, to remove the mediated matrix device $uuid1:
-
-   /sys/devices/vfio_ap/matrix/
-      --- [mdev_supported_types]
-      ------ [vfio_ap-passthrough]
-      --------- [devices]
-      ------------ [$uuid1]
-      --------------- remove
-
-
-   echo 1 > remove
-
-   This will remove all of the mdev matrix device's sysfs structures including
-   the mdev device itself. To recreate and reconfigure the mdev matrix device,
-   all of the steps starting with step 3 will have to be performed again. Note
-   that the remove will fail if a guest using the mdev is still running.
-
-   It is not necessary to remove an mdev matrix device, but one may want to
-   remove it if no guest will use it during the remaining lifetime of the linux
-   host. If the mdev matrix device is removed, one may want to also reconfigure
-   the pool of adapters and queues reserved for use by the default drivers.
-
-Limitations
-===========
-* The KVM/kernel interfaces do not provide a way to prevent restoring an APQN
-  to the default drivers pool of a queue that is still assigned to a mediated
-  device in use by a guest. It is incumbent upon the administrator to
-  ensure there is no mediated device in use by a guest to which the APQN is
-  assigned lest the host be given access to the private data of the AP queue
-  device such as a private key configured specifically for the guest.
-
-* Dynamically modifying the AP matrix for a running guest (which would amount to
-  hot(un)plug of AP devices for the guest) is currently not supported
-
-* Live guest migration is not supported for guests using AP devices.
diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst
new file mode 100644
index 000000000000..1e210c6afa88
--- /dev/null
+++ b/Documentation/s390/vfio-ccw.rst
@@ -0,0 +1,326 @@
+==================================
+vfio-ccw: the basic infrastructure
+==================================
+
+Introduction
+------------
+
+Here we describe the vfio support for I/O subchannel devices for
+Linux/s390. Motivation for vfio-ccw is to passthrough subchannels to a
+virtual machine, while vfio is the means.
+
+Different than other hardware architectures, s390 has defined a unified
+I/O access method, which is so called Channel I/O. It has its own access
+patterns:
+
+- Channel programs run asynchronously on a separate (co)processor.
+- The channel subsystem will access any memory designated by the caller
+  in the channel program directly, i.e. there is no iommu involved.
+
+Thus when we introduce vfio support for these devices, we realize it
+with a mediated device (mdev) implementation. The vfio mdev will be
+added to an iommu group, so as to make itself able to be managed by the
+vfio framework. And we add read/write callbacks for special vfio I/O
+regions to pass the channel programs from the mdev to its parent device
+(the real I/O subchannel device) to do further address translation and
+to perform I/O instructions.
+
+This document does not intend to explain the s390 I/O architecture in
+every detail. More information/reference could be found here:
+
+- A good start to know Channel I/O in general:
+  https://en.wikipedia.org/wiki/Channel_I/O
+- s390 architecture:
+  s390 Principles of Operation manual (IBM Form. No. SA22-7832)
+- The existing QEMU code which implements a simple emulated channel
+  subsystem could also be a good reference. It makes it easier to follow
+  the flow.
+  qemu/hw/s390x/css.c
+
+For vfio mediated device framework:
+- Documentation/driver-api/vfio-mediated-device.rst
+
+Motivation of vfio-ccw
+----------------------
+
+Typically, a guest virtualized via QEMU/KVM on s390 only sees
+paravirtualized virtio devices via the "Virtio Over Channel I/O
+(virtio-ccw)" transport. This makes virtio devices discoverable via
+standard operating system algorithms for handling channel devices.
+
+However this is not enough. On s390 for the majority of devices, which
+use the standard Channel I/O based mechanism, we also need to provide
+the functionality of passing through them to a QEMU virtual machine.
+This includes devices that don't have a virtio counterpart (e.g. tape
+drives) or that have specific characteristics which guests want to
+exploit.
+
+For passing a device to a guest, we want to use the same interface as
+everybody else, namely vfio. We implement this vfio support for channel
+devices via the vfio mediated device framework and the subchannel device
+driver "vfio_ccw".
+
+Access patterns of CCW devices
+------------------------------
+
+s390 architecture has implemented a so called channel subsystem, that
+provides a unified view of the devices physically attached to the
+systems. Though the s390 hardware platform knows about a huge variety of
+different peripheral attachments like disk devices (aka. DASDs), tapes,
+communication controllers, etc. They can all be accessed by a well
+defined access method and they are presenting I/O completion a unified
+way: I/O interruptions.
+
+All I/O requires the use of channel command words (CCWs). A CCW is an
+instruction to a specialized I/O channel processor. A channel program is
+a sequence of CCWs which are executed by the I/O channel subsystem.  To
+issue a channel program to the channel subsystem, it is required to
+build an operation request block (ORB), which can be used to point out
+the format of the CCW and other control information to the system. The
+operating system signals the I/O channel subsystem to begin executing
+the channel program with a SSCH (start sub-channel) instruction. The
+central processor is then free to proceed with non-I/O instructions
+until interrupted. The I/O completion result is received by the
+interrupt handler in the form of interrupt response block (IRB).
+
+Back to vfio-ccw, in short:
+
+- ORBs and channel programs are built in guest kernel (with guest
+  physical addresses).
+- ORBs and channel programs are passed to the host kernel.
+- Host kernel translates the guest physical addresses to real addresses
+  and starts the I/O with issuing a privileged Channel I/O instruction
+  (e.g SSCH).
+- channel programs run asynchronously on a separate processor.
+- I/O completion will be signaled to the host with I/O interruptions.
+  And it will be copied as IRB to user space to pass it back to the
+  guest.
+
+Physical vfio ccw device and its child mdev
+-------------------------------------------
+
+As mentioned above, we realize vfio-ccw with a mdev implementation.
+
+Channel I/O does not have IOMMU hardware support, so the physical
+vfio-ccw device does not have an IOMMU level translation or isolation.
+
+Subchannel I/O instructions are all privileged instructions. When
+handling the I/O instruction interception, vfio-ccw has the software
+policing and translation how the channel program is programmed before
+it gets sent to hardware.
+
+Within this implementation, we have two drivers for two types of
+devices:
+
+- The vfio_ccw driver for the physical subchannel device.
+  This is an I/O subchannel driver for the real subchannel device.  It
+  realizes a group of callbacks and registers to the mdev framework as a
+  parent (physical) device. As a consequence, mdev provides vfio_ccw a
+  generic interface (sysfs) to create mdev devices. A vfio mdev could be
+  created by vfio_ccw then and added to the mediated bus. It is the vfio
+  device that added to an IOMMU group and a vfio group.
+  vfio_ccw also provides an I/O region to accept channel program
+  request from user space and store I/O interrupt result for user
+  space to retrieve. To notify user space an I/O completion, it offers
+  an interface to setup an eventfd fd for asynchronous signaling.
+
+- The vfio_mdev driver for the mediated vfio ccw device.
+  This is provided by the mdev framework. It is a vfio device driver for
+  the mdev that created by vfio_ccw.
+  It realizes a group of vfio device driver callbacks, adds itself to a
+  vfio group, and registers itself to the mdev framework as a mdev
+  driver.
+  It uses a vfio iommu backend that uses the existing map and unmap
+  ioctls, but rather than programming them into an IOMMU for a device,
+  it simply stores the translations for use by later requests. This
+  means that a device programmed in a VM with guest physical addresses
+  can have the vfio kernel convert that address to process virtual
+  address, pin the page and program the hardware with the host physical
+  address in one step.
+  For a mdev, the vfio iommu backend will not pin the pages during the
+  VFIO_IOMMU_MAP_DMA ioctl. Mdev framework will only maintain a database
+  of the iova<->vaddr mappings in this operation. And they export a
+  vfio_pin_pages and a vfio_unpin_pages interfaces from the vfio iommu
+  backend for the physical devices to pin and unpin pages by demand.
+
+Below is a high Level block diagram::
+
+ +-------------+
+ |             |
+ | +---------+ | mdev_register_driver() +--------------+
+ | |  Mdev   | +<-----------------------+              |
+ | |  bus    | |                        | vfio_mdev.ko |
+ | | driver  | +----------------------->+              |<-> VFIO user
+ | +---------+ |    probe()/remove()    +--------------+    APIs
+ |             |
+ |  MDEV CORE  |
+ |   MODULE    |
+ |   mdev.ko   |
+ | +---------+ | mdev_register_device() +--------------+
+ | |Physical | +<-----------------------+              |
+ | | device  | |                        |  vfio_ccw.ko |<-> subchannel
+ | |interface| +----------------------->+              |     device
+ | +---------+ |       callback         +--------------+
+ +-------------+
+
+The process of how these work together.
+
+1. vfio_ccw.ko drives the physical I/O subchannel, and registers the
+   physical device (with callbacks) to mdev framework.
+   When vfio_ccw probing the subchannel device, it registers device
+   pointer and callbacks to the mdev framework. Mdev related file nodes
+   under the device node in sysfs would be created for the subchannel
+   device, namely 'mdev_create', 'mdev_destroy' and
+   'mdev_supported_types'.
+2. Create a mediated vfio ccw device.
+   Use the 'mdev_create' sysfs file, we need to manually create one (and
+   only one for our case) mediated device.
+3. vfio_mdev.ko drives the mediated ccw device.
+   vfio_mdev is also the vfio device drvier. It will probe the mdev and
+   add it to an iommu_group and a vfio_group. Then we could pass through
+   the mdev to a guest.
+
+vfio-ccw I/O region
+-------------------
+
+An I/O region is used to accept channel program request from user
+space and store I/O interrupt result for user space to retrieve. The
+definition of the region is::
+
+  struct ccw_io_region {
+  #define ORB_AREA_SIZE 12
+	  __u8    orb_area[ORB_AREA_SIZE];
+  #define SCSW_AREA_SIZE 12
+	  __u8    scsw_area[SCSW_AREA_SIZE];
+  #define IRB_AREA_SIZE 96
+	  __u8    irb_area[IRB_AREA_SIZE];
+	  __u32   ret_code;
+  } __packed;
+
+While starting an I/O request, orb_area should be filled with the
+guest ORB, and scsw_area should be filled with the SCSW of the Virtual
+Subchannel.
+
+irb_area stores the I/O result.
+
+ret_code stores a return code for each access of the region.
+
+vfio-ccw operation details
+--------------------------
+
+vfio-ccw follows what vfio-pci did on the s390 platform and uses
+vfio-iommu-type1 as the vfio iommu backend.
+
+* CCW translation APIs
+  A group of APIs (start with `cp_`) to do CCW translation. The CCWs
+  passed in by a user space program are organized with their guest
+  physical memory addresses. These APIs will copy the CCWs into kernel
+  space, and assemble a runnable kernel channel program by updating the
+  guest physical addresses with their corresponding host physical addresses.
+  Note that we have to use IDALs even for direct-access CCWs, as the
+  referenced memory can be located anywhere, including above 2G.
+
+* vfio_ccw device driver
+  This driver utilizes the CCW translation APIs and introduces
+  vfio_ccw, which is the driver for the I/O subchannel devices you want
+  to pass through.
+  vfio_ccw implements the following vfio ioctls::
+
+    VFIO_DEVICE_GET_INFO
+    VFIO_DEVICE_GET_IRQ_INFO
+    VFIO_DEVICE_GET_REGION_INFO
+    VFIO_DEVICE_RESET
+    VFIO_DEVICE_SET_IRQS
+
+  This provides an I/O region, so that the user space program can pass a
+  channel program to the kernel, to do further CCW translation before
+  issuing them to a real device.
+  This also provides the SET_IRQ ioctl to setup an event notifier to
+  notify the user space program the I/O completion in an asynchronous
+  way.
+
+The use of vfio-ccw is not limited to QEMU, while QEMU is definitely a
+good example to get understand how these patches work. Here is a little
+bit more detail how an I/O request triggered by the QEMU guest will be
+handled (without error handling).
+
+Explanation:
+
+- Q1-Q7: QEMU side process.
+- K1-K5: Kernel side process.
+
+Q1.
+    Get I/O region info during initialization.
+
+Q2.
+    Setup event notifier and handler to handle I/O completion.
+
+... ...
+
+Q3.
+    Intercept a ssch instruction.
+Q4.
+    Write the guest channel program and ORB to the I/O region.
+
+    K1.
+	Copy from guest to kernel.
+    K2.
+	Translate the guest channel program to a host kernel space
+	channel program, which becomes runnable for a real device.
+    K3.
+	With the necessary information contained in the orb passed in
+	by QEMU, issue the ccwchain to the device.
+    K4.
+	Return the ssch CC code.
+Q5.
+    Return the CC code to the guest.
+
+... ...
+
+    K5.
+	Interrupt handler gets the I/O result and write the result to
+	the I/O region.
+    K6.
+	Signal QEMU to retrieve the result.
+
+Q6.
+    Get the signal and event handler reads out the result from the I/O
+    region.
+Q7.
+    Update the irb for the guest.
+
+Limitations
+-----------
+
+The current vfio-ccw implementation focuses on supporting basic commands
+needed to implement block device functionality (read/write) of DASD/ECKD
+device only. Some commands may need special handling in the future, for
+example, anything related to path grouping.
+
+DASD is a kind of storage device. While ECKD is a data recording format.
+More information for DASD and ECKD could be found here:
+https://en.wikipedia.org/wiki/Direct-access_storage_device
+https://en.wikipedia.org/wiki/Count_key_data
+
+Together with the corresponding work in QEMU, we can bring the passed
+through DASD/ECKD device online in a guest now and use it as a block
+device.
+
+While the current code allows the guest to start channel programs via
+START SUBCHANNEL, support for HALT SUBCHANNEL or CLEAR SUBCHANNEL is
+not yet implemented.
+
+vfio-ccw supports classic (command mode) channel I/O only. Transport
+mode (HPF) is not supported.
+
+QDIO subchannels are currently not supported. Classic devices other than
+DASD/ECKD might work, but have not been tested.
+
+Reference
+---------
+1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832)
+2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204)
+3. https://en.wikipedia.org/wiki/Channel_I/O
+4. Documentation/s390/cds.rst
+5. Documentation/driver-api/vfio.rst
+6. Documentation/driver-api/vfio-mediated-device.rst
diff --git a/Documentation/s390/vfio-ccw.txt b/Documentation/s390/vfio-ccw.txt
deleted file mode 100644
index 2be11ad864ff..000000000000
--- a/Documentation/s390/vfio-ccw.txt
+++ /dev/null
@@ -1,300 +0,0 @@
-vfio-ccw: the basic infrastructure
-==================================
-
-Introduction
-------------
-
-Here we describe the vfio support for I/O subchannel devices for
-Linux/s390. Motivation for vfio-ccw is to passthrough subchannels to a
-virtual machine, while vfio is the means.
-
-Different than other hardware architectures, s390 has defined a unified
-I/O access method, which is so called Channel I/O. It has its own access
-patterns:
-- Channel programs run asynchronously on a separate (co)processor.
-- The channel subsystem will access any memory designated by the caller
-  in the channel program directly, i.e. there is no iommu involved.
-Thus when we introduce vfio support for these devices, we realize it
-with a mediated device (mdev) implementation. The vfio mdev will be
-added to an iommu group, so as to make itself able to be managed by the
-vfio framework. And we add read/write callbacks for special vfio I/O
-regions to pass the channel programs from the mdev to its parent device
-(the real I/O subchannel device) to do further address translation and
-to perform I/O instructions.
-
-This document does not intend to explain the s390 I/O architecture in
-every detail. More information/reference could be found here:
-- A good start to know Channel I/O in general:
-  https://en.wikipedia.org/wiki/Channel_I/O
-- s390 architecture:
-  s390 Principles of Operation manual (IBM Form. No. SA22-7832)
-- The existing QEMU code which implements a simple emulated channel
-  subsystem could also be a good reference. It makes it easier to follow
-  the flow.
-  qemu/hw/s390x/css.c
-
-For vfio mediated device framework:
-- Documentation/vfio-mediated-device.txt
-
-Motivation of vfio-ccw
-----------------------
-
-Typically, a guest virtualized via QEMU/KVM on s390 only sees
-paravirtualized virtio devices via the "Virtio Over Channel I/O
-(virtio-ccw)" transport. This makes virtio devices discoverable via
-standard operating system algorithms for handling channel devices.
-
-However this is not enough. On s390 for the majority of devices, which
-use the standard Channel I/O based mechanism, we also need to provide
-the functionality of passing through them to a QEMU virtual machine.
-This includes devices that don't have a virtio counterpart (e.g. tape
-drives) or that have specific characteristics which guests want to
-exploit.
-
-For passing a device to a guest, we want to use the same interface as
-everybody else, namely vfio. We implement this vfio support for channel
-devices via the vfio mediated device framework and the subchannel device
-driver "vfio_ccw".
-
-Access patterns of CCW devices
-------------------------------
-
-s390 architecture has implemented a so called channel subsystem, that
-provides a unified view of the devices physically attached to the
-systems. Though the s390 hardware platform knows about a huge variety of
-different peripheral attachments like disk devices (aka. DASDs), tapes,
-communication controllers, etc. They can all be accessed by a well
-defined access method and they are presenting I/O completion a unified
-way: I/O interruptions.
-
-All I/O requires the use of channel command words (CCWs). A CCW is an
-instruction to a specialized I/O channel processor. A channel program is
-a sequence of CCWs which are executed by the I/O channel subsystem.  To
-issue a channel program to the channel subsystem, it is required to
-build an operation request block (ORB), which can be used to point out
-the format of the CCW and other control information to the system. The
-operating system signals the I/O channel subsystem to begin executing
-the channel program with a SSCH (start sub-channel) instruction. The
-central processor is then free to proceed with non-I/O instructions
-until interrupted. The I/O completion result is received by the
-interrupt handler in the form of interrupt response block (IRB).
-
-Back to vfio-ccw, in short:
-- ORBs and channel programs are built in guest kernel (with guest
-  physical addresses).
-- ORBs and channel programs are passed to the host kernel.
-- Host kernel translates the guest physical addresses to real addresses
-  and starts the I/O with issuing a privileged Channel I/O instruction
-  (e.g SSCH).
-- channel programs run asynchronously on a separate processor.
-- I/O completion will be signaled to the host with I/O interruptions.
-  And it will be copied as IRB to user space to pass it back to the
-  guest.
-
-Physical vfio ccw device and its child mdev
--------------------------------------------
-
-As mentioned above, we realize vfio-ccw with a mdev implementation.
-
-Channel I/O does not have IOMMU hardware support, so the physical
-vfio-ccw device does not have an IOMMU level translation or isolation.
-
-Subchannel I/O instructions are all privileged instructions. When
-handling the I/O instruction interception, vfio-ccw has the software
-policing and translation how the channel program is programmed before
-it gets sent to hardware.
-
-Within this implementation, we have two drivers for two types of
-devices:
-- The vfio_ccw driver for the physical subchannel device.
-  This is an I/O subchannel driver for the real subchannel device.  It
-  realizes a group of callbacks and registers to the mdev framework as a
-  parent (physical) device. As a consequence, mdev provides vfio_ccw a
-  generic interface (sysfs) to create mdev devices. A vfio mdev could be
-  created by vfio_ccw then and added to the mediated bus. It is the vfio
-  device that added to an IOMMU group and a vfio group.
-  vfio_ccw also provides an I/O region to accept channel program
-  request from user space and store I/O interrupt result for user
-  space to retrieve. To notify user space an I/O completion, it offers
-  an interface to setup an eventfd fd for asynchronous signaling.
-
-- The vfio_mdev driver for the mediated vfio ccw device.
-  This is provided by the mdev framework. It is a vfio device driver for
-  the mdev that created by vfio_ccw.
-  It realizes a group of vfio device driver callbacks, adds itself to a
-  vfio group, and registers itself to the mdev framework as a mdev
-  driver.
-  It uses a vfio iommu backend that uses the existing map and unmap
-  ioctls, but rather than programming them into an IOMMU for a device,
-  it simply stores the translations for use by later requests. This
-  means that a device programmed in a VM with guest physical addresses
-  can have the vfio kernel convert that address to process virtual
-  address, pin the page and program the hardware with the host physical
-  address in one step.
-  For a mdev, the vfio iommu backend will not pin the pages during the
-  VFIO_IOMMU_MAP_DMA ioctl. Mdev framework will only maintain a database
-  of the iova<->vaddr mappings in this operation. And they export a
-  vfio_pin_pages and a vfio_unpin_pages interfaces from the vfio iommu
-  backend for the physical devices to pin and unpin pages by demand.
-
-Below is a high Level block diagram.
-
- +-------------+
- |             |
- | +---------+ | mdev_register_driver() +--------------+
- | |  Mdev   | +<-----------------------+              |
- | |  bus    | |                        | vfio_mdev.ko |
- | | driver  | +----------------------->+              |<-> VFIO user
- | +---------+ |    probe()/remove()    +--------------+    APIs
- |             |
- |  MDEV CORE  |
- |   MODULE    |
- |   mdev.ko   |
- | +---------+ | mdev_register_device() +--------------+
- | |Physical | +<-----------------------+              |
- | | device  | |                        |  vfio_ccw.ko |<-> subchannel
- | |interface| +----------------------->+              |     device
- | +---------+ |       callback         +--------------+
- +-------------+
-
-The process of how these work together.
-1. vfio_ccw.ko drives the physical I/O subchannel, and registers the
-   physical device (with callbacks) to mdev framework.
-   When vfio_ccw probing the subchannel device, it registers device
-   pointer and callbacks to the mdev framework. Mdev related file nodes
-   under the device node in sysfs would be created for the subchannel
-   device, namely 'mdev_create', 'mdev_destroy' and
-   'mdev_supported_types'.
-2. Create a mediated vfio ccw device.
-   Use the 'mdev_create' sysfs file, we need to manually create one (and
-   only one for our case) mediated device.
-3. vfio_mdev.ko drives the mediated ccw device.
-   vfio_mdev is also the vfio device drvier. It will probe the mdev and
-   add it to an iommu_group and a vfio_group. Then we could pass through
-   the mdev to a guest.
-
-vfio-ccw I/O region
--------------------
-
-An I/O region is used to accept channel program request from user
-space and store I/O interrupt result for user space to retrieve. The
-definition of the region is:
-
-struct ccw_io_region {
-#define ORB_AREA_SIZE 12
-	__u8	orb_area[ORB_AREA_SIZE];
-#define SCSW_AREA_SIZE 12
-	__u8	scsw_area[SCSW_AREA_SIZE];
-#define IRB_AREA_SIZE 96
-	__u8	irb_area[IRB_AREA_SIZE];
-	__u32	ret_code;
-} __packed;
-
-While starting an I/O request, orb_area should be filled with the
-guest ORB, and scsw_area should be filled with the SCSW of the Virtual
-Subchannel.
-
-irb_area stores the I/O result.
-
-ret_code stores a return code for each access of the region.
-
-vfio-ccw operation details
---------------------------
-
-vfio-ccw follows what vfio-pci did on the s390 platform and uses
-vfio-iommu-type1 as the vfio iommu backend.
-
-* CCW translation APIs
-  A group of APIs (start with 'cp_') to do CCW translation. The CCWs
-  passed in by a user space program are organized with their guest
-  physical memory addresses. These APIs will copy the CCWs into kernel
-  space, and assemble a runnable kernel channel program by updating the
-  guest physical addresses with their corresponding host physical addresses.
-  Note that we have to use IDALs even for direct-access CCWs, as the
-  referenced memory can be located anywhere, including above 2G.
-
-* vfio_ccw device driver
-  This driver utilizes the CCW translation APIs and introduces
-  vfio_ccw, which is the driver for the I/O subchannel devices you want
-  to pass through.
-  vfio_ccw implements the following vfio ioctls:
-    VFIO_DEVICE_GET_INFO
-    VFIO_DEVICE_GET_IRQ_INFO
-    VFIO_DEVICE_GET_REGION_INFO
-    VFIO_DEVICE_RESET
-    VFIO_DEVICE_SET_IRQS
-  This provides an I/O region, so that the user space program can pass a
-  channel program to the kernel, to do further CCW translation before
-  issuing them to a real device.
-  This also provides the SET_IRQ ioctl to setup an event notifier to
-  notify the user space program the I/O completion in an asynchronous
-  way.
-
-The use of vfio-ccw is not limited to QEMU, while QEMU is definitely a
-good example to get understand how these patches work. Here is a little
-bit more detail how an I/O request triggered by the QEMU guest will be
-handled (without error handling).
-
-Explanation:
-Q1-Q7: QEMU side process.
-K1-K5: Kernel side process.
-
-Q1. Get I/O region info during initialization.
-Q2. Setup event notifier and handler to handle I/O completion.
-
-... ...
-
-Q3. Intercept a ssch instruction.
-Q4. Write the guest channel program and ORB to the I/O region.
-    K1. Copy from guest to kernel.
-    K2. Translate the guest channel program to a host kernel space
-        channel program, which becomes runnable for a real device.
-    K3. With the necessary information contained in the orb passed in
-        by QEMU, issue the ccwchain to the device.
-    K4. Return the ssch CC code.
-Q5. Return the CC code to the guest.
-
-... ...
-
-    K5. Interrupt handler gets the I/O result and write the result to
-        the I/O region.
-    K6. Signal QEMU to retrieve the result.
-Q6. Get the signal and event handler reads out the result from the I/O
-    region.
-Q7. Update the irb for the guest.
-
-Limitations
------------
-
-The current vfio-ccw implementation focuses on supporting basic commands
-needed to implement block device functionality (read/write) of DASD/ECKD
-device only. Some commands may need special handling in the future, for
-example, anything related to path grouping.
-
-DASD is a kind of storage device. While ECKD is a data recording format.
-More information for DASD and ECKD could be found here:
-https://en.wikipedia.org/wiki/Direct-access_storage_device
-https://en.wikipedia.org/wiki/Count_key_data
-
-Together with the corresponding work in QEMU, we can bring the passed
-through DASD/ECKD device online in a guest now and use it as a block
-device.
-
-While the current code allows the guest to start channel programs via
-START SUBCHANNEL, support for HALT SUBCHANNEL or CLEAR SUBCHANNEL is
-not yet implemented.
-
-vfio-ccw supports classic (command mode) channel I/O only. Transport
-mode (HPF) is not supported.
-
-QDIO subchannels are currently not supported. Classic devices other than
-DASD/ECKD might work, but have not been tested.
-
-Reference
----------
-1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832)
-2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204)
-3. https://en.wikipedia.org/wiki/Channel_I/O
-4. Documentation/s390/cds.txt
-5. Documentation/vfio.txt
-6. Documentation/vfio-mediated-device.txt
diff --git a/Documentation/s390/zfcpdump.rst b/Documentation/s390/zfcpdump.rst
new file mode 100644
index 000000000000..54e8e7caf7e7
--- /dev/null
+++ b/Documentation/s390/zfcpdump.rst
@@ -0,0 +1,50 @@
+==================================
+The s390 SCSI dump tool (zfcpdump)
+==================================
+
+System z machines (z900 or higher) provide hardware support for creating system
+dumps on SCSI disks. The dump process is initiated by booting a dump tool, which
+has to create a dump of the current (probably crashed) Linux image. In order to
+not overwrite memory of the crashed Linux with data of the dump tool, the
+hardware saves some memory plus the register sets of the boot CPU before the
+dump tool is loaded. There exists an SCLP hardware interface to obtain the saved
+memory afterwards. Currently 32 MB are saved.
+
+This zfcpdump implementation consists of a Linux dump kernel together with
+a user space dump tool, which are loaded together into the saved memory region
+below 32 MB. zfcpdump is installed on a SCSI disk using zipl (as contained in
+the s390-tools package) to make the device bootable. The operator of a Linux
+system can then trigger a SCSI dump by booting the SCSI disk, where zfcpdump
+resides on.
+
+The user space dump tool accesses the memory of the crashed system by means
+of the /proc/vmcore interface. This interface exports the crashed system's
+memory and registers in ELF core dump format. To access the memory which has
+been saved by the hardware SCLP requests will be created at the time the data
+is needed by /proc/vmcore. The tail part of the crashed systems memory which
+has not been stashed by hardware can just be copied from real memory.
+
+To build a dump enabled kernel the kernel config option CONFIG_CRASH_DUMP
+has to be set.
+
+To get a valid zfcpdump kernel configuration use "make zfcpdump_defconfig".
+
+The s390 zipl tool looks for the zfcpdump kernel and optional initrd/initramfs
+under the following locations:
+
+* kernel:  <zfcpdump directory>/zfcpdump.image
+* ramdisk: <zfcpdump directory>/zfcpdump.rd
+
+The zfcpdump directory is defined in the s390-tools package.
+
+The user space application of zfcpdump can reside in an intitramfs or an
+initrd. It can also be included in a built-in kernel initramfs. The application
+reads from /proc/vmcore or zcore/mem and writes the system dump to a SCSI disk.
+
+The s390-tools package version 1.24.0 and above builds an external zfcpdump
+initramfs with a user space application that writes the dump to a SCSI
+partition.
+
+For more information on how to use zfcpdump refer to the s390 'Using the Dump
+Tools book', which is available from
+http://www.ibm.com/developerworks/linux/linux390.
diff --git a/Documentation/s390/zfcpdump.txt b/Documentation/s390/zfcpdump.txt
deleted file mode 100644
index b064aa59714d..000000000000
--- a/Documentation/s390/zfcpdump.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-The s390 SCSI dump tool (zfcpdump)
-
-System z machines (z900 or higher) provide hardware support for creating system
-dumps on SCSI disks. The dump process is initiated by booting a dump tool, which
-has to create a dump of the current (probably crashed) Linux image. In order to
-not overwrite memory of the crashed Linux with data of the dump tool, the
-hardware saves some memory plus the register sets of the boot CPU before the
-dump tool is loaded. There exists an SCLP hardware interface to obtain the saved
-memory afterwards. Currently 32 MB are saved.
-
-This zfcpdump implementation consists of a Linux dump kernel together with
-a user space dump tool, which are loaded together into the saved memory region
-below 32 MB. zfcpdump is installed on a SCSI disk using zipl (as contained in
-the s390-tools package) to make the device bootable. The operator of a Linux
-system can then trigger a SCSI dump by booting the SCSI disk, where zfcpdump
-resides on.
-
-The user space dump tool accesses the memory of the crashed system by means
-of the /proc/vmcore interface. This interface exports the crashed system's
-memory and registers in ELF core dump format. To access the memory which has
-been saved by the hardware SCLP requests will be created at the time the data
-is needed by /proc/vmcore. The tail part of the crashed systems memory which
-has not been stashed by hardware can just be copied from real memory.
-
-To build a dump enabled kernel the kernel config option CONFIG_CRASH_DUMP
-has to be set.
-
-To get a valid zfcpdump kernel configuration use "make zfcpdump_defconfig".
-
-The s390 zipl tool looks for the zfcpdump kernel and optional initrd/initramfs
-under the following locations:
-
-* kernel:  <zfcpdump directory>/zfcpdump.image
-* ramdisk: <zfcpdump directory>/zfcpdump.rd
-
-The zfcpdump directory is defined in the s390-tools package.
-
-The user space application of zfcpdump can reside in an intitramfs or an
-initrd. It can also be included in a built-in kernel initramfs. The application
-reads from /proc/vmcore or zcore/mem and writes the system dump to a SCSI disk.
-
-The s390-tools package version 1.24.0 and above builds an external zfcpdump
-initramfs with a user space application that writes the dump to a SCSI
-partition.
-
-For more information on how to use zfcpdump refer to the s390 'Using the Dump
-Tools book', which is available from
-http://www.ibm.com/developerworks/linux/linux390.
diff --git a/Documentation/scheduler/completion.rst b/Documentation/scheduler/completion.rst
new file mode 100644
index 000000000000..9f039b4f4b09
--- /dev/null
+++ b/Documentation/scheduler/completion.rst
@@ -0,0 +1,293 @@
+================================================
+Completions - "wait for completion" barrier APIs
+================================================
+
+Introduction:
+-------------
+
+If you have one or more threads that must wait for some kernel activity
+to have reached a point or a specific state, completions can provide a
+race-free solution to this problem. Semantically they are somewhat like a
+pthread_barrier() and have similar use-cases.
+
+Completions are a code synchronization mechanism which is preferable to any
+misuse of locks/semaphores and busy-loops. Any time you think of using
+yield() or some quirky msleep(1) loop to allow something else to proceed,
+you probably want to look into using one of the wait_for_completion*()
+calls and complete() instead.
+
+The advantage of using completions is that they have a well defined, focused
+purpose which makes it very easy to see the intent of the code, but they
+also result in more efficient code as all threads can continue execution
+until the result is actually needed, and both the waiting and the signalling
+is highly efficient using low level scheduler sleep/wakeup facilities.
+
+Completions are built on top of the waitqueue and wakeup infrastructure of
+the Linux scheduler. The event the threads on the waitqueue are waiting for
+is reduced to a simple flag in 'struct completion', appropriately called "done".
+
+As completions are scheduling related, the code can be found in
+kernel/sched/completion.c.
+
+
+Usage:
+------
+
+There are three main parts to using completions:
+
+ - the initialization of the 'struct completion' synchronization object
+ - the waiting part through a call to one of the variants of wait_for_completion(),
+ - the signaling side through a call to complete() or complete_all().
+
+There are also some helper functions for checking the state of completions.
+Note that while initialization must happen first, the waiting and signaling
+part can happen in any order. I.e. it's entirely normal for a thread
+to have marked a completion as 'done' before another thread checks whether
+it has to wait for it.
+
+To use completions you need to #include <linux/completion.h> and
+create a static or dynamic variable of type 'struct completion',
+which has only two fields::
+
+	struct completion {
+		unsigned int done;
+		wait_queue_head_t wait;
+	};
+
+This provides the ->wait waitqueue to place tasks on for waiting (if any), and
+the ->done completion flag for indicating whether it's completed or not.
+
+Completions should be named to refer to the event that is being synchronized on.
+A good example is::
+
+	wait_for_completion(&early_console_added);
+
+	complete(&early_console_added);
+
+Good, intuitive naming (as always) helps code readability. Naming a completion
+'complete' is not helpful unless the purpose is super obvious...
+
+
+Initializing completions:
+-------------------------
+
+Dynamically allocated completion objects should preferably be embedded in data
+structures that are assured to be alive for the life-time of the function/driver,
+to prevent races with asynchronous complete() calls from occurring.
+
+Particular care should be taken when using the _timeout() or _killable()/_interruptible()
+variants of wait_for_completion(), as it must be assured that memory de-allocation
+does not happen until all related activities (complete() or reinit_completion())
+have taken place, even if these wait functions return prematurely due to a timeout
+or a signal triggering.
+
+Initializing of dynamically allocated completion objects is done via a call to
+init_completion()::
+
+	init_completion(&dynamic_object->done);
+
+In this call we initialize the waitqueue and set ->done to 0, i.e. "not completed"
+or "not done".
+
+The re-initialization function, reinit_completion(), simply resets the
+->done field to 0 ("not done"), without touching the waitqueue.
+Callers of this function must make sure that there are no racy
+wait_for_completion() calls going on in parallel.
+
+Calling init_completion() on the same completion object twice is
+most likely a bug as it re-initializes the queue to an empty queue and
+enqueued tasks could get "lost" - use reinit_completion() in that case,
+but be aware of other races.
+
+For static declaration and initialization, macros are available.
+
+For static (or global) declarations in file scope you can use
+DECLARE_COMPLETION()::
+
+	static DECLARE_COMPLETION(setup_done);
+	DECLARE_COMPLETION(setup_done);
+
+Note that in this case the completion is boot time (or module load time)
+initialized to 'not done' and doesn't require an init_completion() call.
+
+When a completion is declared as a local variable within a function,
+then the initialization should always use DECLARE_COMPLETION_ONSTACK()
+explicitly, not just to make lockdep happy, but also to make it clear
+that limited scope had been considered and is intentional::
+
+	DECLARE_COMPLETION_ONSTACK(setup_done)
+
+Note that when using completion objects as local variables you must be
+acutely aware of the short life time of the function stack: the function
+must not return to a calling context until all activities (such as waiting
+threads) have ceased and the completion object is completely unused.
+
+To emphasise this again: in particular when using some of the waiting API variants
+with more complex outcomes, such as the timeout or signalling (_timeout(),
+_killable() and _interruptible()) variants, the wait might complete
+prematurely while the object might still be in use by another thread - and a return
+from the wait_on_completion*() caller function will deallocate the function
+stack and cause subtle data corruption if a complete() is done in some
+other thread. Simple testing might not trigger these kinds of races.
+
+If unsure, use dynamically allocated completion objects, preferably embedded
+in some other long lived object that has a boringly long life time which
+exceeds the life time of any helper threads using the completion object,
+or has a lock or other synchronization mechanism to make sure complete()
+is not called on a freed object.
+
+A naive DECLARE_COMPLETION() on the stack triggers a lockdep warning.
+
+Waiting for completions:
+------------------------
+
+For a thread to wait for some concurrent activity to finish, it
+calls wait_for_completion() on the initialized completion structure::
+
+	void wait_for_completion(struct completion *done)
+
+A typical usage scenario is::
+
+	CPU#1					CPU#2
+
+	struct completion setup_done;
+
+	init_completion(&setup_done);
+	initialize_work(...,&setup_done,...);
+
+	/* run non-dependent code */		/* do setup */
+
+	wait_for_completion(&setup_done);	complete(setup_done);
+
+This is not implying any particular order between wait_for_completion() and
+the call to complete() - if the call to complete() happened before the call
+to wait_for_completion() then the waiting side simply will continue
+immediately as all dependencies are satisfied; if not, it will block until
+completion is signaled by complete().
+
+Note that wait_for_completion() is calling spin_lock_irq()/spin_unlock_irq(),
+so it can only be called safely when you know that interrupts are enabled.
+Calling it from IRQs-off atomic contexts will result in hard-to-detect
+spurious enabling of interrupts.
+
+The default behavior is to wait without a timeout and to mark the task as
+uninterruptible. wait_for_completion() and its variants are only safe
+in process context (as they can sleep) but not in atomic context,
+interrupt context, with disabled IRQs, or preemption is disabled - see also
+try_wait_for_completion() below for handling completion in atomic/interrupt
+context.
+
+As all variants of wait_for_completion() can (obviously) block for a long
+time depending on the nature of the activity they are waiting for, so in
+most cases you probably don't want to call this with held mutexes.
+
+
+wait_for_completion*() variants available:
+------------------------------------------
+
+The below variants all return status and this status should be checked in
+most(/all) cases - in cases where the status is deliberately not checked you
+probably want to make a note explaining this (e.g. see
+arch/arm/kernel/smp.c:__cpu_up()).
+
+A common problem that occurs is to have unclean assignment of return types,
+so take care to assign return-values to variables of the proper type.
+
+Checking for the specific meaning of return values also has been found
+to be quite inaccurate, e.g. constructs like::
+
+	if (!wait_for_completion_interruptible_timeout(...))
+
+... would execute the same code path for successful completion and for the
+interrupted case - which is probably not what you want::
+
+	int wait_for_completion_interruptible(struct completion *done)
+
+This function marks the task TASK_INTERRUPTIBLE while it is waiting.
+If a signal was received while waiting it will return -ERESTARTSYS; 0 otherwise::
+
+	unsigned long wait_for_completion_timeout(struct completion *done, unsigned long timeout)
+
+The task is marked as TASK_UNINTERRUPTIBLE and will wait at most 'timeout'
+jiffies. If a timeout occurs it returns 0, else the remaining time in
+jiffies (but at least 1).
+
+Timeouts are preferably calculated with msecs_to_jiffies() or usecs_to_jiffies(),
+to make the code largely HZ-invariant.
+
+If the returned timeout value is deliberately ignored a comment should probably explain
+why (e.g. see drivers/mfd/wm8350-core.c wm8350_read_auxadc())::
+
+	long wait_for_completion_interruptible_timeout(struct completion *done, unsigned long timeout)
+
+This function passes a timeout in jiffies and marks the task as
+TASK_INTERRUPTIBLE. If a signal was received it will return -ERESTARTSYS;
+otherwise it returns 0 if the completion timed out, or the remaining time in
+jiffies if completion occurred.
+
+Further variants include _killable which uses TASK_KILLABLE as the
+designated tasks state and will return -ERESTARTSYS if it is interrupted,
+or 0 if completion was achieved.  There is a _timeout variant as well::
+
+	long wait_for_completion_killable(struct completion *done)
+	long wait_for_completion_killable_timeout(struct completion *done, unsigned long timeout)
+
+The _io variants wait_for_completion_io() behave the same as the non-_io
+variants, except for accounting waiting time as 'waiting on IO', which has
+an impact on how the task is accounted in scheduling/IO stats::
+
+	void wait_for_completion_io(struct completion *done)
+	unsigned long wait_for_completion_io_timeout(struct completion *done, unsigned long timeout)
+
+
+Signaling completions:
+----------------------
+
+A thread that wants to signal that the conditions for continuation have been
+achieved calls complete() to signal exactly one of the waiters that it can
+continue::
+
+	void complete(struct completion *done)
+
+... or calls complete_all() to signal all current and future waiters::
+
+	void complete_all(struct completion *done)
+
+The signaling will work as expected even if completions are signaled before
+a thread starts waiting. This is achieved by the waiter "consuming"
+(decrementing) the done field of 'struct completion'. Waiting threads
+wakeup order is the same in which they were enqueued (FIFO order).
+
+If complete() is called multiple times then this will allow for that number
+of waiters to continue - each call to complete() will simply increment the
+done field. Calling complete_all() multiple times is a bug though. Both
+complete() and complete_all() can be called in IRQ/atomic context safely.
+
+There can only be one thread calling complete() or complete_all() on a
+particular 'struct completion' at any time - serialized through the wait
+queue spinlock. Any such concurrent calls to complete() or complete_all()
+probably are a design bug.
+
+Signaling completion from IRQ context is fine as it will appropriately
+lock with spin_lock_irqsave()/spin_unlock_irqrestore() and it will never
+sleep.
+
+
+try_wait_for_completion()/completion_done():
+--------------------------------------------
+
+The try_wait_for_completion() function will not put the thread on the wait
+queue but rather returns false if it would need to enqueue (block) the thread,
+else it consumes one posted completion and returns true::
+
+	bool try_wait_for_completion(struct completion *done)
+
+Finally, to check the state of a completion without changing it in any way,
+call completion_done(), which returns false if there are no posted
+completions that were not yet consumed by waiters (implying that there are
+waiters) and true otherwise::
+
+	bool completion_done(struct completion *done)
+
+Both try_wait_for_completion() and completion_done() are safe to be called in
+IRQ or atomic context.
diff --git a/Documentation/scheduler/completion.txt b/Documentation/scheduler/completion.txt
deleted file mode 100644
index e5b9df4d8078..000000000000
--- a/Documentation/scheduler/completion.txt
+++ /dev/null
@@ -1,291 +0,0 @@
-Completions - "wait for completion" barrier APIs
-================================================
-
-Introduction:
--------------
-
-If you have one or more threads that must wait for some kernel activity
-to have reached a point or a specific state, completions can provide a
-race-free solution to this problem. Semantically they are somewhat like a
-pthread_barrier() and have similar use-cases.
-
-Completions are a code synchronization mechanism which is preferable to any
-misuse of locks/semaphores and busy-loops. Any time you think of using
-yield() or some quirky msleep(1) loop to allow something else to proceed,
-you probably want to look into using one of the wait_for_completion*()
-calls and complete() instead.
-
-The advantage of using completions is that they have a well defined, focused
-purpose which makes it very easy to see the intent of the code, but they
-also result in more efficient code as all threads can continue execution
-until the result is actually needed, and both the waiting and the signalling
-is highly efficient using low level scheduler sleep/wakeup facilities.
-
-Completions are built on top of the waitqueue and wakeup infrastructure of
-the Linux scheduler. The event the threads on the waitqueue are waiting for
-is reduced to a simple flag in 'struct completion', appropriately called "done".
-
-As completions are scheduling related, the code can be found in
-kernel/sched/completion.c.
-
-
-Usage:
-------
-
-There are three main parts to using completions:
-
- - the initialization of the 'struct completion' synchronization object
- - the waiting part through a call to one of the variants of wait_for_completion(),
- - the signaling side through a call to complete() or complete_all().
-
-There are also some helper functions for checking the state of completions.
-Note that while initialization must happen first, the waiting and signaling
-part can happen in any order. I.e. it's entirely normal for a thread
-to have marked a completion as 'done' before another thread checks whether
-it has to wait for it.
-
-To use completions you need to #include <linux/completion.h> and
-create a static or dynamic variable of type 'struct completion',
-which has only two fields:
-
-	struct completion {
-		unsigned int done;
-		wait_queue_head_t wait;
-	};
-
-This provides the ->wait waitqueue to place tasks on for waiting (if any), and
-the ->done completion flag for indicating whether it's completed or not.
-
-Completions should be named to refer to the event that is being synchronized on.
-A good example is:
-
-	wait_for_completion(&early_console_added);
-
-	complete(&early_console_added);
-
-Good, intuitive naming (as always) helps code readability. Naming a completion
-'complete' is not helpful unless the purpose is super obvious...
-
-
-Initializing completions:
--------------------------
-
-Dynamically allocated completion objects should preferably be embedded in data
-structures that are assured to be alive for the life-time of the function/driver,
-to prevent races with asynchronous complete() calls from occurring.
-
-Particular care should be taken when using the _timeout() or _killable()/_interruptible()
-variants of wait_for_completion(), as it must be assured that memory de-allocation
-does not happen until all related activities (complete() or reinit_completion())
-have taken place, even if these wait functions return prematurely due to a timeout
-or a signal triggering.
-
-Initializing of dynamically allocated completion objects is done via a call to
-init_completion():
-
-	init_completion(&dynamic_object->done);
-
-In this call we initialize the waitqueue and set ->done to 0, i.e. "not completed"
-or "not done".
-
-The re-initialization function, reinit_completion(), simply resets the
-->done field to 0 ("not done"), without touching the waitqueue.
-Callers of this function must make sure that there are no racy
-wait_for_completion() calls going on in parallel.
-
-Calling init_completion() on the same completion object twice is
-most likely a bug as it re-initializes the queue to an empty queue and
-enqueued tasks could get "lost" - use reinit_completion() in that case,
-but be aware of other races.
-
-For static declaration and initialization, macros are available.
-
-For static (or global) declarations in file scope you can use DECLARE_COMPLETION():
-
-	static DECLARE_COMPLETION(setup_done);
-	DECLARE_COMPLETION(setup_done);
-
-Note that in this case the completion is boot time (or module load time)
-initialized to 'not done' and doesn't require an init_completion() call.
-
-When a completion is declared as a local variable within a function,
-then the initialization should always use DECLARE_COMPLETION_ONSTACK()
-explicitly, not just to make lockdep happy, but also to make it clear
-that limited scope had been considered and is intentional:
-
-	DECLARE_COMPLETION_ONSTACK(setup_done)
-
-Note that when using completion objects as local variables you must be
-acutely aware of the short life time of the function stack: the function
-must not return to a calling context until all activities (such as waiting
-threads) have ceased and the completion object is completely unused.
-
-To emphasise this again: in particular when using some of the waiting API variants
-with more complex outcomes, such as the timeout or signalling (_timeout(),
-_killable() and _interruptible()) variants, the wait might complete
-prematurely while the object might still be in use by another thread - and a return
-from the wait_on_completion*() caller function will deallocate the function
-stack and cause subtle data corruption if a complete() is done in some
-other thread. Simple testing might not trigger these kinds of races.
-
-If unsure, use dynamically allocated completion objects, preferably embedded
-in some other long lived object that has a boringly long life time which
-exceeds the life time of any helper threads using the completion object,
-or has a lock or other synchronization mechanism to make sure complete()
-is not called on a freed object.
-
-A naive DECLARE_COMPLETION() on the stack triggers a lockdep warning.
-
-Waiting for completions:
-------------------------
-
-For a thread to wait for some concurrent activity to finish, it
-calls wait_for_completion() on the initialized completion structure:
-
-	void wait_for_completion(struct completion *done)
-
-A typical usage scenario is:
-
-	CPU#1					CPU#2
-
-	struct completion setup_done;
-
-	init_completion(&setup_done);
-	initialize_work(...,&setup_done,...);
-
-	/* run non-dependent code */		/* do setup */
-
-	wait_for_completion(&setup_done);	complete(setup_done);
-
-This is not implying any particular order between wait_for_completion() and
-the call to complete() - if the call to complete() happened before the call
-to wait_for_completion() then the waiting side simply will continue
-immediately as all dependencies are satisfied; if not, it will block until
-completion is signaled by complete().
-
-Note that wait_for_completion() is calling spin_lock_irq()/spin_unlock_irq(),
-so it can only be called safely when you know that interrupts are enabled.
-Calling it from IRQs-off atomic contexts will result in hard-to-detect
-spurious enabling of interrupts.
-
-The default behavior is to wait without a timeout and to mark the task as
-uninterruptible. wait_for_completion() and its variants are only safe
-in process context (as they can sleep) but not in atomic context,
-interrupt context, with disabled IRQs, or preemption is disabled - see also
-try_wait_for_completion() below for handling completion in atomic/interrupt
-context.
-
-As all variants of wait_for_completion() can (obviously) block for a long
-time depending on the nature of the activity they are waiting for, so in
-most cases you probably don't want to call this with held mutexes.
-
-
-wait_for_completion*() variants available:
-------------------------------------------
-
-The below variants all return status and this status should be checked in
-most(/all) cases - in cases where the status is deliberately not checked you
-probably want to make a note explaining this (e.g. see
-arch/arm/kernel/smp.c:__cpu_up()).
-
-A common problem that occurs is to have unclean assignment of return types,
-so take care to assign return-values to variables of the proper type.
-
-Checking for the specific meaning of return values also has been found
-to be quite inaccurate, e.g. constructs like:
-
-	if (!wait_for_completion_interruptible_timeout(...))
-
-... would execute the same code path for successful completion and for the
-interrupted case - which is probably not what you want.
-
-	int wait_for_completion_interruptible(struct completion *done)
-
-This function marks the task TASK_INTERRUPTIBLE while it is waiting.
-If a signal was received while waiting it will return -ERESTARTSYS; 0 otherwise.
-
-	unsigned long wait_for_completion_timeout(struct completion *done, unsigned long timeout)
-
-The task is marked as TASK_UNINTERRUPTIBLE and will wait at most 'timeout'
-jiffies. If a timeout occurs it returns 0, else the remaining time in
-jiffies (but at least 1).
-
-Timeouts are preferably calculated with msecs_to_jiffies() or usecs_to_jiffies(),
-to make the code largely HZ-invariant.
-
-If the returned timeout value is deliberately ignored a comment should probably explain
-why (e.g. see drivers/mfd/wm8350-core.c wm8350_read_auxadc()).
-
-	long wait_for_completion_interruptible_timeout(struct completion *done, unsigned long timeout)
-
-This function passes a timeout in jiffies and marks the task as
-TASK_INTERRUPTIBLE. If a signal was received it will return -ERESTARTSYS;
-otherwise it returns 0 if the completion timed out, or the remaining time in
-jiffies if completion occurred.
-
-Further variants include _killable which uses TASK_KILLABLE as the
-designated tasks state and will return -ERESTARTSYS if it is interrupted,
-or 0 if completion was achieved.  There is a _timeout variant as well:
-
-	long wait_for_completion_killable(struct completion *done)
-	long wait_for_completion_killable_timeout(struct completion *done, unsigned long timeout)
-
-The _io variants wait_for_completion_io() behave the same as the non-_io
-variants, except for accounting waiting time as 'waiting on IO', which has
-an impact on how the task is accounted in scheduling/IO stats:
-
-	void wait_for_completion_io(struct completion *done)
-	unsigned long wait_for_completion_io_timeout(struct completion *done, unsigned long timeout)
-
-
-Signaling completions:
-----------------------
-
-A thread that wants to signal that the conditions for continuation have been
-achieved calls complete() to signal exactly one of the waiters that it can
-continue:
-
-	void complete(struct completion *done)
-
-... or calls complete_all() to signal all current and future waiters:
-
-	void complete_all(struct completion *done)
-
-The signaling will work as expected even if completions are signaled before
-a thread starts waiting. This is achieved by the waiter "consuming"
-(decrementing) the done field of 'struct completion'. Waiting threads
-wakeup order is the same in which they were enqueued (FIFO order).
-
-If complete() is called multiple times then this will allow for that number
-of waiters to continue - each call to complete() will simply increment the
-done field. Calling complete_all() multiple times is a bug though. Both
-complete() and complete_all() can be called in IRQ/atomic context safely.
-
-There can only be one thread calling complete() or complete_all() on a
-particular 'struct completion' at any time - serialized through the wait
-queue spinlock. Any such concurrent calls to complete() or complete_all()
-probably are a design bug.
-
-Signaling completion from IRQ context is fine as it will appropriately
-lock with spin_lock_irqsave()/spin_unlock_irqrestore() and it will never
-sleep. 
-
-
-try_wait_for_completion()/completion_done():
---------------------------------------------
-
-The try_wait_for_completion() function will not put the thread on the wait
-queue but rather returns false if it would need to enqueue (block) the thread,
-else it consumes one posted completion and returns true.
-
-	bool try_wait_for_completion(struct completion *done)
-
-Finally, to check the state of a completion without changing it in any way,
-call completion_done(), which returns false if there are no posted
-completions that were not yet consumed by waiters (implying that there are
-waiters) and true otherwise;
-
-	bool completion_done(struct completion *done)
-
-Both try_wait_for_completion() and completion_done() are safe to be called in
-IRQ or atomic context.
diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst
new file mode 100644
index 000000000000..69074e5de9c4
--- /dev/null
+++ b/Documentation/scheduler/index.rst
@@ -0,0 +1,27 @@
+===============
+Linux Scheduler
+===============
+
+.. toctree::
+    :maxdepth: 1
+
+
+    completion
+    sched-arch
+    sched-bwc
+    sched-deadline
+    sched-design-CFS
+    sched-domains
+    sched-energy
+    sched-nice-design
+    sched-rt-group
+    sched-stats
+
+    text_files
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/scheduler/sched-arch.rst b/Documentation/scheduler/sched-arch.rst
new file mode 100644
index 000000000000..0eaec669790a
--- /dev/null
+++ b/Documentation/scheduler/sched-arch.rst
@@ -0,0 +1,76 @@
+=================================================================
+CPU Scheduler implementation hints for architecture specific code
+=================================================================
+
+	Nick Piggin, 2005
+
+Context switch
+==============
+1. Runqueue locking
+By default, the switch_to arch function is called with the runqueue
+locked. This is usually not a problem unless switch_to may need to
+take the runqueue lock. This is usually due to a wake up operation in
+the context switch. See arch/ia64/include/asm/switch_to.h for an example.
+
+To request the scheduler call switch_to with the runqueue unlocked,
+you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
+(typically the one where switch_to is defined).
+
+Unlocked context switches introduce only a very minor performance
+penalty to the core scheduler implementation in the CONFIG_SMP case.
+
+CPU idle
+========
+Your cpu_idle routines need to obey the following rules:
+
+1. Preempt should now disabled over idle routines. Should only
+   be enabled to call schedule() then disabled again.
+
+2. need_resched/TIF_NEED_RESCHED is only ever set, and will never
+   be cleared until the running task has called schedule(). Idle
+   threads need only ever query need_resched, and may never set or
+   clear it.
+
+3. When cpu_idle finds (need_resched() == 'true'), it should call
+   schedule(). It should not call schedule() otherwise.
+
+4. The only time interrupts need to be disabled when checking
+   need_resched is if we are about to sleep the processor until
+   the next interrupt (this doesn't provide any protection of
+   need_resched, it prevents losing an interrupt):
+
+	4a. Common problem with this type of sleep appears to be::
+
+	        local_irq_disable();
+	        if (!need_resched()) {
+	                local_irq_enable();
+	                *** resched interrupt arrives here ***
+	                __asm__("sleep until next interrupt");
+	        }
+
+5. TIF_POLLING_NRFLAG can be set by idle routines that do not
+   need an interrupt to wake them up when need_resched goes high.
+   In other words, they must be periodically polling need_resched,
+   although it may be reasonable to do some background work or enter
+   a low CPU priority.
+
+      - 5a. If TIF_POLLING_NRFLAG is set, and we do decide to enter
+	an interrupt sleep, it needs to be cleared then a memory
+	barrier issued (followed by a test of need_resched with
+	interrupts disabled, as explained in 3).
+
+arch/x86/kernel/process.c has examples of both polling and
+sleeping idle functions.
+
+
+Possible arch/ problems
+=======================
+
+Possible arch problems I found (and either tried to fix or didn't):
+
+ia64 - is safe_halt call racy vs interrupts? (does it sleep?) (See #4a)
+
+sh64 - Is sleeping racy vs interrupts? (See #4a)
+
+sparc - IRQs on at this point(?), change local_irq_save to _disable.
+      - TODO: needs secondary CPUs to disable preempt (See #1)
diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt
deleted file mode 100644
index a2f27bbf2cba..000000000000
--- a/Documentation/scheduler/sched-arch.txt
+++ /dev/null
@@ -1,74 +0,0 @@
-	CPU Scheduler implementation hints for architecture specific code
-
-	Nick Piggin, 2005
-
-Context switch
-==============
-1. Runqueue locking
-By default, the switch_to arch function is called with the runqueue
-locked. This is usually not a problem unless switch_to may need to
-take the runqueue lock. This is usually due to a wake up operation in
-the context switch. See arch/ia64/include/asm/switch_to.h for an example.
-
-To request the scheduler call switch_to with the runqueue unlocked,
-you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
-(typically the one where switch_to is defined).
-
-Unlocked context switches introduce only a very minor performance
-penalty to the core scheduler implementation in the CONFIG_SMP case.
-
-CPU idle
-========
-Your cpu_idle routines need to obey the following rules:
-
-1. Preempt should now disabled over idle routines. Should only
-   be enabled to call schedule() then disabled again.
-
-2. need_resched/TIF_NEED_RESCHED is only ever set, and will never
-   be cleared until the running task has called schedule(). Idle
-   threads need only ever query need_resched, and may never set or
-   clear it.
-
-3. When cpu_idle finds (need_resched() == 'true'), it should call
-   schedule(). It should not call schedule() otherwise.
-
-4. The only time interrupts need to be disabled when checking
-   need_resched is if we are about to sleep the processor until
-   the next interrupt (this doesn't provide any protection of
-   need_resched, it prevents losing an interrupt).
-
-	4a. Common problem with this type of sleep appears to be:
-	        local_irq_disable();
-	        if (!need_resched()) {
-	                local_irq_enable();
-	                *** resched interrupt arrives here ***
-	                __asm__("sleep until next interrupt");
-	        }
-
-5. TIF_POLLING_NRFLAG can be set by idle routines that do not
-   need an interrupt to wake them up when need_resched goes high.
-   In other words, they must be periodically polling need_resched,
-   although it may be reasonable to do some background work or enter
-   a low CPU priority.
-
-   	5a. If TIF_POLLING_NRFLAG is set, and we do decide to enter
-	    an interrupt sleep, it needs to be cleared then a memory
-	    barrier issued (followed by a test of need_resched with
-	    interrupts disabled, as explained in 3).
-
-arch/x86/kernel/process.c has examples of both polling and
-sleeping idle functions.
-
-
-Possible arch/ problems
-=======================
-
-Possible arch problems I found (and either tried to fix or didn't):
-
-ia64 - is safe_halt call racy vs interrupts? (does it sleep?) (See #4a)
-
-sh64 - Is sleeping racy vs interrupts? (See #4a)
-
-sparc - IRQs on at this point(?), change local_irq_save to _disable.
-      - TODO: needs secondary CPUs to disable preempt (See #1)
-
diff --git a/Documentation/scheduler/sched-bwc.rst b/Documentation/scheduler/sched-bwc.rst
new file mode 100644
index 000000000000..3a9064219656
--- /dev/null
+++ b/Documentation/scheduler/sched-bwc.rst
@@ -0,0 +1,128 @@
+=====================
+CFS Bandwidth Control
+=====================
+
+[ This document only discusses CPU bandwidth control for SCHED_NORMAL.
+  The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.rst ]
+
+CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
+specification of the maximum CPU bandwidth available to a group or hierarchy.
+
+The bandwidth allowed for a group is specified using a quota and period. Within
+each given "period" (microseconds), a group is allowed to consume only up to
+"quota" microseconds of CPU time.  When the CPU bandwidth consumption of a
+group exceeds this limit (for that period), the tasks belonging to its
+hierarchy will be throttled and are not allowed to run again until the next
+period.
+
+A group's unused runtime is globally tracked, being refreshed with quota units
+above at each period boundary.  As threads consume this bandwidth it is
+transferred to cpu-local "silos" on a demand basis.  The amount transferred
+within each of these updates is tunable and described as the "slice".
+
+Management
+----------
+Quota and period are managed within the cpu subsystem via cgroupfs.
+
+cpu.cfs_quota_us: the total available run-time within a period (in microseconds)
+cpu.cfs_period_us: the length of a period (in microseconds)
+cpu.stat: exports throttling statistics [explained further below]
+
+The default values are::
+
+	cpu.cfs_period_us=100ms
+	cpu.cfs_quota=-1
+
+A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
+bandwidth restriction in place, such a group is described as an unconstrained
+bandwidth group.  This represents the traditional work-conserving behavior for
+CFS.
+
+Writing any (valid) positive value(s) will enact the specified bandwidth limit.
+The minimum quota allowed for the quota or period is 1ms.  There is also an
+upper bound on the period length of 1s.  Additional restrictions exist when
+bandwidth limits are used in a hierarchical fashion, these are explained in
+more detail below.
+
+Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit
+and return the group to an unconstrained state once more.
+
+Any updates to a group's bandwidth specification will result in it becoming
+unthrottled if it is in a constrained state.
+
+System wide settings
+--------------------
+For efficiency run-time is transferred between the global pool and CPU local
+"silos" in a batch fashion.  This greatly reduces global accounting pressure
+on large systems.  The amount transferred each time such an update is required
+is described as the "slice".
+
+This is tunable via procfs::
+
+	/proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms)
+
+Larger slice values will reduce transfer overheads, while smaller values allow
+for more fine-grained consumption.
+
+Statistics
+----------
+A group's bandwidth statistics are exported via 3 fields in cpu.stat.
+
+cpu.stat:
+
+- nr_periods: Number of enforcement intervals that have elapsed.
+- nr_throttled: Number of times the group has been throttled/limited.
+- throttled_time: The total time duration (in nanoseconds) for which entities
+  of the group have been throttled.
+
+This interface is read-only.
+
+Hierarchical considerations
+---------------------------
+The interface enforces that an individual entity's bandwidth is always
+attainable, that is: max(c_i) <= C. However, over-subscription in the
+aggregate case is explicitly allowed to enable work-conserving semantics
+within a hierarchy:
+
+  e.g. \Sum (c_i) may exceed C
+
+[ Where C is the parent's bandwidth, and c_i its children ]
+
+
+There are two ways in which a group may become throttled:
+
+	a. it fully consumes its own quota within a period
+	b. a parent's quota is fully consumed within its period
+
+In case b) above, even though the child may have runtime remaining it will not
+be allowed to until the parent's runtime is refreshed.
+
+Examples
+--------
+1. Limit a group to 1 CPU worth of runtime::
+
+	If period is 250ms and quota is also 250ms, the group will get
+	1 CPU worth of runtime every 250ms.
+
+	# echo 250000 > cpu.cfs_quota_us /* quota = 250ms */
+	# echo 250000 > cpu.cfs_period_us /* period = 250ms */
+
+2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine
+
+   With 500ms period and 1000ms quota, the group can get 2 CPUs worth of
+   runtime every 500ms::
+
+	# echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */
+	# echo 500000 > cpu.cfs_period_us /* period = 500ms */
+
+	The larger period here allows for increased burst capacity.
+
+3. Limit a group to 20% of 1 CPU.
+
+   With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU::
+
+	# echo 10000 > cpu.cfs_quota_us /* quota = 10ms */
+	# echo 50000 > cpu.cfs_period_us /* period = 50ms */
+
+   By using a small period here we are ensuring a consistent latency
+   response at the expense of burst capacity.
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt
deleted file mode 100644
index f6b1873f68ab..000000000000
--- a/Documentation/scheduler/sched-bwc.txt
+++ /dev/null
@@ -1,122 +0,0 @@
-CFS Bandwidth Control
-=====================
-
-[ This document only discusses CPU bandwidth control for SCHED_NORMAL.
-  The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ]
-
-CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
-specification of the maximum CPU bandwidth available to a group or hierarchy.
-
-The bandwidth allowed for a group is specified using a quota and period. Within
-each given "period" (microseconds), a group is allowed to consume only up to
-"quota" microseconds of CPU time.  When the CPU bandwidth consumption of a
-group exceeds this limit (for that period), the tasks belonging to its
-hierarchy will be throttled and are not allowed to run again until the next
-period.
-
-A group's unused runtime is globally tracked, being refreshed with quota units
-above at each period boundary.  As threads consume this bandwidth it is
-transferred to cpu-local "silos" on a demand basis.  The amount transferred
-within each of these updates is tunable and described as the "slice".
-
-Management
-----------
-Quota and period are managed within the cpu subsystem via cgroupfs.
-
-cpu.cfs_quota_us: the total available run-time within a period (in microseconds)
-cpu.cfs_period_us: the length of a period (in microseconds)
-cpu.stat: exports throttling statistics [explained further below]
-
-The default values are:
-	cpu.cfs_period_us=100ms
-	cpu.cfs_quota=-1
-
-A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
-bandwidth restriction in place, such a group is described as an unconstrained
-bandwidth group.  This represents the traditional work-conserving behavior for
-CFS.
-
-Writing any (valid) positive value(s) will enact the specified bandwidth limit.
-The minimum quota allowed for the quota or period is 1ms.  There is also an
-upper bound on the period length of 1s.  Additional restrictions exist when
-bandwidth limits are used in a hierarchical fashion, these are explained in
-more detail below.
-
-Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit
-and return the group to an unconstrained state once more.
-
-Any updates to a group's bandwidth specification will result in it becoming
-unthrottled if it is in a constrained state.
-
-System wide settings
---------------------
-For efficiency run-time is transferred between the global pool and CPU local
-"silos" in a batch fashion.  This greatly reduces global accounting pressure
-on large systems.  The amount transferred each time such an update is required
-is described as the "slice".
-
-This is tunable via procfs:
-	/proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms)
-
-Larger slice values will reduce transfer overheads, while smaller values allow
-for more fine-grained consumption.
-
-Statistics
-----------
-A group's bandwidth statistics are exported via 3 fields in cpu.stat.
-
-cpu.stat:
-- nr_periods: Number of enforcement intervals that have elapsed.
-- nr_throttled: Number of times the group has been throttled/limited.
-- throttled_time: The total time duration (in nanoseconds) for which entities
-  of the group have been throttled.
-
-This interface is read-only.
-
-Hierarchical considerations
----------------------------
-The interface enforces that an individual entity's bandwidth is always
-attainable, that is: max(c_i) <= C. However, over-subscription in the
-aggregate case is explicitly allowed to enable work-conserving semantics
-within a hierarchy.
-  e.g. \Sum (c_i) may exceed C
-[ Where C is the parent's bandwidth, and c_i its children ]
-
-
-There are two ways in which a group may become throttled:
-	a. it fully consumes its own quota within a period
-	b. a parent's quota is fully consumed within its period
-
-In case b) above, even though the child may have runtime remaining it will not
-be allowed to until the parent's runtime is refreshed.
-
-Examples
---------
-1. Limit a group to 1 CPU worth of runtime.
-
-	If period is 250ms and quota is also 250ms, the group will get
-	1 CPU worth of runtime every 250ms.
-
-	# echo 250000 > cpu.cfs_quota_us /* quota = 250ms */
-	# echo 250000 > cpu.cfs_period_us /* period = 250ms */
-
-2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine.
-
-	With 500ms period and 1000ms quota, the group can get 2 CPUs worth of
-	runtime every 500ms.
-
-	# echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */
-	# echo 500000 > cpu.cfs_period_us /* period = 500ms */
-
-	The larger period here allows for increased burst capacity.
-
-3. Limit a group to 20% of 1 CPU.
-
-	With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU.
-
-	# echo 10000 > cpu.cfs_quota_us /* quota = 10ms */
-	# echo 50000 > cpu.cfs_period_us /* period = 50ms */
-
-	By using a small period here we are ensuring a consistent latency
-	response at the expense of burst capacity.
-
diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst
new file mode 100644
index 000000000000..14a2f7bf63fe
--- /dev/null
+++ b/Documentation/scheduler/sched-deadline.rst
@@ -0,0 +1,888 @@
+========================
+Deadline Task Scheduling
+========================
+
+.. CONTENTS
+
+    0. WARNING
+    1. Overview
+    2. Scheduling algorithm
+      2.1 Main algorithm
+      2.2 Bandwidth reclaiming
+    3. Scheduling Real-Time Tasks
+      3.1 Definitions
+      3.2 Schedulability Analysis for Uniprocessor Systems
+      3.3 Schedulability Analysis for Multiprocessor Systems
+      3.4 Relationship with SCHED_DEADLINE Parameters
+    4. Bandwidth management
+      4.1 System-wide settings
+      4.2 Task interface
+      4.3 Default behavior
+      4.4 Behavior of sched_yield()
+    5. Tasks CPU affinity
+      5.1 SCHED_DEADLINE and cpusets HOWTO
+    6. Future plans
+    A. Test suite
+    B. Minimal main()
+
+
+0. WARNING
+==========
+
+ Fiddling with these settings can result in an unpredictable or even unstable
+ system behavior. As for -rt (group) scheduling, it is assumed that root users
+ know what they're doing.
+
+
+1. Overview
+===========
+
+ The SCHED_DEADLINE policy contained inside the sched_dl scheduling class is
+ basically an implementation of the Earliest Deadline First (EDF) scheduling
+ algorithm, augmented with a mechanism (called Constant Bandwidth Server, CBS)
+ that makes it possible to isolate the behavior of tasks between each other.
+
+
+2. Scheduling algorithm
+=======================
+
+2.1 Main algorithm
+------------------
+
+ SCHED_DEADLINE [18] uses three parameters, named "runtime", "period", and
+ "deadline", to schedule tasks. A SCHED_DEADLINE task should receive
+ "runtime" microseconds of execution time every "period" microseconds, and
+ these "runtime" microseconds are available within "deadline" microseconds
+ from the beginning of the period.  In order to implement this behavior,
+ every time the task wakes up, the scheduler computes a "scheduling deadline"
+ consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then
+ scheduled using EDF[1] on these scheduling deadlines (the task with the
+ earliest scheduling deadline is selected for execution). Notice that the
+ task actually receives "runtime" time units within "deadline" if a proper
+ "admission control" strategy (see Section "4. Bandwidth management") is used
+ (clearly, if the system is overloaded this guarantee cannot be respected).
+
+ Summing up, the CBS[2,3] algorithm assigns scheduling deadlines to tasks so
+ that each task runs for at most its runtime every period, avoiding any
+ interference between different tasks (bandwidth isolation), while the EDF[1]
+ algorithm selects the task with the earliest scheduling deadline as the one
+ to be executed next. Thanks to this feature, tasks that do not strictly comply
+ with the "traditional" real-time task model (see Section 3) can effectively
+ use the new policy.
+
+ In more details, the CBS algorithm assigns scheduling deadlines to
+ tasks in the following way:
+
+  - Each SCHED_DEADLINE task is characterized by the "runtime",
+    "deadline", and "period" parameters;
+
+  - The state of the task is described by a "scheduling deadline", and
+    a "remaining runtime". These two parameters are initially set to 0;
+
+  - When a SCHED_DEADLINE task wakes up (becomes ready for execution),
+    the scheduler checks if::
+
+                 remaining runtime                  runtime
+        ----------------------------------    >    ---------
+        scheduling deadline - current time           period
+
+    then, if the scheduling deadline is smaller than the current time, or
+    this condition is verified, the scheduling deadline and the
+    remaining runtime are re-initialized as
+
+         scheduling deadline = current time + deadline
+         remaining runtime = runtime
+
+    otherwise, the scheduling deadline and the remaining runtime are
+    left unchanged;
+
+  - When a SCHED_DEADLINE task executes for an amount of time t, its
+    remaining runtime is decreased as::
+
+         remaining runtime = remaining runtime - t
+
+    (technically, the runtime is decreased at every tick, or when the
+    task is descheduled / preempted);
+
+  - When the remaining runtime becomes less or equal than 0, the task is
+    said to be "throttled" (also known as "depleted" in real-time literature)
+    and cannot be scheduled until its scheduling deadline. The "replenishment
+    time" for this task (see next item) is set to be equal to the current
+    value of the scheduling deadline;
+
+  - When the current time is equal to the replenishment time of a
+    throttled task, the scheduling deadline and the remaining runtime are
+    updated as::
+
+         scheduling deadline = scheduling deadline + period
+         remaining runtime = remaining runtime + runtime
+
+ The SCHED_FLAG_DL_OVERRUN flag in sched_attr's sched_flags field allows a task
+ to get informed about runtime overruns through the delivery of SIGXCPU
+ signals.
+
+
+2.2 Bandwidth reclaiming
+------------------------
+
+ Bandwidth reclaiming for deadline tasks is based on the GRUB (Greedy
+ Reclamation of Unused Bandwidth) algorithm [15, 16, 17] and it is enabled
+ when flag SCHED_FLAG_RECLAIM is set.
+
+ The following diagram illustrates the state names for tasks handled by GRUB::
+
+                             ------------
+                 (d)        |   Active   |
+              ------------->|            |
+              |             | Contending |
+              |              ------------
+              |                A      |
+          ----------           |      |
+         |          |          |      |
+         | Inactive |          |(b)   | (a)
+         |          |          |      |
+          ----------           |      |
+              A                |      V
+              |              ------------
+              |             |   Active   |
+              --------------|     Non    |
+                 (c)        | Contending |
+                             ------------
+
+ A task can be in one of the following states:
+
+  - ActiveContending: if it is ready for execution (or executing);
+
+  - ActiveNonContending: if it just blocked and has not yet surpassed the 0-lag
+    time;
+
+  - Inactive: if it is blocked and has surpassed the 0-lag time.
+
+ State transitions:
+
+  (a) When a task blocks, it does not become immediately inactive since its
+      bandwidth cannot be immediately reclaimed without breaking the
+      real-time guarantees. It therefore enters a transitional state called
+      ActiveNonContending. The scheduler arms the "inactive timer" to fire at
+      the 0-lag time, when the task's bandwidth can be reclaimed without
+      breaking the real-time guarantees.
+
+      The 0-lag time for a task entering the ActiveNonContending state is
+      computed as::
+
+                        (runtime * dl_period)
+             deadline - ---------------------
+                             dl_runtime
+
+      where runtime is the remaining runtime, while dl_runtime and dl_period
+      are the reservation parameters.
+
+  (b) If the task wakes up before the inactive timer fires, the task re-enters
+      the ActiveContending state and the "inactive timer" is canceled.
+      In addition, if the task wakes up on a different runqueue, then
+      the task's utilization must be removed from the previous runqueue's active
+      utilization and must be added to the new runqueue's active utilization.
+      In order to avoid races between a task waking up on a runqueue while the
+      "inactive timer" is running on a different CPU, the "dl_non_contending"
+      flag is used to indicate that a task is not on a runqueue but is active
+      (so, the flag is set when the task blocks and is cleared when the
+      "inactive timer" fires or when the task  wakes up).
+
+  (c) When the "inactive timer" fires, the task enters the Inactive state and
+      its utilization is removed from the runqueue's active utilization.
+
+  (d) When an inactive task wakes up, it enters the ActiveContending state and
+      its utilization is added to the active utilization of the runqueue where
+      it has been enqueued.
+
+ For each runqueue, the algorithm GRUB keeps track of two different bandwidths:
+
+  - Active bandwidth (running_bw): this is the sum of the bandwidths of all
+    tasks in active state (i.e., ActiveContending or ActiveNonContending);
+
+  - Total bandwidth (this_bw): this is the sum of all tasks "belonging" to the
+    runqueue, including the tasks in Inactive state.
+
+
+ The algorithm reclaims the bandwidth of the tasks in Inactive state.
+ It does so by decrementing the runtime of the executing task Ti at a pace equal
+ to
+
+           dq = -max{ Ui / Umax, (1 - Uinact - Uextra) } dt
+
+ where:
+
+  - Ui is the bandwidth of task Ti;
+  - Umax is the maximum reclaimable utilization (subjected to RT throttling
+    limits);
+  - Uinact is the (per runqueue) inactive utilization, computed as
+    (this_bq - running_bw);
+  - Uextra is the (per runqueue) extra reclaimable utilization
+    (subjected to RT throttling limits).
+
+
+ Let's now see a trivial example of two deadline tasks with runtime equal
+ to 4 and period equal to 8 (i.e., bandwidth equal to 0.5)::
+
+         A            Task T1
+         |
+         |                               |
+         |                               |
+         |--------                       |----
+         |       |                       V
+         |---|---|---|---|---|---|---|---|--------->t
+         0   1   2   3   4   5   6   7   8
+
+
+         A            Task T2
+         |
+         |                               |
+         |                               |
+         |       ------------------------|
+         |       |                       V
+         |---|---|---|---|---|---|---|---|--------->t
+         0   1   2   3   4   5   6   7   8
+
+
+         A            running_bw
+         |
+       1 -----------------               ------
+         |               |               |
+      0.5-               -----------------
+         |                               |
+         |---|---|---|---|---|---|---|---|--------->t
+         0   1   2   3   4   5   6   7   8
+
+
+  - Time t = 0:
+
+    Both tasks are ready for execution and therefore in ActiveContending state.
+    Suppose Task T1 is the first task to start execution.
+    Since there are no inactive tasks, its runtime is decreased as dq = -1 dt.
+
+  - Time t = 2:
+
+    Suppose that task T1 blocks
+    Task T1 therefore enters the ActiveNonContending state. Since its remaining
+    runtime is equal to 2, its 0-lag time is equal to t = 4.
+    Task T2 start execution, with runtime still decreased as dq = -1 dt since
+    there are no inactive tasks.
+
+  - Time t = 4:
+
+    This is the 0-lag time for Task T1. Since it didn't woken up in the
+    meantime, it enters the Inactive state. Its bandwidth is removed from
+    running_bw.
+    Task T2 continues its execution. However, its runtime is now decreased as
+    dq = - 0.5 dt because Uinact = 0.5.
+    Task T2 therefore reclaims the bandwidth unused by Task T1.
+
+  - Time t = 8:
+
+    Task T1 wakes up. It enters the ActiveContending state again, and the
+    running_bw is incremented.
+
+
+2.3 Energy-aware scheduling
+---------------------------
+
+ When cpufreq's schedutil governor is selected, SCHED_DEADLINE implements the
+ GRUB-PA [19] algorithm, reducing the CPU operating frequency to the minimum
+ value that still allows to meet the deadlines. This behavior is currently
+ implemented only for ARM architectures.
+
+ A particular care must be taken in case the time needed for changing frequency
+ is of the same order of magnitude of the reservation period. In such cases,
+ setting a fixed CPU frequency results in a lower amount of deadline misses.
+
+
+3. Scheduling Real-Time Tasks
+=============================
+
+
+
+ ..  BIG FAT WARNING ******************************************************
+
+ .. warning::
+
+   This section contains a (not-thorough) summary on classical deadline
+   scheduling theory, and how it applies to SCHED_DEADLINE.
+   The reader can "safely" skip to Section 4 if only interested in seeing
+   how the scheduling policy can be used. Anyway, we strongly recommend
+   to come back here and continue reading (once the urge for testing is
+   satisfied :P) to be sure of fully understanding all technical details.
+
+ .. ************************************************************************
+
+ There are no limitations on what kind of task can exploit this new
+ scheduling discipline, even if it must be said that it is particularly
+ suited for periodic or sporadic real-time tasks that need guarantees on their
+ timing behavior, e.g., multimedia, streaming, control applications, etc.
+
+3.1 Definitions
+------------------------
+
+ A typical real-time task is composed of a repetition of computation phases
+ (task instances, or jobs) which are activated on a periodic or sporadic
+ fashion.
+ Each job J_j (where J_j is the j^th job of the task) is characterized by an
+ arrival time r_j (the time when the job starts), an amount of computation
+ time c_j needed to finish the job, and a job absolute deadline d_j, which
+ is the time within which the job should be finished. The maximum execution
+ time max{c_j} is called "Worst Case Execution Time" (WCET) for the task.
+ A real-time task can be periodic with period P if r_{j+1} = r_j + P, or
+ sporadic with minimum inter-arrival time P is r_{j+1} >= r_j + P. Finally,
+ d_j = r_j + D, where D is the task's relative deadline.
+ Summing up, a real-time task can be described as
+
+	Task = (WCET, D, P)
+
+ The utilization of a real-time task is defined as the ratio between its
+ WCET and its period (or minimum inter-arrival time), and represents
+ the fraction of CPU time needed to execute the task.
+
+ If the total utilization U=sum(WCET_i/P_i) is larger than M (with M equal
+ to the number of CPUs), then the scheduler is unable to respect all the
+ deadlines.
+ Note that total utilization is defined as the sum of the utilizations
+ WCET_i/P_i over all the real-time tasks in the system. When considering
+ multiple real-time tasks, the parameters of the i-th task are indicated
+ with the "_i" suffix.
+ Moreover, if the total utilization is larger than M, then we risk starving
+ non- real-time tasks by real-time tasks.
+ If, instead, the total utilization is smaller than M, then non real-time
+ tasks will not be starved and the system might be able to respect all the
+ deadlines.
+ As a matter of fact, in this case it is possible to provide an upper bound
+ for tardiness (defined as the maximum between 0 and the difference
+ between the finishing time of a job and its absolute deadline).
+ More precisely, it can be proven that using a global EDF scheduler the
+ maximum tardiness of each task is smaller or equal than
+
+	((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max
+
+ where WCET_max = max{WCET_i} is the maximum WCET, WCET_min=min{WCET_i}
+ is the minimum WCET, and U_max = max{WCET_i/P_i} is the maximum
+ utilization[12].
+
+3.2 Schedulability Analysis for Uniprocessor Systems
+----------------------------------------------------
+
+ If M=1 (uniprocessor system), or in case of partitioned scheduling (each
+ real-time task is statically assigned to one and only one CPU), it is
+ possible to formally check if all the deadlines are respected.
+ If D_i = P_i for all tasks, then EDF is able to respect all the deadlines
+ of all the tasks executing on a CPU if and only if the total utilization
+ of the tasks running on such a CPU is smaller or equal than 1.
+ If D_i != P_i for some task, then it is possible to define the density of
+ a task as WCET_i/min{D_i,P_i}, and EDF is able to respect all the deadlines
+ of all the tasks running on a CPU if the sum of the densities of the tasks
+ running on such a CPU is smaller or equal than 1:
+
+	sum(WCET_i / min{D_i, P_i}) <= 1
+
+ It is important to notice that this condition is only sufficient, and not
+ necessary: there are task sets that are schedulable, but do not respect the
+ condition. For example, consider the task set {Task_1,Task_2} composed by
+ Task_1=(50ms,50ms,100ms) and Task_2=(10ms,100ms,100ms).
+ EDF is clearly able to schedule the two tasks without missing any deadline
+ (Task_1 is scheduled as soon as it is released, and finishes just in time
+ to respect its deadline; Task_2 is scheduled immediately after Task_1, hence
+ its response time cannot be larger than 50ms + 10ms = 60ms) even if
+
+	50 / min{50,100} + 10 / min{100, 100} = 50 / 50 + 10 / 100 = 1.1
+
+ Of course it is possible to test the exact schedulability of tasks with
+ D_i != P_i (checking a condition that is both sufficient and necessary),
+ but this cannot be done by comparing the total utilization or density with
+ a constant. Instead, the so called "processor demand" approach can be used,
+ computing the total amount of CPU time h(t) needed by all the tasks to
+ respect all of their deadlines in a time interval of size t, and comparing
+ such a time with the interval size t. If h(t) is smaller than t (that is,
+ the amount of time needed by the tasks in a time interval of size t is
+ smaller than the size of the interval) for all the possible values of t, then
+ EDF is able to schedule the tasks respecting all of their deadlines. Since
+ performing this check for all possible values of t is impossible, it has been
+ proven[4,5,6] that it is sufficient to perform the test for values of t
+ between 0 and a maximum value L. The cited papers contain all of the
+ mathematical details and explain how to compute h(t) and L.
+ In any case, this kind of analysis is too complex as well as too
+ time-consuming to be performed on-line. Hence, as explained in Section
+ 4 Linux uses an admission test based on the tasks' utilizations.
+
+3.3 Schedulability Analysis for Multiprocessor Systems
+------------------------------------------------------
+
+ On multiprocessor systems with global EDF scheduling (non partitioned
+ systems), a sufficient test for schedulability can not be based on the
+ utilizations or densities: it can be shown that even if D_i = P_i task
+ sets with utilizations slightly larger than 1 can miss deadlines regardless
+ of the number of CPUs.
+
+ Consider a set {Task_1,...Task_{M+1}} of M+1 tasks on a system with M
+ CPUs, with the first task Task_1=(P,P,P) having period, relative deadline
+ and WCET equal to P. The remaining M tasks Task_i=(e,P-1,P-1) have an
+ arbitrarily small worst case execution time (indicated as "e" here) and a
+ period smaller than the one of the first task. Hence, if all the tasks
+ activate at the same time t, global EDF schedules these M tasks first
+ (because their absolute deadlines are equal to t + P - 1, hence they are
+ smaller than the absolute deadline of Task_1, which is t + P). As a
+ result, Task_1 can be scheduled only at time t + e, and will finish at
+ time t + e + P, after its absolute deadline. The total utilization of the
+ task set is U = M · e / (P - 1) + P / P = M · e / (P - 1) + 1, and for small
+ values of e this can become very close to 1. This is known as "Dhall's
+ effect"[7]. Note: the example in the original paper by Dhall has been
+ slightly simplified here (for example, Dhall more correctly computed
+ lim_{e->0}U).
+
+ More complex schedulability tests for global EDF have been developed in
+ real-time literature[8,9], but they are not based on a simple comparison
+ between total utilization (or density) and a fixed constant. If all tasks
+ have D_i = P_i, a sufficient schedulability condition can be expressed in
+ a simple way:
+
+	sum(WCET_i / P_i) <= M - (M - 1) · U_max
+
+ where U_max = max{WCET_i / P_i}[10]. Notice that for U_max = 1,
+ M - (M - 1) · U_max becomes M - M + 1 = 1 and this schedulability condition
+ just confirms the Dhall's effect. A more complete survey of the literature
+ about schedulability tests for multi-processor real-time scheduling can be
+ found in [11].
+
+ As seen, enforcing that the total utilization is smaller than M does not
+ guarantee that global EDF schedules the tasks without missing any deadline
+ (in other words, global EDF is not an optimal scheduling algorithm). However,
+ a total utilization smaller than M is enough to guarantee that non real-time
+ tasks are not starved and that the tardiness of real-time tasks has an upper
+ bound[12] (as previously noted). Different bounds on the maximum tardiness
+ experienced by real-time tasks have been developed in various papers[13,14],
+ but the theoretical result that is important for SCHED_DEADLINE is that if
+ the total utilization is smaller or equal than M then the response times of
+ the tasks are limited.
+
+3.4 Relationship with SCHED_DEADLINE Parameters
+-----------------------------------------------
+
+ Finally, it is important to understand the relationship between the
+ SCHED_DEADLINE scheduling parameters described in Section 2 (runtime,
+ deadline and period) and the real-time task parameters (WCET, D, P)
+ described in this section. Note that the tasks' temporal constraints are
+ represented by its absolute deadlines d_j = r_j + D described above, while
+ SCHED_DEADLINE schedules the tasks according to scheduling deadlines (see
+ Section 2).
+ If an admission test is used to guarantee that the scheduling deadlines
+ are respected, then SCHED_DEADLINE can be used to schedule real-time tasks
+ guaranteeing that all the jobs' deadlines of a task are respected.
+ In order to do this, a task must be scheduled by setting:
+
+  - runtime >= WCET
+  - deadline = D
+  - period <= P
+
+ IOW, if runtime >= WCET and if period is <= P, then the scheduling deadlines
+ and the absolute deadlines (d_j) coincide, so a proper admission control
+ allows to respect the jobs' absolute deadlines for this task (this is what is
+ called "hard schedulability property" and is an extension of Lemma 1 of [2]).
+ Notice that if runtime > deadline the admission control will surely reject
+ this task, as it is not possible to respect its temporal constraints.
+
+ References:
+
+  1 - C. L. Liu and J. W. Layland. Scheduling algorithms for multiprogram-
+      ming in a hard-real-time environment. Journal of the Association for
+      Computing Machinery, 20(1), 1973.
+  2 - L. Abeni , G. Buttazzo. Integrating Multimedia Applications in Hard
+      Real-Time Systems. Proceedings of the 19th IEEE Real-time Systems
+      Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf
+  3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab
+      Technical Report. http://disi.unitn.it/~abeni/tr-98-01.pdf
+  4 - J. Y. Leung and M.L. Merril. A Note on Preemptive Scheduling of
+      Periodic, Real-Time Tasks. Information Processing Letters, vol. 11,
+      no. 3, pp. 115-118, 1980.
+  5 - S. K. Baruah, A. K. Mok and L. E. Rosier. Preemptively Scheduling
+      Hard-Real-Time Sporadic Tasks on One Processor. Proceedings of the
+      11th IEEE Real-time Systems Symposium, 1990.
+  6 - S. K. Baruah, L. E. Rosier and R. R. Howell. Algorithms and Complexity
+      Concerning the Preemptive Scheduling of Periodic Real-Time tasks on
+      One Processor. Real-Time Systems Journal, vol. 4, no. 2, pp 301-324,
+      1990.
+  7 - S. J. Dhall and C. L. Liu. On a real-time scheduling problem. Operations
+      research, vol. 26, no. 1, pp 127-140, 1978.
+  8 - T. Baker. Multiprocessor EDF and Deadline Monotonic Schedulability
+      Analysis. Proceedings of the 24th IEEE Real-Time Systems Symposium, 2003.
+  9 - T. Baker. An Analysis of EDF Schedulability on a Multiprocessor.
+      IEEE Transactions on Parallel and Distributed Systems, vol. 16, no. 8,
+      pp 760-768, 2005.
+  10 - J. Goossens, S. Funk and S. Baruah, Priority-Driven Scheduling of
+       Periodic Task Systems on Multiprocessors. Real-Time Systems Journal,
+       vol. 25, no. 2–3, pp. 187–205, 2003.
+  11 - R. Davis and A. Burns. A Survey of Hard Real-Time Scheduling for
+       Multiprocessor Systems. ACM Computing Surveys, vol. 43, no. 4, 2011.
+       http://www-users.cs.york.ac.uk/~robdavis/papers/MPSurveyv5.0.pdf
+  12 - U. C. Devi and J. H. Anderson. Tardiness Bounds under Global EDF
+       Scheduling on a Multiprocessor. Real-Time Systems Journal, vol. 32,
+       no. 2, pp 133-189, 2008.
+  13 - P. Valente and G. Lipari. An Upper Bound to the Lateness of Soft
+       Real-Time Tasks Scheduled by EDF on Multiprocessors. Proceedings of
+       the 26th IEEE Real-Time Systems Symposium, 2005.
+  14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for
+       Global EDF. Proceedings of the 22nd Euromicro Conference on
+       Real-Time Systems, 2010.
+  15 - G. Lipari, S. Baruah, Greedy reclamation of unused bandwidth in
+       constant-bandwidth servers, 12th IEEE Euromicro Conference on Real-Time
+       Systems, 2000.
+  16 - L. Abeni, J. Lelli, C. Scordino, L. Palopoli, Greedy CPU reclaiming for
+       SCHED DEADLINE. In Proceedings of the Real-Time Linux Workshop (RTLWS),
+       Dusseldorf, Germany, 2014.
+  17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel
+       or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied
+       Computing, 2016.
+  18 - J. Lelli, C. Scordino, L. Abeni, D. Faggioli, Deadline scheduling in the
+       Linux kernel, Software: Practice and Experience, 46(6): 821-839, June
+       2016.
+  19 - C. Scordino, L. Abeni, J. Lelli, Energy-Aware Real-Time Scheduling in
+       the Linux Kernel, 33rd ACM/SIGAPP Symposium On Applied Computing (SAC
+       2018), Pau, France, April 2018.
+
+
+4. Bandwidth management
+=======================
+
+ As previously mentioned, in order for -deadline scheduling to be
+ effective and useful (that is, to be able to provide "runtime" time units
+ within "deadline"), it is important to have some method to keep the allocation
+ of the available fractions of CPU time to the various tasks under control.
+ This is usually called "admission control" and if it is not performed, then
+ no guarantee can be given on the actual scheduling of the -deadline tasks.
+
+ As already stated in Section 3, a necessary condition to be respected to
+ correctly schedule a set of real-time tasks is that the total utilization
+ is smaller than M. When talking about -deadline tasks, this requires that
+ the sum of the ratio between runtime and period for all tasks is smaller
+ than M. Notice that the ratio runtime/period is equivalent to the utilization
+ of a "traditional" real-time task, and is also often referred to as
+ "bandwidth".
+ The interface used to control the CPU bandwidth that can be allocated
+ to -deadline tasks is similar to the one already used for -rt
+ tasks with real-time group scheduling (a.k.a. RT-throttling - see
+ Documentation/scheduler/sched-rt-group.rst), and is based on readable/
+ writable control files located in procfs (for system wide settings).
+ Notice that per-group settings (controlled through cgroupfs) are still not
+ defined for -deadline tasks, because more discussion is needed in order to
+ figure out how we want to manage SCHED_DEADLINE bandwidth at the task group
+ level.
+
+ A main difference between deadline bandwidth management and RT-throttling
+ is that -deadline tasks have bandwidth on their own (while -rt ones don't!),
+ and thus we don't need a higher level throttling mechanism to enforce the
+ desired bandwidth. In other words, this means that interface parameters are
+ only used at admission control time (i.e., when the user calls
+ sched_setattr()). Scheduling is then performed considering actual tasks'
+ parameters, so that CPU bandwidth is allocated to SCHED_DEADLINE tasks
+ respecting their needs in terms of granularity. Therefore, using this simple
+ interface we can put a cap on total utilization of -deadline tasks (i.e.,
+ \Sum (runtime_i / period_i) < global_dl_utilization_cap).
+
+4.1 System wide settings
+------------------------
+
+ The system wide settings are configured under the /proc virtual file system.
+
+ For now the -rt knobs are used for -deadline admission control and the
+ -deadline runtime is accounted against the -rt runtime. We realize that this
+ isn't entirely desirable; however, it is better to have a small interface for
+ now, and be able to change it easily later. The ideal situation (see 5.) is to
+ run -rt tasks from a -deadline server; in which case the -rt bandwidth is a
+ direct subset of dl_bw.
+
+ This means that, for a root_domain comprising M CPUs, -deadline tasks
+ can be created while the sum of their bandwidths stays below:
+
+   M * (sched_rt_runtime_us / sched_rt_period_us)
+
+ It is also possible to disable this bandwidth management logic, and
+ be thus free of oversubscribing the system up to any arbitrary level.
+ This is done by writing -1 in /proc/sys/kernel/sched_rt_runtime_us.
+
+
+4.2 Task interface
+------------------
+
+ Specifying a periodic/sporadic task that executes for a given amount of
+ runtime at each instance, and that is scheduled according to the urgency of
+ its own timing constraints needs, in general, a way of declaring:
+
+  - a (maximum/typical) instance execution time,
+  - a minimum interval between consecutive instances,
+  - a time constraint by which each instance must be completed.
+
+ Therefore:
+
+  * a new struct sched_attr, containing all the necessary fields is
+    provided;
+  * the new scheduling related syscalls that manipulate it, i.e.,
+    sched_setattr() and sched_getattr() are implemented.
+
+ For debugging purposes, the leftover runtime and absolute deadline of a
+ SCHED_DEADLINE task can be retrieved through /proc/<pid>/sched (entries
+ dl.runtime and dl.deadline, both values in ns). A programmatic way to
+ retrieve these values from production code is under discussion.
+
+
+4.3 Default behavior
+---------------------
+
+ The default value for SCHED_DEADLINE bandwidth is to have rt_runtime equal to
+ 950000. With rt_period equal to 1000000, by default, it means that -deadline
+ tasks can use at most 95%, multiplied by the number of CPUs that compose the
+ root_domain, for each root_domain.
+ This means that non -deadline tasks will receive at least 5% of the CPU time,
+ and that -deadline tasks will receive their runtime with a guaranteed
+ worst-case delay respect to the "deadline" parameter. If "deadline" = "period"
+ and the cpuset mechanism is used to implement partitioned scheduling (see
+ Section 5), then this simple setting of the bandwidth management is able to
+ deterministically guarantee that -deadline tasks will receive their runtime
+ in a period.
+
+ Finally, notice that in order not to jeopardize the admission control a
+ -deadline task cannot fork.
+
+
+4.4 Behavior of sched_yield()
+-----------------------------
+
+ When a SCHED_DEADLINE task calls sched_yield(), it gives up its
+ remaining runtime and is immediately throttled, until the next
+ period, when its runtime will be replenished (a special flag
+ dl_yielded is set and used to handle correctly throttling and runtime
+ replenishment after a call to sched_yield()).
+
+ This behavior of sched_yield() allows the task to wake-up exactly at
+ the beginning of the next period. Also, this may be useful in the
+ future with bandwidth reclaiming mechanisms, where sched_yield() will
+ make the leftoever runtime available for reclamation by other
+ SCHED_DEADLINE tasks.
+
+
+5. Tasks CPU affinity
+=====================
+
+ -deadline tasks cannot have an affinity mask smaller that the entire
+ root_domain they are created on. However, affinities can be specified
+ through the cpuset facility (Documentation/admin-guide/cgroup-v1/cpusets.rst).
+
+5.1 SCHED_DEADLINE and cpusets HOWTO
+------------------------------------
+
+ An example of a simple configuration (pin a -deadline task to CPU0)
+ follows (rt-app is used to create a -deadline task)::
+
+   mkdir /dev/cpuset
+   mount -t cgroup -o cpuset cpuset /dev/cpuset
+   cd /dev/cpuset
+   mkdir cpu0
+   echo 0 > cpu0/cpuset.cpus
+   echo 0 > cpu0/cpuset.mems
+   echo 1 > cpuset.cpu_exclusive
+   echo 0 > cpuset.sched_load_balance
+   echo 1 > cpu0/cpuset.cpu_exclusive
+   echo 1 > cpu0/cpuset.mem_exclusive
+   echo $$ > cpu0/tasks
+   rt-app -t 100000:10000:d:0 -D5 # it is now actually superfluous to specify
+				  # task affinity
+
+6. Future plans
+===============
+
+ Still missing:
+
+  - programmatic way to retrieve current runtime and absolute deadline
+  - refinements to deadline inheritance, especially regarding the possibility
+    of retaining bandwidth isolation among non-interacting tasks. This is
+    being studied from both theoretical and practical points of view, and
+    hopefully we should be able to produce some demonstrative code soon;
+  - (c)group based bandwidth management, and maybe scheduling;
+  - access control for non-root users (and related security concerns to
+    address), which is the best way to allow unprivileged use of the mechanisms
+    and how to prevent non-root users "cheat" the system?
+
+ As already discussed, we are planning also to merge this work with the EDF
+ throttling patches [https://lkml.org/lkml/2010/2/23/239] but we still are in
+ the preliminary phases of the merge and we really seek feedback that would
+ help us decide on the direction it should take.
+
+Appendix A. Test suite
+======================
+
+ The SCHED_DEADLINE policy can be easily tested using two applications that
+ are part of a wider Linux Scheduler validation suite. The suite is
+ available as a GitHub repository: https://github.com/scheduler-tools.
+
+ The first testing application is called rt-app and can be used to
+ start multiple threads with specific parameters. rt-app supports
+ SCHED_{OTHER,FIFO,RR,DEADLINE} scheduling policies and their related
+ parameters (e.g., niceness, priority, runtime/deadline/period). rt-app
+ is a valuable tool, as it can be used to synthetically recreate certain
+ workloads (maybe mimicking real use-cases) and evaluate how the scheduler
+ behaves under such workloads. In this way, results are easily reproducible.
+ rt-app is available at: https://github.com/scheduler-tools/rt-app.
+
+ Thread parameters can be specified from the command line, with something like
+ this::
+
+  # rt-app -t 100000:10000:d -t 150000:20000:f:10 -D5
+
+ The above creates 2 threads. The first one, scheduled by SCHED_DEADLINE,
+ executes for 10ms every 100ms. The second one, scheduled at SCHED_FIFO
+ priority 10, executes for 20ms every 150ms. The test will run for a total
+ of 5 seconds.
+
+ More interestingly, configurations can be described with a json file that
+ can be passed as input to rt-app with something like this::
+
+  # rt-app my_config.json
+
+ The parameters that can be specified with the second method are a superset
+ of the command line options. Please refer to rt-app documentation for more
+ details (`<rt-app-sources>/doc/*.json`).
+
+ The second testing application is a modification of schedtool, called
+ schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a
+ certain pid/application. schedtool-dl is available at:
+ https://github.com/scheduler-tools/schedtool-dl.git.
+
+ The usage is straightforward::
+
+  # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app
+
+ With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation
+ of 10ms every 100ms (note that parameters are expressed in microseconds).
+ You can also use schedtool to create a reservation for an already running
+ application, given that you know its pid::
+
+  # schedtool -E -t 10000000:100000000 my_app_pid
+
+Appendix B. Minimal main()
+==========================
+
+ We provide in what follows a simple (ugly) self-contained code snippet
+ showing how SCHED_DEADLINE reservations can be created by a real-time
+ application developer::
+
+   #define _GNU_SOURCE
+   #include <unistd.h>
+   #include <stdio.h>
+   #include <stdlib.h>
+   #include <string.h>
+   #include <time.h>
+   #include <linux/unistd.h>
+   #include <linux/kernel.h>
+   #include <linux/types.h>
+   #include <sys/syscall.h>
+   #include <pthread.h>
+
+   #define gettid() syscall(__NR_gettid)
+
+   #define SCHED_DEADLINE	6
+
+   /* XXX use the proper syscall numbers */
+   #ifdef __x86_64__
+   #define __NR_sched_setattr		314
+   #define __NR_sched_getattr		315
+   #endif
+
+   #ifdef __i386__
+   #define __NR_sched_setattr		351
+   #define __NR_sched_getattr		352
+   #endif
+
+   #ifdef __arm__
+   #define __NR_sched_setattr		380
+   #define __NR_sched_getattr		381
+   #endif
+
+   static volatile int done;
+
+   struct sched_attr {
+	__u32 size;
+
+	__u32 sched_policy;
+	__u64 sched_flags;
+
+	/* SCHED_NORMAL, SCHED_BATCH */
+	__s32 sched_nice;
+
+	/* SCHED_FIFO, SCHED_RR */
+	__u32 sched_priority;
+
+	/* SCHED_DEADLINE (nsec) */
+	__u64 sched_runtime;
+	__u64 sched_deadline;
+	__u64 sched_period;
+   };
+
+   int sched_setattr(pid_t pid,
+		  const struct sched_attr *attr,
+		  unsigned int flags)
+   {
+	return syscall(__NR_sched_setattr, pid, attr, flags);
+   }
+
+   int sched_getattr(pid_t pid,
+		  struct sched_attr *attr,
+		  unsigned int size,
+		  unsigned int flags)
+   {
+	return syscall(__NR_sched_getattr, pid, attr, size, flags);
+   }
+
+   void *run_deadline(void *data)
+   {
+	struct sched_attr attr;
+	int x = 0;
+	int ret;
+	unsigned int flags = 0;
+
+	printf("deadline thread started [%ld]\n", gettid());
+
+	attr.size = sizeof(attr);
+	attr.sched_flags = 0;
+	attr.sched_nice = 0;
+	attr.sched_priority = 0;
+
+	/* This creates a 10ms/30ms reservation */
+	attr.sched_policy = SCHED_DEADLINE;
+	attr.sched_runtime = 10 * 1000 * 1000;
+	attr.sched_period = attr.sched_deadline = 30 * 1000 * 1000;
+
+	ret = sched_setattr(0, &attr, flags);
+	if (ret < 0) {
+		done = 0;
+		perror("sched_setattr");
+		exit(-1);
+	}
+
+	while (!done) {
+		x++;
+	}
+
+	printf("deadline thread dies [%ld]\n", gettid());
+	return NULL;
+   }
+
+   int main (int argc, char **argv)
+   {
+	pthread_t thread;
+
+	printf("main thread [%ld]\n", gettid());
+
+	pthread_create(&thread, NULL, run_deadline, NULL);
+
+	sleep(10);
+
+	done = 1;
+	pthread_join(thread, NULL);
+
+	printf("main dies [%ld]\n", gettid());
+	return 0;
+   }
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
deleted file mode 100644
index b14e03ff3528..000000000000
--- a/Documentation/scheduler/sched-deadline.txt
+++ /dev/null
@@ -1,871 +0,0 @@
-			  Deadline Task Scheduling
-			  ------------------------
-
-CONTENTS
-========
-
- 0. WARNING
- 1. Overview
- 2. Scheduling algorithm
-   2.1 Main algorithm
-   2.2 Bandwidth reclaiming
- 3. Scheduling Real-Time Tasks
-   3.1 Definitions
-   3.2 Schedulability Analysis for Uniprocessor Systems
-   3.3 Schedulability Analysis for Multiprocessor Systems
-   3.4 Relationship with SCHED_DEADLINE Parameters
- 4. Bandwidth management
-   4.1 System-wide settings
-   4.2 Task interface
-   4.3 Default behavior
-   4.4 Behavior of sched_yield()
- 5. Tasks CPU affinity
-   5.1 SCHED_DEADLINE and cpusets HOWTO
- 6. Future plans
- A. Test suite
- B. Minimal main()
-
-
-0. WARNING
-==========
-
- Fiddling with these settings can result in an unpredictable or even unstable
- system behavior. As for -rt (group) scheduling, it is assumed that root users
- know what they're doing.
-
-
-1. Overview
-===========
-
- The SCHED_DEADLINE policy contained inside the sched_dl scheduling class is
- basically an implementation of the Earliest Deadline First (EDF) scheduling
- algorithm, augmented with a mechanism (called Constant Bandwidth Server, CBS)
- that makes it possible to isolate the behavior of tasks between each other.
-
-
-2. Scheduling algorithm
-==================
-
-2.1 Main algorithm
-------------------
-
- SCHED_DEADLINE [18] uses three parameters, named "runtime", "period", and
- "deadline", to schedule tasks. A SCHED_DEADLINE task should receive
- "runtime" microseconds of execution time every "period" microseconds, and
- these "runtime" microseconds are available within "deadline" microseconds
- from the beginning of the period.  In order to implement this behavior,
- every time the task wakes up, the scheduler computes a "scheduling deadline"
- consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then
- scheduled using EDF[1] on these scheduling deadlines (the task with the
- earliest scheduling deadline is selected for execution). Notice that the
- task actually receives "runtime" time units within "deadline" if a proper
- "admission control" strategy (see Section "4. Bandwidth management") is used
- (clearly, if the system is overloaded this guarantee cannot be respected).
-
- Summing up, the CBS[2,3] algorithm assigns scheduling deadlines to tasks so
- that each task runs for at most its runtime every period, avoiding any
- interference between different tasks (bandwidth isolation), while the EDF[1]
- algorithm selects the task with the earliest scheduling deadline as the one
- to be executed next. Thanks to this feature, tasks that do not strictly comply
- with the "traditional" real-time task model (see Section 3) can effectively
- use the new policy.
-
- In more details, the CBS algorithm assigns scheduling deadlines to
- tasks in the following way:
-
-  - Each SCHED_DEADLINE task is characterized by the "runtime",
-    "deadline", and "period" parameters;
-
-  - The state of the task is described by a "scheduling deadline", and
-    a "remaining runtime". These two parameters are initially set to 0;
-
-  - When a SCHED_DEADLINE task wakes up (becomes ready for execution),
-    the scheduler checks if
-
-                 remaining runtime                  runtime
-        ----------------------------------    >    ---------
-        scheduling deadline - current time           period
-
-    then, if the scheduling deadline is smaller than the current time, or
-    this condition is verified, the scheduling deadline and the
-    remaining runtime are re-initialized as
-
-         scheduling deadline = current time + deadline
-         remaining runtime = runtime
-
-    otherwise, the scheduling deadline and the remaining runtime are
-    left unchanged;
-
-  - When a SCHED_DEADLINE task executes for an amount of time t, its
-    remaining runtime is decreased as
-
-         remaining runtime = remaining runtime - t
-
-    (technically, the runtime is decreased at every tick, or when the
-    task is descheduled / preempted);
-
-  - When the remaining runtime becomes less or equal than 0, the task is
-    said to be "throttled" (also known as "depleted" in real-time literature)
-    and cannot be scheduled until its scheduling deadline. The "replenishment
-    time" for this task (see next item) is set to be equal to the current
-    value of the scheduling deadline;
-
-  - When the current time is equal to the replenishment time of a
-    throttled task, the scheduling deadline and the remaining runtime are
-    updated as
-
-         scheduling deadline = scheduling deadline + period
-         remaining runtime = remaining runtime + runtime
-
- The SCHED_FLAG_DL_OVERRUN flag in sched_attr's sched_flags field allows a task
- to get informed about runtime overruns through the delivery of SIGXCPU
- signals.
-
-
-2.2 Bandwidth reclaiming
-------------------------
-
- Bandwidth reclaiming for deadline tasks is based on the GRUB (Greedy
- Reclamation of Unused Bandwidth) algorithm [15, 16, 17] and it is enabled
- when flag SCHED_FLAG_RECLAIM is set.
-
- The following diagram illustrates the state names for tasks handled by GRUB:
-
-                             ------------
-                 (d)        |   Active   |
-              ------------->|            |
-              |             | Contending |
-              |              ------------
-              |                A      |
-          ----------           |      |
-         |          |          |      |
-         | Inactive |          |(b)   | (a)
-         |          |          |      |
-          ----------           |      |
-              A                |      V
-              |              ------------
-              |             |   Active   |
-              --------------|     Non    |
-                 (c)        | Contending |
-                             ------------
-
- A task can be in one of the following states:
-
-  - ActiveContending: if it is ready for execution (or executing);
-
-  - ActiveNonContending: if it just blocked and has not yet surpassed the 0-lag
-    time;
-
-  - Inactive: if it is blocked and has surpassed the 0-lag time.
-
- State transitions:
-
-  (a) When a task blocks, it does not become immediately inactive since its
-      bandwidth cannot be immediately reclaimed without breaking the
-      real-time guarantees. It therefore enters a transitional state called
-      ActiveNonContending. The scheduler arms the "inactive timer" to fire at
-      the 0-lag time, when the task's bandwidth can be reclaimed without
-      breaking the real-time guarantees.
-
-      The 0-lag time for a task entering the ActiveNonContending state is
-      computed as
-
-                        (runtime * dl_period)
-             deadline - ---------------------
-                             dl_runtime
-
-      where runtime is the remaining runtime, while dl_runtime and dl_period
-      are the reservation parameters.
-
-  (b) If the task wakes up before the inactive timer fires, the task re-enters
-      the ActiveContending state and the "inactive timer" is canceled.
-      In addition, if the task wakes up on a different runqueue, then
-      the task's utilization must be removed from the previous runqueue's active
-      utilization and must be added to the new runqueue's active utilization.
-      In order to avoid races between a task waking up on a runqueue while the
-       "inactive timer" is running on a different CPU, the "dl_non_contending"
-      flag is used to indicate that a task is not on a runqueue but is active
-      (so, the flag is set when the task blocks and is cleared when the
-      "inactive timer" fires or when the task  wakes up).
-
-  (c) When the "inactive timer" fires, the task enters the Inactive state and
-      its utilization is removed from the runqueue's active utilization.
-
-  (d) When an inactive task wakes up, it enters the ActiveContending state and
-      its utilization is added to the active utilization of the runqueue where
-      it has been enqueued.
-
- For each runqueue, the algorithm GRUB keeps track of two different bandwidths:
-
-  - Active bandwidth (running_bw): this is the sum of the bandwidths of all
-    tasks in active state (i.e., ActiveContending or ActiveNonContending);
-
-  - Total bandwidth (this_bw): this is the sum of all tasks "belonging" to the
-    runqueue, including the tasks in Inactive state.
-
-
- The algorithm reclaims the bandwidth of the tasks in Inactive state.
- It does so by decrementing the runtime of the executing task Ti at a pace equal
- to
-
-           dq = -max{ Ui / Umax, (1 - Uinact - Uextra) } dt
-
- where:
-
-  - Ui is the bandwidth of task Ti;
-  - Umax is the maximum reclaimable utilization (subjected to RT throttling
-    limits);
-  - Uinact is the (per runqueue) inactive utilization, computed as
-    (this_bq - running_bw);
-  - Uextra is the (per runqueue) extra reclaimable utilization
-    (subjected to RT throttling limits).
-
-
- Let's now see a trivial example of two deadline tasks with runtime equal
- to 4 and period equal to 8 (i.e., bandwidth equal to 0.5):
-
-     A            Task T1
-     |
-     |                               |
-     |                               |
-     |--------                       |----
-     |       |                       V
-     |---|---|---|---|---|---|---|---|--------->t
-     0   1   2   3   4   5   6   7   8
-
-
-     A            Task T2
-     |
-     |                               |
-     |                               |
-     |       ------------------------|
-     |       |                       V
-     |---|---|---|---|---|---|---|---|--------->t
-     0   1   2   3   4   5   6   7   8
-
-
-     A            running_bw
-     |
-   1 -----------------               ------
-     |               |               |
-  0.5-               -----------------
-     |                               |
-     |---|---|---|---|---|---|---|---|--------->t
-     0   1   2   3   4   5   6   7   8
-
-
-  - Time t = 0:
-
-    Both tasks are ready for execution and therefore in ActiveContending state.
-    Suppose Task T1 is the first task to start execution.
-    Since there are no inactive tasks, its runtime is decreased as dq = -1 dt.
-
-  - Time t = 2:
-
-    Suppose that task T1 blocks
-    Task T1 therefore enters the ActiveNonContending state. Since its remaining
-    runtime is equal to 2, its 0-lag time is equal to t = 4.
-    Task T2 start execution, with runtime still decreased as dq = -1 dt since
-    there are no inactive tasks.
-
-  - Time t = 4:
-
-    This is the 0-lag time for Task T1. Since it didn't woken up in the
-    meantime, it enters the Inactive state. Its bandwidth is removed from
-    running_bw.
-    Task T2 continues its execution. However, its runtime is now decreased as
-    dq = - 0.5 dt because Uinact = 0.5.
-    Task T2 therefore reclaims the bandwidth unused by Task T1.
-
-  - Time t = 8:
-
-    Task T1 wakes up. It enters the ActiveContending state again, and the
-    running_bw is incremented.
-
-
-2.3 Energy-aware scheduling
-------------------------
-
- When cpufreq's schedutil governor is selected, SCHED_DEADLINE implements the
- GRUB-PA [19] algorithm, reducing the CPU operating frequency to the minimum
- value that still allows to meet the deadlines. This behavior is currently
- implemented only for ARM architectures.
-
- A particular care must be taken in case the time needed for changing frequency
- is of the same order of magnitude of the reservation period. In such cases,
- setting a fixed CPU frequency results in a lower amount of deadline misses.
-
-
-3. Scheduling Real-Time Tasks
-=============================
-
- * BIG FAT WARNING ******************************************************
- *
- * This section contains a (not-thorough) summary on classical deadline
- * scheduling theory, and how it applies to SCHED_DEADLINE.
- * The reader can "safely" skip to Section 4 if only interested in seeing
- * how the scheduling policy can be used. Anyway, we strongly recommend
- * to come back here and continue reading (once the urge for testing is
- * satisfied :P) to be sure of fully understanding all technical details.
- ************************************************************************
-
- There are no limitations on what kind of task can exploit this new
- scheduling discipline, even if it must be said that it is particularly
- suited for periodic or sporadic real-time tasks that need guarantees on their
- timing behavior, e.g., multimedia, streaming, control applications, etc.
-
-3.1 Definitions
-------------------------
-
- A typical real-time task is composed of a repetition of computation phases
- (task instances, or jobs) which are activated on a periodic or sporadic
- fashion.
- Each job J_j (where J_j is the j^th job of the task) is characterized by an
- arrival time r_j (the time when the job starts), an amount of computation
- time c_j needed to finish the job, and a job absolute deadline d_j, which
- is the time within which the job should be finished. The maximum execution
- time max{c_j} is called "Worst Case Execution Time" (WCET) for the task.
- A real-time task can be periodic with period P if r_{j+1} = r_j + P, or
- sporadic with minimum inter-arrival time P is r_{j+1} >= r_j + P. Finally,
- d_j = r_j + D, where D is the task's relative deadline.
- Summing up, a real-time task can be described as
-	Task = (WCET, D, P)
-
- The utilization of a real-time task is defined as the ratio between its
- WCET and its period (or minimum inter-arrival time), and represents
- the fraction of CPU time needed to execute the task.
-
- If the total utilization U=sum(WCET_i/P_i) is larger than M (with M equal
- to the number of CPUs), then the scheduler is unable to respect all the
- deadlines.
- Note that total utilization is defined as the sum of the utilizations
- WCET_i/P_i over all the real-time tasks in the system. When considering
- multiple real-time tasks, the parameters of the i-th task are indicated
- with the "_i" suffix.
- Moreover, if the total utilization is larger than M, then we risk starving
- non- real-time tasks by real-time tasks.
- If, instead, the total utilization is smaller than M, then non real-time
- tasks will not be starved and the system might be able to respect all the
- deadlines.
- As a matter of fact, in this case it is possible to provide an upper bound
- for tardiness (defined as the maximum between 0 and the difference
- between the finishing time of a job and its absolute deadline).
- More precisely, it can be proven that using a global EDF scheduler the
- maximum tardiness of each task is smaller or equal than
-	((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max
- where WCET_max = max{WCET_i} is the maximum WCET, WCET_min=min{WCET_i}
- is the minimum WCET, and U_max = max{WCET_i/P_i} is the maximum
- utilization[12].
-
-3.2 Schedulability Analysis for Uniprocessor Systems
-------------------------
-
- If M=1 (uniprocessor system), or in case of partitioned scheduling (each
- real-time task is statically assigned to one and only one CPU), it is
- possible to formally check if all the deadlines are respected.
- If D_i = P_i for all tasks, then EDF is able to respect all the deadlines
- of all the tasks executing on a CPU if and only if the total utilization
- of the tasks running on such a CPU is smaller or equal than 1.
- If D_i != P_i for some task, then it is possible to define the density of
- a task as WCET_i/min{D_i,P_i}, and EDF is able to respect all the deadlines
- of all the tasks running on a CPU if the sum of the densities of the tasks
- running on such a CPU is smaller or equal than 1:
-	sum(WCET_i / min{D_i, P_i}) <= 1
- It is important to notice that this condition is only sufficient, and not
- necessary: there are task sets that are schedulable, but do not respect the
- condition. For example, consider the task set {Task_1,Task_2} composed by
- Task_1=(50ms,50ms,100ms) and Task_2=(10ms,100ms,100ms).
- EDF is clearly able to schedule the two tasks without missing any deadline
- (Task_1 is scheduled as soon as it is released, and finishes just in time
- to respect its deadline; Task_2 is scheduled immediately after Task_1, hence
- its response time cannot be larger than 50ms + 10ms = 60ms) even if
-	50 / min{50,100} + 10 / min{100, 100} = 50 / 50 + 10 / 100 = 1.1
- Of course it is possible to test the exact schedulability of tasks with
- D_i != P_i (checking a condition that is both sufficient and necessary),
- but this cannot be done by comparing the total utilization or density with
- a constant. Instead, the so called "processor demand" approach can be used,
- computing the total amount of CPU time h(t) needed by all the tasks to
- respect all of their deadlines in a time interval of size t, and comparing
- such a time with the interval size t. If h(t) is smaller than t (that is,
- the amount of time needed by the tasks in a time interval of size t is
- smaller than the size of the interval) for all the possible values of t, then
- EDF is able to schedule the tasks respecting all of their deadlines. Since
- performing this check for all possible values of t is impossible, it has been
- proven[4,5,6] that it is sufficient to perform the test for values of t
- between 0 and a maximum value L. The cited papers contain all of the
- mathematical details and explain how to compute h(t) and L.
- In any case, this kind of analysis is too complex as well as too
- time-consuming to be performed on-line. Hence, as explained in Section
- 4 Linux uses an admission test based on the tasks' utilizations.
-
-3.3 Schedulability Analysis for Multiprocessor Systems
-------------------------
-
- On multiprocessor systems with global EDF scheduling (non partitioned
- systems), a sufficient test for schedulability can not be based on the
- utilizations or densities: it can be shown that even if D_i = P_i task
- sets with utilizations slightly larger than 1 can miss deadlines regardless
- of the number of CPUs.
-
- Consider a set {Task_1,...Task_{M+1}} of M+1 tasks on a system with M
- CPUs, with the first task Task_1=(P,P,P) having period, relative deadline
- and WCET equal to P. The remaining M tasks Task_i=(e,P-1,P-1) have an
- arbitrarily small worst case execution time (indicated as "e" here) and a
- period smaller than the one of the first task. Hence, if all the tasks
- activate at the same time t, global EDF schedules these M tasks first
- (because their absolute deadlines are equal to t + P - 1, hence they are
- smaller than the absolute deadline of Task_1, which is t + P). As a
- result, Task_1 can be scheduled only at time t + e, and will finish at
- time t + e + P, after its absolute deadline. The total utilization of the
- task set is U = M · e / (P - 1) + P / P = M · e / (P - 1) + 1, and for small
- values of e this can become very close to 1. This is known as "Dhall's
- effect"[7]. Note: the example in the original paper by Dhall has been
- slightly simplified here (for example, Dhall more correctly computed
- lim_{e->0}U).
-
- More complex schedulability tests for global EDF have been developed in
- real-time literature[8,9], but they are not based on a simple comparison
- between total utilization (or density) and a fixed constant. If all tasks
- have D_i = P_i, a sufficient schedulability condition can be expressed in
- a simple way:
-	sum(WCET_i / P_i) <= M - (M - 1) · U_max
- where U_max = max{WCET_i / P_i}[10]. Notice that for U_max = 1,
- M - (M - 1) · U_max becomes M - M + 1 = 1 and this schedulability condition
- just confirms the Dhall's effect. A more complete survey of the literature
- about schedulability tests for multi-processor real-time scheduling can be
- found in [11].
-
- As seen, enforcing that the total utilization is smaller than M does not
- guarantee that global EDF schedules the tasks without missing any deadline
- (in other words, global EDF is not an optimal scheduling algorithm). However,
- a total utilization smaller than M is enough to guarantee that non real-time
- tasks are not starved and that the tardiness of real-time tasks has an upper
- bound[12] (as previously noted). Different bounds on the maximum tardiness
- experienced by real-time tasks have been developed in various papers[13,14],
- but the theoretical result that is important for SCHED_DEADLINE is that if
- the total utilization is smaller or equal than M then the response times of
- the tasks are limited.
-
-3.4 Relationship with SCHED_DEADLINE Parameters
-------------------------
-
- Finally, it is important to understand the relationship between the
- SCHED_DEADLINE scheduling parameters described in Section 2 (runtime,
- deadline and period) and the real-time task parameters (WCET, D, P)
- described in this section. Note that the tasks' temporal constraints are
- represented by its absolute deadlines d_j = r_j + D described above, while
- SCHED_DEADLINE schedules the tasks according to scheduling deadlines (see
- Section 2).
- If an admission test is used to guarantee that the scheduling deadlines
- are respected, then SCHED_DEADLINE can be used to schedule real-time tasks
- guaranteeing that all the jobs' deadlines of a task are respected.
- In order to do this, a task must be scheduled by setting:
-
-  - runtime >= WCET
-  - deadline = D
-  - period <= P
-
- IOW, if runtime >= WCET and if period is <= P, then the scheduling deadlines
- and the absolute deadlines (d_j) coincide, so a proper admission control
- allows to respect the jobs' absolute deadlines for this task (this is what is
- called "hard schedulability property" and is an extension of Lemma 1 of [2]).
- Notice that if runtime > deadline the admission control will surely reject
- this task, as it is not possible to respect its temporal constraints.
-
- References:
-  1 - C. L. Liu and J. W. Layland. Scheduling algorithms for multiprogram-
-      ming in a hard-real-time environment. Journal of the Association for
-      Computing Machinery, 20(1), 1973.
-  2 - L. Abeni , G. Buttazzo. Integrating Multimedia Applications in Hard
-      Real-Time Systems. Proceedings of the 19th IEEE Real-time Systems
-      Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf
-  3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab
-      Technical Report. http://disi.unitn.it/~abeni/tr-98-01.pdf
-  4 - J. Y. Leung and M.L. Merril. A Note on Preemptive Scheduling of
-      Periodic, Real-Time Tasks. Information Processing Letters, vol. 11,
-      no. 3, pp. 115-118, 1980.
-  5 - S. K. Baruah, A. K. Mok and L. E. Rosier. Preemptively Scheduling
-      Hard-Real-Time Sporadic Tasks on One Processor. Proceedings of the
-      11th IEEE Real-time Systems Symposium, 1990.
-  6 - S. K. Baruah, L. E. Rosier and R. R. Howell. Algorithms and Complexity
-      Concerning the Preemptive Scheduling of Periodic Real-Time tasks on
-      One Processor. Real-Time Systems Journal, vol. 4, no. 2, pp 301-324,
-      1990.
-  7 - S. J. Dhall and C. L. Liu. On a real-time scheduling problem. Operations
-      research, vol. 26, no. 1, pp 127-140, 1978.
-  8 - T. Baker. Multiprocessor EDF and Deadline Monotonic Schedulability
-      Analysis. Proceedings of the 24th IEEE Real-Time Systems Symposium, 2003.
-  9 - T. Baker. An Analysis of EDF Schedulability on a Multiprocessor.
-      IEEE Transactions on Parallel and Distributed Systems, vol. 16, no. 8,
-      pp 760-768, 2005.
-  10 - J. Goossens, S. Funk and S. Baruah, Priority-Driven Scheduling of
-       Periodic Task Systems on Multiprocessors. Real-Time Systems Journal,
-       vol. 25, no. 2–3, pp. 187–205, 2003.
-  11 - R. Davis and A. Burns. A Survey of Hard Real-Time Scheduling for
-       Multiprocessor Systems. ACM Computing Surveys, vol. 43, no. 4, 2011.
-       http://www-users.cs.york.ac.uk/~robdavis/papers/MPSurveyv5.0.pdf
-  12 - U. C. Devi and J. H. Anderson. Tardiness Bounds under Global EDF
-       Scheduling on a Multiprocessor. Real-Time Systems Journal, vol. 32,
-       no. 2, pp 133-189, 2008.
-  13 - P. Valente and G. Lipari. An Upper Bound to the Lateness of Soft
-       Real-Time Tasks Scheduled by EDF on Multiprocessors. Proceedings of
-       the 26th IEEE Real-Time Systems Symposium, 2005.
-  14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for
-       Global EDF. Proceedings of the 22nd Euromicro Conference on
-       Real-Time Systems, 2010.
-  15 - G. Lipari, S. Baruah, Greedy reclamation of unused bandwidth in
-       constant-bandwidth servers, 12th IEEE Euromicro Conference on Real-Time
-       Systems, 2000.
-  16 - L. Abeni, J. Lelli, C. Scordino, L. Palopoli, Greedy CPU reclaiming for
-       SCHED DEADLINE. In Proceedings of the Real-Time Linux Workshop (RTLWS),
-       Dusseldorf, Germany, 2014.
-  17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel
-       or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied
-       Computing, 2016.
-  18 - J. Lelli, C. Scordino, L. Abeni, D. Faggioli, Deadline scheduling in the
-       Linux kernel, Software: Practice and Experience, 46(6): 821-839, June
-       2016.
-  19 - C. Scordino, L. Abeni, J. Lelli, Energy-Aware Real-Time Scheduling in
-       the Linux Kernel, 33rd ACM/SIGAPP Symposium On Applied Computing (SAC
-       2018), Pau, France, April 2018.
-
-
-4. Bandwidth management
-=======================
-
- As previously mentioned, in order for -deadline scheduling to be
- effective and useful (that is, to be able to provide "runtime" time units
- within "deadline"), it is important to have some method to keep the allocation
- of the available fractions of CPU time to the various tasks under control.
- This is usually called "admission control" and if it is not performed, then
- no guarantee can be given on the actual scheduling of the -deadline tasks.
-
- As already stated in Section 3, a necessary condition to be respected to
- correctly schedule a set of real-time tasks is that the total utilization
- is smaller than M. When talking about -deadline tasks, this requires that
- the sum of the ratio between runtime and period for all tasks is smaller
- than M. Notice that the ratio runtime/period is equivalent to the utilization
- of a "traditional" real-time task, and is also often referred to as
- "bandwidth".
- The interface used to control the CPU bandwidth that can be allocated
- to -deadline tasks is similar to the one already used for -rt
- tasks with real-time group scheduling (a.k.a. RT-throttling - see
- Documentation/scheduler/sched-rt-group.txt), and is based on readable/
- writable control files located in procfs (for system wide settings).
- Notice that per-group settings (controlled through cgroupfs) are still not
- defined for -deadline tasks, because more discussion is needed in order to
- figure out how we want to manage SCHED_DEADLINE bandwidth at the task group
- level.
-
- A main difference between deadline bandwidth management and RT-throttling
- is that -deadline tasks have bandwidth on their own (while -rt ones don't!),
- and thus we don't need a higher level throttling mechanism to enforce the
- desired bandwidth. In other words, this means that interface parameters are
- only used at admission control time (i.e., when the user calls
- sched_setattr()). Scheduling is then performed considering actual tasks'
- parameters, so that CPU bandwidth is allocated to SCHED_DEADLINE tasks
- respecting their needs in terms of granularity. Therefore, using this simple
- interface we can put a cap on total utilization of -deadline tasks (i.e.,
- \Sum (runtime_i / period_i) < global_dl_utilization_cap).
-
-4.1 System wide settings
-------------------------
-
- The system wide settings are configured under the /proc virtual file system.
-
- For now the -rt knobs are used for -deadline admission control and the
- -deadline runtime is accounted against the -rt runtime. We realize that this
- isn't entirely desirable; however, it is better to have a small interface for
- now, and be able to change it easily later. The ideal situation (see 5.) is to
- run -rt tasks from a -deadline server; in which case the -rt bandwidth is a
- direct subset of dl_bw.
-
- This means that, for a root_domain comprising M CPUs, -deadline tasks
- can be created while the sum of their bandwidths stays below:
-
-   M * (sched_rt_runtime_us / sched_rt_period_us)
-
- It is also possible to disable this bandwidth management logic, and
- be thus free of oversubscribing the system up to any arbitrary level.
- This is done by writing -1 in /proc/sys/kernel/sched_rt_runtime_us.
-
-
-4.2 Task interface
-------------------
-
- Specifying a periodic/sporadic task that executes for a given amount of
- runtime at each instance, and that is scheduled according to the urgency of
- its own timing constraints needs, in general, a way of declaring:
-  - a (maximum/typical) instance execution time,
-  - a minimum interval between consecutive instances,
-  - a time constraint by which each instance must be completed.
-
- Therefore:
-  * a new struct sched_attr, containing all the necessary fields is
-    provided;
-  * the new scheduling related syscalls that manipulate it, i.e.,
-    sched_setattr() and sched_getattr() are implemented.
-
- For debugging purposes, the leftover runtime and absolute deadline of a
- SCHED_DEADLINE task can be retrieved through /proc/<pid>/sched (entries
- dl.runtime and dl.deadline, both values in ns). A programmatic way to
- retrieve these values from production code is under discussion.
-
-
-4.3 Default behavior
----------------------
-
- The default value for SCHED_DEADLINE bandwidth is to have rt_runtime equal to
- 950000. With rt_period equal to 1000000, by default, it means that -deadline
- tasks can use at most 95%, multiplied by the number of CPUs that compose the
- root_domain, for each root_domain.
- This means that non -deadline tasks will receive at least 5% of the CPU time,
- and that -deadline tasks will receive their runtime with a guaranteed
- worst-case delay respect to the "deadline" parameter. If "deadline" = "period"
- and the cpuset mechanism is used to implement partitioned scheduling (see
- Section 5), then this simple setting of the bandwidth management is able to
- deterministically guarantee that -deadline tasks will receive their runtime
- in a period.
-
- Finally, notice that in order not to jeopardize the admission control a
- -deadline task cannot fork.
-
-
-4.4 Behavior of sched_yield()
------------------------------
-
- When a SCHED_DEADLINE task calls sched_yield(), it gives up its
- remaining runtime and is immediately throttled, until the next
- period, when its runtime will be replenished (a special flag
- dl_yielded is set and used to handle correctly throttling and runtime
- replenishment after a call to sched_yield()).
-
- This behavior of sched_yield() allows the task to wake-up exactly at
- the beginning of the next period. Also, this may be useful in the
- future with bandwidth reclaiming mechanisms, where sched_yield() will
- make the leftoever runtime available for reclamation by other
- SCHED_DEADLINE tasks.
-
-
-5. Tasks CPU affinity
-=====================
-
- -deadline tasks cannot have an affinity mask smaller that the entire
- root_domain they are created on. However, affinities can be specified
- through the cpuset facility (Documentation/cgroup-v1/cpusets.txt).
-
-5.1 SCHED_DEADLINE and cpusets HOWTO
-------------------------------------
-
- An example of a simple configuration (pin a -deadline task to CPU0)
- follows (rt-app is used to create a -deadline task).
-
- mkdir /dev/cpuset
- mount -t cgroup -o cpuset cpuset /dev/cpuset
- cd /dev/cpuset
- mkdir cpu0
- echo 0 > cpu0/cpuset.cpus
- echo 0 > cpu0/cpuset.mems
- echo 1 > cpuset.cpu_exclusive
- echo 0 > cpuset.sched_load_balance
- echo 1 > cpu0/cpuset.cpu_exclusive
- echo 1 > cpu0/cpuset.mem_exclusive
- echo $$ > cpu0/tasks
- rt-app -t 100000:10000:d:0 -D5 (it is now actually superfluous to specify
- task affinity)
-
-6. Future plans
-===============
-
- Still missing:
-
-  - programmatic way to retrieve current runtime and absolute deadline
-  - refinements to deadline inheritance, especially regarding the possibility
-    of retaining bandwidth isolation among non-interacting tasks. This is
-    being studied from both theoretical and practical points of view, and
-    hopefully we should be able to produce some demonstrative code soon;
-  - (c)group based bandwidth management, and maybe scheduling;
-  - access control for non-root users (and related security concerns to
-    address), which is the best way to allow unprivileged use of the mechanisms
-    and how to prevent non-root users "cheat" the system?
-
- As already discussed, we are planning also to merge this work with the EDF
- throttling patches [https://lkml.org/lkml/2010/2/23/239] but we still are in
- the preliminary phases of the merge and we really seek feedback that would
- help us decide on the direction it should take.
-
-Appendix A. Test suite
-======================
-
- The SCHED_DEADLINE policy can be easily tested using two applications that
- are part of a wider Linux Scheduler validation suite. The suite is
- available as a GitHub repository: https://github.com/scheduler-tools.
-
- The first testing application is called rt-app and can be used to
- start multiple threads with specific parameters. rt-app supports
- SCHED_{OTHER,FIFO,RR,DEADLINE} scheduling policies and their related
- parameters (e.g., niceness, priority, runtime/deadline/period). rt-app
- is a valuable tool, as it can be used to synthetically recreate certain
- workloads (maybe mimicking real use-cases) and evaluate how the scheduler
- behaves under such workloads. In this way, results are easily reproducible.
- rt-app is available at: https://github.com/scheduler-tools/rt-app.
-
- Thread parameters can be specified from the command line, with something like
- this:
-
-  # rt-app -t 100000:10000:d -t 150000:20000:f:10 -D5
-
- The above creates 2 threads. The first one, scheduled by SCHED_DEADLINE,
- executes for 10ms every 100ms. The second one, scheduled at SCHED_FIFO
- priority 10, executes for 20ms every 150ms. The test will run for a total
- of 5 seconds.
-
- More interestingly, configurations can be described with a json file that
- can be passed as input to rt-app with something like this:
-
-  # rt-app my_config.json
-
- The parameters that can be specified with the second method are a superset
- of the command line options. Please refer to rt-app documentation for more
- details (<rt-app-sources>/doc/*.json).
-
- The second testing application is a modification of schedtool, called
- schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a
- certain pid/application. schedtool-dl is available at:
- https://github.com/scheduler-tools/schedtool-dl.git.
-
- The usage is straightforward:
-
-  # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app
-
- With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation
- of 10ms every 100ms (note that parameters are expressed in microseconds).
- You can also use schedtool to create a reservation for an already running
- application, given that you know its pid:
-
-  # schedtool -E -t 10000000:100000000 my_app_pid
-
-Appendix B. Minimal main()
-==========================
-
- We provide in what follows a simple (ugly) self-contained code snippet
- showing how SCHED_DEADLINE reservations can be created by a real-time
- application developer.
-
- #define _GNU_SOURCE
- #include <unistd.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <time.h>
- #include <linux/unistd.h>
- #include <linux/kernel.h>
- #include <linux/types.h>
- #include <sys/syscall.h>
- #include <pthread.h>
-
- #define gettid() syscall(__NR_gettid)
-
- #define SCHED_DEADLINE	6
-
- /* XXX use the proper syscall numbers */
- #ifdef __x86_64__
- #define __NR_sched_setattr		314
- #define __NR_sched_getattr		315
- #endif
-
- #ifdef __i386__
- #define __NR_sched_setattr		351
- #define __NR_sched_getattr		352
- #endif
-
- #ifdef __arm__
- #define __NR_sched_setattr		380
- #define __NR_sched_getattr		381
- #endif
-
- static volatile int done;
-
- struct sched_attr {
-	__u32 size;
-
-	__u32 sched_policy;
-	__u64 sched_flags;
-
-	/* SCHED_NORMAL, SCHED_BATCH */
-	__s32 sched_nice;
-
-	/* SCHED_FIFO, SCHED_RR */
-	__u32 sched_priority;
-
-	/* SCHED_DEADLINE (nsec) */
-	__u64 sched_runtime;
-	__u64 sched_deadline;
-	__u64 sched_period;
- };
-
- int sched_setattr(pid_t pid,
-		  const struct sched_attr *attr,
-		  unsigned int flags)
- {
-	return syscall(__NR_sched_setattr, pid, attr, flags);
- }
-
- int sched_getattr(pid_t pid,
-		  struct sched_attr *attr,
-		  unsigned int size,
-		  unsigned int flags)
- {
-	return syscall(__NR_sched_getattr, pid, attr, size, flags);
- }
-
- void *run_deadline(void *data)
- {
-	struct sched_attr attr;
-	int x = 0;
-	int ret;
-	unsigned int flags = 0;
-
-	printf("deadline thread started [%ld]\n", gettid());
-
-	attr.size = sizeof(attr);
-	attr.sched_flags = 0;
-	attr.sched_nice = 0;
-	attr.sched_priority = 0;
-
-	/* This creates a 10ms/30ms reservation */
-	attr.sched_policy = SCHED_DEADLINE;
-	attr.sched_runtime = 10 * 1000 * 1000;
-	attr.sched_period = attr.sched_deadline = 30 * 1000 * 1000;
-
-	ret = sched_setattr(0, &attr, flags);
-	if (ret < 0) {
-		done = 0;
-		perror("sched_setattr");
-		exit(-1);
-	}
-
-	while (!done) {
-		x++;
-	}
-
-	printf("deadline thread dies [%ld]\n", gettid());
-	return NULL;
- }
-
- int main (int argc, char **argv)
- {
-	pthread_t thread;
-
-	printf("main thread [%ld]\n", gettid());
-
-	pthread_create(&thread, NULL, run_deadline, NULL);
-
-	sleep(10);
-
-	done = 1;
-	pthread_join(thread, NULL);
-
-	printf("main dies [%ld]\n", gettid());
-	return 0;
- }
diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
new file mode 100644
index 000000000000..a96c72651877
--- /dev/null
+++ b/Documentation/scheduler/sched-design-CFS.rst
@@ -0,0 +1,249 @@
+=============
+CFS Scheduler
+=============
+
+
+1.  OVERVIEW
+============
+
+CFS stands for "Completely Fair Scheduler," and is the new "desktop" process
+scheduler implemented by Ingo Molnar and merged in Linux 2.6.23.  It is the
+replacement for the previous vanilla scheduler's SCHED_OTHER interactivity
+code.
+
+80% of CFS's design can be summed up in a single sentence: CFS basically models
+an "ideal, precise multi-tasking CPU" on real hardware.
+
+"Ideal multi-tasking CPU" is a (non-existent  :-)) CPU that has 100% physical
+power and which can run each task at precise equal speed, in parallel, each at
+1/nr_running speed.  For example: if there are 2 tasks running, then it runs
+each at 50% physical power --- i.e., actually in parallel.
+
+On real hardware, we can run only a single task at once, so we have to
+introduce the concept of "virtual runtime."  The virtual runtime of a task
+specifies when its next timeslice would start execution on the ideal
+multi-tasking CPU described above.  In practice, the virtual runtime of a task
+is its actual runtime normalized to the total number of running tasks.
+
+
+
+2.  FEW IMPLEMENTATION DETAILS
+==============================
+
+In CFS the virtual runtime is expressed and tracked via the per-task
+p->se.vruntime (nanosec-unit) value.  This way, it's possible to accurately
+timestamp and measure the "expected CPU time" a task should have gotten.
+
+[ small detail: on "ideal" hardware, at any time all tasks would have the same
+  p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
+  would ever get "out of balance" from the "ideal" share of CPU time.  ]
+
+CFS's task picking logic is based on this p->se.vruntime value and it is thus
+very simple: it always tries to run the task with the smallest p->se.vruntime
+value (i.e., the task which executed least so far).  CFS always tries to split
+up CPU time between runnable tasks as close to "ideal multitasking hardware" as
+possible.
+
+Most of the rest of CFS's design just falls out of this really simple concept,
+with a few add-on embellishments like nice levels, multiprocessing and various
+algorithm variants to recognize sleepers.
+
+
+
+3.  THE RBTREE
+==============
+
+CFS's design is quite radical: it does not use the old data structures for the
+runqueues, but it uses a time-ordered rbtree to build a "timeline" of future
+task execution, and thus has no "array switch" artifacts (by which both the
+previous vanilla scheduler and RSDL/SD are affected).
+
+CFS also maintains the rq->cfs.min_vruntime value, which is a monotonic
+increasing value tracking the smallest vruntime among all tasks in the
+runqueue.  The total amount of work done by the system is tracked using
+min_vruntime; that value is used to place newly activated entities on the left
+side of the tree as much as possible.
+
+The total number of running tasks in the runqueue is accounted through the
+rq->cfs.load value, which is the sum of the weights of the tasks queued on the
+runqueue.
+
+CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the
+p->se.vruntime key. CFS picks the "leftmost" task from this tree and sticks to it.
+As the system progresses forwards, the executed tasks are put into the tree
+more and more to the right --- slowly but surely giving a chance for every task
+to become the "leftmost task" and thus get on the CPU within a deterministic
+amount of time.
+
+Summing up, CFS works like this: it runs a task a bit, and when the task
+schedules (or a scheduler tick happens) the task's CPU usage is "accounted
+for": the (small) time it just spent using the physical CPU is added to
+p->se.vruntime.  Once p->se.vruntime gets high enough so that another task
+becomes the "leftmost task" of the time-ordered rbtree it maintains (plus a
+small amount of "granularity" distance relative to the leftmost task so that we
+do not over-schedule tasks and trash the cache), then the new leftmost task is
+picked and the current task is preempted.
+
+
+
+4.  SOME FEATURES OF CFS
+========================
+
+CFS uses nanosecond granularity accounting and does not rely on any jiffies or
+other HZ detail.  Thus the CFS scheduler has no notion of "timeslices" in the
+way the previous scheduler had, and has no heuristics whatsoever.  There is
+only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
+
+   /proc/sys/kernel/sched_min_granularity_ns
+
+which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
+"server" (i.e., good batching) workloads.  It defaults to a setting suitable
+for desktop workloads.  SCHED_BATCH is handled by the CFS scheduler module too.
+
+Due to its design, the CFS scheduler is not prone to any of the "attacks" that
+exist today against the heuristics of the stock scheduler: fiftyp.c, thud.c,
+chew.c, ring-test.c, massive_intr.c all work fine and do not impact
+interactivity and produce the expected behavior.
+
+The CFS scheduler has a much stronger handling of nice levels and SCHED_BATCH
+than the previous vanilla scheduler: both types of workloads are isolated much
+more aggressively.
+
+SMP load-balancing has been reworked/sanitized: the runqueue-walking
+assumptions are gone from the load-balancing code now, and iterators of the
+scheduling modules are used.  The balancing code got quite a bit simpler as a
+result.
+
+
+
+5. Scheduling policies
+======================
+
+CFS implements three scheduling policies:
+
+  - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling
+    policy that is used for regular tasks.
+
+  - SCHED_BATCH: Does not preempt nearly as often as regular tasks
+    would, thereby allowing tasks to run longer and make better use of
+    caches but at the cost of interactivity. This is well suited for
+    batch jobs.
+
+  - SCHED_IDLE: This is even weaker than nice 19, but its not a true
+    idle timer scheduler in order to avoid to get into priority
+    inversion problems which would deadlock the machine.
+
+SCHED_FIFO/_RR are implemented in sched/rt.c and are as specified by
+POSIX.
+
+The command chrt from util-linux-ng 2.13.1.1 can set all of these except
+SCHED_IDLE.
+
+
+
+6.  SCHEDULING CLASSES
+======================
+
+The new CFS scheduler has been designed in such a way to introduce "Scheduling
+Classes," an extensible hierarchy of scheduler modules.  These modules
+encapsulate scheduling policy details and are handled by the scheduler core
+without the core code assuming too much about them.
+
+sched/fair.c implements the CFS scheduler described above.
+
+sched/rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than
+the previous vanilla scheduler did.  It uses 100 runqueues (for all 100 RT
+priority levels, instead of 140 in the previous scheduler) and it needs no
+expired array.
+
+Scheduling classes are implemented through the sched_class structure, which
+contains hooks to functions that must be called whenever an interesting event
+occurs.
+
+This is the (partial) list of the hooks:
+
+ - enqueue_task(...)
+
+   Called when a task enters a runnable state.
+   It puts the scheduling entity (task) into the red-black tree and
+   increments the nr_running variable.
+
+ - dequeue_task(...)
+
+   When a task is no longer runnable, this function is called to keep the
+   corresponding scheduling entity out of the red-black tree.  It decrements
+   the nr_running variable.
+
+ - yield_task(...)
+
+   This function is basically just a dequeue followed by an enqueue, unless the
+   compat_yield sysctl is turned on; in that case, it places the scheduling
+   entity at the right-most end of the red-black tree.
+
+ - check_preempt_curr(...)
+
+   This function checks if a task that entered the runnable state should
+   preempt the currently running task.
+
+ - pick_next_task(...)
+
+   This function chooses the most appropriate task eligible to run next.
+
+ - set_curr_task(...)
+
+   This function is called when a task changes its scheduling class or changes
+   its task group.
+
+ - task_tick(...)
+
+   This function is mostly called from time tick functions; it might lead to
+   process switch.  This drives the running preemption.
+
+
+
+
+7.  GROUP SCHEDULER EXTENSIONS TO CFS
+=====================================
+
+Normally, the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task.  Sometimes, it may be desirable to group tasks and
+provide fair CPU time to each such task group.  For example, it may be
+desirable to first provide fair CPU time to each user on the system and then to
+each task belonging to a user.
+
+CONFIG_CGROUP_SCHED strives to achieve exactly that.  It lets tasks to be
+grouped and divides CPU time fairly among such groups.
+
+CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
+SCHED_RR) tasks.
+
+CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
+SCHED_BATCH) tasks.
+
+   These options need CONFIG_CGROUPS to be defined, and let the administrator
+   create arbitrary groups of tasks, using the "cgroup" pseudo filesystem.  See
+   Documentation/admin-guide/cgroup-v1/cgroups.rst for more information about this filesystem.
+
+When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
+group created using the pseudo filesystem.  See example steps below to create
+task groups and modify their CPU share using the "cgroups" pseudo filesystem::
+
+	# mount -t tmpfs cgroup_root /sys/fs/cgroup
+	# mkdir /sys/fs/cgroup/cpu
+	# mount -t cgroup -ocpu none /sys/fs/cgroup/cpu
+	# cd /sys/fs/cgroup/cpu
+
+	# mkdir multimedia	# create "multimedia" group of tasks
+	# mkdir browser		# create "browser" group of tasks
+
+	# #Configure the multimedia group to receive twice the CPU bandwidth
+	# #that of browser group
+
+	# echo 2048 > multimedia/cpu.shares
+	# echo 1024 > browser/cpu.shares
+
+	# firefox &	# Launch firefox and move it to "browser" group
+	# echo <firefox_pid> > browser/tasks
+
+	# #Launch gmplayer (or your favourite movie player)
+	# echo <movie_player_pid> > multimedia/tasks
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
deleted file mode 100644
index edd861c94c1b..000000000000
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ /dev/null
@@ -1,242 +0,0 @@
-                      =============
-                      CFS Scheduler
-                      =============
-
-
-1.  OVERVIEW
-
-CFS stands for "Completely Fair Scheduler," and is the new "desktop" process
-scheduler implemented by Ingo Molnar and merged in Linux 2.6.23.  It is the
-replacement for the previous vanilla scheduler's SCHED_OTHER interactivity
-code.
-
-80% of CFS's design can be summed up in a single sentence: CFS basically models
-an "ideal, precise multi-tasking CPU" on real hardware.
-
-"Ideal multi-tasking CPU" is a (non-existent  :-)) CPU that has 100% physical
-power and which can run each task at precise equal speed, in parallel, each at
-1/nr_running speed.  For example: if there are 2 tasks running, then it runs
-each at 50% physical power --- i.e., actually in parallel.
-
-On real hardware, we can run only a single task at once, so we have to
-introduce the concept of "virtual runtime."  The virtual runtime of a task
-specifies when its next timeslice would start execution on the ideal
-multi-tasking CPU described above.  In practice, the virtual runtime of a task
-is its actual runtime normalized to the total number of running tasks.
-
-
-
-2.  FEW IMPLEMENTATION DETAILS
-
-In CFS the virtual runtime is expressed and tracked via the per-task
-p->se.vruntime (nanosec-unit) value.  This way, it's possible to accurately
-timestamp and measure the "expected CPU time" a task should have gotten.
-
-[ small detail: on "ideal" hardware, at any time all tasks would have the same
-  p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
-  would ever get "out of balance" from the "ideal" share of CPU time.  ]
-
-CFS's task picking logic is based on this p->se.vruntime value and it is thus
-very simple: it always tries to run the task with the smallest p->se.vruntime
-value (i.e., the task which executed least so far).  CFS always tries to split
-up CPU time between runnable tasks as close to "ideal multitasking hardware" as
-possible.
-
-Most of the rest of CFS's design just falls out of this really simple concept,
-with a few add-on embellishments like nice levels, multiprocessing and various
-algorithm variants to recognize sleepers.
-
-
-
-3.  THE RBTREE
-
-CFS's design is quite radical: it does not use the old data structures for the
-runqueues, but it uses a time-ordered rbtree to build a "timeline" of future
-task execution, and thus has no "array switch" artifacts (by which both the
-previous vanilla scheduler and RSDL/SD are affected).
-
-CFS also maintains the rq->cfs.min_vruntime value, which is a monotonic
-increasing value tracking the smallest vruntime among all tasks in the
-runqueue.  The total amount of work done by the system is tracked using
-min_vruntime; that value is used to place newly activated entities on the left
-side of the tree as much as possible.
-
-The total number of running tasks in the runqueue is accounted through the
-rq->cfs.load value, which is the sum of the weights of the tasks queued on the
-runqueue.
-
-CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the
-p->se.vruntime key. CFS picks the "leftmost" task from this tree and sticks to it.
-As the system progresses forwards, the executed tasks are put into the tree
-more and more to the right --- slowly but surely giving a chance for every task
-to become the "leftmost task" and thus get on the CPU within a deterministic
-amount of time.
-
-Summing up, CFS works like this: it runs a task a bit, and when the task
-schedules (or a scheduler tick happens) the task's CPU usage is "accounted
-for": the (small) time it just spent using the physical CPU is added to
-p->se.vruntime.  Once p->se.vruntime gets high enough so that another task
-becomes the "leftmost task" of the time-ordered rbtree it maintains (plus a
-small amount of "granularity" distance relative to the leftmost task so that we
-do not over-schedule tasks and trash the cache), then the new leftmost task is
-picked and the current task is preempted.
-
-
-
-4.  SOME FEATURES OF CFS
-
-CFS uses nanosecond granularity accounting and does not rely on any jiffies or
-other HZ detail.  Thus the CFS scheduler has no notion of "timeslices" in the
-way the previous scheduler had, and has no heuristics whatsoever.  There is
-only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
-
-   /proc/sys/kernel/sched_min_granularity_ns
-
-which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
-"server" (i.e., good batching) workloads.  It defaults to a setting suitable
-for desktop workloads.  SCHED_BATCH is handled by the CFS scheduler module too.
-
-Due to its design, the CFS scheduler is not prone to any of the "attacks" that
-exist today against the heuristics of the stock scheduler: fiftyp.c, thud.c,
-chew.c, ring-test.c, massive_intr.c all work fine and do not impact
-interactivity and produce the expected behavior.
-
-The CFS scheduler has a much stronger handling of nice levels and SCHED_BATCH
-than the previous vanilla scheduler: both types of workloads are isolated much
-more aggressively.
-
-SMP load-balancing has been reworked/sanitized: the runqueue-walking
-assumptions are gone from the load-balancing code now, and iterators of the
-scheduling modules are used.  The balancing code got quite a bit simpler as a
-result.
-
-
-
-5. Scheduling policies
-
-CFS implements three scheduling policies:
-
-  - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling
-    policy that is used for regular tasks.
-
-  - SCHED_BATCH: Does not preempt nearly as often as regular tasks
-    would, thereby allowing tasks to run longer and make better use of
-    caches but at the cost of interactivity. This is well suited for
-    batch jobs.
-
-  - SCHED_IDLE: This is even weaker than nice 19, but its not a true
-    idle timer scheduler in order to avoid to get into priority
-    inversion problems which would deadlock the machine.
-
-SCHED_FIFO/_RR are implemented in sched/rt.c and are as specified by
-POSIX.
-
-The command chrt from util-linux-ng 2.13.1.1 can set all of these except
-SCHED_IDLE.
-
-
-
-6.  SCHEDULING CLASSES
-
-The new CFS scheduler has been designed in such a way to introduce "Scheduling
-Classes," an extensible hierarchy of scheduler modules.  These modules
-encapsulate scheduling policy details and are handled by the scheduler core
-without the core code assuming too much about them.
-
-sched/fair.c implements the CFS scheduler described above.
-
-sched/rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than
-the previous vanilla scheduler did.  It uses 100 runqueues (for all 100 RT
-priority levels, instead of 140 in the previous scheduler) and it needs no
-expired array.
-
-Scheduling classes are implemented through the sched_class structure, which
-contains hooks to functions that must be called whenever an interesting event
-occurs.
-
-This is the (partial) list of the hooks:
-
- - enqueue_task(...)
-
-   Called when a task enters a runnable state.
-   It puts the scheduling entity (task) into the red-black tree and
-   increments the nr_running variable.
-
- - dequeue_task(...)
-
-   When a task is no longer runnable, this function is called to keep the
-   corresponding scheduling entity out of the red-black tree.  It decrements
-   the nr_running variable.
-
- - yield_task(...)
-
-   This function is basically just a dequeue followed by an enqueue, unless the
-   compat_yield sysctl is turned on; in that case, it places the scheduling
-   entity at the right-most end of the red-black tree.
-
- - check_preempt_curr(...)
-
-   This function checks if a task that entered the runnable state should
-   preempt the currently running task.
-
- - pick_next_task(...)
-
-   This function chooses the most appropriate task eligible to run next.
-
- - set_curr_task(...)
-
-   This function is called when a task changes its scheduling class or changes
-   its task group.
-
- - task_tick(...)
-
-   This function is mostly called from time tick functions; it might lead to
-   process switch.  This drives the running preemption.
-
-
-
-
-7.  GROUP SCHEDULER EXTENSIONS TO CFS
-
-Normally, the scheduler operates on individual tasks and strives to provide
-fair CPU time to each task.  Sometimes, it may be desirable to group tasks and
-provide fair CPU time to each such task group.  For example, it may be
-desirable to first provide fair CPU time to each user on the system and then to
-each task belonging to a user.
-
-CONFIG_CGROUP_SCHED strives to achieve exactly that.  It lets tasks to be
-grouped and divides CPU time fairly among such groups.
-
-CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
-SCHED_RR) tasks.
-
-CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
-SCHED_BATCH) tasks.
-
-   These options need CONFIG_CGROUPS to be defined, and let the administrator
-   create arbitrary groups of tasks, using the "cgroup" pseudo filesystem.  See
-   Documentation/cgroup-v1/cgroups.txt for more information about this filesystem.
-
-When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
-group created using the pseudo filesystem.  See example steps below to create
-task groups and modify their CPU share using the "cgroups" pseudo filesystem.
-
-	# mount -t tmpfs cgroup_root /sys/fs/cgroup
-	# mkdir /sys/fs/cgroup/cpu
-	# mount -t cgroup -ocpu none /sys/fs/cgroup/cpu
-	# cd /sys/fs/cgroup/cpu
-
-	# mkdir multimedia	# create "multimedia" group of tasks
-	# mkdir browser		# create "browser" group of tasks
-
-	# #Configure the multimedia group to receive twice the CPU bandwidth
-	# #that of browser group
-
-	# echo 2048 > multimedia/cpu.shares
-	# echo 1024 > browser/cpu.shares
-
-	# firefox &	# Launch firefox and move it to "browser" group
-	# echo <firefox_pid> > browser/tasks
-
-	# #Launch gmplayer (or your favourite movie player)
-	# echo <movie_player_pid> > multimedia/tasks
diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst
new file mode 100644
index 000000000000..f7504226f445
--- /dev/null
+++ b/Documentation/scheduler/sched-domains.rst
@@ -0,0 +1,83 @@
+=================
+Scheduler Domains
+=================
+
+Each CPU has a "base" scheduling domain (struct sched_domain). The domain
+hierarchy is built from these base domains via the ->parent pointer. ->parent
+MUST be NULL terminated, and domain structures should be per-CPU as they are
+locklessly updated.
+
+Each scheduling domain spans a number of CPUs (stored in the ->span field).
+A domain's span MUST be a superset of it child's span (this restriction could
+be relaxed if the need arises), and a base domain for CPU i MUST span at least
+i. The top domain for each CPU will generally span all CPUs in the system
+although strictly it doesn't have to, but this could lead to a case where some
+CPUs will never be given tasks to run unless the CPUs allowed mask is
+explicitly set. A sched domain's span means "balance process load among these
+CPUs".
+
+Each scheduling domain must have one or more CPU groups (struct sched_group)
+which are organised as a circular one way linked list from the ->groups
+pointer. The union of cpumasks of these groups MUST be the same as the
+domain's span. The intersection of cpumasks from any two of these groups
+MUST be the empty set. The group pointed to by the ->groups pointer MUST
+contain the CPU to which the domain belongs. Groups may be shared among
+CPUs as they contain read only data after they have been set up.
+
+Balancing within a sched domain occurs between groups. That is, each group
+is treated as one entity. The load of a group is defined as the sum of the
+load of each of its member CPUs, and only when the load of a group becomes
+out of balance are tasks moved between groups.
+
+In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU
+through scheduler_tick(). It raises a softirq after the next regularly scheduled
+rebalancing event for the current runqueue has arrived. The actual load
+balancing workhorse, run_rebalance_domains()->rebalance_domains(), is then run
+in softirq context (SCHED_SOFTIRQ).
+
+The latter function takes two arguments: the current CPU and whether it was idle
+at the time the scheduler_tick() happened and iterates over all sched domains
+our CPU is on, starting from its base domain and going up the ->parent chain.
+While doing that, it checks to see if the current domain has exhausted its
+rebalance interval. If so, it runs load_balance() on that domain. It then checks
+the parent sched_domain (if it exists), and the parent of the parent and so
+forth.
+
+Initially, load_balance() finds the busiest group in the current sched domain.
+If it succeeds, it looks for the busiest runqueue of all the CPUs' runqueues in
+that group. If it manages to find such a runqueue, it locks both our initial
+CPU's runqueue and the newly found busiest one and starts moving tasks from it
+to our runqueue. The exact number of tasks amounts to an imbalance previously
+computed while iterating over this sched domain's groups.
+
+Implementing sched domains
+==========================
+
+The "base" domain will "span" the first level of the hierarchy. In the case
+of SMT, you'll span all siblings of the physical CPU, with each group being
+a single virtual CPU.
+
+In SMP, the parent of the base domain will span all physical CPUs in the
+node. Each group being a single physical CPU. Then with NUMA, the parent
+of the SMP domain will span the entire machine, with each group having the
+cpumask of a node. Or, you could do multi-level NUMA or Opteron, for example,
+might have just one domain covering its one NUMA level.
+
+The implementor should read comments in include/linux/sched.h:
+struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
+the specifics and what to tune.
+
+Architectures may retain the regular override the default SD_*_INIT flags
+while using the generic domain builder in kernel/sched/core.c if they wish to
+retain the traditional SMT->SMP->NUMA topology (or some subset of that). This
+can be done by #define'ing ARCH_HASH_SCHED_TUNE.
+
+Alternatively, the architecture may completely override the generic domain
+builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
+arch_init_sched_domains function. This function will attach domains to all
+CPUs using cpu_attach_domain.
+
+The sched-domains debugging infrastructure can be enabled by enabling
+CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
+which should catch most possible errors (described above). It also prints out
+the domain structure in a visual format.
diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt
deleted file mode 100644
index 4af80b1c05aa..000000000000
--- a/Documentation/scheduler/sched-domains.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-Each CPU has a "base" scheduling domain (struct sched_domain). The domain
-hierarchy is built from these base domains via the ->parent pointer. ->parent
-MUST be NULL terminated, and domain structures should be per-CPU as they are
-locklessly updated.
-
-Each scheduling domain spans a number of CPUs (stored in the ->span field).
-A domain's span MUST be a superset of it child's span (this restriction could
-be relaxed if the need arises), and a base domain for CPU i MUST span at least
-i. The top domain for each CPU will generally span all CPUs in the system
-although strictly it doesn't have to, but this could lead to a case where some
-CPUs will never be given tasks to run unless the CPUs allowed mask is
-explicitly set. A sched domain's span means "balance process load among these
-CPUs".
-
-Each scheduling domain must have one or more CPU groups (struct sched_group)
-which are organised as a circular one way linked list from the ->groups
-pointer. The union of cpumasks of these groups MUST be the same as the
-domain's span. The intersection of cpumasks from any two of these groups
-MUST be the empty set. The group pointed to by the ->groups pointer MUST
-contain the CPU to which the domain belongs. Groups may be shared among
-CPUs as they contain read only data after they have been set up.
-
-Balancing within a sched domain occurs between groups. That is, each group
-is treated as one entity. The load of a group is defined as the sum of the
-load of each of its member CPUs, and only when the load of a group becomes
-out of balance are tasks moved between groups.
-
-In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU
-through scheduler_tick(). It raises a softirq after the next regularly scheduled
-rebalancing event for the current runqueue has arrived. The actual load
-balancing workhorse, run_rebalance_domains()->rebalance_domains(), is then run
-in softirq context (SCHED_SOFTIRQ).
-
-The latter function takes two arguments: the current CPU and whether it was idle
-at the time the scheduler_tick() happened and iterates over all sched domains
-our CPU is on, starting from its base domain and going up the ->parent chain.
-While doing that, it checks to see if the current domain has exhausted its
-rebalance interval. If so, it runs load_balance() on that domain. It then checks
-the parent sched_domain (if it exists), and the parent of the parent and so
-forth.
-
-Initially, load_balance() finds the busiest group in the current sched domain.
-If it succeeds, it looks for the busiest runqueue of all the CPUs' runqueues in
-that group. If it manages to find such a runqueue, it locks both our initial
-CPU's runqueue and the newly found busiest one and starts moving tasks from it
-to our runqueue. The exact number of tasks amounts to an imbalance previously
-computed while iterating over this sched domain's groups.
-
-*** Implementing sched domains ***
-The "base" domain will "span" the first level of the hierarchy. In the case
-of SMT, you'll span all siblings of the physical CPU, with each group being
-a single virtual CPU.
-
-In SMP, the parent of the base domain will span all physical CPUs in the
-node. Each group being a single physical CPU. Then with NUMA, the parent
-of the SMP domain will span the entire machine, with each group having the
-cpumask of a node. Or, you could do multi-level NUMA or Opteron, for example,
-might have just one domain covering its one NUMA level.
-
-The implementor should read comments in include/linux/sched.h:
-struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
-the specifics and what to tune.
-
-Architectures may retain the regular override the default SD_*_INIT flags
-while using the generic domain builder in kernel/sched/core.c if they wish to
-retain the traditional SMT->SMP->NUMA topology (or some subset of that). This
-can be done by #define'ing ARCH_HASH_SCHED_TUNE.
-
-Alternatively, the architecture may completely override the generic domain
-builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
-arch_init_sched_domains function. This function will attach domains to all
-CPUs using cpu_attach_domain.
-
-The sched-domains debugging infrastructure can be enabled by enabling
-CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
-which should catch most possible errors (described above). It also prints out
-the domain structure in a visual format.
diff --git a/Documentation/scheduler/sched-energy.rst b/Documentation/scheduler/sched-energy.rst
new file mode 100644
index 000000000000..9580c57a52bc
--- /dev/null
+++ b/Documentation/scheduler/sched-energy.rst
@@ -0,0 +1,430 @@
+=======================
+Energy Aware Scheduling
+=======================
+
+1. Introduction
+---------------
+
+Energy Aware Scheduling (or EAS) gives the scheduler the ability to predict
+the impact of its decisions on the energy consumed by CPUs. EAS relies on an
+Energy Model (EM) of the CPUs to select an energy efficient CPU for each task,
+with a minimal impact on throughput. This document aims at providing an
+introduction on how EAS works, what are the main design decisions behind it, and
+details what is needed to get it to run.
+
+Before going any further, please note that at the time of writing::
+
+   /!\ EAS does not support platforms with symmetric CPU topologies /!\
+
+EAS operates only on heterogeneous CPU topologies (such as Arm big.LITTLE)
+because this is where the potential for saving energy through scheduling is
+the highest.
+
+The actual EM used by EAS is _not_ maintained by the scheduler, but by a
+dedicated framework. For details about this framework and what it provides,
+please refer to its documentation (see Documentation/power/energy-model.rst).
+
+
+2. Background and Terminology
+-----------------------------
+
+To make it clear from the start:
+ - energy = [joule] (resource like a battery on powered devices)
+ - power = energy/time = [joule/second] = [watt]
+
+The goal of EAS is to minimize energy, while still getting the job done. That
+is, we want to maximize::
+
+	performance [inst/s]
+	--------------------
+	    power [W]
+
+which is equivalent to minimizing::
+
+	energy [J]
+	-----------
+	instruction
+
+while still getting 'good' performance. It is essentially an alternative
+optimization objective to the current performance-only objective for the
+scheduler. This alternative considers two objectives: energy-efficiency and
+performance.
+
+The idea behind introducing an EM is to allow the scheduler to evaluate the
+implications of its decisions rather than blindly applying energy-saving
+techniques that may have positive effects only on some platforms. At the same
+time, the EM must be as simple as possible to minimize the scheduler latency
+impact.
+
+In short, EAS changes the way CFS tasks are assigned to CPUs. When it is time
+for the scheduler to decide where a task should run (during wake-up), the EM
+is used to break the tie between several good CPU candidates and pick the one
+that is predicted to yield the best energy consumption without harming the
+system's throughput. The predictions made by EAS rely on specific elements of
+knowledge about the platform's topology, which include the 'capacity' of CPUs,
+and their respective energy costs.
+
+
+3. Topology information
+-----------------------
+
+EAS (as well as the rest of the scheduler) uses the notion of 'capacity' to
+differentiate CPUs with different computing throughput. The 'capacity' of a CPU
+represents the amount of work it can absorb when running at its highest
+frequency compared to the most capable CPU of the system. Capacity values are
+normalized in a 1024 range, and are comparable with the utilization signals of
+tasks and CPUs computed by the Per-Entity Load Tracking (PELT) mechanism. Thanks
+to capacity and utilization values, EAS is able to estimate how big/busy a
+task/CPU is, and to take this into consideration when evaluating performance vs
+energy trade-offs. The capacity of CPUs is provided via arch-specific code
+through the arch_scale_cpu_capacity() callback.
+
+The rest of platform knowledge used by EAS is directly read from the Energy
+Model (EM) framework. The EM of a platform is composed of a power cost table
+per 'performance domain' in the system (see Documentation/power/energy-model.rst
+for futher details about performance domains).
+
+The scheduler manages references to the EM objects in the topology code when the
+scheduling domains are built, or re-built. For each root domain (rd), the
+scheduler maintains a singly linked list of all performance domains intersecting
+the current rd->span. Each node in the list contains a pointer to a struct
+em_perf_domain as provided by the EM framework.
+
+The lists are attached to the root domains in order to cope with exclusive
+cpuset configurations. Since the boundaries of exclusive cpusets do not
+necessarily match those of performance domains, the lists of different root
+domains can contain duplicate elements.
+
+Example 1.
+    Let us consider a platform with 12 CPUs, split in 3 performance domains
+    (pd0, pd4 and pd8), organized as follows::
+
+	          CPUs:   0 1 2 3 4 5 6 7 8 9 10 11
+	          PDs:   |--pd0--|--pd4--|---pd8---|
+	          RDs:   |----rd1----|-----rd2-----|
+
+    Now, consider that userspace decided to split the system with two
+    exclusive cpusets, hence creating two independent root domains, each
+    containing 6 CPUs. The two root domains are denoted rd1 and rd2 in the
+    above figure. Since pd4 intersects with both rd1 and rd2, it will be
+    present in the linked list '->pd' attached to each of them:
+
+       * rd1->pd: pd0 -> pd4
+       * rd2->pd: pd4 -> pd8
+
+    Please note that the scheduler will create two duplicate list nodes for
+    pd4 (one for each list). However, both just hold a pointer to the same
+    shared data structure of the EM framework.
+
+Since the access to these lists can happen concurrently with hotplug and other
+things, they are protected by RCU, like the rest of topology structures
+manipulated by the scheduler.
+
+EAS also maintains a static key (sched_energy_present) which is enabled when at
+least one root domain meets all conditions for EAS to start. Those conditions
+are summarized in Section 6.
+
+
+4. Energy-Aware task placement
+------------------------------
+
+EAS overrides the CFS task wake-up balancing code. It uses the EM of the
+platform and the PELT signals to choose an energy-efficient target CPU during
+wake-up balance. When EAS is enabled, select_task_rq_fair() calls
+find_energy_efficient_cpu() to do the placement decision. This function looks
+for the CPU with the highest spare capacity (CPU capacity - CPU utilization) in
+each performance domain since it is the one which will allow us to keep the
+frequency the lowest. Then, the function checks if placing the task there could
+save energy compared to leaving it on prev_cpu, i.e. the CPU where the task ran
+in its previous activation.
+
+find_energy_efficient_cpu() uses compute_energy() to estimate what will be the
+energy consumed by the system if the waking task was migrated. compute_energy()
+looks at the current utilization landscape of the CPUs and adjusts it to
+'simulate' the task migration. The EM framework provides the em_pd_energy() API
+which computes the expected energy consumption of each performance domain for
+the given utilization landscape.
+
+An example of energy-optimized task placement decision is detailed below.
+
+Example 2.
+    Let us consider a (fake) platform with 2 independent performance domains
+    composed of two CPUs each. CPU0 and CPU1 are little CPUs; CPU2 and CPU3
+    are big.
+
+    The scheduler must decide where to place a task P whose util_avg = 200
+    and prev_cpu = 0.
+
+    The current utilization landscape of the CPUs is depicted on the graph
+    below. CPUs 0-3 have a util_avg of 400, 100, 600 and 500 respectively
+    Each performance domain has three Operating Performance Points (OPPs).
+    The CPU capacity and power cost associated with each OPP is listed in
+    the Energy Model table. The util_avg of P is shown on the figures
+    below as 'PP'::
+
+     CPU util.
+      1024                 - - - - - - -              Energy Model
+                                               +-----------+-------------+
+                                               |  Little   |     Big     |
+       768                 =============       +-----+-----+------+------+
+                                               | Cap | Pwr | Cap  | Pwr  |
+                                               +-----+-----+------+------+
+       512  ===========    - ##- - - - -       | 170 | 50  | 512  | 400  |
+                             ##     ##         | 341 | 150 | 768  | 800  |
+       341  -PP - - - -      ##     ##         | 512 | 300 | 1024 | 1700 |
+             PP              ##     ##         +-----+-----+------+------+
+       170  -## - - - -      ##     ##
+             ##     ##       ##     ##
+           ------------    -------------
+            CPU0   CPU1     CPU2   CPU3
+
+      Current OPP: =====       Other OPP: - - -     util_avg (100 each): ##
+
+
+    find_energy_efficient_cpu() will first look for the CPUs with the
+    maximum spare capacity in the two performance domains. In this example,
+    CPU1 and CPU3. Then it will estimate the energy of the system if P was
+    placed on either of them, and check if that would save some energy
+    compared to leaving P on CPU0. EAS assumes that OPPs follow utilization
+    (which is coherent with the behaviour of the schedutil CPUFreq
+    governor, see Section 6. for more details on this topic).
+
+    **Case 1. P is migrated to CPU1**::
+
+      1024                 - - - - - - -
+
+                                            Energy calculation:
+       768                 =============     * CPU0: 200 / 341 * 150 = 88
+                                             * CPU1: 300 / 341 * 150 = 131
+                                             * CPU2: 600 / 768 * 800 = 625
+       512  - - - - - -    - ##- - - - -     * CPU3: 500 / 768 * 800 = 520
+                             ##     ##          => total_energy = 1364
+       341  ===========      ##     ##
+                    PP       ##     ##
+       170  -## - - PP-      ##     ##
+             ##     ##       ##     ##
+           ------------    -------------
+            CPU0   CPU1     CPU2   CPU3
+
+
+    **Case 2. P is migrated to CPU3**::
+
+      1024                 - - - - - - -
+
+                                            Energy calculation:
+       768                 =============     * CPU0: 200 / 341 * 150 = 88
+                                             * CPU1: 100 / 341 * 150 = 43
+                                    PP       * CPU2: 600 / 768 * 800 = 625
+       512  - - - - - -    - ##- - -PP -     * CPU3: 700 / 768 * 800 = 729
+                             ##     ##          => total_energy = 1485
+       341  ===========      ##     ##
+                             ##     ##
+       170  -## - - - -      ##     ##
+             ##     ##       ##     ##
+           ------------    -------------
+            CPU0   CPU1     CPU2   CPU3
+
+
+    **Case 3. P stays on prev_cpu / CPU 0**::
+
+      1024                 - - - - - - -
+
+                                            Energy calculation:
+       768                 =============     * CPU0: 400 / 512 * 300 = 234
+                                             * CPU1: 100 / 512 * 300 = 58
+                                             * CPU2: 600 / 768 * 800 = 625
+       512  ===========    - ##- - - - -     * CPU3: 500 / 768 * 800 = 520
+                             ##     ##          => total_energy = 1437
+       341  -PP - - - -      ##     ##
+             PP              ##     ##
+       170  -## - - - -      ##     ##
+             ##     ##       ##     ##
+           ------------    -------------
+            CPU0   CPU1     CPU2   CPU3
+
+
+    From these calculations, the Case 1 has the lowest total energy. So CPU 1
+    is be the best candidate from an energy-efficiency standpoint.
+
+Big CPUs are generally more power hungry than the little ones and are thus used
+mainly when a task doesn't fit the littles. However, little CPUs aren't always
+necessarily more energy-efficient than big CPUs. For some systems, the high OPPs
+of the little CPUs can be less energy-efficient than the lowest OPPs of the
+bigs, for example. So, if the little CPUs happen to have enough utilization at
+a specific point in time, a small task waking up at that moment could be better
+of executing on the big side in order to save energy, even though it would fit
+on the little side.
+
+And even in the case where all OPPs of the big CPUs are less energy-efficient
+than those of the little, using the big CPUs for a small task might still, under
+specific conditions, save energy. Indeed, placing a task on a little CPU can
+result in raising the OPP of the entire performance domain, and that will
+increase the cost of the tasks already running there. If the waking task is
+placed on a big CPU, its own execution cost might be higher than if it was
+running on a little, but it won't impact the other tasks of the little CPUs
+which will keep running at a lower OPP. So, when considering the total energy
+consumed by CPUs, the extra cost of running that one task on a big core can be
+smaller than the cost of raising the OPP on the little CPUs for all the other
+tasks.
+
+The examples above would be nearly impossible to get right in a generic way, and
+for all platforms, without knowing the cost of running at different OPPs on all
+CPUs of the system. Thanks to its EM-based design, EAS should cope with them
+correctly without too many troubles. However, in order to ensure a minimal
+impact on throughput for high-utilization scenarios, EAS also implements another
+mechanism called 'over-utilization'.
+
+
+5. Over-utilization
+-------------------
+
+From a general standpoint, the use-cases where EAS can help the most are those
+involving a light/medium CPU utilization. Whenever long CPU-bound tasks are
+being run, they will require all of the available CPU capacity, and there isn't
+much that can be done by the scheduler to save energy without severly harming
+throughput. In order to avoid hurting performance with EAS, CPUs are flagged as
+'over-utilized' as soon as they are used at more than 80% of their compute
+capacity. As long as no CPUs are over-utilized in a root domain, load balancing
+is disabled and EAS overridess the wake-up balancing code. EAS is likely to load
+the most energy efficient CPUs of the system more than the others if that can be
+done without harming throughput. So, the load-balancer is disabled to prevent
+it from breaking the energy-efficient task placement found by EAS. It is safe to
+do so when the system isn't overutilized since being below the 80% tipping point
+implies that:
+
+    a. there is some idle time on all CPUs, so the utilization signals used by
+       EAS are likely to accurately represent the 'size' of the various tasks
+       in the system;
+    b. all tasks should already be provided with enough CPU capacity,
+       regardless of their nice values;
+    c. since there is spare capacity all tasks must be blocking/sleeping
+       regularly and balancing at wake-up is sufficient.
+
+As soon as one CPU goes above the 80% tipping point, at least one of the three
+assumptions above becomes incorrect. In this scenario, the 'overutilized' flag
+is raised for the entire root domain, EAS is disabled, and the load-balancer is
+re-enabled. By doing so, the scheduler falls back onto load-based algorithms for
+wake-up and load balance under CPU-bound conditions. This provides a better
+respect of the nice values of tasks.
+
+Since the notion of overutilization largely relies on detecting whether or not
+there is some idle time in the system, the CPU capacity 'stolen' by higher
+(than CFS) scheduling classes (as well as IRQ) must be taken into account. As
+such, the detection of overutilization accounts for the capacity used not only
+by CFS tasks, but also by the other scheduling classes and IRQ.
+
+
+6. Dependencies and requirements for EAS
+----------------------------------------
+
+Energy Aware Scheduling depends on the CPUs of the system having specific
+hardware properties and on other features of the kernel being enabled. This
+section lists these dependencies and provides hints as to how they can be met.
+
+
+6.1 - Asymmetric CPU topology
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+As mentioned in the introduction, EAS is only supported on platforms with
+asymmetric CPU topologies for now. This requirement is checked at run-time by
+looking for the presence of the SD_ASYM_CPUCAPACITY flag when the scheduling
+domains are built.
+
+The flag is set/cleared automatically by the scheduler topology code whenever
+there are CPUs with different capacities in a root domain. The capacities of
+CPUs are provided by arch-specific code through the arch_scale_cpu_capacity()
+callback. As an example, arm and arm64 share an implementation of this callback
+which uses a combination of CPUFreq data and device-tree bindings to compute the
+capacity of CPUs (see drivers/base/arch_topology.c for more details).
+
+So, in order to use EAS on your platform your architecture must implement the
+arch_scale_cpu_capacity() callback, and some of the CPUs must have a lower
+capacity than others.
+
+Please note that EAS is not fundamentally incompatible with SMP, but no
+significant savings on SMP platforms have been observed yet. This restriction
+could be amended in the future if proven otherwise.
+
+
+6.2 - Energy Model presence
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+EAS uses the EM of a platform to estimate the impact of scheduling decisions on
+energy. So, your platform must provide power cost tables to the EM framework in
+order to make EAS start. To do so, please refer to documentation of the
+independent EM framework in Documentation/power/energy-model.rst.
+
+Please also note that the scheduling domains need to be re-built after the
+EM has been registered in order to start EAS.
+
+
+6.3 - Energy Model complexity
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The task wake-up path is very latency-sensitive. When the EM of a platform is
+too complex (too many CPUs, too many performance domains, too many performance
+states, ...), the cost of using it in the wake-up path can become prohibitive.
+The energy-aware wake-up algorithm has a complexity of:
+
+	C = Nd * (Nc + Ns)
+
+with: Nd the number of performance domains; Nc the number of CPUs; and Ns the
+total number of OPPs (ex: for two perf. domains with 4 OPPs each, Ns = 8).
+
+A complexity check is performed at the root domain level, when scheduling
+domains are built. EAS will not start on a root domain if its C happens to be
+higher than the completely arbitrary EM_MAX_COMPLEXITY threshold (2048 at the
+time of writing).
+
+If you really want to use EAS but the complexity of your platform's Energy
+Model is too high to be used with a single root domain, you're left with only
+two possible options:
+
+    1. split your system into separate, smaller, root domains using exclusive
+       cpusets and enable EAS locally on each of them. This option has the
+       benefit to work out of the box but the drawback of preventing load
+       balance between root domains, which can result in an unbalanced system
+       overall;
+    2. submit patches to reduce the complexity of the EAS wake-up algorithm,
+       hence enabling it to cope with larger EMs in reasonable time.
+
+
+6.4 - Schedutil governor
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+EAS tries to predict at which OPP will the CPUs be running in the close future
+in order to estimate their energy consumption. To do so, it is assumed that OPPs
+of CPUs follow their utilization.
+
+Although it is very difficult to provide hard guarantees regarding the accuracy
+of this assumption in practice (because the hardware might not do what it is
+told to do, for example), schedutil as opposed to other CPUFreq governors at
+least _requests_ frequencies calculated using the utilization signals.
+Consequently, the only sane governor to use together with EAS is schedutil,
+because it is the only one providing some degree of consistency between
+frequency requests and energy predictions.
+
+Using EAS with any other governor than schedutil is not supported.
+
+
+6.5 Scale-invariant utilization signals
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In order to make accurate prediction across CPUs and for all performance
+states, EAS needs frequency-invariant and CPU-invariant PELT signals. These can
+be obtained using the architecture-defined arch_scale{cpu,freq}_capacity()
+callbacks.
+
+Using EAS on a platform that doesn't implement these two callbacks is not
+supported.
+
+
+6.6 Multithreading (SMT)
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+EAS in its current form is SMT unaware and is not able to leverage
+multithreaded hardware to save energy. EAS considers threads as independent
+CPUs, which can actually be counter-productive for both performance and energy.
+
+EAS on SMT is not supported.
diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt
deleted file mode 100644
index 197d81f4b836..000000000000
--- a/Documentation/scheduler/sched-energy.txt
+++ /dev/null
@@ -1,425 +0,0 @@
-			   =======================
-			   Energy Aware Scheduling
-			   =======================
-
-1. Introduction
----------------
-
-Energy Aware Scheduling (or EAS) gives the scheduler the ability to predict
-the impact of its decisions on the energy consumed by CPUs. EAS relies on an
-Energy Model (EM) of the CPUs to select an energy efficient CPU for each task,
-with a minimal impact on throughput. This document aims at providing an
-introduction on how EAS works, what are the main design decisions behind it, and
-details what is needed to get it to run.
-
-Before going any further, please note that at the time of writing:
-
-   /!\ EAS does not support platforms with symmetric CPU topologies /!\
-
-EAS operates only on heterogeneous CPU topologies (such as Arm big.LITTLE)
-because this is where the potential for saving energy through scheduling is
-the highest.
-
-The actual EM used by EAS is _not_ maintained by the scheduler, but by a
-dedicated framework. For details about this framework and what it provides,
-please refer to its documentation (see Documentation/power/energy-model.txt).
-
-
-2. Background and Terminology
------------------------------
-
-To make it clear from the start:
- - energy = [joule] (resource like a battery on powered devices)
- - power = energy/time = [joule/second] = [watt]
-
-The goal of EAS is to minimize energy, while still getting the job done. That
-is, we want to maximize:
-
-	performance [inst/s]
-	--------------------
-	    power [W]
-
-which is equivalent to minimizing:
-
-	energy [J]
-	-----------
-	instruction
-
-while still getting 'good' performance. It is essentially an alternative
-optimization objective to the current performance-only objective for the
-scheduler. This alternative considers two objectives: energy-efficiency and
-performance.
-
-The idea behind introducing an EM is to allow the scheduler to evaluate the
-implications of its decisions rather than blindly applying energy-saving
-techniques that may have positive effects only on some platforms. At the same
-time, the EM must be as simple as possible to minimize the scheduler latency
-impact.
-
-In short, EAS changes the way CFS tasks are assigned to CPUs. When it is time
-for the scheduler to decide where a task should run (during wake-up), the EM
-is used to break the tie between several good CPU candidates and pick the one
-that is predicted to yield the best energy consumption without harming the
-system's throughput. The predictions made by EAS rely on specific elements of
-knowledge about the platform's topology, which include the 'capacity' of CPUs,
-and their respective energy costs.
-
-
-3. Topology information
------------------------
-
-EAS (as well as the rest of the scheduler) uses the notion of 'capacity' to
-differentiate CPUs with different computing throughput. The 'capacity' of a CPU
-represents the amount of work it can absorb when running at its highest
-frequency compared to the most capable CPU of the system. Capacity values are
-normalized in a 1024 range, and are comparable with the utilization signals of
-tasks and CPUs computed by the Per-Entity Load Tracking (PELT) mechanism. Thanks
-to capacity and utilization values, EAS is able to estimate how big/busy a
-task/CPU is, and to take this into consideration when evaluating performance vs
-energy trade-offs. The capacity of CPUs is provided via arch-specific code
-through the arch_scale_cpu_capacity() callback.
-
-The rest of platform knowledge used by EAS is directly read from the Energy
-Model (EM) framework. The EM of a platform is composed of a power cost table
-per 'performance domain' in the system (see Documentation/power/energy-model.txt
-for futher details about performance domains).
-
-The scheduler manages references to the EM objects in the topology code when the
-scheduling domains are built, or re-built. For each root domain (rd), the
-scheduler maintains a singly linked list of all performance domains intersecting
-the current rd->span. Each node in the list contains a pointer to a struct
-em_perf_domain as provided by the EM framework.
-
-The lists are attached to the root domains in order to cope with exclusive
-cpuset configurations. Since the boundaries of exclusive cpusets do not
-necessarily match those of performance domains, the lists of different root
-domains can contain duplicate elements.
-
-Example 1.
-    Let us consider a platform with 12 CPUs, split in 3 performance domains
-    (pd0, pd4 and pd8), organized as follows:
-
-	          CPUs:   0 1 2 3 4 5 6 7 8 9 10 11
-	          PDs:   |--pd0--|--pd4--|---pd8---|
-	          RDs:   |----rd1----|-----rd2-----|
-
-    Now, consider that userspace decided to split the system with two
-    exclusive cpusets, hence creating two independent root domains, each
-    containing 6 CPUs. The two root domains are denoted rd1 and rd2 in the
-    above figure. Since pd4 intersects with both rd1 and rd2, it will be
-    present in the linked list '->pd' attached to each of them:
-       * rd1->pd: pd0 -> pd4
-       * rd2->pd: pd4 -> pd8
-
-    Please note that the scheduler will create two duplicate list nodes for
-    pd4 (one for each list). However, both just hold a pointer to the same
-    shared data structure of the EM framework.
-
-Since the access to these lists can happen concurrently with hotplug and other
-things, they are protected by RCU, like the rest of topology structures
-manipulated by the scheduler.
-
-EAS also maintains a static key (sched_energy_present) which is enabled when at
-least one root domain meets all conditions for EAS to start. Those conditions
-are summarized in Section 6.
-
-
-4. Energy-Aware task placement
-------------------------------
-
-EAS overrides the CFS task wake-up balancing code. It uses the EM of the
-platform and the PELT signals to choose an energy-efficient target CPU during
-wake-up balance. When EAS is enabled, select_task_rq_fair() calls
-find_energy_efficient_cpu() to do the placement decision. This function looks
-for the CPU with the highest spare capacity (CPU capacity - CPU utilization) in
-each performance domain since it is the one which will allow us to keep the
-frequency the lowest. Then, the function checks if placing the task there could
-save energy compared to leaving it on prev_cpu, i.e. the CPU where the task ran
-in its previous activation.
-
-find_energy_efficient_cpu() uses compute_energy() to estimate what will be the
-energy consumed by the system if the waking task was migrated. compute_energy()
-looks at the current utilization landscape of the CPUs and adjusts it to
-'simulate' the task migration. The EM framework provides the em_pd_energy() API
-which computes the expected energy consumption of each performance domain for
-the given utilization landscape.
-
-An example of energy-optimized task placement decision is detailed below.
-
-Example 2.
-    Let us consider a (fake) platform with 2 independent performance domains
-    composed of two CPUs each. CPU0 and CPU1 are little CPUs; CPU2 and CPU3
-    are big.
-
-    The scheduler must decide where to place a task P whose util_avg = 200
-    and prev_cpu = 0.
-
-    The current utilization landscape of the CPUs is depicted on the graph
-    below. CPUs 0-3 have a util_avg of 400, 100, 600 and 500 respectively
-    Each performance domain has three Operating Performance Points (OPPs).
-    The CPU capacity and power cost associated with each OPP is listed in
-    the Energy Model table. The util_avg of P is shown on the figures
-    below as 'PP'.
-
-    CPU util.
-      1024                 - - - - - - -              Energy Model
-                                               +-----------+-------------+
-                                               |  Little   |     Big     |
-       768                 =============       +-----+-----+------+------+
-                                               | Cap | Pwr | Cap  | Pwr  |
-                                               +-----+-----+------+------+
-       512  ===========    - ##- - - - -       | 170 | 50  | 512  | 400  |
-                             ##     ##         | 341 | 150 | 768  | 800  |
-       341  -PP - - - -      ##     ##         | 512 | 300 | 1024 | 1700 |
-             PP              ##     ##         +-----+-----+------+------+
-       170  -## - - - -      ##     ##
-             ##     ##       ##     ##
-           ------------    -------------
-            CPU0   CPU1     CPU2   CPU3
-
-      Current OPP: =====       Other OPP: - - -     util_avg (100 each): ##
-
-
-    find_energy_efficient_cpu() will first look for the CPUs with the
-    maximum spare capacity in the two performance domains. In this example,
-    CPU1 and CPU3. Then it will estimate the energy of the system if P was
-    placed on either of them, and check if that would save some energy
-    compared to leaving P on CPU0. EAS assumes that OPPs follow utilization
-    (which is coherent with the behaviour of the schedutil CPUFreq
-    governor, see Section 6. for more details on this topic).
-
-    Case 1. P is migrated to CPU1
-    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-      1024                 - - - - - - -
-
-                                            Energy calculation:
-       768                 =============     * CPU0: 200 / 341 * 150 = 88
-                                             * CPU1: 300 / 341 * 150 = 131
-                                             * CPU2: 600 / 768 * 800 = 625
-       512  - - - - - -    - ##- - - - -     * CPU3: 500 / 768 * 800 = 520
-                             ##     ##          => total_energy = 1364
-       341  ===========      ##     ##
-                    PP       ##     ##
-       170  -## - - PP-      ##     ##
-             ##     ##       ##     ##
-           ------------    -------------
-            CPU0   CPU1     CPU2   CPU3
-
-
-    Case 2. P is migrated to CPU3
-    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-      1024                 - - - - - - -
-
-                                            Energy calculation:
-       768                 =============     * CPU0: 200 / 341 * 150 = 88
-                                             * CPU1: 100 / 341 * 150 = 43
-                                    PP       * CPU2: 600 / 768 * 800 = 625
-       512  - - - - - -    - ##- - -PP -     * CPU3: 700 / 768 * 800 = 729
-                             ##     ##          => total_energy = 1485
-       341  ===========      ##     ##
-                             ##     ##
-       170  -## - - - -      ##     ##
-             ##     ##       ##     ##
-           ------------    -------------
-            CPU0   CPU1     CPU2   CPU3
-
-
-    Case 3. P stays on prev_cpu / CPU 0
-    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-      1024                 - - - - - - -
-
-                                            Energy calculation:
-       768                 =============     * CPU0: 400 / 512 * 300 = 234
-                                             * CPU1: 100 / 512 * 300 = 58
-                                             * CPU2: 600 / 768 * 800 = 625
-       512  ===========    - ##- - - - -     * CPU3: 500 / 768 * 800 = 520
-                             ##     ##          => total_energy = 1437
-       341  -PP - - - -      ##     ##
-             PP              ##     ##
-       170  -## - - - -      ##     ##
-             ##     ##       ##     ##
-           ------------    -------------
-            CPU0   CPU1     CPU2   CPU3
-
-
-    From these calculations, the Case 1 has the lowest total energy. So CPU 1
-    is be the best candidate from an energy-efficiency standpoint.
-
-Big CPUs are generally more power hungry than the little ones and are thus used
-mainly when a task doesn't fit the littles. However, little CPUs aren't always
-necessarily more energy-efficient than big CPUs. For some systems, the high OPPs
-of the little CPUs can be less energy-efficient than the lowest OPPs of the
-bigs, for example. So, if the little CPUs happen to have enough utilization at
-a specific point in time, a small task waking up at that moment could be better
-of executing on the big side in order to save energy, even though it would fit
-on the little side.
-
-And even in the case where all OPPs of the big CPUs are less energy-efficient
-than those of the little, using the big CPUs for a small task might still, under
-specific conditions, save energy. Indeed, placing a task on a little CPU can
-result in raising the OPP of the entire performance domain, and that will
-increase the cost of the tasks already running there. If the waking task is
-placed on a big CPU, its own execution cost might be higher than if it was
-running on a little, but it won't impact the other tasks of the little CPUs
-which will keep running at a lower OPP. So, when considering the total energy
-consumed by CPUs, the extra cost of running that one task on a big core can be
-smaller than the cost of raising the OPP on the little CPUs for all the other
-tasks.
-
-The examples above would be nearly impossible to get right in a generic way, and
-for all platforms, without knowing the cost of running at different OPPs on all
-CPUs of the system. Thanks to its EM-based design, EAS should cope with them
-correctly without too many troubles. However, in order to ensure a minimal
-impact on throughput for high-utilization scenarios, EAS also implements another
-mechanism called 'over-utilization'.
-
-
-5. Over-utilization
--------------------
-
-From a general standpoint, the use-cases where EAS can help the most are those
-involving a light/medium CPU utilization. Whenever long CPU-bound tasks are
-being run, they will require all of the available CPU capacity, and there isn't
-much that can be done by the scheduler to save energy without severly harming
-throughput. In order to avoid hurting performance with EAS, CPUs are flagged as
-'over-utilized' as soon as they are used at more than 80% of their compute
-capacity. As long as no CPUs are over-utilized in a root domain, load balancing
-is disabled and EAS overridess the wake-up balancing code. EAS is likely to load
-the most energy efficient CPUs of the system more than the others if that can be
-done without harming throughput. So, the load-balancer is disabled to prevent
-it from breaking the energy-efficient task placement found by EAS. It is safe to
-do so when the system isn't overutilized since being below the 80% tipping point
-implies that:
-
-    a. there is some idle time on all CPUs, so the utilization signals used by
-       EAS are likely to accurately represent the 'size' of the various tasks
-       in the system;
-    b. all tasks should already be provided with enough CPU capacity,
-       regardless of their nice values;
-    c. since there is spare capacity all tasks must be blocking/sleeping
-       regularly and balancing at wake-up is sufficient.
-
-As soon as one CPU goes above the 80% tipping point, at least one of the three
-assumptions above becomes incorrect. In this scenario, the 'overutilized' flag
-is raised for the entire root domain, EAS is disabled, and the load-balancer is
-re-enabled. By doing so, the scheduler falls back onto load-based algorithms for
-wake-up and load balance under CPU-bound conditions. This provides a better
-respect of the nice values of tasks.
-
-Since the notion of overutilization largely relies on detecting whether or not
-there is some idle time in the system, the CPU capacity 'stolen' by higher
-(than CFS) scheduling classes (as well as IRQ) must be taken into account. As
-such, the detection of overutilization accounts for the capacity used not only
-by CFS tasks, but also by the other scheduling classes and IRQ.
-
-
-6. Dependencies and requirements for EAS
-----------------------------------------
-
-Energy Aware Scheduling depends on the CPUs of the system having specific
-hardware properties and on other features of the kernel being enabled. This
-section lists these dependencies and provides hints as to how they can be met.
-
-
-  6.1 - Asymmetric CPU topology
-
-As mentioned in the introduction, EAS is only supported on platforms with
-asymmetric CPU topologies for now. This requirement is checked at run-time by
-looking for the presence of the SD_ASYM_CPUCAPACITY flag when the scheduling
-domains are built.
-
-The flag is set/cleared automatically by the scheduler topology code whenever
-there are CPUs with different capacities in a root domain. The capacities of
-CPUs are provided by arch-specific code through the arch_scale_cpu_capacity()
-callback. As an example, arm and arm64 share an implementation of this callback
-which uses a combination of CPUFreq data and device-tree bindings to compute the
-capacity of CPUs (see drivers/base/arch_topology.c for more details).
-
-So, in order to use EAS on your platform your architecture must implement the
-arch_scale_cpu_capacity() callback, and some of the CPUs must have a lower
-capacity than others.
-
-Please note that EAS is not fundamentally incompatible with SMP, but no
-significant savings on SMP platforms have been observed yet. This restriction
-could be amended in the future if proven otherwise.
-
-
-  6.2 - Energy Model presence
-
-EAS uses the EM of a platform to estimate the impact of scheduling decisions on
-energy. So, your platform must provide power cost tables to the EM framework in
-order to make EAS start. To do so, please refer to documentation of the
-independent EM framework in Documentation/power/energy-model.txt.
-
-Please also note that the scheduling domains need to be re-built after the
-EM has been registered in order to start EAS.
-
-
-  6.3 - Energy Model complexity
-
-The task wake-up path is very latency-sensitive. When the EM of a platform is
-too complex (too many CPUs, too many performance domains, too many performance
-states, ...), the cost of using it in the wake-up path can become prohibitive.
-The energy-aware wake-up algorithm has a complexity of:
-
-	C = Nd * (Nc + Ns)
-
-with: Nd the number of performance domains; Nc the number of CPUs; and Ns the
-total number of OPPs (ex: for two perf. domains with 4 OPPs each, Ns = 8).
-
-A complexity check is performed at the root domain level, when scheduling
-domains are built. EAS will not start on a root domain if its C happens to be
-higher than the completely arbitrary EM_MAX_COMPLEXITY threshold (2048 at the
-time of writing).
-
-If you really want to use EAS but the complexity of your platform's Energy
-Model is too high to be used with a single root domain, you're left with only
-two possible options:
-
-    1. split your system into separate, smaller, root domains using exclusive
-       cpusets and enable EAS locally on each of them. This option has the
-       benefit to work out of the box but the drawback of preventing load
-       balance between root domains, which can result in an unbalanced system
-       overall;
-    2. submit patches to reduce the complexity of the EAS wake-up algorithm,
-       hence enabling it to cope with larger EMs in reasonable time.
-
-
-  6.4 - Schedutil governor
-
-EAS tries to predict at which OPP will the CPUs be running in the close future
-in order to estimate their energy consumption. To do so, it is assumed that OPPs
-of CPUs follow their utilization.
-
-Although it is very difficult to provide hard guarantees regarding the accuracy
-of this assumption in practice (because the hardware might not do what it is
-told to do, for example), schedutil as opposed to other CPUFreq governors at
-least _requests_ frequencies calculated using the utilization signals.
-Consequently, the only sane governor to use together with EAS is schedutil,
-because it is the only one providing some degree of consistency between
-frequency requests and energy predictions.
-
-Using EAS with any other governor than schedutil is not supported.
-
-
-  6.5 Scale-invariant utilization signals
-
-In order to make accurate prediction across CPUs and for all performance
-states, EAS needs frequency-invariant and CPU-invariant PELT signals. These can
-be obtained using the architecture-defined arch_scale{cpu,freq}_capacity()
-callbacks.
-
-Using EAS on a platform that doesn't implement these two callbacks is not
-supported.
-
-
-  6.6 Multithreading (SMT)
-
-EAS in its current form is SMT unaware and is not able to leverage
-multithreaded hardware to save energy. EAS considers threads as independent
-CPUs, which can actually be counter-productive for both performance and energy.
-
-EAS on SMT is not supported.
diff --git a/Documentation/scheduler/sched-nice-design.rst b/Documentation/scheduler/sched-nice-design.rst
new file mode 100644
index 000000000000..0571f1b47e64
--- /dev/null
+++ b/Documentation/scheduler/sched-nice-design.rst
@@ -0,0 +1,112 @@
+=====================
+Scheduler Nice Design
+=====================
+
+This document explains the thinking about the revamped and streamlined
+nice-levels implementation in the new Linux scheduler.
+
+Nice levels were always pretty weak under Linux and people continuously
+pestered us to make nice +19 tasks use up much less CPU time.
+
+Unfortunately that was not that easy to implement under the old
+scheduler, (otherwise we'd have done it long ago) because nice level
+support was historically coupled to timeslice length, and timeslice
+units were driven by the HZ tick, so the smallest timeslice was 1/HZ.
+
+In the O(1) scheduler (in 2003) we changed negative nice levels to be
+much stronger than they were before in 2.4 (and people were happy about
+that change), and we also intentionally calibrated the linear timeslice
+rule so that nice +19 level would be _exactly_ 1 jiffy. To better
+understand it, the timeslice graph went like this (cheesy ASCII art
+alert!)::
+
+
+                   A
+             \     | [timeslice length]
+              \    |
+               \   |
+                \  |
+                 \ |
+                  \|___100msecs
+                   |^ . _
+                   |      ^ . _
+                   |            ^ . _
+ -*----------------------------------*-----> [nice level]
+ -20               |                +19
+                   |
+                   |
+
+So that if someone wanted to really renice tasks, +19 would give a much
+bigger hit than the normal linear rule would do. (The solution of
+changing the ABI to extend priorities was discarded early on.)
+
+This approach worked to some degree for some time, but later on with
+HZ=1000 it caused 1 jiffy to be 1 msec, which meant 0.1% CPU usage which
+we felt to be a bit excessive. Excessive _not_ because it's too small of
+a CPU utilization, but because it causes too frequent (once per
+millisec) rescheduling. (and would thus trash the cache, etc. Remember,
+this was long ago when hardware was weaker and caches were smaller, and
+people were running number crunching apps at nice +19.)
+
+So for HZ=1000 we changed nice +19 to 5msecs, because that felt like the
+right minimal granularity - and this translates to 5% CPU utilization.
+But the fundamental HZ-sensitive property for nice+19 still remained,
+and we never got a single complaint about nice +19 being too _weak_ in
+terms of CPU utilization, we only got complaints about it (still) being
+too _strong_ :-)
+
+To sum it up: we always wanted to make nice levels more consistent, but
+within the constraints of HZ and jiffies and their nasty design level
+coupling to timeslices and granularity it was not really viable.
+
+The second (less frequent but still periodically occurring) complaint
+about Linux's nice level support was its assymetry around the origo
+(which you can see demonstrated in the picture above), or more
+accurately: the fact that nice level behavior depended on the _absolute_
+nice level as well, while the nice API itself is fundamentally
+"relative":
+
+   int nice(int inc);
+
+   asmlinkage long sys_nice(int increment)
+
+(the first one is the glibc API, the second one is the syscall API.)
+Note that the 'inc' is relative to the current nice level. Tools like
+bash's "nice" command mirror this relative API.
+
+With the old scheduler, if you for example started a niced task with +1
+and another task with +2, the CPU split between the two tasks would
+depend on the nice level of the parent shell - if it was at nice -10 the
+CPU split was different than if it was at +5 or +10.
+
+A third complaint against Linux's nice level support was that negative
+nice levels were not 'punchy enough', so lots of people had to resort to
+run audio (and other multimedia) apps under RT priorities such as
+SCHED_FIFO. But this caused other problems: SCHED_FIFO is not starvation
+proof, and a buggy SCHED_FIFO app can also lock up the system for good.
+
+The new scheduler in v2.6.23 addresses all three types of complaints:
+
+To address the first complaint (of nice levels being not "punchy"
+enough), the scheduler was decoupled from 'time slice' and HZ concepts
+(and granularity was made a separate concept from nice levels) and thus
+it was possible to implement better and more consistent nice +19
+support: with the new scheduler nice +19 tasks get a HZ-independent
+1.5%, instead of the variable 3%-5%-9% range they got in the old
+scheduler.
+
+To address the second complaint (of nice levels not being consistent),
+the new scheduler makes nice(1) have the same CPU utilization effect on
+tasks, regardless of their absolute nice levels. So on the new
+scheduler, running a nice +10 and a nice 11 task has the same CPU
+utilization "split" between them as running a nice -5 and a nice -4
+task. (one will get 55% of the CPU, the other 45%.) That is why nice
+levels were changed to be "multiplicative" (or exponential) - that way
+it does not matter which nice level you start out from, the 'relative
+result' will always be the same.
+
+The third complaint (of negative nice levels not being "punchy" enough
+and forcing audio apps to run under the more dangerous SCHED_FIFO
+scheduling policy) is addressed by the new scheduler almost
+automatically: stronger negative nice levels are an automatic
+side-effect of the recalibrated dynamic range of nice levels.
diff --git a/Documentation/scheduler/sched-nice-design.txt b/Documentation/scheduler/sched-nice-design.txt
deleted file mode 100644
index 3ac1e46d5365..000000000000
--- a/Documentation/scheduler/sched-nice-design.txt
+++ /dev/null
@@ -1,108 +0,0 @@
-This document explains the thinking about the revamped and streamlined
-nice-levels implementation in the new Linux scheduler.
-
-Nice levels were always pretty weak under Linux and people continuously
-pestered us to make nice +19 tasks use up much less CPU time.
-
-Unfortunately that was not that easy to implement under the old
-scheduler, (otherwise we'd have done it long ago) because nice level
-support was historically coupled to timeslice length, and timeslice
-units were driven by the HZ tick, so the smallest timeslice was 1/HZ.
-
-In the O(1) scheduler (in 2003) we changed negative nice levels to be
-much stronger than they were before in 2.4 (and people were happy about
-that change), and we also intentionally calibrated the linear timeslice
-rule so that nice +19 level would be _exactly_ 1 jiffy. To better
-understand it, the timeslice graph went like this (cheesy ASCII art
-alert!):
-
-
-                   A
-             \     | [timeslice length]
-              \    |
-               \   |
-                \  |
-                 \ |
-                  \|___100msecs
-                   |^ . _
-                   |      ^ . _
-                   |            ^ . _
- -*----------------------------------*-----> [nice level]
- -20               |                +19
-                   |
-                   |
-
-So that if someone wanted to really renice tasks, +19 would give a much
-bigger hit than the normal linear rule would do. (The solution of
-changing the ABI to extend priorities was discarded early on.)
-
-This approach worked to some degree for some time, but later on with
-HZ=1000 it caused 1 jiffy to be 1 msec, which meant 0.1% CPU usage which
-we felt to be a bit excessive. Excessive _not_ because it's too small of
-a CPU utilization, but because it causes too frequent (once per
-millisec) rescheduling. (and would thus trash the cache, etc. Remember,
-this was long ago when hardware was weaker and caches were smaller, and
-people were running number crunching apps at nice +19.)
-
-So for HZ=1000 we changed nice +19 to 5msecs, because that felt like the
-right minimal granularity - and this translates to 5% CPU utilization.
-But the fundamental HZ-sensitive property for nice+19 still remained,
-and we never got a single complaint about nice +19 being too _weak_ in
-terms of CPU utilization, we only got complaints about it (still) being
-too _strong_ :-)
-
-To sum it up: we always wanted to make nice levels more consistent, but
-within the constraints of HZ and jiffies and their nasty design level
-coupling to timeslices and granularity it was not really viable.
-
-The second (less frequent but still periodically occurring) complaint
-about Linux's nice level support was its assymetry around the origo
-(which you can see demonstrated in the picture above), or more
-accurately: the fact that nice level behavior depended on the _absolute_
-nice level as well, while the nice API itself is fundamentally
-"relative":
-
-   int nice(int inc);
-
-   asmlinkage long sys_nice(int increment)
-
-(the first one is the glibc API, the second one is the syscall API.)
-Note that the 'inc' is relative to the current nice level. Tools like
-bash's "nice" command mirror this relative API.
-
-With the old scheduler, if you for example started a niced task with +1
-and another task with +2, the CPU split between the two tasks would
-depend on the nice level of the parent shell - if it was at nice -10 the
-CPU split was different than if it was at +5 or +10.
-
-A third complaint against Linux's nice level support was that negative
-nice levels were not 'punchy enough', so lots of people had to resort to
-run audio (and other multimedia) apps under RT priorities such as
-SCHED_FIFO. But this caused other problems: SCHED_FIFO is not starvation
-proof, and a buggy SCHED_FIFO app can also lock up the system for good.
-
-The new scheduler in v2.6.23 addresses all three types of complaints:
-
-To address the first complaint (of nice levels being not "punchy"
-enough), the scheduler was decoupled from 'time slice' and HZ concepts
-(and granularity was made a separate concept from nice levels) and thus
-it was possible to implement better and more consistent nice +19
-support: with the new scheduler nice +19 tasks get a HZ-independent
-1.5%, instead of the variable 3%-5%-9% range they got in the old
-scheduler.
-
-To address the second complaint (of nice levels not being consistent),
-the new scheduler makes nice(1) have the same CPU utilization effect on
-tasks, regardless of their absolute nice levels. So on the new
-scheduler, running a nice +10 and a nice 11 task has the same CPU
-utilization "split" between them as running a nice -5 and a nice -4
-task. (one will get 55% of the CPU, the other 45%.) That is why nice
-levels were changed to be "multiplicative" (or exponential) - that way
-it does not matter which nice level you start out from, the 'relative
-result' will always be the same.
-
-The third complaint (of negative nice levels not being "punchy" enough
-and forcing audio apps to run under the more dangerous SCHED_FIFO
-scheduling policy) is addressed by the new scheduler almost
-automatically: stronger negative nice levels are an automatic
-side-effect of the recalibrated dynamic range of nice levels.
diff --git a/Documentation/scheduler/sched-pelt.c b/Documentation/scheduler/sched-pelt.c
index e4219139386a..7238b355919c 100644
--- a/Documentation/scheduler/sched-pelt.c
+++ b/Documentation/scheduler/sched-pelt.c
@@ -20,7 +20,8 @@ void calc_runnable_avg_yN_inv(void)
 	int i;
 	unsigned int x;
 
-	printf("static const u32 runnable_avg_yN_inv[] = {");
+	/* To silence -Wunused-but-set-variable warnings. */
+	printf("static const u32 runnable_avg_yN_inv[] __maybe_unused = {");
 	for (i = 0; i < HALFLIFE; i++) {
 		x = ((1UL<<32)-1)*pow(y, i);
 
diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst
new file mode 100644
index 000000000000..655a096ec8fb
--- /dev/null
+++ b/Documentation/scheduler/sched-rt-group.rst
@@ -0,0 +1,185 @@
+==========================
+Real-Time group scheduling
+==========================
+
+.. CONTENTS
+
+   0. WARNING
+   1. Overview
+     1.1 The problem
+     1.2 The solution
+   2. The interface
+     2.1 System-wide settings
+     2.2 Default behaviour
+     2.3 Basis for grouping tasks
+   3. Future plans
+
+
+0. WARNING
+==========
+
+ Fiddling with these settings can result in an unstable system, the knobs are
+ root only and assumes root knows what he is doing.
+
+Most notable:
+
+ * very small values in sched_rt_period_us can result in an unstable
+   system when the period is smaller than either the available hrtimer
+   resolution, or the time it takes to handle the budget refresh itself.
+
+ * very small values in sched_rt_runtime_us can result in an unstable
+   system when the runtime is so small the system has difficulty making
+   forward progress (NOTE: the migration thread and kstopmachine both
+   are real-time processes).
+
+1. Overview
+===========
+
+
+1.1 The problem
+---------------
+
+Realtime scheduling is all about determinism, a group has to be able to rely on
+the amount of bandwidth (eg. CPU time) being constant. In order to schedule
+multiple groups of realtime tasks, each group must be assigned a fixed portion
+of the CPU time available.  Without a minimum guarantee a realtime group can
+obviously fall short. A fuzzy upper limit is of no use since it cannot be
+relied upon. Which leaves us with just the single fixed portion.
+
+1.2 The solution
+----------------
+
+CPU time is divided by means of specifying how much time can be spent running
+in a given period. We allocate this "run time" for each realtime group which
+the other realtime groups will not be permitted to use.
+
+Any time not allocated to a realtime group will be used to run normal priority
+tasks (SCHED_OTHER). Any allocated run time not used will also be picked up by
+SCHED_OTHER.
+
+Let's consider an example: a frame fixed realtime renderer must deliver 25
+frames a second, which yields a period of 0.04s per frame. Now say it will also
+have to play some music and respond to input, leaving it with around 80% CPU
+time dedicated for the graphics. We can then give this group a run time of 0.8
+* 0.04s = 0.032s.
+
+This way the graphics group will have a 0.04s period with a 0.032s run time
+limit. Now if the audio thread needs to refill the DMA buffer every 0.005s, but
+needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
+0.00015s. So this group can be scheduled with a period of 0.005s and a run time
+of 0.00015s.
+
+The remaining CPU time will be used for user input and other tasks. Because
+realtime tasks have explicitly allocated the CPU time they need to perform
+their tasks, buffer underruns in the graphics or audio can be eliminated.
+
+NOTE: the above example is not fully implemented yet. We still
+lack an EDF scheduler to make non-uniform periods usable.
+
+
+2. The Interface
+================
+
+
+2.1 System wide settings
+------------------------
+
+The system wide settings are configured under the /proc virtual file system:
+
+/proc/sys/kernel/sched_rt_period_us:
+  The scheduling period that is equivalent to 100% CPU bandwidth
+
+/proc/sys/kernel/sched_rt_runtime_us:
+  A global limit on how much time realtime scheduling may use.  Even without
+  CONFIG_RT_GROUP_SCHED enabled, this will limit time reserved to realtime
+  processes. With CONFIG_RT_GROUP_SCHED it signifies the total bandwidth
+  available to all realtime groups.
+
+  * Time is specified in us because the interface is s32. This gives an
+    operating range from 1us to about 35 minutes.
+  * sched_rt_period_us takes values from 1 to INT_MAX.
+  * sched_rt_runtime_us takes values from -1 to (INT_MAX - 1).
+  * A run time of -1 specifies runtime == period, ie. no limit.
+
+
+2.2 Default behaviour
+---------------------
+
+The default values for sched_rt_period_us (1000000 or 1s) and
+sched_rt_runtime_us (950000 or 0.95s).  This gives 0.05s to be used by
+SCHED_OTHER (non-RT tasks). These defaults were chosen so that a run-away
+realtime tasks will not lock up the machine but leave a little time to recover
+it.  By setting runtime to -1 you'd get the old behaviour back.
+
+By default all bandwidth is assigned to the root group and new groups get the
+period from /proc/sys/kernel/sched_rt_period_us and a run time of 0. If you
+want to assign bandwidth to another group, reduce the root group's bandwidth
+and assign some or all of the difference to another group.
+
+Realtime group scheduling means you have to assign a portion of total CPU
+bandwidth to the group before it will accept realtime tasks. Therefore you will
+not be able to run realtime tasks as any user other than root until you have
+done that, even if the user has the rights to run processes with realtime
+priority!
+
+
+2.3 Basis for grouping tasks
+----------------------------
+
+Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real
+CPU bandwidth to task groups.
+
+This uses the cgroup virtual file system and "<cgroup>/cpu.rt_runtime_us"
+to control the CPU time reserved for each control group.
+
+For more information on working with control groups, you should read
+Documentation/admin-guide/cgroup-v1/cgroups.rst as well.
+
+Group settings are checked against the following limits in order to keep the
+configuration schedulable:
+
+   \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period
+
+For now, this can be simplified to just the following (but see Future plans):
+
+   \Sum_{i} runtime_{i} <= global_runtime
+
+
+3. Future plans
+===============
+
+There is work in progress to make the scheduling period for each group
+("<cgroup>/cpu.rt_period_us") configurable as well.
+
+The constraint on the period is that a subgroup must have a smaller or
+equal period to its parent. But realistically its not very useful _yet_
+as its prone to starvation without deadline scheduling.
+
+Consider two sibling groups A and B; both have 50% bandwidth, but A's
+period is twice the length of B's.
+
+* group A: period=100000us, runtime=50000us
+
+	- this runs for 0.05s once every 0.1s
+
+* group B: period= 50000us, runtime=25000us
+
+	- this runs for 0.025s twice every 0.1s (or once every 0.05 sec).
+
+This means that currently a while (1) loop in A will run for the full period of
+B and can starve B's tasks (assuming they are of lower priority) for a whole
+period.
+
+The next project will be SCHED_EDF (Earliest Deadline First scheduling) to bring
+full deadline scheduling to the linux kernel. Deadline scheduling the above
+groups and treating end of the period as a deadline will ensure that they both
+get their allocated time.
+
+Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
+the biggest challenge as the current linux PI infrastructure is geared towards
+the limited static priority levels 0-99. With deadline scheduling you need to
+do deadline inheritance (since priority is inversely proportional to the
+deadline delta (deadline - now)).
+
+This means the whole PI machinery will have to be reworked - and that is one of
+the most complex pieces of code we have.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
deleted file mode 100644
index d8fce3e78457..000000000000
--- a/Documentation/scheduler/sched-rt-group.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-				Real-Time group scheduling
-				--------------------------
-
-CONTENTS
-========
-
-0. WARNING
-1. Overview
-  1.1 The problem
-  1.2 The solution
-2. The interface
-  2.1 System-wide settings
-  2.2 Default behaviour
-  2.3 Basis for grouping tasks
-3. Future plans
-
-
-0. WARNING
-==========
-
- Fiddling with these settings can result in an unstable system, the knobs are
- root only and assumes root knows what he is doing.
-
-Most notable:
-
- * very small values in sched_rt_period_us can result in an unstable
-   system when the period is smaller than either the available hrtimer
-   resolution, or the time it takes to handle the budget refresh itself.
-
- * very small values in sched_rt_runtime_us can result in an unstable
-   system when the runtime is so small the system has difficulty making
-   forward progress (NOTE: the migration thread and kstopmachine both
-   are real-time processes).
-
-1. Overview
-===========
-
-
-1.1 The problem
----------------
-
-Realtime scheduling is all about determinism, a group has to be able to rely on
-the amount of bandwidth (eg. CPU time) being constant. In order to schedule
-multiple groups of realtime tasks, each group must be assigned a fixed portion
-of the CPU time available.  Without a minimum guarantee a realtime group can
-obviously fall short. A fuzzy upper limit is of no use since it cannot be
-relied upon. Which leaves us with just the single fixed portion.
-
-1.2 The solution
-----------------
-
-CPU time is divided by means of specifying how much time can be spent running
-in a given period. We allocate this "run time" for each realtime group which
-the other realtime groups will not be permitted to use.
-
-Any time not allocated to a realtime group will be used to run normal priority
-tasks (SCHED_OTHER). Any allocated run time not used will also be picked up by
-SCHED_OTHER.
-
-Let's consider an example: a frame fixed realtime renderer must deliver 25
-frames a second, which yields a period of 0.04s per frame. Now say it will also
-have to play some music and respond to input, leaving it with around 80% CPU
-time dedicated for the graphics. We can then give this group a run time of 0.8
-* 0.04s = 0.032s.
-
-This way the graphics group will have a 0.04s period with a 0.032s run time
-limit. Now if the audio thread needs to refill the DMA buffer every 0.005s, but
-needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
-0.00015s. So this group can be scheduled with a period of 0.005s and a run time
-of 0.00015s.
-
-The remaining CPU time will be used for user input and other tasks. Because
-realtime tasks have explicitly allocated the CPU time they need to perform
-their tasks, buffer underruns in the graphics or audio can be eliminated.
-
-NOTE: the above example is not fully implemented yet. We still
-lack an EDF scheduler to make non-uniform periods usable.
-
-
-2. The Interface
-================
-
-
-2.1 System wide settings
-------------------------
-
-The system wide settings are configured under the /proc virtual file system:
-
-/proc/sys/kernel/sched_rt_period_us:
-  The scheduling period that is equivalent to 100% CPU bandwidth
-
-/proc/sys/kernel/sched_rt_runtime_us:
-  A global limit on how much time realtime scheduling may use.  Even without
-  CONFIG_RT_GROUP_SCHED enabled, this will limit time reserved to realtime
-  processes. With CONFIG_RT_GROUP_SCHED it signifies the total bandwidth
-  available to all realtime groups.
-
-  * Time is specified in us because the interface is s32. This gives an
-    operating range from 1us to about 35 minutes.
-  * sched_rt_period_us takes values from 1 to INT_MAX.
-  * sched_rt_runtime_us takes values from -1 to (INT_MAX - 1).
-  * A run time of -1 specifies runtime == period, ie. no limit.
-
-
-2.2 Default behaviour
----------------------
-
-The default values for sched_rt_period_us (1000000 or 1s) and
-sched_rt_runtime_us (950000 or 0.95s).  This gives 0.05s to be used by
-SCHED_OTHER (non-RT tasks). These defaults were chosen so that a run-away
-realtime tasks will not lock up the machine but leave a little time to recover
-it.  By setting runtime to -1 you'd get the old behaviour back.
-
-By default all bandwidth is assigned to the root group and new groups get the
-period from /proc/sys/kernel/sched_rt_period_us and a run time of 0. If you
-want to assign bandwidth to another group, reduce the root group's bandwidth
-and assign some or all of the difference to another group.
-
-Realtime group scheduling means you have to assign a portion of total CPU
-bandwidth to the group before it will accept realtime tasks. Therefore you will
-not be able to run realtime tasks as any user other than root until you have
-done that, even if the user has the rights to run processes with realtime
-priority!
-
-
-2.3 Basis for grouping tasks
-----------------------------
-
-Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real
-CPU bandwidth to task groups.
-
-This uses the cgroup virtual file system and "<cgroup>/cpu.rt_runtime_us"
-to control the CPU time reserved for each control group.
-
-For more information on working with control groups, you should read
-Documentation/cgroup-v1/cgroups.txt as well.
-
-Group settings are checked against the following limits in order to keep the
-configuration schedulable:
-
-   \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period
-
-For now, this can be simplified to just the following (but see Future plans):
-
-   \Sum_{i} runtime_{i} <= global_runtime
-
-
-3. Future plans
-===============
-
-There is work in progress to make the scheduling period for each group
-("<cgroup>/cpu.rt_period_us") configurable as well.
-
-The constraint on the period is that a subgroup must have a smaller or
-equal period to its parent. But realistically its not very useful _yet_
-as its prone to starvation without deadline scheduling.
-
-Consider two sibling groups A and B; both have 50% bandwidth, but A's
-period is twice the length of B's.
-
-* group A: period=100000us, runtime=50000us
-	- this runs for 0.05s once every 0.1s
-
-* group B: period= 50000us, runtime=25000us
-	- this runs for 0.025s twice every 0.1s (or once every 0.05 sec).
-
-This means that currently a while (1) loop in A will run for the full period of
-B and can starve B's tasks (assuming they are of lower priority) for a whole
-period.
-
-The next project will be SCHED_EDF (Earliest Deadline First scheduling) to bring
-full deadline scheduling to the linux kernel. Deadline scheduling the above
-groups and treating end of the period as a deadline will ensure that they both
-get their allocated time.
-
-Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
-the biggest challenge as the current linux PI infrastructure is geared towards
-the limited static priority levels 0-99. With deadline scheduling you need to
-do deadline inheritance (since priority is inversely proportional to the
-deadline delta (deadline - now)).
-
-This means the whole PI machinery will have to be reworked - and that is one of
-the most complex pieces of code we have.
diff --git a/Documentation/scheduler/sched-stats.rst b/Documentation/scheduler/sched-stats.rst
new file mode 100644
index 000000000000..0cb0aa714545
--- /dev/null
+++ b/Documentation/scheduler/sched-stats.rst
@@ -0,0 +1,167 @@
+====================
+Scheduler Statistics
+====================
+
+Version 15 of schedstats dropped counters for some sched_yield:
+yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is
+identical to version 14.
+
+Version 14 of schedstats includes support for sched_domains, which hit the
+mainline kernel in 2.6.20 although it is identical to the stats from version
+12 which was in the kernel from 2.6.13-2.6.19 (version 13 never saw a kernel
+release).  Some counters make more sense to be per-runqueue; other to be
+per-domain.  Note that domains (and their associated information) will only
+be pertinent and available on machines utilizing CONFIG_SMP.
+
+In version 14 of schedstat, there is at least one level of domain
+statistics for each cpu listed, and there may well be more than one
+domain.  Domains have no particular names in this implementation, but
+the highest numbered one typically arbitrates balancing across all the
+cpus on the machine, while domain0 is the most tightly focused domain,
+sometimes balancing only between pairs of cpus.  At this time, there
+are no architectures which need more than three domain levels. The first
+field in the domain stats is a bit map indicating which cpus are affected
+by that domain.
+
+These fields are counters, and only increment.  Programs which make use
+of these will need to start with a baseline observation and then calculate
+the change in the counters at each subsequent observation.  A perl script
+which does this for many of the fields is available at
+
+    http://eaglet.rain.com/rick/linux/schedstat/
+
+Note that any such script will necessarily be version-specific, as the main
+reason to change versions is changes in the output format.  For those wishing
+to write their own scripts, the fields are described here.
+
+CPU statistics
+--------------
+cpu<N> 1 2 3 4 5 6 7 8 9
+
+First field is a sched_yield() statistic:
+
+     1) # of times sched_yield() was called
+
+Next three are schedule() statistics:
+
+     2) This field is a legacy array expiration count field used in the O(1)
+	scheduler. We kept it for ABI compatibility, but it is always set to zero.
+     3) # of times schedule() was called
+     4) # of times schedule() left the processor idle
+
+Next two are try_to_wake_up() statistics:
+
+     5) # of times try_to_wake_up() was called
+     6) # of times try_to_wake_up() was called to wake up the local cpu
+
+Next three are statistics describing scheduling latency:
+
+     7) sum of all time spent running by tasks on this processor (in jiffies)
+     8) sum of all time spent waiting to run by tasks on this processor (in
+        jiffies)
+     9) # of timeslices run on this cpu
+
+
+Domain statistics
+-----------------
+One of these is produced per domain for each cpu described. (Note that if
+CONFIG_SMP is not defined, *no* domains are utilized and these lines
+will not appear in the output.)
+
+domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
+
+The first field is a bit mask indicating what cpus this domain operates over.
+
+The next 24 are a variety of load_balance() statistics in grouped into types
+of idleness (idle, busy, and newly idle):
+
+    1)  # of times in this domain load_balance() was called when the
+        cpu was idle
+    2)  # of times in this domain load_balance() checked but found
+        the load did not require balancing when the cpu was idle
+    3)  # of times in this domain load_balance() tried to move one or
+        more tasks and failed, when the cpu was idle
+    4)  sum of imbalances discovered (if any) with each call to
+        load_balance() in this domain when the cpu was idle
+    5)  # of times in this domain pull_task() was called when the cpu
+        was idle
+    6)  # of times in this domain pull_task() was called even though
+        the target task was cache-hot when idle
+    7)  # of times in this domain load_balance() was called but did
+        not find a busier queue while the cpu was idle
+    8)  # of times in this domain a busier queue was found while the
+        cpu was idle but no busier group was found
+    9)  # of times in this domain load_balance() was called when the
+        cpu was busy
+    10) # of times in this domain load_balance() checked but found the
+        load did not require balancing when busy
+    11) # of times in this domain load_balance() tried to move one or
+        more tasks and failed, when the cpu was busy
+    12) sum of imbalances discovered (if any) with each call to
+        load_balance() in this domain when the cpu was busy
+    13) # of times in this domain pull_task() was called when busy
+    14) # of times in this domain pull_task() was called even though the
+        target task was cache-hot when busy
+    15) # of times in this domain load_balance() was called but did not
+        find a busier queue while the cpu was busy
+    16) # of times in this domain a busier queue was found while the cpu
+        was busy but no busier group was found
+
+    17) # of times in this domain load_balance() was called when the
+        cpu was just becoming idle
+    18) # of times in this domain load_balance() checked but found the
+        load did not require balancing when the cpu was just becoming idle
+    19) # of times in this domain load_balance() tried to move one or more
+        tasks and failed, when the cpu was just becoming idle
+    20) sum of imbalances discovered (if any) with each call to
+        load_balance() in this domain when the cpu was just becoming idle
+    21) # of times in this domain pull_task() was called when newly idle
+    22) # of times in this domain pull_task() was called even though the
+        target task was cache-hot when just becoming idle
+    23) # of times in this domain load_balance() was called but did not
+        find a busier queue while the cpu was just becoming idle
+    24) # of times in this domain a busier queue was found while the cpu
+        was just becoming idle but no busier group was found
+
+   Next three are active_load_balance() statistics:
+
+    25) # of times active_load_balance() was called
+    26) # of times active_load_balance() tried to move a task and failed
+    27) # of times active_load_balance() successfully moved a task
+
+   Next three are sched_balance_exec() statistics:
+
+    28) sbe_cnt is not used
+    29) sbe_balanced is not used
+    30) sbe_pushed is not used
+
+   Next three are sched_balance_fork() statistics:
+
+    31) sbf_cnt is not used
+    32) sbf_balanced is not used
+    33) sbf_pushed is not used
+
+   Next three are try_to_wake_up() statistics:
+
+    34) # of times in this domain try_to_wake_up() awoke a task that
+        last ran on a different cpu in this domain
+    35) # of times in this domain try_to_wake_up() moved a task to the
+        waking cpu because it was cache-cold on its own cpu anyway
+    36) # of times in this domain try_to_wake_up() started passive balancing
+
+/proc/<pid>/schedstat
+---------------------
+schedstats also adds a new /proc/<pid>/schedstat file to include some of
+the same information on a per-process level.  There are three fields in
+this file correlating for that process to:
+
+     1) time spent on the cpu
+     2) time spent waiting on a runqueue
+     3) # of timeslices run on this cpu
+
+A program could be easily written to make use of these extra fields to
+report on how well a particular process or set of processes is faring
+under the scheduler's policies.  A simple version of such a program is
+available at
+
+    http://eaglet.rain.com/rick/linux/schedstat/v12/latency.c
diff --git a/Documentation/scheduler/sched-stats.txt b/Documentation/scheduler/sched-stats.txt
deleted file mode 100644
index 8259b34a66ae..000000000000
--- a/Documentation/scheduler/sched-stats.txt
+++ /dev/null
@@ -1,154 +0,0 @@
-Version 15 of schedstats dropped counters for some sched_yield:
-yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is
-identical to version 14.
-
-Version 14 of schedstats includes support for sched_domains, which hit the
-mainline kernel in 2.6.20 although it is identical to the stats from version
-12 which was in the kernel from 2.6.13-2.6.19 (version 13 never saw a kernel
-release).  Some counters make more sense to be per-runqueue; other to be
-per-domain.  Note that domains (and their associated information) will only
-be pertinent and available on machines utilizing CONFIG_SMP.
-
-In version 14 of schedstat, there is at least one level of domain
-statistics for each cpu listed, and there may well be more than one
-domain.  Domains have no particular names in this implementation, but
-the highest numbered one typically arbitrates balancing across all the
-cpus on the machine, while domain0 is the most tightly focused domain,
-sometimes balancing only between pairs of cpus.  At this time, there
-are no architectures which need more than three domain levels. The first
-field in the domain stats is a bit map indicating which cpus are affected
-by that domain.
-
-These fields are counters, and only increment.  Programs which make use
-of these will need to start with a baseline observation and then calculate
-the change in the counters at each subsequent observation.  A perl script
-which does this for many of the fields is available at
-
-    http://eaglet.rain.com/rick/linux/schedstat/
-
-Note that any such script will necessarily be version-specific, as the main
-reason to change versions is changes in the output format.  For those wishing
-to write their own scripts, the fields are described here.
-
-CPU statistics
---------------
-cpu<N> 1 2 3 4 5 6 7 8 9
-
-First field is a sched_yield() statistic:
-     1) # of times sched_yield() was called
-
-Next three are schedule() statistics:
-     2) This field is a legacy array expiration count field used in the O(1)
-	scheduler. We kept it for ABI compatibility, but it is always set to zero.
-     3) # of times schedule() was called
-     4) # of times schedule() left the processor idle
-
-Next two are try_to_wake_up() statistics:
-     5) # of times try_to_wake_up() was called
-     6) # of times try_to_wake_up() was called to wake up the local cpu
-
-Next three are statistics describing scheduling latency:
-     7) sum of all time spent running by tasks on this processor (in jiffies)
-     8) sum of all time spent waiting to run by tasks on this processor (in
-        jiffies)
-     9) # of timeslices run on this cpu
-
-
-Domain statistics
------------------
-One of these is produced per domain for each cpu described. (Note that if
-CONFIG_SMP is not defined, *no* domains are utilized and these lines
-will not appear in the output.)
-
-domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
-
-The first field is a bit mask indicating what cpus this domain operates over.
-
-The next 24 are a variety of load_balance() statistics in grouped into types
-of idleness (idle, busy, and newly idle):
-
-     1) # of times in this domain load_balance() was called when the
-        cpu was idle
-     2) # of times in this domain load_balance() checked but found
-        the load did not require balancing when the cpu was idle
-     3) # of times in this domain load_balance() tried to move one or
-        more tasks and failed, when the cpu was idle
-     4) sum of imbalances discovered (if any) with each call to
-        load_balance() in this domain when the cpu was idle
-     5) # of times in this domain pull_task() was called when the cpu
-        was idle
-     6) # of times in this domain pull_task() was called even though
-        the target task was cache-hot when idle
-     7) # of times in this domain load_balance() was called but did
-        not find a busier queue while the cpu was idle
-     8) # of times in this domain a busier queue was found while the
-        cpu was idle but no busier group was found
-
-     9) # of times in this domain load_balance() was called when the
-        cpu was busy
-    10) # of times in this domain load_balance() checked but found the
-        load did not require balancing when busy
-    11) # of times in this domain load_balance() tried to move one or
-        more tasks and failed, when the cpu was busy
-    12) sum of imbalances discovered (if any) with each call to
-        load_balance() in this domain when the cpu was busy
-    13) # of times in this domain pull_task() was called when busy
-    14) # of times in this domain pull_task() was called even though the
-        target task was cache-hot when busy
-    15) # of times in this domain load_balance() was called but did not
-        find a busier queue while the cpu was busy
-    16) # of times in this domain a busier queue was found while the cpu
-        was busy but no busier group was found
-
-    17) # of times in this domain load_balance() was called when the
-        cpu was just becoming idle
-    18) # of times in this domain load_balance() checked but found the
-        load did not require balancing when the cpu was just becoming idle
-    19) # of times in this domain load_balance() tried to move one or more
-        tasks and failed, when the cpu was just becoming idle
-    20) sum of imbalances discovered (if any) with each call to
-        load_balance() in this domain when the cpu was just becoming idle
-    21) # of times in this domain pull_task() was called when newly idle
-    22) # of times in this domain pull_task() was called even though the
-        target task was cache-hot when just becoming idle
-    23) # of times in this domain load_balance() was called but did not
-        find a busier queue while the cpu was just becoming idle
-    24) # of times in this domain a busier queue was found while the cpu
-        was just becoming idle but no busier group was found
-
-   Next three are active_load_balance() statistics:
-    25) # of times active_load_balance() was called
-    26) # of times active_load_balance() tried to move a task and failed
-    27) # of times active_load_balance() successfully moved a task
-
-   Next three are sched_balance_exec() statistics:
-    28) sbe_cnt is not used
-    29) sbe_balanced is not used
-    30) sbe_pushed is not used
-
-   Next three are sched_balance_fork() statistics:
-    31) sbf_cnt is not used
-    32) sbf_balanced is not used
-    33) sbf_pushed is not used
-
-   Next three are try_to_wake_up() statistics:
-    34) # of times in this domain try_to_wake_up() awoke a task that
-        last ran on a different cpu in this domain
-    35) # of times in this domain try_to_wake_up() moved a task to the
-        waking cpu because it was cache-cold on its own cpu anyway
-    36) # of times in this domain try_to_wake_up() started passive balancing
-
-/proc/<pid>/schedstat
-----------------
-schedstats also adds a new /proc/<pid>/schedstat file to include some of
-the same information on a per-process level.  There are three fields in
-this file correlating for that process to:
-     1) time spent on the cpu
-     2) time spent waiting on a runqueue
-     3) # of timeslices run on this cpu
-
-A program could be easily written to make use of these extra fields to
-report on how well a particular process or set of processes is faring
-under the scheduler's policies.  A simple version of such a program is
-available at
-    http://eaglet.rain.com/rick/linux/schedstat/v12/latency.c
diff --git a/Documentation/scheduler/text_files.rst b/Documentation/scheduler/text_files.rst
new file mode 100644
index 000000000000..0bc50307b241
--- /dev/null
+++ b/Documentation/scheduler/text_files.rst
@@ -0,0 +1,5 @@
+Scheduler pelt c program
+------------------------
+
+.. literalinclude:: sched-pelt.c
+    :language: c
diff --git a/Documentation/scsi/osst.txt b/Documentation/scsi/osst.txt
deleted file mode 100644
index 00c8ebb2fd18..000000000000
--- a/Documentation/scsi/osst.txt
+++ /dev/null
@@ -1,218 +0,0 @@
-README file for the osst driver
-===============================
-(w) Kurt Garloff <garloff@suse.de> 12/2000
-
-This file describes the osst driver as of version 0.8.x/0.9.x, the released
-version of the osst driver.
-It is intended to help advanced users to understand the role of osst and to
-get them started using (and maybe debugging) it.
-It won't address issues like "How do I compile a kernel?" or "How do I load
-a module?", as these are too basic.
-Once the OnStream got merged into the official kernel, the distro makers
-will provide the OnStream support for those who are not familiar with
-hacking their kernels.
-
-
-Purpose
--------
-The osst driver was developed, because the standard SCSI tape driver in
-Linux, st, does not support the OnStream SC-x0 SCSI tape. The st is not to
-blame for that, as the OnStream tape drives do not support the standard SCSI
-command set for Serial Access Storage Devices (SASDs), which basically
-corresponds to the QIC-157 spec.
-Nevertheless, the OnStream tapes are nice pieces of hardware and therefore
-the osst driver has been written to make these tape devs supported by Linux.
-The driver is free software. It's released under the GNU GPL and planned to
-be integrated into the mainstream kernel.
-
-
-Implementation
---------------
-The osst is a new high-level SCSI driver, just like st, sr, sd and sg. It
-can be compiled into the kernel or loaded as a module.
-As it represents a new device, it got assigned a new device node: /dev/osstX
-are character devices with major no 206 and minor numbers like the /dev/stX
-devices. If those are not present, you may create them by calling
-Makedevs.sh as root (see below).
-The driver started being a copy of st and as such, the osst devices'
-behavior looks very much the same as st to the userspace applications.
-
-
-History
--------
-In the first place, osst shared its identity very much with st. That meant
-that it used the same kernel structures and the same device node as st.
-So you could only have either of them being present in the kernel. This has
-been fixed by registering an own device, now.
-st and osst can coexist, each only accessing the devices it can support by
-themselves.
-
-
-Installation
-------------
-osst got integrated into the linux kernel. Select it during kernel
-configuration as module or compile statically into the kernel.
-Compile your kernel and install the modules.
-
-Now, your osst driver is inside the kernel or available as a module,
-depending on your choice during kernel config. You may still need to create
-the device nodes by calling the Makedevs.sh script (see below) manually.
-
-To load your module, you may use the command 
-modprobe osst
-as root. dmesg should show you, whether your OnStream tapes have been
-recognized.
-
-If you want to have the module autoloaded on access to /dev/osst, you may
-add something like
-alias char-major-206 osst
-to a file under /etc/modprobe.d/ directory.
-
-You may find it convenient to create a symbolic link 
-ln -s nosst0 /dev/tape
-to make programs assuming a default name of /dev/tape more convenient to
-use.
-
-The device nodes for osst have to be created. Use the Makedevs.sh script
-attached to this file.
-
-
-Using it
---------
-You may use the OnStream tape driver with your standard backup software,
-which may be tar, cpio, amanda, arkeia, BRU, Lone Tar, ...
-by specifying /dev/(n)osst0 as the tape device to use or using the above
-symlink trick. The IOCTLs to control tape operation are also mostly
-supported and you may try the mt (or mt_st) program to jump between
-filemarks, eject the tape, ...
-
-There's one limitation: You need to use a block size of 32kB.
-
-(This limitation is worked on and will be fixed in version 0.8.8 of
- this driver.)
-
-If you just want to get started with standard software, here is an example
-for creating and restoring a full backup:
-# Backup
-tar cvf - / --exclude /proc | buffer -s 32k -m 24M -B -t -o /dev/nosst0
-# Restore
-buffer -s 32k -m 8M -B -t -i /dev/osst0 | tar xvf - -C /
-
-The buffer command has been used to buffer the data before it goes to the
-tape (or the file system) in order to smooth out the data stream and prevent
-the tape from needing to stop and rewind. The OnStream does have an internal
-buffer and a variable speed which help this, but especially on writing, the
-buffering still proves useful in most cases. It also pads the data to
-guarantees the block size of 32k. (Otherwise you may pass the -b64 option to
-tar.)
-Expect something like 1.8MB/s for the SC-x0 drives and 0.9MB/s for the DI-30.
-The USB drive will give you about 0.7MB/s.
-On a fast machine, you may profit from software data compression (z flag for
-tar).
-
-
-USB and IDE
------------
-Via the SCSI emulation layers usb-storage and ide-scsi, you can also use the
-osst driver to drive the USB-30 and the DI-30 drives. (Unfortunately, there
-is no such layer for the parallel port, otherwise the DP-30 would work as
-well.) For the USB support, you need the latest 2.4.0-test kernels and the 
-latest usb-storage driver from 
-http://www.linux-usb.org/
-http://sourceforge.net/cvs/?group_id=3581
-
-Note that the ide-tape driver as of 1.16f uses a slightly outdated on-tape
-format and therefore is not completely interoperable with osst tapes.
-
-The ADR-x0 line is fully SCSI-2 compliant and is supported by st, not osst.
-The on-tape format is supposed to be compatible with the one used by osst.
-
-
-Feedback and updates
---------------------
-The driver development is coordinated through a mailing list
-<osst@linux1.onstream.nl>
-a CVS repository and some web pages. 
-The tester's pages which contain recent news and updated drivers to download
-can be found on
-http://sourceforge.net/projects/osst/
-
-If you find any problems, please have a look at the tester's page in order
-to see whether the problem is already known and solved. Otherwise, please
-report it to the mailing list. Your feedback is welcome. (This holds also
-for reports of successful usage, of course.) 
-In case of trouble, please do always provide the following info:
-* driver and kernel version used (see syslog)
-* driver messages (syslog)
-* SCSI config and OnStream Firmware (/proc/scsi/scsi)
-* description of error. Is it reproducible?
-* software and commands used
-
-You may subscribe to the mailing list, BTW, it's a majordomo list.
-
-
-Status
-------
-0.8.0 was the first widespread BETA release. Since then a lot of reports
-have been sent, but mostly reported success or only minor trouble.
-All the issues have been addressed.
-Check the web pages for more info about the current developments.
-0.9.x is the tree for the 2.3/2.4 kernel.
-
-
-Acknowledgments
-----------------
-The driver has been started by making a copy of Kai Makisara's st driver.
-Most of the development has been done by Willem Riede. The presence of the
-userspace program osg (onstreamsg) from Terry Hardie has been rather
-helpful. The same holds for Gadi Oxman's ide-tape support for the DI-30.
-I did add some patches to those drivers as well and coordinated things a
-little bit. 
-Note that most of them did mostly spend their spare time for the creation of
-this driver.
-The people from OnStream, especially Jack Bombeeck did support this project
-and always tried to answer HW or FW related questions. Furthermore, he
-pushed the FW developers to do the right things.
-SuSE did support this project by allowing me to work on it during my working
-time for them and by integrating the driver into their distro.
-
-More people did help by sending useful comments. Sorry to those who have
-been forgotten. Thanks to all the GNU/FSF and Linux developers who made this
-platform such an interesting, nice and stable platform.
-Thanks go to those who tested the drivers and did send useful reports. Your
-help is needed!
-
-
-Makedevs.sh
------------
-#!/bin/sh
-# Script to create OnStream SC-x0 device nodes (major 206)
-# Usage: Makedevs.sh [nos [path to dev]]
-# $Id: README.osst.kernel,v 1.4 2000/12/20 14:13:15 garloff Exp $
-major=206
-nrs=4
-dir=/dev
-test -z "$1" || nrs=$1
-test -z "$2" || dir=$2
-declare -i nr
-nr=0
-test -d $dir || mkdir -p $dir
-while test $nr -lt $nrs; do
-  mknod $dir/osst$nr c $major $nr
-  chown 0.disk $dir/osst$nr; chmod 660 $dir/osst$nr;
-  mknod $dir/nosst$nr c $major $[nr+128]
-  chown 0.disk $dir/nosst$nr; chmod 660 $dir/nosst$nr;
-  mknod $dir/osst${nr}l c $major $[nr+32]
-  chown 0.disk $dir/osst${nr}l; chmod 660 $dir/osst${nr}l;
-  mknod $dir/nosst${nr}l c $major $[nr+160]
-  chown 0.disk $dir/nosst${nr}l; chmod 660 $dir/nosst${nr}l;
-  mknod $dir/osst${nr}m c $major $[nr+64]
-  chown 0.disk $dir/osst${nr}m; chmod 660 $dir/osst${nr}m;
-  mknod $dir/nosst${nr}m c $major $[nr+192]
-  chown 0.disk $dir/nosst${nr}m; chmod 660 $dir/nosst${nr}m;
-  mknod $dir/osst${nr}a c $major $[nr+96]
-  chown 0.disk $dir/osst${nr}a; chmod 660 $dir/osst${nr}a;
-  mknod $dir/nosst${nr}a c $major $[nr+224]
-  chown 0.disk $dir/nosst${nr}a; chmod 660 $dir/nosst${nr}a;
-  let nr+=1
-done
diff --git a/Documentation/scsi/ufs.txt b/Documentation/scsi/ufs.txt
index 1769f71c4c20..81842ec3e116 100644
--- a/Documentation/scsi/ufs.txt
+++ b/Documentation/scsi/ufs.txt
@@ -158,6 +158,13 @@ send SG_IO with the applicable sg_io_v4:
 If you wish to read or write a descriptor, use the appropriate xferp of
 sg_io_v4.
 
+The userspace tool that interacts with the ufs-bsg endpoint and uses its
+upiu-based protocol is available at:
+
+	https://github.com/westerndigitalcorporation/ufs-tool
+
+For more detailed information about the tool and its supported
+features, please see the tool's README.
 
 UFS Specifications can be found at,
 UFS - http://www.jedec.org/sites/default/files/docs/JESD220.pdf
diff --git a/Documentation/security/IMA-templates.rst b/Documentation/security/IMA-templates.rst
index 2cd0e273cc9a..3d1cca287aa4 100644
--- a/Documentation/security/IMA-templates.rst
+++ b/Documentation/security/IMA-templates.rst
@@ -69,15 +69,16 @@ descriptors by adding their identifier to the format string
    algorithm (field format: [<hash algo>:]digest, where the digest
    prefix is shown only if the hash algorithm is not SHA1 or MD5);
  - 'n-ng': the name of the event, without size limitations;
- - 'sig': the file signature.
+ - 'sig': the file signature;
+ - 'buf': the buffer data that was used to generate the hash without size limitations;
 
 
 Below, there is the list of defined template descriptors:
 
  - "ima": its format is ``d|n``;
  - "ima-ng" (default): its format is ``d-ng|n-ng``;
- - "ima-sig": its format is ``d-ng|n-ng|sig``.
-
+ - "ima-sig": its format is ``d-ng|n-ng|sig``;
+ - "ima-buf": its format is ``d-ng|n-ng|buf``;
 
 
 Use
diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst
index aad6d92ffe31..fc503dd689a7 100644
--- a/Documentation/security/index.rst
+++ b/Documentation/security/index.rst
@@ -8,7 +8,10 @@ Security Documentation
    credentials
    IMA-templates
    keys/index
-   LSM
+   lsm
+   lsm-development
+   sak
    SCTP
    self-protection
+   siphash
    tpm/index
diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index 9521c4207f01..d6d8b0b756b6 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -433,6 +433,10 @@ The main syscalls are:
      /sbin/request-key will be invoked in an attempt to obtain a key. The
      callout_info string will be passed as an argument to the program.
 
+     To link a key into the destination keyring the key must grant link
+     permission on the key to the caller and the keyring must grant write
+     permission.
+
      See also Documentation/security/keys/request-key.rst.
 
 
@@ -577,6 +581,27 @@ The keyctl syscall functions are:
      added.
 
 
+  *  Move a key from one keyring to another::
+
+	long keyctl(KEYCTL_MOVE,
+		    key_serial_t id,
+		    key_serial_t from_ring_id,
+		    key_serial_t to_ring_id,
+		    unsigned int flags);
+
+     Move the key specified by "id" from the keyring specified by
+     "from_ring_id" to the keyring specified by "to_ring_id".  If the two
+     keyrings are the same, nothing is done.
+
+     "flags" can have KEYCTL_MOVE_EXCL set in it to cause the operation to fail
+     with EEXIST if a matching key exists in the destination keyring, otherwise
+     such a key will be replaced.
+
+     A process must have link permission on the key for this function to be
+     successful and write permission on both keyrings.  Any errors that can
+     occur from KEYCTL_LINK also apply on the destination keyring here.
+
+
   *  Unlink a key or keyring from another keyring::
 
 	long keyctl(KEYCTL_UNLINK, key_serial_t keyring, key_serial_t key);
@@ -1077,49 +1102,43 @@ payload contents" for more information.
     See also Documentation/security/keys/request-key.rst.
 
 
+ *  To search for a key in a specific domain, call:
+
+	struct key *request_key_tag(const struct key_type *type,
+				    const char *description,
+				    struct key_tag *domain_tag,
+				    const char *callout_info);
+
+    This is identical to request_key(), except that a domain tag may be
+    specifies that causes search algorithm to only match keys matching that
+    tag.  The domain_tag may be NULL, specifying a global domain that is
+    separate from any nominated domain.
+
+
  *  To search for a key, passing auxiliary data to the upcaller, call::
 
 	struct key *request_key_with_auxdata(const struct key_type *type,
 					     const char *description,
+					     struct key_tag *domain_tag,
 					     const void *callout_info,
 					     size_t callout_len,
 					     void *aux);
 
-    This is identical to request_key(), except that the auxiliary data is
-    passed to the key_type->request_key() op if it exists, and the callout_info
-    is a blob of length callout_len, if given (the length may be 0).
-
-
- *  A key can be requested asynchronously by calling one of::
-
-	struct key *request_key_async(const struct key_type *type,
-				      const char *description,
-				      const void *callout_info,
-				      size_t callout_len);
-
-    or::
+    This is identical to request_key_tag(), except that the auxiliary data is
+    passed to the key_type->request_key() op if it exists, and the
+    callout_info is a blob of length callout_len, if given (the length may be
+    0).
 
-	struct key *request_key_async_with_auxdata(const struct key_type *type,
-						   const char *description,
-						   const char *callout_info,
-					     	   size_t callout_len,
-					     	   void *aux);
 
-    which are asynchronous equivalents of request_key() and
-    request_key_with_auxdata() respectively.
+ *  To search for a key under RCU conditions, call::
 
-    These two functions return with the key potentially still under
-    construction.  To wait for construction completion, the following should be
-    called::
+	struct key *request_key_rcu(const struct key_type *type,
+				    const char *description,
+				    struct key_tag *domain_tag);
 
-	int wait_for_key_construction(struct key *key, bool intr);
-
-    The function will wait for the key to finish being constructed and then
-    invokes key_validate() to return an appropriate value to indicate the state
-    of the key (0 indicates the key is usable).
-
-    If intr is true, then the wait can be interrupted by a signal, in which
-    case error ERESTARTSYS will be returned.
+    which is similar to request_key_tag() except that it does not check for
+    keys that are under construction and it will not call out to userspace to
+    construct a key if it can't find a match.
 
 
  *  When it is no longer required, the key should be released using::
@@ -1159,11 +1178,13 @@ payload contents" for more information.
 
 	key_ref_t keyring_search(key_ref_t keyring_ref,
 				 const struct key_type *type,
-				 const char *description)
+				 const char *description,
+				 bool recurse)
 
-    This searches the keyring tree specified for a matching key. Error ENOKEY
-    is returned upon failure (use IS_ERR/PTR_ERR to determine). If successful,
-    the returned key will need to be released.
+    This searches the specified keyring only (recurse == false) or keyring tree
+    (recurse == true) specified for a matching key. Error ENOKEY is returned
+    upon failure (use IS_ERR/PTR_ERR to determine). If successful, the returned
+    key will need to be released.
 
     The possession attribute from the keyring reference is used to control
     access through the permissions mask and is propagated to the returned key
@@ -1594,10 +1615,12 @@ The structure has a number of fields, some of which are mandatory:
      attempted key link operation. If there is no match, -EINVAL is returned.
 
 
-  *  ``int (*asym_eds_op)(struct kernel_pkey_params *params,
-			  const void *in, void *out);``
-     ``int (*asym_verify_signature)(struct kernel_pkey_params *params,
-				    const void *in, const void *in2);``
+  *  ``asym_eds_op`` and ``asym_verify_signature``::
+
+       int (*asym_eds_op)(struct kernel_pkey_params *params,
+			  const void *in, void *out);
+       int (*asym_verify_signature)(struct kernel_pkey_params *params,
+				    const void *in, const void *in2);
 
      These methods are optional.  If provided the first allows a key to be
      used to encrypt, decrypt or sign a blob of data, and the second allows a
@@ -1662,8 +1685,10 @@ The structure has a number of fields, some of which are mandatory:
      required crypto isn't available.
 
 
-  *  ``int (*asym_query)(const struct kernel_pkey_params *params,
-			 struct kernel_pkey_query *info);``
+  *  ``asym_query``::
+
+       int (*asym_query)(const struct kernel_pkey_params *params,
+			 struct kernel_pkey_query *info);
 
      This method is optional.  If provided it allows information about the
      public or asymmetric key held in the key to be determined.
diff --git a/Documentation/security/keys/request-key.rst b/Documentation/security/keys/request-key.rst
index 600ad67d1707..35f2296b704a 100644
--- a/Documentation/security/keys/request-key.rst
+++ b/Documentation/security/keys/request-key.rst
@@ -15,26 +15,25 @@ The process starts by either the kernel requesting a service by calling
 
 or::
 
+	struct key *request_key_tag(const struct key_type *type,
+				    const char *description,
+				    const struct key_tag *domain_tag,
+				    const char *callout_info);
+
+or::
+
 	struct key *request_key_with_auxdata(const struct key_type *type,
 					     const char *description,
+					     const struct key_tag *domain_tag,
 					     const char *callout_info,
 					     size_t callout_len,
 					     void *aux);
 
 or::
 
-	struct key *request_key_async(const struct key_type *type,
-				      const char *description,
-				      const char *callout_info,
-				      size_t callout_len);
-
-or::
-
-	struct key *request_key_async_with_auxdata(const struct key_type *type,
-						   const char *description,
-						   const char *callout_info,
-					     	   size_t callout_len,
-						   void *aux);
+	struct key *request_key_rcu(const struct key_type *type,
+				    const char *description,
+				    const struct key_tag *domain_tag);
 
 Or by userspace invoking the request_key system call::
 
@@ -48,14 +47,18 @@ does not need to link the key to a keyring to prevent it from being immediately
 destroyed.  The kernel interface returns a pointer directly to the key, and
 it's up to the caller to destroy the key.
 
-The request_key*_with_auxdata() calls are like the in-kernel request_key*()
-calls, except that they permit auxiliary data to be passed to the upcaller (the
-default is NULL).  This is only useful for those key types that define their
-own upcall mechanism rather than using /sbin/request-key.
+The request_key_tag() call is like the in-kernel request_key(), except that it
+also takes a domain tag that allows keys to be separated by namespace and
+killed off as a group.
+
+The request_key_with_auxdata() calls is like the request_key_tag() call, except
+that they permit auxiliary data to be passed to the upcaller (the default is
+NULL).  This is only useful for those key types that define their own upcall
+mechanism rather than using /sbin/request-key.
 
-The two async in-kernel calls may return keys that are still in the process of
-being constructed.  The two non-async ones will wait for construction to
-complete first.
+The request_key_rcu() call is like the request_key_tag() call, except that it
+doesn't check for keys that are under construction and doesn't attempt to
+construct missing keys.
 
 The userspace interface links the key to a keyring associated with the process
 to prevent the key from going away, and returns the serial number of the key to
@@ -148,7 +151,7 @@ The Search Algorithm
 
 A search of any particular keyring proceeds in the following fashion:
 
-  1) When the key management code searches for a key (keyring_search_aux) it
+  1) When the key management code searches for a key (keyring_search_rcu) it
      firstly calls key_permission(SEARCH) on the keyring it's starting with,
      if this denies permission, it doesn't search further.
 
@@ -167,6 +170,9 @@ The process stops immediately a valid key is found with permission granted to
 use it.  Any error from a previous match attempt is discarded and the key is
 returned.
 
+When request_key() is invoked, if CONFIG_KEYS_REQUEST_CACHE=y, a per-task
+one-key cache is first checked for a match.
+
 When search_process_keyrings() is invoked, it performs the following searches
 until one succeeds:
 
@@ -186,7 +192,9 @@ until one succeeds:
       c) The calling process's session keyring is searched.
 
 The moment one succeeds, all pending errors are discarded and the found key is
-returned.
+returned.  If CONFIG_KEYS_REQUEST_CACHE=y, then that key is placed in the
+per-task cache, displacing the previous key.  The cache is cleared on exit or
+just prior to resumption of userspace.
 
 Only if all these fail does the whole thing fail with the highest priority
 error.  Note that several errors may have come from LSM.
diff --git a/Documentation/security/keys/trusted-encrypted.rst b/Documentation/security/keys/trusted-encrypted.rst
index 7b35fcb58933..50ac8bcd6970 100644
--- a/Documentation/security/keys/trusted-encrypted.rst
+++ b/Documentation/security/keys/trusted-encrypted.rst
@@ -107,12 +107,14 @@ Where::
 
 Examples of trusted and encrypted key usage:
 
-Create and save a trusted key named "kmk" of length 32 bytes::
+Create and save a trusted key named "kmk" of length 32 bytes.
 
 Note: When using a TPM 2.0 with a persistent key with handle 0x81000001,
 append 'keyhandle=0x81000001' to statements between quotes, such as
 "new 32 keyhandle=0x81000001".
 
+::
+
     $ keyctl add trusted kmk "new 32" @u
     440502848
 
diff --git a/Documentation/security/LSM.rst b/Documentation/security/lsm-development.rst
index 31d92bc5fdd2..31d92bc5fdd2 100644
--- a/Documentation/security/LSM.rst
+++ b/Documentation/security/lsm-development.rst
diff --git a/Documentation/lsm.txt b/Documentation/security/lsm.rst
index ad4dfd020e0d..ad4dfd020e0d 100644
--- a/Documentation/lsm.txt
+++ b/Documentation/security/lsm.rst
diff --git a/Documentation/SAK.txt b/Documentation/security/sak.rst
index 260e1d3687bd..260e1d3687bd 100644
--- a/Documentation/SAK.txt
+++ b/Documentation/security/sak.rst
diff --git a/Documentation/siphash.txt b/Documentation/security/siphash.rst
index 9965821ab333..9965821ab333 100644
--- a/Documentation/siphash.txt
+++ b/Documentation/security/siphash.rst
diff --git a/Documentation/security/tpm/index.rst b/Documentation/security/tpm/index.rst
index af77a7bbb070..3296533e54cf 100644
--- a/Documentation/security/tpm/index.rst
+++ b/Documentation/security/tpm/index.rst
@@ -5,3 +5,4 @@ Trusted Platform Module documentation
 .. toctree::
 
    tpm_vtpm_proxy
+   xen-tpmfront
diff --git a/Documentation/security/tpm/xen-tpmfront.rst b/Documentation/security/tpm/xen-tpmfront.rst
new file mode 100644
index 000000000000..00d5b1db227d
--- /dev/null
+++ b/Documentation/security/tpm/xen-tpmfront.rst
@@ -0,0 +1,124 @@
+=============================
+Virtual TPM interface for Xen
+=============================
+
+Authors: Matthew Fioravante (JHUAPL), Daniel De Graaf (NSA)
+
+This document describes the virtual Trusted Platform Module (vTPM) subsystem for
+Xen. The reader is assumed to have familiarity with building and installing Xen,
+Linux, and a basic understanding of the TPM and vTPM concepts.
+
+Introduction
+------------
+
+The goal of this work is to provide a TPM functionality to a virtual guest
+operating system (in Xen terms, a DomU).  This allows programs to interact with
+a TPM in a virtual system the same way they interact with a TPM on the physical
+system.  Each guest gets its own unique, emulated, software TPM.  However, each
+of the vTPM's secrets (Keys, NVRAM, etc) are managed by a vTPM Manager domain,
+which seals the secrets to the Physical TPM.  If the process of creating each of
+these domains (manager, vTPM, and guest) is trusted, the vTPM subsystem extends
+the chain of trust rooted in the hardware TPM to virtual machines in Xen. Each
+major component of vTPM is implemented as a separate domain, providing secure
+separation guaranteed by the hypervisor. The vTPM domains are implemented in
+mini-os to reduce memory and processor overhead.
+
+This mini-os vTPM subsystem was built on top of the previous vTPM work done by
+IBM and Intel corporation.
+
+
+Design Overview
+---------------
+
+The architecture of vTPM is described below::
+
+  +------------------+
+  |    Linux DomU    | ...
+  |       |  ^       |
+  |       v  |       |
+  |   xen-tpmfront   |
+  +------------------+
+          |  ^
+          v  |
+  +------------------+
+  | mini-os/tpmback  |
+  |       |  ^       |
+  |       v  |       |
+  |  vtpm-stubdom    | ...
+  |       |  ^       |
+  |       v  |       |
+  | mini-os/tpmfront |
+  +------------------+
+          |  ^
+          v  |
+  +------------------+
+  | mini-os/tpmback  |
+  |       |  ^       |
+  |       v  |       |
+  | vtpmmgr-stubdom  |
+  |       |  ^       |
+  |       v  |       |
+  | mini-os/tpm_tis  |
+  +------------------+
+          |  ^
+          v  |
+  +------------------+
+  |   Hardware TPM   |
+  +------------------+
+
+* Linux DomU:
+	       The Linux based guest that wants to use a vTPM. There may be
+	       more than one of these.
+
+* xen-tpmfront.ko:
+		    Linux kernel virtual TPM frontend driver. This driver
+                    provides vTPM access to a Linux-based DomU.
+
+* mini-os/tpmback:
+		    Mini-os TPM backend driver. The Linux frontend driver
+		    connects to this backend driver to facilitate communications
+		    between the Linux DomU and its vTPM. This driver is also
+		    used by vtpmmgr-stubdom to communicate with vtpm-stubdom.
+
+* vtpm-stubdom:
+		 A mini-os stub domain that implements a vTPM. There is a
+		 one to one mapping between running vtpm-stubdom instances and
+                 logical vtpms on the system. The vTPM Platform Configuration
+                 Registers (PCRs) are normally all initialized to zero.
+
+* mini-os/tpmfront:
+		     Mini-os TPM frontend driver. The vTPM mini-os domain
+		     vtpm-stubdom uses this driver to communicate with
+		     vtpmmgr-stubdom. This driver is also used in mini-os
+		     domains such as pv-grub that talk to the vTPM domain.
+
+* vtpmmgr-stubdom:
+		    A mini-os domain that implements the vTPM manager. There is
+		    only one vTPM manager and it should be running during the
+		    entire lifetime of the machine.  This domain regulates
+		    access to the physical TPM on the system and secures the
+		    persistent state of each vTPM.
+
+* mini-os/tpm_tis:
+		    Mini-os TPM version 1.2 TPM Interface Specification (TIS)
+                    driver. This driver used by vtpmmgr-stubdom to talk directly to
+                    the hardware TPM. Communication is facilitated by mapping
+                    hardware memory pages into vtpmmgr-stubdom.
+
+* Hardware TPM:
+		The physical TPM that is soldered onto the motherboard.
+
+
+Integration With Xen
+--------------------
+
+Support for the vTPM driver was added in Xen using the libxl toolstack in Xen
+4.3.  See the Xen documentation (docs/misc/vtpm.txt) for details on setting up
+the vTPM and vTPM Manager stub domains.  Once the stub domains are running, a
+vTPM device is set up in the same manner as a disk or network device in the
+domain's configuration file.
+
+In order to use features such as IMA that require a TPM to be loaded prior to
+the initrd, the xen-tpmfront driver must be compiled in to the kernel.  If not
+using such features, the driver can be compiled as a module and will be loaded
+as usual.
diff --git a/Documentation/security/tpm/xen-tpmfront.txt b/Documentation/security/tpm/xen-tpmfront.txt
deleted file mode 100644
index 69346de87ff3..000000000000
--- a/Documentation/security/tpm/xen-tpmfront.txt
+++ /dev/null
@@ -1,113 +0,0 @@
-Virtual TPM interface for Xen
-
-Authors: Matthew Fioravante (JHUAPL), Daniel De Graaf (NSA)
-
-This document describes the virtual Trusted Platform Module (vTPM) subsystem for
-Xen. The reader is assumed to have familiarity with building and installing Xen,
-Linux, and a basic understanding of the TPM and vTPM concepts.
-
-INTRODUCTION
-
-The goal of this work is to provide a TPM functionality to a virtual guest
-operating system (in Xen terms, a DomU).  This allows programs to interact with
-a TPM in a virtual system the same way they interact with a TPM on the physical
-system.  Each guest gets its own unique, emulated, software TPM.  However, each
-of the vTPM's secrets (Keys, NVRAM, etc) are managed by a vTPM Manager domain,
-which seals the secrets to the Physical TPM.  If the process of creating each of
-these domains (manager, vTPM, and guest) is trusted, the vTPM subsystem extends
-the chain of trust rooted in the hardware TPM to virtual machines in Xen. Each
-major component of vTPM is implemented as a separate domain, providing secure
-separation guaranteed by the hypervisor. The vTPM domains are implemented in
-mini-os to reduce memory and processor overhead.
-
-This mini-os vTPM subsystem was built on top of the previous vTPM work done by
-IBM and Intel corporation.
-
-
-DESIGN OVERVIEW
----------------
-
-The architecture of vTPM is described below:
-
-+------------------+
-|    Linux DomU    | ...
-|       |  ^       |
-|       v  |       |
-|   xen-tpmfront   |
-+------------------+
-        |  ^
-        v  |
-+------------------+
-| mini-os/tpmback  |
-|       |  ^       |
-|       v  |       |
-|  vtpm-stubdom    | ...
-|       |  ^       |
-|       v  |       |
-| mini-os/tpmfront |
-+------------------+
-        |  ^
-        v  |
-+------------------+
-| mini-os/tpmback  |
-|       |  ^       |
-|       v  |       |
-| vtpmmgr-stubdom  |
-|       |  ^       |
-|       v  |       |
-| mini-os/tpm_tis  |
-+------------------+
-        |  ^
-        v  |
-+------------------+
-|   Hardware TPM   |
-+------------------+
-
- * Linux DomU: The Linux based guest that wants to use a vTPM. There may be
-	       more than one of these.
-
- * xen-tpmfront.ko: Linux kernel virtual TPM frontend driver. This driver
-                    provides vTPM access to a Linux-based DomU.
-
- * mini-os/tpmback: Mini-os TPM backend driver. The Linux frontend driver
-		    connects to this backend driver to facilitate communications
-		    between the Linux DomU and its vTPM. This driver is also
-		    used by vtpmmgr-stubdom to communicate with vtpm-stubdom.
-
- * vtpm-stubdom: A mini-os stub domain that implements a vTPM. There is a
-		 one to one mapping between running vtpm-stubdom instances and
-                 logical vtpms on the system. The vTPM Platform Configuration
-                 Registers (PCRs) are normally all initialized to zero.
-
- * mini-os/tpmfront: Mini-os TPM frontend driver. The vTPM mini-os domain
-		     vtpm-stubdom uses this driver to communicate with
-		     vtpmmgr-stubdom. This driver is also used in mini-os
-		     domains such as pv-grub that talk to the vTPM domain.
-
- * vtpmmgr-stubdom: A mini-os domain that implements the vTPM manager. There is
-		    only one vTPM manager and it should be running during the
-		    entire lifetime of the machine.  This domain regulates
-		    access to the physical TPM on the system and secures the
-		    persistent state of each vTPM.
-
- * mini-os/tpm_tis: Mini-os TPM version 1.2 TPM Interface Specification (TIS)
-                    driver. This driver used by vtpmmgr-stubdom to talk directly to
-                    the hardware TPM. Communication is facilitated by mapping
-                    hardware memory pages into vtpmmgr-stubdom.
-
- * Hardware TPM: The physical TPM that is soldered onto the motherboard.
-
-
-INTEGRATION WITH XEN
---------------------
-
-Support for the vTPM driver was added in Xen using the libxl toolstack in Xen
-4.3.  See the Xen documentation (docs/misc/vtpm.txt) for details on setting up
-the vTPM and vTPM Manager stub domains.  Once the stub domains are running, a
-vTPM device is set up in the same manner as a disk or network device in the
-domain's configuration file.
-
-In order to use features such as IMA that require a TPM to be loaded prior to
-the initrd, the xen-tpmfront driver must be compiled in to the kernel.  If not
-using such features, the driver can be compiled as a module and will be loaded
-as usual.
diff --git a/Documentation/serial/driver.rst b/Documentation/serial/driver.rst
deleted file mode 100644
index 4537119bf624..000000000000
--- a/Documentation/serial/driver.rst
+++ /dev/null
@@ -1,549 +0,0 @@
-====================
-Low Level Serial API
-====================
-
-
-This document is meant as a brief overview of some aspects of the new serial
-driver.  It is not complete, any questions you have should be directed to
-<rmk@arm.linux.org.uk>
-
-The reference implementation is contained within amba-pl011.c.
-
-
-
-Low Level Serial Hardware Driver
---------------------------------
-
-The low level serial hardware driver is responsible for supplying port
-information (defined by uart_port) and a set of control methods (defined
-by uart_ops) to the core serial driver.  The low level driver is also
-responsible for handling interrupts for the port, and providing any
-console support.
-
-
-Console Support
----------------
-
-The serial core provides a few helper functions.  This includes identifing
-the correct port structure (via uart_get_console) and decoding command line
-arguments (uart_parse_options).
-
-There is also a helper function (uart_console_write) which performs a
-character by character write, translating newlines to CRLF sequences.
-Driver writers are recommended to use this function rather than implementing
-their own version.
-
-
-Locking
--------
-
-It is the responsibility of the low level hardware driver to perform the
-necessary locking using port->lock.  There are some exceptions (which
-are described in the uart_ops listing below.)
-
-There are two locks.  A per-port spinlock, and an overall semaphore.
-
-From the core driver perspective, the port->lock locks the following
-data::
-
-	port->mctrl
-	port->icount
-	port->state->xmit.head (circ_buf->head)
-	port->state->xmit.tail (circ_buf->tail)
-
-The low level driver is free to use this lock to provide any additional
-locking.
-
-The port_sem semaphore is used to protect against ports being added/
-removed or reconfigured at inappropriate times. Since v2.6.27, this
-semaphore has been the 'mutex' member of the tty_port struct, and
-commonly referred to as the port mutex.
-
-
-uart_ops
---------
-
-The uart_ops structure is the main interface between serial_core and the
-hardware specific driver.  It contains all the methods to control the
-hardware.
-
-  tx_empty(port)
-	This function tests whether the transmitter fifo and shifter
-	for the port described by 'port' is empty.  If it is empty,
-	this function should return TIOCSER_TEMT, otherwise return 0.
-	If the port does not support this operation, then it should
-	return TIOCSER_TEMT.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  set_mctrl(port, mctrl)
-	This function sets the modem control lines for port described
-	by 'port' to the state described by mctrl.  The relevant bits
-	of mctrl are:
-
-		- TIOCM_RTS	RTS signal.
-		- TIOCM_DTR	DTR signal.
-		- TIOCM_OUT1	OUT1 signal.
-		- TIOCM_OUT2	OUT2 signal.
-		- TIOCM_LOOP	Set the port into loopback mode.
-
-	If the appropriate bit is set, the signal should be driven
-	active.  If the bit is clear, the signal should be driven
-	inactive.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  get_mctrl(port)
-	Returns the current state of modem control inputs.  The state
-	of the outputs should not be returned, since the core keeps
-	track of their state.  The state information should include:
-
-		- TIOCM_CAR	state of DCD signal
-		- TIOCM_CTS	state of CTS signal
-		- TIOCM_DSR	state of DSR signal
-		- TIOCM_RI	state of RI signal
-
-	The bit is set if the signal is currently driven active.  If
-	the port does not support CTS, DCD or DSR, the driver should
-	indicate that the signal is permanently active.  If RI is
-	not available, the signal should not be indicated as active.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  stop_tx(port)
-	Stop transmitting characters.  This might be due to the CTS
-	line becoming inactive or the tty layer indicating we want
-	to stop transmission due to an XOFF character.
-
-	The driver should stop transmitting characters as soon as
-	possible.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  start_tx(port)
-	Start transmitting characters.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  throttle(port)
-	Notify the serial driver that input buffers for the line discipline are
-	close to full, and it should somehow signal that no more characters
-	should be sent to the serial port.
-	This will be called only if hardware assisted flow control is enabled.
-
-	Locking: serialized with .unthrottle() and termios modification by the
-	tty layer.
-
-  unthrottle(port)
-	Notify the serial driver that characters can now be sent to the serial
-	port without fear of overrunning the input buffers of the line
-	disciplines.
-
-	This will be called only if hardware assisted flow control is enabled.
-
-	Locking: serialized with .throttle() and termios modification by the
-	tty layer.
-
-  send_xchar(port,ch)
-	Transmit a high priority character, even if the port is stopped.
-	This is used to implement XON/XOFF flow control and tcflow().  If
-	the serial driver does not implement this function, the tty core
-	will append the character to the circular buffer and then call
-	start_tx() / stop_tx() to flush the data out.
-
-	Do not transmit if ch == '\0' (__DISABLED_CHAR).
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  stop_rx(port)
-	Stop receiving characters; the port is in the process of
-	being closed.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  enable_ms(port)
-	Enable the modem status interrupts.
-
-	This method may be called multiple times.  Modem status
-	interrupts should be disabled when the shutdown method is
-	called.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  break_ctl(port,ctl)
-	Control the transmission of a break signal.  If ctl is
-	nonzero, the break signal should be transmitted.  The signal
-	should be terminated when another call is made with a zero
-	ctl.
-
-	Locking: caller holds tty_port->mutex
-
-  startup(port)
-	Grab any interrupt resources and initialise any low level driver
-	state.  Enable the port for reception.  It should not activate
-	RTS nor DTR; this will be done via a separate call to set_mctrl.
-
-	This method will only be called when the port is initially opened.
-
-	Locking: port_sem taken.
-
-	Interrupts: globally disabled.
-
-  shutdown(port)
-	Disable the port, disable any break condition that may be in
-	effect, and free any interrupt resources.  It should not disable
-	RTS nor DTR; this will have already been done via a separate
-	call to set_mctrl.
-
-	Drivers must not access port->state once this call has completed.
-
-	This method will only be called when there are no more users of
-	this port.
-
-	Locking: port_sem taken.
-
-	Interrupts: caller dependent.
-
-  flush_buffer(port)
-	Flush any write buffers, reset any DMA state and stop any
-	ongoing DMA transfers.
-
-	This will be called whenever the port->state->xmit circular
-	buffer is cleared.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  set_termios(port,termios,oldtermios)
-	Change the port parameters, including word length, parity, stop
-	bits.  Update read_status_mask and ignore_status_mask to indicate
-	the types of events we are interested in receiving.  Relevant
-	termios->c_cflag bits are:
-
-		CSIZE
-			- word size
-		CSTOPB
-			- 2 stop bits
-		PARENB
-			- parity enable
-		PARODD
-			- odd parity (when PARENB is in force)
-		CREAD
-			- enable reception of characters (if not set,
-			  still receive characters from the port, but
-			  throw them away.
-		CRTSCTS
-			- if set, enable CTS status change reporting
-		CLOCAL
-			- if not set, enable modem status change
-			  reporting.
-
-	Relevant termios->c_iflag bits are:
-
-		INPCK
-			- enable frame and parity error events to be
-			  passed to the TTY layer.
-		BRKINT / PARMRK
-			- both of these enable break events to be
-			  passed to the TTY layer.
-
-		IGNPAR
-			- ignore parity and framing errors
-		IGNBRK
-			- ignore break errors,  If IGNPAR is also
-			  set, ignore overrun errors as well.
-
-	The interaction of the iflag bits is as follows (parity error
-	given as an example):
-
-	=============== ======= ======  =============================
-	Parity error	INPCK	IGNPAR
-	=============== ======= ======  =============================
-	n/a		0	n/a	character received, marked as
-					TTY_NORMAL
-	None		1	n/a	character received, marked as
-					TTY_NORMAL
-	Yes		1	0	character received, marked as
-					TTY_PARITY
-	Yes		1	1	character discarded
-	=============== ======= ======  =============================
-
-	Other flags may be used (eg, xon/xoff characters) if your
-	hardware supports hardware "soft" flow control.
-
-	Locking: caller holds tty_port->mutex
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  set_ldisc(port,termios)
-	Notifier for discipline change. See Documentation/serial/tty.rst.
-
-	Locking: caller holds tty_port->mutex
-
-  pm(port,state,oldstate)
-	Perform any power management related activities on the specified
-	port.  State indicates the new state (defined by
-	enum uart_pm_state), oldstate indicates the previous state.
-
-	This function should not be used to grab any resources.
-
-	This will be called when the port is initially opened and finally
-	closed, except when the port is also the system console.  This
-	will occur even if CONFIG_PM is not set.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  type(port)
-	Return a pointer to a string constant describing the specified
-	port, or return NULL, in which case the string 'unknown' is
-	substituted.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  release_port(port)
-	Release any memory and IO region resources currently in use by
-	the port.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  request_port(port)
-	Request any memory and IO region resources required by the port.
-	If any fail, no resources should be registered when this function
-	returns, and it should return -EBUSY on failure.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  config_port(port,type)
-	Perform any autoconfiguration steps required for the port.  `type`
-	contains a bit mask of the required configuration.  UART_CONFIG_TYPE
-	indicates that the port requires detection and identification.
-	port->type should be set to the type found, or PORT_UNKNOWN if
-	no port was detected.
-
-	UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
-	which should be probed using standard kernel autoprobing techniques.
-	This is not necessary on platforms where ports have interrupts
-	internally hard wired (eg, system on a chip implementations).
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  verify_port(port,serinfo)
-	Verify the new serial port information contained within serinfo is
-	suitable for this port type.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  ioctl(port,cmd,arg)
-	Perform any port specific IOCTLs.  IOCTL commands must be defined
-	using the standard numbering system found in <asm/ioctl.h>
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  poll_init(port)
-	Called by kgdb to perform the minimal hardware initialization needed
-	to support poll_put_char() and poll_get_char().  Unlike ->startup()
-	this should not request interrupts.
-
-	Locking: tty_mutex and tty_port->mutex taken.
-
-	Interrupts: n/a.
-
-  poll_put_char(port,ch)
-	Called by kgdb to write a single character directly to the serial
-	port.  It can and should block until there is space in the TX FIFO.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  poll_get_char(port)
-	Called by kgdb to read a single character directly from the serial
-	port.  If data is available, it should be returned; otherwise
-	the function should return NO_POLL_CHAR immediately.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-Other functions
----------------
-
-uart_update_timeout(port,cflag,baud)
-	Update the FIFO drain timeout, port->timeout, according to the
-	number of bits, parity, stop bits and baud rate.
-
-	Locking: caller is expected to take port->lock
-
-	Interrupts: n/a
-
-uart_get_baud_rate(port,termios,old,min,max)
-	Return the numeric baud rate for the specified termios, taking
-	account of the special 38400 baud "kludge".  The B0 baud rate
-	is mapped to 9600 baud.
-
-	If the baud rate is not within min..max, then if old is non-NULL,
-	the original baud rate will be tried.  If that exceeds the
-	min..max constraint, 9600 baud will be returned.  termios will
-	be updated to the baud rate in use.
-
-	Note: min..max must always allow 9600 baud to be selected.
-
-	Locking: caller dependent.
-
-	Interrupts: n/a
-
-uart_get_divisor(port,baud)
-	Return the divisor (baud_base / baud) for the specified baud
-	rate, appropriately rounded.
-
-	If 38400 baud and custom divisor is selected, return the
-	custom divisor instead.
-
-	Locking: caller dependent.
-
-	Interrupts: n/a
-
-uart_match_port(port1,port2)
-	This utility function can be used to determine whether two
-	uart_port structures describe the same port.
-
-	Locking: n/a
-
-	Interrupts: n/a
-
-uart_write_wakeup(port)
-	A driver is expected to call this function when the number of
-	characters in the transmit buffer have dropped below a threshold.
-
-	Locking: port->lock should be held.
-
-	Interrupts: n/a
-
-uart_register_driver(drv)
-	Register a uart driver with the core driver.  We in turn register
-	with the tty layer, and initialise the core driver per-port state.
-
-	drv->port should be NULL, and the per-port structures should be
-	registered using uart_add_one_port after this call has succeeded.
-
-	Locking: none
-
-	Interrupts: enabled
-
-uart_unregister_driver()
-	Remove all references to a driver from the core driver.  The low
-	level driver must have removed all its ports via the
-	uart_remove_one_port() if it registered them with uart_add_one_port().
-
-	Locking: none
-
-	Interrupts: enabled
-
-**uart_suspend_port()**
-
-**uart_resume_port()**
-
-**uart_add_one_port()**
-
-**uart_remove_one_port()**
-
-Other notes
------------
-
-It is intended some day to drop the 'unused' entries from uart_port, and
-allow low level drivers to register their own individual uart_port's with
-the core.  This will allow drivers to use uart_port as a pointer to a
-structure containing both the uart_port entry with their own extensions,
-thus::
-
-	struct my_port {
-		struct uart_port	port;
-		int			my_stuff;
-	};
-
-Modem control lines via GPIO
-----------------------------
-
-Some helpers are provided in order to set/get modem control lines via GPIO.
-
-mctrl_gpio_init(port, idx):
-	This will get the {cts,rts,...}-gpios from device tree if they are
-	present and request them, set direction etc, and return an
-	allocated structure. `devm_*` functions are used, so there's no need
-	to call mctrl_gpio_free().
-	As this sets up the irq handling make sure to not handle changes to the
-	gpio input lines in your driver, too.
-
-mctrl_gpio_free(dev, gpios):
-	This will free the requested gpios in mctrl_gpio_init().
-	As `devm_*` functions are used, there's generally no need to call
-	this function.
-
-mctrl_gpio_to_gpiod(gpios, gidx)
-	This returns the gpio_desc structure associated to the modem line
-	index.
-
-mctrl_gpio_set(gpios, mctrl):
-	This will sets the gpios according to the mctrl state.
-
-mctrl_gpio_get(gpios, mctrl):
-	This will update mctrl with the gpios values.
-
-mctrl_gpio_enable_ms(gpios):
-	Enables irqs and handling of changes to the ms lines.
-
-mctrl_gpio_disable_ms(gpios):
-	Disables irqs and handling of changes to the ms lines.
diff --git a/Documentation/serial/index.rst b/Documentation/serial/index.rst
deleted file mode 100644
index d0ba22ea23bf..000000000000
--- a/Documentation/serial/index.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-:orphan:
-
-==========================
-Support for Serial devices
-==========================
-
-.. toctree::
-    :maxdepth: 1
-
-
-    driver
-    tty
-
-Serial drivers
-==============
-
-.. toctree::
-    :maxdepth: 1
-
-    cyclades_z
-    moxa-smartio
-    n_gsm
-    rocket
-    serial-iso7816
-    serial-rs485
-
-.. only::  subproject and html
-
-   Indices
-   =======
-
-   * :ref:`genindex`
diff --git a/Documentation/sh/conf.py b/Documentation/sh/conf.py
deleted file mode 100644
index 1eb684a13ac8..000000000000
--- a/Documentation/sh/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = "SuperH architecture implementation manual"
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'sh.tex', project,
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/sound/conf.py b/Documentation/sound/conf.py
deleted file mode 100644
index 3f1fc5e74e7b..000000000000
--- a/Documentation/sound/conf.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# -*- coding: utf-8; mode: python -*-
-
-project = "Linux Sound Subsystem Documentation"
-
-tags.add("subproject")
-
-latex_documents = [
-    ('index', 'sound.tex', project,
-     'The kernel development community', 'manual'),
-]
diff --git a/Documentation/sparc/index.rst b/Documentation/sparc/index.rst
index 91f7d6643dd5..71cff621f243 100644
--- a/Documentation/sparc/index.rst
+++ b/Documentation/sparc/index.rst
@@ -1,5 +1,3 @@
-:orphan:
-
 ==================
 Sparc Architecture
 ==================
diff --git a/Documentation/sphinx/automarkup.py b/Documentation/sphinx/automarkup.py
new file mode 100644
index 000000000000..77e89c1956d7
--- /dev/null
+++ b/Documentation/sphinx/automarkup.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright 2019 Jonathan Corbet <corbet@lwn.net>
+#
+# Apply kernel-specific tweaks after the initial document processing
+# has been done.
+#
+from docutils import nodes
+from sphinx import addnodes
+from sphinx.environment import NoUri
+import re
+
+#
+# Regex nastiness.  Of course.
+# Try to identify "function()" that's not already marked up some
+# other way.  Sphinx doesn't like a lot of stuff right after a
+# :c:func: block (i.e. ":c:func:`mmap()`s" flakes out), so the last
+# bit tries to restrict matches to things that won't create trouble.
+#
+RE_function = re.compile(r'([\w_][\w\d_]+\(\))')
+
+#
+# Many places in the docs refer to common system calls.  It is
+# pointless to try to cross-reference them and, as has been known
+# to happen, somebody defining a function by these names can lead
+# to the creation of incorrect and confusing cross references.  So
+# just don't even try with these names.
+#
+Skipfuncs = [ 'open', 'close', 'read', 'write', 'fcntl', 'mmap'
+              'select', 'poll', 'fork', 'execve', 'clone', 'ioctl']
+
+#
+# Find all occurrences of function() and try to replace them with
+# appropriate cross references.
+#
+def markup_funcs(docname, app, node):
+    cdom = app.env.domains['c']
+    t = node.astext()
+    done = 0
+    repl = [ ]
+    for m in RE_function.finditer(t):
+        #
+        # Include any text prior to function() as a normal text node.
+        #
+        if m.start() > done:
+            repl.append(nodes.Text(t[done:m.start()]))
+        #
+        # Go through the dance of getting an xref out of the C domain
+        #
+        target = m.group(1)[:-2]
+        target_text = nodes.Text(target + '()')
+        xref = None
+        if target not in Skipfuncs:
+            lit_text = nodes.literal(classes=['xref', 'c', 'c-func'])
+            lit_text += target_text
+            pxref = addnodes.pending_xref('', refdomain = 'c',
+                                          reftype = 'function',
+                                          reftarget = target, modname = None,
+                                          classname = None)
+            #
+            # XXX The Latex builder will throw NoUri exceptions here,
+            # work around that by ignoring them.
+            #
+            try:
+                xref = cdom.resolve_xref(app.env, docname, app.builder,
+                                         'function', target, pxref, lit_text)
+            except NoUri:
+                xref = None
+        #
+        # Toss the xref into the list if we got it; otherwise just put
+        # the function text.
+        #
+        if xref:
+            repl.append(xref)
+        else:
+            repl.append(target_text)
+        done = m.end()
+    if done < len(t):
+        repl.append(nodes.Text(t[done:]))
+    return repl
+
+def auto_markup(app, doctree, name):
+    #
+    # This loop could eventually be improved on.  Someday maybe we
+    # want a proper tree traversal with a lot of awareness of which
+    # kinds of nodes to prune.  But this works well for now.
+    #
+    # The nodes.literal test catches ``literal text``, its purpose is to
+    # avoid adding cross-references to functions that have been explicitly
+    # marked with cc:func:.
+    #
+    for para in doctree.traverse(nodes.paragraph):
+        for node in para.traverse(nodes.Text):
+            if not isinstance(node.parent, nodes.literal):
+                node.parent.replace(node, markup_funcs(name, app, node))
+
+def setup(app):
+    app.connect('doctree-resolved', auto_markup)
+    return {
+        'parallel_read_safe': True,
+        'parallel_write_safe': True,
+        }
diff --git a/Documentation/sphinx/cdomain.py b/Documentation/sphinx/cdomain.py
index cf13ff3a656c..cbac8e608dc4 100644
--- a/Documentation/sphinx/cdomain.py
+++ b/Documentation/sphinx/cdomain.py
@@ -48,7 +48,10 @@ major, minor, patch = sphinx.version_info[:3]
 
 def setup(app):
 
-    app.override_domain(CDomain)
+    if (major == 1 and minor < 8):
+        app.override_domain(CDomain)
+    else:
+        app.add_domain(CDomain, override=True)
 
     return dict(
         version = __version__,
diff --git a/Documentation/sphinx/kerneldoc.py b/Documentation/sphinx/kerneldoc.py
index 9d0a7f08f93b..1159405cb920 100644
--- a/Documentation/sphinx/kerneldoc.py
+++ b/Documentation/sphinx/kerneldoc.py
@@ -37,7 +37,19 @@ import glob
 from docutils import nodes, statemachine
 from docutils.statemachine import ViewList
 from docutils.parsers.rst import directives, Directive
-from sphinx.ext.autodoc import AutodocReporter
+
+#
+# AutodocReporter is only good up to Sphinx 1.7
+#
+import sphinx
+
+Use_SSI = sphinx.__version__[:3] >= '1.7'
+if Use_SSI:
+    from sphinx.util.docutils import switch_source_input
+else:
+    from sphinx.ext.autodoc import AutodocReporter
+
+import kernellog
 
 __version__  = '1.0'
 
@@ -90,7 +102,8 @@ class KernelDocDirective(Directive):
         cmd += [filename]
 
         try:
-            env.app.verbose('calling kernel-doc \'%s\'' % (" ".join(cmd)))
+            kernellog.verbose(env.app,
+                              'calling kernel-doc \'%s\'' % (" ".join(cmd)))
 
             p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             out, err = p.communicate()
@@ -100,7 +113,8 @@ class KernelDocDirective(Directive):
             if p.returncode != 0:
                 sys.stderr.write(err)
 
-                env.app.warn('kernel-doc \'%s\' failed with return code %d' % (" ".join(cmd), p.returncode))
+                kernellog.warn(env.app,
+                               'kernel-doc \'%s\' failed with return code %d' % (" ".join(cmd), p.returncode))
                 return [nodes.error(None, nodes.paragraph(text = "kernel-doc missing"))]
             elif env.config.kerneldoc_verbosity > 0:
                 sys.stderr.write(err)
@@ -121,20 +135,28 @@ class KernelDocDirective(Directive):
                     lineoffset += 1
 
             node = nodes.section()
-            buf = self.state.memo.title_styles, self.state.memo.section_level, self.state.memo.reporter
+            self.do_parse(result, node)
+
+            return node.children
+
+        except Exception as e:  # pylint: disable=W0703
+            kernellog.warn(env.app, 'kernel-doc \'%s\' processing failed with: %s' %
+                           (" ".join(cmd), str(e)))
+            return [nodes.error(None, nodes.paragraph(text = "kernel-doc missing"))]
+
+    def do_parse(self, result, node):
+        if Use_SSI:
+            with switch_source_input(self.state, result):
+                self.state.nested_parse(result, 0, node, match_titles=1)
+        else:
+            save = self.state.memo.title_styles, self.state.memo.section_level, self.state.memo.reporter
             self.state.memo.reporter = AutodocReporter(result, self.state.memo.reporter)
             self.state.memo.title_styles, self.state.memo.section_level = [], 0
             try:
                 self.state.nested_parse(result, 0, node, match_titles=1)
             finally:
-                self.state.memo.title_styles, self.state.memo.section_level, self.state.memo.reporter = buf
+                self.state.memo.title_styles, self.state.memo.section_level, self.state.memo.reporter = save
 
-            return node.children
-
-        except Exception as e:  # pylint: disable=W0703
-            env.app.warn('kernel-doc \'%s\' processing failed with: %s' %
-                         (" ".join(cmd), str(e)))
-            return [nodes.error(None, nodes.paragraph(text = "kernel-doc missing"))]
 
 def setup(app):
     app.add_config_value('kerneldoc_bin', None, 'env')
diff --git a/Documentation/sphinx/kernellog.py b/Documentation/sphinx/kernellog.py
new file mode 100644
index 000000000000..af924f51a7dc
--- /dev/null
+++ b/Documentation/sphinx/kernellog.py
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Sphinx has deprecated its older logging interface, but the replacement
+# only goes back to 1.6.  So here's a wrapper layer to keep around for
+# as long as we support 1.4.
+#
+import sphinx
+
+if sphinx.__version__[:3] >= '1.6':
+    UseLogging = True
+    from sphinx.util import logging
+    logger = logging.getLogger('kerneldoc')
+else:
+    UseLogging = False
+
+def warn(app, message):
+    if UseLogging:
+        logger.warning(message)
+    else:
+        app.warn(message)
+
+def verbose(app, message):
+    if UseLogging:
+        logger.verbose(message)
+    else:
+        app.verbose(message)
+
+
diff --git a/Documentation/sphinx/kfigure.py b/Documentation/sphinx/kfigure.py
index b97228d2cc0e..fbfe6693bb60 100644
--- a/Documentation/sphinx/kfigure.py
+++ b/Documentation/sphinx/kfigure.py
@@ -60,6 +60,8 @@ import sphinx
 from sphinx.util.nodes import clean_astext
 from six import iteritems
 
+import kernellog
+
 PY3 = sys.version_info[0] == 3
 
 if PY3:
@@ -171,20 +173,20 @@ def setupTools(app):
     This function is called once, when the builder is initiated.
     """
     global dot_cmd, convert_cmd   # pylint: disable=W0603
-    app.verbose("kfigure: check installed tools ...")
+    kernellog.verbose(app, "kfigure: check installed tools ...")
 
     dot_cmd = which('dot')
     convert_cmd = which('convert')
 
     if dot_cmd:
-        app.verbose("use dot(1) from: " + dot_cmd)
+        kernellog.verbose(app, "use dot(1) from: " + dot_cmd)
     else:
-        app.warn("dot(1) not found, for better output quality install "
-                 "graphviz from http://www.graphviz.org")
+        kernellog.warn(app, "dot(1) not found, for better output quality install "
+                       "graphviz from http://www.graphviz.org")
     if convert_cmd:
-        app.verbose("use convert(1) from: " + convert_cmd)
+        kernellog.verbose(app, "use convert(1) from: " + convert_cmd)
     else:
-        app.warn(
+        kernellog.warn(app,
             "convert(1) not found, for SVG to PDF conversion install "
             "ImageMagick (https://www.imagemagick.org)")
 
@@ -220,12 +222,13 @@ def convert_image(img_node, translator, src_fname=None):
 
     # in kernel builds, use 'make SPHINXOPTS=-v' to see verbose messages
 
-    app.verbose('assert best format for: ' + img_node['uri'])
+    kernellog.verbose(app, 'assert best format for: ' + img_node['uri'])
 
     if in_ext == '.dot':
 
         if not dot_cmd:
-            app.verbose("dot from graphviz not available / include DOT raw.")
+            kernellog.verbose(app,
+                              "dot from graphviz not available / include DOT raw.")
             img_node.replace_self(file2literal(src_fname))
 
         elif translator.builder.format == 'latex':
@@ -252,7 +255,8 @@ def convert_image(img_node, translator, src_fname=None):
 
         if translator.builder.format == 'latex':
             if convert_cmd is None:
-                app.verbose("no SVG to PDF conversion available / include SVG raw.")
+                kernellog.verbose(app,
+                                  "no SVG to PDF conversion available / include SVG raw.")
                 img_node.replace_self(file2literal(src_fname))
             else:
                 dst_fname = path.join(translator.builder.outdir, fname + '.pdf')
@@ -265,18 +269,19 @@ def convert_image(img_node, translator, src_fname=None):
         _name = dst_fname[len(translator.builder.outdir) + 1:]
 
         if isNewer(dst_fname, src_fname):
-            app.verbose("convert: {out}/%s already exists and is newer" % _name)
+            kernellog.verbose(app,
+                              "convert: {out}/%s already exists and is newer" % _name)
 
         else:
             ok = False
             mkdir(path.dirname(dst_fname))
 
             if in_ext == '.dot':
-                app.verbose('convert DOT to: {out}/' + _name)
+                kernellog.verbose(app, 'convert DOT to: {out}/' + _name)
                 ok = dot2format(app, src_fname, dst_fname)
 
             elif in_ext == '.svg':
-                app.verbose('convert SVG to: {out}/' + _name)
+                kernellog.verbose(app, 'convert SVG to: {out}/' + _name)
                 ok = svg2pdf(app, src_fname, dst_fname)
 
             if not ok:
@@ -305,7 +310,8 @@ def dot2format(app, dot_fname, out_fname):
     with open(out_fname, "w") as out:
         exit_code = subprocess.call(cmd, stdout = out)
         if exit_code != 0:
-            app.warn("Error #%d when calling: %s" % (exit_code, " ".join(cmd)))
+            kernellog.warn(app,
+                          "Error #%d when calling: %s" % (exit_code, " ".join(cmd)))
     return bool(exit_code == 0)
 
 def svg2pdf(app, svg_fname, pdf_fname):
@@ -322,7 +328,7 @@ def svg2pdf(app, svg_fname, pdf_fname):
     # use stdout and stderr from parent
     exit_code = subprocess.call(cmd)
     if exit_code != 0:
-        app.warn("Error #%d when calling: %s" % (exit_code, " ".join(cmd)))
+        kernellog.warn(app, "Error #%d when calling: %s" % (exit_code, " ".join(cmd)))
     return bool(exit_code == 0)
 
 
@@ -415,15 +421,15 @@ def visit_kernel_render(self, node):
     app = self.builder.app
     srclang = node.get('srclang')
 
-    app.verbose('visit kernel-render node lang: "%s"' % (srclang))
+    kernellog.verbose(app, 'visit kernel-render node lang: "%s"' % (srclang))
 
     tmp_ext = RENDER_MARKUP_EXT.get(srclang, None)
     if tmp_ext is None:
-        app.warn('kernel-render: "%s" unknown / include raw.' % (srclang))
+        kernellog.warn(app, 'kernel-render: "%s" unknown / include raw.' % (srclang))
         return
 
     if not dot_cmd and tmp_ext == '.dot':
-        app.verbose("dot from graphviz not available / include raw.")
+        kernellog.verbose(app, "dot from graphviz not available / include raw.")
         return
 
     literal_block = node[0]
diff --git a/Documentation/sphinx/load_config.py b/Documentation/sphinx/load_config.py
index 301a21aa4f63..eeb394b39e2c 100644
--- a/Documentation/sphinx/load_config.py
+++ b/Documentation/sphinx/load_config.py
@@ -21,6 +21,29 @@ def loadConfig(namespace):
         and os.path.normpath(namespace["__file__"]) != os.path.normpath(config_file) ):
         config_file = os.path.abspath(config_file)
 
+        # Let's avoid one conf.py file just due to latex_documents
+        start = config_file.find('Documentation/')
+        if start >= 0:
+            start = config_file.find('/', start + 1)
+
+        end = config_file.rfind('/')
+        if start >= 0 and end > 0:
+            dir = config_file[start + 1:end]
+
+            print("source directory: %s" % dir)
+            new_latex_docs = []
+            latex_documents = namespace['latex_documents']
+
+            for l in latex_documents:
+                if l[0].find(dir + '/') == 0:
+                    has = True
+                    fn = l[0][len(dir) + 1:]
+                    new_latex_docs.append((fn, l[1], l[2], l[3], l[4]))
+                    break
+
+            namespace['latex_documents'] = new_latex_docs
+
+        # If there is an extra conf.py file, load it
         if os.path.isfile(config_file):
             sys.stdout.write("load additional sphinx-config: %s\n" % config_file)
             config = namespace.copy()
@@ -29,4 +52,6 @@ def loadConfig(namespace):
             del config['__file__']
             namespace.update(config)
         else:
-            sys.stderr.write("WARNING: additional sphinx-config not found: %s\n" % config_file)
+            config = namespace.copy()
+            config['tags'].add("subproject")
+            namespace.update(config)
diff --git a/Documentation/sphinx/requirements.txt b/Documentation/sphinx/requirements.txt
index 742be3e12619..14e29a0ae480 100644
--- a/Documentation/sphinx/requirements.txt
+++ b/Documentation/sphinx/requirements.txt
@@ -1,3 +1,3 @@
-docutils==0.12
-Sphinx==1.4.9
+docutils
+Sphinx==1.7.9
 sphinx_rtd_theme
diff --git a/Documentation/switchtec.txt b/Documentation/switchtec.txt
deleted file mode 100644
index 30d6a64e53f7..000000000000
--- a/Documentation/switchtec.txt
+++ /dev/null
@@ -1,102 +0,0 @@
-========================
-Linux Switchtec Support
-========================
-
-Microsemi's "Switchtec" line of PCI switch devices is already
-supported by the kernel with standard PCI switch drivers. However, the
-Switchtec device advertises a special management endpoint which
-enables some additional functionality. This includes:
-
-* Packet and Byte Counters
-* Firmware Upgrades
-* Event and Error logs
-* Querying port link status
-* Custom user firmware commands
-
-The switchtec kernel module implements this functionality.
-
-
-Interface
-=========
-
-The primary means of communicating with the Switchtec management firmware is
-through the Memory-mapped Remote Procedure Call (MRPC) interface.
-Commands are submitted to the interface with a 4-byte command
-identifier and up to 1KB of command specific data. The firmware will
-respond with a 4-byte return code and up to 1KB of command-specific
-data. The interface only processes a single command at a time.
-
-
-Userspace Interface
-===================
-
-The MRPC interface will be exposed to userspace through a simple char
-device: /dev/switchtec#, one for each management endpoint in the system.
-
-The char device has the following semantics:
-
-* A write must consist of at least 4 bytes and no more than 1028 bytes.
-  The first 4 bytes will be interpreted as the Command ID and the
-  remainder will be used as the input data. A write will send the
-  command to the firmware to begin processing.
-
-* Each write must be followed by exactly one read. Any double write will
-  produce an error and any read that doesn't follow a write will
-  produce an error.
-
-* A read will block until the firmware completes the command and return
-  the 4-byte Command Return Value plus up to 1024 bytes of output
-  data. (The length will be specified by the size parameter of the read
-  call -- reading less than 4 bytes will produce an error.)
-
-* The poll call will also be supported for userspace applications that
-  need to do other things while waiting for the command to complete.
-
-The following IOCTLs are also supported by the device:
-
-* SWITCHTEC_IOCTL_FLASH_INFO - Retrieve firmware length and number
-  of partitions in the device.
-
-* SWITCHTEC_IOCTL_FLASH_PART_INFO - Retrieve address and lengeth for
-  any specified partition in flash.
-
-* SWITCHTEC_IOCTL_EVENT_SUMMARY - Read a structure of bitmaps
-  indicating all uncleared events.
-
-* SWITCHTEC_IOCTL_EVENT_CTL - Get the current count, clear and set flags
-  for any event. This ioctl takes in a switchtec_ioctl_event_ctl struct
-  with the event_id, index and flags set (index being the partition or PFF
-  number for non-global events). It returns whether the event has
-  occurred, the number of times and any event specific data. The flags
-  can be used to clear the count or enable and disable actions to
-  happen when the event occurs.
-  By using the SWITCHTEC_IOCTL_EVENT_FLAG_EN_POLL flag,
-  you can set an event to trigger a poll command to return with
-  POLLPRI. In this way, userspace can wait for events to occur.
-
-* SWITCHTEC_IOCTL_PFF_TO_PORT and SWITCHTEC_IOCTL_PORT_TO_PFF convert
-  between PCI Function Framework number (used by the event system)
-  and Switchtec Logic Port ID and Partition number (which is more
-  user friendly).
-
-
-Non-Transparent Bridge (NTB) Driver
-===================================
-
-An NTB hardware driver is provided for the Switchtec hardware in
-ntb_hw_switchtec. Currently, it only supports switches configured with
-exactly 2 NT partitions and zero or more non-NT partitions. It also requires
-the following configuration settings:
-
-* Both NT partitions must be able to access each other's GAS spaces.
-  Thus, the bits in the GAS Access Vector under Management Settings
-  must be set to support this.
-* Kernel configuration MUST include support for NTB (CONFIG_NTB needs
-  to be set)
-
-NT EP BAR 2 will be dynamically configured as a Direct Window, and
-the configuration file does not need to configure it explicitly.
-
-Please refer to Documentation/ntb.txt in Linux source tree for an overall
-understanding of the Linux NTB stack. ntb_hw_switchtec works as an NTB
-Hardware Driver in this stack.
diff --git a/Documentation/sysctl/README b/Documentation/sysctl/README
deleted file mode 100644
index d5f24ab0ecc3..000000000000
--- a/Documentation/sysctl/README
+++ /dev/null
@@ -1,76 +0,0 @@
-Documentation for /proc/sys/		kernel version 2.2.10
-	(c) 1998, 1999,  Rik van Riel <riel@nl.linux.org>
-
-'Why', I hear you ask, 'would anyone even _want_ documentation
-for them sysctl files? If anybody really needs it, it's all in
-the source...'
-
-Well, this documentation is written because some people either
-don't know they need to tweak something, or because they don't
-have the time or knowledge to read the source code.
-
-Furthermore, the programmers who built sysctl have built it to
-be actually used, not just for the fun of programming it :-)
-
-==============================================================
-
-Legal blurb:
-
-As usual, there are two main things to consider:
-1. you get what you pay for
-2. it's free
-
-The consequences are that I won't guarantee the correctness of
-this document, and if you come to me complaining about how you
-screwed up your system because of wrong documentation, I won't
-feel sorry for you. I might even laugh at you...
-
-But of course, if you _do_ manage to screw up your system using
-only the sysctl options used in this file, I'd like to hear of
-it. Not only to have a great laugh, but also to make sure that
-you're the last RTFMing person to screw up.
-
-In short, e-mail your suggestions, corrections and / or horror
-stories to: <riel@nl.linux.org>
-
-Rik van Riel.
-
-==============================================================
-
-Introduction:
-
-Sysctl is a means of configuring certain aspects of the kernel
-at run-time, and the /proc/sys/ directory is there so that you
-don't even need special tools to do it!
-In fact, there are only four things needed to use these config
-facilities:
-- a running Linux system
-- root access
-- common sense (this is especially hard to come by these days)
-- knowledge of what all those values mean
-
-As a quick 'ls /proc/sys' will show, the directory consists of
-several (arch-dependent?) subdirs. Each subdir is mainly about
-one part of the kernel, so you can do configuration on a piece
-by piece basis, or just some 'thematic frobbing'.
-
-The subdirs are about:
-abi/		execution domains & personalities
-debug/		<empty>
-dev/		device specific information (eg dev/cdrom/info)
-fs/		specific filesystems
-		filehandle, inode, dentry and quota tuning
-		binfmt_misc <Documentation/admin-guide/binfmt-misc.rst>
-kernel/		global kernel info / tuning
-		miscellaneous stuff
-net/		networking stuff, for documentation look in:
-		<Documentation/networking/>
-proc/		<empty>
-sunrpc/		SUN Remote Procedure Call (NFS)
-vm/		memory management tuning
-		buffer and cache management
-user/		Per user per user namespace limits
-
-These are the subdirs I have on my system. There might be more
-or other subdirs in another setup. If you see another dir, I'd
-really like to hear about it :-)
diff --git a/Documentation/sysctl/abi.txt b/Documentation/sysctl/abi.txt
deleted file mode 100644
index 63f4ebcf652c..000000000000
--- a/Documentation/sysctl/abi.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-Documentation for /proc/sys/abi/* kernel version 2.6.0.test2
-	(c) 2003,  Fabian Frederick <ffrederick@users.sourceforge.net>
-
-For general info : README.
-
-==============================================================
-
-This path is binary emulation relevant aka personality types aka abi.
-When a process is executed, it's linked to an exec_domain whose
-personality is defined using values available from /proc/sys/abi.
-You can find further details about abi in include/linux/personality.h.
-
-Here are the files featuring in 2.6 kernel :
-
-- defhandler_coff
-- defhandler_elf
-- defhandler_lcall7
-- defhandler_libcso
-- fake_utsname
-- trace
-
-===========================================================
-defhandler_coff:
-defined value :
-PER_SCOSVR3
-0x0003 | STICKY_TIMEOUTS | WHOLE_SECONDS | SHORT_INODE
-
-===========================================================
-defhandler_elf:
-defined value :
-PER_LINUX
-0
-
-===========================================================
-defhandler_lcall7:
-defined value :
-PER_SVR4
-0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO,
-
-===========================================================
-defhandler_libsco:
-defined value:
-PER_SVR4
-0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO,
-
-===========================================================
-fake_utsname:
-Unused
-
-===========================================================
-trace:
-Unused
-
-===========================================================
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
deleted file mode 100644
index ebc679bcb2dc..000000000000
--- a/Documentation/sysctl/fs.txt
+++ /dev/null
@@ -1,374 +0,0 @@
-Documentation for /proc/sys/fs/*	kernel version 2.2.10
-	(c) 1998, 1999,  Rik van Riel <riel@nl.linux.org>
-	(c) 2009,        Shen Feng<shen@cn.fujitsu.com>
-
-For general info and legal blurb, please look in README.
-
-==============================================================
-
-This file contains documentation for the sysctl files in
-/proc/sys/fs/ and is valid for Linux kernel version 2.2.
-
-The files in this directory can be used to tune and monitor
-miscellaneous and general things in the operation of the Linux
-kernel. Since some of the files _can_ be used to screw up your
-system, it is advisable to read both documentation and source
-before actually making adjustments.
-
-1. /proc/sys/fs
-----------------------------------------------------------
-
-Currently, these files are in /proc/sys/fs:
-- aio-max-nr
-- aio-nr
-- dentry-state
-- dquot-max
-- dquot-nr
-- file-max
-- file-nr
-- inode-max
-- inode-nr
-- inode-state
-- nr_open
-- overflowuid
-- overflowgid
-- pipe-user-pages-hard
-- pipe-user-pages-soft
-- protected_fifos
-- protected_hardlinks
-- protected_regular
-- protected_symlinks
-- suid_dumpable
-- super-max
-- super-nr
-
-==============================================================
-
-aio-nr & aio-max-nr:
-
-aio-nr is the running total of the number of events specified on the
-io_setup system call for all currently active aio contexts.  If aio-nr
-reaches aio-max-nr then io_setup will fail with EAGAIN.  Note that
-raising aio-max-nr does not result in the pre-allocation or re-sizing
-of any kernel data structures.
-
-==============================================================
-
-dentry-state:
-
-From linux/include/linux/dcache.h:
---------------------------------------------------------------
-struct dentry_stat_t dentry_stat {
-        int nr_dentry;
-        int nr_unused;
-        int age_limit;         /* age in seconds */
-        int want_pages;        /* pages requested by system */
-        int nr_negative;       /* # of unused negative dentries */
-        int dummy;             /* Reserved for future use */
-};
---------------------------------------------------------------
-
-Dentries are dynamically allocated and deallocated.
-
-nr_dentry shows the total number of dentries allocated (active
-+ unused). nr_unused shows the number of dentries that are not
-actively used, but are saved in the LRU list for future reuse.
-
-Age_limit is the age in seconds after which dcache entries
-can be reclaimed when memory is short and want_pages is
-nonzero when shrink_dcache_pages() has been called and the
-dcache isn't pruned yet.
-
-nr_negative shows the number of unused dentries that are also
-negative dentries which do not map to any files. Instead,
-they help speeding up rejection of non-existing files provided
-by the users.
-
-==============================================================
-
-dquot-max & dquot-nr:
-
-The file dquot-max shows the maximum number of cached disk
-quota entries.
-
-The file dquot-nr shows the number of allocated disk quota
-entries and the number of free disk quota entries.
-
-If the number of free cached disk quotas is very low and
-you have some awesome number of simultaneous system users,
-you might want to raise the limit.
-
-==============================================================
-
-file-max & file-nr:
-
-The value in file-max denotes the maximum number of file-
-handles that the Linux kernel will allocate. When you get lots
-of error messages about running out of file handles, you might
-want to increase this limit.
-
-Historically,the kernel was able to allocate file handles
-dynamically, but not to free them again. The three values in
-file-nr denote the number of allocated file handles, the number
-of allocated but unused file handles, and the maximum number of
-file handles. Linux 2.6 always reports 0 as the number of free
-file handles -- this is not an error, it just means that the
-number of allocated file handles exactly matches the number of
-used file handles.
-
-Attempts to allocate more file descriptors than file-max are
-reported with printk, look for "VFS: file-max limit <number>
-reached".
-==============================================================
-
-nr_open:
-
-This denotes the maximum number of file-handles a process can
-allocate. Default value is 1024*1024 (1048576) which should be
-enough for most machines. Actual limit depends on RLIMIT_NOFILE
-resource limit.
-
-==============================================================
-
-inode-max, inode-nr & inode-state:
-
-As with file handles, the kernel allocates the inode structures
-dynamically, but can't free them yet.
-
-The value in inode-max denotes the maximum number of inode
-handlers. This value should be 3-4 times larger than the value
-in file-max, since stdin, stdout and network sockets also
-need an inode struct to handle them. When you regularly run
-out of inodes, you need to increase this value.
-
-The file inode-nr contains the first two items from
-inode-state, so we'll skip to that file...
-
-Inode-state contains three actual numbers and four dummies.
-The actual numbers are, in order of appearance, nr_inodes,
-nr_free_inodes and preshrink.
-
-Nr_inodes stands for the number of inodes the system has
-allocated, this can be slightly more than inode-max because
-Linux allocates them one pageful at a time.
-
-Nr_free_inodes represents the number of free inodes (?) and
-preshrink is nonzero when the nr_inodes > inode-max and the
-system needs to prune the inode list instead of allocating
-more.
-
-==============================================================
-
-overflowgid & overflowuid:
-
-Some filesystems only support 16-bit UIDs and GIDs, although in Linux
-UIDs and GIDs are 32 bits. When one of these filesystems is mounted
-with writes enabled, any UID or GID that would exceed 65535 is translated
-to a fixed value before being written to disk.
-
-These sysctls allow you to change the value of the fixed UID and GID.
-The default is 65534.
-
-==============================================================
-
-pipe-user-pages-hard:
-
-Maximum total number of pages a non-privileged user may allocate for pipes.
-Once this limit is reached, no new pipes may be allocated until usage goes
-below the limit again. When set to 0, no limit is applied, which is the default
-setting.
-
-==============================================================
-
-pipe-user-pages-soft:
-
-Maximum total number of pages a non-privileged user may allocate for pipes
-before the pipe size gets limited to a single page. Once this limit is reached,
-new pipes will be limited to a single page in size for this user in order to
-limit total memory usage, and trying to increase them using fcntl() will be
-denied until usage goes below the limit again. The default value allows to
-allocate up to 1024 pipes at their default size. When set to 0, no limit is
-applied.
-
-==============================================================
-
-protected_fifos:
-
-The intent of this protection is to avoid unintentional writes to
-an attacker-controlled FIFO, where a program expected to create a regular
-file.
-
-When set to "0", writing to FIFOs is unrestricted.
-
-When set to "1" don't allow O_CREAT open on FIFOs that we don't own
-in world writable sticky directories, unless they are owned by the
-owner of the directory.
-
-When set to "2" it also applies to group writable sticky directories.
-
-This protection is based on the restrictions in Openwall.
-
-==============================================================
-
-protected_hardlinks:
-
-A long-standing class of security issues is the hardlink-based
-time-of-check-time-of-use race, most commonly seen in world-writable
-directories like /tmp. The common method of exploitation of this flaw
-is to cross privilege boundaries when following a given hardlink (i.e. a
-root process follows a hardlink created by another user). Additionally,
-on systems without separated partitions, this stops unauthorized users
-from "pinning" vulnerable setuid/setgid files against being upgraded by
-the administrator, or linking to special files.
-
-When set to "0", hardlink creation behavior is unrestricted.
-
-When set to "1" hardlinks cannot be created by users if they do not
-already own the source file, or do not have read/write access to it.
-
-This protection is based on the restrictions in Openwall and grsecurity.
-
-==============================================================
-
-protected_regular:
-
-This protection is similar to protected_fifos, but it
-avoids writes to an attacker-controlled regular file, where a program
-expected to create one.
-
-When set to "0", writing to regular files is unrestricted.
-
-When set to "1" don't allow O_CREAT open on regular files that we
-don't own in world writable sticky directories, unless they are
-owned by the owner of the directory.
-
-When set to "2" it also applies to group writable sticky directories.
-
-==============================================================
-
-protected_symlinks:
-
-A long-standing class of security issues is the symlink-based
-time-of-check-time-of-use race, most commonly seen in world-writable
-directories like /tmp. The common method of exploitation of this flaw
-is to cross privilege boundaries when following a given symlink (i.e. a
-root process follows a symlink belonging to another user). For a likely
-incomplete list of hundreds of examples across the years, please see:
-http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp
-
-When set to "0", symlink following behavior is unrestricted.
-
-When set to "1" symlinks are permitted to be followed only when outside
-a sticky world-writable directory, or when the uid of the symlink and
-follower match, or when the directory owner matches the symlink's owner.
-
-This protection is based on the restrictions in Openwall and grsecurity.
-
-==============================================================
-
-suid_dumpable:
-
-This value can be used to query and set the core dump mode for setuid
-or otherwise protected/tainted binaries. The modes are
-
-0 - (default) - traditional behaviour. Any process which has changed
-	privilege levels or is execute only will not be dumped.
-1 - (debug) - all processes dump core when possible. The core dump is
-	owned by the current user and no security is applied. This is
-	intended for system debugging situations only. Ptrace is unchecked.
-	This is insecure as it allows regular users to examine the memory
-	contents of privileged processes.
-2 - (suidsafe) - any binary which normally would not be dumped is dumped
-	anyway, but only if the "core_pattern" kernel sysctl is set to
-	either a pipe handler or a fully qualified path. (For more details
-	on this limitation, see CVE-2006-2451.) This mode is appropriate
-	when administrators are attempting to debug problems in a normal
-	environment, and either have a core dump pipe handler that knows
-	to treat privileged core dumps with care, or specific directory
-	defined for catching core dumps. If a core dump happens without
-	a pipe handler or fully qualifid path, a message will be emitted
-	to syslog warning about the lack of a correct setting.
-
-==============================================================
-
-super-max & super-nr:
-
-These numbers control the maximum number of superblocks, and
-thus the maximum number of mounted filesystems the kernel
-can have. You only need to increase super-max if you need to
-mount more filesystems than the current value in super-max
-allows you to.
-
-==============================================================
-
-aio-nr & aio-max-nr:
-
-aio-nr shows the current system-wide number of asynchronous io
-requests.  aio-max-nr allows you to change the maximum value
-aio-nr can grow to.
-
-==============================================================
-
-mount-max:
-
-This denotes the maximum number of mounts that may exist
-in a mount namespace.
-
-==============================================================
-
-
-2. /proc/sys/fs/binfmt_misc
-----------------------------------------------------------
-
-Documentation for the files in /proc/sys/fs/binfmt_misc is
-in Documentation/admin-guide/binfmt-misc.rst.
-
-
-3. /proc/sys/fs/mqueue - POSIX message queues filesystem
-----------------------------------------------------------
-
-The "mqueue"  filesystem provides  the necessary kernel features to enable the
-creation of a  user space  library that  implements  the  POSIX message queues
-API (as noted by the  MSG tag in the  POSIX 1003.1-2001 version  of the System
-Interfaces specification.)
-
-The "mqueue" filesystem contains values for determining/setting  the amount of
-resources used by the file system.
-
-/proc/sys/fs/mqueue/queues_max is a read/write  file for  setting/getting  the
-maximum number of message queues allowed on the system.
-
-/proc/sys/fs/mqueue/msg_max  is  a  read/write file  for  setting/getting  the
-maximum number of messages in a queue value.  In fact it is the limiting value
-for another (user) limit which is set in mq_open invocation. This attribute of
-a queue must be less or equal then msg_max.
-
-/proc/sys/fs/mqueue/msgsize_max is  a read/write  file for setting/getting the
-maximum  message size value (it is every  message queue's attribute set during
-its creation).
-
-/proc/sys/fs/mqueue/msg_default is  a read/write  file for setting/getting the
-default number of messages in a queue value if attr parameter of mq_open(2) is
-NULL. If it exceed msg_max, the default value is initialized msg_max.
-
-/proc/sys/fs/mqueue/msgsize_default is a read/write file for setting/getting
-the default message size value if attr parameter of mq_open(2) is NULL. If it
-exceed msgsize_max, the default value is initialized msgsize_max.
-
-4. /proc/sys/fs/epoll - Configuration options for the epoll interface
---------------------------------------------------------
-
-This directory contains configuration options for the epoll(7) interface.
-
-max_user_watches
-----------------
-
-Every epoll file descriptor can store a number of files to be monitored
-for event readiness. Each one of these monitored files constitutes a "watch".
-This configuration option sets the maximum number of "watches" that are
-allowed for each user.
-Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
-on a 64bit one.
-The current default value for  max_user_watches  is the 1/32 of the available
-low memory, divided for the "watch" cost in bytes.
-
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
deleted file mode 100644
index f0c86fbb3b48..000000000000
--- a/Documentation/sysctl/kernel.txt
+++ /dev/null
@@ -1,1145 +0,0 @@
-Documentation for /proc/sys/kernel/*	kernel version 2.2.10
-	(c) 1998, 1999,  Rik van Riel <riel@nl.linux.org>
-	(c) 2009,        Shen Feng<shen@cn.fujitsu.com>
-
-For general info and legal blurb, please look in README.
-
-==============================================================
-
-This file contains documentation for the sysctl files in
-/proc/sys/kernel/ and is valid for Linux kernel version 2.2.
-
-The files in this directory can be used to tune and monitor
-miscellaneous and general things in the operation of the Linux
-kernel. Since some of the files _can_ be used to screw up your
-system, it is advisable to read both documentation and source
-before actually making adjustments.
-
-Currently, these files might (depending on your configuration)
-show up in /proc/sys/kernel:
-
-- acct
-- acpi_video_flags
-- auto_msgmni
-- bootloader_type	     [ X86 only ]
-- bootloader_version	     [ X86 only ]
-- callhome		     [ S390 only ]
-- cap_last_cap
-- core_pattern
-- core_pipe_limit
-- core_uses_pid
-- ctrl-alt-del
-- dmesg_restrict
-- domainname
-- hostname
-- hotplug
-- hardlockup_all_cpu_backtrace
-- hardlockup_panic
-- hung_task_panic
-- hung_task_check_count
-- hung_task_timeout_secs
-- hung_task_check_interval_secs
-- hung_task_warnings
-- hyperv_record_panic_msg
-- kexec_load_disabled
-- kptr_restrict
-- l2cr                        [ PPC only ]
-- modprobe                    ==> Documentation/debugging-modules.txt
-- modules_disabled
-- msg_next_id		      [ sysv ipc ]
-- msgmax
-- msgmnb
-- msgmni
-- nmi_watchdog
-- osrelease
-- ostype
-- overflowgid
-- overflowuid
-- panic
-- panic_on_oops
-- panic_on_stackoverflow
-- panic_on_unrecovered_nmi
-- panic_on_warn
-- panic_print
-- panic_on_rcu_stall
-- perf_cpu_time_max_percent
-- perf_event_paranoid
-- perf_event_max_stack
-- perf_event_mlock_kb
-- perf_event_max_contexts_per_stack
-- pid_max
-- powersave-nap               [ PPC only ]
-- printk
-- printk_delay
-- printk_ratelimit
-- printk_ratelimit_burst
-- pty                         ==> Documentation/filesystems/devpts.txt
-- randomize_va_space
-- real-root-dev               ==> Documentation/admin-guide/initrd.rst
-- reboot-cmd                  [ SPARC only ]
-- rtsig-max
-- rtsig-nr
-- sched_energy_aware
-- seccomp/                    ==> Documentation/userspace-api/seccomp_filter.rst
-- sem
-- sem_next_id		      [ sysv ipc ]
-- sg-big-buff                 [ generic SCSI device (sg) ]
-- shm_next_id		      [ sysv ipc ]
-- shm_rmid_forced
-- shmall
-- shmmax                      [ sysv ipc ]
-- shmmni
-- softlockup_all_cpu_backtrace
-- soft_watchdog
-- stack_erasing
-- stop-a                      [ SPARC only ]
-- sysrq                       ==> Documentation/admin-guide/sysrq.rst
-- sysctl_writes_strict
-- tainted                     ==> Documentation/admin-guide/tainted-kernels.rst
-- threads-max
-- unknown_nmi_panic
-- watchdog
-- watchdog_thresh
-- version
-
-==============================================================
-
-acct:
-
-highwater lowwater frequency
-
-If BSD-style process accounting is enabled these values control
-its behaviour. If free space on filesystem where the log lives
-goes below <lowwater>% accounting suspends. If free space gets
-above <highwater>% accounting resumes. <Frequency> determines
-how often do we check the amount of free space (value is in
-seconds). Default:
-4 2 30
-That is, suspend accounting if there left <= 2% free; resume it
-if we got >=4%; consider information about amount of free space
-valid for 30 seconds.
-
-==============================================================
-
-acpi_video_flags:
-
-flags
-
-See Doc*/kernel/power/video.txt, it allows mode of video boot to be
-set during run time.
-
-==============================================================
-
-auto_msgmni:
-
-This variable has no effect and may be removed in future kernel
-releases. Reading it always returns 0.
-Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni
-upon memory add/remove or upon ipc namespace creation/removal.
-Echoing "1" into this file enabled msgmni automatic recomputing.
-Echoing "0" turned it off. auto_msgmni default value was 1.
-
-
-==============================================================
-
-bootloader_type:
-
-x86 bootloader identification
-
-This gives the bootloader type number as indicated by the bootloader,
-shifted left by 4, and OR'd with the low four bits of the bootloader
-version.  The reason for this encoding is that this used to match the
-type_of_loader field in the kernel header; the encoding is kept for
-backwards compatibility.  That is, if the full bootloader type number
-is 0x15 and the full version number is 0x234, this file will contain
-the value 340 = 0x154.
-
-See the type_of_loader and ext_loader_type fields in
-Documentation/x86/boot.txt for additional information.
-
-==============================================================
-
-bootloader_version:
-
-x86 bootloader version
-
-The complete bootloader version number.  In the example above, this
-file will contain the value 564 = 0x234.
-
-See the type_of_loader and ext_loader_ver fields in
-Documentation/x86/boot.txt for additional information.
-
-==============================================================
-
-callhome:
-
-Controls the kernel's callhome behavior in case of a kernel panic.
-
-The s390 hardware allows an operating system to send a notification
-to a service organization (callhome) in case of an operating system panic.
-
-When the value in this file is 0 (which is the default behavior)
-nothing happens in case of a kernel panic. If this value is set to "1"
-the complete kernel oops message is send to the IBM customer service
-organization in case the mainframe the Linux operating system is running
-on has a service contract with IBM.
-
-==============================================================
-
-cap_last_cap
-
-Highest valid capability of the running kernel.  Exports
-CAP_LAST_CAP from the kernel.
-
-==============================================================
-
-core_pattern:
-
-core_pattern is used to specify a core dumpfile pattern name.
-. max length 127 characters; default value is "core"
-. core_pattern is used as a pattern template for the output filename;
-  certain string patterns (beginning with '%') are substituted with
-  their actual values.
-. backward compatibility with core_uses_pid:
-	If core_pattern does not include "%p" (default does not)
-	and core_uses_pid is set, then .PID will be appended to
-	the filename.
-. corename format specifiers:
-	%<NUL>	'%' is dropped
-	%%	output one '%'
-	%p	pid
-	%P	global pid (init PID namespace)
-	%i	tid
-	%I	global tid (init PID namespace)
-	%u	uid (in initial user namespace)
-	%g	gid (in initial user namespace)
-	%d	dump mode, matches PR_SET_DUMPABLE and
-		/proc/sys/fs/suid_dumpable
-	%s	signal number
-	%t	UNIX time of dump
-	%h	hostname
-	%e	executable filename (may be shortened)
-	%E	executable path
-	%<OTHER> both are dropped
-. If the first character of the pattern is a '|', the kernel will treat
-  the rest of the pattern as a command to run.  The core dump will be
-  written to the standard input of that program instead of to a file.
-
-==============================================================
-
-core_pipe_limit:
-
-This sysctl is only applicable when core_pattern is configured to pipe
-core files to a user space helper (when the first character of
-core_pattern is a '|', see above).  When collecting cores via a pipe
-to an application, it is occasionally useful for the collecting
-application to gather data about the crashing process from its
-/proc/pid directory.  In order to do this safely, the kernel must wait
-for the collecting process to exit, so as not to remove the crashing
-processes proc files prematurely.  This in turn creates the
-possibility that a misbehaving userspace collecting process can block
-the reaping of a crashed process simply by never exiting.  This sysctl
-defends against that.  It defines how many concurrent crashing
-processes may be piped to user space applications in parallel.  If
-this value is exceeded, then those crashing processes above that value
-are noted via the kernel log and their cores are skipped.  0 is a
-special value, indicating that unlimited processes may be captured in
-parallel, but that no waiting will take place (i.e. the collecting
-process is not guaranteed access to /proc/<crashing pid>/).  This
-value defaults to 0.
-
-==============================================================
-
-core_uses_pid:
-
-The default coredump filename is "core".  By setting
-core_uses_pid to 1, the coredump filename becomes core.PID.
-If core_pattern does not include "%p" (default does not)
-and core_uses_pid is set, then .PID will be appended to
-the filename.
-
-==============================================================
-
-ctrl-alt-del:
-
-When the value in this file is 0, ctrl-alt-del is trapped and
-sent to the init(1) program to handle a graceful restart.
-When, however, the value is > 0, Linux's reaction to a Vulcan
-Nerve Pinch (tm) will be an immediate reboot, without even
-syncing its dirty buffers.
-
-Note: when a program (like dosemu) has the keyboard in 'raw'
-mode, the ctrl-alt-del is intercepted by the program before it
-ever reaches the kernel tty layer, and it's up to the program
-to decide what to do with it.
-
-==============================================================
-
-dmesg_restrict:
-
-This toggle indicates whether unprivileged users are prevented
-from using dmesg(8) to view messages from the kernel's log buffer.
-When dmesg_restrict is set to (0) there are no restrictions. When
-dmesg_restrict is set set to (1), users must have CAP_SYSLOG to use
-dmesg(8).
-
-The kernel config option CONFIG_SECURITY_DMESG_RESTRICT sets the
-default value of dmesg_restrict.
-
-==============================================================
-
-domainname & hostname:
-
-These files can be used to set the NIS/YP domainname and the
-hostname of your box in exactly the same way as the commands
-domainname and hostname, i.e.:
-# echo "darkstar" > /proc/sys/kernel/hostname
-# echo "mydomain" > /proc/sys/kernel/domainname
-has the same effect as
-# hostname "darkstar"
-# domainname "mydomain"
-
-Note, however, that the classic darkstar.frop.org has the
-hostname "darkstar" and DNS (Internet Domain Name Server)
-domainname "frop.org", not to be confused with the NIS (Network
-Information Service) or YP (Yellow Pages) domainname. These two
-domain names are in general different. For a detailed discussion
-see the hostname(1) man page.
-
-==============================================================
-hardlockup_all_cpu_backtrace:
-
-This value controls the hard lockup detector behavior when a hard
-lockup condition is detected as to whether or not to gather further
-debug information. If enabled, arch-specific all-CPU stack dumping
-will be initiated.
-
-0: do nothing. This is the default behavior.
-
-1: on detection capture more debug information.
-==============================================================
-
-hardlockup_panic:
-
-This parameter can be used to control whether the kernel panics
-when a hard lockup is detected.
-
-   0 - don't panic on hard lockup
-   1 - panic on hard lockup
-
-See Documentation/lockup-watchdogs.txt for more information.  This can
-also be set using the nmi_watchdog kernel parameter.
-
-==============================================================
-
-hotplug:
-
-Path for the hotplug policy agent.
-Default value is "/sbin/hotplug".
-
-==============================================================
-
-hung_task_panic:
-
-Controls the kernel's behavior when a hung task is detected.
-This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
-
-0: continue operation. This is the default behavior.
-
-1: panic immediately.
-
-==============================================================
-
-hung_task_check_count:
-
-The upper bound on the number of tasks that are checked.
-This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
-
-==============================================================
-
-hung_task_timeout_secs:
-
-When a task in D state did not get scheduled
-for more than this value report a warning.
-This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
-
-0: means infinite timeout - no checking done.
-Possible values to set are in range {0..LONG_MAX/HZ}.
-
-==============================================================
-
-hung_task_check_interval_secs:
-
-Hung task check interval. If hung task checking is enabled
-(see hung_task_timeout_secs), the check is done every
-hung_task_check_interval_secs seconds.
-This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
-
-0 (default): means use hung_task_timeout_secs as checking interval.
-Possible values to set are in range {0..LONG_MAX/HZ}.
-
-==============================================================
-
-hung_task_warnings:
-
-The maximum number of warnings to report. During a check interval
-if a hung task is detected, this value is decreased by 1.
-When this value reaches 0, no more warnings will be reported.
-This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
-
--1: report an infinite number of warnings.
-
-==============================================================
-
-hyperv_record_panic_msg:
-
-Controls whether the panic kmsg data should be reported to Hyper-V.
-
-0: do not report panic kmsg data.
-
-1: report the panic kmsg data. This is the default behavior.
-
-==============================================================
-
-kexec_load_disabled:
-
-A toggle indicating if the kexec_load syscall has been disabled. This
-value defaults to 0 (false: kexec_load enabled), but can be set to 1
-(true: kexec_load disabled). Once true, kexec can no longer be used, and
-the toggle cannot be set back to false. This allows a kexec image to be
-loaded before disabling the syscall, allowing a system to set up (and
-later use) an image without it being altered. Generally used together
-with the "modules_disabled" sysctl.
-
-==============================================================
-
-kptr_restrict:
-
-This toggle indicates whether restrictions are placed on
-exposing kernel addresses via /proc and other interfaces.
-
-When kptr_restrict is set to 0 (the default) the address is hashed before
-printing. (This is the equivalent to %p.)
-
-When kptr_restrict is set to (1), kernel pointers printed using the %pK
-format specifier will be replaced with 0's unless the user has CAP_SYSLOG
-and effective user and group ids are equal to the real ids. This is
-because %pK checks are done at read() time rather than open() time, so
-if permissions are elevated between the open() and the read() (e.g via
-a setuid binary) then %pK will not leak kernel pointers to unprivileged
-users. Note, this is a temporary solution only. The correct long-term
-solution is to do the permission checks at open() time. Consider removing
-world read permissions from files that use %pK, and using dmesg_restrict
-to protect against uses of %pK in dmesg(8) if leaking kernel pointer
-values to unprivileged users is a concern.
-
-When kptr_restrict is set to (2), kernel pointers printed using
-%pK will be replaced with 0's regardless of privileges.
-
-==============================================================
-
-l2cr: (PPC only)
-
-This flag controls the L2 cache of G3 processor boards. If
-0, the cache is disabled. Enabled if nonzero.
-
-==============================================================
-
-modules_disabled:
-
-A toggle value indicating if modules are allowed to be loaded
-in an otherwise modular kernel.  This toggle defaults to off
-(0), but can be set true (1).  Once true, modules can be
-neither loaded nor unloaded, and the toggle cannot be set back
-to false.  Generally used with the "kexec_load_disabled" toggle.
-
-==============================================================
-
-msg_next_id, sem_next_id, and shm_next_id:
-
-These three toggles allows to specify desired id for next allocated IPC
-object: message, semaphore or shared memory respectively.
-
-By default they are equal to -1, which means generic allocation logic.
-Possible values to set are in range {0..INT_MAX}.
-
-Notes:
-1) kernel doesn't guarantee, that new object will have desired id. So,
-it's up to userspace, how to handle an object with "wrong" id.
-2) Toggle with non-default value will be set back to -1 by kernel after
-successful IPC object allocation. If an IPC object allocation syscall
-fails, it is undefined if the value remains unmodified or is reset to -1.
-
-==============================================================
-
-nmi_watchdog:
-
-This parameter can be used to control the NMI watchdog
-(i.e. the hard lockup detector) on x86 systems.
-
-   0 - disable the hard lockup detector
-   1 - enable the hard lockup detector
-
-The hard lockup detector monitors each CPU for its ability to respond to
-timer interrupts. The mechanism utilizes CPU performance counter registers
-that are programmed to generate Non-Maskable Interrupts (NMIs) periodically
-while a CPU is busy. Hence, the alternative name 'NMI watchdog'.
-
-The NMI watchdog is disabled by default if the kernel is running as a guest
-in a KVM virtual machine. This default can be overridden by adding
-
-   nmi_watchdog=1
-
-to the guest kernel command line (see Documentation/admin-guide/kernel-parameters.rst).
-
-==============================================================
-
-numa_balancing
-
-Enables/disables automatic page fault based NUMA memory
-balancing. Memory is moved automatically to nodes
-that access it often.
-
-Enables/disables automatic NUMA memory balancing. On NUMA machines, there
-is a performance penalty if remote memory is accessed by a CPU. When this
-feature is enabled the kernel samples what task thread is accessing memory
-by periodically unmapping pages and later trapping a page fault. At the
-time of the page fault, it is determined if the data being accessed should
-be migrated to a local memory node.
-
-The unmapping of pages and trapping faults incur additional overhead that
-ideally is offset by improved memory locality but there is no universal
-guarantee. If the target workload is already bound to NUMA nodes then this
-feature should be disabled. Otherwise, if the system overhead from the
-feature is too high then the rate the kernel samples for NUMA hinting
-faults may be controlled by the numa_balancing_scan_period_min_ms,
-numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
-
-==============================================================
-
-numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms,
-numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb
-
-Automatic NUMA balancing scans tasks address space and unmaps pages to
-detect if pages are properly placed or if the data should be migrated to a
-memory node local to where the task is running.  Every "scan delay" the task
-scans the next "scan size" number of pages in its address space. When the
-end of the address space is reached the scanner restarts from the beginning.
-
-In combination, the "scan delay" and "scan size" determine the scan rate.
-When "scan delay" decreases, the scan rate increases.  The scan delay and
-hence the scan rate of every task is adaptive and depends on historical
-behaviour. If pages are properly placed then the scan delay increases,
-otherwise the scan delay decreases.  The "scan size" is not adaptive but
-the higher the "scan size", the higher the scan rate.
-
-Higher scan rates incur higher system overhead as page faults must be
-trapped and potentially data must be migrated. However, the higher the scan
-rate, the more quickly a tasks memory is migrated to a local node if the
-workload pattern changes and minimises performance impact due to remote
-memory accesses. These sysctls control the thresholds for scan delays and
-the number of pages scanned.
-
-numa_balancing_scan_period_min_ms is the minimum time in milliseconds to
-scan a tasks virtual memory. It effectively controls the maximum scanning
-rate for each task.
-
-numa_balancing_scan_delay_ms is the starting "scan delay" used for a task
-when it initially forks.
-
-numa_balancing_scan_period_max_ms is the maximum time in milliseconds to
-scan a tasks virtual memory. It effectively controls the minimum scanning
-rate for each task.
-
-numa_balancing_scan_size_mb is how many megabytes worth of pages are
-scanned for a given scan.
-
-==============================================================
-
-osrelease, ostype & version:
-
-# cat osrelease
-2.1.88
-# cat ostype
-Linux
-# cat version
-#5 Wed Feb 25 21:49:24 MET 1998
-
-The files osrelease and ostype should be clear enough. Version
-needs a little more clarification however. The '#5' means that
-this is the fifth kernel built from this source base and the
-date behind it indicates the time the kernel was built.
-The only way to tune these values is to rebuild the kernel :-)
-
-==============================================================
-
-overflowgid & overflowuid:
-
-if your architecture did not always support 32-bit UIDs (i.e. arm,
-i386, m68k, sh, and sparc32), a fixed UID and GID will be returned to
-applications that use the old 16-bit UID/GID system calls, if the
-actual UID or GID would exceed 65535.
-
-These sysctls allow you to change the value of the fixed UID and GID.
-The default is 65534.
-
-==============================================================
-
-panic:
-
-The value in this file represents the number of seconds the kernel
-waits before rebooting on a panic. When you use the software watchdog,
-the recommended setting is 60.
-
-==============================================================
-
-panic_on_io_nmi:
-
-Controls the kernel's behavior when a CPU receives an NMI caused by
-an IO error.
-
-0: try to continue operation (default)
-
-1: panic immediately. The IO error triggered an NMI. This indicates a
-   serious system condition which could result in IO data corruption.
-   Rather than continuing, panicking might be a better choice. Some
-   servers issue this sort of NMI when the dump button is pushed,
-   and you can use this option to take a crash dump.
-
-==============================================================
-
-panic_on_oops:
-
-Controls the kernel's behaviour when an oops or BUG is encountered.
-
-0: try to continue operation
-
-1: panic immediately.  If the `panic' sysctl is also non-zero then the
-   machine will be rebooted.
-
-==============================================================
-
-panic_on_stackoverflow:
-
-Controls the kernel's behavior when detecting the overflows of
-kernel, IRQ and exception stacks except a user stack.
-This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
-
-0: try to continue operation.
-
-1: panic immediately.
-
-==============================================================
-
-panic_on_unrecovered_nmi:
-
-The default Linux behaviour on an NMI of either memory or unknown is
-to continue operation. For many environments such as scientific
-computing it is preferable that the box is taken out and the error
-dealt with than an uncorrected parity/ECC error get propagated.
-
-A small number of systems do generate NMI's for bizarre random reasons
-such as power management so the default is off. That sysctl works like
-the existing panic controls already in that directory.
-
-==============================================================
-
-panic_on_warn:
-
-Calls panic() in the WARN() path when set to 1.  This is useful to avoid
-a kernel rebuild when attempting to kdump at the location of a WARN().
-
-0: only WARN(), default behaviour.
-
-1: call panic() after printing out WARN() location.
-
-==============================================================
-
-panic_print:
-
-Bitmask for printing system info when panic happens. User can chose
-combination of the following bits:
-
-bit 0: print all tasks info
-bit 1: print system memory info
-bit 2: print timer info
-bit 3: print locks info if CONFIG_LOCKDEP is on
-bit 4: print ftrace buffer
-
-So for example to print tasks and memory info on panic, user can:
-  echo 3 > /proc/sys/kernel/panic_print
-
-==============================================================
-
-panic_on_rcu_stall:
-
-When set to 1, calls panic() after RCU stall detection messages. This
-is useful to define the root cause of RCU stalls using a vmcore.
-
-0: do not panic() when RCU stall takes place, default behavior.
-
-1: panic() after printing RCU stall messages.
-
-==============================================================
-
-perf_cpu_time_max_percent:
-
-Hints to the kernel how much CPU time it should be allowed to
-use to handle perf sampling events.  If the perf subsystem
-is informed that its samples are exceeding this limit, it
-will drop its sampling frequency to attempt to reduce its CPU
-usage.
-
-Some perf sampling happens in NMIs.  If these samples
-unexpectedly take too long to execute, the NMIs can become
-stacked up next to each other so much that nothing else is
-allowed to execute.
-
-0: disable the mechanism.  Do not monitor or correct perf's
-   sampling rate no matter how CPU time it takes.
-
-1-100: attempt to throttle perf's sample rate to this
-   percentage of CPU.  Note: the kernel calculates an
-   "expected" length of each sample event.  100 here means
-   100% of that expected length.  Even if this is set to
-   100, you may still see sample throttling if this
-   length is exceeded.  Set to 0 if you truly do not care
-   how much CPU is consumed.
-
-==============================================================
-
-perf_event_paranoid:
-
-Controls use of the performance events system by unprivileged
-users (without CAP_SYS_ADMIN).  The default value is 2.
-
- -1: Allow use of (almost) all events by all users
-     Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
->=0: Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN
-     Disallow raw tracepoint access by users without CAP_SYS_ADMIN
->=1: Disallow CPU event access by users without CAP_SYS_ADMIN
->=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
-
-==============================================================
-
-perf_event_max_stack:
-
-Controls maximum number of stack frames to copy for (attr.sample_type &
-PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
-'perf record -g' or 'perf trace --call-graph fp'.
-
-This can only be done when no events are in use that have callchains
-enabled, otherwise writing to this file will return -EBUSY.
-
-The default value is 127.
-
-==============================================================
-
-perf_event_mlock_kb:
-
-Control size of per-cpu ring buffer not counted agains mlock limit.
-
-The default value is 512 + 1 page
-
-==============================================================
-
-perf_event_max_contexts_per_stack:
-
-Controls maximum number of stack frame context entries for
-(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for
-instance, when using 'perf record -g' or 'perf trace --call-graph fp'.
-
-This can only be done when no events are in use that have callchains
-enabled, otherwise writing to this file will return -EBUSY.
-
-The default value is 8.
-
-==============================================================
-
-pid_max:
-
-PID allocation wrap value.  When the kernel's next PID value
-reaches this value, it wraps back to a minimum PID value.
-PIDs of value pid_max or larger are not allocated.
-
-==============================================================
-
-ns_last_pid:
-
-The last pid allocated in the current (the one task using this sysctl
-lives in) pid namespace. When selecting a pid for a next task on fork
-kernel tries to allocate a number starting from this one.
-
-==============================================================
-
-powersave-nap: (PPC only)
-
-If set, Linux-PPC will use the 'nap' mode of powersaving,
-otherwise the 'doze' mode will be used.
-
-==============================================================
-
-printk:
-
-The four values in printk denote: console_loglevel,
-default_message_loglevel, minimum_console_loglevel and
-default_console_loglevel respectively.
-
-These values influence printk() behavior when printing or
-logging error messages. See 'man 2 syslog' for more info on
-the different loglevels.
-
-- console_loglevel: messages with a higher priority than
-  this will be printed to the console
-- default_message_loglevel: messages without an explicit priority
-  will be printed with this priority
-- minimum_console_loglevel: minimum (highest) value to which
-  console_loglevel can be set
-- default_console_loglevel: default value for console_loglevel
-
-==============================================================
-
-printk_delay:
-
-Delay each printk message in printk_delay milliseconds
-
-Value from 0 - 10000 is allowed.
-
-==============================================================
-
-printk_ratelimit:
-
-Some warning messages are rate limited. printk_ratelimit specifies
-the minimum length of time between these messages (in jiffies), by
-default we allow one every 5 seconds.
-
-A value of 0 will disable rate limiting.
-
-==============================================================
-
-printk_ratelimit_burst:
-
-While long term we enforce one message per printk_ratelimit
-seconds, we do allow a burst of messages to pass through.
-printk_ratelimit_burst specifies the number of messages we can
-send before ratelimiting kicks in.
-
-==============================================================
-
-printk_devkmsg:
-
-Control the logging to /dev/kmsg from userspace:
-
-ratelimit: default, ratelimited
-on: unlimited logging to /dev/kmsg from userspace
-off: logging to /dev/kmsg disabled
-
-The kernel command line parameter printk.devkmsg= overrides this and is
-a one-time setting until next reboot: once set, it cannot be changed by
-this sysctl interface anymore.
-
-==============================================================
-
-randomize_va_space:
-
-This option can be used to select the type of process address
-space randomization that is used in the system, for architectures
-that support this feature.
-
-0 - Turn the process address space randomization off.  This is the
-    default for architectures that do not support this feature anyways,
-    and kernels that are booted with the "norandmaps" parameter.
-
-1 - Make the addresses of mmap base, stack and VDSO page randomized.
-    This, among other things, implies that shared libraries will be
-    loaded to random addresses.  Also for PIE-linked binaries, the
-    location of code start is randomized.  This is the default if the
-    CONFIG_COMPAT_BRK option is enabled.
-
-2 - Additionally enable heap randomization.  This is the default if
-    CONFIG_COMPAT_BRK is disabled.
-
-    There are a few legacy applications out there (such as some ancient
-    versions of libc.so.5 from 1996) that assume that brk area starts
-    just after the end of the code+bss.  These applications break when
-    start of the brk area is randomized.  There are however no known
-    non-legacy applications that would be broken this way, so for most
-    systems it is safe to choose full randomization.
-
-    Systems with ancient and/or broken binaries should be configured
-    with CONFIG_COMPAT_BRK enabled, which excludes the heap from process
-    address space randomization.
-
-==============================================================
-
-reboot-cmd: (Sparc only)
-
-??? This seems to be a way to give an argument to the Sparc
-ROM/Flash boot loader. Maybe to tell it what to do after
-rebooting. ???
-
-==============================================================
-
-rtsig-max & rtsig-nr:
-
-The file rtsig-max can be used to tune the maximum number
-of POSIX realtime (queued) signals that can be outstanding
-in the system.
-
-rtsig-nr shows the number of RT signals currently queued.
-
-==============================================================
-
-sched_energy_aware:
-
-Enables/disables Energy Aware Scheduling (EAS). EAS starts
-automatically on platforms where it can run (that is,
-platforms with asymmetric CPU topologies and having an Energy
-Model available). If your platform happens to meet the
-requirements for EAS but you do not want to use it, change
-this value to 0.
-
-==============================================================
-
-sched_schedstats:
-
-Enables/disables scheduler statistics. Enabling this feature
-incurs a small amount of overhead in the scheduler but is
-useful for debugging and performance tuning.
-
-==============================================================
-
-sg-big-buff:
-
-This file shows the size of the generic SCSI (sg) buffer.
-You can't tune it just yet, but you could change it on
-compile time by editing include/scsi/sg.h and changing
-the value of SG_BIG_BUFF.
-
-There shouldn't be any reason to change this value. If
-you can come up with one, you probably know what you
-are doing anyway :)
-
-==============================================================
-
-shmall:
-
-This parameter sets the total amount of shared memory pages that
-can be used system wide. Hence, SHMALL should always be at least
-ceil(shmmax/PAGE_SIZE).
-
-If you are not sure what the default PAGE_SIZE is on your Linux
-system, you can run the following command:
-
-# getconf PAGE_SIZE
-
-==============================================================
-
-shmmax:
-
-This value can be used to query and set the run time limit
-on the maximum shared memory segment size that can be created.
-Shared memory segments up to 1Gb are now supported in the
-kernel.  This value defaults to SHMMAX.
-
-==============================================================
-
-shm_rmid_forced:
-
-Linux lets you set resource limits, including how much memory one
-process can consume, via setrlimit(2).  Unfortunately, shared memory
-segments are allowed to exist without association with any process, and
-thus might not be counted against any resource limits.  If enabled,
-shared memory segments are automatically destroyed when their attach
-count becomes zero after a detach or a process termination.  It will
-also destroy segments that were created, but never attached to, on exit
-from the process.  The only use left for IPC_RMID is to immediately
-destroy an unattached segment.  Of course, this breaks the way things are
-defined, so some applications might stop working.  Note that this
-feature will do you no good unless you also configure your resource
-limits (in particular, RLIMIT_AS and RLIMIT_NPROC).  Most systems don't
-need this.
-
-Note that if you change this from 0 to 1, already created segments
-without users and with a dead originative process will be destroyed.
-
-==============================================================
-
-sysctl_writes_strict:
-
-Control how file position affects the behavior of updating sysctl values
-via the /proc/sys interface:
-
-  -1 - Legacy per-write sysctl value handling, with no printk warnings.
-       Each write syscall must fully contain the sysctl value to be
-       written, and multiple writes on the same sysctl file descriptor
-       will rewrite the sysctl value, regardless of file position.
-   0 - Same behavior as above, but warn about processes that perform writes
-       to a sysctl file descriptor when the file position is not 0.
-   1 - (default) Respect file position when writing sysctl strings. Multiple
-       writes will append to the sysctl value buffer. Anything past the max
-       length of the sysctl value buffer will be ignored. Writes to numeric
-       sysctl entries must always be at file position 0 and the value must
-       be fully contained in the buffer sent in the write syscall.
-
-==============================================================
-
-softlockup_all_cpu_backtrace:
-
-This value controls the soft lockup detector thread's behavior
-when a soft lockup condition is detected as to whether or not
-to gather further debug information. If enabled, each cpu will
-be issued an NMI and instructed to capture stack trace.
-
-This feature is only applicable for architectures which support
-NMI.
-
-0: do nothing. This is the default behavior.
-
-1: on detection capture more debug information.
-
-==============================================================
-
-soft_watchdog
-
-This parameter can be used to control the soft lockup detector.
-
-   0 - disable the soft lockup detector
-   1 - enable the soft lockup detector
-
-The soft lockup detector monitors CPUs for threads that are hogging the CPUs
-without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads
-from running. The mechanism depends on the CPUs ability to respond to timer
-interrupts which are needed for the 'watchdog/N' threads to be woken up by
-the watchdog timer function, otherwise the NMI watchdog - if enabled - can
-detect a hard lockup condition.
-
-==============================================================
-
-stack_erasing
-
-This parameter can be used to control kernel stack erasing at the end
-of syscalls for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK.
-
-That erasing reduces the information which kernel stack leak bugs
-can reveal and blocks some uninitialized stack variable attacks.
-The tradeoff is the performance impact: on a single CPU system kernel
-compilation sees a 1% slowdown, other systems and workloads may vary.
-
-  0: kernel stack erasing is disabled, STACKLEAK_METRICS are not updated.
-
-  1: kernel stack erasing is enabled (default), it is performed before
-     returning to the userspace at the end of syscalls.
-==============================================================
-
-tainted
-
-Non-zero if the kernel has been tainted. Numeric values, which can be
-ORed together. The letters are seen in "Tainted" line of Oops reports.
-
-     1 (P): proprietary module was loaded
-     2 (F): module was force loaded
-     4 (S): SMP kernel oops on an officially SMP incapable processor
-     8 (R): module was force unloaded
-    16 (M): processor reported a Machine Check Exception (MCE)
-    32 (B): bad page referenced or some unexpected page flags
-    64 (U): taint requested by userspace application
-   128 (D): kernel died recently, i.e. there was an OOPS or BUG
-   256 (A): an ACPI table was overridden by user
-   512 (W): kernel issued warning
-  1024 (C): staging driver was loaded
-  2048 (I): workaround for bug in platform firmware applied
-  4096 (O): externally-built ("out-of-tree") module was loaded
-  8192 (E): unsigned module was loaded
- 16384 (L): soft lockup occurred
- 32768 (K): kernel has been live patched
- 65536 (X): Auxiliary taint, defined and used by for distros
-131072 (T): The kernel was built with the struct randomization plugin
-
-See Documentation/admin-guide/tainted-kernels.rst for more information.
-
-==============================================================
-
-threads-max
-
-This value controls the maximum number of threads that can be created
-using fork().
-
-During initialization the kernel sets this value such that even if the
-maximum number of threads is created, the thread structures occupy only
-a part (1/8th) of the available RAM pages.
-
-The minimum value that can be written to threads-max is 20.
-The maximum value that can be written to threads-max is given by the
-constant FUTEX_TID_MASK (0x3fffffff).
-If a value outside of this range is written to threads-max an error
-EINVAL occurs.
-
-The value written is checked against the available RAM pages. If the
-thread structures would occupy too much (more than 1/8th) of the
-available RAM pages threads-max is reduced accordingly.
-
-==============================================================
-
-unknown_nmi_panic:
-
-The value in this file affects behavior of handling NMI. When the
-value is non-zero, unknown NMI is trapped and then panic occurs. At
-that time, kernel debugging information is displayed on console.
-
-NMI switch that most IA32 servers have fires unknown NMI up, for
-example.  If a system hangs up, try pressing the NMI switch.
-
-==============================================================
-
-watchdog:
-
-This parameter can be used to disable or enable the soft lockup detector
-_and_ the NMI watchdog (i.e. the hard lockup detector) at the same time.
-
-   0 - disable both lockup detectors
-   1 - enable both lockup detectors
-
-The soft lockup detector and the NMI watchdog can also be disabled or
-enabled individually, using the soft_watchdog and nmi_watchdog parameters.
-If the watchdog parameter is read, for example by executing
-
-   cat /proc/sys/kernel/watchdog
-
-the output of this command (0 or 1) shows the logical OR of soft_watchdog
-and nmi_watchdog.
-
-==============================================================
-
-watchdog_cpumask:
-
-This value can be used to control on which cpus the watchdog may run.
-The default cpumask is all possible cores, but if NO_HZ_FULL is
-enabled in the kernel config, and cores are specified with the
-nohz_full= boot argument, those cores are excluded by default.
-Offline cores can be included in this mask, and if the core is later
-brought online, the watchdog will be started based on the mask value.
-
-Typically this value would only be touched in the nohz_full case
-to re-enable cores that by default were not running the watchdog,
-if a kernel lockup was suspected on those cores.
-
-The argument value is the standard cpulist format for cpumasks,
-so for example to enable the watchdog on cores 0, 2, 3, and 4 you
-might say:
-
-  echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask
-
-==============================================================
-
-watchdog_thresh:
-
-This value can be used to control the frequency of hrtimer and NMI
-events and the soft and hard lockup thresholds. The default threshold
-is 10 seconds.
-
-The softlockup threshold is (2 * watchdog_thresh). Setting this
-tunable to zero will disable lockup detection altogether.
-
-==============================================================
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
deleted file mode 100644
index 2ae91d3873bb..000000000000
--- a/Documentation/sysctl/net.txt
+++ /dev/null
@@ -1,422 +0,0 @@
-Documentation for /proc/sys/net/*
-	(c) 1999		Terrehon Bowden <terrehon@pacbell.net>
-				Bodo Bauer <bb@ricochet.net>
-	(c) 2000		Jorge Nerin <comandante@zaralinux.com>
-	(c) 2009		Shen Feng <shen@cn.fujitsu.com>
-
-For general info and legal blurb, please look in README.
-
-==============================================================
-
-This file contains the documentation for the sysctl files in
-/proc/sys/net
-
-The interface  to  the  networking  parts  of  the  kernel  is  located  in
-/proc/sys/net. The following table shows all possible subdirectories.  You may
-see only some of them, depending on your kernel's configuration.
-
-
-Table : Subdirectories in /proc/sys/net
-..............................................................................
- Directory Content             Directory  Content
- core      General parameter   appletalk  Appletalk protocol
- unix      Unix domain sockets netrom     NET/ROM
- 802       E802 protocol       ax25       AX25
- ethernet  Ethernet protocol   rose       X.25 PLP layer
- ipv4      IP version 4        x25        X.25 protocol
- ipx       IPX                 token-ring IBM token ring
- bridge    Bridging            decnet     DEC net
- ipv6      IP version 6        tipc       TIPC
-..............................................................................
-
-1. /proc/sys/net/core - Network core options
--------------------------------------------------------
-
-bpf_jit_enable
---------------
-
-This enables the BPF Just in Time (JIT) compiler. BPF is a flexible
-and efficient infrastructure allowing to execute bytecode at various
-hook points. It is used in a number of Linux kernel subsystems such
-as networking (e.g. XDP, tc), tracing (e.g. kprobes, uprobes, tracepoints)
-and security (e.g. seccomp). LLVM has a BPF back end that can compile
-restricted C into a sequence of BPF instructions. After program load
-through bpf(2) and passing a verifier in the kernel, a JIT will then
-translate these BPF proglets into native CPU instructions. There are
-two flavors of JITs, the newer eBPF JIT currently supported on:
-  - x86_64
-  - x86_32
-  - arm64
-  - arm32
-  - ppc64
-  - sparc64
-  - mips64
-  - s390x
-  - riscv
-
-And the older cBPF JIT supported on the following archs:
-  - mips
-  - ppc
-  - sparc
-
-eBPF JITs are a superset of cBPF JITs, meaning the kernel will
-migrate cBPF instructions into eBPF instructions and then JIT
-compile them transparently. Older cBPF JITs can only translate
-tcpdump filters, seccomp rules, etc, but not mentioned eBPF
-programs loaded through bpf(2).
-
-Values :
-	0 - disable the JIT (default value)
-	1 - enable the JIT
-	2 - enable the JIT and ask the compiler to emit traces on kernel log.
-
-bpf_jit_harden
---------------
-
-This enables hardening for the BPF JIT compiler. Supported are eBPF
-JIT backends. Enabling hardening trades off performance, but can
-mitigate JIT spraying.
-Values :
-	0 - disable JIT hardening (default value)
-	1 - enable JIT hardening for unprivileged users only
-	2 - enable JIT hardening for all users
-
-bpf_jit_kallsyms
-----------------
-
-When BPF JIT compiler is enabled, then compiled images are unknown
-addresses to the kernel, meaning they neither show up in traces nor
-in /proc/kallsyms. This enables export of these addresses, which can
-be used for debugging/tracing. If bpf_jit_harden is enabled, this
-feature is disabled.
-Values :
-	0 - disable JIT kallsyms export (default value)
-	1 - enable JIT kallsyms export for privileged users only
-
-bpf_jit_limit
--------------
-
-This enforces a global limit for memory allocations to the BPF JIT
-compiler in order to reject unprivileged JIT requests once it has
-been surpassed. bpf_jit_limit contains the value of the global limit
-in bytes.
-
-dev_weight
---------------
-
-The maximum number of packets that kernel can handle on a NAPI interrupt,
-it's a Per-CPU variable. For drivers that support LRO or GRO_HW, a hardware
-aggregated packet is counted as one packet in this context.
-
-Default: 64
-
-dev_weight_rx_bias
---------------
-
-RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function
-of the driver for the per softirq cycle netdev_budget. This parameter influences
-the proportion of the configured netdev_budget that is spent on RPS based packet
-processing during RX softirq cycles. It is further meant for making current
-dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack.
-(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based
-on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias).
-Default: 1
-
-dev_weight_tx_bias
---------------
-
-Scales the maximum number of packets that can be processed during a TX softirq cycle.
-Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric
-net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog.
-Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
-Default: 1
-
-default_qdisc
---------------
-
-The default queuing discipline to use for network devices. This allows
-overriding the default of pfifo_fast with an alternative. Since the default
-queuing discipline is created without additional parameters so is best suited
-to queuing disciplines that work well without configuration like stochastic
-fair queue (sfq), CoDel (codel) or fair queue CoDel (fq_codel). Don't use
-queuing disciplines like Hierarchical Token Bucket or Deficit Round Robin
-which require setting up classes and bandwidths. Note that physical multiqueue
-interfaces still use mq as root qdisc, which in turn uses this default for its
-leaves. Virtual devices (like e.g. lo or veth) ignore this setting and instead
-default to noqueue.
-Default: pfifo_fast
-
-busy_read
-----------------
-Low latency busy poll timeout for socket reads. (needs CONFIG_NET_RX_BUSY_POLL)
-Approximate time in us to busy loop waiting for packets on the device queue.
-This sets the default value of the SO_BUSY_POLL socket option.
-Can be set or overridden per socket by setting socket option SO_BUSY_POLL,
-which is the preferred method of enabling. If you need to enable the feature
-globally via sysctl, a value of 50 is recommended.
-Will increase power usage.
-Default: 0 (off)
-
-busy_poll
-----------------
-Low latency busy poll timeout for poll and select. (needs CONFIG_NET_RX_BUSY_POLL)
-Approximate time in us to busy loop waiting for events.
-Recommended value depends on the number of sockets you poll on.
-For several sockets 50, for several hundreds 100.
-For more than that you probably want to use epoll.
-Note that only sockets with SO_BUSY_POLL set will be busy polled,
-so you want to either selectively set SO_BUSY_POLL on those sockets or set
-sysctl.net.busy_read globally.
-Will increase power usage.
-Default: 0 (off)
-
-rmem_default
-------------
-
-The default setting of the socket receive buffer in bytes.
-
-rmem_max
---------
-
-The maximum receive socket buffer size in bytes.
-
-tstamp_allow_data
------------------
-Allow processes to receive tx timestamps looped together with the original
-packet contents. If disabled, transmit timestamp requests from unprivileged
-processes are dropped unless socket option SOF_TIMESTAMPING_OPT_TSONLY is set.
-Default: 1 (on)
-
-
-wmem_default
-------------
-
-The default setting (in bytes) of the socket send buffer.
-
-wmem_max
---------
-
-The maximum send socket buffer size in bytes.
-
-message_burst and message_cost
-------------------------------
-
-These parameters  are used to limit the warning messages written to the kernel
-log from  the  networking  code.  They  enforce  a  rate  limit  to  make  a
-denial-of-service attack  impossible. A higher message_cost factor, results in
-fewer messages that will be written. Message_burst controls when messages will
-be dropped.  The  default  settings  limit  warning messages to one every five
-seconds.
-
-warnings
---------
-
-This sysctl is now unused.
-
-This was used to control console messages from the networking stack that
-occur because of problems on the network like duplicate address or bad
-checksums.
-
-These messages are now emitted at KERN_DEBUG and can generally be enabled
-and controlled by the dynamic_debug facility.
-
-netdev_budget
--------------
-
-Maximum number of packets taken from all interfaces in one polling cycle (NAPI
-poll). In one polling cycle interfaces which are registered to polling are
-probed in a round-robin manner. Also, a polling cycle may not exceed
-netdev_budget_usecs microseconds, even if netdev_budget has not been
-exhausted.
-
-netdev_budget_usecs
----------------------
-
-Maximum number of microseconds in one NAPI polling cycle. Polling
-will exit when either netdev_budget_usecs have elapsed during the
-poll cycle or the number of packets processed reaches netdev_budget.
-
-netdev_max_backlog
-------------------
-
-Maximum number  of  packets,  queued  on  the  INPUT  side, when the interface
-receives packets faster than kernel can process them.
-
-netdev_rss_key
---------------
-
-RSS (Receive Side Scaling) enabled drivers use a 40 bytes host key that is
-randomly generated.
-Some user space might need to gather its content even if drivers do not
-provide ethtool -x support yet.
-
-myhost:~# cat /proc/sys/net/core/netdev_rss_key
-84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8: ... (52 bytes total)
-
-File contains nul bytes if no driver ever called netdev_rss_key_fill() function.
-Note:
-/proc/sys/net/core/netdev_rss_key contains 52 bytes of key,
-but most drivers only use 40 bytes of it.
-
-myhost:~# ethtool -x eth0
-RX flow hash indirection table for eth0 with 8 RX ring(s):
-    0:    0     1     2     3     4     5     6     7
-RSS hash key:
-84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8:43:e3:c9:0c:fd:17:55:c2:3a:4d:69:ed:f1:42:89
-
-netdev_tstamp_prequeue
-----------------------
-
-If set to 0, RX packet timestamps can be sampled after RPS processing, when
-the target CPU processes packets. It might give some delay on timestamps, but
-permit to distribute the load on several cpus.
-
-If set to 1 (default), timestamps are sampled as soon as possible, before
-queueing.
-
-optmem_max
-----------
-
-Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
-of struct cmsghdr structures with appended data.
-
-fb_tunnels_only_for_init_net
-----------------------------
-
-Controls if fallback tunnels (like tunl0, gre0, gretap0, erspan0,
-sit0, ip6tnl0, ip6gre0) are automatically created when a new
-network namespace is created, if corresponding tunnel is present
-in initial network namespace.
-If set to 1, these devices are not automatically created, and
-user space is responsible for creating them if needed.
-
-Default : 0  (for compatibility reasons)
-
-devconf_inherit_init_net
-----------------------------
-
-Controls if a new network namespace should inherit all current
-settings under /proc/sys/net/{ipv4,ipv6}/conf/{all,default}/. By
-default, we keep the current behavior: for IPv4 we inherit all current
-settings from init_net and for IPv6 we reset all settings to default.
-
-If set to 1, both IPv4 and IPv6 settings are forced to inherit from
-current ones in init_net. If set to 2, both IPv4 and IPv6 settings are
-forced to reset to their default values.
-
-Default : 0  (for compatibility reasons)
-
-2. /proc/sys/net/unix - Parameters for Unix domain sockets
--------------------------------------------------------
-
-There is only one file in this directory.
-unix_dgram_qlen limits the max number of datagrams queued in Unix domain
-socket's buffer. It will not take effect unless PF_UNIX flag is specified.
-
-
-3. /proc/sys/net/ipv4 - IPV4 settings
--------------------------------------------------------
-Please see: Documentation/networking/ip-sysctl.txt and ipvs-sysctl.txt for
-descriptions of these entries.
-
-
-4. Appletalk
--------------------------------------------------------
-
-The /proc/sys/net/appletalk  directory  holds the Appletalk configuration data
-when Appletalk is loaded. The configurable parameters are:
-
-aarp-expiry-time
-----------------
-
-The amount  of  time  we keep an ARP entry before expiring it. Used to age out
-old hosts.
-
-aarp-resolve-time
------------------
-
-The amount of time we will spend trying to resolve an Appletalk address.
-
-aarp-retransmit-limit
----------------------
-
-The number of times we will retransmit a query before giving up.
-
-aarp-tick-time
---------------
-
-Controls the rate at which expires are checked.
-
-The directory  /proc/net/appletalk  holds the list of active Appletalk sockets
-on a machine.
-
-The fields  indicate  the DDP type, the local address (in network:node format)
-the remote  address,  the  size of the transmit pending queue, the size of the
-received queue  (bytes waiting for applications to read) the state and the uid
-owning the socket.
-
-/proc/net/atalk_iface lists  all  the  interfaces  configured for appletalk.It
-shows the  name  of the interface, its Appletalk address, the network range on
-that address  (or  network number for phase 1 networks), and the status of the
-interface.
-
-/proc/net/atalk_route lists  each  known  network  route.  It lists the target
-(network) that the route leads to, the router (may be directly connected), the
-route flags, and the device the route is using.
-
-
-5. IPX
--------------------------------------------------------
-
-The IPX protocol has no tunable values in proc/sys/net.
-
-The IPX  protocol  does,  however,  provide  proc/net/ipx. This lists each IPX
-socket giving  the  local  and  remote  addresses  in  Novell  format (that is
-network:node:port). In  accordance  with  the  strange  Novell  tradition,
-everything but the port is in hex. Not_Connected is displayed for sockets that
-are not  tied to a specific remote address. The Tx and Rx queue sizes indicate
-the number  of  bytes  pending  for  transmission  and  reception.  The  state
-indicates the  state  the  socket  is  in and the uid is the owning uid of the
-socket.
-
-The /proc/net/ipx_interface  file lists all IPX interfaces. For each interface
-it gives  the network number, the node number, and indicates if the network is
-the primary  network.  It  also  indicates  which  device  it  is bound to (or
-Internal for  internal  networks)  and  the  Frame  Type if appropriate. Linux
-supports 802.3,  802.2,  802.2  SNAP  and DIX (Blue Book) ethernet framing for
-IPX.
-
-The /proc/net/ipx_route  table  holds  a list of IPX routes. For each route it
-gives the  destination  network, the router node (or Directly) and the network
-address of the router (or Connected) for internal networks.
-
-6. TIPC
--------------------------------------------------------
-
-tipc_rmem
-----------
-
-The TIPC protocol now has a tunable for the receive memory, similar to the
-tcp_rmem - i.e. a vector of 3 INTEGERs: (min, default, max)
-
-    # cat /proc/sys/net/tipc/tipc_rmem
-    4252725 34021800        68043600
-    #
-
-The max value is set to CONN_OVERLOAD_LIMIT, and the default and min values
-are scaled (shifted) versions of that same value.  Note that the min value
-is not at this point in time used in any meaningful way, but the triplet is
-preserved in order to be consistent with things like tcp_rmem.
-
-named_timeout
---------------
-
-TIPC name table updates are distributed asynchronously in a cluster, without
-any form of transaction handling. This means that different race scenarios are
-possible. One such is that a name withdrawal sent out by one node and received
-by another node may arrive after a second, overlapping name publication already
-has been accepted from a third node, although the conflicting updates
-originally may have been issued in the correct sequential order.
-If named_timeout is nonzero, failed topology updates will be placed on a defer
-queue until another event arrives that clears the error, or until the timeout
-expires. Value is in milliseconds.
diff --git a/Documentation/sysctl/sunrpc.txt b/Documentation/sysctl/sunrpc.txt
deleted file mode 100644
index ae1ecac6f85a..000000000000
--- a/Documentation/sysctl/sunrpc.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-Documentation for /proc/sys/sunrpc/*	kernel version 2.2.10
-	(c) 1998, 1999,  Rik van Riel <riel@nl.linux.org>
-
-For general info and legal blurb, please look in README.
-
-==============================================================
-
-This file contains the documentation for the sysctl files in
-/proc/sys/sunrpc and is valid for Linux kernel version 2.2.
-
-The files in this directory can be used to (re)set the debug
-flags of the SUN Remote Procedure Call (RPC) subsystem in
-the Linux kernel. This stuff is used for NFS, KNFSD and
-maybe a few other things as well.
-
-The files in there are used to control the debugging flags:
-rpc_debug, nfs_debug, nfsd_debug and nlm_debug.
-
-These flags are for kernel hackers only. You should read the
-source code in net/sunrpc/ for more information.
diff --git a/Documentation/sysctl/user.txt b/Documentation/sysctl/user.txt
deleted file mode 100644
index a5882865836e..000000000000
--- a/Documentation/sysctl/user.txt
+++ /dev/null
@@ -1,66 +0,0 @@
-Documentation for /proc/sys/user/*	kernel version 4.9.0
-	(c) 2016		Eric Biederman <ebiederm@xmission.com>
-
-==============================================================
-
-This file contains the documentation for the sysctl files in
-/proc/sys/user.
-
-The files in this directory can be used to override the default
-limits on the number of namespaces and other objects that have
-per user per user namespace limits.
-
-The primary purpose of these limits is to stop programs that
-malfunction and attempt to create a ridiculous number of objects,
-before the malfunction becomes a system wide problem.  It is the
-intention that the defaults of these limits are set high enough that
-no program in normal operation should run into these limits.
-
-The creation of per user per user namespace objects are charged to
-the user in the user namespace who created the object and
-verified to be below the per user limit in that user namespace.
-
-The creation of objects is also charged to all of the users
-who created user namespaces the creation of the object happens
-in (user namespaces can be nested) and verified to be below the per user
-limits in the user namespaces of those users.
-
-This recursive counting of created objects ensures that creating a
-user namespace does not allow a user to escape their current limits.
-
-Currently, these files are in /proc/sys/user:
-
-- max_cgroup_namespaces
-
-  The maximum number of cgroup namespaces that any user in the current
-  user namespace may create.
-
-- max_ipc_namespaces
-
-  The maximum number of ipc namespaces that any user in the current
-  user namespace may create.
-
-- max_mnt_namespaces
-
-  The maximum number of mount namespaces that any user in the current
-  user namespace may create.
-
-- max_net_namespaces
-
-  The maximum number of network namespaces that any user in the
-  current user namespace may create.
-
-- max_pid_namespaces
-
-  The maximum number of pid namespaces that any user in the current
-  user namespace may create.
-
-- max_user_namespaces
-
-  The maximum number of user namespaces that any user in the current
-  user namespace may create.
-
-- max_uts_namespaces
-
-  The maximum number of user namespaces that any user in the current
-  user namespace may create.
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
deleted file mode 100644
index 749322060f10..000000000000
--- a/Documentation/sysctl/vm.txt
+++ /dev/null
@@ -1,946 +0,0 @@
-Documentation for /proc/sys/vm/*	kernel version 2.6.29
-	(c) 1998, 1999,  Rik van Riel <riel@nl.linux.org>
-	(c) 2008         Peter W. Morreale <pmorreale@novell.com>
-
-For general info and legal blurb, please look in README.
-
-==============================================================
-
-This file contains the documentation for the sysctl files in
-/proc/sys/vm and is valid for Linux kernel version 2.6.29.
-
-The files in this directory can be used to tune the operation
-of the virtual memory (VM) subsystem of the Linux kernel and
-the writeout of dirty data to disk.
-
-Default values and initialization routines for most of these
-files can be found in mm/swap.c.
-
-Currently, these files are in /proc/sys/vm:
-
-- admin_reserve_kbytes
-- block_dump
-- compact_memory
-- compact_unevictable_allowed
-- dirty_background_bytes
-- dirty_background_ratio
-- dirty_bytes
-- dirty_expire_centisecs
-- dirty_ratio
-- dirtytime_expire_seconds
-- dirty_writeback_centisecs
-- drop_caches
-- extfrag_threshold
-- hugetlb_shm_group
-- laptop_mode
-- legacy_va_layout
-- lowmem_reserve_ratio
-- max_map_count
-- memory_failure_early_kill
-- memory_failure_recovery
-- min_free_kbytes
-- min_slab_ratio
-- min_unmapped_ratio
-- mmap_min_addr
-- mmap_rnd_bits
-- mmap_rnd_compat_bits
-- nr_hugepages
-- nr_hugepages_mempolicy
-- nr_overcommit_hugepages
-- nr_trim_pages         (only if CONFIG_MMU=n)
-- numa_zonelist_order
-- oom_dump_tasks
-- oom_kill_allocating_task
-- overcommit_kbytes
-- overcommit_memory
-- overcommit_ratio
-- page-cluster
-- panic_on_oom
-- percpu_pagelist_fraction
-- stat_interval
-- stat_refresh
-- numa_stat
-- swappiness
-- unprivileged_userfaultfd
-- user_reserve_kbytes
-- vfs_cache_pressure
-- watermark_boost_factor
-- watermark_scale_factor
-- zone_reclaim_mode
-
-==============================================================
-
-admin_reserve_kbytes
-
-The amount of free memory in the system that should be reserved for users
-with the capability cap_sys_admin.
-
-admin_reserve_kbytes defaults to min(3% of free pages, 8MB)
-
-That should provide enough for the admin to log in and kill a process,
-if necessary, under the default overcommit 'guess' mode.
-
-Systems running under overcommit 'never' should increase this to account
-for the full Virtual Memory Size of programs used to recover. Otherwise,
-root may not be able to log in to recover the system.
-
-How do you calculate a minimum useful reserve?
-
-sshd or login + bash (or some other shell) + top (or ps, kill, etc.)
-
-For overcommit 'guess', we can sum resident set sizes (RSS).
-On x86_64 this is about 8MB.
-
-For overcommit 'never', we can take the max of their virtual sizes (VSZ)
-and add the sum of their RSS.
-On x86_64 this is about 128MB.
-
-Changing this takes effect whenever an application requests memory.
-
-==============================================================
-
-block_dump
-
-block_dump enables block I/O debugging when set to a nonzero value. More
-information on block I/O debugging is in Documentation/laptops/laptop-mode.txt.
-
-==============================================================
-
-compact_memory
-
-Available only when CONFIG_COMPACTION is set. When 1 is written to the file,
-all zones are compacted such that free memory is available in contiguous
-blocks where possible. This can be important for example in the allocation of
-huge pages although processes will also directly compact memory as required.
-
-==============================================================
-
-compact_unevictable_allowed
-
-Available only when CONFIG_COMPACTION is set. When set to 1, compaction is
-allowed to examine the unevictable lru (mlocked pages) for pages to compact.
-This should be used on systems where stalls for minor page faults are an
-acceptable trade for large contiguous free memory.  Set to 0 to prevent
-compaction from moving pages that are unevictable.  Default value is 1.
-
-==============================================================
-
-dirty_background_bytes
-
-Contains the amount of dirty memory at which the background kernel
-flusher threads will start writeback.
-
-Note: dirty_background_bytes is the counterpart of dirty_background_ratio. Only
-one of them may be specified at a time. When one sysctl is written it is
-immediately taken into account to evaluate the dirty memory limits and the
-other appears as 0 when read.
-
-==============================================================
-
-dirty_background_ratio
-
-Contains, as a percentage of total available memory that contains free pages
-and reclaimable pages, the number of pages at which the background kernel
-flusher threads will start writing out dirty data.
-
-The total available memory is not equal to total system memory.
-
-==============================================================
-
-dirty_bytes
-
-Contains the amount of dirty memory at which a process generating disk writes
-will itself start writeback.
-
-Note: dirty_bytes is the counterpart of dirty_ratio. Only one of them may be
-specified at a time. When one sysctl is written it is immediately taken into
-account to evaluate the dirty memory limits and the other appears as 0 when
-read.
-
-Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any
-value lower than this limit will be ignored and the old configuration will be
-retained.
-
-==============================================================
-
-dirty_expire_centisecs
-
-This tunable is used to define when dirty data is old enough to be eligible
-for writeout by the kernel flusher threads.  It is expressed in 100'ths
-of a second.  Data which has been dirty in-memory for longer than this
-interval will be written out next time a flusher thread wakes up.
-
-==============================================================
-
-dirty_ratio
-
-Contains, as a percentage of total available memory that contains free pages
-and reclaimable pages, the number of pages at which a process which is
-generating disk writes will itself start writing out dirty data.
-
-The total available memory is not equal to total system memory.
-
-==============================================================
-
-dirtytime_expire_seconds
-
-When a lazytime inode is constantly having its pages dirtied, the inode with
-an updated timestamp will never get chance to be written out.  And, if the
-only thing that has happened on the file system is a dirtytime inode caused
-by an atime update, a worker will be scheduled to make sure that inode
-eventually gets pushed out to disk.  This tunable is used to define when dirty
-inode is old enough to be eligible for writeback by the kernel flusher threads.
-And, it is also used as the interval to wakeup dirtytime_writeback thread.
-
-==============================================================
-
-dirty_writeback_centisecs
-
-The kernel flusher threads will periodically wake up and write `old' data
-out to disk.  This tunable expresses the interval between those wakeups, in
-100'ths of a second.
-
-Setting this to zero disables periodic writeback altogether.
-
-==============================================================
-
-drop_caches
-
-Writing to this will cause the kernel to drop clean caches, as well as
-reclaimable slab objects like dentries and inodes.  Once dropped, their
-memory becomes free.
-
-To free pagecache:
-	echo 1 > /proc/sys/vm/drop_caches
-To free reclaimable slab objects (includes dentries and inodes):
-	echo 2 > /proc/sys/vm/drop_caches
-To free slab objects and pagecache:
-	echo 3 > /proc/sys/vm/drop_caches
-
-This is a non-destructive operation and will not free any dirty objects.
-To increase the number of objects freed by this operation, the user may run
-`sync' prior to writing to /proc/sys/vm/drop_caches.  This will minimize the
-number of dirty objects on the system and create more candidates to be
-dropped.
-
-This file is not a means to control the growth of the various kernel caches
-(inodes, dentries, pagecache, etc...)  These objects are automatically
-reclaimed by the kernel when memory is needed elsewhere on the system.
-
-Use of this file can cause performance problems.  Since it discards cached
-objects, it may cost a significant amount of I/O and CPU to recreate the
-dropped objects, especially if they were under heavy use.  Because of this,
-use outside of a testing or debugging environment is not recommended.
-
-You may see informational messages in your kernel log when this file is
-used:
-
-	cat (1234): drop_caches: 3
-
-These are informational only.  They do not mean that anything is wrong
-with your system.  To disable them, echo 4 (bit 2) into drop_caches.
-
-==============================================================
-
-extfrag_threshold
-
-This parameter affects whether the kernel will compact memory or direct
-reclaim to satisfy a high-order allocation. The extfrag/extfrag_index file in
-debugfs shows what the fragmentation index for each order is in each zone in
-the system. Values tending towards 0 imply allocations would fail due to lack
-of memory, values towards 1000 imply failures are due to fragmentation and -1
-implies that the allocation will succeed as long as watermarks are met.
-
-The kernel will not compact memory in a zone if the
-fragmentation index is <= extfrag_threshold. The default value is 500.
-
-==============================================================
-
-highmem_is_dirtyable
-
-Available only for systems with CONFIG_HIGHMEM enabled (32b systems).
-
-This parameter controls whether the high memory is considered for dirty
-writers throttling.  This is not the case by default which means that
-only the amount of memory directly visible/usable by the kernel can
-be dirtied. As a result, on systems with a large amount of memory and
-lowmem basically depleted writers might be throttled too early and
-streaming writes can get very slow.
-
-Changing the value to non zero would allow more memory to be dirtied
-and thus allow writers to write more data which can be flushed to the
-storage more effectively. Note this also comes with a risk of pre-mature
-OOM killer because some writers (e.g. direct block device writes) can
-only use the low memory and they can fill it up with dirty data without
-any throttling.
-
-==============================================================
-
-hugetlb_shm_group
-
-hugetlb_shm_group contains group id that is allowed to create SysV
-shared memory segment using hugetlb page.
-
-==============================================================
-
-laptop_mode
-
-laptop_mode is a knob that controls "laptop mode". All the things that are
-controlled by this knob are discussed in Documentation/laptops/laptop-mode.txt.
-
-==============================================================
-
-legacy_va_layout
-
-If non-zero, this sysctl disables the new 32-bit mmap layout - the kernel
-will use the legacy (2.4) layout for all processes.
-
-==============================================================
-
-lowmem_reserve_ratio
-
-For some specialised workloads on highmem machines it is dangerous for
-the kernel to allow process memory to be allocated from the "lowmem"
-zone.  This is because that memory could then be pinned via the mlock()
-system call, or by unavailability of swapspace.
-
-And on large highmem machines this lack of reclaimable lowmem memory
-can be fatal.
-
-So the Linux page allocator has a mechanism which prevents allocations
-which _could_ use highmem from using too much lowmem.  This means that
-a certain amount of lowmem is defended from the possibility of being
-captured into pinned user memory.
-
-(The same argument applies to the old 16 megabyte ISA DMA region.  This
-mechanism will also defend that region from allocations which could use
-highmem or lowmem).
-
-The `lowmem_reserve_ratio' tunable determines how aggressive the kernel is
-in defending these lower zones.
-
-If you have a machine which uses highmem or ISA DMA and your
-applications are using mlock(), or if you are running with no swap then
-you probably should change the lowmem_reserve_ratio setting.
-
-The lowmem_reserve_ratio is an array. You can see them by reading this file.
--
-% cat /proc/sys/vm/lowmem_reserve_ratio
-256     256     32
--
-
-But, these values are not used directly. The kernel calculates # of protection
-pages for each zones from them. These are shown as array of protection pages
-in /proc/zoneinfo like followings. (This is an example of x86-64 box).
-Each zone has an array of protection pages like this.
-
--
-Node 0, zone      DMA
-  pages free     1355
-        min      3
-        low      3
-        high     4
-	:
-	:
-    numa_other   0
-        protection: (0, 2004, 2004, 2004)
-	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  pagesets
-    cpu: 0 pcp: 0
-        :
--
-These protections are added to score to judge whether this zone should be used
-for page allocation or should be reclaimed.
-
-In this example, if normal pages (index=2) are required to this DMA zone and
-watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should
-not be used because pages_free(1355) is smaller than watermark + protection[2]
-(4 + 2004 = 2008). If this protection value is 0, this zone would be used for
-normal page requirement. If requirement is DMA zone(index=0), protection[0]
-(=0) is used.
-
-zone[i]'s protection[j] is calculated by following expression.
-
-(i < j):
-  zone[i]->protection[j]
-  = (total sums of managed_pages from zone[i+1] to zone[j] on the node)
-    / lowmem_reserve_ratio[i];
-(i = j):
-   (should not be protected. = 0;
-(i > j):
-   (not necessary, but looks 0)
-
-The default values of lowmem_reserve_ratio[i] are
-    256 (if zone[i] means DMA or DMA32 zone)
-    32  (others).
-As above expression, they are reciprocal number of ratio.
-256 means 1/256. # of protection pages becomes about "0.39%" of total managed
-pages of higher zones on the node.
-
-If you would like to protect more pages, smaller values are effective.
-The minimum value is 1 (1/1 -> 100%). The value less than 1 completely
-disables protection of the pages.
-
-==============================================================
-
-max_map_count:
-
-This file contains the maximum number of memory map areas a process
-may have. Memory map areas are used as a side-effect of calling
-malloc, directly by mmap, mprotect, and madvise, and also when loading
-shared libraries.
-
-While most applications need less than a thousand maps, certain
-programs, particularly malloc debuggers, may consume lots of them,
-e.g., up to one or two maps per allocation.
-
-The default value is 65536.
-
-=============================================================
-
-memory_failure_early_kill:
-
-Control how to kill processes when uncorrected memory error (typically
-a 2bit error in a memory module) is detected in the background by hardware
-that cannot be handled by the kernel. In some cases (like the page
-still having a valid copy on disk) the kernel will handle the failure
-transparently without affecting any applications. But if there is
-no other uptodate copy of the data it will kill to prevent any data
-corruptions from propagating.
-
-1: Kill all processes that have the corrupted and not reloadable page mapped
-as soon as the corruption is detected.  Note this is not supported
-for a few types of pages, like kernel internally allocated data or
-the swap cache, but works for the majority of user pages.
-
-0: Only unmap the corrupted page from all processes and only kill a process
-who tries to access it.
-
-The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
-handle this if they want to.
-
-This is only active on architectures/platforms with advanced machine
-check handling and depends on the hardware capabilities.
-
-Applications can override this setting individually with the PR_MCE_KILL prctl
-
-==============================================================
-
-memory_failure_recovery
-
-Enable memory failure recovery (when supported by the platform)
-
-1: Attempt recovery.
-
-0: Always panic on a memory failure.
-
-==============================================================
-
-min_free_kbytes:
-
-This is used to force the Linux VM to keep a minimum number
-of kilobytes free.  The VM uses this number to compute a
-watermark[WMARK_MIN] value for each lowmem zone in the system.
-Each lowmem zone gets a number of reserved free pages based
-proportionally on its size.
-
-Some minimal amount of memory is needed to satisfy PF_MEMALLOC
-allocations; if you set this to lower than 1024KB, your system will
-become subtly broken, and prone to deadlock under high loads.
-
-Setting this too high will OOM your machine instantly.
-
-=============================================================
-
-min_slab_ratio:
-
-This is available only on NUMA kernels.
-
-A percentage of the total pages in each zone.  On Zone reclaim
-(fallback from the local zone occurs) slabs will be reclaimed if more
-than this percentage of pages in a zone are reclaimable slab pages.
-This insures that the slab growth stays under control even in NUMA
-systems that rarely perform global reclaim.
-
-The default is 5 percent.
-
-Note that slab reclaim is triggered in a per zone / node fashion.
-The process of reclaiming slab memory is currently not node specific
-and may not be fast.
-
-=============================================================
-
-min_unmapped_ratio:
-
-This is available only on NUMA kernels.
-
-This is a percentage of the total pages in each zone. Zone reclaim will
-only occur if more than this percentage of pages are in a state that
-zone_reclaim_mode allows to be reclaimed.
-
-If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared
-against all file-backed unmapped pages including swapcache pages and tmpfs
-files. Otherwise, only unmapped pages backed by normal files but not tmpfs
-files and similar are considered.
-
-The default is 1 percent.
-
-==============================================================
-
-mmap_min_addr
-
-This file indicates the amount of address space  which a user process will
-be restricted from mmapping.  Since kernel null dereference bugs could
-accidentally operate based on the information in the first couple of pages
-of memory userspace processes should not be allowed to write to them.  By
-default this value is set to 0 and no protections will be enforced by the
-security module.  Setting this value to something like 64k will allow the
-vast majority of applications to work correctly and provide defense in depth
-against future potential kernel bugs.
-
-==============================================================
-
-mmap_rnd_bits:
-
-This value can be used to select the number of bits to use to
-determine the random offset to the base address of vma regions
-resulting from mmap allocations on architectures which support
-tuning address space randomization.  This value will be bounded
-by the architecture's minimum and maximum supported values.
-
-This value can be changed after boot using the
-/proc/sys/vm/mmap_rnd_bits tunable
-
-==============================================================
-
-mmap_rnd_compat_bits:
-
-This value can be used to select the number of bits to use to
-determine the random offset to the base address of vma regions
-resulting from mmap allocations for applications run in
-compatibility mode on architectures which support tuning address
-space randomization.  This value will be bounded by the
-architecture's minimum and maximum supported values.
-
-This value can be changed after boot using the
-/proc/sys/vm/mmap_rnd_compat_bits tunable
-
-==============================================================
-
-nr_hugepages
-
-Change the minimum size of the hugepage pool.
-
-See Documentation/admin-guide/mm/hugetlbpage.rst
-
-==============================================================
-
-nr_hugepages_mempolicy
-
-Change the size of the hugepage pool at run-time on a specific
-set of NUMA nodes.
-
-See Documentation/admin-guide/mm/hugetlbpage.rst
-
-==============================================================
-
-nr_overcommit_hugepages
-
-Change the maximum size of the hugepage pool. The maximum is
-nr_hugepages + nr_overcommit_hugepages.
-
-See Documentation/admin-guide/mm/hugetlbpage.rst
-
-==============================================================
-
-nr_trim_pages
-
-This is available only on NOMMU kernels.
-
-This value adjusts the excess page trimming behaviour of power-of-2 aligned
-NOMMU mmap allocations.
-
-A value of 0 disables trimming of allocations entirely, while a value of 1
-trims excess pages aggressively. Any value >= 1 acts as the watermark where
-trimming of allocations is initiated.
-
-The default value is 1.
-
-See Documentation/nommu-mmap.txt for more information.
-
-==============================================================
-
-numa_zonelist_order
-
-This sysctl is only for NUMA and it is deprecated. Anything but
-Node order will fail!
-
-'where the memory is allocated from' is controlled by zonelists.
-(This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation.
- you may be able to read ZONE_DMA as ZONE_DMA32...)
-
-In non-NUMA case, a zonelist for GFP_KERNEL is ordered as following.
-ZONE_NORMAL -> ZONE_DMA
-This means that a memory allocation request for GFP_KERNEL will
-get memory from ZONE_DMA only when ZONE_NORMAL is not available.
-
-In NUMA case, you can think of following 2 types of order.
-Assume 2 node NUMA and below is zonelist of Node(0)'s GFP_KERNEL
-
-(A) Node(0) ZONE_NORMAL -> Node(0) ZONE_DMA -> Node(1) ZONE_NORMAL
-(B) Node(0) ZONE_NORMAL -> Node(1) ZONE_NORMAL -> Node(0) ZONE_DMA.
-
-Type(A) offers the best locality for processes on Node(0), but ZONE_DMA
-will be used before ZONE_NORMAL exhaustion. This increases possibility of
-out-of-memory(OOM) of ZONE_DMA because ZONE_DMA is tend to be small.
-
-Type(B) cannot offer the best locality but is more robust against OOM of
-the DMA zone.
-
-Type(A) is called as "Node" order. Type (B) is "Zone" order.
-
-"Node order" orders the zonelists by node, then by zone within each node.
-Specify "[Nn]ode" for node order
-
-"Zone Order" orders the zonelists by zone type, then by node within each
-zone.  Specify "[Zz]one" for zone order.
-
-Specify "[Dd]efault" to request automatic configuration.
-
-On 32-bit, the Normal zone needs to be preserved for allocations accessible
-by the kernel, so "zone" order will be selected.
-
-On 64-bit, devices that require DMA32/DMA are relatively rare, so "node"
-order will be selected.
-
-Default order is recommended unless this is causing problems for your
-system/application.
-
-==============================================================
-
-oom_dump_tasks
-
-Enables a system-wide task dump (excluding kernel threads) to be produced
-when the kernel performs an OOM-killing and includes such information as
-pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj
-score, and name.  This is helpful to determine why the OOM killer was
-invoked, to identify the rogue task that caused it, and to determine why
-the OOM killer chose the task it did to kill.
-
-If this is set to zero, this information is suppressed.  On very
-large systems with thousands of tasks it may not be feasible to dump
-the memory state information for each one.  Such systems should not
-be forced to incur a performance penalty in OOM conditions when the
-information may not be desired.
-
-If this is set to non-zero, this information is shown whenever the
-OOM killer actually kills a memory-hogging task.
-
-The default value is 1 (enabled).
-
-==============================================================
-
-oom_kill_allocating_task
-
-This enables or disables killing the OOM-triggering task in
-out-of-memory situations.
-
-If this is set to zero, the OOM killer will scan through the entire
-tasklist and select a task based on heuristics to kill.  This normally
-selects a rogue memory-hogging task that frees up a large amount of
-memory when killed.
-
-If this is set to non-zero, the OOM killer simply kills the task that
-triggered the out-of-memory condition.  This avoids the expensive
-tasklist scan.
-
-If panic_on_oom is selected, it takes precedence over whatever value
-is used in oom_kill_allocating_task.
-
-The default value is 0.
-
-==============================================================
-
-overcommit_kbytes:
-
-When overcommit_memory is set to 2, the committed address space is not
-permitted to exceed swap plus this amount of physical RAM. See below.
-
-Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one
-of them may be specified at a time. Setting one disables the other (which
-then appears as 0 when read).
-
-==============================================================
-
-overcommit_memory:
-
-This value contains a flag that enables memory overcommitment.
-
-When this flag is 0, the kernel attempts to estimate the amount
-of free memory left when userspace requests more memory.
-
-When this flag is 1, the kernel pretends there is always enough
-memory until it actually runs out.
-
-When this flag is 2, the kernel uses a "never overcommit"
-policy that attempts to prevent any overcommit of memory.
-Note that user_reserve_kbytes affects this policy.
-
-This feature can be very useful because there are a lot of
-programs that malloc() huge amounts of memory "just-in-case"
-and don't use much of it.
-
-The default value is 0.
-
-See Documentation/vm/overcommit-accounting.rst and
-mm/util.c::__vm_enough_memory() for more information.
-
-==============================================================
-
-overcommit_ratio:
-
-When overcommit_memory is set to 2, the committed address
-space is not permitted to exceed swap plus this percentage
-of physical RAM.  See above.
-
-==============================================================
-
-page-cluster
-
-page-cluster controls the number of pages up to which consecutive pages
-are read in from swap in a single attempt. This is the swap counterpart
-to page cache readahead.
-The mentioned consecutivity is not in terms of virtual/physical addresses,
-but consecutive on swap space - that means they were swapped out together.
-
-It is a logarithmic value - setting it to zero means "1 page", setting
-it to 1 means "2 pages", setting it to 2 means "4 pages", etc.
-Zero disables swap readahead completely.
-
-The default value is three (eight pages at a time).  There may be some
-small benefits in tuning this to a different value if your workload is
-swap-intensive.
-
-Lower values mean lower latencies for initial faults, but at the same time
-extra faults and I/O delays for following faults if they would have been part of
-that consecutive pages readahead would have brought in.
-
-=============================================================
-
-panic_on_oom
-
-This enables or disables panic on out-of-memory feature.
-
-If this is set to 0, the kernel will kill some rogue process,
-called oom_killer.  Usually, oom_killer can kill rogue processes and
-system will survive.
-
-If this is set to 1, the kernel panics when out-of-memory happens.
-However, if a process limits using nodes by mempolicy/cpusets,
-and those nodes become memory exhaustion status, one process
-may be killed by oom-killer. No panic occurs in this case.
-Because other nodes' memory may be free. This means system total status
-may be not fatal yet.
-
-If this is set to 2, the kernel panics compulsorily even on the
-above-mentioned. Even oom happens under memory cgroup, the whole
-system panics.
-
-The default value is 0.
-1 and 2 are for failover of clustering. Please select either
-according to your policy of failover.
-panic_on_oom=2+kdump gives you very strong tool to investigate
-why oom happens. You can get snapshot.
-
-=============================================================
-
-percpu_pagelist_fraction
-
-This is the fraction of pages at most (high mark pcp->high) in each zone that
-are allocated for each per cpu page list.  The min value for this is 8.  It
-means that we don't allow more than 1/8th of pages in each zone to be
-allocated in any single per_cpu_pagelist.  This entry only changes the value
-of hot per cpu pagelists.  User can specify a number like 100 to allocate
-1/100th of each zone to each per cpu page list.
-
-The batch value of each per cpu pagelist is also updated as a result.  It is
-set to pcp->high/4.  The upper limit of batch is (PAGE_SHIFT * 8)
-
-The initial value is zero.  Kernel does not use this value at boot time to set
-the high water marks for each per cpu page list.  If the user writes '0' to this
-sysctl, it will revert to this default behavior.
-
-==============================================================
-
-stat_interval
-
-The time interval between which vm statistics are updated.  The default
-is 1 second.
-
-==============================================================
-
-stat_refresh
-
-Any read or write (by root only) flushes all the per-cpu vm statistics
-into their global totals, for more accurate reports when testing
-e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
-
-As a side-effect, it also checks for negative totals (elsewhere reported
-as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
-(At time of writing, a few stats are known sometimes to be found negative,
-with no ill effects: errors and warnings on these stats are suppressed.)
-
-==============================================================
-
-numa_stat
-
-This interface allows runtime configuration of numa statistics.
-
-When page allocation performance becomes a bottleneck and you can tolerate
-some possible tool breakage and decreased numa counter precision, you can
-do:
-	echo 0 > /proc/sys/vm/numa_stat
-
-When page allocation performance is not a bottleneck and you want all
-tooling to work, you can do:
-	echo 1 > /proc/sys/vm/numa_stat
-
-==============================================================
-
-swappiness
-
-This control is used to define how aggressive the kernel will swap
-memory pages.  Higher values will increase aggressiveness, lower values
-decrease the amount of swap.  A value of 0 instructs the kernel not to
-initiate swap until the amount of free and file-backed pages is less
-than the high water mark in a zone.
-
-The default value is 60.
-
-==============================================================
-
-unprivileged_userfaultfd
-
-This flag controls whether unprivileged users can use the userfaultfd
-system calls.  Set this to 1 to allow unprivileged users to use the
-userfaultfd system calls, or set this to 0 to restrict userfaultfd to only
-privileged users (with SYS_CAP_PTRACE capability).
-
-The default value is 1.
-
-==============================================================
-
-- user_reserve_kbytes
-
-When overcommit_memory is set to 2, "never overcommit" mode, reserve
-min(3% of current process size, user_reserve_kbytes) of free memory.
-This is intended to prevent a user from starting a single memory hogging
-process, such that they cannot recover (kill the hog).
-
-user_reserve_kbytes defaults to min(3% of the current process size, 128MB).
-
-If this is reduced to zero, then the user will be allowed to allocate
-all free memory with a single process, minus admin_reserve_kbytes.
-Any subsequent attempts to execute a command will result in
-"fork: Cannot allocate memory".
-
-Changing this takes effect whenever an application requests memory.
-
-==============================================================
-
-vfs_cache_pressure
-------------------
-
-This percentage value controls the tendency of the kernel to reclaim
-the memory which is used for caching of directory and inode objects.
-
-At the default value of vfs_cache_pressure=100 the kernel will attempt to
-reclaim dentries and inodes at a "fair" rate with respect to pagecache and
-swapcache reclaim.  Decreasing vfs_cache_pressure causes the kernel to prefer
-to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will
-never reclaim dentries and inodes due to memory pressure and this can easily
-lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
-causes the kernel to prefer to reclaim dentries and inodes.
-
-Increasing vfs_cache_pressure significantly beyond 100 may have negative
-performance impact. Reclaim code needs to take various locks to find freeable
-directory and inode objects. With vfs_cache_pressure=1000, it will look for
-ten times more freeable objects than there are.
-
-=============================================================
-
-watermark_boost_factor:
-
-This factor controls the level of reclaim when memory is being fragmented.
-It defines the percentage of the high watermark of a zone that will be
-reclaimed if pages of different mobility are being mixed within pageblocks.
-The intent is that compaction has less work to do in the future and to
-increase the success rate of future high-order allocations such as SLUB
-allocations, THP and hugetlbfs pages.
-
-To make it sensible with respect to the watermark_scale_factor
-parameter, the unit is in fractions of 10,000. The default value of
-15,000 on !DISCONTIGMEM configurations means that up to 150% of the high
-watermark will be reclaimed in the event of a pageblock being mixed due
-to fragmentation. The level of reclaim is determined by the number of
-fragmentation events that occurred in the recent past. If this value is
-smaller than a pageblock then a pageblocks worth of pages will be reclaimed
-(e.g.  2MB on 64-bit x86). A boost factor of 0 will disable the feature.
-
-=============================================================
-
-watermark_scale_factor:
-
-This factor controls the aggressiveness of kswapd. It defines the
-amount of memory left in a node/system before kswapd is woken up and
-how much memory needs to be free before kswapd goes back to sleep.
-
-The unit is in fractions of 10,000. The default value of 10 means the
-distances between watermarks are 0.1% of the available memory in the
-node/system. The maximum value is 1000, or 10% of memory.
-
-A high rate of threads entering direct reclaim (allocstall) or kswapd
-going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
-that the number of free pages kswapd maintains for latency reasons is
-too small for the allocation bursts occurring in the system. This knob
-can then be used to tune kswapd aggressiveness accordingly.
-
-==============================================================
-
-zone_reclaim_mode:
-
-Zone_reclaim_mode allows someone to set more or less aggressive approaches to
-reclaim memory when a zone runs out of memory. If it is set to zero then no
-zone reclaim occurs. Allocations will be satisfied from other zones / nodes
-in the system.
-
-This is value ORed together of
-
-1	= Zone reclaim on
-2	= Zone reclaim writes dirty pages out
-4	= Zone reclaim swaps pages
-
-zone_reclaim_mode is disabled by default.  For file servers or workloads
-that benefit from having their data cached, zone_reclaim_mode should be
-left disabled as the caching effect is likely to be more important than
-data locality.
-
-zone_reclaim may be enabled if it's known that the workload is partitioned
-such that each partition fits within a NUMA node and that accessing remote
-memory would cause a measurable performance reduction.  The page allocator
-will then reclaim easily reusable pages (those page cache pages that are
-currently not used) before allocating off node pages.
-
-Allowing zone reclaim to write out pages stops processes that are
-writing large amounts of data from dirtying pages on other nodes. Zone
-reclaim will write out dirty pages if a zone fills up and so effectively
-throttle the process. This may decrease the performance of a single process
-since it cannot use all of system memory to buffer the outgoing writes
-anymore but it preserve the memory on other nodes so that the performance
-of other processes running on other nodes will not be affected.
-
-Allowing regular swap effectively restricts allocations to the local
-node unless explicitly overridden by memory policies or cpuset
-configurations.
-
-============ End of Document =================================
diff --git a/Documentation/target/index.rst b/Documentation/target/index.rst
new file mode 100644
index 000000000000..4b24f81f747e
--- /dev/null
+++ b/Documentation/target/index.rst
@@ -0,0 +1,19 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==================
+TCM Virtual Device
+==================
+
+.. toctree::
+    :maxdepth: 1
+
+    tcmu-design
+    tcm_mod_builder
+    scripts
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/target/scripts.rst b/Documentation/target/scripts.rst
new file mode 100644
index 000000000000..172d42b522e4
--- /dev/null
+++ b/Documentation/target/scripts.rst
@@ -0,0 +1,11 @@
+TCM mod builder script
+----------------------
+
+.. literalinclude:: tcm_mod_builder.py
+    :language: perl
+
+Target export device script
+---------------------------
+
+.. literalinclude:: target-export-device
+    :language: shell
diff --git a/Documentation/target/tcm_mod_builder.rst b/Documentation/target/tcm_mod_builder.rst
new file mode 100644
index 000000000000..9bfc9822e2bd
--- /dev/null
+++ b/Documentation/target/tcm_mod_builder.rst
@@ -0,0 +1,149 @@
+=========================================
+The TCM v4 fabric module script generator
+=========================================
+
+Greetings all,
+
+This document is intended to be a mini-HOWTO for using the tcm_mod_builder.py
+script to generate a brand new functional TCM v4 fabric .ko module of your very own,
+that once built can be immediately be loaded to start access the new TCM/ConfigFS
+fabric skeleton, by simply using::
+
+	modprobe $TCM_NEW_MOD
+	mkdir -p /sys/kernel/config/target/$TCM_NEW_MOD
+
+This script will create a new drivers/target/$TCM_NEW_MOD/, and will do the following
+
+	1) Generate new API callers for drivers/target/target_core_fabric_configs.c logic
+	   ->make_tpg(), ->drop_tpg(), ->make_wwn(), ->drop_wwn().  These are created
+	   into $TCM_NEW_MOD/$TCM_NEW_MOD_configfs.c
+	2) Generate basic infrastructure for loading/unloading LKMs and TCM/ConfigFS fabric module
+	   using a skeleton struct target_core_fabric_ops API template.
+	3) Based on user defined T10 Proto_Ident for the new fabric module being built,
+	   the TransportID / Initiator and Target WWPN related handlers for
+	   SPC-3 persistent reservation are automatically generated in $TCM_NEW_MOD/$TCM_NEW_MOD_fabric.c
+	   using drivers/target/target_core_fabric_lib.c logic.
+	4) NOP API calls for all other Data I/O path and fabric dependent attribute logic
+	   in $TCM_NEW_MOD/$TCM_NEW_MOD_fabric.c
+
+tcm_mod_builder.py depends upon the mandatory '-p $PROTO_IDENT' and '-m
+$FABRIC_MOD_name' parameters, and actually running the script looks like::
+
+  target:/mnt/sdb/lio-core-2.6.git/Documentation/target# python tcm_mod_builder.py -p iSCSI -m tcm_nab5000
+  tcm_dir: /mnt/sdb/lio-core-2.6.git/Documentation/target/../../
+  Set fabric_mod_name: tcm_nab5000
+  Set fabric_mod_dir:
+  /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000
+  Using proto_ident: iSCSI
+  Creating fabric_mod_dir:
+  /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000
+  Writing file:
+  /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_base.h
+  Using tcm_mod_scan_fabric_ops:
+  /mnt/sdb/lio-core-2.6.git/Documentation/target/../../include/target/target_core_fabric_ops.h
+  Writing file:
+  /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_fabric.c
+  Writing file:
+  /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_fabric.h
+  Writing file:
+  /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_configfs.c
+  Writing file:
+  /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/Kbuild
+  Writing file:
+  /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/Kconfig
+  Would you like to add tcm_nab5000to drivers/target/Kbuild..? [yes,no]: yes
+  Would you like to add tcm_nab5000to drivers/target/Kconfig..? [yes,no]: yes
+
+At the end of tcm_mod_builder.py. the script will ask to add the following
+line to drivers/target/Kbuild::
+
+	obj-$(CONFIG_TCM_NAB5000)       += tcm_nab5000/
+
+and the same for drivers/target/Kconfig::
+
+	source "drivers/target/tcm_nab5000/Kconfig"
+
+#) Run 'make menuconfig' and select the new CONFIG_TCM_NAB5000 item::
+
+	<M>   TCM_NAB5000 fabric module
+
+#) Build using 'make modules', once completed you will have::
+
+    target:/mnt/sdb/lio-core-2.6.git# ls -la drivers/target/tcm_nab5000/
+    total 1348
+    drwxr-xr-x 2 root root   4096 2010-10-05 03:23 .
+    drwxr-xr-x 9 root root   4096 2010-10-05 03:22 ..
+    -rw-r--r-- 1 root root    282 2010-10-05 03:22 Kbuild
+    -rw-r--r-- 1 root root    171 2010-10-05 03:22 Kconfig
+    -rw-r--r-- 1 root root     49 2010-10-05 03:23 modules.order
+    -rw-r--r-- 1 root root    738 2010-10-05 03:22 tcm_nab5000_base.h
+    -rw-r--r-- 1 root root   9096 2010-10-05 03:22 tcm_nab5000_configfs.c
+    -rw-r--r-- 1 root root 191200 2010-10-05 03:23 tcm_nab5000_configfs.o
+    -rw-r--r-- 1 root root  40504 2010-10-05 03:23 .tcm_nab5000_configfs.o.cmd
+    -rw-r--r-- 1 root root   5414 2010-10-05 03:22 tcm_nab5000_fabric.c
+    -rw-r--r-- 1 root root   2016 2010-10-05 03:22 tcm_nab5000_fabric.h
+    -rw-r--r-- 1 root root 190932 2010-10-05 03:23 tcm_nab5000_fabric.o
+    -rw-r--r-- 1 root root  40713 2010-10-05 03:23 .tcm_nab5000_fabric.o.cmd
+    -rw-r--r-- 1 root root 401861 2010-10-05 03:23 tcm_nab5000.ko
+    -rw-r--r-- 1 root root    265 2010-10-05 03:23 .tcm_nab5000.ko.cmd
+    -rw-r--r-- 1 root root    459 2010-10-05 03:23 tcm_nab5000.mod.c
+    -rw-r--r-- 1 root root  23896 2010-10-05 03:23 tcm_nab5000.mod.o
+    -rw-r--r-- 1 root root  22655 2010-10-05 03:23 .tcm_nab5000.mod.o.cmd
+    -rw-r--r-- 1 root root 379022 2010-10-05 03:23 tcm_nab5000.o
+    -rw-r--r-- 1 root root    211 2010-10-05 03:23 .tcm_nab5000.o.cmd
+
+#) Load the new module, create a lun_0 configfs group, and add new TCM Core
+   IBLOCK backstore symlink to port::
+
+    target:/mnt/sdb/lio-core-2.6.git# insmod drivers/target/tcm_nab5000.ko
+    target:/mnt/sdb/lio-core-2.6.git# mkdir -p /sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0
+    target:/mnt/sdb/lio-core-2.6.git# cd /sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0/
+    target:/sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0# ln -s /sys/kernel/config/target/core/iblock_0/lvm_test0 nab5000_port
+
+    target:/sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0# cd -
+    target:/mnt/sdb/lio-core-2.6.git# tree /sys/kernel/config/target/nab5000/
+    /sys/kernel/config/target/nab5000/
+    |-- discovery_auth
+    |-- iqn.foo
+    |   `-- tpgt_1
+    |       |-- acls
+    |       |-- attrib
+    |       |-- lun
+    |       |   `-- lun_0
+    |       |       |-- alua_tg_pt_gp
+    |       |       |-- alua_tg_pt_offline
+    |       |       |-- alua_tg_pt_status
+    |       |       |-- alua_tg_pt_write_md
+    |	|	`-- nab5000_port -> ../../../../../../target/core/iblock_0/lvm_test0
+    |       |-- np
+    |       `-- param
+    `-- version
+
+    target:/mnt/sdb/lio-core-2.6.git# lsmod
+    Module                  Size  Used by
+    tcm_nab5000             3935  4
+    iscsi_target_mod      193211  0
+    target_core_stgt        8090  0
+    target_core_pscsi      11122  1
+    target_core_file        9172  2
+    target_core_iblock      9280  1
+    target_core_mod       228575  31
+    tcm_nab5000,iscsi_target_mod,target_core_stgt,target_core_pscsi,target_core_file,target_core_iblock
+    libfc                  73681  0
+    scsi_debug             56265  0
+    scsi_tgt                8666  1 target_core_stgt
+    configfs               20644  2 target_core_mod
+
+----------------------------------------------------------------------
+
+Future TODO items
+=================
+
+	1) Add more T10 proto_idents
+	2) Make tcm_mod_dump_fabric_ops() smarter and generate function pointer
+	   defs directly from include/target/target_core_fabric_ops.h:struct target_core_fabric_ops
+	   structure members.
+
+October 5th, 2010
+
+Nicholas A. Bellinger <nab@linux-iscsi.org>
diff --git a/Documentation/target/tcm_mod_builder.txt b/Documentation/target/tcm_mod_builder.txt
deleted file mode 100644
index ae22f7005540..000000000000
--- a/Documentation/target/tcm_mod_builder.txt
+++ /dev/null
@@ -1,145 +0,0 @@
->>>>>>>>>> The TCM v4 fabric module script generator <<<<<<<<<<
-
-Greetings all,
-
-This document is intended to be a mini-HOWTO for using the tcm_mod_builder.py
-script to generate a brand new functional TCM v4 fabric .ko module of your very own,
-that once built can be immediately be loaded to start access the new TCM/ConfigFS
-fabric skeleton, by simply using:
-
-	modprobe $TCM_NEW_MOD
-	mkdir -p /sys/kernel/config/target/$TCM_NEW_MOD
-
-This script will create a new drivers/target/$TCM_NEW_MOD/, and will do the following
-
-	*) Generate new API callers for drivers/target/target_core_fabric_configs.c logic
-	   ->make_tpg(), ->drop_tpg(), ->make_wwn(), ->drop_wwn().  These are created
-	   into $TCM_NEW_MOD/$TCM_NEW_MOD_configfs.c
-	*) Generate basic infrastructure for loading/unloading LKMs and TCM/ConfigFS fabric module
-	   using a skeleton struct target_core_fabric_ops API template.
-	*) Based on user defined T10 Proto_Ident for the new fabric module being built,
-	   the TransportID / Initiator and Target WWPN related handlers for
-	   SPC-3 persistent reservation are automatically generated in $TCM_NEW_MOD/$TCM_NEW_MOD_fabric.c
-	   using drivers/target/target_core_fabric_lib.c logic.
-	*) NOP API calls for all other Data I/O path and fabric dependent attribute logic
-	   in $TCM_NEW_MOD/$TCM_NEW_MOD_fabric.c
-
-tcm_mod_builder.py depends upon the mandatory '-p $PROTO_IDENT' and '-m
-$FABRIC_MOD_name' parameters, and actually running the script looks like:
-
-target:/mnt/sdb/lio-core-2.6.git/Documentation/target# python tcm_mod_builder.py -p iSCSI -m tcm_nab5000
-tcm_dir: /mnt/sdb/lio-core-2.6.git/Documentation/target/../../
-Set fabric_mod_name: tcm_nab5000
-Set fabric_mod_dir:
-/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000
-Using proto_ident: iSCSI
-Creating fabric_mod_dir:
-/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000
-Writing file:
-/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_base.h
-Using tcm_mod_scan_fabric_ops:
-/mnt/sdb/lio-core-2.6.git/Documentation/target/../../include/target/target_core_fabric_ops.h
-Writing file:
-/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_fabric.c
-Writing file:
-/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_fabric.h
-Writing file:
-/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_configfs.c
-Writing file:
-/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/Kbuild
-Writing file:
-/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/Kconfig
-Would you like to add tcm_nab5000to drivers/target/Kbuild..? [yes,no]: yes
-Would you like to add tcm_nab5000to drivers/target/Kconfig..? [yes,no]: yes
-
-At the end of tcm_mod_builder.py. the script will ask to add the following
-line to drivers/target/Kbuild:
-
-	obj-$(CONFIG_TCM_NAB5000)       += tcm_nab5000/
-
-and the same for drivers/target/Kconfig:
-
-	source "drivers/target/tcm_nab5000/Kconfig"
-
-*) Run 'make menuconfig' and select the new CONFIG_TCM_NAB5000 item:
-
-	<M>   TCM_NAB5000 fabric module
-
-*) Build using 'make modules', once completed you will have:
-
-target:/mnt/sdb/lio-core-2.6.git# ls -la drivers/target/tcm_nab5000/
-total 1348
-drwxr-xr-x 2 root root   4096 2010-10-05 03:23 .
-drwxr-xr-x 9 root root   4096 2010-10-05 03:22 ..
--rw-r--r-- 1 root root    282 2010-10-05 03:22 Kbuild
--rw-r--r-- 1 root root    171 2010-10-05 03:22 Kconfig
--rw-r--r-- 1 root root     49 2010-10-05 03:23 modules.order
--rw-r--r-- 1 root root    738 2010-10-05 03:22 tcm_nab5000_base.h
--rw-r--r-- 1 root root   9096 2010-10-05 03:22 tcm_nab5000_configfs.c
--rw-r--r-- 1 root root 191200 2010-10-05 03:23 tcm_nab5000_configfs.o
--rw-r--r-- 1 root root  40504 2010-10-05 03:23 .tcm_nab5000_configfs.o.cmd
--rw-r--r-- 1 root root   5414 2010-10-05 03:22 tcm_nab5000_fabric.c
--rw-r--r-- 1 root root   2016 2010-10-05 03:22 tcm_nab5000_fabric.h
--rw-r--r-- 1 root root 190932 2010-10-05 03:23 tcm_nab5000_fabric.o
--rw-r--r-- 1 root root  40713 2010-10-05 03:23 .tcm_nab5000_fabric.o.cmd
--rw-r--r-- 1 root root 401861 2010-10-05 03:23 tcm_nab5000.ko
--rw-r--r-- 1 root root    265 2010-10-05 03:23 .tcm_nab5000.ko.cmd
--rw-r--r-- 1 root root    459 2010-10-05 03:23 tcm_nab5000.mod.c
--rw-r--r-- 1 root root  23896 2010-10-05 03:23 tcm_nab5000.mod.o
--rw-r--r-- 1 root root  22655 2010-10-05 03:23 .tcm_nab5000.mod.o.cmd
--rw-r--r-- 1 root root 379022 2010-10-05 03:23 tcm_nab5000.o
--rw-r--r-- 1 root root    211 2010-10-05 03:23 .tcm_nab5000.o.cmd
-
-*) Load the new module, create a lun_0 configfs group, and add new TCM Core
-   IBLOCK backstore symlink to port:
-
-target:/mnt/sdb/lio-core-2.6.git# insmod drivers/target/tcm_nab5000.ko
-target:/mnt/sdb/lio-core-2.6.git# mkdir -p /sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0
-target:/mnt/sdb/lio-core-2.6.git# cd /sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0/
-target:/sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0# ln -s /sys/kernel/config/target/core/iblock_0/lvm_test0 nab5000_port
-
-target:/sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0# cd -
-target:/mnt/sdb/lio-core-2.6.git# tree /sys/kernel/config/target/nab5000/
-/sys/kernel/config/target/nab5000/
-|-- discovery_auth
-|-- iqn.foo
-|   `-- tpgt_1
-|       |-- acls
-|       |-- attrib
-|       |-- lun
-|       |   `-- lun_0
-|       |       |-- alua_tg_pt_gp
-|       |       |-- alua_tg_pt_offline
-|       |       |-- alua_tg_pt_status
-|       |       |-- alua_tg_pt_write_md
-|	|	`-- nab5000_port -> ../../../../../../target/core/iblock_0/lvm_test0
-|       |-- np
-|       `-- param
-`-- version
-
-target:/mnt/sdb/lio-core-2.6.git# lsmod
-Module                  Size  Used by
-tcm_nab5000             3935  4
-iscsi_target_mod      193211  0
-target_core_stgt        8090  0
-target_core_pscsi      11122  1
-target_core_file        9172  2
-target_core_iblock      9280  1
-target_core_mod       228575  31
-tcm_nab5000,iscsi_target_mod,target_core_stgt,target_core_pscsi,target_core_file,target_core_iblock
-libfc                  73681  0
-scsi_debug             56265  0
-scsi_tgt                8666  1 target_core_stgt
-configfs               20644  2 target_core_mod
-
-----------------------------------------------------------------------
-
-Future TODO items:
-
-	*) Add more T10 proto_idents
-	*) Make tcm_mod_dump_fabric_ops() smarter and generate function pointer
-	   defs directly from include/target/target_core_fabric_ops.h:struct target_core_fabric_ops
-	   structure members.
-
-October 5th, 2010
-Nicholas A. Bellinger <nab@linux-iscsi.org>
diff --git a/Documentation/target/tcmu-design.rst b/Documentation/target/tcmu-design.rst
new file mode 100644
index 000000000000..a7b426707bf6
--- /dev/null
+++ b/Documentation/target/tcmu-design.rst
@@ -0,0 +1,405 @@
+====================
+TCM Userspace Design
+====================
+
+
+.. Contents:
+
+   1) TCM Userspace Design
+     a) Background
+     b) Benefits
+     c) Design constraints
+     d) Implementation overview
+        i. Mailbox
+        ii. Command ring
+        iii. Data Area
+     e) Device discovery
+     f) Device events
+     g) Other contingencies
+   2) Writing a user pass-through handler
+     a) Discovering and configuring TCMU uio devices
+     b) Waiting for events on the device(s)
+     c) Managing the command ring
+   3) A final note
+
+
+TCM Userspace Design
+====================
+
+TCM is another name for LIO, an in-kernel iSCSI target (server).
+Existing TCM targets run in the kernel.  TCMU (TCM in Userspace)
+allows userspace programs to be written which act as iSCSI targets.
+This document describes the design.
+
+The existing kernel provides modules for different SCSI transport
+protocols.  TCM also modularizes the data storage.  There are existing
+modules for file, block device, RAM or using another SCSI device as
+storage.  These are called "backstores" or "storage engines".  These
+built-in modules are implemented entirely as kernel code.
+
+Background
+----------
+
+In addition to modularizing the transport protocol used for carrying
+SCSI commands ("fabrics"), the Linux kernel target, LIO, also modularizes
+the actual data storage as well. These are referred to as "backstores"
+or "storage engines". The target comes with backstores that allow a
+file, a block device, RAM, or another SCSI device to be used for the
+local storage needed for the exported SCSI LUN. Like the rest of LIO,
+these are implemented entirely as kernel code.
+
+These backstores cover the most common use cases, but not all. One new
+use case that other non-kernel target solutions, such as tgt, are able
+to support is using Gluster's GLFS or Ceph's RBD as a backstore. The
+target then serves as a translator, allowing initiators to store data
+in these non-traditional networked storage systems, while still only
+using standard protocols themselves.
+
+If the target is a userspace process, supporting these is easy. tgt,
+for example, needs only a small adapter module for each, because the
+modules just use the available userspace libraries for RBD and GLFS.
+
+Adding support for these backstores in LIO is considerably more
+difficult, because LIO is entirely kernel code. Instead of undertaking
+the significant work to port the GLFS or RBD APIs and protocols to the
+kernel, another approach is to create a userspace pass-through
+backstore for LIO, "TCMU".
+
+
+Benefits
+--------
+
+In addition to allowing relatively easy support for RBD and GLFS, TCMU
+will also allow easier development of new backstores. TCMU combines
+with the LIO loopback fabric to become something similar to FUSE
+(Filesystem in Userspace), but at the SCSI layer instead of the
+filesystem layer. A SUSE, if you will.
+
+The disadvantage is there are more distinct components to configure, and
+potentially to malfunction. This is unavoidable, but hopefully not
+fatal if we're careful to keep things as simple as possible.
+
+Design constraints
+------------------
+
+- Good performance: high throughput, low latency
+- Cleanly handle if userspace:
+
+   1) never attaches
+   2) hangs
+   3) dies
+   4) misbehaves
+
+- Allow future flexibility in user & kernel implementations
+- Be reasonably memory-efficient
+- Simple to configure & run
+- Simple to write a userspace backend
+
+
+Implementation overview
+-----------------------
+
+The core of the TCMU interface is a memory region that is shared
+between kernel and userspace. Within this region is: a control area
+(mailbox); a lockless producer/consumer circular buffer for commands
+to be passed up, and status returned; and an in/out data buffer area.
+
+TCMU uses the pre-existing UIO subsystem. UIO allows device driver
+development in userspace, and this is conceptually very close to the
+TCMU use case, except instead of a physical device, TCMU implements a
+memory-mapped layout designed for SCSI commands. Using UIO also
+benefits TCMU by handling device introspection (e.g. a way for
+userspace to determine how large the shared region is) and signaling
+mechanisms in both directions.
+
+There are no embedded pointers in the memory region. Everything is
+expressed as an offset from the region's starting address. This allows
+the ring to still work if the user process dies and is restarted with
+the region mapped at a different virtual address.
+
+See target_core_user.h for the struct definitions.
+
+The Mailbox
+-----------
+
+The mailbox is always at the start of the shared memory region, and
+contains a version, details about the starting offset and size of the
+command ring, and head and tail pointers to be used by the kernel and
+userspace (respectively) to put commands on the ring, and indicate
+when the commands are completed.
+
+version - 1 (userspace should abort if otherwise)
+
+flags:
+    - TCMU_MAILBOX_FLAG_CAP_OOOC:
+	indicates out-of-order completion is supported.
+	See "The Command Ring" for details.
+
+cmdr_off
+	The offset of the start of the command ring from the start
+	of the memory region, to account for the mailbox size.
+cmdr_size
+	The size of the command ring. This does *not* need to be a
+	power of two.
+cmd_head
+	Modified by the kernel to indicate when a command has been
+	placed on the ring.
+cmd_tail
+	Modified by userspace to indicate when it has completed
+	processing of a command.
+
+The Command Ring
+----------------
+
+Commands are placed on the ring by the kernel incrementing
+mailbox.cmd_head by the size of the command, modulo cmdr_size, and
+then signaling userspace via uio_event_notify(). Once the command is
+completed, userspace updates mailbox.cmd_tail in the same way and
+signals the kernel via a 4-byte write(). When cmd_head equals
+cmd_tail, the ring is empty -- no commands are currently waiting to be
+processed by userspace.
+
+TCMU commands are 8-byte aligned. They start with a common header
+containing "len_op", a 32-bit value that stores the length, as well as
+the opcode in the lowest unused bits. It also contains cmd_id and
+flags fields for setting by the kernel (kflags) and userspace
+(uflags).
+
+Currently only two opcodes are defined, TCMU_OP_CMD and TCMU_OP_PAD.
+
+When the opcode is CMD, the entry in the command ring is a struct
+tcmu_cmd_entry. Userspace finds the SCSI CDB (Command Data Block) via
+tcmu_cmd_entry.req.cdb_off. This is an offset from the start of the
+overall shared memory region, not the entry. The data in/out buffers
+are accessible via tht req.iov[] array. iov_cnt contains the number of
+entries in iov[] needed to describe either the Data-In or Data-Out
+buffers. For bidirectional commands, iov_cnt specifies how many iovec
+entries cover the Data-Out area, and iov_bidi_cnt specifies how many
+iovec entries immediately after that in iov[] cover the Data-In
+area. Just like other fields, iov.iov_base is an offset from the start
+of the region.
+
+When completing a command, userspace sets rsp.scsi_status, and
+rsp.sense_buffer if necessary. Userspace then increments
+mailbox.cmd_tail by entry.hdr.length (mod cmdr_size) and signals the
+kernel via the UIO method, a 4-byte write to the file descriptor.
+
+If TCMU_MAILBOX_FLAG_CAP_OOOC is set for mailbox->flags, kernel is
+capable of handling out-of-order completions. In this case, userspace can
+handle command in different order other than original. Since kernel would
+still process the commands in the same order it appeared in the command
+ring, userspace need to update the cmd->id when completing the
+command(a.k.a steal the original command's entry).
+
+When the opcode is PAD, userspace only updates cmd_tail as above --
+it's a no-op. (The kernel inserts PAD entries to ensure each CMD entry
+is contiguous within the command ring.)
+
+More opcodes may be added in the future. If userspace encounters an
+opcode it does not handle, it must set UNKNOWN_OP bit (bit 0) in
+hdr.uflags, update cmd_tail, and proceed with processing additional
+commands, if any.
+
+The Data Area
+-------------
+
+This is shared-memory space after the command ring. The organization
+of this area is not defined in the TCMU interface, and userspace
+should access only the parts referenced by pending iovs.
+
+
+Device Discovery
+----------------
+
+Other devices may be using UIO besides TCMU. Unrelated user processes
+may also be handling different sets of TCMU devices. TCMU userspace
+processes must find their devices by scanning sysfs
+class/uio/uio*/name. For TCMU devices, these names will be of the
+format::
+
+	tcm-user/<hba_num>/<device_name>/<subtype>/<path>
+
+where "tcm-user" is common for all TCMU-backed UIO devices. <hba_num>
+and <device_name> allow userspace to find the device's path in the
+kernel target's configfs tree. Assuming the usual mount point, it is
+found at::
+
+	/sys/kernel/config/target/core/user_<hba_num>/<device_name>
+
+This location contains attributes such as "hw_block_size", that
+userspace needs to know for correct operation.
+
+<subtype> will be a userspace-process-unique string to identify the
+TCMU device as expecting to be backed by a certain handler, and <path>
+will be an additional handler-specific string for the user process to
+configure the device, if needed. The name cannot contain ':', due to
+LIO limitations.
+
+For all devices so discovered, the user handler opens /dev/uioX and
+calls mmap()::
+
+	mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0)
+
+where size must be equal to the value read from
+/sys/class/uio/uioX/maps/map0/size.
+
+
+Device Events
+-------------
+
+If a new device is added or removed, a notification will be broadcast
+over netlink, using a generic netlink family name of "TCM-USER" and a
+multicast group named "config". This will include the UIO name as
+described in the previous section, as well as the UIO minor
+number. This should allow userspace to identify both the UIO device and
+the LIO device, so that after determining the device is supported
+(based on subtype) it can take the appropriate action.
+
+
+Other contingencies
+-------------------
+
+Userspace handler process never attaches:
+
+- TCMU will post commands, and then abort them after a timeout period
+  (30 seconds.)
+
+Userspace handler process is killed:
+
+- It is still possible to restart and re-connect to TCMU
+  devices. Command ring is preserved. However, after the timeout period,
+  the kernel will abort pending tasks.
+
+Userspace handler process hangs:
+
+- The kernel will abort pending tasks after a timeout period.
+
+Userspace handler process is malicious:
+
+- The process can trivially break the handling of devices it controls,
+  but should not be able to access kernel memory outside its shared
+  memory areas.
+
+
+Writing a user pass-through handler (with example code)
+=======================================================
+
+A user process handing a TCMU device must support the following:
+
+a) Discovering and configuring TCMU uio devices
+b) Waiting for events on the device(s)
+c) Managing the command ring: Parsing operations and commands,
+   performing work as needed, setting response fields (scsi_status and
+   possibly sense_buffer), updating cmd_tail, and notifying the kernel
+   that work has been finished
+
+First, consider instead writing a plugin for tcmu-runner. tcmu-runner
+implements all of this, and provides a higher-level API for plugin
+authors.
+
+TCMU is designed so that multiple unrelated processes can manage TCMU
+devices separately. All handlers should make sure to only open their
+devices, based opon a known subtype string.
+
+a) Discovering and configuring TCMU UIO devices::
+
+      /* error checking omitted for brevity */
+
+      int fd, dev_fd;
+      char buf[256];
+      unsigned long long map_len;
+      void *map;
+
+      fd = open("/sys/class/uio/uio0/name", O_RDONLY);
+      ret = read(fd, buf, sizeof(buf));
+      close(fd);
+      buf[ret-1] = '\0'; /* null-terminate and chop off the \n */
+
+      /* we only want uio devices whose name is a format we expect */
+      if (strncmp(buf, "tcm-user", 8))
+	exit(-1);
+
+      /* Further checking for subtype also needed here */
+
+      fd = open(/sys/class/uio/%s/maps/map0/size, O_RDONLY);
+      ret = read(fd, buf, sizeof(buf));
+      close(fd);
+      str_buf[ret-1] = '\0'; /* null-terminate and chop off the \n */
+
+      map_len = strtoull(buf, NULL, 0);
+
+      dev_fd = open("/dev/uio0", O_RDWR);
+      map = mmap(NULL, map_len, PROT_READ|PROT_WRITE, MAP_SHARED, dev_fd, 0);
+
+
+      b) Waiting for events on the device(s)
+
+      while (1) {
+        char buf[4];
+
+        int ret = read(dev_fd, buf, 4); /* will block */
+
+        handle_device_events(dev_fd, map);
+      }
+
+
+c) Managing the command ring::
+
+      #include <linux/target_core_user.h>
+
+      int handle_device_events(int fd, void *map)
+      {
+        struct tcmu_mailbox *mb = map;
+        struct tcmu_cmd_entry *ent = (void *) mb + mb->cmdr_off + mb->cmd_tail;
+        int did_some_work = 0;
+
+        /* Process events from cmd ring until we catch up with cmd_head */
+        while (ent != (void *)mb + mb->cmdr_off + mb->cmd_head) {
+
+          if (tcmu_hdr_get_op(ent->hdr.len_op) == TCMU_OP_CMD) {
+            uint8_t *cdb = (void *)mb + ent->req.cdb_off;
+            bool success = true;
+
+            /* Handle command here. */
+            printf("SCSI opcode: 0x%x\n", cdb[0]);
+
+            /* Set response fields */
+            if (success)
+              ent->rsp.scsi_status = SCSI_NO_SENSE;
+            else {
+              /* Also fill in rsp->sense_buffer here */
+              ent->rsp.scsi_status = SCSI_CHECK_CONDITION;
+            }
+          }
+          else if (tcmu_hdr_get_op(ent->hdr.len_op) != TCMU_OP_PAD) {
+            /* Tell the kernel we didn't handle unknown opcodes */
+            ent->hdr.uflags |= TCMU_UFLAG_UNKNOWN_OP;
+          }
+          else {
+            /* Do nothing for PAD entries except update cmd_tail */
+          }
+
+          /* update cmd_tail */
+          mb->cmd_tail = (mb->cmd_tail + tcmu_hdr_get_len(&ent->hdr)) % mb->cmdr_size;
+          ent = (void *) mb + mb->cmdr_off + mb->cmd_tail;
+          did_some_work = 1;
+        }
+
+        /* Notify the kernel that work has been finished */
+        if (did_some_work) {
+          uint32_t buf = 0;
+
+          write(fd, &buf, 4);
+        }
+
+        return 0;
+      }
+
+
+A final note
+============
+
+Please be careful to return codes as defined by the SCSI
+specifications. These are different than some values defined in the
+scsi/scsi.h include file. For example, CHECK CONDITION's status code
+is 2, not 1.
diff --git a/Documentation/target/tcmu-design.txt b/Documentation/target/tcmu-design.txt
deleted file mode 100644
index 4cebc1ebf99a..000000000000
--- a/Documentation/target/tcmu-design.txt
+++ /dev/null
@@ -1,381 +0,0 @@
-Contents:
-
-1) TCM Userspace Design
-  a) Background
-  b) Benefits
-  c) Design constraints
-  d) Implementation overview
-     i. Mailbox
-     ii. Command ring
-     iii. Data Area
-  e) Device discovery
-  f) Device events
-  g) Other contingencies
-2) Writing a user pass-through handler
-  a) Discovering and configuring TCMU uio devices
-  b) Waiting for events on the device(s)
-  c) Managing the command ring
-3) A final note
-
-
-TCM Userspace Design
---------------------
-
-TCM is another name for LIO, an in-kernel iSCSI target (server).
-Existing TCM targets run in the kernel.  TCMU (TCM in Userspace)
-allows userspace programs to be written which act as iSCSI targets.
-This document describes the design.
-
-The existing kernel provides modules for different SCSI transport
-protocols.  TCM also modularizes the data storage.  There are existing
-modules for file, block device, RAM or using another SCSI device as
-storage.  These are called "backstores" or "storage engines".  These
-built-in modules are implemented entirely as kernel code.
-
-Background:
-
-In addition to modularizing the transport protocol used for carrying
-SCSI commands ("fabrics"), the Linux kernel target, LIO, also modularizes
-the actual data storage as well. These are referred to as "backstores"
-or "storage engines". The target comes with backstores that allow a
-file, a block device, RAM, or another SCSI device to be used for the
-local storage needed for the exported SCSI LUN. Like the rest of LIO,
-these are implemented entirely as kernel code.
-
-These backstores cover the most common use cases, but not all. One new
-use case that other non-kernel target solutions, such as tgt, are able
-to support is using Gluster's GLFS or Ceph's RBD as a backstore. The
-target then serves as a translator, allowing initiators to store data
-in these non-traditional networked storage systems, while still only
-using standard protocols themselves.
-
-If the target is a userspace process, supporting these is easy. tgt,
-for example, needs only a small adapter module for each, because the
-modules just use the available userspace libraries for RBD and GLFS.
-
-Adding support for these backstores in LIO is considerably more
-difficult, because LIO is entirely kernel code. Instead of undertaking
-the significant work to port the GLFS or RBD APIs and protocols to the
-kernel, another approach is to create a userspace pass-through
-backstore for LIO, "TCMU".
-
-
-Benefits:
-
-In addition to allowing relatively easy support for RBD and GLFS, TCMU
-will also allow easier development of new backstores. TCMU combines
-with the LIO loopback fabric to become something similar to FUSE
-(Filesystem in Userspace), but at the SCSI layer instead of the
-filesystem layer. A SUSE, if you will.
-
-The disadvantage is there are more distinct components to configure, and
-potentially to malfunction. This is unavoidable, but hopefully not
-fatal if we're careful to keep things as simple as possible.
-
-Design constraints:
-
-- Good performance: high throughput, low latency
-- Cleanly handle if userspace:
-   1) never attaches
-   2) hangs
-   3) dies
-   4) misbehaves
-- Allow future flexibility in user & kernel implementations
-- Be reasonably memory-efficient
-- Simple to configure & run
-- Simple to write a userspace backend
-
-
-Implementation overview:
-
-The core of the TCMU interface is a memory region that is shared
-between kernel and userspace. Within this region is: a control area
-(mailbox); a lockless producer/consumer circular buffer for commands
-to be passed up, and status returned; and an in/out data buffer area.
-
-TCMU uses the pre-existing UIO subsystem. UIO allows device driver
-development in userspace, and this is conceptually very close to the
-TCMU use case, except instead of a physical device, TCMU implements a
-memory-mapped layout designed for SCSI commands. Using UIO also
-benefits TCMU by handling device introspection (e.g. a way for
-userspace to determine how large the shared region is) and signaling
-mechanisms in both directions.
-
-There are no embedded pointers in the memory region. Everything is
-expressed as an offset from the region's starting address. This allows
-the ring to still work if the user process dies and is restarted with
-the region mapped at a different virtual address.
-
-See target_core_user.h for the struct definitions.
-
-The Mailbox:
-
-The mailbox is always at the start of the shared memory region, and
-contains a version, details about the starting offset and size of the
-command ring, and head and tail pointers to be used by the kernel and
-userspace (respectively) to put commands on the ring, and indicate
-when the commands are completed.
-
-version - 1 (userspace should abort if otherwise)
-flags:
-- TCMU_MAILBOX_FLAG_CAP_OOOC: indicates out-of-order completion is
-  supported.  See "The Command Ring" for details.
-cmdr_off - The offset of the start of the command ring from the start
-of the memory region, to account for the mailbox size.
-cmdr_size - The size of the command ring. This does *not* need to be a
-power of two.
-cmd_head - Modified by the kernel to indicate when a command has been
-placed on the ring.
-cmd_tail - Modified by userspace to indicate when it has completed
-processing of a command.
-
-The Command Ring:
-
-Commands are placed on the ring by the kernel incrementing
-mailbox.cmd_head by the size of the command, modulo cmdr_size, and
-then signaling userspace via uio_event_notify(). Once the command is
-completed, userspace updates mailbox.cmd_tail in the same way and
-signals the kernel via a 4-byte write(). When cmd_head equals
-cmd_tail, the ring is empty -- no commands are currently waiting to be
-processed by userspace.
-
-TCMU commands are 8-byte aligned. They start with a common header
-containing "len_op", a 32-bit value that stores the length, as well as
-the opcode in the lowest unused bits. It also contains cmd_id and
-flags fields for setting by the kernel (kflags) and userspace
-(uflags).
-
-Currently only two opcodes are defined, TCMU_OP_CMD and TCMU_OP_PAD.
-
-When the opcode is CMD, the entry in the command ring is a struct
-tcmu_cmd_entry. Userspace finds the SCSI CDB (Command Data Block) via
-tcmu_cmd_entry.req.cdb_off. This is an offset from the start of the
-overall shared memory region, not the entry. The data in/out buffers
-are accessible via tht req.iov[] array. iov_cnt contains the number of
-entries in iov[] needed to describe either the Data-In or Data-Out
-buffers. For bidirectional commands, iov_cnt specifies how many iovec
-entries cover the Data-Out area, and iov_bidi_cnt specifies how many
-iovec entries immediately after that in iov[] cover the Data-In
-area. Just like other fields, iov.iov_base is an offset from the start
-of the region.
-
-When completing a command, userspace sets rsp.scsi_status, and
-rsp.sense_buffer if necessary. Userspace then increments
-mailbox.cmd_tail by entry.hdr.length (mod cmdr_size) and signals the
-kernel via the UIO method, a 4-byte write to the file descriptor.
-
-If TCMU_MAILBOX_FLAG_CAP_OOOC is set for mailbox->flags, kernel is
-capable of handling out-of-order completions. In this case, userspace can
-handle command in different order other than original. Since kernel would
-still process the commands in the same order it appeared in the command
-ring, userspace need to update the cmd->id when completing the
-command(a.k.a steal the original command's entry).
-
-When the opcode is PAD, userspace only updates cmd_tail as above --
-it's a no-op. (The kernel inserts PAD entries to ensure each CMD entry
-is contiguous within the command ring.)
-
-More opcodes may be added in the future. If userspace encounters an
-opcode it does not handle, it must set UNKNOWN_OP bit (bit 0) in
-hdr.uflags, update cmd_tail, and proceed with processing additional
-commands, if any.
-
-The Data Area:
-
-This is shared-memory space after the command ring. The organization
-of this area is not defined in the TCMU interface, and userspace
-should access only the parts referenced by pending iovs.
-
-
-Device Discovery:
-
-Other devices may be using UIO besides TCMU. Unrelated user processes
-may also be handling different sets of TCMU devices. TCMU userspace
-processes must find their devices by scanning sysfs
-class/uio/uio*/name. For TCMU devices, these names will be of the
-format:
-
-tcm-user/<hba_num>/<device_name>/<subtype>/<path>
-
-where "tcm-user" is common for all TCMU-backed UIO devices. <hba_num>
-and <device_name> allow userspace to find the device's path in the
-kernel target's configfs tree. Assuming the usual mount point, it is
-found at:
-
-/sys/kernel/config/target/core/user_<hba_num>/<device_name>
-
-This location contains attributes such as "hw_block_size", that
-userspace needs to know for correct operation.
-
-<subtype> will be a userspace-process-unique string to identify the
-TCMU device as expecting to be backed by a certain handler, and <path>
-will be an additional handler-specific string for the user process to
-configure the device, if needed. The name cannot contain ':', due to
-LIO limitations.
-
-For all devices so discovered, the user handler opens /dev/uioX and
-calls mmap():
-
-mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0)
-
-where size must be equal to the value read from
-/sys/class/uio/uioX/maps/map0/size.
-
-
-Device Events:
-
-If a new device is added or removed, a notification will be broadcast
-over netlink, using a generic netlink family name of "TCM-USER" and a
-multicast group named "config". This will include the UIO name as
-described in the previous section, as well as the UIO minor
-number. This should allow userspace to identify both the UIO device and
-the LIO device, so that after determining the device is supported
-(based on subtype) it can take the appropriate action.
-
-
-Other contingencies:
-
-Userspace handler process never attaches:
-
-- TCMU will post commands, and then abort them after a timeout period
-  (30 seconds.)
-
-Userspace handler process is killed:
-
-- It is still possible to restart and re-connect to TCMU
-  devices. Command ring is preserved. However, after the timeout period,
-  the kernel will abort pending tasks.
-
-Userspace handler process hangs:
-
-- The kernel will abort pending tasks after a timeout period.
-
-Userspace handler process is malicious:
-
-- The process can trivially break the handling of devices it controls,
-  but should not be able to access kernel memory outside its shared
-  memory areas.
-
-
-Writing a user pass-through handler (with example code)
--------------------------------------------------------
-
-A user process handing a TCMU device must support the following:
-
-a) Discovering and configuring TCMU uio devices
-b) Waiting for events on the device(s)
-c) Managing the command ring: Parsing operations and commands,
-   performing work as needed, setting response fields (scsi_status and
-   possibly sense_buffer), updating cmd_tail, and notifying the kernel
-   that work has been finished
-
-First, consider instead writing a plugin for tcmu-runner. tcmu-runner
-implements all of this, and provides a higher-level API for plugin
-authors.
-
-TCMU is designed so that multiple unrelated processes can manage TCMU
-devices separately. All handlers should make sure to only open their
-devices, based opon a known subtype string.
-
-a) Discovering and configuring TCMU UIO devices:
-
-(error checking omitted for brevity)
-
-int fd, dev_fd;
-char buf[256];
-unsigned long long map_len;
-void *map;
-
-fd = open("/sys/class/uio/uio0/name", O_RDONLY);
-ret = read(fd, buf, sizeof(buf));
-close(fd);
-buf[ret-1] = '\0'; /* null-terminate and chop off the \n */
-
-/* we only want uio devices whose name is a format we expect */
-if (strncmp(buf, "tcm-user", 8))
-	exit(-1);
-
-/* Further checking for subtype also needed here */
-
-fd = open(/sys/class/uio/%s/maps/map0/size, O_RDONLY);
-ret = read(fd, buf, sizeof(buf));
-close(fd);
-str_buf[ret-1] = '\0'; /* null-terminate and chop off the \n */
-
-map_len = strtoull(buf, NULL, 0);
-
-dev_fd = open("/dev/uio0", O_RDWR);
-map = mmap(NULL, map_len, PROT_READ|PROT_WRITE, MAP_SHARED, dev_fd, 0);
-
-
-b) Waiting for events on the device(s)
-
-while (1) {
-  char buf[4];
-
-  int ret = read(dev_fd, buf, 4); /* will block */
-
-  handle_device_events(dev_fd, map);
-}
-
-
-c) Managing the command ring
-
-#include <linux/target_core_user.h>
-
-int handle_device_events(int fd, void *map)
-{
-  struct tcmu_mailbox *mb = map;
-  struct tcmu_cmd_entry *ent = (void *) mb + mb->cmdr_off + mb->cmd_tail;
-  int did_some_work = 0;
-
-  /* Process events from cmd ring until we catch up with cmd_head */
-  while (ent != (void *)mb + mb->cmdr_off + mb->cmd_head) {
-
-    if (tcmu_hdr_get_op(ent->hdr.len_op) == TCMU_OP_CMD) {
-      uint8_t *cdb = (void *)mb + ent->req.cdb_off;
-      bool success = true;
-
-      /* Handle command here. */
-      printf("SCSI opcode: 0x%x\n", cdb[0]);
-
-      /* Set response fields */
-      if (success)
-        ent->rsp.scsi_status = SCSI_NO_SENSE;
-      else {
-        /* Also fill in rsp->sense_buffer here */
-        ent->rsp.scsi_status = SCSI_CHECK_CONDITION;
-      }
-    }
-    else if (tcmu_hdr_get_op(ent->hdr.len_op) != TCMU_OP_PAD) {
-      /* Tell the kernel we didn't handle unknown opcodes */
-      ent->hdr.uflags |= TCMU_UFLAG_UNKNOWN_OP;
-    }
-    else {
-      /* Do nothing for PAD entries except update cmd_tail */
-    }
-
-    /* update cmd_tail */
-    mb->cmd_tail = (mb->cmd_tail + tcmu_hdr_get_len(&ent->hdr)) % mb->cmdr_size;
-    ent = (void *) mb + mb->cmdr_off + mb->cmd_tail;
-    did_some_work = 1;
-  }
-
-  /* Notify the kernel that work has been finished */
-  if (did_some_work) {
-    uint32_t buf = 0;
-
-    write(fd, &buf, 4);
-  }
-
-  return 0;
-}
-
-
-A final note
-------------
-
-Please be careful to return codes as defined by the SCSI
-specifications. These are different than some values defined in the
-scsi/scsi.h include file. For example, CHECK CONDITION's status code
-is 2, not 1.
diff --git a/Documentation/tee.txt b/Documentation/tee.txt
index 56ea85ffebf2..afacdf2fd1de 100644
--- a/Documentation/tee.txt
+++ b/Documentation/tee.txt
@@ -32,7 +32,7 @@ User space (the client) connects to the driver by opening /dev/tee[0-9]* or
   memory.
 
 - TEE_IOC_VERSION lets user space know which TEE this driver handles and
-  the its capabilities.
+  its capabilities.
 
 - TEE_IOC_OPEN_SESSION opens a new session to a Trusted Application.
 
diff --git a/Documentation/thermal/cpu-cooling-api.rst b/Documentation/thermal/cpu-cooling-api.rst
new file mode 100644
index 000000000000..645d914c45a6
--- /dev/null
+++ b/Documentation/thermal/cpu-cooling-api.rst
@@ -0,0 +1,107 @@
+=======================
+CPU cooling APIs How To
+=======================
+
+Written by Amit Daniel Kachhap <amit.kachhap@linaro.org>
+
+Updated: 6 Jan 2015
+
+Copyright (c)  2012 Samsung Electronics Co., Ltd(http://www.samsung.com)
+
+0. Introduction
+===============
+
+The generic cpu cooling(freq clipping) provides registration/unregistration APIs
+to the caller. The binding of the cooling devices to the trip point is left for
+the user. The registration APIs returns the cooling device pointer.
+
+1. cpu cooling APIs
+===================
+
+1.1 cpufreq registration/unregistration APIs
+--------------------------------------------
+
+    ::
+
+	struct thermal_cooling_device
+	*cpufreq_cooling_register(struct cpumask *clip_cpus)
+
+    This interface function registers the cpufreq cooling device with the name
+    "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
+    cooling devices.
+
+   clip_cpus:
+	cpumask of cpus where the frequency constraints will happen.
+
+    ::
+
+	struct thermal_cooling_device
+	*of_cpufreq_cooling_register(struct cpufreq_policy *policy)
+
+    This interface function registers the cpufreq cooling device with
+    the name "thermal-cpufreq-%x" linking it with a device tree node, in
+    order to bind it via the thermal DT code. This api can support multiple
+    instances of cpufreq cooling devices.
+
+    policy:
+	CPUFreq policy.
+
+
+    ::
+
+	void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
+
+    This interface function unregisters the "thermal-cpufreq-%x" cooling device.
+
+    cdev: Cooling device pointer which has to be unregistered.
+
+2. Power models
+===============
+
+The power API registration functions provide a simple power model for
+CPUs.  The current power is calculated as dynamic power (static power isn't
+supported currently).  This power model requires that the operating-points of
+the CPUs are registered using the kernel's opp library and the
+`cpufreq_frequency_table` is assigned to the `struct device` of the
+cpu.  If you are using CONFIG_CPUFREQ_DT then the
+`cpufreq_frequency_table` should already be assigned to the cpu
+device.
+
+The dynamic power consumption of a processor depends on many factors.
+For a given processor implementation the primary factors are:
+
+- The time the processor spends running, consuming dynamic power, as
+  compared to the time in idle states where dynamic consumption is
+  negligible.  Herein we refer to this as 'utilisation'.
+- The voltage and frequency levels as a result of DVFS.  The DVFS
+  level is a dominant factor governing power consumption.
+- In running time the 'execution' behaviour (instruction types, memory
+  access patterns and so forth) causes, in most cases, a second order
+  variation.  In pathological cases this variation can be significant,
+  but typically it is of a much lesser impact than the factors above.
+
+A high level dynamic power consumption model may then be represented as::
+
+	Pdyn = f(run) * Voltage^2 * Frequency * Utilisation
+
+f(run) here represents the described execution behaviour and its
+result has a units of Watts/Hz/Volt^2 (this often expressed in
+mW/MHz/uVolt^2)
+
+The detailed behaviour for f(run) could be modelled on-line.  However,
+in practice, such an on-line model has dependencies on a number of
+implementation specific processor support and characterisation
+factors.  Therefore, in initial implementation that contribution is
+represented as a constant coefficient.  This is a simplification
+consistent with the relative contribution to overall power variation.
+
+In this simplified representation our model becomes::
+
+	Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation
+
+Where `capacitance` is a constant that represents an indicative
+running time dynamic power coefficient in fundamental units of
+mW/MHz/uVolt^2.  Typical values for mobile CPUs might lie in range
+from 100 to 500.  For reference, the approximate values for the SoC in
+ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and
+140 for the Cortex-A53 cluster.
diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt
deleted file mode 100644
index 7df567eaea1a..000000000000
--- a/Documentation/thermal/cpu-cooling-api.txt
+++ /dev/null
@@ -1,92 +0,0 @@
-CPU cooling APIs How To
-===================================
-
-Written by Amit Daniel Kachhap <amit.kachhap@linaro.org>
-
-Updated: 6 Jan 2015
-
-Copyright (c)  2012 Samsung Electronics Co., Ltd(http://www.samsung.com)
-
-0. Introduction
-
-The generic cpu cooling(freq clipping) provides registration/unregistration APIs
-to the caller. The binding of the cooling devices to the trip point is left for
-the user. The registration APIs returns the cooling device pointer.
-
-1. cpu cooling APIs
-
-1.1 cpufreq registration/unregistration APIs
-1.1.1 struct thermal_cooling_device *cpufreq_cooling_register(
-	struct cpumask *clip_cpus)
-
-    This interface function registers the cpufreq cooling device with the name
-    "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
-    cooling devices.
-
-   clip_cpus: cpumask of cpus where the frequency constraints will happen.
-
-1.1.2 struct thermal_cooling_device *of_cpufreq_cooling_register(
-					struct cpufreq_policy *policy)
-
-    This interface function registers the cpufreq cooling device with
-    the name "thermal-cpufreq-%x" linking it with a device tree node, in
-    order to bind it via the thermal DT code. This api can support multiple
-    instances of cpufreq cooling devices.
-
-    policy: CPUFreq policy.
-
-1.1.3 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
-
-    This interface function unregisters the "thermal-cpufreq-%x" cooling device.
-
-    cdev: Cooling device pointer which has to be unregistered.
-
-2. Power models
-
-The power API registration functions provide a simple power model for
-CPUs.  The current power is calculated as dynamic power (static power isn't
-supported currently).  This power model requires that the operating-points of
-the CPUs are registered using the kernel's opp library and the
-`cpufreq_frequency_table` is assigned to the `struct device` of the
-cpu.  If you are using CONFIG_CPUFREQ_DT then the
-`cpufreq_frequency_table` should already be assigned to the cpu
-device.
-
-The dynamic power consumption of a processor depends on many factors.
-For a given processor implementation the primary factors are:
-
-- The time the processor spends running, consuming dynamic power, as
-  compared to the time in idle states where dynamic consumption is
-  negligible.  Herein we refer to this as 'utilisation'.
-- The voltage and frequency levels as a result of DVFS.  The DVFS
-  level is a dominant factor governing power consumption.
-- In running time the 'execution' behaviour (instruction types, memory
-  access patterns and so forth) causes, in most cases, a second order
-  variation.  In pathological cases this variation can be significant,
-  but typically it is of a much lesser impact than the factors above.
-
-A high level dynamic power consumption model may then be represented as:
-
-Pdyn = f(run) * Voltage^2 * Frequency * Utilisation
-
-f(run) here represents the described execution behaviour and its
-result has a units of Watts/Hz/Volt^2 (this often expressed in
-mW/MHz/uVolt^2)
-
-The detailed behaviour for f(run) could be modelled on-line.  However,
-in practice, such an on-line model has dependencies on a number of
-implementation specific processor support and characterisation
-factors.  Therefore, in initial implementation that contribution is
-represented as a constant coefficient.  This is a simplification
-consistent with the relative contribution to overall power variation.
-
-In this simplified representation our model becomes:
-
-Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation
-
-Where `capacitance` is a constant that represents an indicative
-running time dynamic power coefficient in fundamental units of
-mW/MHz/uVolt^2.  Typical values for mobile CPUs might lie in range
-from 100 to 500.  For reference, the approximate values for the SoC in
-ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and
-140 for the Cortex-A53 cluster.
diff --git a/Documentation/thermal/exynos_thermal b/Documentation/thermal/exynos_thermal
deleted file mode 100644
index 9010c4416967..000000000000
--- a/Documentation/thermal/exynos_thermal
+++ /dev/null
@@ -1,77 +0,0 @@
-Kernel driver exynos_tmu
-=================
-
-Supported chips:
-* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC
-  Datasheet: Not publicly available
-
-Authors: Donggeun Kim <dg77.kim@samsung.com>
-Authors: Amit Daniel <amit.daniel@samsung.com>
-
-TMU controller Description:
----------------------------
-
-This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC.
-
-The chip only exposes the measured 8-bit temperature code value
-through a register.
-Temperature can be taken from the temperature code.
-There are three equations converting from temperature to temperature code.
-
-The three equations are:
-  1. Two point trimming
-	Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1
-
-  2. One point trimming
-	Tc = T + TI1 - 25
-
-  3. No trimming
-	Tc = T + 50
-
-  Tc: Temperature code, T: Temperature,
-  TI1: Trimming info for 25 degree Celsius (stored at TRIMINFO register)
-       Temperature code measured at 25 degree Celsius which is unchanged
-  TI2: Trimming info for 85 degree Celsius (stored at TRIMINFO register)
-       Temperature code measured at 85 degree Celsius which is unchanged
-
-TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt
-when temperature exceeds pre-defined levels.
-The maximum number of configurable threshold is five.
-The threshold levels are defined as follows:
-  Level_0: current temperature > trigger_level_0 + threshold
-  Level_1: current temperature > trigger_level_1 + threshold
-  Level_2: current temperature > trigger_level_2 + threshold
-  Level_3: current temperature > trigger_level_3 + threshold
-
-  The threshold and each trigger_level are set
-  through the corresponding registers.
-
-When an interrupt occurs, this driver notify kernel thermal framework
-with the function exynos_report_trigger.
-Although an interrupt condition for level_0 can be set,
-it can be used to synchronize the cooling action.
-
-TMU driver description:
------------------------
-
-The exynos thermal driver is structured as,
-
-					Kernel Core thermal framework
-				(thermal_core.c, step_wise.c, cpu_cooling.c)
-								^
-								|
-								|
-TMU configuration data -------> TMU Driver  <------> Exynos Core thermal wrapper
-(exynos_tmu_data.c)	      (exynos_tmu.c)	   (exynos_thermal_common.c)
-(exynos_tmu_data.h)	      (exynos_tmu.h)	   (exynos_thermal_common.h)
-
-a) TMU configuration data: This consist of TMU register offsets/bitfields
-		described through structure exynos_tmu_registers. Also several
-		other platform data (struct exynos_tmu_platform_data) members
-		are used to configure the TMU.
-b) TMU driver: This component initialises the TMU controller and sets different
-		thresholds. It invokes core thermal implementation with the call
-		exynos_report_trigger.
-c) Exynos Core thermal wrapper: This provides 3 wrapper function to use the
-		Kernel core thermal framework. They are exynos_unregister_thermal,
-		exynos_register_thermal and exynos_report_trigger.
diff --git a/Documentation/thermal/exynos_thermal.rst b/Documentation/thermal/exynos_thermal.rst
new file mode 100644
index 000000000000..5bd556566c70
--- /dev/null
+++ b/Documentation/thermal/exynos_thermal.rst
@@ -0,0 +1,90 @@
+========================
+Kernel driver exynos_tmu
+========================
+
+Supported chips:
+
+* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC
+
+  Datasheet: Not publicly available
+
+Authors: Donggeun Kim <dg77.kim@samsung.com>
+Authors: Amit Daniel <amit.daniel@samsung.com>
+
+TMU controller Description:
+---------------------------
+
+This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC.
+
+The chip only exposes the measured 8-bit temperature code value
+through a register.
+Temperature can be taken from the temperature code.
+There are three equations converting from temperature to temperature code.
+
+The three equations are:
+  1. Two point trimming::
+
+	Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1
+
+  2. One point trimming::
+
+	Tc = T + TI1 - 25
+
+  3. No trimming::
+
+	Tc = T + 50
+
+  Tc:
+       Temperature code, T: Temperature,
+  TI1:
+       Trimming info for 25 degree Celsius (stored at TRIMINFO register)
+       Temperature code measured at 25 degree Celsius which is unchanged
+  TI2:
+       Trimming info for 85 degree Celsius (stored at TRIMINFO register)
+       Temperature code measured at 85 degree Celsius which is unchanged
+
+TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt
+when temperature exceeds pre-defined levels.
+The maximum number of configurable threshold is five.
+The threshold levels are defined as follows::
+
+  Level_0: current temperature > trigger_level_0 + threshold
+  Level_1: current temperature > trigger_level_1 + threshold
+  Level_2: current temperature > trigger_level_2 + threshold
+  Level_3: current temperature > trigger_level_3 + threshold
+
+The threshold and each trigger_level are set
+through the corresponding registers.
+
+When an interrupt occurs, this driver notify kernel thermal framework
+with the function exynos_report_trigger.
+Although an interrupt condition for level_0 can be set,
+it can be used to synchronize the cooling action.
+
+TMU driver description:
+-----------------------
+
+The exynos thermal driver is structured as::
+
+					Kernel Core thermal framework
+				(thermal_core.c, step_wise.c, cpu_cooling.c)
+								^
+								|
+								|
+  TMU configuration data -----> TMU Driver  <----> Exynos Core thermal wrapper
+  (exynos_tmu_data.c)	      (exynos_tmu.c)	   (exynos_thermal_common.c)
+  (exynos_tmu_data.h)	      (exynos_tmu.h)	   (exynos_thermal_common.h)
+
+a) TMU configuration data:
+		This consist of TMU register offsets/bitfields
+		described through structure exynos_tmu_registers. Also several
+		other platform data (struct exynos_tmu_platform_data) members
+		are used to configure the TMU.
+b) TMU driver:
+		This component initialises the TMU controller and sets different
+		thresholds. It invokes core thermal implementation with the call
+		exynos_report_trigger.
+c) Exynos Core thermal wrapper:
+		This provides 3 wrapper function to use the
+		Kernel core thermal framework. They are exynos_unregister_thermal,
+		exynos_register_thermal and exynos_report_trigger.
diff --git a/Documentation/thermal/exynos_thermal_emulation b/Documentation/thermal/exynos_thermal_emulation
deleted file mode 100644
index b15efec6ca28..000000000000
--- a/Documentation/thermal/exynos_thermal_emulation
+++ /dev/null
@@ -1,53 +0,0 @@
-EXYNOS EMULATION MODE
-========================
-
-Copyright (C) 2012 Samsung Electronics
-
-Written by Jonghwa Lee <jonghwa3.lee@samsung.com>
-
-Description
------------
-
-Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal management unit.
-Thermal emulation mode supports software debug for TMU's operation. User can set temperature
-manually with software code and TMU will read current temperature from user value not from
-sensor's value.
-
-Enabling CONFIG_THERMAL_EMULATION option will make this support available.
-When it's enabled, sysfs node will be created as
-/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp.
-
-The sysfs node, 'emul_node', will contain value 0 for the initial state. When you input any
-temperature you want to update to sysfs node, it automatically enable emulation mode and
-current temperature will be changed into it.
-(Exynos also supports user changeable delay time which would be used to delay of
- changing temperature. However, this node only uses same delay of real sensing time, 938us.)
-
-Exynos emulation mode requires synchronous of value changing and enabling. It means when you
-want to update the any value of delay or next temperature, then you have to enable emulation
-mode at the same time. (Or you have to keep the mode enabling.) If you don't, it fails to
-change the value to updated one and just use last succeessful value repeatedly. That's why
-this node gives users the right to change termerpature only. Just one interface makes it more
-simply to use.
-
-Disabling emulation mode only requires writing value 0 to sysfs node.
-
-
-TEMP	120 |
-	    |
-	100 |
-	    |
-	 80 |
-	    |		     	 	 +-----------
-	 60 |      		     	 |	    |
-	    |	           +-------------|          |
-	 40 |              |         	 |          |
-	    |		   |	     	 |          |
-	 20 |		   |	     	 |          +----------
-	    |	 	   |	     	 |          |          |
-	  0 |______________|_____________|__________|__________|_________
-		   A	    	 A	    A	   	       A     TIME
-		   |<----->|	 |<----->|  |<----->|	       |
-		   | 938us |  	 |	 |  |       |          |
-emulation    :  0  50	   |  	 70      |  20      |          0
-current temp :   sensor   50		 70         20	      sensor
diff --git a/Documentation/thermal/exynos_thermal_emulation.rst b/Documentation/thermal/exynos_thermal_emulation.rst
new file mode 100644
index 000000000000..c21d10838bc5
--- /dev/null
+++ b/Documentation/thermal/exynos_thermal_emulation.rst
@@ -0,0 +1,61 @@
+=====================
+Exynos Emulation Mode
+=====================
+
+Copyright (C) 2012 Samsung Electronics
+
+Written by Jonghwa Lee <jonghwa3.lee@samsung.com>
+
+Description
+-----------
+
+Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal
+management unit. Thermal emulation mode supports software debug for
+TMU's operation. User can set temperature manually with software code
+and TMU will read current temperature from user value not from sensor's
+value.
+
+Enabling CONFIG_THERMAL_EMULATION option will make this support
+available. When it's enabled, sysfs node will be created as
+/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp.
+
+The sysfs node, 'emul_node', will contain value 0 for the initial state.
+When you input any temperature you want to update to sysfs node, it
+automatically enable emulation mode and current temperature will be
+changed into it.
+
+(Exynos also supports user changeable delay time which would be used to
+delay of changing temperature. However, this node only uses same delay
+of real sensing time, 938us.)
+
+Exynos emulation mode requires synchronous of value changing and
+enabling. It means when you want to update the any value of delay or
+next temperature, then you have to enable emulation mode at the same
+time. (Or you have to keep the mode enabling.) If you don't, it fails to
+change the value to updated one and just use last succeessful value
+repeatedly. That's why this node gives users the right to change
+termerpature only. Just one interface makes it more simply to use.
+
+Disabling emulation mode only requires writing value 0 to sysfs node.
+
+::
+
+
+  TEMP	120 |
+	    |
+	100 |
+	    |
+	 80 |
+	    |				 +-----------
+	 60 |      			 |	    |
+	    |		   +-------------|          |
+	 40 |              |         	 |          |
+	    |		   |		 |          |
+	 20 |		   |		 |          +----------
+	    |		   |		 |          |          |
+	  0 |______________|_____________|__________|__________|_________
+		   A		 A	    A		       A     TIME
+		   |<----->|	 |<----->|  |<----->|	       |
+		   | 938us |  	 |	 |  |       |          |
+  emulation   : 0  50	   |  	 70      |  20      |          0
+  current temp:   sensor   50		 70         20	      sensor
diff --git a/Documentation/thermal/index.rst b/Documentation/thermal/index.rst
new file mode 100644
index 000000000000..8c1c00146cad
--- /dev/null
+++ b/Documentation/thermal/index.rst
@@ -0,0 +1,18 @@
+:orphan:
+
+=======
+Thermal
+=======
+
+.. toctree::
+   :maxdepth: 1
+
+   cpu-cooling-api
+   sysfs-api
+   power_allocator
+
+   exynos_thermal
+   exynos_thermal_emulation
+   intel_powerclamp
+   nouveau_thermal
+   x86_pkg_temperature_thermal
diff --git a/Documentation/thermal/intel_powerclamp.rst b/Documentation/thermal/intel_powerclamp.rst
new file mode 100644